29 |
| - "# Authors: Tom Dupre la Tour\n#\n# License: BSD 3 clause\nimport time\nimport sys\n\nimport pandas as pd\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import linear_model\nfrom sklearn.datasets import fetch_openml\nfrom sklearn.model_selection import train_test_split\nfrom sklearn.utils.testing import ignore_warnings\nfrom sklearn.exceptions import ConvergenceWarning\nfrom sklearn.utils import shuffle\n\nprint(__doc__)\n\n\ndef load_mnist(n_samples=None, class_0='0', class_1='8'):\n \"\"\"Load MNIST, select two classes, shuffle and return only n_samples.\"\"\"\n # Load data from http://openml.org/d/554\n mnist = fetch_openml('mnist_784', version=1)\n\n # take only two classes for binary classification\n mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)\n\n X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)\n if n_samples is not None:\n X, y = X[:n_samples], y[:n_samples]\n return X, y\n\n\n@ignore_warnings(category=ConvergenceWarning)\ndef fit_and_score(estimator, max_iter, X_train, X_test, y_train, y_test):\n \"\"\"Fit the estimator on the train set and score it on both sets\"\"\"\n estimator.set_params(max_iter=max_iter)\n estimator.set_params(random_state=0)\n\n start = time.time()\n estimator.fit(X_train, y_train)\n\n fit_time = time.time() - start\n n_iter = estimator.n_iter_\n train_score = estimator.score(X_train, y_train)\n test_score = estimator.score(X_test, y_test)\n\n return fit_time, n_iter, train_score, test_score\n\n\n# Define the estimators to compare\nestimator_dict = {\n 'No stopping criterion':\n linear_model.SGDClassifier(tol=1e-3, n_iter_no_change=3),\n 'Training loss':\n linear_model.SGDClassifier(early_stopping=False, n_iter_no_change=3,\n tol=0.1),\n 'Validation score':\n linear_model.SGDClassifier(early_stopping=True, n_iter_no_change=3,\n tol=0.0001, validation_fraction=0.2)\n}\n\n# Load the dataset\nX, y = load_mnist(n_samples=10000)\nX_train, X_test, y_train, y_test = 
train_test_split(X, y, test_size=0.5,\n random_state=0)\n\nresults = []\nfor estimator_name, estimator in estimator_dict.items():\n print(estimator_name + ': ', end='')\n for max_iter in range(1, 50):\n print('.', end='')\n sys.stdout.flush()\n\n fit_time, n_iter, train_score, test_score = fit_and_score(\n estimator, max_iter, X_train, X_test, y_train, y_test)\n\n results.append((estimator_name, max_iter, fit_time, n_iter,\n train_score, test_score))\n print('')\n\n# Transform the results in a pandas dataframe for easy plotting\ncolumns = [\n 'Stopping criterion', 'max_iter', 'Fit time (sec)', 'n_iter_',\n 'Train score', 'Test score'\n]\nresults_df = pd.DataFrame(results, columns=columns)\n\n# Define what to plot (x_axis, y_axis)\nlines = 'Stopping criterion'\nplot_list = [\n ('max_iter', 'Train score'),\n ('max_iter', 'Test score'),\n ('max_iter', 'n_iter_'),\n ('max_iter', 'Fit time (sec)'),\n]\n\nnrows = 2\nncols = int(np.ceil(len(plot_list) / 2.))\nfig, axes = plt.subplots(nrows=nrows, ncols=ncols, figsize=(6 * ncols,\n 4 * nrows))\naxes[0, 0].get_shared_y_axes().join(axes[0, 0], axes[0, 1])\n\nfor ax, (x_axis, y_axis) in zip(axes.ravel(), plot_list):\n for criterion, group_df in results_df.groupby(lines):\n group_df.plot(x=x_axis, y=y_axis, label=criterion, ax=ax)\n ax.set_title(y_axis)\n ax.legend(title=lines)\n\nfig.tight_layout()\nplt.show()" |
# Authors: Tom Dupre la Tour
#
# License: BSD 3 clause
import time
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn import linear_model
from sklearn.datasets import fetch_openml
from sklearn.model_selection import train_test_split
from sklearn.utils.testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning
from sklearn.utils import shuffle

print(__doc__)


def load_mnist(n_samples=None, class_0='0', class_1='8'):
    """Load MNIST, select two classes, shuffle and return only n_samples.

    Parameters
    ----------
    n_samples : int or None
        If not None, keep only the first ``n_samples`` shuffled samples.
    class_0, class_1 : str
        The two target labels defining the binary problem.

    Returns
    -------
    X, y : the shuffled samples restricted to the two requested classes.
    """
    # Load data from http://openml.org/d/554
    mnist = fetch_openml('mnist_784', version=1)

    # take only two classes for binary classification
    mask = np.logical_or(mnist.target == class_0, mnist.target == class_1)

    # Fixed seed so repeated runs benchmark the same subset.
    X, y = shuffle(mnist.data[mask], mnist.target[mask], random_state=42)
    if n_samples is not None:
        X, y = X[:n_samples], y[:n_samples]
    return X, y


@ignore_warnings(category=ConvergenceWarning)
def fit_and_score(estimator, max_iter, X_train, X_test, y_train, y_test):
    """Fit the estimator on the train set and score it on both sets.

    ConvergenceWarning is silenced deliberately: small ``max_iter`` values
    are expected not to converge in this benchmark.
    """
    estimator.set_params(max_iter=max_iter)
    estimator.set_params(random_state=0)

    start = time.time()
    estimator.fit(X_train, y_train)

    fit_time = time.time() - start
    n_iter = estimator.n_iter_
    train_score = estimator.score(X_train, y_train)
    test_score = estimator.score(X_test, y_test)

    return fit_time, n_iter, train_score, test_score


# Define the estimators to compare
estimator_dict = {
    'No stopping criterion':
        linear_model.SGDClassifier(n_iter_no_change=3),
    'Training loss':
        linear_model.SGDClassifier(early_stopping=False, n_iter_no_change=3,
                                   tol=0.1),
    'Validation score':
        linear_model.SGDClassifier(early_stopping=True, n_iter_no_change=3,
                                   tol=0.0001, validation_fraction=0.2)
}

# Load the dataset
X, y = load_mnist(n_samples=10000)
# FIX: rejoined into a single statement — the source had this assignment
# split from its right-hand side with no line continuation (SyntaxError).
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5,
                                                    random_state=0)

results = []
for estimator_name, estimator in estimator_dict.items():
    print(estimator_name + ': ', end='')
    for max_iter in range(1, 50):
        # One dot per refit gives cheap progress feedback on stdout.
        print('.', end='')
        sys.stdout.flush()

        fit_time, n_iter, train_score, test_score = fit_and_score(
            estimator, max_iter, X_train, X_test, y_train, y_test)

        results.append((estimator_name, max_iter, fit_time, n_iter,
                        train_score, test_score))
    print('')

# Transform the results in a pandas dataframe for easy plotting
columns = [
    'Stopping criterion', 'max_iter', 'Fit time (sec)', 'n_iter_',
    'Train score', 'Test score'
]
results_df = pd.DataFrame(results, columns=columns)

# Define what to plot (x_axis, y_axis)
lines = 'Stopping criterion'
plot_list = [
    ('max_iter', 'Train score'),
    ('max_iter', 'Test score'),
    ('max_iter', 'n_iter_'),
    ('max_iter', 'Fit time (sec)'),
]

nrows = 2
ncols = int(np.ceil(len(plot_list) / 2.))
fig, axes = plt.subplots(nrows=nrows, ncols=ncols,
                         figsize=(6 * ncols, 4 * nrows))
# Share the y axis between the two score subplots so they are comparable.
axes[0, 0].get_shared_y_axes().join(axes[0, 0], axes[0, 1])

for ax, (x_axis, y_axis) in zip(axes.ravel(), plot_list):
    for criterion, group_df in results_df.groupby(lines):
        group_df.plot(x=x_axis, y=y_axis, label=criterion, ax=ax)
    ax.set_title(y_axis)
    ax.legend(title=lines)

fig.tight_layout()
plt.show()