Line 65 (notebook cell source). The only change in this cell is the `Perceptron` entry of the `partial_fit_classifiers` dict; the rest of the cell is unchanged:

```diff
 partial_fit_classifiers = {
     'SGD': SGDClassifier(max_iter=5),
-    'Perceptron': Perceptron(tol=1e-3),
+    'Perceptron': Perceptron(),
     'NB Multinomial': MultinomialNB(alpha=0.01),
     'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),
 }
```
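This presumably drops an argument that became the default: from scikit-learn 0.21 on, `Perceptron`'s `tol` defaults to `1e-3` (it was previously `None`), so `Perceptron()` and `Perceptron(tol=1e-3)` configure the same stopping criterion. A quick sanity check, assuming scikit-learn >= 0.21:

```python
from sklearn.linear_model import Perceptron

# In scikit-learn >= 0.21 the default tol is 1e-3, so dropping the
# explicit argument does not change the stopping criterion.
assert Perceptron().tol == Perceptron(tol=1e-3).tol == 1e-3
```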
| 65 | + "vectorizer = HashingVectorizer(decode_error='ignore', n_features=2 ** 18,\n alternate_sign=False)\n\n\n# Iterator over parsed Reuters SGML files.\ndata_stream = stream_reuters_documents()\n\n# We learn a binary classification between the \"acq\" class and all the others.\n# \"acq\" was chosen as it is more or less evenly distributed in the Reuters\n# files. For other datasets, one should take care of creating a test set with\n# a realistic portion of positive instances.\nall_classes = np.array([0, 1])\npositive_class = 'acq'\n\n# Here are some classifiers that support the `partial_fit` method\npartial_fit_classifiers = {\n 'SGD': SGDClassifier(max_iter=5),\n 'Perceptron': Perceptron(),\n 'NB Multinomial': MultinomialNB(alpha=0.01),\n 'Passive-Aggressive': PassiveAggressiveClassifier(tol=1e-3),\n}\n\n\ndef get_minibatch(doc_iter, size, pos_class=positive_class):\n \"\"\"Extract a minibatch of examples, return a tuple X_text, y.\n\n Note: size is before excluding invalid docs with no topics assigned.\n\n \"\"\"\n data = [('{title}\\n\\n{body}'.format(**doc), pos_class in doc['topics'])\n for doc in itertools.islice(doc_iter, size)\n if doc['topics']]\n if not len(data):\n return np.asarray([], dtype=int), np.asarray([], dtype=int)\n X_text, y = zip(*data)\n return X_text, np.asarray(y, dtype=int)\n\n\ndef iter_minibatches(doc_iter, minibatch_size):\n \"\"\"Generator of minibatches.\"\"\"\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n while len(X_text):\n yield X_text, y\n X_text, y = get_minibatch(doc_iter, minibatch_size)\n\n\n# test data statistics\ntest_stats = {'n_test': 0, 'n_test_pos': 0}\n\n# First we hold out a number of examples to estimate accuracy\nn_test_documents = 1000\ntick = time.time()\nX_test_text, y_test = get_minibatch(data_stream, 1000)\nparsing_time = time.time() - tick\ntick = time.time()\nX_test = vectorizer.transform(X_test_text)\nvectorizing_time = time.time() - tick\ntest_stats['n_test'] += len(y_test)\ntest_stats['n_test_pos'] += sum(y_test)\nprint(\"Test set is %d documents (%d positive)\" % (len(y_test), sum(y_test)))\n\n\ndef progress(cls_name, stats):\n \"\"\"Report progress information, return a string.\"\"\"\n duration = time.time() - stats['t0']\n s = \"%20s classifier : \\t\" % cls_name\n s += \"%(n_train)6d train docs (%(n_train_pos)6d positive) \" % stats\n s += \"%(n_test)6d test docs (%(n_test_pos)6d positive) \" % test_stats\n s += \"accuracy: %(accuracy).3f \" % stats\n s += \"in %.2fs (%5d docs/s)\" % (duration, stats['n_train'] / duration)\n return s\n\n\ncls_stats = {}\n\nfor cls_name in partial_fit_classifiers:\n stats = {'n_train': 0, 'n_train_pos': 0,\n 'accuracy': 0.0, 'accuracy_history': [(0, 0)], 't0': time.time(),\n 'runtime_history': [(0, 0)], 'total_fit_time': 0.0}\n cls_stats[cls_name] = stats\n\nget_minibatch(data_stream, n_test_documents)\n# Discard test set\n\n# We will feed the classifier with mini-batches of 1000 documents; this means\n# we have at most 1000 docs in memory at any time. 
The smaller the document\n# batch, the bigger the relative overhead of the partial fit methods.\nminibatch_size = 1000\n\n# Create the data_stream that parses Reuters SGML files and iterates on\n# documents as a stream.\nminibatch_iterators = iter_minibatches(data_stream, minibatch_size)\ntotal_vect_time = 0.0\n\n# Main loop : iterate on mini-batches of examples\nfor i, (X_train_text, y_train) in enumerate(minibatch_iterators):\n\n tick = time.time()\n X_train = vectorizer.transform(X_train_text)\n total_vect_time += time.time() - tick\n\n for cls_name, cls in partial_fit_classifiers.items():\n tick = time.time()\n # update estimator with examples in the current mini-batch\n cls.partial_fit(X_train, y_train, classes=all_classes)\n\n # accumulate test accuracy stats\n cls_stats[cls_name]['total_fit_time'] += time.time() - tick\n cls_stats[cls_name]['n_train'] += X_train.shape[0]\n cls_stats[cls_name]['n_train_pos'] += sum(y_train)\n tick = time.time()\n cls_stats[cls_name]['accuracy'] = cls.score(X_test, y_test)\n cls_stats[cls_name]['prediction_time'] = time.time() - tick\n acc_history = (cls_stats[cls_name]['accuracy'],\n cls_stats[cls_name]['n_train'])\n cls_stats[cls_name]['accuracy_history'].append(acc_history)\n run_history = (cls_stats[cls_name]['accuracy'],\n total_vect_time + cls_stats[cls_name]['total_fit_time'])\n cls_stats[cls_name]['runtime_history'].append(run_history)\n\n if i % 3 == 0:\n print(progress(cls_name, cls_stats[cls_name]))\n if i % 3 == 0:\n print('\\n')" |
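The cell assumes the imports and the `stream_reuters_documents` helper defined earlier in the example (`itertools`, `time`, `numpy as np`, plus the vectorizer and estimator classes used above). As a self-contained illustration of the same out-of-core pattern, here is a minimal sketch on a hypothetical toy corpus; the documents and labels below are made up for illustration:

```python
import numpy as np
from sklearn.feature_extraction.text import HashingVectorizer
from sklearn.naive_bayes import MultinomialNB

# Hypothetical toy corpus standing in for the Reuters stream.
docs = ['oil company acquisition deal', 'quarterly earnings report',
        'merger and acquisition talks', 'weather forecast for tomorrow']
labels = np.array([1, 0, 1, 0])  # 1 = positive ("acq"-like), 0 = other

# HashingVectorizer is stateless, so each mini-batch can be transformed
# independently; alternate_sign=False keeps features non-negative, which
# MultinomialNB requires.
vec = HashingVectorizer(n_features=2 ** 18, alternate_sign=False)
clf = MultinomialNB(alpha=0.01)

for start in range(0, len(docs), 2):  # mini-batches of two documents
    X = vec.transform(docs[start:start + 2])
    # classes is required on the first call so partial_fit knows the full
    # label set even if a batch happens to contain only one class.
    clf.partial_fit(X, labels[start:start + 2], classes=np.array([0, 1]))

print(clf.score(vec.transform(docs), labels))
```

This mirrors the main loop above: transform one batch at a time with the stateless vectorizer, update the model with `partial_fit`, and score the partially fitted model as you go, without ever holding the whole corpus in memory at once.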