Commit 5614bc3

Pushing the docs to dev/ for branch: main, commit 0a075179c69eb2edfc8cea44feb01ce28dfbf95b

1 parent c9698e8, commit 5614bc3

File tree

1,229 files changed (+5140 / -4596 lines)


‎dev/_downloads/2c8a162a0e436f4ca9af35453585fc81/plot_adaboost_hastie_10_2.py

Lines changed: 71 additions & 23 deletions
@@ -3,7 +3,7 @@
 Discrete versus Real AdaBoost
 =============================
 
-This example is based on Figure 10.2 from Hastie et al 2009 [1]_ and
+This notebook is based on Figure 10.2 from Hastie et al 2009 [1]_ and
 illustrates the difference in performance between the discrete SAMME [2]_
 boosting algorithm and real SAMME.R boosting algorithm. Both algorithms are
 evaluated on a binary classification task where the target Y is a non-linear
@@ -15,32 +15,44 @@
 .. [1] T. Hastie, R. Tibshirani and J. Friedman, "Elements of Statistical
     Learning Ed. 2", Springer, 2009.
 
-.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost", 2009.
+.. [2] J Zhu, H. Zou, S. Rosset, T. Hastie, "Multi-class AdaBoost",
+    Statistics and Its Interface, 2009.
 
 """
 
-# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>,
-#         Noel Dawe <noel.dawe@gmail.com>
+# %%
+# Preparing the data and baseline models
+# --------------------------------------
+# We start by generating the binary classification dataset
+# used in Hastie et al. 2009, Example 10.2.
+
+# Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>,
+#          Noel Dawe <noel.dawe@gmail.com>
 #
 # License: BSD 3 clause
 
-import numpy as np
-import matplotlib.pyplot as plt
-
 from sklearn import datasets
-from sklearn.tree import DecisionTreeClassifier
-from sklearn.metrics import zero_one_loss
-from sklearn.ensemble import AdaBoostClassifier
 
+X, y = datasets.make_hastie_10_2(n_samples=12_000, random_state=1)
+
+# %%
+# Now, we set the hyperparameters for our AdaBoost classifiers.
+# Be aware, a learning rate of 1.0 may not be optimal for both SAMME and SAMME.R
 
 n_estimators = 400
-# A learning rate of 1. may not be optimal for both SAMME and SAMME.R
 learning_rate = 1.0
 
-X, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)
+# %%
+# We split the data into a training and a test set.
+# Then, we train our baseline classifiers, a `DecisionTreeClassifier` with `depth=9`
+# and a "stump" `DecisionTreeClassifier` with `depth=1` and compute the test error.
 
-X_test, y_test = X[2000:], y[2000:]
-X_train, y_train = X[:2000], y[:2000]
+from sklearn.model_selection import train_test_split
+from sklearn.tree import DecisionTreeClassifier
+
+X_train, X_test, y_train, y_test = train_test_split(
+    X, y, test_size=2_000, shuffle=False
+)
 
 dt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)
 dt_stump.fit(X_train, y_train)
@@ -50,6 +62,14 @@
 dt.fit(X_train, y_train)
 dt_err = 1.0 - dt.score(X_test, y_test)
 
+# %%
+# Adaboost with discrete SAMME and real SAMME.R
+# ---------------------------------------------
+# We now define the discrete and real AdaBoost classifiers
+# and fit them to the training set.
+
+from sklearn.ensemble import AdaBoostClassifier
+
 ada_discrete = AdaBoostClassifier(
     base_estimator=dt_stump,
     learning_rate=learning_rate,
@@ -58,6 +78,8 @@
 )
 ada_discrete.fit(X_train, y_train)
 
+# %%
+
 ada_real = AdaBoostClassifier(
     base_estimator=dt_stump,
     learning_rate=learning_rate,
@@ -66,11 +88,13 @@
 )
 ada_real.fit(X_train, y_train)
 
-fig = plt.figure()
-ax = fig.add_subplot(111)
+# %%
+# Now, let's compute the test error of the discrete and
+# real AdaBoost classifiers for each new stump in `n_estimators`
+# added to the ensemble.
 
-ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
-ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")
+import numpy as np
+from sklearn.metrics import zero_one_loss
 
 ada_discrete_err = np.zeros((n_estimators,))
 for i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):
@@ -88,36 +112,60 @@
 for i, y_pred in enumerate(ada_real.staged_predict(X_train)):
     ada_real_err_train[i] = zero_one_loss(y_pred, y_train)
 
+# %%
+# Plotting the results
+# --------------------
+# Finally, we plot the train and test errors of our baselines
+# and of the discrete and real AdaBoost classifiers
+
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+fig = plt.figure()
+ax = fig.add_subplot(111)
+
+ax.plot([1, n_estimators], [dt_stump_err] * 2, "k-", label="Decision Stump Error")
+ax.plot([1, n_estimators], [dt_err] * 2, "k--", label="Decision Tree Error")
+
+colors = sns.color_palette("colorblind")
+
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_discrete_err,
     label="Discrete AdaBoost Test Error",
-    color="red",
+    color=colors[0],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_discrete_err_train,
     label="Discrete AdaBoost Train Error",
-    color="blue",
+    color=colors[1],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_real_err,
     label="Real AdaBoost Test Error",
-    color="orange",
+    color=colors[2],
 )
 ax.plot(
     np.arange(n_estimators) + 1,
     ada_real_err_train,
     label="Real AdaBoost Train Error",
-    color="green",
+    color=colors[4],
 )
 
 ax.set_ylim((0.0, 0.5))
-ax.set_xlabel("n_estimators")
+ax.set_xlabel("Number of weak learners")
 ax.set_ylabel("error rate")
 
 leg = ax.legend(loc="upper right", fancybox=True)
 leg.get_frame().set_alpha(0.7)
 
 plt.show()
+# %%
+#
+# Concluding remarks
+# ------------------
+#
+# We observe that the error rate for both train and test sets of real AdaBoost
+# is lower than that of discrete AdaBoost.
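Note on the split used above: the refactor replaces manual slicing (train = first 2,000 samples, test = the remaining 10,000) with train_test_split(..., test_size=2_000, shuffle=False), which instead keeps the first 10,000 samples for training and the last 2,000 for testing. The following minimal sketch (hypothetical toy arrays, not part of the commit) illustrates how shuffle=False partitions by position:

import numpy as np
from sklearn.model_selection import train_test_split

# Toy data: 10 samples labelled 0..9 so the split is easy to see.
X_demo = np.arange(10).reshape(-1, 1)
y_demo = np.arange(10)

# With shuffle=False the row order is preserved: the first n - test_size rows
# become the training set and the last test_size rows become the test set.
X_tr, X_te, y_tr, y_te = train_test_split(X_demo, y_demo, test_size=2, shuffle=False)

assert np.array_equal(y_tr, np.arange(8))        # first 8 rows -> train
assert np.array_equal(y_te, np.array([8, 9]))    # last 2 rows -> test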

‎dev/_downloads/97c9b8aba1989fb600a73f3afb354726/plot_adaboost_hastie_10_2.ipynb

Lines changed: 117 additions & 2 deletions
@@ -15,7 +15,79 @@
 "cell_type": "markdown",
 "metadata": {},
 "source": [
-"\n# Discrete versus Real AdaBoost\n\nThis example is based on Figure 10.2 from Hastie et al 2009 [1]_ and\nillustrates the difference in performance between the discrete SAMME [2]_\nboosting algorithm and real SAMME.R boosting algorithm. Both algorithms are\nevaluated on a binary classification task where the target Y is a non-linear\nfunction of 10 input features.\n\nDiscrete SAMME AdaBoost adapts based on errors in predicted class labels\nwhereas real SAMME.R uses the predicted class probabilities.\n\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\n.. [2] J. Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\", 2009.\n"
+"\n# Discrete versus Real AdaBoost\n\nThis notebook is based on Figure 10.2 from Hastie et al 2009 [1]_ and\nillustrates the difference in performance between the discrete SAMME [2]_\nboosting algorithm and real SAMME.R boosting algorithm. Both algorithms are\nevaluated on a binary classification task where the target Y is a non-linear\nfunction of 10 input features.\n\nDiscrete SAMME AdaBoost adapts based on errors in predicted class labels\nwhereas real SAMME.R uses the predicted class probabilities.\n\n.. [1] T. Hastie, R. Tibshirani and J. Friedman, \"Elements of Statistical\n Learning Ed. 2\", Springer, 2009.\n\n.. [2] J Zhu, H. Zou, S. Rosset, T. Hastie, \"Multi-class AdaBoost\",\n Statistics and Its Interface, 2009.\n"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Preparing the data and baseline models\nWe start by generating the binary classification dataset\nused in Hastie et al. 2009, Example 10.2.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"# Authors: Peter Prettenhofer <peter.prettenhofer@gmail.com>,\n# Noel Dawe <noel.dawe@gmail.com>\n#\n# License: BSD 3 clause\n\nfrom sklearn import datasets\n\nX, y = datasets.make_hastie_10_2(n_samples=12_000, random_state=1)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Now, we set the hyperparameters for our AdaBoost classifiers.\nBe aware, a learning rate of 1.0 may not be optimal for both SAMME and SAMME.R\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"n_estimators = 400\nlearning_rate = 1.0"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"We split the data into a training and a test set.\nThen, we train our baseline classifiers, a `DecisionTreeClassifier` with `depth=9`\nand a \"stump\" `DecisionTreeClassifier` with `depth=1` and compute the test error.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.model_selection import train_test_split\nfrom sklearn.tree import DecisionTreeClassifier\n\nX_train, X_test, y_train, y_test = train_test_split(\n    X, y, test_size=2_000, shuffle=False\n)\n\ndt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)\ndt_stump.fit(X_train, y_train)\ndt_stump_err = 1.0 - dt_stump.score(X_test, y_test)\n\ndt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)\ndt.fit(X_train, y_train)\ndt_err = 1.0 - dt.score(X_test, y_test)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Adaboost with discrete SAMME and real SAMME.R\nWe now define the discrete and real AdaBoost classifiers\nand fit them to the training set.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"from sklearn.ensemble import AdaBoostClassifier\n\nada_discrete = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME\",\n)\nada_discrete.fit(X_train, y_train)"
 ]
 },
 {
@@ -26,7 +98,50 @@
 },
 "outputs": [],
 "source": [
-"# Author: Peter Prettenhofer <peter.prettenhofer@gmail.com>,\n# Noel Dawe <noel.dawe@gmail.com>\n#\n# License: BSD 3 clause\n\nimport numpy as np\nimport matplotlib.pyplot as plt\n\nfrom sklearn import datasets\nfrom sklearn.tree import DecisionTreeClassifier\nfrom sklearn.metrics import zero_one_loss\nfrom sklearn.ensemble import AdaBoostClassifier\n\n\nn_estimators = 400\n# A learning rate of 1. may not be optimal for both SAMME and SAMME.R\nlearning_rate = 1.0\n\nX, y = datasets.make_hastie_10_2(n_samples=12000, random_state=1)\n\nX_test, y_test = X[2000:], y[2000:]\nX_train, y_train = X[:2000], y[:2000]\n\ndt_stump = DecisionTreeClassifier(max_depth=1, min_samples_leaf=1)\ndt_stump.fit(X_train, y_train)\ndt_stump_err = 1.0 - dt_stump.score(X_test, y_test)\n\ndt = DecisionTreeClassifier(max_depth=9, min_samples_leaf=1)\ndt.fit(X_train, y_train)\ndt_err = 1.0 - dt.score(X_test, y_test)\n\nada_discrete = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME\",\n)\nada_discrete.fit(X_train, y_train)\n\nada_real = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME.R\",\n)\nada_real.fit(X_train, y_train)\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot([1, n_estimators], [dt_stump_err] * 2, \"k-\", label=\"Decision Stump Error\")\nax.plot([1, n_estimators], [dt_err] * 2, \"k--\", label=\"Decision Tree Error\")\n\nada_discrete_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):\n    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)\n\nada_discrete_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):\n    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)\n\nada_real_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_test)):\n    ada_real_err[i] = zero_one_loss(y_pred, y_test)\n\nada_real_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_train)):\n    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)\n\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err,\n    label=\"Discrete AdaBoost Test Error\",\n    color=\"red\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err_train,\n    label=\"Discrete AdaBoost Train Error\",\n    color=\"blue\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err,\n    label=\"Real AdaBoost Test Error\",\n    color=\"orange\",\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err_train,\n    label=\"Real AdaBoost Train Error\",\n    color=\"green\",\n)\n\nax.set_ylim((0.0, 0.5))\nax.set_xlabel(\"n_estimators\")\nax.set_ylabel(\"error rate\")\n\nleg = ax.legend(loc=\"upper right\", fancybox=True)\nleg.get_frame().set_alpha(0.7)\n\nplt.show()"
+"ada_real = AdaBoostClassifier(\n    base_estimator=dt_stump,\n    learning_rate=learning_rate,\n    n_estimators=n_estimators,\n    algorithm=\"SAMME.R\",\n)\nada_real.fit(X_train, y_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"Now, let's compute the test error of the discrete and\nreal AdaBoost classifiers for each new stump in `n_estimators`\nadded to the ensemble.\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import numpy as np\nfrom sklearn.metrics import zero_one_loss\n\nada_discrete_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_test)):\n    ada_discrete_err[i] = zero_one_loss(y_pred, y_test)\n\nada_discrete_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_discrete.staged_predict(X_train)):\n    ada_discrete_err_train[i] = zero_one_loss(y_pred, y_train)\n\nada_real_err = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_test)):\n    ada_real_err[i] = zero_one_loss(y_pred, y_test)\n\nada_real_err_train = np.zeros((n_estimators,))\nfor i, y_pred in enumerate(ada_real.staged_predict(X_train)):\n    ada_real_err_train[i] = zero_one_loss(y_pred, y_train)"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Plotting the results\nFinally, we plot the train and test errors of our baselines\nand of the discrete and real AdaBoost classifiers\n\n"
+]
+},
+{
+"cell_type": "code",
+"execution_count": null,
+"metadata": {
+"collapsed": false
+},
+"outputs": [],
+"source": [
+"import matplotlib.pyplot as plt\nimport seaborn as sns\n\nfig = plt.figure()\nax = fig.add_subplot(111)\n\nax.plot([1, n_estimators], [dt_stump_err] * 2, \"k-\", label=\"Decision Stump Error\")\nax.plot([1, n_estimators], [dt_err] * 2, \"k--\", label=\"Decision Tree Error\")\n\ncolors = sns.color_palette(\"colorblind\")\n\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err,\n    label=\"Discrete AdaBoost Test Error\",\n    color=colors[0],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_discrete_err_train,\n    label=\"Discrete AdaBoost Train Error\",\n    color=colors[1],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err,\n    label=\"Real AdaBoost Test Error\",\n    color=colors[2],\n)\nax.plot(\n    np.arange(n_estimators) + 1,\n    ada_real_err_train,\n    label=\"Real AdaBoost Train Error\",\n    color=colors[4],\n)\n\nax.set_ylim((0.0, 0.5))\nax.set_xlabel(\"Number of weak learners\")\nax.set_ylabel(\"error rate\")\n\nleg = ax.legend(loc=\"upper right\", fancybox=True)\nleg.get_frame().set_alpha(0.7)\n\nplt.show()"
+]
+},
+{
+"cell_type": "markdown",
+"metadata": {},
+"source": [
+"## Concluding remarks\n\nWe observe that the error rate for both train and test sets of real AdaBoost\nis lower than that of discrete AdaBoost.\n\n"
 ]
 }
 ],
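The plotting cell above also swaps hard-coded matplotlib color names ("red", "blue", "orange", "green") for entries of seaborn's "colorblind" palette. A minimal standalone sketch of that pattern (toy data, assuming seaborn is installed; not part of the commit):

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

# sns.color_palette("colorblind") returns a list of RGB tuples, so indexed
# entries (colors[0], colors[1], ...) can be passed directly to `color=`.
colors = sns.color_palette("colorblind")

x = np.linspace(0, 1, 100)
fig, ax = plt.subplots()
ax.plot(x, x, color=colors[0], label="linear")
ax.plot(x, x**2, color=colors[1], label="quadratic")
ax.legend()
plt.show()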

‎dev/_downloads/scikit-learn-docs.zip

3.61 KB
Binary file not shown.

‎dev/_sources/auto_examples/applications/plot_cyclical_feature_engineering.rst.txt

Lines changed: 1 addition & 1 deletion

‎dev/_sources/auto_examples/applications/plot_digits_denoising.rst.txt

Lines changed: 1 addition & 1 deletion

‎dev/_sources/auto_examples/applications/plot_face_recognition.rst.txt

Lines changed: 4 additions & 4 deletions

0 commit comments

Comments
 (0)
