Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitcae3ce0

Browse files
committed
add dimensionality reduction using feature selection tutorial
1 parentccd6b89 commitcae3ce0

File tree

4 files changed

+446
-0
lines changed

4 files changed

+446
-0
lines changed
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,306 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type":"code",
5+
"execution_count":null,
6+
"metadata": {
7+
"id":"iImkWEpRSiRq"
8+
},
9+
"outputs": [],
10+
"source": [
11+
"\n",
12+
"# Load libraries\n",
13+
"import pandas as pd\n",
14+
"import numpy as np\n",
15+
"from sklearn.datasets import load_iris, make_regression\n",
16+
"from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",
17+
"from sklearn.preprocessing import StandardScaler\n",
18+
"import warnings\n",
19+
"from sklearn import datasets, linear_model"
20+
]
21+
},
22+
{
23+
"cell_type":"code",
24+
"execution_count":null,
25+
"metadata": {
26+
"colab": {
27+
"base_uri":"https://localhost:8080/"
28+
},
29+
"id":"ZEK7KAyzSokS",
30+
"outputId":"7ce72382-c116-4f51-df7b-1f975c1c25f8"
31+
},
32+
"outputs": [],
33+
"source": [
34+
"# Variance thresholding on the iris dataset\n",
35+
"# import data\n",
36+
"iris = datasets.load_iris()\n",
37+
"# Create features and target\n",
38+
"features_i = iris.data\n",
39+
"target_i = iris.target\n",
40+
"# thresholder creation\n",
41+
"thresholder = VarianceThreshold(threshold=.4)\n",
42+
"# high variance feature matrix creation\n",
43+
"f_high_variance = thresholder.fit_transform(features_i)\n",
44+
"# View high variance feature matrix\n",
45+
"f_high_variance[0:3]"
46+
]
47+
},
48+
{
49+
"cell_type":"code",
50+
"execution_count":null,
51+
"metadata": {
52+
"colab": {
53+
"base_uri":"https://localhost:8080/"
54+
},
55+
"id":"7ZZgOg1-SpuX",
56+
"outputId":"a869adde-0b29-4630-9661-34377f110d4f"
57+
},
58+
"outputs": [],
59+
"source": [
60+
"# View variances\n",
61+
"thresholder.fit(features_i).variances_"
62+
]
63+
},
64+
{
65+
"cell_type":"code",
66+
"execution_count":null,
67+
"metadata": {
68+
"colab": {
69+
"base_uri":"https://localhost:8080/"
70+
},
71+
"id":"zYNK4wP5Sq9R",
72+
"outputId":"30e18ea5-4b63-43e5-819e-9a99251dfae6"
73+
},
74+
"outputs": [],
75+
"source": [
76+
"\n",
77+
"# feature matrix standardization\n",
78+
"scaler = StandardScaler()\n",
79+
"f_std = scaler.fit_transform(features_i)\n",
80+
"# variance of each feature calculation\n",
81+
"selection = VarianceThreshold()\n",
82+
"selection.fit(f_std).variances_"
83+
]
84+
},
85+
{
86+
"cell_type":"code",
87+
"execution_count":null,
88+
"metadata": {
89+
"colab": {
90+
"base_uri":"https://localhost:8080/"
91+
},
92+
"id":"jDGMP97LSuiB",
93+
"outputId":"c1b9d537-495f-4109-ef75-324fe9943668"
94+
},
95+
"outputs": [],
96+
"source": [
97+
"# feature matrix creation with:\n",
98+
"# for Feature 0: 80% class 0\n",
99+
"# for Feature 1: 80% class 1\n",
100+
"# for Feature 2: 60% class 0, 40% class 1\n",
101+
"features_i = [[0, 1, 0],\n",
102+
"[0, 1, 1],\n",
103+
"[0, 1, 0],\n",
104+
"[0, 1, 1],\n",
105+
"[1, 0, 0]]\n",
106+
"# threshold by variance\n",
107+
"thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",
108+
"thresholding.fit_transform(features_i)"
109+
]
110+
},
111+
{
112+
"cell_type":"code",
113+
"execution_count":null,
114+
"metadata": {
115+
"colab": {
116+
"base_uri":"https://localhost:8080/",
117+
"height":198
118+
},
119+
"id":"JvnObeKXS6xm",
120+
"outputId":"19dac143-9407-4bb4-cc23-b19b06025617"
121+
},
122+
"outputs": [],
123+
"source": [
124+
"# Create feature matrix with two highly correlated features\n",
125+
"features_m = np.array([[1, 1, 1],\n",
126+
"[2, 2, 0],\n",
127+
"[3, 3, 1],\n",
128+
"[4, 4, 0],\n",
129+
"[5, 5, 1],\n",
130+
"[6, 6, 0],\n",
131+
"[7, 7, 1],\n",
132+
"[8, 7, 0],\n",
133+
"[9, 7, 1]])\n",
134+
"# Convert the feature matrix to a DataFrame\n",
135+
"dataframe = pd.DataFrame(features_m)\n",
136+
"# correlation matrix creation\n",
137+
"corr_m = dataframe.corr().abs()\n",
138+
"# upper triangle selection\n",
139+
"upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),\n",
140+
"k=1).astype(bool))\n",
141+
"# Find the feature columns whose correlation with an earlier column exceeds 0.85\n",
142+
"droping = [col for col in upper1.columns if any(upper1[col] > 0.85)]\n",
143+
"# Drop features\n",
144+
"dataframe.drop(dataframe.columns[droping], axis=1).head(3)"
145+
]
146+
},
147+
{
148+
"cell_type":"code",
149+
"execution_count":null,
150+
"metadata": {
151+
"colab": {
152+
"base_uri":"https://localhost:8080/"
153+
},
154+
"id":"Dos1ZfkDS-Zd",
155+
"outputId":"17e96f0d-a55a-4943-90a9-99aa3c31fad3"
156+
},
157+
"outputs": [],
158+
"source": [
159+
"# Load data\n",
160+
"iris_i = load_iris()\n",
161+
"features_v = iris_i.data\n",
162+
"target = iris_i.target\n",
163+
"# categorical data conversion (cast the features to integers)\n",
164+
"features_v = features_v.astype(int)\n",
165+
"# Selection of two features using highest chi-squared\n",
166+
"chi2_s = SelectKBest(chi2, k=2)\n",
167+
"f_kbest = chi2_s.fit_transform(features_v, target)\n",
168+
"# Show results\n",
169+
"print(\"Original number of features:\", features_v.shape[1])\n",
170+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
171+
]
172+
},
173+
{
174+
"cell_type":"code",
175+
"execution_count":null,
176+
"metadata": {
177+
"colab": {
178+
"base_uri":"https://localhost:8080/"
179+
},
180+
"id":"y10u_gQbTCwR",
181+
"outputId":"651182ab-d857-4a3d-db61-4fff866d167c"
182+
},
183+
"outputs": [],
184+
"source": [
185+
"# Selection of two features using highest F-values\n",
186+
"f_selector = SelectKBest(f_classif, k=2)\n",
187+
"f_kbest = f_selector.fit_transform(features_v, target)\n",
188+
"# Display results\n",
189+
"print(\"Original number of features:\", features_v.shape[1])\n",
190+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
191+
]
192+
},
193+
{
194+
"cell_type":"code",
195+
"execution_count":null,
196+
"metadata": {
197+
"colab": {
198+
"base_uri":"https://localhost:8080/"
199+
},
200+
"id":"5NXAa6UKTHiu",
201+
"outputId":"c34866b2-c08c-4020-b14d-78deb98f2834"
202+
},
203+
"outputs": [],
204+
"source": [
205+
"# Selection of top 65% of features\n",
206+
"f_selector = SelectPercentile(f_classif, percentile=65)\n",
207+
"f_kbest = f_selector.fit_transform(features_v, target)\n",
208+
"# Display results\n",
209+
"print(\"Original number of features:\", features_v.shape[1])\n",
210+
"print(\"Reduced number of features:\", f_kbest.shape[1])"
211+
]
212+
},
213+
{
214+
"cell_type":"code",
215+
"execution_count":null,
216+
"metadata": {
217+
"colab": {
218+
"base_uri":"https://localhost:8080/"
219+
},
220+
"id":"39-Wq-F9TKVg",
221+
"outputId":"e52c0537-2245-4f12-ea9a-ace232984ec1"
222+
},
223+
"outputs": [],
224+
"source": [
225+
"# Recursive feature elimination with cross-validation\n",
226+
"# Suppress an annoying but harmless warning\n",
227+
"warnings.filterwarnings(action=\"ignore\", module=\"scipy\",\n",
228+
"message=\"^internal gelsd\")\n",
229+
"# features matrix, target vector, true coefficients\n",
230+
"features_f, target_t = make_regression(n_samples = 10000,\n",
231+
"n_features = 100,\n",
232+
"n_informative = 2,\n",
233+
"random_state = 1)\n",
234+
"# linear regression creation\n",
235+
"ols = linear_model.LinearRegression()\n",
236+
"# Recursive features elimination\n",
237+
"rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",
238+
"rfecv.fit(features_f, target_t)\n",
239+
"rfecv.transform(features_f)"
240+
]
241+
},
242+
{
243+
"cell_type":"code",
244+
"execution_count":null,
245+
"metadata": {
246+
"colab": {
247+
"base_uri":"https://localhost:8080/"
248+
},
249+
"id":"Ut1mgIGEUhJM",
250+
"outputId":"f365a4d5-63f4-4a55-e828-d331e6f06308"
251+
},
252+
"outputs": [],
253+
"source": [
254+
"# Number of best features\n",
255+
"rfecv.n_features_"
256+
]
257+
},
258+
{
259+
"cell_type":"code",
260+
"execution_count":null,
261+
"metadata": {
262+
"colab": {
263+
"base_uri":"https://localhost:8080/"
264+
},
265+
"id":"Lpt7I_Q0UjN1",
266+
"outputId":"4d6938dc-d813-42a5-c1b7-9ba4865a0e86"
267+
},
268+
"outputs": [],
269+
"source": [
270+
"# Which features were selected as the best?\n",
271+
"rfecv.support_"
272+
]
273+
},
274+
{
275+
"cell_type":"code",
276+
"execution_count":null,
277+
"metadata": {
278+
"colab": {
279+
"base_uri":"https://localhost:8080/"
280+
},
281+
"id":"ojYKsEbTUkMu",
282+
"outputId":"98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"
283+
},
284+
"outputs": [],
285+
"source": [
286+
"# We can even see how the features are ranked\n",
287+
"rfecv.ranking_"
288+
]
289+
}
290+
],
291+
"metadata": {
292+
"colab": {
293+
"name":"Untitled42.ipynb",
294+
"provenance": []
295+
},
296+
"kernelspec": {
297+
"display_name":"Python 3",
298+
"name":"python3"
299+
},
300+
"language_info": {
301+
"name":"python"
302+
}
303+
},
304+
"nbformat":4,
305+
"nbformat_minor":0
306+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp