Commit cae3ce0

committed

add dimensionality reduction using feature selection tutorial

1 parent ccd6b89 · commit cae3ce0 · Copy full SHA for cae3ce0

File tree

4 files changed

+446

-0

lines changed

machine-learning/dimensionality-reduction-feature-selection

4 files changed

+446

-0

lines changed

`‎machine-learning/dimensionality-reduction-feature-selection/README.md`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1 @@`
	`1`	`+# [Dimensionality Reduction Using Feature Selection in Python](https://www.thepythoncode.com/article/dimensionality-reduction-feature-selection)`

`‎machine-learning/dimensionality-reduction-feature-selection/dimensionality_reduction_using_feature_selection.ipynb`

Lines changed: 306 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -0,0 +1,306 @@`
	`1`	`+{`
	`2`	`+"cells": [`
	`3`	`+ {`
	`4`	`+"cell_type":"code",`
	`5`	`+"execution_count":null,`
	`6`	`+"metadata": {`
	`7`	`+"id":"iImkWEpRSiRq"`
	`8`	`+ },`
	`9`	`+"outputs": [],`
	`10`	`+"source": [`
	`11`	`+"\n",`
	`12`	`+"# Load libraries\n",`
	`13`	`+"import pandas as pd\n",`
	`14`	`+"import numpy as np\n",`
	`15`	`+"from sklearn.datasets import load_iris, make_regression\n",`
	`16`	`+"from sklearn.feature_selection import SelectKBest, chi2, f_classif, SelectPercentile, VarianceThreshold, RFECV\n",`
	`17`	`+"from sklearn.preprocessing import StandardScaler\n",`
	`18`	`+"import warnings\n",`
	`19`	`+"from sklearn import datasets, linear_model"`
	`20`	`+ ]`
	`21`	`+ },`
	`22`	`+ {`
	`23`	`+"cell_type":"code",`
	`24`	`+"execution_count":null,`
	`25`	`+"metadata": {`
	`26`	`+"colab": {`
	`27`	`+"base_uri":"https://localhost:8080/"`
	`28`	`+ },`
	`29`	`+"id":"ZEK7KAyzSokS",`
	`30`	`+"outputId":"7ce72382-c116-4f51-df7b-1f975c1c25f8"`
	`31`	`+ },`
	`32`	`+"outputs": [],`
	`33`	`+"source": [`
	`34`	`+"# Load libraries\n",`
	`35`	`+"# import data\n",`
	`36`	`+"iris = datasets.load_iris()\n",`
	`37`	`+"# Create features and target\n",`
	`38`	`+"features_i = iris.data\n",`
	`39`	`+"target_i = iris.target\n",`
	`40`	`+"# thresholder creation\n",`
	`41`	`+"thresholder = VarianceThreshold(threshold=.4)\n",`
	`42`	`+"# high variance feature matrix creation\n",`
	`43`	`+"f_high_variance = thresholder.fit_transform(features_i)\n",`
	`44`	`+"# View high variance feature matrix\n",`
	`45`	`+"f_high_variance[0:3]"`
	`46`	`+ ]`
	`47`	`+ },`
	`48`	`+ {`
	`49`	`+"cell_type":"code",`
	`50`	`+"execution_count":null,`
	`51`	`+"metadata": {`
	`52`	`+"colab": {`
	`53`	`+"base_uri":"https://localhost:8080/"`
	`54`	`+ },`
	`55`	`+"id":"7ZZgOg1-SpuX",`
	`56`	`+"outputId":"a869adde-0b29-4630-9661-34377f110d4f"`
	`57`	`+ },`
	`58`	`+"outputs": [],`
	`59`	`+"source": [`
	`60`	`+"# View variances\n",`
	`61`	`+"thresholder.fit(features_i).variances_"`
	`62`	`+ ]`
	`63`	`+ },`
	`64`	`+ {`
	`65`	`+"cell_type":"code",`
	`66`	`+"execution_count":null,`
	`67`	`+"metadata": {`
	`68`	`+"colab": {`
	`69`	`+"base_uri":"https://localhost:8080/"`
	`70`	`+ },`
	`71`	`+"id":"zYNK4wP5Sq9R",`
	`72`	`+"outputId":"30e18ea5-4b63-43e5-819e-9a99251dfae6"`
	`73`	`+ },`
	`74`	`+"outputs": [],`
	`75`	`+"source": [`
	`76`	`+"\n",`
	`77`	`+"# feature matrix standardization\n",`
	`78`	`+"scaler = StandardScaler()\n",`
	`79`	`+"f_std = scaler.fit_transform(features_i)\n",`
	`80`	`+"# variance of each feature calculation\n",`
	`81`	`+"selection = VarianceThreshold()\n",`
	`82`	`+"selection.fit(f_std).variances_"`
	`83`	`+ ]`
	`84`	`+ },`
	`85`	`+ {`
	`86`	`+"cell_type":"code",`
	`87`	`+"execution_count":null,`
	`88`	`+"metadata": {`
	`89`	`+"colab": {`
	`90`	`+"base_uri":"https://localhost:8080/"`
	`91`	`+ },`
	`92`	`+"id":"jDGMP97LSuiB",`
	`93`	`+"outputId":"c1b9d537-495f-4109-ef75-324fe9943668"`
	`94`	`+ },`
	`95`	`+"outputs": [],`
	`96`	`+"source": [`
	`97`	`+"# feature matrix creation with:\n",`
	`98`	`+"# for Feature 0: 80% class 0\n",`
	`99`	`+"# for Feature 1: 80% class 1\n",`
	`100`	`+"# for Feature 2: 60% class 0, 40% class 1\n",`
	`101`	`+"features_i = [[0, 2, 0],\n",`
	`102`	`+"[0, 1, 1],\n",`
	`103`	`+"[0, 1, 0],\n",`
	`104`	`+"[0, 1, 1],\n",`
	`105`	`+"[1, 0, 0]]\n",`
	`106`	`+"# threshold by variance\n",`
	`107`	`+"thresholding = VarianceThreshold(threshold=(.65 * (1 - .65)))\n",`
	`108`	`+"thresholding.fit_transform(features_i)"`
	`109`	`+ ]`
	`110`	`+ },`
	`111`	`+ {`
	`112`	`+"cell_type":"code",`
	`113`	`+"execution_count":null,`
	`114`	`+"metadata": {`
	`115`	`+"colab": {`
	`116`	`+"base_uri":"https://localhost:8080/",`
	`117`	`+"height":198`
	`118`	`+ },`
	`119`	`+"id":"JvnObeKXS6xm",`
	`120`	`+"outputId":"19dac143-9407-4bb4-cc23-b19b06025617"`
	`121`	`+ },`
	`122`	`+"outputs": [],`
	`123`	`+"source": [`
	`124`	`+"# Create feature matrix with two highly correlated features\n",`
	`125`	`+"features_m = np.array([[1, 1, 1],\n",`
	`126`	`+"[2, 2, 0],\n",`
	`127`	`+"[3, 3, 1],\n",`
	`128`	`+"[4, 4, 0],\n",`
	`129`	`+"[5, 5, 1],\n",`
	`130`	`+"[6, 6, 0],\n",`
	`131`	`+"[7, 7, 1],\n",`
	`132`	`+"[8, 7, 0],\n",`
	`133`	`+"[9, 7, 1]])\n",`
	`134`	`+"# Conversion of feature matrix\n",`
	`135`	`+"dataframe = pd.DataFrame(features_m)\n",`
	`136`	`+"# correlation matrix creation\n",`
	`137`	`+"corr_m = dataframe.corr().abs()\n",`
	`138`	`+"# upper triangle selection\n",`
	`139`	`+"upper1 = corr_m.where(np.triu(np.ones(corr_m.shape),\n",`
	`140`	`+"k=1).astype(np.bool))\n",`
	`141`	`+"# Find the feature columns whose correlation is greater than 0.85\n",`
	`142`	`+"droping = [col for col in upper1.columns if any(upper1[col] > 0.85)]\n",`
	`143`	`+"# Drop features\n",`
	`144`	`+"dataframe.drop(dataframe.columns[droping], axis=1).head(3)"`
	`145`	`+ ]`
	`146`	`+ },`
	`147`	`+ {`
	`148`	`+"cell_type":"code",`
	`149`	`+"execution_count":null,`
	`150`	`+"metadata": {`
	`151`	`+"colab": {`
	`152`	`+"base_uri":"https://localhost:8080/"`
	`153`	`+ },`
	`154`	`+"id":"Dos1ZfkDS-Zd",`
	`155`	`+"outputId":"17e96f0d-a55a-4943-90a9-99aa3c31fad3"`
	`156`	`+ },`
	`157`	`+"outputs": [],`
	`158`	`+"source": [`
	`159`	`+"# Load data\n",`
	`160`	`+"iris_i = load_iris()\n",`
	`161`	`+"features_v = iris.data\n",`
	`162`	`+"target = iris.target\n",`
	`163`	`+"# categorical data conversion\n",`
	`164`	`+"features_v = features_v.astype(int)\n",`
	`165`	`+"# Selection of two features using highest chi-squared\n",`
	`166`	`+"chi2_s = SelectKBest(chi2, k=2)\n",`
	`167`	`+"f_kbest = chi2_s.fit_transform(features_v, target)\n",`
	`168`	`+"# Show results\n",`
	`169`	`+"print(\"Original number of features:\", features_v.shape[1])\n",`
	`170`	`+"print(\"Reduced number of features:\", f_kbest.shape[1])"`
	`171`	`+ ]`
	`172`	`+ },`
	`173`	`+ {`
	`174`	`+"cell_type":"code",`
	`175`	`+"execution_count":null,`
	`176`	`+"metadata": {`
	`177`	`+"colab": {`
	`178`	`+"base_uri":"https://localhost:8080/"`
	`179`	`+ },`
	`180`	`+"id":"y10u_gQbTCwR",`
	`181`	`+"outputId":"651182ab-d857-4a3d-db61-4fff866d167c"`
	`182`	`+ },`
	`183`	`+"outputs": [],`
	`184`	`+"source": [`
	`185`	`+"# Selection of two features using highest F-values\n",`
	`186`	`+"f_selector = SelectKBest(f_classif, k=2)\n",`
	`187`	`+"f_kbest = f_selector.fit_transform(features_v, target)\n",`
	`188`	`+"# Display results\n",`
	`189`	`+"print(\"Original number of features:\", features_v.shape[1])\n",`
	`190`	`+"print(\"Reduced number of features:\", f_kbest.shape[1])"`
	`191`	`+ ]`
	`192`	`+ },`
	`193`	`+ {`
	`194`	`+"cell_type":"code",`
	`195`	`+"execution_count":null,`
	`196`	`+"metadata": {`
	`197`	`+"colab": {`
	`198`	`+"base_uri":"https://localhost:8080/"`
	`199`	`+ },`
	`200`	`+"id":"5NXAa6UKTHiu",`
	`201`	`+"outputId":"c34866b2-c08c-4020-b14d-78deb98f2834"`
	`202`	`+ },`
	`203`	`+"outputs": [],`
	`204`	`+"source": [`
	`205`	`+"# Selection of top 65% of features\n",`
	`206`	`+"f_selector = SelectPercentile(f_classif, percentile=65)\n",`
	`207`	`+"f_kbest = f_selector.fit_transform(features_v, target)\n",`
	`208`	`+"# Display results\n",`
	`209`	`+"print(\"Original number of features:\", features_v.shape[1])\n",`
	`210`	`+"print(\"Reduced number of features:\", f_kbest.shape[1])"`
	`211`	`+ ]`
	`212`	`+ },`
	`213`	`+ {`
	`214`	`+"cell_type":"code",`
	`215`	`+"execution_count":null,`
	`216`	`+"metadata": {`
	`217`	`+"colab": {`
	`218`	`+"base_uri":"https://localhost:8080/"`
	`219`	`+ },`
	`220`	`+"id":"39-Wq-F9TKVg",`
	`221`	`+"outputId":"e52c0537-2245-4f12-ea9a-ace232984ec1"`
	`222`	`+ },`
	`223`	`+"outputs": [],`
	`224`	`+"source": [`
	`225`	`+"# Load libraries\n",`
	`226`	`+"# Suppress an annoying but harmless warning\n",`
	`227`	`+"warnings.filterwarnings(action=\"ignore\", module=\"scipy\",\n",`
	`228`	`+"message=\"^internal gelsd\")\n",`
	`229`	`+"# features matrix, target vector, true coefficients\n",`
	`230`	`+"features_f, target_t = make_regression(n_samples = 10000,\n",`
	`231`	`+"n_features = 100,\n",`
	`232`	`+"n_informative = 2,\n",`
	`233`	`+"random_state = 1)\n",`
	`234`	`+"# linear regression creation\n",`
	`235`	`+"ols = linear_model.LinearRegression()\n",`
	`236`	`+"# Recursive features elimination\n",`
	`237`	`+"rfecv = RFECV(estimator=ols, step=2, scoring=\"neg_mean_squared_error\")\n",`
	`238`	`+"rfecv.fit(features_f, target_t)\n",`
	`239`	`+"rfecv.transform(features_f)"`
	`240`	`+ ]`
	`241`	`+ },`
	`242`	`+ {`
	`243`	`+"cell_type":"code",`
	`244`	`+"execution_count":null,`
	`245`	`+"metadata": {`
	`246`	`+"colab": {`
	`247`	`+"base_uri":"https://localhost:8080/"`
	`248`	`+ },`
	`249`	`+"id":"Ut1mgIGEUhJM",`
	`250`	`+"outputId":"f365a4d5-63f4-4a55-e828-d331e6f06308"`
	`251`	`+ },`
	`252`	`+"outputs": [],`
	`253`	`+"source": [`
	`254`	`+"# Number of best features\n",`
	`255`	`+"rfecv.n_features_"`
	`256`	`+ ]`
	`257`	`+ },`
	`258`	`+ {`
	`259`	`+"cell_type":"code",`
	`260`	`+"execution_count":null,`
	`261`	`+"metadata": {`
	`262`	`+"colab": {`
	`263`	`+"base_uri":"https://localhost:8080/"`
	`264`	`+ },`
	`265`	`+"id":"Lpt7I_Q0UjN1",`
	`266`	`+"outputId":"4d6938dc-d813-42a5-c1b7-9ba4865a0e86"`
	`267`	`+ },`
	`268`	`+"outputs": [],`
	`269`	`+"source": [`
	`270`	`+"# Which features are the best?\n",`
	`271`	`+"rfecv.support_"`
	`272`	`+ ]`
	`273`	`+ },`
	`274`	`+ {`
	`275`	`+"cell_type":"code",`
	`276`	`+"execution_count":null,`
	`277`	`+"metadata": {`
	`278`	`+"colab": {`
	`279`	`+"base_uri":"https://localhost:8080/"`
	`280`	`+ },`
	`281`	`+"id":"ojYKsEbTUkMu",`
	`282`	`+"outputId":"98652d92-f58f-41fe-9ba1-b1ecd3ef7ecb"`
	`283`	`+ },`
	`284`	`+"outputs": [],`
	`285`	`+"source": [`
	`286`	`+"# We can even see how the features are ranked\n",`
	`287`	`+"rfecv.ranking_"`
	`288`	`+ ]`
	`289`	`+ }`
	`290`	`+ ],`
	`291`	`+"metadata": {`
	`292`	`+"colab": {`
	`293`	`+"name":"Untitled42.ipynb",`
	`294`	`+"provenance": []`
	`295`	`+ },`
	`296`	`+"kernelspec": {`
	`297`	`+"display_name":"Python 3",`
	`298`	`+"name":"python3"`
	`299`	`+ },`
	`300`	`+"language_info": {`
	`301`	`+"name":"python"`
	`302`	`+ }`
	`303`	`+ },`
	`304`	`+"nbformat":4,`
	`305`	`+"nbformat_minor":0`
	`306`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit cae3ce0

File tree

4 files changed

4 files changed

`‎machine-learning/dimensionality-reduction-feature-selection/README.md`

`‎machine-learning/dimensionality-reduction-feature-selection/dimensionality_reduction_using_feature_selection.ipynb`

0 commit comments