
Commit 6e7adb7

Pushing the docs to dev/ for branch: master, commit 0b6308c2708fe03071cdbf24997eb967403f5965

1 parent c63e022; commit 6e7adb7

File tree

1,081 files changed: +4,644 / -3,641 lines changed

Two binary files changed (-1.5 KB and -1.44 KB); contents not shown.
Lines changed: 54 additions & 0 deletions
@@ -0,0 +1,54 @@
+{
+  "cells": [
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "%matplotlib inline"
+      ]
+    },
+    {
+      "cell_type": "markdown",
+      "metadata": {},
+      "source": [
+        "\n# Column Transformer with Heterogeneous Data Sources\n\n\nDatasets can often contain components that require different feature\nextraction and processing pipelines. This scenario might occur when:\n\n1. Your dataset consists of heterogeneous data types (e.g. raster images and\n   text captions)\n2. Your dataset is stored in a Pandas DataFrame and different columns\n   require different processing pipelines.\n\nThis example demonstrates how to use\n:class:`sklearn.compose.ColumnTransformer` on a dataset containing\ndifferent types of features. We use the 20-newsgroups dataset and compute\nstandard bag-of-words features for the subject line and body in separate\npipelines as well as ad hoc features on the body. We combine them (with\nweights) using a ColumnTransformer and finally train a classifier on the\ncombined set of features.\n\nThe choice of features is not particularly helpful, but serves to illustrate\nthe technique.\n\n"
+      ]
+    },
+    {
+      "cell_type": "code",
+      "execution_count": null,
+      "metadata": {
+        "collapsed": false
+      },
+      "outputs": [],
+      "source": [
+        "# Author: Matt Terry <matt.terry@gmail.com>\n#\n# License: BSD 3 clause\nfrom __future__ import print_function\n\nimport numpy as np\n\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.svm import SVC\n\n\nclass TextStats(BaseEstimator, TransformerMixin):\n    \"\"\"Extract features from each document for DictVectorizer\"\"\"\n\n    def fit(self, x, y=None):\n        return self\n\n    def transform(self, posts):\n        return [{'length': len(text),\n                 'num_sentences': text.count('.')}\n                for text in posts]\n\n\nclass SubjectBodyExtractor(BaseEstimator, TransformerMixin):\n    \"\"\"Extract the subject & body from a usenet post in a single pass.\n\n    Takes a sequence of strings and produces an (n_samples, 2) object\n    array with the subject in the first column and the body in the second.\n    \"\"\"\n    def fit(self, x, y=None):\n        return self\n\n    def transform(self, posts):\n        # construct object dtype array with two columns\n        # first column = 'subject' and second column = 'body'\n        features = np.empty(shape=(len(posts), 2), dtype=object)\n        for i, text in enumerate(posts):\n            headers, _, bod = text.partition('\\n\\n')\n            bod = strip_newsgroup_footer(bod)\n            bod = strip_newsgroup_quoting(bod)\n            features[i, 1] = bod\n\n            prefix = 'Subject:'\n            sub = ''\n            for line in headers.split('\\n'):\n                if line.startswith(prefix):\n                    sub = line[len(prefix):]\n                    break\n            features[i, 0] = sub\n\n        return features\n\n\npipeline = Pipeline([\n    # Extract the subject & body\n    ('subjectbody', SubjectBodyExtractor()),\n\n    # Use ColumnTransformer to combine the features from subject and body\n    ('union', ColumnTransformer(\n        [\n            # Pulling features from the post's subject line (first column)\n            ('subject', TfidfVectorizer(min_df=50), 0),\n\n            # Pipeline for standard bag-of-words model for body (second column)\n            ('body_bow', Pipeline([\n                ('tfidf', TfidfVectorizer()),\n                ('best', TruncatedSVD(n_components=50)),\n            ]), 1),\n\n            # Pipeline for pulling ad hoc features from post's body\n            ('body_stats', Pipeline([\n                ('stats', TextStats()),  # returns a list of dicts\n                ('vect', DictVectorizer()),  # list of dicts -> feature matrix\n            ]), 1),\n        ],\n\n        # weight components in ColumnTransformer\n        transformer_weights={\n            'subject': 0.8,\n            'body_bow': 0.5,\n            'body_stats': 1.0,\n        }\n    )),\n\n    # Use a SVC classifier on the combined features\n    ('svc', SVC(kernel='linear')),\n])\n\n# limit the list of categories to make running this example faster.\ncategories = ['alt.atheism', 'talk.religion.misc']\ntrain = fetch_20newsgroups(random_state=1,\n                           subset='train',\n                           categories=categories,\n                           )\ntest = fetch_20newsgroups(random_state=1,\n                          subset='test',\n                          categories=categories,\n                          )\n\npipeline.fit(train.data, train.target)\ny = pipeline.predict(test.data)\nprint(classification_report(test.target, y))"
+      ]
+    }
+  ],
+  "metadata": {
+    "kernelspec": {
+      "display_name": "Python 3",
+      "language": "python",
+      "name": "python3"
+    },
+    "language_info": {
+      "codemirror_mode": {
+        "name": "ipython",
+        "version": 3
+      },
+      "file_extension": ".py",
+      "mimetype": "text/x-python",
+      "name": "python",
+      "nbconvert_exporter": "python",
+      "pygments_lexer": "ipython3",
+      "version": "3.6.5"
+    }
+  },
+  "nbformat": 4,
+  "nbformat_minor": 0
+}
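As an aside (not part of the commit), the core move in the notebook's `SubjectBodyExtractor` is turning raw posts into an `(n_samples, 2)` object array so that `ColumnTransformer` can later pick features out by column index. A minimal standalone sketch of that splitting logic, with `split_posts` as a hypothetical helper name:

```python
import numpy as np


def split_posts(posts):
    """Split raw usenet-style posts into a (n_samples, 2) object array:
    column 0 holds the subject line, column 1 holds the body."""
    features = np.empty(shape=(len(posts), 2), dtype=object)
    for i, text in enumerate(posts):
        # headers and body are separated by the first blank line
        headers, _, body = text.partition("\n\n")
        subject = ""
        for line in headers.split("\n"):
            if line.startswith("Subject:"):
                subject = line[len("Subject:"):]
                break
        features[i, 0] = subject
        features[i, 1] = body
    return features


posts = ["Subject: hi\nFrom: a@example.com\n\nbody text"]
out = split_posts(posts)
print(out[0, 0].strip(), "|", out[0, 1])  # hi | body text
```

This omits the `strip_newsgroup_footer`/`strip_newsgroup_quoting` cleanup that the real example applies to the body, but the array layout is the same.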

dev/_downloads/hetero_feature_union.py renamed to dev/_downloads/column_transformer.py

Lines changed: 22 additions & 67 deletions
@@ -1,7 +1,7 @@
 """
-=============================================
-Feature Union with Heterogeneous Data Sources
-=============================================
+==================================================
+Column Transformer with Heterogeneous Data Sources
+==================================================
 
 Datasets can often contain components that require different feature
 extraction and processing pipelines. This scenario might occur when:
@@ -12,12 +12,12 @@
    require different processing pipelines.
 
 This example demonstrates how to use
-:class:`sklearn.feature_extraction.FeatureUnion` on a dataset containing
+:class:`sklearn.compose.ColumnTransformer` on a dataset containing
 different types of features. We use the 20-newsgroups dataset and compute
 standard bag-of-words features for the subject line and body in separate
 pipelines as well as ad hoc features on the body. We combine them (with
-weights) using a FeatureUnion and finally train a classifier on the combined
-set of features.
+weights) using a ColumnTransformer and finally train a classifier on the
+combined set of features.
 
 The choice of features is not particularly helpful, but serves to illustrate
 the technique.
@@ -38,50 +38,11 @@
 from sklearn.feature_extraction import DictVectorizer
 from sklearn.feature_extraction.text import TfidfVectorizer
 from sklearn.metrics import classification_report
-from sklearn.pipeline import FeatureUnion
 from sklearn.pipeline import Pipeline
+from sklearn.compose import ColumnTransformer
 from sklearn.svm import SVC
 
 
-class ItemSelector(BaseEstimator, TransformerMixin):
-    """For data grouped by feature, select subset of data at a provided key.
-
-    The data is expected to be stored in a 2D data structure, where the first
-    index is over features and the second is over samples.  i.e.
-
-    >> len(data[key]) == n_samples
-
-    Please note that this is the opposite convention to scikit-learn feature
-    matrixes (where the first index corresponds to sample).
-
-    ItemSelector only requires that the collection implement getitem
-    (data[key]).  Examples include: a dict of lists, 2D numpy array, Pandas
-    DataFrame, numpy record array, etc.
-
-    >> data = {'a': [1, 5, 2, 5, 2, 8],
-               'b': [9, 4, 1, 4, 1, 3]}
-    >> ds = ItemSelector(key='a')
-    >> data['a'] == ds.transform(data)
-
-    ItemSelector is not designed to handle data grouped by sample.  (e.g. a
-    list of dicts).  If your data is structured this way, consider a
-    transformer along the lines of `sklearn.feature_extraction.DictVectorizer`.
-
-    Parameters
-    ----------
-    key : hashable, required
-        The key corresponding to the desired value in a mappable.
-    """
-    def __init__(self, key):
-        self.key = key
-
-    def fit(self, x, y=None):
-        return self
-
-    def transform(self, data_dict):
-        return data_dict[self.key]
-
-
 class TextStats(BaseEstimator, TransformerMixin):
     """Extract features from each document for DictVectorizer"""
 
@@ -104,21 +65,22 @@ def fit(self, x, y=None):
         return self
 
     def transform(self, posts):
-        features = np.recarray(shape=(len(posts),),
-                               dtype=[('subject', object), ('body', object)])
+        # construct object dtype array with two columns
+        # first column = 'subject' and second column = 'body'
+        features = np.empty(shape=(len(posts), 2), dtype=object)
         for i, text in enumerate(posts):
             headers, _, bod = text.partition('\n\n')
             bod = strip_newsgroup_footer(bod)
             bod = strip_newsgroup_quoting(bod)
-            features['body'][i] = bod
+            features[i, 1] = bod
 
             prefix = 'Subject:'
             sub = ''
             for line in headers.split('\n'):
                 if line.startswith(prefix):
                     sub = line[len(prefix):]
                     break
-            features['subject'][i] = sub
+            features[i, 0] = sub
 
         return features
 
@@ -127,38 +89,31 @@ def transform(self, posts):
     # Extract the subject & body
     ('subjectbody', SubjectBodyExtractor()),
 
-    # Use FeatureUnion to combine the features from subject and body
-    ('union', FeatureUnion(
-        transformer_list=[
+    # Use ColumnTransformer to combine the features from subject and body
+    ('union', ColumnTransformer(
+        [
+            # Pulling features from the post's subject line (first column)
+            ('subject', TfidfVectorizer(min_df=50), 0),
 
-            # Pipeline for pulling features from the post's subject line
-            ('subject', Pipeline([
-                ('selector', ItemSelector(key='subject')),
-                ('tfidf', TfidfVectorizer(min_df=50)),
-            ])),
-
-            # Pipeline for standard bag-of-words model for body
+            # Pipeline for standard bag-of-words model for body (second column)
             ('body_bow', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('tfidf', TfidfVectorizer()),
                 ('best', TruncatedSVD(n_components=50)),
-            ])),
+            ]), 1),
 
             # Pipeline for pulling ad hoc features from post's body
             ('body_stats', Pipeline([
-                ('selector', ItemSelector(key='body')),
                 ('stats', TextStats()),  # returns a list of dicts
                 ('vect', DictVectorizer()),  # list of dicts -> feature matrix
-            ])),
-
+            ]), 1),
         ],
 
-        # weight components in FeatureUnion
+        # weight components in ColumnTransformer
         transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
-        },
+        }
     )),
 
     # Use a SVC classifier on the combined features
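The diff above hinges on `ColumnTransformer`'s `(name, transformer, column)` triples and `transformer_weights`, which replace the old `ItemSelector`-plus-`FeatureUnion` plumbing. A minimal self-contained sketch of the same mechanism on toy data (not the 20-newsgroups example; `CountVectorizer` stands in for `TfidfVectorizer` so it runs without downloads):

```python
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Two "posts", already split into [subject, body] columns,
# mirroring what SubjectBodyExtractor produces.
X = np.array([["hello world", "longer body text here"],
              ["hello again", "short body"]], dtype=object)

ct = ColumnTransformer(
    [
        # (name, transformer, column): each transformer sees only X[:, column]
        ("subject", CountVectorizer(), 0),
        ("body", CountVectorizer(), 1),
    ],
    # each block's output is multiplied by its weight before stacking
    transformer_weights={"subject": 2.0, "body": 0.5},
)

Xt = ct.fit_transform(X)
print(Xt.shape)  # 3 subject tokens + 5 body tokens -> 8 columns
```

The fitted blocks are stacked horizontally, so the subject vocabulary and body vocabulary occupy disjoint column ranges of `Xt`, and the weights simply scale each range.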

dev/_downloads/hetero_feature_union.ipynb

Lines changed: 0 additions & 54 deletions
This file was deleted.

dev/_downloads/scikit-learn-docs.pdf

-133 KB; binary file not shown.

dev/_images/iris.png and other binary images under dev/_images/ changed (small size deltas only; contents not shown).

0 commit comments

