Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit ad8de25

Browse files
committed
Pushing the docs to dev/ for branch: master, commit
1 parent 08b2a01 commit ad8de25

File tree

3,287 files changed

+61755
-22235
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

3,287 files changed

+61755
-22235
lines changed

‎dev/.buildinfo

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
11
# Sphinx build info version 1
22
# This file hashes the configuration used when building these files. When it is not found, a full rebuild will be done.
3-
config:7f91379cee1de94d73196b0aacaf2c24
3+
config:c6497fb6e649c6f20da8e1e78e49ebd1
44
tags: 645f666f9bcd5a90fca523b33c5a78b7
-9.24 KB
Binary file not shown.
-9.43 KB
Binary file not shown.
Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,54 @@
1+
{
2+
"cells": [
3+
{
4+
"cell_type":"code",
5+
"execution_count":null,
6+
"metadata": {
7+
"collapsed":false
8+
},
9+
"outputs": [],
10+
"source": [
11+
"%matplotlib inline"
12+
]
13+
},
14+
{
15+
"cell_type":"markdown",
16+
"metadata": {},
17+
"source": [
18+
"\n# Column Transformer with Heterogeneous Data Sources\n\n\nDatasets can often contain components of that require different feature\nextraction and processing pipelines. This scenario might occur when:\n\n1. Your dataset consists of heterogeneous data types (e.g. raster images and\n text captions)\n2. Your dataset is stored in a Pandas DataFrame and different columns\n require different processing pipelines.\n\nThis example demonstrates how to use\n:class:`sklearn.compose.ColumnTransformer` on a dataset containing\ndifferent types of features. We use the 20-newsgroups dataset and compute\nstandard bag-of-words features for the subject line and body in separate\npipelines as well as ad hoc features on the body. We combine them (with\nweights) using a ColumnTransformer and finally train a classifier on the\ncombined set of features.\n\nThe choice of features is not particularly helpful, but serves to illustrate\nthe technique.\n\n"
19+
]
20+
},
21+
{
22+
"cell_type":"code",
23+
"execution_count":null,
24+
"metadata": {
25+
"collapsed":false
26+
},
27+
"outputs": [],
28+
"source": [
29+
"# Author: Matt Terry <matt.terry@gmail.com>\n#\n# License: BSD 3 clause\nfrom __future__ import print_function\n\nimport numpy as np\n\nfrom sklearn.base import BaseEstimator, TransformerMixin\nfrom sklearn.datasets import fetch_20newsgroups\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_footer\nfrom sklearn.datasets.twenty_newsgroups import strip_newsgroup_quoting\nfrom sklearn.decomposition import TruncatedSVD\nfrom sklearn.feature_extraction import DictVectorizer\nfrom sklearn.feature_extraction.text import TfidfVectorizer\nfrom sklearn.metrics import classification_report\nfrom sklearn.pipeline import Pipeline\nfrom sklearn.compose import ColumnTransformer\nfrom sklearn.svm import SVC\n\n\nclass TextStats(BaseEstimator, TransformerMixin):\n \"\"\"Extract features from each document for DictVectorizer\"\"\"\n\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n return [{'length': len(text),\n 'num_sentences': text.count('.')}\n for text in posts]\n\n\nclass SubjectBodyExtractor(BaseEstimator, TransformerMixin):\n \"\"\"Extract the subject & body from a usenet post in a single pass.\n\n Takes a sequence of strings and produces a dict of sequences. 
Keys are\n `subject` and `body`.\n \"\"\"\n def fit(self, x, y=None):\n return self\n\n def transform(self, posts):\n # construct object dtype array with two columns\n # first column = 'subject' and second column = 'body'\n features = np.empty(shape=(len(posts), 2), dtype=object)\n for i, text in enumerate(posts):\n headers, _, bod = text.partition('\\n\\n')\n bod = strip_newsgroup_footer(bod)\n bod = strip_newsgroup_quoting(bod)\n features[i, 1] = bod\n\n prefix = 'Subject:'\n sub = ''\n for line in headers.split('\\n'):\n if line.startswith(prefix):\n sub = line[len(prefix):]\n break\n features[i, 0] = sub\n\n return features\n\n\npipeline = Pipeline([\n # Extract the subject & body\n ('subjectbody', SubjectBodyExtractor()),\n\n # Use C toolumnTransformer to combine the features from subject and body\n ('union', ColumnTransformer(\n [\n # Pulling features from the post's subject line (first column)\n ('subject', TfidfVectorizer(min_df=50), 0),\n\n # Pipeline for standard bag-of-words model for body (second column)\n ('body_bow', Pipeline([\n ('tfidf', TfidfVectorizer()),\n ('best', TruncatedSVD(n_components=50)),\n ]), 1),\n\n # Pipeline for pulling ad hoc features from post's body\n ('body_stats', Pipeline([\n ('stats', TextStats()), # returns a list of dicts\n ('vect', DictVectorizer()), # list of dicts -> feature matrix\n ]), 1),\n ],\n\n # weight components in ColumnTransformer\n transformer_weights={\n 'subject': 0.8,\n 'body_bow': 0.5,\n 'body_stats': 1.0,\n }\n )),\n\n # Use a SVC classifier on the combined features\n ('svc', SVC(kernel='linear')),\n])\n\n# limit the list of categories to make running this example faster.\ncategories = ['alt.atheism', 'talk.religion.misc']\ntrain = fetch_20newsgroups(random_state=1,\n subset='train',\n categories=categories,\n )\ntest = fetch_20newsgroups(random_state=1,\n subset='test',\n categories=categories,\n )\n\npipeline.fit(train.data, train.target)\ny = pipeline.predict(test.data)\nprint(classification_report(y, 
test.target))"
30+
]
31+
}
32+
],
33+
"metadata": {
34+
"kernelspec": {
35+
"display_name":"Python 3",
36+
"language":"python",
37+
"name":"python3"
38+
},
39+
"language_info": {
40+
"codemirror_mode": {
41+
"name":"ipython",
42+
"version":3
43+
},
44+
"file_extension":".py",
45+
"mimetype":"text/x-python",
46+
"name":"python",
47+
"nbconvert_exporter":"python",
48+
"pygments_lexer":"ipython3",
49+
"version":"3.6.5"
50+
}
51+
},
52+
"nbformat":4,
53+
"nbformat_minor":0
54+
}

‎dev/_downloads/column_transformer.py

Lines changed: 136 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,136 @@
1+
"""
2+
==================================================
3+
Column Transformer with Heterogeneous Data Sources
4+
==================================================
5+
6+
Datasets can often contain components of that require different feature
7+
extraction and processing pipelines. This scenario might occur when:
8+
9+
1. Your dataset consists of heterogeneous data types (e.g. raster images and
10+
text captions)
11+
2. Your dataset is stored in a Pandas DataFrame and different columns
12+
require different processing pipelines.
13+
14+
This example demonstrates how to use
15+
:class:`sklearn.compose.ColumnTransformer` on a dataset containing
16+
different types of features. We use the 20-newsgroups dataset and compute
17+
standard bag-of-words features for the subject line and body in separate
18+
pipelines as well as ad hoc features on the body. We combine them (with
19+
weights) using a ColumnTransformer and finally train a classifier on the
20+
combined set of features.
21+
22+
The choice of features is not particularly helpful, but serves to illustrate
23+
the technique.
24+
"""
25+
26+
# Author: Matt Terry <matt.terry@gmail.com>
27+
#
28+
# License: BSD 3 clause
29+
from __future__importprint_function
30+
31+
importnumpyasnp
32+
33+
fromsklearn.baseimportBaseEstimator,TransformerMixin
34+
fromsklearn.datasetsimportfetch_20newsgroups
35+
fromsklearn.datasets.twenty_newsgroupsimportstrip_newsgroup_footer
36+
fromsklearn.datasets.twenty_newsgroupsimportstrip_newsgroup_quoting
37+
fromsklearn.decompositionimportTruncatedSVD
38+
fromsklearn.feature_extractionimportDictVectorizer
39+
fromsklearn.feature_extraction.textimportTfidfVectorizer
40+
fromsklearn.metricsimportclassification_report
41+
fromsklearn.pipelineimportPipeline
42+
fromsklearn.composeimportColumnTransformer
43+
fromsklearn.svmimportSVC
44+
45+
46+
class TextStats(BaseEstimator, TransformerMixin):
    """Extract simple text statistics from each document for DictVectorizer.

    For every post, emits a dict with the raw character length and an
    approximate sentence count (number of ``'.'`` characters), suitable as
    input to :class:`sklearn.feature_extraction.DictVectorizer`.
    """

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, posts):
        # One feature dict per document; DictVectorizer turns the list of
        # dicts into a feature matrix downstream.
        return [{'length': len(text),
                 'num_sentences': text.count('.')}
                for text in posts]
57+
58+
class SubjectBodyExtractor(BaseEstimator, TransformerMixin):
    """Extract the subject & body from a usenet post in a single pass.

    Takes a sequence of strings and produces an object-dtype array with two
    columns: column 0 holds the ``Subject:`` header value (empty string when
    the header is absent) and column 1 the footer-stripped, de-quoted body.
    """

    def fit(self, x, y=None):
        # Stateless transformer: nothing to learn from the data.
        return self

    def transform(self, posts):
        # Construct an object dtype array with two columns:
        # first column = 'subject' and second column = 'body'.
        features = np.empty(shape=(len(posts), 2), dtype=object)
        for i, text in enumerate(posts):
            # Headers end at the first blank line of the post.
            headers, _, bod = text.partition('\n\n')
            bod = strip_newsgroup_footer(bod)
            bod = strip_newsgroup_quoting(bod)
            features[i, 1] = bod

            # Scan the header lines for the first 'Subject:' entry.
            prefix = 'Subject:'
            sub = ''
            for line in headers.split('\n'):
                if line.startswith(prefix):
                    sub = line[len(prefix):]
                    break
            features[i, 0] = sub

        return features
86+
87+
88+
pipeline = Pipeline([
    # Extract the subject & body
    ('subjectbody', SubjectBodyExtractor()),

    # Use ColumnTransformer to combine the features from subject and body
    ('union', ColumnTransformer(
        [
            # Pulling features from the post's subject line (first column)
            ('subject', TfidfVectorizer(min_df=50), 0),

            # Pipeline for standard bag-of-words model for body (second column)
            ('body_bow', Pipeline([
                ('tfidf', TfidfVectorizer()),
                ('best', TruncatedSVD(n_components=50)),
            ]), 1),

            # Pipeline for pulling ad hoc features from post's body
            ('body_stats', Pipeline([
                ('stats', TextStats()),  # returns a list of dicts
                ('vect', DictVectorizer()),  # list of dicts -> feature matrix
            ]), 1),
        ],

        # weight components in ColumnTransformer
        transformer_weights={
            'subject': 0.8,
            'body_bow': 0.5,
            'body_stats': 1.0,
        }
    )),

    # Use a SVC classifier on the combined features
    ('svc', SVC(kernel='linear')),
])

# limit the list of categories to make running this example faster.
categories = ['alt.atheism', 'talk.religion.misc']
train = fetch_20newsgroups(random_state=1,
                           subset='train',
                           categories=categories,
                           )
test = fetch_20newsgroups(random_state=1,
                          subset='test',
                          categories=categories,
                          )

pipeline.fit(train.data, train.target)
y = pipeline.predict(test.data)
# classification_report expects (y_true, y_pred); the ground truth must come
# first so per-class precision/recall are attributed correctly.
print(classification_report(test.target, y))

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp