Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commita5baff7

Browse files
committed
cleanup code and docs. rename Selector to Extractor
1 parent2a63c5d commita5baff7

File tree

8 files changed

+59
-45
lines changed

8 files changed

+59
-45
lines changed

‎HISTORY.rst‎

Lines changed: 0 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,3 @@
11
=======
22
History
33
=======
4-
5-
0.6.0 (2019-05-22)
6-
------------------
7-
8-
* First release on PyPI.

‎README.rst‎

Lines changed: 22 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,28 @@ selectorlib
2222

2323
A library to read a YML file with Xpath or CSS Selectors and extract data from HTML pages using them
2424

25-
2625
* Free software: MIT license
2726
* Documentation: https://selectorlib.readthedocs.io.
2827

28+
29+
Example
30+
--------
31+
32+
>>> from selectorlib import Extractor
33+
>>>yaml_string="""
34+
title:
35+
selector: "h1"
36+
type: Text
37+
link:
38+
selector: "h2 a"
39+
type: Link
40+
"""
41+
>>>extractor= Extractor.from_yaml_string(yaml_string)
42+
>>>html="""
43+
<h1>Title</h1>
44+
<h2>Usage
45+
<a class="headerlink" href="http://test">¶</a>
46+
</h2>
47+
"""
48+
>>> extractor.extract(html)
49+
{'title': 'Title', 'link': 'http://test'}

‎docs/index.rst‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
Welcome to selectorlib's documentation!
2-
======================================
2+
=======================================
33

44
.. include:: ../README.rst
55

‎docs/installation.rst‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ Or download the `tarball`_:
3838

3939
.. code-block:: console
4040
41-
$ curl-OL https://github.com/scrapehero/selectorlib/tarball/master
41+
$ curl -OL https://github.com/scrapehero/selectorlib/tarball/master
4242
4343
Once you have a copy of the source, you can install it with:
4444

‎docs/selectorlib.rst‎

Lines changed: 4 additions & 26 deletions
Original file line numberDiff line numberDiff line change
@@ -5,39 +5,17 @@ Module contents
55
---------------
66

77
.. automodule:: selectorlib
8-
:members:Selector
8+
:members: Extractor
99

1010

1111

1212
Usage
1313
-----
1414

15-
To use selectorlib in a project::
16-
17-
>>>import selectorlib
18-
19-
>>>yaml_string="""
20-
title:
21-
selector: "h1"
22-
type: Text
23-
link:
24-
selector: "h2 a"
25-
type: Link
26-
"""
27-
>>>selector= selectorlib.Selector.from_yaml_string(yaml_string)
28-
>>>html="""
29-
<h1>Title</h1>
30-
<h2>Usage
31-
<a class="headerlink" href="http:://test">¶</a>
32-
</h2>
33-
"""
34-
>>>selector.extract(html)
35-
{'title': 'Title', 'link': 'http:://test'}
36-
37-
To use selectorlib with requests
15+
To use selectorlib with requests:
3816

3917
>>>import requests
40-
>>>from selectorlibimportSelector
18+
>>> from selectorlib import Extractor
4119
>>>selector_yaml="""
4220
name:
4321
selector: h1.product_title
@@ -70,7 +48,7 @@ related_products:
7048
price:
7149
selector: .price
7250
"""
73-
>>>selector=Selector.from_yaml_string(selector_yaml)
51+
>>>extractor=Extractor.from_yaml_string(selector_yaml)
7452
>>>url='https://scrapeme.live/shop/Bulbasaur/'
7553
>>>response= requests.get(url)
7654
>>> extractor.extract(response.text, base_url=response.url)

‎selectorlib/__init__.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6,4 +6,4 @@
66
__email__='pypi@scrapehero.com'
77
__version__='0.10.0'
88

9-
from .selectorlibimportSelector# noqa:F401
9+
from .selectorlib import Extractor  # noqa:F401

‎selectorlib/selectorlib.py‎

Lines changed: 29 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):
1919
returncontent
2020

2121

22-
classSelector:
22+
classExtractor:
2323
"""selector class"""
2424
def__init__(self,config,formatters=None):
2525
self.config=config
@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):
3131

3232
@classmethod
3333
deffrom_yaml_string(cls,yaml_string:str,formatters=None):
34-
"""create selector object from yaml string"""
34+
"""create `Extractor` object from yaml string
35+
36+
>>> yaml_string = '''
37+
title:
38+
selector: "h1"
39+
type: Text
40+
'''
41+
>>> extractor = Extractor.from_yaml_string(yaml_string)
42+
"""
3543
config=yaml.safe_load(yaml_string)
3644
returncls(config,formatters=formatters)
3745

3846
@classmethod
3947
deffrom_yaml_file(cls,yaml_filename:str,formatters=None):
40-
"""create selector object from yaml file"""
48+
"""create `Extractor` object from yaml file
49+
50+
>>> extractor = Extractor.from_yaml_file(yaml_filename='selectors.yaml')
51+
"""
4152
withopen(yaml_filename)asyaml_fileobj:
4253
config=yaml.safe_load(yaml_fileobj.read())
4354
returncls(config,formatters=formatters)
4455

4556
defextract(self,html:str,base_url:str=None):
46-
"""returns extracted dict"""
57+
"""
58+
Args:
59+
html: html string
60+
base_url (str, optional): specifying the base_url will make all extracted Links absolute
61+
Returns:
62+
dict: extracted data from given html string
63+
64+
>>> response = requests.get(url)
65+
>>> extractor.extract(response.text, base_url=response.url)
66+
"""
4767
sel=parsel.Selector(html,base_url=base_url)
4868
ifbase_url:
4969
sel.root.make_links_absolute()
5070
fields_data= {}
5171
forselector_name,selector_configinself.config.items():
52-
fields_data[selector_name]=self.extract_selector(selector_config,sel)
72+
fields_data[selector_name]=self._extract_selector(selector_config,sel)
5373
returnfields_data
5474

55-
defextract_selector(self,field_config,parent_parser):
75+
def_extract_selector(self,field_config,parent_parser):
5676
if'xpath'infield_config:
5777
elements=parent_parser.xpath(field_config['xpath'])
5878
else:
@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):
6282

6383
forelementinelements:
6484
if'children'infield_config:
65-
value=self.get_child_item(field_config,element)
85+
value=self._get_child_item(field_config,element)
6686
else:
6787
kwargs= {'attribute':field_config.get('attribute')}
6888
if'attribute'infield_config:
@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):
7898

7999
returnvalues
80100

81-
defget_child_item(self,field_config,element):
101+
def_get_child_item(self,field_config,element):
82102
children_config=field_config['children']
83103
child_item= {}
84104
forfieldinchildren_config:
85-
child_value=self.extract_selector(children_config[field],element)
105+
child_value=self._extract_selector(children_config[field],element)
86106
child_item[field]=child_value
87107
returnchild_item

‎tests/test_selectorlib.py‎

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ def output_yaml():
3737
deftest_content(html,input_yaml,output_yaml):
3838
base_url="https://scrapeme.live/shop/Bulbasaur/"
3939
formatters= [formatter.Integer]
40-
selector=selectorlib.Selector.from_yaml_string(input_yaml,formatters=formatters)
40+
selector=selectorlib.Extractor.from_yaml_string(input_yaml,formatters=formatters)
4141
output=selector.extract(html,base_url=base_url)
4242
assertoutput==yaml.safe_load(output_yaml)
4343

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp