Notifications: You must be signed in to change notification settings
Fork11
Star70

Commita5baff7

committed

cleanup code and docs. rename Selector to Extractor

1 parent 2a63c5d · commit a5baff7 · Copy full SHA for a5baff7

File tree

8 files changed

+59

-45

lines changed

HISTORY.rst
README.rst
docs
selectorlib
- __init__.py
- selectorlib.py
tests
- test_selectorlib.py

8 files changed

+59

-45

lines changed

`‎HISTORY.rst‎`

Lines changed: 0 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,8 +1,3 @@`
`1`	`1`	`=======`
`2`	`2`	`History`
`3`	`3`	`=======`
`4`		`-`
`5`		`-0.6.0 (2019-05-22)`
`6`		`-------------------`
`7`		`-`
`8`		`-* First release on PyPI.`

`‎README.rst‎`

Lines changed: 22 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,28 @@ selectorlib`
`22`	`22`
`23`	`23`	`A library to read a YML file with Xpath or CSS Selectors and extract data from HTML pages using them`
`24`	`24`
`25`		`-`
`26`	`25`	`* Free software: MIT license`
`27`	`26`	`* Documentation: https://selectorlib.readthedocs.io.`
`28`	`27`
	`28`	`+`
	`29`	`+Example`
	`30`	`+--------`
	`31`	`+`
	`32`	`+>>> from selectorlib import Extractor`
	`33`	`+>>> yaml_string = """`
	`34`	`+ title:`
	`35`	`+ selector: "h1"`
	`36`	`+ type: Text`
	`37`	`+ link:`
	`38`	`+ selector: "h2 a"`
	`39`	`+ type: Link`
	`40`	`+ """`
	`41`	`+>>> extractor = Extractor.from_yaml_string(yaml_string)`
	`42`	`+>>> html = """`
	`43`	`+ <h1>Title</h1>`
	`44`	`+ <h2>Usage`
	`45`	`+ <a class="headerlink" href="http://test">¶</a>`
	`46`	`+ </h2>`
	`47`	`+ """`
	`48`	`+>>>selector.extract(html)`
	`49`	`+{'title': 'Title', 'link': 'http://test'}`

`‎docs/index.rst‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,5 @@`
`1`	`1`	`Welcome to selectorlib's documentation!`
`2`		`-======================================`
	`2`	`+=======================================`
`3`	`3`
`4`	`4`	`..include::../README.rst`
`5`	`5`

`‎docs/installation.rst‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
@@ -38,7 +38,7 @@ Or download the `tarball`_:
`38`	`38`
`39`	`39`	`..code-block::console`
`40`	`40`
`41`		`- $ curl-OL https://github.com/scrapehero/selectorlib/tarball/master`
	`41`	`+ $ curl -OL https://github.com/scrapehero/selectorlib/tarball/master`
`42`	`42`
`43`	`43`	`Once you have a copy of the source, you can install it with:`
`44`	`44`

`‎docs/selectorlib.rst‎`

Lines changed: 4 additions & 26 deletions

Original file line number	Diff line number	Diff line change
`@@ -5,39 +5,17 @@ Module contents`
`5`	`5`	`---------------`
`6`	`6`
`7`	`7`	`..automodule::selectorlib`
`8`		`-:members:Selector`
	`8`	`+:members:Extractor`
`9`	`9`
`10`	`10`
`11`	`11`
`12`	`12`	`Usage`
`13`	`13`	`-----`
`14`	`14`
`15`		`-To use selectorlib in a project::`
`16`		`-`
`17`		`->>>import selectorlib`
`18`		`-`
`19`		`->>>yaml_string="""`
`20`		`- title:`
`21`		`- selector: "h1"`
`22`		`- type: Text`
`23`		`- link:`
`24`		`- selector: "h2 a"`
`25`		`- type: Link`
`26`		`- """`
`27`		`->>>selector= selectorlib.Selector.from_yaml_string(yaml_string)`
`28`		`->>>html="""`
`29`		`- <h1>Title</h1>`
`30`		`- <h2>Usage`
`31`		`- <a class="headerlink" href="http:://test">¶</a>`
`32`		`- </h2>`
`33`		`- """`
`34`		`->>>selector.extract(html)`
`35`		`-{'title': 'Title', 'link': 'http:://test'}`
`36`		`-`
`37`		`-To use selectorlib with requests`
	`15`	`+To use selectorlib with requests:`
`38`	`16`
`39`	`17`	`>>>import requests`
`40`		`->>>from selectorlibimportSelector`
	`18`	`+>>>from selectorlibimportExtractor`
`41`	`19`	`>>>selector_yaml="""`
`42`	`20`	`name:`
`43`	`21`	`selector: h1.product_title`
`@@ -70,7 +48,7 @@ related_products:`
`70`	`48`	`price:`
`71`	`49`	`selector: .price`
`72`	`50`	`"""`
`73`		`->>>selector=Selector.from_yaml_string(selector_yaml)`
	`51`	`+>>>extractor=Extractor.from_yaml_string(selector_yaml)`
`74`	`52`	`>>>url='https://scrapeme.live/shop/Bulbasaur/'`
`75`	`53`	`>>>response= requests.get(url)`
`76`	`54`	`>>>selector.extract(response.text,base_url=response.url)`

`‎selectorlib/__init__.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -6,4 +6,4 @@`
`6`	`6`	`__email__='pypi@scrapehero.com'`
`7`	`7`	`__version__='0.10.0'`
`8`	`8`
`9`		`-from .selectorlib import Selector  # noqa: F401`
	`9`	`+from .selectorlib import Extractor  # noqa: F401`

`‎selectorlib/selectorlib.py‎`

Lines changed: 29 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):`
`19`	`19`	`returncontent`
`20`	`20`
`21`	`21`
`22`		`-classSelector:`
	`22`	`+classExtractor:`
`23`	`23`	`"""selector class"""`
`24`	`24`	`def__init__(self,config,formatters=None):`
`25`	`25`	`self.config=config`
`@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):`
`31`	`31`
`32`	`32`	`@classmethod`
`33`	`33`	`deffrom_yaml_string(cls,yaml_string:str,formatters=None):`
`34`		`-"""create selector object from yaml string"""`
	`34`	+"""create `Extractor` object from yaml string
	`35`	`+`
	`36`	`+ >>> yaml_string = '''`
	`37`	`+ title:`
	`38`	`+ selector: "h1"`
	`39`	`+ type: Text`
	`40`	`+ '''`
	`41`	`+ >>> extractor = Extractor.from_yaml_string(yaml_string)`
	`42`	`+ """`
`35`	`43`	`config=yaml.safe_load(yaml_string)`
`36`	`44`	`returncls(config,formatters=formatters)`
`37`	`45`
`38`	`46`	`@classmethod`
`39`	`47`	`deffrom_yaml_file(cls,yaml_filename:str,formatters=None):`
`40`		`-"""create selector object from yaml file"""`
	`48`	+"""create `Extractor` object from yaml file
	`49`	`+`
	`50`	`+ >>> extractor = Extractor.from_yaml_string(yaml_filename='selectors.yaml')`
	`51`	`+ """`
`41`	`52`	`withopen(yaml_filename)asyaml_fileobj:`
`42`	`53`	`config=yaml.safe_load(yaml_fileobj.read())`
`43`	`54`	`returncls(config,formatters=formatters)`
`44`	`55`
`45`	`56`	`defextract(self,html:str,base_url:str=None):`
`46`		`-"""returns extracted dict"""`
	`57`	`+"""`
	`58`	`+ Args:`
	`59`	`+ html: html string`
	`60`	`+ base_url (str, optional): specifying the base_url will make all extracted Links absolute`
	`61`	`+ Returns:`
	`62`	`+ dict: extracted data from given html string`
	`63`	`+`
	`64`	`+ >>> response = requests.get(url)`
	`65`	`+ >>> selector.extract(response.text, base_url=response.url)`
	`66`	`+ """`
`47`	`67`	`sel=parsel.Selector(html,base_url=base_url)`
`48`	`68`	`ifbase_url:`
`49`	`69`	`sel.root.make_links_absolute()`
`50`	`70`	`fields_data= {}`
`51`	`71`	`forselector_name,selector_configinself.config.items():`
`52`		`-fields_data[selector_name]=self.extract_selector(selector_config,sel)`
	`72`	`+fields_data[selector_name]=self._extract_selector(selector_config,sel)`
`53`	`73`	`returnfields_data`
`54`	`74`
`55`		`-defextract_selector(self,field_config,parent_parser):`
	`75`	`+def_extract_selector(self,field_config,parent_parser):`
`56`	`76`	`if'xpath'infield_config:`
`57`	`77`	`elements=parent_parser.xpath(field_config['xpath'])`
`58`	`78`	`else:`
`@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):`
`62`	`82`
`63`	`83`	`forelementinelements:`
`64`	`84`	`if'children'infield_config:`
`65`		`-value=self.get_child_item(field_config,element)`
	`85`	`+value=self._get_child_item(field_config,element)`
`66`	`86`	`else:`
`67`	`87`	`kwargs= {'attribute':field_config.get('attribute')}`
`68`	`88`	`if'attribute'infield_config:`
`@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):`
`78`	`98`
`79`	`99`	`returnvalues`
`80`	`100`
`81`		`-defget_child_item(self,field_config,element):`
	`101`	`+def_get_child_item(self,field_config,element):`
`82`	`102`	`children_config=field_config['children']`
`83`	`103`	`child_item= {}`
`84`	`104`	`forfieldinchildren_config:`
`85`		`-child_value=self.extract_selector(children_config[field],element)`
	`105`	`+child_value=self._extract_selector(children_config[field],element)`
`86`	`106`	`child_item[field]=child_value`
`87`	`107`	`returnchild_item`

`‎tests/test_selectorlib.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -37,7 +37,7 @@ def output_yaml():`
`37`	`37`	`deftest_content(html,input_yaml,output_yaml):`
`38`	`38`	`base_url="https://scrapeme.live/shop/Bulbasaur/"`
`39`	`39`	`formatters= [formatter.Integer]`
`40`		`-selector=selectorlib.Selector.from_yaml_string(input_yaml,formatters=formatters)`
	`40`	`+selector=selectorlib.Extractor.from_yaml_string(input_yaml,formatters=formatters)`
`41`	`41`	`output=selector.extract(html,base_url=base_url)`
`42`	`42`	`assertoutput==yaml.safe_load(output_yaml)`
`43`	`43`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commita5baff7

File tree

8 files changed

8 files changed

`‎HISTORY.rst‎`

`‎README.rst‎`

`‎docs/index.rst‎`

`‎docs/installation.rst‎`

`‎docs/selectorlib.rst‎`

`‎selectorlib/__init__.py‎`

`‎selectorlib/selectorlib.py‎`

`‎tests/test_selectorlib.py‎`

0 commit comments