@@ -19,7 +19,7 @@ def extract_field(element, item_type, attribute=None, formatter=None):
1919return content
2020
2121
22- class Selector :
22+ class Extractor :
2323"""selector class"""
2424def __init__ (self ,config ,formatters = None ):
2525self .config = config
@@ -31,28 +31,48 @@ def __init__(self, config, formatters=None):
3131
3232@classmethod
3333def from_yaml_string (cls ,yaml_string :str ,formatters = None ):
34- """create selector object from yaml string"""
34+ """create `Extractor` object from yaml string
35+
36+ >>> yaml_string = '''
37+ title:
38+ selector: "h1"
39+ type: Text
40+ '''
41+ >>> extractor = Extractor.from_yaml_string(yaml_string)
42+ """
3543config = yaml .safe_load (yaml_string )
3644return cls (config ,formatters = formatters )
3745
3846@classmethod
3947def from_yaml_file (cls ,yaml_filename :str ,formatters = None ):
40- """create selector object from yaml file"""
48+ """create `Extractor` object from yaml file
49+
50+ >>> extractor = Extractor.from_yaml_file(yaml_filename='selectors.yaml')
51+ """
4152with open (yaml_filename )as yaml_fileobj :
4253config = yaml .safe_load (yaml_fileobj .read ())
4354return cls (config ,formatters = formatters )
4455
4556def extract (self ,html :str ,base_url :str = None ):
46- """returns extracted dict"""
57+ """
58+ Args:
59+ html (str): HTML document to extract data from
60+ base_url (str, optional): if given, all links in the document are made absolute before extraction
61+ Returns:
62+ dict: extracted data from given html string
63+
64+ >>> response = requests.get(url)
65+ >>> extractor.extract(response.text, base_url=response.url)
66+ """
4767sel = parsel .Selector (html ,base_url = base_url )
4868if base_url :
4969sel .root .make_links_absolute ()
5070fields_data = {}
5171for selector_name ,selector_config in self .config .items ():
52- fields_data [selector_name ]= self .extract_selector (selector_config ,sel )
72+ fields_data [selector_name ]= self ._extract_selector (selector_config ,sel )
5373return fields_data
5474
55- def extract_selector (self ,field_config ,parent_parser ):
75+ def _extract_selector (self ,field_config ,parent_parser ):
5676if 'xpath' in field_config :
5777elements = parent_parser .xpath (field_config ['xpath' ])
5878else :
@@ -62,7 +82,7 @@ def extract_selector(self, field_config, parent_parser):
6282
6383for element in elements :
6484if 'children' in field_config :
65- value = self .get_child_item (field_config ,element )
85+ value = self ._get_child_item (field_config ,element )
6686else :
6787kwargs = {'attribute' :field_config .get ('attribute' )}
6888if 'attribute' in field_config :
@@ -78,10 +98,10 @@ def extract_selector(self, field_config, parent_parser):
7898
7999return values
80100
81- def get_child_item (self ,field_config ,element ):
101+ def _get_child_item (self ,field_config ,element ):
82102children_config = field_config ['children' ]
83103child_item = {}
84104for field in children_config :
85- child_value = self .extract_selector (children_config [field ],element )
105+ child_value = self ._extract_selector (children_config [field ],element )
86106child_item [field ]= child_value
87107return child_item