11# -*- coding: utf-8 -*-
22import parsel
33import yaml
4+ import inspect
45
56
6- def extract_field (element ,item_type ,attribute = None ):
7+ def extract_field (element ,item_type ,attribute = None , formatter = None ):
78if item_type == 'Text' :
89texts = [i .strip ()for i in element .xpath ('.//text()' ).getall ()if i .strip ()]
910content = " " .join (texts )
@@ -13,58 +14,33 @@ def extract_field(element, item_type, attribute=None):
1314content = element .get ()
1415elif item_type == 'Attribute' :
1516content = element .attrib .get (attribute )
17+ if formatter :
18+ content = formatter .format (content )
1619return content
1720
1821
19- def get_child_item (field_config ,element ):
20- children_config = field_config ['children' ]
21- child_item = {}
22- for field in children_config :
23- child_value = extract_selector (children_config [field ],element )
24- child_item [field ]= child_value
25- return child_item
26-
27-
28- def extract_selector (field_config ,parent_parser ):
29- if 'xpath' in field_config :
30- elements = parent_parser .xpath (field_config ['xpath' ])
31- else :
32- elements = parent_parser .css (field_config ['selector' ])
33- item_type = field_config .get ('type' ,'Text' )
34- values = []
35-
36- for element in elements :
37- if 'children' in field_config :
38- value = get_child_item (field_config ,element )
39- else :
40- value = extract_field (element ,item_type ,
41- field_config .get ('attribute' ))
42-
43- if field_config .get ('multiple' )is not True :
44- return value
45- else :
46- values .append (value )
47-
48- return values
49-
50-
5122class Selector :
5223"""selector class"""
def __init__(self, config, formatters=None):
    """Store the field configuration and register optional formatters.

    Args:
        config: Mapping of field name to field configuration.
        formatters: Optional iterable of formatter classes and/or
            formatter instances. Classes are instantiated with no
            arguments. Every formatter must expose a ``name`` attribute,
            which is the key a field's ``format`` entry refers to.
    """
    self.config = config
    # Always define the registry so later lookups never hit a missing
    # attribute, even when no formatters were supplied.
    self.formatters = {}
    for formatter in formatters or ():
        if inspect.isclass(formatter):
            formatter = formatter()
        self.formatters[formatter.name] = formatter
5531
@classmethod
def from_yaml_string(cls, yaml_string: str, formatters=None):
    """Create a selector object from a YAML string.

    Args:
        yaml_string: YAML document describing the fields to extract.
        formatters: Optional formatters, passed through unchanged to the
            constructor.

    Returns:
        A new instance of ``cls`` built from the parsed configuration.
    """
    # safe_load only builds plain Python objects, so the document cannot
    # trigger arbitrary object construction.
    return cls(yaml.safe_load(yaml_string), formatters=formatters)
6137
@classmethod
def from_yaml_file(cls, yaml_filename: str, formatters=None):
    """Create a selector object from a YAML file.

    Args:
        yaml_filename: Path of the YAML file describing the fields to
            extract.
        formatters: Optional formatters, passed through unchanged to the
            constructor.

    Returns:
        A new instance of ``cls`` built from the parsed configuration.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Open in binary mode and hand the stream to PyYAML so it detects the
    # document encoding itself (BOM, otherwise UTF-8) instead of decoding
    # with the platform's locale-dependent default.
    with open(yaml_filename, 'rb') as yaml_fileobj:
        config = yaml.safe_load(yaml_fileobj)
    return cls(config, formatters=formatters)
6844
6945def extract (self ,html :str ,base_url :str = None ):
7046"""returns extracted dict"""
@@ -73,5 +49,39 @@ def extract(self, html: str, base_url: str = None):
7349sel .root .make_links_absolute ()
7450fields_data = {}
7551for selector_name ,selector_config in self .config .items ():
76- fields_data [selector_name ]= extract_selector (selector_config ,sel )
52+ fields_data [selector_name ]= self . extract_selector (selector_config ,sel )
7753return fields_data
54+
def extract_selector(self, field_config, parent_parser):
    """Extract the value(s) described by one field configuration.

    Args:
        field_config: Mapping for a single field. Keys read here:
            ``xpath`` or ``selector`` (the query; ``xpath`` wins when both
            are present), ``type`` (defaults to ``'Text'``), ``attribute``,
            ``format`` (name of a registered formatter), ``children`` and
            ``multiple``.
        parent_parser: Object exposing ``xpath()`` and ``css()``; the
            query is evaluated relative to it.

    Returns:
        When ``multiple`` is exactly ``True``: a list with one value per
        matched element. Otherwise: the value of the first matched
        element, or an empty list when nothing matched.

    Raises:
        KeyError: If neither ``xpath`` nor ``selector`` is configured, or
            if ``format`` names a formatter that was not registered.
    """
    if 'xpath' in field_config:
        elements = parent_parser.xpath(field_config['xpath'])
    else:
        elements = parent_parser.css(field_config['selector'])

    # None of these depend on the element, so read them once.
    item_type = field_config.get('type', 'Text')
    has_children = 'children' in field_config
    want_all = field_config.get('multiple') is True

    values = []
    for element in elements:
        if has_children:
            value = self.get_child_item(field_config, element)
        else:
            kwargs = {'attribute': field_config.get('attribute')}
            if 'format' in field_config:
                # Looked up only once an element actually needs it, so a
                # field that matches nothing never fails on its formatter.
                format_name = field_config['format']
                try:
                    kwargs['formatter'] = self.formatters[format_name]
                except KeyError:
                    known = ', '.join(repr(name) for name in self.formatters)
                    raise KeyError(
                        'unknown formatter %r (registered: %s)'
                        % (format_name, known or 'none')
                    ) from None
            value = extract_field(element, item_type, **kwargs)

        if not want_all:
            # Single-valued field: the first match is the answer.
            return value
        values.append(value)

    return values
80+
def get_child_item(self, field_config, element):
    """Extract every child field of ``field_config`` relative to ``element``.

    Args:
        field_config: Field configuration whose ``children`` entry maps
            each child field name to that child's own configuration.
        element: Node the child queries are evaluated against.

    Returns:
        dict mapping each child field name to the value produced by
        ``extract_selector`` for that child.

    Raises:
        KeyError: If ``field_config`` has no ``children`` entry.
    """
    return {
        name: self.extract_selector(child_config, element)
        for name, child_config in field_config['children'].items()
    }