Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 661b0a4

Browse files
committed
add basic formatter option
1 parent 47ba3aa, commit 661b0a4

File tree

6 files changed

+72
-458
lines changed

6 files changed

+72
-458
lines changed

‎selectorlib/formatter.py‎

Lines changed: 16 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,16 @@
1+
importabc
2+
3+
4+
class Format(abc.ABC):
    """Abstract base class for formatters applied to extracted content.

    Subclasses must implement :meth:`format`. The :attr:`name` property
    exposes the concrete class name, which ``Selector`` uses as the key
    when it registers formatter instances.
    """

    @abc.abstractmethod
    def format(self, text: str):
        """Return ``text`` after formatting."""

    @property
    def name(self):
        """Return the name of the concrete formatter class."""
        return self.__class__.__name__
12+
13+
14+
class Integer(Format):
    """Formatter that converts extracted content to an ``int``."""

    def format(self, text):
        """Return ``int(text)``.

        No input validation is done here: whatever ``int()`` raises for
        input it cannot convert (``ValueError`` for a non-numeric string,
        ``TypeError`` for ``None``) propagates to the caller.
        """
        return int(text)

‎selectorlib/selectorlib.py‎

Lines changed: 49 additions & 39 deletions
Original file line number | Diff line number | Diff line change
@@ -1,9 +1,10 @@
11
# -*- coding: utf-8 -*-
22
importparsel
33
importyaml
4+
importinspect
45

56

6-
defextract_field(element,item_type,attribute=None):
7+
defextract_field(element,item_type,attribute=None,formatter=None):
78
ifitem_type=='Text':
89
texts= [i.strip()foriinelement.xpath('.//text()').getall()ifi.strip()]
910
content=" ".join(texts)
@@ -13,58 +14,33 @@ def extract_field(element, item_type, attribute=None):
1314
content=element.get()
1415
elifitem_type=='Attribute':
1516
content=element.attrib.get(attribute)
17+
ifformatter:
18+
content=formatter.format(content)
1619
returncontent
1720

1821

19-
defget_child_item(field_config,element):
20-
children_config=field_config['children']
21-
child_item= {}
22-
forfieldinchildren_config:
23-
child_value=extract_selector(children_config[field],element)
24-
child_item[field]=child_value
25-
returnchild_item
26-
27-
28-
defextract_selector(field_config,parent_parser):
29-
if'xpath'infield_config:
30-
elements=parent_parser.xpath(field_config['xpath'])
31-
else:
32-
elements=parent_parser.css(field_config['selector'])
33-
item_type=field_config.get('type','Text')
34-
values= []
35-
36-
forelementinelements:
37-
if'children'infield_config:
38-
value=get_child_item(field_config,element)
39-
else:
40-
value=extract_field(element,item_type,
41-
field_config.get('attribute'))
42-
43-
iffield_config.get('multiple')isnotTrue:
44-
returnvalue
45-
else:
46-
values.append(value)
47-
48-
returnvalues
49-
50-
5122
classSelector:
5223
"""selector class"""
53-
def__init__(self,config):
24+
def__init__(self,config,formatters=None):
5425
self.config=config
26+
ifformatters:
27+
formatters= [i()ifinspect.isclass(i)elseiforiinformatters]
28+
self.formatters= {i.name:iforiinformatters}
29+
else:
30+
self.formatters= {}
5531

5632
@classmethod
def from_yaml_string(cls, yaml_string: str, formatters=None):
    """Create a selector object from a YAML string.

    Args:
        yaml_string: YAML document describing the selector config.
        formatters: Optional formatter classes or instances, passed
            through unchanged to the constructor.

    Returns:
        A new instance of ``cls`` built from the parsed config.
    """
    config = yaml.safe_load(yaml_string)
    return cls(config, formatters=formatters)
6137

6238
@classmethod
def from_yaml_file(cls, yaml_filename: str, formatters=None):
    """Create a selector object from a YAML file.

    Args:
        yaml_filename: Path of the YAML file holding the selector config.
        formatters: Optional formatter classes or instances, passed
            through unchanged to the constructor.

    Returns:
        A new instance of ``cls`` built from the parsed config.
    """
    # The file is opened with the platform default text encoding.
    with open(yaml_filename) as yaml_fileobj:
        config = yaml.safe_load(yaml_fileobj.read())
    return cls(config, formatters=formatters)
6844

6945
defextract(self,html:str,base_url:str=None):
7046
"""returns extracted dict"""
@@ -73,5 +49,39 @@ def extract(self, html: str, base_url: str = None):
7349
sel.root.make_links_absolute()
7450
fields_data= {}
7551
forselector_name,selector_configinself.config.items():
76-
fields_data[selector_name]=extract_selector(selector_config,sel)
52+
fields_data[selector_name]=self.extract_selector(selector_config,sel)
7753
returnfields_data
54+
55+
def extract_selector(self, field_config, parent_parser):
    """Extract the value(s) described by ``field_config``.

    Elements are selected from ``parent_parser`` with the ``'xpath'``
    expression when that key is present, otherwise with the CSS
    expression under ``'selector'``. Each element becomes either a dict
    of child values (when ``'children'`` is configured) or a single
    field value produced by ``extract_field``.

    Args:
        field_config: Configuration mapping for one field. Keys read
            here: ``xpath`` / ``selector``, ``type`` (default ``'Text'``),
            ``children``, ``attribute``, ``format``, ``multiple``.
        parent_parser: Object exposing ``xpath()`` and ``css()`` that
            return an iterable of matched elements.

    Returns:
        The value of the first matched element when ``multiple`` is not
        ``True``; otherwise a list with one value per matched element.
        An empty list is returned when nothing matches, whatever the
        ``multiple`` setting.

    Raises:
        KeyError: If ``format`` names a formatter that was not registered
            on this selector.
    """
    if 'xpath' in field_config:
        elements = parent_parser.xpath(field_config['xpath'])
    else:
        elements = parent_parser.css(field_config['selector'])
    item_type = field_config.get('type', 'Text')
    multiple = field_config.get('multiple') is True
    values = []

    for element in elements:
        if 'children' in field_config:
            value = self.get_child_item(field_config, element)
        else:
            # .get() already yields None when 'attribute' is absent, so
            # no second assignment is needed.
            kwargs = {'attribute': field_config.get('attribute')}
            if 'format' in field_config:
                format_name = field_config['format']
                try:
                    kwargs['formatter'] = self.formatters[format_name]
                except KeyError:
                    # Same exception type as a plain lookup, but say
                    # which name failed and what is available.
                    raise KeyError(
                        "unknown formatter %r; registered formatters: %s"
                        % (format_name, sorted(self.formatters))
                    ) from None
            value = extract_field(element, item_type, **kwargs)

        if not multiple:
            # Single-valued field: the first match wins.
            return value
        values.append(value)

    return values
80+
81+
def get_child_item(self, field_config, element):
    """Build the dict of child field values for one matched element.

    Args:
        field_config: Field configuration whose ``'children'`` entry maps
            each child field name to that child's own configuration.
        element: The matched element the child selectors run against.

    Returns:
        A dict mapping each child field name to the value returned by
        ``self.extract_selector`` for that child's configuration.
    """
    children_config = field_config['children']
    return {
        field: self.extract_selector(children_config[field], element)
        for field in children_config
    }

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp