Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit8205aae

Browse files
committed
Move entities to using DATrie if possible; if not, use a new custom Trie-like interface.
This provides a significant performance gain, both parse-time and memory-usage,over the previous entitiesStartingWith magic, regardless of which impl is used.
1 parentbfdf9bb commit8205aae

File tree

5 files changed

+157
-18
lines changed

5 files changed

+157
-18
lines changed

‎html5lib/tokenizer.py‎

Lines changed: 9 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -18,10 +18,9 @@
1818

1919
from .inputstreamimportHTMLInputStream
2020

21-
# Group entities by their first character, for faster lookups
22-
entitiesByFirstChar= {}
23-
foreinentities:
24-
entitiesByFirstChar.setdefault(e[0], []).append(e)
21+
from .trieimportTrie
22+
23+
entitiesTrie=Trie(entities)
2524

2625
classHTMLTokenizer(object):
2726
""" This class takes care of tokenizing HTML.
@@ -179,28 +178,20 @@ def consumeEntity(self, allowedChar=None, fromAttribute=False):
179178
#
180179
# Consume characters and compare to these to a substring of the
181180
# entity names in the list until the substring no longer matches.
182-
filteredEntityList=entitiesByFirstChar.get(charStack[0], [])
183-
184-
defentitiesStartingWith(name):
185-
return [eforeinfilteredEntityListife.startswith(name)]
186-
187181
while (charStack[-1]isnotEOF):
188-
filteredEntityList=entitiesStartingWith("".join(charStack))
189-
ifnotfilteredEntityList:
182+
ifnotentitiesTrie.has_keys_with_prefix("".join(charStack)):
190183
break
191184
charStack.append(self.stream.char())
192185

193186
# At this point we have a string that starts with some characters
194187
# that may match an entity
195-
entityName=None
196-
197188
# Try to find the longest entity the string will match to take care
198189
# of &noti for instance.
199-
forentityLengthinrange(len(charStack)-1,1,-1):
200-
possibleEntityName="".join(charStack[:entityLength])
201-
ifpossibleEntityNameinentities:
202-
entityName=possibleEntityName
203-
break
190+
try:
191+
entityName=entitiesTrie.longest_prefix("".join(charStack[:-1]))
192+
entityLength=len(entityName)
193+
exceptKeyError:
194+
entityName=None
204195

205196
ifentityNameisnotNone:
206197
ifentityName[-1]!=";":

‎html5lib/trie/__init__.py‎

Lines changed: 10 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,10 @@
1+
from .pyimportTrieasPyTrie
2+
3+
Trie=PyTrie
4+
5+
try:
6+
from .datrieimportTrieasDATrie
7+
exceptImportError:
8+
pass
9+
else:
10+
Trie=DATrie

‎html5lib/trie/_base.py‎

Lines changed: 33 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,33 @@
1+
fromcollectionsimportMapping
2+
3+
classTrie(Mapping):
4+
"""Abstract base class for tries"""
5+
6+
defkeys(self,prefix=None):
7+
keys=super().keys()
8+
9+
ifprefixisNone:
10+
returnset(keys)
11+
12+
return {xforxinkeysifx.startswith(prefix)}
13+
14+
defhas_keys_with_prefix(self,prefix):
15+
forkeyinself.keys():
16+
ifkey.startswith(prefix):
17+
returnTrue
18+
19+
returnFalse
20+
21+
deflongest_prefix(self,prefix):
22+
ifprefixinself:
23+
returnprefix
24+
25+
foriinrange(1,len(prefix)+1):
26+
ifprefix[:-i]inself:
27+
returnprefix[:-i]
28+
29+
raiseKeyError(prefix)
30+
31+
deflongest_prefix_item(self,prefix):
32+
lprefix=self.longest_prefix(prefix)
33+
return (lprefix,self[lprefix])

‎html5lib/trie/datrie.py‎

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
fromitertoolsimportchain
2+
3+
fromdatrieimportTrieasDATrie
4+
5+
from ._baseimportTrieasABCTrie
6+
7+
classTrie(ABCTrie):
8+
def__init__(self,data):
9+
chars=set()
10+
forkeyindata.keys():
11+
ifnotisinstance(key,str):
12+
raiseTypeError("All keys must be strings")
13+
forcharinkey:
14+
chars.add(char)
15+
16+
self._data=DATrie("".join(chars))
17+
forkey,valueindata.items():
18+
self._data[key]=value
19+
20+
def__contains__(self,key):
21+
returnkeyinself._data
22+
23+
def__len__(self):
24+
returnlen(self._data)
25+
26+
def__iter__(self):
27+
raiseNotImplementedError()
28+
29+
def__getitem__(self,key):
30+
returnself._data[key]
31+
32+
defkeys(self,prefix=None):
33+
returnself._data.keys(prefix)
34+
35+
defhas_keys_with_prefix(self,prefix):
36+
returnself._data.has_keys_with_prefix(prefix)
37+
38+
deflongest_prefix(self,prefix):
39+
returnself._data.longest_prefix(prefix)
40+
41+
deflongest_prefix_item(self,prefix):
42+
returnself._data.longest_prefix_item(prefix)

‎html5lib/trie/py.py‎

Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
frombisectimportbisect_left
2+
3+
from ._baseimportTrieasABCTrie
4+
5+
classTrie(ABCTrie):
6+
def__init__(self,data):
7+
ifnotall(isinstance(x,str)forxindata.keys()):
8+
raiseTypeError("All keys must be strings")
9+
10+
self._data=data
11+
self._keys=sorted(data.keys())
12+
self._cachestr=""
13+
self._cachepoints= (0,len(data))
14+
15+
def__contains__(self,key):
16+
returnkeyinself._data
17+
18+
def__len__(self):
19+
returnlen(self._data)
20+
21+
def__iter__(self):
22+
returniter(self._data)
23+
24+
def__getitem__(self,key):
25+
returnself._data[key]
26+
27+
defkeys(self,prefix=None):
28+
ifprefixisNoneorprefix==""ornotself._keys:
29+
returnset(self._keys)
30+
31+
ifprefix.startswith(self._cachestr):
32+
lo,hi=self._cachepoints
33+
start=i=bisect_left(self._keys,prefix,lo,hi)
34+
else:
35+
start=i=bisect_left(self._keys,prefix)
36+
37+
keys=set()
38+
ifstart==len(self._keys):
39+
returnkeys
40+
41+
whileself._keys[i].startswith(prefix):
42+
keys.add(self._keys[i])
43+
i+=1
44+
45+
self._cachestr=prefix
46+
self._cachepoints= (start,i)
47+
48+
returnkeys
49+
50+
defhas_keys_with_prefix(self,prefix):
51+
ifprefixinself._data:
52+
returnTrue
53+
54+
ifprefix.startswith(self._cachestr):
55+
lo,hi=self._cachepoints
56+
i=bisect_left(self._keys,prefix,lo,hi)
57+
else:
58+
i=bisect_left(self._keys,prefix)
59+
60+
ifi==len(self._keys):
61+
returnFalse
62+
63+
returnself._keys[i].startswith(prefix)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp