Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitf47bc4f

Browse files
committed
Add start of SVG+MathML branch
--HG--branch : svgmathmlextra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401261
1 parentbf5f514 commitf47bc4f

12 files changed

+911
-775
lines changed

‎parse.py

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,8 @@
1212
#RELEASE remove
1313
sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))
1414
#END RELEASE
15-
fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer,tokenizer
15+
fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer
16+
fromhtml5lib.tokenizerimportHTMLTokenizer
1617
fromhtml5libimporttreebuilders,serializer,treewalkers
1718
fromhtml5libimportconstants
1819

@@ -80,7 +81,7 @@ def parse():
8081
t1=time.time()
8182
printOutput(p,document,opts)
8283
t2=time.time()
83-
print"\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0,t2-t1)
84+
sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0,t2-t1))
8485
else:
8586
document=parseMethod(f,encoding=encoding)
8687
printOutput(p,document,opts)

‎src/html5lib/constants.py

Lines changed: 17 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -72,6 +72,10 @@
7272
_(u"Unexpected end of file in attribute value (')."),
7373
"eof-in-attribute-value-no-quotes":
7474
_(u"Unexpected end of file in attribute value."),
75+
"unexpected-EOF-after-solidus-in-tag":
76+
_(u"Unexpected end of file in tag. Expected >"),
77+
"unexpected-character-after-soldius-in-tag":
78+
_(u"Unexpected character after / in tag. Expected >"),
7579
"expected-dashes-or-doctype":
7680
_(u"Expected '--' or 'DOCTYPE'. Not found."),
7781
"incorrect-comment":
@@ -1098,5 +1102,18 @@
10981102
"ParseError":7
10991103
}
11001104

1105+
namespaces= {
1106+
"html":"http://www.w3.org/1999/xhtml",
1107+
"mathml":"http://www.w3.org/1998/Math/MathML",
1108+
"svg":"http://www.w3.org/2000/svg",
1109+
"xlink":"http://www.w3.org/1999/xlink",
1110+
"xml":"http://www.w3.org/XML/1998/namespace",
1111+
"xmlns":"http://www.w3.org/2000/xmlns/"
1112+
}
1113+
1114+
11011115
classDataLossWarning(UserWarning):
11021116
pass
1117+
1118+
classReparseException(Exception):
1119+
pass

‎src/html5lib/html5parser.py

Lines changed: 612 additions & 598 deletions
Large diffs are not rendered by default.

‎src/html5lib/inputstream.py

Lines changed: 102 additions & 58 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,7 @@
33
importtypes
44

55
fromconstantsimportEOF,spaceCharacters,asciiLetters,asciiUppercase
6-
fromconstantsimportencodings
6+
fromconstantsimportencodings,ReparseException
77

88
#Non-unicode versions of constants for use in the pre-parser
99
spaceCharactersBytes= [str(item)foriteminspaceCharacters]
@@ -16,6 +16,82 @@
1616

1717
# Cache for charsUntil()
1818
charsUntilRegEx= {}
19+
20+
class BufferedStream:
    """Buffering for streams that do not have buffering of their own

    Everything read from the wrapped stream is remembered so that callers
    can ``seek`` back to any position that has already been read and read
    it again.

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    """

    def __init__(self, stream):
        """Wrap ``stream``, which only needs to provide ``read(n)``."""
        self.stream = stream
        # One entry per successful call to the underlying stream's read().
        self.buffer = []
        # [chunk number, offset inside that chunk]; -1 means nothing has
        # been read yet.
        self.position = [-1, 0]

    def tell(self):
        """Return the current read position, counted from the stream start."""
        chunkIndex = max(self.position[0], 0)
        consumed = sum(len(chunk) for chunk in self.buffer[:chunkIndex])
        return consumed + self.position[1]

    def seek(self, pos):
        """Move the read position to ``pos``.

        Only positions inside the data buffered so far (0 up to and
        including the number of buffered items) are reachable.
        """
        # Seeking to the very end of the buffered data is legitimate, and
        # so is seek(0) when the underlying stream turned out to be empty.
        assert 0 <= pos <= self._bufferedBytes()
        if not self.buffer:
            # Nothing buffered yet, so the only valid target is the start.
            self.position = [-1, 0]
            return
        offset = pos
        i = 0
        # Skip whole chunks; each skipped chunk consumes its own length
        # from the remaining offset.
        while len(self.buffer[i]) < offset:
            offset -= len(self.buffer[i])
            i += 1
        self.position = [i, offset]

    def read(self, bytes):
        """Return up to ``bytes`` items, replaying buffered data first."""
        if not self.buffer:
            return self._readStream(bytes)
        elif (self.position[0] == len(self.buffer) - 1 and
              self.position[1] == len(self.buffer[-1])):
            # Already at the end of everything buffered: go straight to
            # the underlying stream.
            return self._readStream(bytes)
        else:
            return self._readFromBuffer(bytes)

    def _bufferedBytes(self):
        """Return the total length of all buffered chunks."""
        return sum(len(item) for item in self.buffer)

    def _readStream(self, bytes):
        """Read from the underlying stream and record the new chunk."""
        data = self.stream.read(bytes)
        self.buffer.append(data)
        # The position is now the end of the chunk that was just appended.
        self.position = [len(self.buffer) - 1, len(data)]
        return data

    def _readFromBuffer(self, bytes):
        """Serve a read from buffered chunks, then from the stream if the
        buffered data runs out before ``bytes`` items were collected."""
        remainingBytes = bytes
        rv = []
        bufferIndex = self.position[0]
        bufferOffset = self.position[1]
        while bufferIndex < len(self.buffer) and remainingBytes != 0:
            assert remainingBytes > 0
            bufferedData = self.buffer[bufferIndex]

            if remainingBytes <= len(bufferedData) - bufferOffset:
                # The request is satisfied inside the current chunk.
                bytesToRead = remainingBytes
                self.position = [bufferIndex, bufferOffset + bytesToRead]
            else:
                # Take the rest of this chunk and move on to the next one.
                bytesToRead = len(bufferedData) - bufferOffset
                self.position = [bufferIndex, len(bufferedData)]
                bufferIndex += 1
            rv.append(bufferedData[bufferOffset:bufferOffset + bytesToRead])
            remainingBytes -= bytesToRead

            # Every chunk after the first is read from its beginning.
            bufferOffset = 0

        if remainingBytes:
            rv.append(self._readStream(remainingBytes))

        return "".join(rv)
93+
94+
1995

2096
classHTMLInputStream:
2197
"""Provides a unicode stream of characters to the HTMLTokenizer.
@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
65141
if (self.charEncoding[0]isNone):
66142
self.charEncoding=self.detectEncoding(parseMeta,chardet)
67143

144+
self.reset()
145+
146+
defreset(self):
68147
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
69148
'replace')
70149

@@ -100,6 +179,10 @@ def openStream(self, source):
100179
self.charEncoding= ("utf-8","certain")
101180
importcStringIO
102181
stream=cStringIO.StringIO(str(source))
182+
183+
ifnot(hasattr(stream,"tell")andhasattr(stream,"seek")):
184+
stream=BufferedStream(stream)
185+
103186
returnstream
104187

105188
defdetectEncoding(self,parseMeta=True,chardet=True):
@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):
128211
detector.feed(buffer)
129212
detector.close()
130213
encoding=detector.result['encoding']
131-
self.seek("".join(buffers),0)
214+
self.rawStream.seek(0)
132215
exceptImportError:
133216
pass
134217
# If all else fails use the default encoding
@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):
146229

147230
def changeEncoding(self, newEncoding):
    """Switch the stream to ``newEncoding`` if it differs from the current one.

    ``newEncoding`` is first normalised with ``codecName``. Names that do
    not normalise to a known codec are ignored. UTF-16 variants are
    replaced with utf-8. If the normalised name equals the current
    encoding, only the confidence is raised to "certain".

    Raises ReparseException after rewinding the raw stream and rebuilding
    the decoding reader when the encoding really changes.
    """
    newEncoding = codecName(newEncoding)
    if newEncoding in ("utf-16", "utf-16-be", "utf-16-le"):
        newEncoding = "utf-8"
    if newEncoding is None:
        return
    elif newEncoding == self.charEncoding[0]:
        self.charEncoding = (self.charEncoding[0], "certain")
    else:
        # Remember the old name before it is overwritten so the message
        # below reports both sides of the change.
        oldEncoding = self.charEncoding[0]
        self.rawStream.seek(0)
        # reset() builds the reader from self.charEncoding[0], so the new
        # encoding has to be stored before it is called.
        self.charEncoding = (newEncoding, "certain")
        self.reset()
        raise ReparseException("Encoding changed from %s to %s"
                               % (oldEncoding, newEncoding))
243+
159244
defdetectBOM(self):
160245
"""Attempts to detect at BOM at the start of the stream. If
161246
an encoding can be determined from the BOM return the name of the
@@ -182,56 +267,21 @@ def detectBOM(self):
182267

183268
# Set the read position past the BOM if one was found, otherwise
184269
# set it to the start of the stream
185-
self.seek(string,encodingandseekor0)
270+
self.rawStream.seek(encodingandseekor0)
186271

187272
returnencoding
188273

189-
defseek(self,buffer,n):
190-
"""Unget buffer[n:]"""
191-
ifhasattr(self.rawStream,'unget'):
192-
self.rawStream.unget(buffer[n:])
193-
return
194-
195-
ifhasattr(self.rawStream,'seek'):
196-
try:
197-
self.rawStream.seek(n)
198-
return
199-
exceptIOError:
200-
pass
201-
202-
classBufferedStream:
203-
def__init__(self,data,stream):
204-
self.data=data
205-
self.stream=stream
206-
defread(self,chars=-1):
207-
ifchars==-1orchars>len(self.data):
208-
result=self.data
209-
self.data=''
210-
ifchars==-1:
211-
returnresult+self.stream.read()
212-
else:
213-
returnresult+self.stream.read(chars-len(result))
214-
elifnotself.data:
215-
returnself.stream.read(chars)
216-
else:
217-
result=self.data[:chars]
218-
self.data=self.data[chars:]
219-
returnresult
220-
defunget(self,data):
221-
ifself.data:
222-
self.data+=data
223-
else:
224-
self.data=data
225-
226-
self.rawStream=BufferedStream(buffer[n:],self.rawStream)
227-
228274
def detectEncodingMeta(self):
    """Report the encoding declared by the meta element

    Reads ``self.numBytesMeta`` items from the raw stream, hands them to
    EncodingParser and rewinds the raw stream to position 0. A declared
    UTF-16 variant is reported as utf-8.
    """
    head = self.rawStream.read(self.numBytesMeta)
    metaParser = EncodingParser(head)
    self.rawStream.seek(0)
    declared = metaParser.getEncoding()

    if declared in ("utf-16", "utf-16-be", "utf-16-le"):
        return "utf-8"
    return declared
236286

237287
defupdatePosition(self,chars):
@@ -485,13 +535,6 @@ def getEncoding(self):
485535
break
486536
ifnotkeepParsing:
487537
break
488-
ifself.encodingisnotNone:
489-
self.encoding=self.encoding.strip()
490-
#Spec violation that complies with hsivonen + mjs
491-
if (ascii_punctuation_re.sub("",self.encoding)in
492-
("utf16","utf16be","utf16le",
493-
"utf32","utf32be","utf32le")):
494-
self.encoding="utf-8"
495538

496539
returnself.encoding
497540

@@ -666,11 +709,12 @@ def parse(self):
666709
exceptStopIteration:
667710
returnNone
668711

712+
669713
def codecName(encoding):
    """Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.

    Anything that is not a string (including None) yields None.
    """
    # isinstance also accepts subclasses of the string types, which an
    # exact type() comparison would reject.
    if not isinstance(encoding, types.StringTypes):
        return None
    # Lookup is done on the name with ASCII punctuation removed and
    # lower-cased.
    canonicalName = ascii_punctuation_re.sub("", encoding).lower()
    return encodings.get(canonicalName, None)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp