Notifications: You must be signed in to change notification settings
Fork 0
Star 0

Commit f47bc4f

committed

Add start of SVG+MathML branch

--HG--
branch : svgmathml
extra : convert_revision : svn%3Aacbfec75-9323-0410-a652-858a13e371e0/branches/svgmathml%401261

1 parent bf5f514, commit f47bc4f (Copy full SHA for f47bc4f)

File tree

12 files changed

+911

-775

lines changed

parse.py
src/html5lib
- constants.py
- html5parser.py
- inputstream.py
- tokenizer.py
- treebuilders
tests
- test_encoding.py
- test_tokenizer.py

12 files changed

+911

-775

lines changed

`‎parse.py‎`

Lines changed: 3 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -12,7 +12,8 @@`
`12`	`12`	`#RELEASE remove`
`13`	`13`	`sys.path.insert(0,os.path.abspath(os.path.join(__file__,'../src')))`
`14`	`14`	`#END RELEASE`
`15`		`-fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer,tokenizer`
	`15`	`+fromhtml5libimporthtml5parser,liberalxmlparser,sanitizer`
	`16`	`+fromhtml5lib.tokenizerimportHTMLTokenizer`
`16`	`17`	`fromhtml5libimporttreebuilders,serializer,treewalkers`
`17`	`18`	`fromhtml5libimportconstants`
`18`	`19`
`@@ -80,7 +81,7 @@ def parse():`
`80`	`81`	`t1=time.time()`
`81`	`82`	`printOutput(p,document,opts)`
`82`	`83`	`t2=time.time()`
`83`		`-print"\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0,t2-t1)`
	`84`	`+sys.stderr.write("\n\nRun took: %fs (plus %fs to print the output)"%(t1-t0,t2-t1))`
`84`	`85`	`else:`
`85`	`86`	`document=parseMethod(f,encoding=encoding)`
`86`	`87`	`printOutput(p,document,opts)`

`‎src/html5lib/constants.py‎`

Lines changed: 17 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -72,6 +72,10 @@`
`72`	`72`	`_(u"Unexpected end of file in attribute value (')."),`
`73`	`73`	`"eof-in-attribute-value-no-quotes":`
`74`	`74`	`_(u"Unexpected end of file in attribute value."),`
	`75`	`+"unexpected-EOF-after-solidus-in-tag":`
	`76`	`+_(u"Unexpected end of file in tag. Expected >"),`
	`77`	`+"unexpected-character-after-soldius-in-tag":`
	`78`	`+_(u"Unexpected character after / in tag. Expected >"),`
`75`	`79`	`"expected-dashes-or-doctype":`
`76`	`80`	`_(u"Expected '--' or 'DOCTYPE'. Not found."),`
`77`	`81`	`"incorrect-comment":`
`@@ -1098,5 +1102,18 @@`
`1098`	`1102`	`"ParseError":7`
`1099`	`1103`	`}`
`1100`	`1104`
	`1105`	`+namespaces= {`
	`1106`	`+"html":"http://www.w3.org/1999/xhtml",`
	`1107`	`+"mathml":"http://www.w3.org/1998/Math/MathML",`
	`1108`	`+"svg":"http://www.w3.org/2000/svg",`
	`1109`	`+"xlink":"http://www.w3.org/1999/xlink",`
	`1110`	`+"xml":"http://www.w3.org/XML/1998/namespace",`
	`1111`	`+"xmlns":"http://www.w3.org/2000/xmlns/"`
	`1112`	`+}`
	`1113`	`+`
	`1114`	`+`
`1101`	`1115`	`classDataLossWarning(UserWarning):`
`1102`	`1116`	`pass`
	`1117`	`+`
	`1118`	`+classReparseException(Exception):`
	`1119`	`+pass`

`‎src/html5lib/html5parser.py‎`

Lines changed: 612 additions & 598 deletions

Large diffs are not rendered by default.

`‎src/html5lib/inputstream.py‎`

Lines changed: 102 additions & 58 deletions

Original file line number	Diff line number	Diff line change
`@@ -3,7 +3,7 @@`
`3`	`3`	`importtypes`
`4`	`4`
`5`	`5`	`fromconstantsimportEOF,spaceCharacters,asciiLetters,asciiUppercase`
`6`		`-fromconstantsimportencodings`
	`6`	`+fromconstantsimportencodings,ReparseException`
`7`	`7`
`8`	`8`	`#Non-unicode versions of constants for use in the pre-parser`
`9`	`9`	`spaceCharactersBytes= [str(item)foriteminspaceCharacters]`
`@@ -16,6 +16,82 @@`
`16`	`16`
`17`	`17`	`# Cache for charsUntil()`
`18`	`18`	`charsUntilRegEx= {}`
	`19`	`+`
	`20`	`+classBufferedStream:`
	`21`	`+"""Buffering for streams that do not have buffering of their own`
	`22`	`+`
	`23`	`+ The buffer is implemented as a list of chunks on the assumption that`
	`24`	`+ joining many strings will be slow since it is O(n**2)`
	`25`	`+ """`
	`26`	`+`
	`27`	`+def__init__(self,stream):`
	`28`	`+self.stream=stream`
	`29`	`+self.buffer= []`
	`30`	`+self.position= [-1,0]#chunk number, offset`
	`31`	`+`
	`32`	`+deftell(self):`
	`33`	`+pos=0`
	`34`	`+forchunkinself.buffer[:self.position[0]]:`
	`35`	`+pos+=len(chunk)`
	`36`	`+pos+=self.position[1]`
	`37`	`+returnpos`
	`38`	`+`
	`39`	`+defseek(self,pos):`
	`40`	`+assertpos<self._bufferedBytes()`
	`41`	`+offset=pos`
	`42`	`+i=0`
	`43`	`+whilelen(self.buffer[i])<offset:`
	`44`	`+offset-=pos`
	`45`	`+i+=1`
	`46`	`+self.position= [i,offset]`
	`47`	`+`
	`48`	`+defread(self,bytes):`
	`49`	`+ifnotself.buffer:`
	`50`	`+returnself._readStream(bytes)`
	`51`	`+elif (self.position[0]==len(self.buffer)and`
	`52`	`+self.position[1]==len(self.buffer[-1])):`
	`53`	`+returnself._readStream(bytes)`
	`54`	`+else:`
	`55`	`+returnself._readFromBuffer(bytes)`
	`56`	`+`
	`57`	`+def_bufferedBytes(self):`
	`58`	`+returnsum([len(item)foriteminself.buffer])`
	`59`	`+`
	`60`	`+def_readStream(self,bytes):`
	`61`	`+data=self.stream.read(bytes)`
	`62`	`+self.buffer.append(data)`
	`63`	`+self.position[0]+=1`
	`64`	`+self.position[1]=len(data)`
	`65`	`+returndata`
	`66`	`+`
	`67`	`+def_readFromBuffer(self,bytes):`
	`68`	`+remainingBytes=bytes`
	`69`	`+rv= []`
	`70`	`+bufferIndex=self.position[0]`
	`71`	`+bufferOffset=self.position[1]`
	`72`	`+whilebufferIndex<len(self.buffer)andremainingBytes!=0:`
	`73`	`+assertremainingBytes>0`
	`74`	`+bufferedData=self.buffer[bufferIndex]`
	`75`	`+`
	`76`	`+ifremainingBytes<=len(bufferedData)-bufferOffset:`
	`77`	`+bytesToRead=remainingBytes`
	`78`	`+self.position= [bufferIndex,bufferOffset+bytesToRead]`
	`79`	`+else:`
	`80`	`+bytesToRead=len(bufferedData)-bufferOffset`
	`81`	`+self.position= [bufferIndex,len(bufferedData)]`
	`82`	`+bufferIndex+=1`
	`83`	`+data=rv.append(bufferedData[bufferOffset:`
	`84`	`+bufferOffset+bytesToRead])`
	`85`	`+remainingBytes-=bytesToRead`
	`86`	`+`
	`87`	`+bufferOffset=0`
	`88`	`+`
	`89`	`+ifremainingBytes:`
	`90`	`+rv.append(self._readStream(remainingBytes))`
	`91`	`+`
	`92`	`+return"".join(rv)`
	`93`	`+`
	`94`	`+`
`19`	`95`
`20`	`96`	`classHTMLInputStream:`
`21`	`97`	`"""Provides a unicode stream of characters to the HTMLTokenizer.`
`@@ -65,6 +141,9 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):`
`65`	`141`	`if (self.charEncoding[0]isNone):`
`66`	`142`	`self.charEncoding=self.detectEncoding(parseMeta,chardet)`
`67`	`143`
	`144`	`+self.reset()`
	`145`	`+`
	`146`	`+defreset(self):`
`68`	`147`	`self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,`
`69`	`148`	`'replace')`
`70`	`149`
`@@ -100,6 +179,10 @@ def openStream(self, source):`
`100`	`179`	`self.charEncoding= ("utf-8","certain")`
`101`	`180`	`importcStringIO`
`102`	`181`	`stream=cStringIO.StringIO(str(source))`
	`182`	`+`
	`183`	`+ifnot(hasattr(stream,"tell")andhasattr(stream,"seek")):`
	`184`	`+stream=BufferedStream(stream)`
	`185`	`+`
`103`	`186`	`returnstream`
`104`	`187`
`105`	`188`	`defdetectEncoding(self,parseMeta=True,chardet=True):`
`@@ -128,7 +211,7 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`128`	`211`	`detector.feed(buffer)`
`129`	`212`	`detector.close()`
`130`	`213`	`encoding=detector.result['encoding']`
`131`		`-self.seek("".join(buffers),0)`
	`214`	`+self.rawStream.seek(0)`
`132`	`215`	`exceptImportError:`
`133`	`216`	`pass`
`134`	`217`	`# If all else fails use the default encoding`
`@@ -146,16 +229,18 @@ def detectEncoding(self, parseMeta=True, chardet=True):`
`146`	`229`
`147`	`230`	`defchangeEncoding(self,newEncoding):`
`148`	`231`	`newEncoding=codecName(newEncoding)`
`149`		`-ifnewEncoding=="utf16":`
`150`		`-newEncoding="utf8"`
`151`		`-`
	`232`	`+ifnewEncodingin ("utf-16","utf-16-be","utf-16-le"):`
	`233`	`+newEncoding="utf-8"`
`152`	`234`	`ifnewEncodingisNone:`
`153`	`235`	`return`
`154`	`236`	`elifnewEncoding==self.charEncoding[0]:`
`155`		`-self.charEncoding= (self.charEncoding[0]and"certian")`
	`237`	`+self.charEncoding= (self.charEncoding[0],"certian")`
`156`	`238`	`else:`
`157`		`-raiseNotImplementedError,"Cannot change character encoding mid stream"`
`158`		`-`
	`239`	`+self.rawStream.seek(0)`
	`240`	`+self.reset()`
	`241`	`+self.charEncoding= (newEncoding,"certian")`
	`242`	`+raiseReparseException,"Encoding changed from %s to %s"%(self.charEncoding[0],newEncoding)`
	`243`	`+`
`159`	`244`	`defdetectBOM(self):`
`160`	`245`	`"""Attempts to detect at BOM at the start of the stream. If`
`161`	`246`	`an encoding can be determined from the BOM return the name of the`
`@@ -182,56 +267,21 @@ def detectBOM(self):`
`182`	`267`
`183`	`268`	`# Set the read position past the BOM if one was found, otherwise`
`184`	`269`	`# set it to the start of the stream`
`185`		`-self.seek(string,encodingandseekor0)`
	`270`	`+self.rawStream.seek(encodingandseekor0)`
`186`	`271`
`187`	`272`	`returnencoding`
`188`	`273`
`189`		`-defseek(self,buffer,n):`
`190`		`-"""Unget buffer[n:]"""`
`191`		`-ifhasattr(self.rawStream,'unget'):`
`192`		`-self.rawStream.unget(buffer[n:])`
`193`		`-return`
`194`		`-`
`195`		`-ifhasattr(self.rawStream,'seek'):`
`196`		`-try:`
`197`		`-self.rawStream.seek(n)`
`198`		`-return`
`199`		`-exceptIOError:`
`200`		`-pass`
`201`		`-`
`202`		`-classBufferedStream:`
`203`		`-def__init__(self,data,stream):`
`204`		`-self.data=data`
`205`		`-self.stream=stream`
`206`		`-defread(self,chars=-1):`
`207`		`-ifchars==-1orchars>len(self.data):`
`208`		`-result=self.data`
`209`		`-self.data=''`
`210`		`-ifchars==-1:`
`211`		`-returnresult+self.stream.read()`
`212`		`-else:`
`213`		`-returnresult+self.stream.read(chars-len(result))`
`214`		`-elifnotself.data:`
`215`		`-returnself.stream.read(chars)`
`216`		`-else:`
`217`		`-result=self.data[:chars]`
`218`		`-self.data=self.data[chars:]`
`219`		`-returnresult`
`220`		`-defunget(self,data):`
`221`		`-ifself.data:`
`222`		`-self.data+=data`
`223`		`-else:`
`224`		`-self.data=data`
`225`		`-`
`226`		`-self.rawStream=BufferedStream(buffer[n:],self.rawStream)`
`227`		`-`
`228`	`274`	`defdetectEncodingMeta(self):`
`229`	`275`	`"""Report the encoding declared by the meta element`
`230`	`276`	`"""`
`231`	`277`	`buffer=self.rawStream.read(self.numBytesMeta)`
`232`	`278`	`parser=EncodingParser(buffer)`
`233`		`-self.seek(buffer,0)`
	`279`	`+self.rawStream.seek(0)`
`234`	`280`	`encoding=parser.getEncoding()`
	`281`	`+`
	`282`	`+ifencodingin ("utf-16","utf-16-be","utf-16-le"):`
	`283`	`+encoding="utf-8"`
	`284`	`+`
`235`	`285`	`returnencoding`
`236`	`286`
`237`	`287`	`defupdatePosition(self,chars):`
`@@ -485,13 +535,6 @@ def getEncoding(self):`
`485`	`535`	`break`
`486`	`536`	`ifnotkeepParsing:`
`487`	`537`	`break`
`488`		`-ifself.encodingisnotNone:`
`489`		`-self.encoding=self.encoding.strip()`
`490`		`-#Spec violation that complies with hsivonen + mjs`
`491`		`-if (ascii_punctuation_re.sub("",self.encoding)in`
`492`		`- ("utf16","utf16be","utf16le",`
`493`		`-"utf32","utf32be","utf32le")):`
`494`		`-self.encoding="utf-8"`
`495`	`538`
`496`	`539`	`returnself.encoding`
`497`	`540`
`@@ -666,11 +709,12 @@ def parse(self):`
`666`	`709`	`exceptStopIteration:`
`667`	`710`	`returnNone`
`668`	`711`
	`712`	`+`
`669`	`713`	`defcodecName(encoding):`
`670`	`714`	`"""Return the python codec name corresponding to an encoding or None if the`
`671`	`715`	`string doesn't correspond to a valid encoding."""`
`672`		`-if (encodingisnotNoneandtype(encoding)==types.StringType):`
	`716`	`+if (encodingisnotNoneandtype(encoding)intypes.StringTypes):`
`673`	`717`	`canonicalName=ascii_punctuation_re.sub("",encoding).lower()`
`674`		`-returnencodings.get(canonicalName,None)`
	`718`	`+returnencodings.get(canonicalName,None)`
`675`	`719`	`else:`
`676`	`720`	`returnNone`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit f47bc4f

File tree

12 files changed

12 files changed

`‎parse.py‎`

`‎src/html5lib/constants.py‎`

`‎src/html5lib/html5parser.py‎`

`‎src/html5lib/inputstream.py‎`

0 commit comments