html5lib/html5lib-python (Public)

NotificationsYou must be signed in to change notification settings
Fork302
Star1.2k

Commit babe4a3

committed

Attempt at merging svgmathml branch to the default branch

--HG--
branch : svgmathml
rename : python/parse.py => python3/parse.py
rename : python/src/html5lib/__init__.py => python3/src/html5lib/__init__.py
rename : python/src/html5lib/constants.py => python3/src/html5lib/constants.py
rename : python/src/html5lib/filters/optionaltags.py => python3/src/html5lib/filters/optionaltags.py
rename : python/src/html5lib/html5parser.py => python3/src/html5lib/html5parser.py
rename : python/src/html5lib/inputstream.py => python3/src/html5lib/inputstream.py
rename : python/src/html5lib/sanitizer.py => python3/src/html5lib/sanitizer.py
rename : python/src/html5lib/serializer/__init__.py => python3/src/html5lib/serializer/__init__.py
rename : python/src/html5lib/tokenizer.py => python3/src/html5lib/tokenizer.py
rename : python/src/html5lib/treebuilders/etree_lxml.py => python3/src/html5lib/treebuilders/etree_lxml.py
rename : python/src/html5lib/treebuilders/simpletree.py => python3/src/html5lib/treebuilders/simpletree.py
rename : python/tests/test_encoding.py => python3/tests/test_encoding.py
rename : python/tests/test_parser.py => python3/tests/test_parser.py
rename : python/tests/test_tokenizer.py => python3/tests/test_tokenizer.py

1 parent 768ba79, commit babe4a3 (Copy full SHA for babe4a3)

File tree

8 files changed

+35

-36

lines changed

8 files changed

+35

-36

lines changed

`‎parse.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -57,7 +57,7 @@ def parse():`
`57`	`57`	`else:`
`58`	`58`	`tokenizer=HTMLTokenizer`
`59`	`59`
`60`		`-ifopts.xml:`
	`60`	`+ifopts.liberalxml:`
`61`	`61`	`p=liberalxmlparser.XHTMLParser(tree=treebuilder,tokenizer=tokenizer)`
`62`	`62`	`else:`
`63`	`63`	`p=html5parser.HTMLParser(tree=treebuilder,tokenizer=tokenizer)`

`‎src/html5lib/constants.py‎`

Lines changed: 0 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1070,7 +1070,6 @@`
`1070`	`1070`	`'utf16':'utf-16',`
`1071`	`1071`	`'utf16be':'utf-16-be',`
`1072`	`1072`	`'utf16le':'utf-16-le',`
`1073`		`-'utf7':'utf-7',`
`1074`	`1073`	`'utf8':'utf-8',`
`1075`	`1074`	`'windows1250':'cp1250',`
`1076`	`1075`	`'windows1251':'cp1251',`

`‎src/html5lib/filters/optionaltags.py‎`

Lines changed: 7 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -31,7 +31,11 @@ def is_optional_start(self, tagname, previous, next):`
`31`	`31`	`eliftagname=='head':`
`32`	`32`	`# A head element's start tag may be omitted if the first thing`
`33`	`33`	`# inside the head element is an element.`
`34`		`-returntype=="StartTag"`
	`34`	`+# XXX: we also omit the start tag if the head element is empty`
	`35`	`+iftypein ("StartTag","EmptyTag"):`
	`36`	`+returnTrue`
	`37`	`+eliftype=="EndTag":`
	`38`	`+returnnext["name"]=="head"`
`35`	`39`	`eliftagname=='body':`
`36`	`40`	`# A body element's start tag may be omitted if the first thing`
`37`	`41`	`# inside the body element is not a space character or a comment,`
`@@ -52,7 +56,7 @@ def is_optional_start(self, tagname, previous, next):`
`52`	`56`	`# inside the colgroup element is a col element, and if the element`
`53`	`57`	`# is not immediately preceeded by another colgroup element whose`
`54`	`58`	`# end tag has been omitted.`
`55`		`-iftype=="StartTag":`
	`59`	`+iftypein ("StartTag","EmptyTag"):`
`56`	`60`	`# XXX: we do not look at the preceding event, so instead we never`
`57`	`61`	`# omit the colgroup element's end tag when it is immediately`
`58`	`62`	`# followed by another colgroup element. See is_optional_end.`
`@@ -114,7 +118,7 @@ def is_optional_end(self, tagname, next):`
`114`	`118`	`# footer, form, h1, h2, h3, h4, h5, h6, header, hr, menu,`
`115`	`119`	`# nav, ol, p, pre, section, table, or ul, element, or if`
`116`	`120`	`# there is no more content in the parent element.`
`117`		`-iftype=="StartTag":`
	`121`	`+iftypein ("StartTag","EmptyTag"):`
`118`	`122`	`returnnext["name"]in ('address','article','aside', \`
`119`	`123`	`'blockquote','datagrid','dialog','dir','div', \`
`120`	`124`	`'dl','fieldset','footer','form','h1','h2','h3', \`

`‎src/html5lib/html5parser.py‎`

Lines changed: 17 additions & 27 deletions

Original file line number	Diff line number	Diff line change
`@@ -108,7 +108,6 @@ def _parse(self, stream, innerHTML=False, container="div",`
`108`	`108`	`# We only seem to have InBodyPhase testcases where the following is`
`109`	`109`	`# relevant ... need others too`
`110`	`110`	`self.lastPhase=None`
`111`		`-`
`112`	`111`	`self.beforeRCDataPhase=None`
`113`	`112`
`114`	`113`	`CharactersToken=tokenTypes["Characters"]`
`@@ -120,6 +119,8 @@ def _parse(self, stream, innerHTML=False, container="div",`
`120`	`119`
`121`	`120`
`122`	`121`	`fortokeninself.normalizedTokens():`
	`122`	`+#print self.phase.__class__.__name__`
	`123`	`+#print token`
`123`	`124`	`type=token["type"]`
`124`	`125`	`iftype==CharactersToken:`
`125`	`126`	`self.phase.processCharacters(token)`
`@@ -271,18 +272,6 @@ def __init__(self, parser, tree):`
`271`	`272`
`272`	`273`	`defprocessEOF(self):`
`273`	`274`	`raiseNotImplementedError`
`274`		`-self.tree.generateImpliedEndTags()`
`275`		`-iflen(self.tree.openElements)>2:`
`276`		`-self.parser.parseError("expected-closing-tag-but-got-eof")`
`277`		`-eliflen(self.tree.openElements)==2and\`
`278`		`-self.tree.openElements[1].name!="body":`
`279`		`-# This happens for framesets or something?`
`280`		`-self.parser.parseError("expected-closing-tag-but-got-eof")`
`281`		`-elifself.parser.innerHTMLandlen(self.tree.openElements)>1 :`
`282`		`-# XXX This is not what the specification says. Not sure what to do`
`283`		`-# here.`
`284`		`-self.parser.parseError("eof-in-innerhtml")`
`285`		`-# Betting ends.`
`286`	`275`
`287`	`276`	`defprocessComment(self,token):`
`288`	`277`	`# For most phases the following is correct. Where it's not it will be`
`@@ -318,7 +307,7 @@ class InitialPhase(Phase):`
`318`	`307`	`# this.`
`319`	`308`	`defprocessEOF(self):`
`320`	`309`	`self.parser.parseError("expected-doctype-but-got-eof")`
`321`		`-self.compatMode="quirks"`
	`310`	`+self.parser.compatMode="quirks"`
`322`	`311`	`self.parser.phase=self.parser.phases["beforeHtml"]`
`323`	`312`	`self.parser.phase.processEOF()`
`324`	`313`
`@@ -346,8 +335,9 @@ def processDoctype(self, token):`
`346`	`335`	`ifpublicId!="":`
`347`	`336`	`publicId=publicId.translate(asciiUpper2Lower)`
`348`	`337`
`349`		`-if (notcorrectortoken["name"]!="html"`
`350`		`-orpublicIdin`
	`338`	`+`
	`339`	`+if ((notcorrect)ornameLower!="html"`
	`340`	`+orpublicIdin`
`351`	`341`	`("+//silmaril//dtd html pro v0r11 19970101//en",`
`352`	`342`	`"-//advasoft ltd//dtd html 3.0 aswedit + extensions//en",`
`353`	`343`	`"-//as//dtd html 3.0 aswedit + extensions//en",`
`@@ -419,19 +409,18 @@ def processDoctype(self, token):`
`419`	`409`	`"html")`
`420`	`410`	`or (publicIdin`
`421`	`411`	`("-//w3c//dtd html 4.01 frameset//EN",`
`422`		`-"-//w3c//dtd html 4.01 transitional//EN")and`
`423`		`-systemId==None)`
	`412`	`+"-//w3c//dtd html 4.01 transitional//EN")andsystemId==None)`
`424`	`413`	`or (systemId!=Noneand`
`425`		`-systemId=="http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):`
`426`		`-self.compatMode="quirks"`
	`414`	`+systemId==`
	`415`	`+"http://www.ibm.com/data/dtd/v11/ibmxhtml1-transitional.dtd")):`
	`416`	`+self.parser.compatMode="quirks"`
`427`	`417`	`elif (publicIdin`
`428`		`-("-//w3c//dtd xhtml 1.0 frameset//EN",`
`429`		`-"-//w3c//dtd xhtml 1.0 transitional//EN")`
	`418`	`+ ("-//w3c//dtd xhtml 1.0 frameset//EN",`
	`419`	`+"-//w3c//dtd xhtml 1.0 transitional//EN")`
`430`	`420`	`or (publicIdin`
`431`	`421`	`("-//w3c//dtd html 4.01 frameset//EN",`
`432`		`-"-//w3c//dtd html 4.01 transitional//EN")and`
`433`		`-systemId==None)):`
`434`		`-self.compatMode="limited quirks"`
	`422`	`+"-//w3c//dtd html 4.01 transitional//EN")andsystemId==None)):`
	`423`	`+self.parser.compatMode="limited quirks"`
`435`	`424`
`436`	`425`	`self.parser.phase=self.parser.phases["beforeHtml"]`
`437`	`426`
`@@ -440,7 +429,7 @@ def processSpaceCharacters(self, token):`
`440`	`429`
`441`	`430`	`defprocessCharacters(self,token):`
`442`	`431`	`self.parser.parseError("expected-doctype-but-got-chars")`
`443`		`-self.compatMode="quirks"`
	`432`	`+self.parser.compatMode="quirks"`
`444`	`433`	`self.parser.phase=self.parser.phases["beforeHtml"]`
`445`	`434`	`self.parser.phase.processCharacters(token)`
`446`	`435`
`@@ -595,7 +584,8 @@ def startTagMeta(self, token):`
`595`	`584`	`codec=inputstream.codecName(attributes["charset"])`
`596`	`585`	`self.parser.tokenizer.stream.changeEncoding(codec)`
`597`	`586`	`elif"content"inattributes:`
`598`		`-data=inputstream.EncodingBytes(attributes["content"])`
	`587`	`+data=inputstream.EncodingBytes(`
	`588`	`+attributes["content"].encode(self.parser.tokenizer.stream.charEncoding[0]))`
`599`	`589`	`parser=inputstream.ContentAttrParser(data)`
`600`	`590`	`codec=parser.parse()`
`601`	`591`	`self.parser.tokenizer.stream.changeEncoding(codec)`

`‎src/html5lib/inputstream.py‎`

Lines changed: 6 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -1,6 +1,7 @@`
`1`	`1`	`importcodecs`
`2`	`2`	`importre`
`3`	`3`	`importtypes`
	`4`	`+importsys`
`4`	`5`
`5`	`6`	`from .constantsimportEOF,spaceCharacters,asciiLetters,asciiUppercase`
`6`	`7`	`from .constantsimportencodings,ReparseException`
`@@ -188,7 +189,8 @@ def openStream(self, source):`
`188`	`189`	`importio`
`189`	`190`	`stream=io.BytesIO(bytes(source))`
`190`	`191`
`191`		`-ifnot(hasattr(stream,"tell")andhasattr(stream,"seek")):`
	`192`	`+if (not(hasattr(stream,"tell")andhasattr(stream,"seek"))or`
	`193`	`+streamissys.stdin):`
`192`	`194`	`stream=BufferedStream(stream)`
`193`	`195`
`194`	`196`	`returnstream`
`@@ -452,6 +454,9 @@ class EncodingBytes(bytes):`
`452`	`454`	`"""Bytes-like object with an assosiated position and various extra methods`
`453`	`455`	`If the position is ever greater than the string length then an exception is`
`454`	`456`	`raised"""`
	`457`	`+def__new__(self,value):`
	`458`	`+returnstr.__new__(self,value)`
	`459`	`+`
`455`	`460`	`def__init__(self,value):`
`456`	`461`	`self._position=-1`
`457`	`462`

`‎src/html5lib/sanitizer.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -152,7 +152,7 @@ def sanitize_token(self, token):`
`152`	`152`	`continue`
`153`	`153`	val_unescaped=re.sub("[`\000-\040\177-\240\s]+",'',
`154`	`154`	`unescape(attrs[attr])).lower()`
`155`		`-if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)or`
	`155`	`+if (re.match("^[a-z0-9][-+.a-z0-9]*:",val_unescaped)and`
`156`	`156`	`(val_unescaped.split(':')[0]notin`
`157`	`157`	`self.allowed_protocols)):`
`158`	`158`	`delattrs[attr]`

`‎src/html5lib/tokenizer.py‎`

Lines changed: 1 addition & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -142,7 +142,7 @@ def consumeNumberEntity(self, isHex):`
`142`	`142`	`# Certain characters get replaced with U+FFFD`
`143`	`143`	`if ((charAsInt<=0x0008)or (charAsInt==0x000B)or (0x000E<=charAsInt<=0x001F)`
`144`	`144`	`or (0x007F<=charAsInt<=0x009F)`
`145`		`-or (0xD800<=charAsInt<=0xDFFF)or (0xFDD0<=charAsInt<=0xFDDF)`
	`145`	`+or (0xD800<=charAsInt<=0xDFFF)or (0xFDD0<=charAsInt<=0xFDEF)`
`146`	`146`	`or (charAsInt&0xFFFE==0xFFFE)# catch all U+?FFFE and U+?FFFF, where ? is 0..10`
`147`	`147`	`or (0x10FFFF<charAsInt)):`
`148`	`148`	`char="\uFFFD"`

`‎tests/test_parser.py‎`

Lines changed: 2 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -142,7 +142,8 @@ def buildTestSuite():`
`142`	`142`	`deftestFunc(self,innerHTML=innerHTML,input=input,`
`143`	`143`	`expected=expected,errors=errors,treeCls=treeCls):`
`144`	`144`	`returnself.runParserTest(innerHTML,input,expected,errors,treeCls)`
`145`		`-setattr(TestCase,"test_%s_%d_%s"% (testName,index+1,treeName),`
	`145`	`+testFunc.__name__="test_%s_%d_%s"% (testName,index+1,treeName)`
	`146`	`+setattr(TestCase,testFunc.__name__,`
`146`	`147`	`testFunc)`
`147`	`148`
`148`	`149`	`returnunittest.TestLoader().loadTestsFromTestCase(TestCase)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commitbabe4a3

File tree

8 files changed

8 files changed

`‎parse.py‎`

`‎src/html5lib/constants.py‎`

`‎src/html5lib/filters/optionaltags.py‎`

`‎src/html5lib/html5parser.py‎`

`‎src/html5lib/inputstream.py‎`

`‎src/html5lib/sanitizer.py‎`

`‎src/html5lib/tokenizer.py‎`

`‎tests/test_parser.py‎`

0 commit comments