Commit f5f28de

committed

Fix handling of numeric entity refs

1 parent 002347d, commit f5f28de. Copy full SHA for f5f28de

File tree

2 files changed

+64

-17

lines changed

src/html5lib
- constants.py
- tokenizer.py

2 files changed

+64

-17

lines changed

`‎src/html5lib/constants.py‎`

Lines changed: 38 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -878,6 +878,44 @@`
`878`	`878`	`"zwnj;":u"\u200C"`
`879`	`879`	`}`
`880`	`880`
	`881`	`+replacementCharacters= {`
	`882`	`+0x0:u"\uFFFD",`
	`883`	`+0x0d:u"\u000A",`
	`884`	`+0x80:u"\u20AC",`
	`885`	`+0x81:u"\u0081",`
	`886`	`+0x81:u"\u0081",`
	`887`	`+0x82:u"\u201A",`
	`888`	`+0x83:u"\u0192",`
	`889`	`+0x84:u"\u201E",`
	`890`	`+0x85:u"\u2026",`
	`891`	`+0x86:u"\u2020",`
	`892`	`+0x87:u"\u2021",`
	`893`	`+0x88:u"\u02C6",`
	`894`	`+0x89:u"\u2030",`
	`895`	`+0x8A:u"\u0160",`
	`896`	`+0x8B:u"\u2039",`
	`897`	`+0x8C:u"\u0152",`
	`898`	`+0x8D:u"\u008D",`
	`899`	`+0x8E:u"\u017D",`
	`900`	`+0x8F:u"\u008F",`
	`901`	`+0x90:u"\u0090",`
	`902`	`+0x91:u"\u2018",`
	`903`	`+0x92:u"\u2019",`
	`904`	`+0x93:u"\u201C",`
	`905`	`+0x94:u"\u201D",`
	`906`	`+0x95:u"\u2022",`
	`907`	`+0x96:u"\u2013",`
	`908`	`+0x97:u"\u2014",`
	`909`	`+0x98:u"\u02DC",`
	`910`	`+0x99:u"\u2122",`
	`911`	`+0x9A:u"\u0161",`
	`912`	`+0x9B:u"\u203A",`
	`913`	`+0x9C:u"\u0153",`
	`914`	`+0x9D:u"\u009D",`
	`915`	`+0x9E:u"\u017E",`
	`916`	`+0x9F:u"\u0178",`
	`917`	`+}`
	`918`	`+`
`881`	`919`	`encodings= {`
`882`	`920`	`'437':'cp437',`
`883`	`921`	`'850':'cp850',`

`‎src/html5lib/tokenizer.py‎`

Lines changed: 26 additions & 17 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,7 @@`
`14`	`14`	`from constants import asciiLowercase, asciiLetters, asciiUpper2Lower`
`15`	`15`	`from constants import digits, hexDigits, EOF`
`16`	`16`	`from constants import tokenTypes, tagTokenTypes`
	`17`	`+from constants import replacementCharacters`
`17`	`18`
`18`	`19`	`from inputstream import HTMLInputStream`
`19`	`20`
`@@ -96,29 +97,37 @@ def consumeNumberEntity(self, isHex):`
`96`	`97`	`# Convert the set of characters consumed to an int.`
`97`	`98`	`charAsInt=int("".join(charStack),radix)`
`98`	`99`
`99`		`-if charAsInt == 13:`
	`100`	`+# Certain characters get replaced with others`
	`101`	`+if charAsInt in replacementCharacters:`
	`102`	`+char = replacementCharacters[charAsInt]`
`100`	`103`	`self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":`
`101`		`-"incorrect-cr-newline-entity"})`
`102`		`-charAsInt=10`
`103`		`-elif 127 < charAsInt < 160:`
`104`		`-# If the integer is between 127 and 160 (so 128 and bigger and 159`
`105`		`-# and smaller) we need to do the "windows trick".`
`106`		`-self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":`
`107`		`-"illegal-windows-1252-entity"})`
`108`		`-`
`109`		`-charAsInt=entitiesWindows1252[charAsInt-128]`
`110`		`-`
`111`		`-# Certain characters get replaced with U+FFFD`
`112`		`-if ((charAsInt<=0x0008)or (charAsInt==0x000B)or (0x000E<=charAsInt<=0x001F)`
`113`		`-or (0x007F<=charAsInt<=0x009F)`
`114`		`-or (0xD800<=charAsInt<=0xDFFF)or (0xFDD0<=charAsInt<=0xFDEF)`
`115`		`-or (charAsInt&0xFFFE==0xFFFE)# catch all U+?FFFE and U+?FFFF, where ? is 0..10`
`116`		`-or (0x10FFFF<charAsInt)):`
	`104`	`+"illegal-codepoint-for-numeric-entity",`
	`105`	`+"datavars": {"charAsInt":charAsInt}})`
	`106`	`+elif ((0xD800<=charAsInt<=0xDFFF)or`
	`107`	`+ (charAsInt>0x10FFFF)):`
`117`	`108`	`char=u"\uFFFD"`
`118`	`109`	`self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":`
`119`	`110`	`"illegal-codepoint-for-numeric-entity",`
`120`	`111`	`"datavars": {"charAsInt":charAsInt}})`
`121`	`112`	`else:`
	`113`	`+#Should speed up this check somehow (e.g. move the set to a constant)`
	`114`	`+if ((0x0001<=charAsInt<=0x0008)or`
	`115`	`+ (0x000E<=charAsInt<=0x001F)or`
	`116`	`+ (0x007F<=charAsInt<=0x009F)or`
	`117`	`+ (0xFDD0<=charAsInt<=0xFDEF)or`
	`118`	`+charAsInt in frozenset([0x000B,0xFFFE,0xFFFF,0x1FFFE,`
	`119`	`+0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE,`
	`120`	`+0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,`
	`121`	`+0x5FFFF,0x6FFFE,0x6FFFF,0x7FFFE,`
	`122`	`+0x7FFFF,0x8FFFE,0x8FFFF,0x9FFFE,`
	`123`	`+0x9FFFF,0xAFFFE,0xAFFFF,0xBFFFE,`
	`124`	`+0xBFFFF,0xCFFFE,0xCFFFF,0xDFFFE,`
	`125`	`+0xDFFFF,0xEFFFE,0xEFFFF,0xFFFFE,`
	`126`	`+0xFFFFF,0x10FFFE,0x10FFFF])):`
	`127`	`+self.tokenQueue.append({"type":tokenTypes["ParseError"],`
	`128`	`+"data":`
	`129`	`+"illegal-codepoint-for-numeric-entity",`
	`130`	`+"datavars": {"charAsInt":charAsInt}})`
`122`	`131`	`try:`
`123`	`132`	`# XXX We should have a separate function that does "int" to`
`124`	`133`	`# "unicodestring" conversion since this doesn't always work`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit f5f28de

File tree

2 files changed

2 files changed

`‎src/html5lib/constants.py‎`

`‎src/html5lib/tokenizer.py‎`

0 commit comments