Commit b51828b

committed

Allow for Python implementations that don't support lone surrogates (read: Jython).

This is based on earlier work by Jim Baker (thanks!). The two major parts of this are: * Avoiding having lone surrogates in any string literals, and * Avoiding tests that contain lone surrogates. As part of this, the decoder for double-escaped tokenizer tests is rewritten to avoid unicode_escape as that has bogus behaviour with non-ASCII characters.

1 parent b293489 commit b51828b Copy full SHA for b51828b

File tree

5 files changed

+87

-14

lines changed

AUTHORS.rst
CHANGES.rst
html5lib
- inputstream.py
- tests
  - test_tokenizer.py
- utils.py

5 files changed

+87

-14

lines changed

`‎AUTHORS.rst`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -32,3 +32,4 @@ Patches and suggestions`
`32`	`32`	`- Juan Carlos Garcia Segovia`
`33`	`33`	`- Mike West`
`34`	`34`	`- Marc DM`
	`35`	`+- Jim Baker`

`‎CHANGES.rst`

Lines changed: 3 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,9 +4,10 @@ Change Log`
`4`	`4`	`0.9999`
`5`	`5`	`~~~~~~`
`6`	`6`
`7`		`-Released on XXX,2014`
	`7`	`+Released on XXX,2015`
`8`	`8`
`9`		`-* XXX`
	`9`	`+* Add support for Python implementations that don't support lone surrogates`
	`10`	`+ (read: Jython).`
`10`	`11`
`11`	`12`
`12`	`13`	`0.999`

`‎html5lib/inputstream.py`

Lines changed: 26 additions & 9 deletions

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,18 @@ class BufferedIOBase(object):`
`28`	`28`	`asciiUppercaseBytes=frozenset([item.encode("ascii")foriteminasciiUppercase])`
`29`	`29`	`spacesAngleBrackets=spaceCharactersBytes\|frozenset([b">",b"<"])`
`30`	`30`
`31`		`-invalid_unicode_re=re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")`
	`31`	`+`
	`32`	`+invalid_unicode_no_surrogate="[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]"`
	`33`	`+`
	`34`	`+ifutils.supports_lone_surrogates:`
	`35`	`+# Use one extra step of indirection and create surrogates with`
	`36`	`+# unichr. Not using this indirection would introduce an illegal`
	`37`	`+# unicode literal on platforms not supporting such lone`
	`38`	`+# surrogates.`
	`39`	`+invalid_unicode_re=re.compile(invalid_unicode_no_surrogate+`
	`40`	`+eval('"\\uD800-\\uDFFF"'))`
	`41`	`+else:`
	`42`	`+invalid_unicode_re=re.compile(invalid_unicode_no_surrogate)`
`32`	`43`
`33`	`44`	`non_bmp_invalid_codepoints=set([0x1FFFE,0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE,`
`34`	`45`	`0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,0x5FFFF,`
`@@ -164,13 +175,18 @@ def __init__(self, source):`
`164`	`175`
`165`	`176`	`"""`
`166`	`177`
`167`		`-# Craziness`
`168`		`-iflen("\U0010FFFF")==1:`
	`178`	`+ifnotutils.supports_lone_surrogates:`
	`179`	`+# Such platforms will have already checked for such`
	`180`	`+# surrogate errors, so no need to do this checking.`
	`181`	`+self.reportCharacterErrors=None`
	`182`	`+self.replaceCharactersRegexp=None`
	`183`	`+eliflen("\U0010FFFF")==1:`
`169`	`184`	`self.reportCharacterErrors=self.characterErrorsUCS4`
`170`		`-self.replaceCharactersRegexp=re.compile("[\uD800-\uDFFF]")`
	`185`	`+self.replaceCharactersRegexp=re.compile(eval('"[\\uD800-\\uDFFF]"'))`
`171`	`186`	`else:`
`172`	`187`	`self.reportCharacterErrors=self.characterErrorsUCS2`
`173`		`-self.replaceCharactersRegexp=re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])\|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")`
	`188`	`+self.replaceCharactersRegexp=re.compile(`
	`189`	`+eval('"([\\uD800-\\uDBFF](?![\\uDC00-\\uDFFF])\|(?<![\\uD800-\\uDBFF])[\\uDC00-\\uDFFF])"'))`
`174`	`190`
`175`	`191`	`# List of where new lines occur`
`176`	`192`	`self.newLines= [0]`
`@@ -265,11 +281,12 @@ def readChunk(self, chunkSize=None):`
`265`	`281`	`self._bufferedCharacter=data[-1]`
`266`	`282`	`data=data[:-1]`
`267`	`283`
`268`		`-self.reportCharacterErrors(data)`
	`284`	`+ifself.reportCharacterErrors:`
	`285`	`+self.reportCharacterErrors(data)`
`269`	`286`
`270`		`-# Replace invalid characters`
`271`		`-# Note U+0000 is dealt with in the tokenizer`
`272`		`-data=self.replaceCharactersRegexp.sub("\ufffd",data)`
	`287`	`+# Replace invalid characters`
	`288`	`+# Note U+0000 is dealt with in the tokenizer`
	`289`	`+data=self.replaceCharactersRegexp.sub("\ufffd",data)`
`273`	`290`
`274`	`291`	`data=data.replace("\r\n","\n")`
`275`	`292`	`data=data.replace("\r","\n")`

`‎html5lib/tests/test_tokenizer.py`

Lines changed: 35 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -4,10 +4,12 @@`
`4`	`4`	`importwarnings`
`5`	`5`	`importre`
`6`	`6`
	`7`	`+fromsiximportunichr`
	`8`	`+`
`7`	`9`	`from .supportimportget_data_files`
`8`	`10`
`9`	`11`	`fromhtml5lib.tokenizerimportHTMLTokenizer`
`10`		`-fromhtml5libimportconstants`
	`12`	`+fromhtml5libimportconstants,utils`
`11`	`13`
`12`	`14`
`13`	`15`	`classTokenizerTestParser(object):`
`@@ -122,9 +124,38 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,`
`122`	`124`	`returntokens["expected"]==tokens["received"]`
`123`	`125`
`124`	`126`
	`127`	`+_surrogateRe=re.compile(r"\\u([0-9A-Fa-f]{4})(?:\\u([0-9A-Fa-f]{4}))?")`
	`128`	`+`
	`129`	`+`
`125`	`130`	`defunescape(test):`
`126`	`131`	`defdecode(inp):`
`127`		`-returninp.encode("utf-8").decode("unicode-escape")`
	`132`	`+"""Decode\\uXXXX escapes`
	`133`	`+`
	`134`	`+ This decodes\\uXXXX escapes, possibly into non-BMP characters when`
	`135`	`+ two surrogate character escapes are adjacent to each other.`
	`136`	`+ """`
	`137`	`+# This cannot be implemented using the unicode_escape codec`
	`138`	`+# because that requires its input be ISO-8859-1, and we need`
	`139`	`+# arbitrary unicode as input.`
	`140`	`+defrepl(m):`
	`141`	`+ifm.group(2)isnotNone:`
	`142`	`+high=int(m.group(1),16)`
	`143`	`+low=int(m.group(2),16)`
	`144`	`+if0xD800<=high<=0xDBFFand0xDC00<=low<=0xDFFF:`
	`145`	`+cp= ((high-0xD800)<<10)+ (low-0xDC00)+0x10000`
	`146`	`+returnunichr(cp)`
	`147`	`+else:`
	`148`	`+returnunichr(high)+unichr(low)`
	`149`	`+else:`
	`150`	`+returnunichr(int(m.group(1),16))`
	`151`	`+try:`
	`152`	`+return_surrogateRe.sub(repl,inp)`
	`153`	`+exceptValueError:`
	`154`	`+# This occurs when unichr throws ValueError, which should`
	`155`	`+# only be for a lone-surrogate.`
	`156`	`+ifutils.supports_lone_surrogates:`
	`157`	`+raise`
	`158`	`+returnNone`
`128`	`159`
`129`	`160`	`test["input"]=decode(test["input"])`
`130`	`161`	`fortokenintest["output"]:`
`@@ -183,6 +214,8 @@ def testTokenizer():`
`183`	`214`	`test["initialStates"]= ["Data state"]`
`184`	`215`	`if'doubleEscaped'intest:`
`185`	`216`	`test=unescape(test)`
	`217`	`+iftest["input"]isNone:`
	`218`	`+continue# Not valid input for this platform`
`186`	`219`	`forinitialStateintest["initialStates"]:`
`187`	`220`	`test["initialState"]=capitalize(initialState)`
`188`	`221`	`yieldrunTokenizerTest,test`

`‎html5lib/utils.py`

Lines changed: 22 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -2,14 +2,35 @@`
`2`	`2`
`3`	`3`	`fromtypesimportModuleType`
`4`	`4`
	`5`	`+fromsiximporttext_type`
	`6`	`+`
`5`	`7`	`try:`
`6`	`8`	`importxml.etree.cElementTreeasdefault_etree`
`7`	`9`	`exceptImportError:`
`8`	`10`	`importxml.etree.ElementTreeasdefault_etree`
`9`	`11`
`10`	`12`
`11`	`13`	`__all__= ["default_etree","MethodDispatcher","isSurrogatePair",`
`12`		`-"surrogatePairToCodepoint","moduleFactoryFactory"]`
	`14`	`+"surrogatePairToCodepoint","moduleFactoryFactory",`
	`15`	`+"supports_lone_surrogates"]`
	`16`	`+`
	`17`	`+`
	`18`	`+# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be`
	`19`	`+# caught by the below test. In general this would be any platform`
	`20`	`+# using UTF-16 as its encoding of unicode strings, such as`
	`21`	`+# Jython. This is because UTF-16 itself is based on the use of such`
	`22`	`+# surrogates, and there is no mechanism to further escape such`
	`23`	`+# escapes.`
	`24`	`+try:`
	`25`	`+_x=eval('"\\uD800"')`
	`26`	`+ifnotisinstance(_x,text_type):`
	`27`	`+# We need this with u"" because of http://bugs.jython.org/issue2039`
	`28`	`+_x=eval('u"\\uD800"')`
	`29`	`+assertisinstance(_x,text_type)`
	`30`	`+except:`
	`31`	`+supports_lone_surrogates=False`
	`32`	`+else:`
	`33`	`+supports_lone_surrogates=True`
`13`	`34`
`14`	`35`
`15`	`36`	`classMethodDispatcher(dict):`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit b51828b

File tree

5 files changed

5 files changed

`‎AUTHORS.rst`

`‎CHANGES.rst`

`‎html5lib/inputstream.py`

`‎html5lib/tests/test_tokenizer.py`

`‎html5lib/utils.py`

0 commit comments