We read every piece of feedback, and take your input very seriously.
To see all available qualifiers, see our documentation.
There was an error while loading. Please reload this page.
1 parent 7cce65b, commit 30ea6e4. Copy full SHA for 30ea6e4.
html5lib/inputstream.py
@@ -167,10 +167,8 @@ def __init__(self, source):
167
# Craziness
168
iflen("\U0010FFFF")==1:
169
self.reportCharacterErrors=self.characterErrorsUCS4
170
-self.replaceCharactersRegexp=re.compile("[\uD800-\uDFFF]")
171
else:
172
self.reportCharacterErrors=self.characterErrorsUCS2
173
-self.replaceCharactersRegexp=re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
174
175
# List of where new lines occur
176
self.newLines= [0]
@@ -268,9 +266,6 @@ def readChunk(self, chunkSize=None):
268
266
self.reportCharacterErrors(data)
269
267
270
# Replace invalid characters
271
-# Note U+0000 is dealt with in the tokenizer
272
-data=self.replaceCharactersRegexp.sub("\ufffd",data)
273
-
274
data=data.replace("\r\n","\n")
275
data=data.replace("\r","\n")
276