@@ -185,14 +185,10 @@ def __init__(self, source):
185185# Such platforms will have already checked for such
186186# surrogate errors, so no need to do this checking.
187187self .reportCharacterErrors = None
188- self .replaceCharactersRegexp = None
189188elif len ("\U0010FFFF " )== 1 :
190189self .reportCharacterErrors = self .characterErrorsUCS4
191- self .replaceCharactersRegexp = re .compile (eval ('"[\\ uD800-\\ uDFFF]"' ))
192190else :
193191self .reportCharacterErrors = self .characterErrorsUCS2
194- self .replaceCharactersRegexp = re .compile (
195- eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
196192
197193# List of where new lines occur
198194self .newLines = [0 ]
@@ -290,10 +286,7 @@ def readChunk(self, chunkSize=None):
290286if self .reportCharacterErrors :
291287self .reportCharacterErrors (data )
292288
293- # Replace invalid characters
294- # Note U+0000 is dealt with in the tokenizer
295- data = self .replaceCharactersRegexp .sub ("\ufffd " ,data )
296-
289+ # Normalize line endings to a single newline character
297290data = data .replace ("\r \n " ,"\n " )
298291data = data .replace ("\r " ,"\n " )
299292