@@ -183,14 +183,10 @@ def __init__(self, source):
183183# Such platforms will have already checked for such
184184# surrogate errors, so no need to do this checking.
185185self .reportCharacterErrors = None
186- self .replaceCharactersRegexp = None
187186elif len ("\U0010FFFF " )== 1 :
188187self .reportCharacterErrors = self .characterErrorsUCS4
189- self .replaceCharactersRegexp = re .compile (eval ('"[\\ uD800-\\ uDFFF]"' ))
190188else :
191189self .reportCharacterErrors = self .characterErrorsUCS2
192- self .replaceCharactersRegexp = re .compile (
193- eval ('"([\\ uD800-\\ uDBFF](?![\\ uDC00-\\ uDFFF])|(?<![\\ uD800-\\ uDBFF])[\\ uDC00-\\ uDFFF])"' ))
194190
195191# List of where new lines occur
196192self .newLines = [0 ]
@@ -288,10 +284,7 @@ def readChunk(self, chunkSize=None):
288284if self .reportCharacterErrors :
289285self .reportCharacterErrors (data )
290286
291- # Replace invalid characters
292- # Note U+0000 is dealt with in the tokenizer
293- data = self .replaceCharactersRegexp .sub ("\ufffd " ,data )
294-
287+ # Normalize line endings: CRLF and lone CR both become LF
295288data = data .replace ("\r \n " ,"\n " )
296289data = data .replace ("\r " ,"\n " )
297290