@@ -173,8 +173,17 @@ def consumeNumberEntity(self, isHex):
173173
174174charAsInt = entitiesWindows1252 [charAsInt - 128 ]
175175
176- # 0 is not a good number, neither are illegal Unicode code points (higher than 0x10FFFF) or surrogate characters (in the range 0xD800 to 0xDFFF).
177- if 0 < charAsInt and charAsInt <= 1114111 and not (55296 <= charAsInt and charAsInt <= 57343 ):
176+ # Certain characters get replaced with U+FFFD
177+ if ((charAsInt <= 0x0008 )or (charAsInt == 0x000B )or (0x000E <= charAsInt <= 0x001F )
178+ or (0x007F <= charAsInt <= 0x009F )
179+ or (0xD800 <= charAsInt <= 0xDFFF )or (0xFDD0 <= charAsInt <= 0xFDDF )
180+ or (charAsInt & 0xFFFE == 0xFFFE )# catch all U+?FFFE and U+?FFFF, where ? is 0..10
181+ or (0x10FFFF < charAsInt )):
182+ char = u"\uFFFD "
183+ self .tokenQueue .append ({"type" :"ParseError" ,"data" :
184+ "illegal-codepoint-for-numeric-entity" ,
185+ "datavars" : {"charAsInt" :charAsInt }})
186+ else :
178187try :
179188# XXX We should have a separate function that does "int" to
180189# "unicodestring" conversion since this doesn't always work
@@ -187,11 +196,6 @@ def consumeNumberEntity(self, isHex):
187196self .tokenQueue .append ({"type" :"ParseError" ,"data" :
188197"cant-convert-numeric-entity" ,
189198"datavars" : {"charAsInt" :charAsInt }})
190- else :
191- char = u"\uFFFD "
192- self .tokenQueue .append ({"type" :"ParseError" ,"data" :
193- "illegal-codepoint-for-numeric-entity" ,
194- "datavars" : {"charAsInt" :charAsInt }})
195199
196200# Discard the ; if present. Otherwise, put it back on the queue and
197201# invoke parseError on parser.