|
14 | 14 | fromconstantsimportasciiLowercase,asciiLetters,asciiUpper2Lower |
15 | 15 | fromconstantsimportdigits,hexDigits,EOF |
16 | 16 | fromconstantsimporttokenTypes,tagTokenTypes |
| 17 | +fromconstantsimportreplacementCharacters |
17 | 18 |
|
18 | 19 | frominputstreamimportHTMLInputStream |
19 | 20 |
|
@@ -96,29 +97,37 @@ def consumeNumberEntity(self, isHex): |
96 | 97 | # Convert the set of characters consumed to an int. |
97 | 98 | charAsInt=int("".join(charStack),radix) |
98 | 99 |
|
99 | | -ifcharAsInt==13: |
| 100 | +# Certain characters get replaced with others |
| 101 | +ifcharAsIntinreplacementCharacters: |
| 102 | +char=replacementCharacters[charAsInt] |
100 | 103 | self.tokenQueue.append({"type":tokenTypes["ParseError"],"data": |
101 | | -"incorrect-cr-newline-entity"}) |
102 | | -charAsInt=10 |
103 | | -elif127<charAsInt<160: |
104 | | -# If the integer is between 127 and 160 (so 128 and bigger and 159 |
105 | | -# and smaller) we need to do the "windows trick". |
106 | | -self.tokenQueue.append({"type":tokenTypes["ParseError"],"data": |
107 | | -"illegal-windows-1252-entity"}) |
108 | | - |
109 | | -charAsInt=entitiesWindows1252[charAsInt-128] |
110 | | - |
111 | | -# Certain characters get replaced with U+FFFD |
112 | | -if ((charAsInt<=0x0008)or (charAsInt==0x000B)or (0x000E<=charAsInt<=0x001F) |
113 | | -or (0x007F<=charAsInt<=0x009F) |
114 | | -or (0xD800<=charAsInt<=0xDFFF)or (0xFDD0<=charAsInt<=0xFDEF) |
115 | | -or (charAsInt&0xFFFE==0xFFFE)# catch all U+?FFFE and U+?FFFF, where ? is 0..10 |
116 | | -or (0x10FFFF<charAsInt)): |
| 104 | +"illegal-codepoint-for-numeric-entity", |
| 105 | +"datavars": {"charAsInt":charAsInt}}) |
| 106 | +elif ((0xD800<=charAsInt<=0xDFFF)or |
| 107 | + (charAsInt>0x10FFFF)): |
117 | 108 | char=u"\uFFFD" |
118 | 109 | self.tokenQueue.append({"type":tokenTypes["ParseError"],"data": |
119 | 110 | "illegal-codepoint-for-numeric-entity", |
120 | 111 | "datavars": {"charAsInt":charAsInt}}) |
121 | 112 | else: |
| 113 | +#Should speed up this check somehow (e.g. move the set to a constant) |
| 114 | +if ((0x0001<=charAsInt<=0x0008)or |
| 115 | + (0x000E<=charAsInt<=0x001F)or |
| 116 | + (0x007F<=charAsInt<=0x009F)or |
| 117 | + (0xFDD0<=charAsInt<=0xFDEF)or |
| 118 | +charAsIntinfrozenset([0x000B,0xFFFE,0xFFFF,0x1FFFE, |
| 119 | +0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE, |
| 120 | +0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE, |
| 121 | +0x5FFFF,0x6FFFE,0x6FFFF,0x7FFFE, |
| 122 | +0x7FFFF,0x8FFFE,0x8FFFF,0x9FFFE, |
| 123 | +0x9FFFF,0xAFFFE,0xAFFFF,0xBFFFE, |
| 124 | +0xBFFFF,0xCFFFE,0xCFFFF,0xDFFFE, |
| 125 | +0xDFFFF,0xEFFFE,0xEFFFF,0xFFFFE, |
| 126 | +0xFFFFF,0x10FFFE,0x10FFFF])): |
| 127 | +self.tokenQueue.append({"type":tokenTypes["ParseError"], |
| 128 | +"data": |
| 129 | +"illegal-codepoint-for-numeric-entity", |
| 130 | +"datavars": {"charAsInt":charAsInt}}) |
122 | 131 | try: |
123 | 132 | # XXX We should have a separate function that does "int" to |
124 | 133 | # "unicodestring" conversion since this doesn't always work |
|