@@ -2211,28 +2211,29 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
2211
2211
'type ' =>self ::PARSEERROR ,
2212
2212
'data ' =>'illegal-windows-1252-entity '
2213
2213
));
2214
- $ codepoint = $ new_codepoint ;
2214
+ return HTML5_Data:: utf8chr ( $ new_codepoint) ;
2215
2215
}else {
2216
- // our logic is structured a little differently from the
2217
- // spec's but they're equivalent. The transform is:
2218
- // spec:
2219
- // return character for codepoint
2220
- // if in range:
2221
- // parse error
2222
- // ours:
2223
- // if in range:
2224
- // parse error
2225
- // return character for codepoint
2226
- /* Otherwise, if the number is in the range 0x0000 to 0x0008,
2227
- U+000B, U+000E to 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF ,
2228
- 0xFDD0 to 0xFDEF, or is one of 0xFFFE, 0xFFFF, 0x1FFFE, 0x1FFFF,
2229
- 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE,
2230
- 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE, 0x7FFFF, 0x8FFFE, 0x8FFFF,
2231
- 0x9FFFE, 0x9FFFF, 0xAFFFE, 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE,
2232
- 0xCFFFF, 0xDFFFE, 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF,
2233
- 0x10FFFE, or 0x10FFFF, or is higher than 0x10FFFF, then this
2234
- is a parse error; return a character token for the U+FFFD
2235
- REPLACEMENT CHARACTER character instead. */
2216
+ /* Otherwise, if the number is greater than 0x10FFFF, then
2217
+ * this is a parse error. Return a U+FFFD REPLACEMENT
2218
+ * CHARACTER. */
2219
+ if ($ codepoint >0x10FFFF ) {
2220
+ $ this ->emitToken (array (
2221
+ 'type ' =>self ::PARSEERROR ,
2222
+ 'data ' =>'overlong-character-entity ' // XXX probably not correct
2223
+ ));
2224
+ return "\xEF\xBF\xBD" ;
2225
+ }
2226
+ /* Otherwise, return a character token for the Unicode
2227
+ * character whose code point is that number. If the
2228
+ * number is in the range 0x0001 to 0x0008, 0x000E to
2229
+ * 0x001F, 0x007F to 0x009F, 0xD800 to 0xDFFF, 0xFDD0 to
2230
+ * 0xFDEF, or is one of 0x000B, 0xFFFE, 0xFFFF, 0x1FFFE,
2231
+ * 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE, 0x3FFFF, 0x4FFFE,
2232
+ * 0x4FFFF, 0x5FFFE, 0x5FFFF, 0x6FFFE, 0x6FFFF, 0x7FFFE,
2233
+ * 0x7FFFF, 0x8FFFE, 0x8FFFF, 0x9FFFE, 0x9FFFF, 0xAFFFE,
2234
+ * 0xAFFFF, 0xBFFFE, 0xBFFFF, 0xCFFFE, 0xCFFFF, 0xDFFFE,
2235
+ * 0xDFFFF, 0xEFFFE, 0xEFFFF, 0xFFFFE, 0xFFFFF, 0x10FFFE,
2236
+ * or 0x10FFFF, then this is a parse error. */
2236
2237
// && has higher precedence than ||
2237
2238
if (
2238
2239
$ codepoint >=0x0000 &&$ codepoint <=0x0008 ||
@@ -2242,18 +2243,15 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
2242
2243
$ codepoint >=0xD800 &&$ codepoint <=0xDFFF ||
2243
2244
$ codepoint >=0xFDD0 &&$ codepoint <=0xFDEF ||
2244
2245
($ codepoint &0xFFFE ) ===0xFFFE ||
2245
- $ codepoint> 0x10FFFF
2246
+ $ codepoint== 0x10FFFF || $ codepoint == 0x10FFFE
2246
2247
) {
2247
2248
$ this ->emitToken (array (
2248
2249
'type ' =>self ::PARSEERROR ,
2249
2250
'data ' =>'illegal-codepoint-for-numeric-entity '
2250
2251
));
2251
2252
}
2253
+ return HTML5_Data::utf8chr ($ codepoint );
2252
2254
}
2253
-
2254
- /* Otherwise, return a character token for the Unicode
2255
- character whose code point is that number. */
2256
- return HTML5_Data::utf8chr ($ codepoint );
2257
2255
}
2258
2256
2259
2257
}else {