Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitb95d51c

Browse files
committed
Fix test-case failures due to character reference parsing bugs.
More completely, change the entire structure of the NCR array to be a multi-dimensional array, with each dimension having as a key one character and as a value the possible further steps and, sometimes, 'codepoint', the code point which that position represents. Hence, to get &amp; now, you would need $ncrs['a']['m']['p'][';']['codepoint']. 'codepoint' is required so we can cope with both &amp and &amp; (which means it can't just be a value of &amp). This also removes Data::getNamedCharacterReferenceMaxLength(), as it is now useless.
1 parentef63fc9 commitb95d51c

File tree

4 files changed

+63
-36
lines changed

4 files changed

+63
-36
lines changed

‎library/HTML5/Data.php

Lines changed: 0 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -65,16 +65,6 @@ public static function getNamedCharacterReferences() {
6565
returnself::$namedCharacterReferences;
6666
}
6767

68-
publicstaticfunctiongetNamedCharacterReferenceMaxLength() {
69-
if (!self::$namedCharacterReferenceMaxLength) {
70-
$namedCharacterReferences =self::getNamedCharacterReferences();
71-
$lengths =array_map('strlen',array_keys($namedCharacterReferences));
72-
self::$namedCharacterReferenceMaxLength =max($lengths);
73-
}
74-
returnself::$namedCharacterReferenceMaxLength;
75-
}
76-
77-
7868
/**
7969
* Converts a Unicode codepoint to sequence of UTF-8 bytes.
8070
* @note Shamelessly stolen from HTML Purifier, which is also

‎library/HTML5/Tokenizer.php

Lines changed: 38 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -2197,21 +2197,32 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
21972197
with the consumed characters matching one of the
21982198
identifiers in the first column of the named character
21992199
references table (in a case-sensitive manner). */
2200-
2201-
// we will implement this by matching the longest
2202-
// alphanumeric + semicolon string, and then working
2203-
// our way backwards
2204-
$chars .=$this->stream->charsWhile(self::DIGIT .self::ALPHA .';', HTML5_Data::getNamedCharacterReferenceMaxLength() -1);
2205-
$len =strlen($chars);
2200+
// What we actually do here is consume as much as we can while it
2201+
// matches the start of one of the identifiers in the first column.
22062202

22072203
$refs = HTML5_Data::getNamedCharacterReferences();
2204+
2205+
// Get the longest string which is the start of an identifier
2206+
// ($chars) as well as the longest identifier which matches ($id)
2207+
// and its codepoint ($codepoint).
22082208
$codepoint =false;
2209-
for($c =$len;$c >0;$c--) {
2210-
$id =substr($chars,0,$c);
2211-
if(isset($refs[$id])) {
2212-
$codepoint =$refs[$id];
2213-
break;
2209+
$char =$chars;
2210+
while ($char !==false &&isset($refs[$char])) {
2211+
$refs =$refs[$char];
2212+
if (isset($refs['codepoint'])) {
2213+
$id =$chars;
2214+
$codepoint =$refs['codepoint'];
22142215
}
2216+
$chars .=$char =$this->stream->char();
2217+
}
2218+
2219+
// Unconsume the one character we just took which caused the while
2220+
// statement to fail. This could be anything and could cause state
2221+
// changes (as if it matches the while loop it must be
2222+
// alphanumeric so we can just concat it to whatever we get later).
2223+
$this->stream->unget();
2224+
if ($char !==false) {
2225+
$chars =substr($chars,0, -1);
22152226
}
22162227

22172228
/* If no match can be made, then this is a parse error.
@@ -2235,7 +2246,6 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22352246
$semicolon =false;
22362247
}
22372248

2238-
22392249
/* If the character reference is being consumed as part of
22402250
an attribute, and the last character matched is not a
22412251
U+003B SEMICOLON (;), and the next character is in the
@@ -2245,17 +2255,27 @@ private function consumeCharacterReference($allowed = false, $inattr = false) {
22452255
then, for historical reasons, all the characters that were
22462256
matched after the U+0026 AMPERSAND (&) must be unconsumed,
22472257
and nothing is returned. */
2248-
if (
2249-
$inattr && !$semicolon &&
2250-
strspn(substr($chars,$c,1),self::ALPHA .self::DIGIT)
2251-
) {
2252-
return'&' .$chars;
2258+
if ($inattr && !$semicolon) {
2259+
// The next character is either the next character in $chars or in the stream.
2260+
if (strlen($chars) >strlen($id)) {
2261+
$next =substr($chars,strlen($id),1);
2262+
}else {
2263+
$next =$this->stream->char();
2264+
$this->stream->unget();
2265+
}
2266+
if (
2267+
'0' <=$next &&$next <='9' ||
2268+
'A' <=$next &&$next <='Z' ||
2269+
'a' <=$next &&$next <='z'
2270+
) {
2271+
return'&' .$chars;
2272+
}
22532273
}
22542274

22552275
/* Otherwise, return a character token for the character
22562276
corresponding to the character reference name (as given
22572277
by the second column of the named character references table). */
2258-
return HTML5_Data::utf8chr($codepoint) .substr($chars,$c);
2278+
return HTML5_Data::utf8chr($codepoint) .substr($chars,strlen($id));
22592279
}
22602280
}
22612281

‎library/HTML5/named-character-references.ser

Lines changed: 1 addition & 1 deletion
Large diffs are not rendered by default.

‎maintenance/scrape-ncr.php

Lines changed: 24 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -12,19 +12,36 @@
1212
}
1313

1414
$url ='http://www.whatwg.org/specs/web-apps/current-work/multipage/named-character-references.html';
15-
$request =newHttpRequest($url);
16-
$request->send();
17-
$html =$request->getResponseBody();
15+
if (extension_loaded('pecl_http')) {
16+
$request =newHttpRequest($url);
17+
$request->send();
18+
$html =$request->getResponseBody();
19+
}else {
20+
$html =file_get_contents($url);
21+
}
1822

1923
preg_match_all(
20-
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U+([^<]+?)\s*<#',
24+
'#<code title="">\s*([^<]+?)\s*</code>\s*</td>\s*<td>\s*U\+([^<]+?)\s*<#',
2125
$html,$matches,PREG_SET_ORDER);
2226

2327
$table =array();
2428
foreach ($matchesas$match) {
25-
$ncr =$match[1];
26-
$codepoint =hexdec($match[2]);
27-
$table[$ncr] =$codepoint;
29+
list(,$name,$codepoint) =$match;
30+
31+
// Set the subtable we're working with initially to the whole table.
32+
$subtable =&$table;
33+
34+
// Loop over each character of the name, creating an array key for it if it
35+
// doesn't already exist
36+
for ($i =0,$len =strlen($name);$i <$len;$i++) {
37+
if (!isset($subtable[$name[$i]])) {
38+
$subtable[$name[$i]] =null;
39+
}
40+
$subtable =&$subtable[$name[$i]];
41+
}
42+
43+
// Set the key codepoint to the codepoint.
44+
$subtable['codepoint'] =hexdec($codepoint);
2845
}
2946

3047
file_put_contents($output,serialize($table));

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp