Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 61d36a3

Browse files
committed
Split out tokens that consist purely of whitespace immediately after moving to the data state, in a similar vein to the Python implementation. This returns us to < 6s to tokenize the spec.
1 parent d552aaf commit 61d36a3

File tree

3 files changed

+59
-55
lines changed

3 files changed

+59
-55
lines changed

‎library/HTML5/Tokenizer.php

Lines changed: 30 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -64,13 +64,14 @@ class HTML5_Tokenizer {
6464
// These are constants describing tokens
6565
// XXX should probably be moved somewhere else, probably the
6666
// HTML5 class.
67-
constDOCTYPE =0;
68-
constSTARTTAG =1;
69-
constENDTAG =2;
70-
constCOMMENT =3;
71-
constCHARACTER =4;
72-
constEOF =5;
73-
constPARSEERROR =6;
67+
constDOCTYPE =0;
68+
constSTARTTAG =1;
69+
constENDTAG =2;
70+
constCOMMENT =3;
71+
constCHARACTER =4;
72+
constSPACECHARACTER =5;
73+
constEOF =6;
74+
constPARSEERROR =7;
7475

7576
// These are constants representing bunches of characters.
7677
constALPHA ='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';
@@ -116,23 +117,20 @@ public function parse() {
116117
$escape =false;
117118
//echo "\n\n";
118119
while($state !==null) {
119-
/*
120-
echo $state . ' ';
120+
121+
/*echo $state . ' ';
121122
switch ($this->content_model) {
122123
case self::PCDATA: echo 'PCDATA'; break;
123124
case self::RCDATA: echo 'RCDATA'; break;
124125
case self::CDATA: echo 'CDATA'; break;
125126
case self::PLAINTEXT: echo 'PLAINTEXT'; break;
126127
}
127128
if ($escape) echo " escape";
128-
echo "\n";
129-
*/
129+
echo "\n";*/
130+
130131
switch($state) {
131132
case'data':
132133

133-
// Possible optimization: mark text tokens that contain entirely
134-
// whitespace as whitespace tokens.
135-
136134
/* Consume the next input character */
137135
$char =$this->stream->char();
138136
$lastFourChars .=$char;
@@ -234,31 +232,39 @@ public function parse() {
234232
$this->tree->emitToken(array(
235233
'type' =>self::EOF
236234
));
235+
236+
}elseif($char ==="\t" ||$char ==="\n" ||$char ==="\x0c" ||$char ==='') {
237+
// Directly after emitting a token you switch back to the "data
238+
// state". At that point spaceCharacters are important so they are
239+
// emitted separately.
240+
$chars =$this->stream->charsWhile(self::WHITESPACE);
241+
$this->emitToken(array(
242+
'type' =>self::SPACECHARACTER,
243+
'data' =>$char .$chars
244+
));
245+
$lastFourChars .=$chars;
246+
if (strlen($lastFourChars) >4)$lastFourChars =substr($lastFourChars, -4);
237247

238248
}else {
239249
/* Anything else
240250
THIS IS AN OPTIMIZATION: Get as many character that
241251
otherwise would also be treated as a character token and emit it
242252
as a single character token. Stay in the data state. */
243253
$chars ='';
254+
// XXX: We should only have - and > here when they need to be.
255+
$mask ='->';
256+
if ($amp_cond)$mask .='&';
257+
if ($lt_cond)$mask .='<';
244258

245-
// XSKETCHY: introduced three more fails (at least)
246-
if ($char !=='' &&$char !=="\n" &&$char !=="\r" &&
247-
$char !=="\t" &&$char !=="\x0c") {
248-
// XXX: We should only have - and > here when they need to be.
249-
$mask ='->' .self::WHITESPACE;
250-
if ($amp_cond)$mask .='&';
251-
if ($lt_cond)$mask .='<';
252-
253-
$chars =$this->stream->charsUntil($mask);
254-
}
259+
$chars =$this->stream->charsUntil($mask);
255260

256261
$this->emitToken(array(
257262
'type' =>self::CHARACTER,
258263
'data' =>$char .$chars
259264
));
260265

261266
$lastFourChars .=$chars;
267+
if (strlen($lastFourChars) >4)$lastFourChars =substr($lastFourChars, -4);
262268

263269
$state ='data';
264270
}

‎library/HTML5/TreeConstructer.php

Lines changed: 28 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -158,8 +158,7 @@ public function emitToken($token, $mode = null) {
158158

159159
/* A character token that is one of U+0009 CHARACTER TABULATION,
160160
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */
161-
if ($token['type'] === HTML5_Tokenizer::CHARACTER &&
162-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
161+
if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
163162
/* Ignore the token. */
164163
$this->ignored =true;
165164
}elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {
@@ -356,8 +355,7 @@ public function emitToken($token, $mode = null) {
356355
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
357356
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
358357
or U+0020 SPACE */
359-
}elseif($token['type'] === HTML5_Tokenizer::CHARACTER &&
360-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
358+
}elseif($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
361359
/* Ignore the token. */
362360
$this->ignored =true;
363361

@@ -391,8 +389,7 @@ public function emitToken($token, $mode = null) {
391389
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
392390
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
393391
or U+0020 SPACE */
394-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
395-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
392+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
396393
/* Ignore the token. */
397394
$this->ignored =true;
398395

@@ -465,8 +462,7 @@ public function emitToken($token, $mode = null) {
465462
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
466463
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
467464
or U+0020 SPACE. */
468-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
469-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
465+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
470466
/* Insert the character into the current node. */
471467
$this->insertText($token['data']);
472468

@@ -600,8 +596,7 @@ public function emitToken($token, $mode = null) {
600596
array_pop($this->stack);
601597
$this->mode =self::IN_HEAD;
602598
}elseif (
603-
($token['type'] === HTML5_Tokenizer::CHARACTER &&
604-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) ||
599+
($token['type'] === HTML5_Tokenizer::SPACECHARACTER) ||
605600
($token['type'] === HTML5_Tokenizer::COMMENT) ||
606601
($token['type'] === HTML5_Tokenizer::STARTTAG && (
607602
$token['name'] ==='link' ||$token['name'] ==='meta' ||
@@ -630,8 +625,7 @@ public function emitToken($token, $mode = null) {
630625
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
631626
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
632627
or U+0020 SPACE */
633-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
634-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
628+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
635629
/* Append the character to the current node. */
636630
$this->insertText($token['data']);
637631

@@ -702,6 +696,7 @@ public function emitToken($token, $mode = null) {
702696
switch($token['type']) {
703697
/* A character token */
704698
case HTML5_Tokenizer::CHARACTER:
699+
case HTML5_Tokenizer::SPACECHARACTER:
705700
/* Reconstruct the active formatting elements, if any. */
706701
$this->reconstructActiveFormattingElements();
707702

@@ -711,7 +706,10 @@ public function emitToken($token, $mode = null) {
711706
/* If the token is not one of U+0009 CHARACTER TABULATION,
712707
* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020
713708
* SPACE, then set the frameset-ok flag to "not ok". */
714-
// YYY: not implemented
709+
// i.e., if any of the characters is not whitespace
710+
if (strlen($token['data']) !==strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) {
711+
$this->flag_frameset_ok =false;
712+
}
715713
break;
716714

717715
/* A comment token */
@@ -1893,7 +1891,10 @@ public function emitToken($token, $mode = null) {
18931891
break;
18941892

18951893
caseself::IN_CDATA_RCDATA:
1896-
if ($token['type'] === HTML5_Tokenizer::CHARACTER) {
1894+
if (
1895+
$token['type'] === HTML5_Tokenizer::CHARACTER ||
1896+
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
1897+
) {
18971898
$this->insertText($token['data']);
18981899
}elseif ($token['type'] === HTML5_Tokenizer::EOF) {
18991900
// parse error
@@ -1919,8 +1920,7 @@ public function emitToken($token, $mode = null) {
19191920
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
19201921
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
19211922
or U+0020 SPACE */
1922-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
1923-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) &&
1923+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER &&
19241924
/* If the current table is tainted, then act as described in
19251925
* the "anything else" entry below. */
19261926
// Note: hsivonen has a test that fails due to this line
@@ -2142,8 +2142,7 @@ public function emitToken($token, $mode = null) {
21422142
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
21432143
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
21442144
or U+0020 SPACE */
2145-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
2146-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
2145+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
21472146
/* Append the character to the current node. */
21482147
$this->insertText($token['data']);
21492148

@@ -2472,7 +2471,10 @@ public function emitToken($token, $mode = null) {
24722471
/* Handle the token as follows: */
24732472

24742473
/* A character token */
2475-
if($token['type'] === HTML5_Tokenizer::CHARACTER) {
2474+
if(
2475+
$token['type'] === HTML5_Tokenizer::CHARACTER ||
2476+
$token['type'] === HTML5_Tokenizer::SPACECHARACTER
2477+
) {
24762478
/* Append the token's character to the current node. */
24772479
$this->insertText($token['data']);
24782480

@@ -2669,8 +2671,7 @@ public function emitToken($token, $mode = null) {
26692671
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
26702672
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
26712673
or U+0020 SPACE */
2672-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
2673-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
2674+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
26742675
/* Process the token as it would be processed if the insertion mode
26752676
was "in body". */
26762677
$this->processWithRulesFor($token,self::IN_BODY);
@@ -2717,8 +2718,7 @@ public function emitToken($token, $mode = null) {
27172718
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
27182719
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
27192720
U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
2720-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
2721-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
2721+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
27222722
/* Append the character to the current node. */
27232723
$this->insertText($token['data']);
27242724

@@ -2790,8 +2790,7 @@ public function emitToken($token, $mode = null) {
27902790
/* A character token that is one of one of U+0009 CHARACTER TABULATION,
27912791
U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
27922792
U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */
2793-
if($token['type'] === HTML5_Tokenizer::CHARACTER &&
2794-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {
2793+
if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
27952794
/* Append the character to the current node. */
27962795
$this->insertText($token['data']);
27972796

@@ -2836,9 +2835,8 @@ public function emitToken($token, $mode = null) {
28362835
$this->dom->appendChild($comment);
28372836

28382837
}elseif($token['type'] === HTML5_Tokenizer::DOCTYPE ||
2839-
($token['type'] === HTML5_Tokenizer::CHARACTER &&
2840-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) ||
2841-
($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html'))) {
2838+
$token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
2839+
($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html')) {
28422840
$this->processWithRulesFor($token,self::IN_BODY);
28432841

28442842
/* An end-of-file token */
@@ -2860,9 +2858,8 @@ public function emitToken($token, $mode = null) {
28602858
$this->dom->appendChild($comment);
28612859

28622860
}elseif($token['type'] === HTML5_Tokenizer::DOCTYPE ||
2863-
($token['type'] === HTML5_Tokenizer::CHARACTER &&
2864-
preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) ||
2865-
($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html'))) {
2861+
$token['type'] === HTML5_Tokenizer::SPACECHARACTER ||
2862+
($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html')) {
28662863
$this->processWithRulesFor($token,self::IN_BODY);
28672864

28682865
/* An end-of-file token */

‎tests/HTML5/TestableTokenizer.php

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@ protected function emitToken($token, $checkStream = true) {
6464
$this->outputTokens[] =array('Comment',$token['data']);
6565
break;
6666
caseself::CHARACTER:
67+
caseself::SPACECHARACTER:
6768
if (count($this->outputTokens)) {
6869
$old =array_pop($this->outputTokens);
6970
if ($old[0] ==='Character') {

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp