NotificationsYou must be signed in to change notification settings
Fork69
Star97

Commit61d36a3

committed

Split out tokens that consist purely of whitespace immediately after moving to the data state, in a similar vein to the Python implementation. This returns us to < 6s to tokenize the spec.

1 parentd552aaf commit61d36a3Copy full SHA for 61d36a3

File tree

3 files changed

+59

-55

lines changed

library/HTML5
- Tokenizer.php
- TreeConstructer.php
tests/HTML5
- TestableTokenizer.php

3 files changed

+59

-55

lines changed

`‎library/HTML5/Tokenizer.php`

Lines changed: 30 additions & 24 deletions

Original file line number	Diff line number	Diff line change
`@@ -64,13 +64,14 @@ class HTML5_Tokenizer {`
`64`	`64`	`// These are constants describing tokens`
`65`	`65`	`// XXX should probably be moved somewhere else, probably the`
`66`	`66`	`// HTML5 class.`
`67`		`-constDOCTYPE =0;`
`68`		`-constSTARTTAG =1;`
`69`		`-constENDTAG =2;`
`70`		`-constCOMMENT =3;`
`71`		`-constCHARACTER =4;`
`72`		`-constEOF =5;`
`73`		`-constPARSEERROR =6;`
	`67`	`+constDOCTYPE =0;`
	`68`	`+constSTARTTAG =1;`
	`69`	`+constENDTAG =2;`
	`70`	`+constCOMMENT =3;`
	`71`	`+constCHARACTER =4;`
	`72`	`+constSPACECHARACTER =5;`
	`73`	`+constEOF =6;`
	`74`	`+constPARSEERROR =7;`
`74`	`75`
`75`	`76`	`// These are constants representing bunches of characters.`
`76`	`77`	`constALPHA ='ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz';`
`@@ -116,23 +117,20 @@ public function parse() {`
`116`	`117`	`$escape =false;`
`117`	`118`	`//echo "\n\n";`
`118`	`119`	`while($state !==null) {`
`119`		`-/*`
`120`		`- echo $state . ' ';`
	`120`	`+`
	`121`	`+/*echo $state . ' ';`
`121`	`122`	`switch ($this->content_model) {`
`122`	`123`	`case self::PCDATA: echo 'PCDATA'; break;`
`123`	`124`	`case self::RCDATA: echo 'RCDATA'; break;`
`124`	`125`	`case self::CDATA: echo 'CDATA'; break;`
`125`	`126`	`case self::PLAINTEXT: echo 'PLAINTEXT'; break;`
`126`	`127`	`}`
`127`	`128`	`if ($escape) echo " escape";`
`128`		`- echo "\n";`
`129`		`- */`
	`129`	`+ echo "\n";*/`
	`130`	`+`
`130`	`131`	`switch($state) {`
`131`	`132`	`case'data':`
`132`	`133`
`133`		`-// Possible optimization: mark text tokens that contain entirely`
`134`		`-// whitespace as whitespace tokens.`
`135`		`-`
`136`	`134`	`/* Consume the next input character */`
`137`	`135`	`$char =$this->stream->char();`
`138`	`136`	`$lastFourChars .=$char;`
`@@ -234,31 +232,39 @@ public function parse() {`
`234`	`232`	`$this->tree->emitToken(array(`
`235`	`233`	`'type' =>self::EOF`
`236`	`234`	`));`
	`235`	`+`
	`236`	`+ }elseif($char ==="\t" \|\|$char ==="\n" \|\|$char ==="\x0c" \|\|$char ==='') {`
	`237`	`+// Directly after emitting a token you switch back to the "data`
	`238`	`+// state". At that point spaceCharacters are important so they are`
	`239`	`+// emitted separately.`
	`240`	`+$chars =$this->stream->charsWhile(self::WHITESPACE);`
	`241`	`+$this->emitToken(array(`
	`242`	`+'type' =>self::SPACECHARACTER,`
	`243`	`+'data' =>$char .$chars`
	`244`	`+ ));`
	`245`	`+$lastFourChars .=$chars;`
	`246`	`+if (strlen($lastFourChars) >4)$lastFourChars =substr($lastFourChars, -4);`
`237`	`247`
`238`	`248`	`}else {`
`239`	`249`	`/* Anything else`
`240`	`250`	`THIS IS AN OPTIMIZATION: Get as many character that`
`241`	`251`	`otherwise would also be treated as a character token and emit it`
`242`	`252`	`as a single character token. Stay in the data state. */`
`243`	`253`	`$chars ='';`
	`254`	`+// XXX: We should only have - and > here when they need to be.`
	`255`	`+$mask ='->';`
	`256`	`+if ($amp_cond)$mask .='&';`
	`257`	`+if ($lt_cond)$mask .='<';`
`244`	`258`
`245`		`-// XSKETCHY: introduced three more fails (at least)`
`246`		`-if ($char !=='' &&$char !=="\n" &&$char !=="\r" &&`
`247`		`-$char !=="\t" &&$char !=="\x0c") {`
`248`		`-// XXX: We should only have - and > here when they need to be.`
`249`		`-$mask ='->' .self::WHITESPACE;`
`250`		`-if ($amp_cond)$mask .='&';`
`251`		`-if ($lt_cond)$mask .='<';`
`252`		`-`
`253`		`-$chars =$this->stream->charsUntil($mask);`
`254`		`- }`
	`259`	`+$chars =$this->stream->charsUntil($mask);`
`255`	`260`
`256`	`261`	`$this->emitToken(array(`
`257`	`262`	`'type' =>self::CHARACTER,`
`258`	`263`	`'data' =>$char .$chars`
`259`	`264`	`));`
`260`	`265`
`261`	`266`	`$lastFourChars .=$chars;`
	`267`	`+if (strlen($lastFourChars) >4)$lastFourChars =substr($lastFourChars, -4);`
`262`	`268`
`263`	`269`	`$state ='data';`
`264`	`270`	`}`

`‎library/HTML5/TreeConstructer.php`

Lines changed: 28 additions & 31 deletions

Original file line number	Diff line number	Diff line change
`@@ -158,8 +158,7 @@ public function emitToken($token, $mode = null) {`
`158`	`158`
`159`	`159`	`/* A character token that is one of U+0009 CHARACTER TABULATION,`
`160`	`160`	`* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020 SPACE */`
`161`		`-if ($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`162`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`161`	`+if ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`163`	`162`	`/* Ignore the token. */`
`164`	`163`	`$this->ignored =true;`
`165`	`164`	`}elseif ($token['type'] === HTML5_Tokenizer::DOCTYPE) {`
`@@ -356,8 +355,7 @@ public function emitToken($token, $mode = null) {`
`356`	`355`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`357`	`356`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`358`	`357`	`or U+0020 SPACE */`
`359`		`- }elseif($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`360`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`358`	`+ }elseif($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`361`	`359`	`/* Ignore the token. */`
`362`	`360`	`$this->ignored =true;`
`363`	`361`
`@@ -391,8 +389,7 @@ public function emitToken($token, $mode = null) {`
`391`	`389`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`392`	`390`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`393`	`391`	`or U+0020 SPACE */`
`394`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`395`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`392`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`396`	`393`	`/* Ignore the token. */`
`397`	`394`	`$this->ignored =true;`
`398`	`395`
`@@ -465,8 +462,7 @@ public function emitToken($token, $mode = null) {`
`465`	`462`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`466`	`463`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`467`	`464`	`or U+0020 SPACE. */`
`468`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`469`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`465`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`470`	`466`	`/* Insert the character into the current node. */`
`471`	`467`	`$this->insertText($token['data']);`
`472`	`468`
`@@ -600,8 +596,7 @@ public function emitToken($token, $mode = null) {`
`600`	`596`	`array_pop($this->stack);`
`601`	`597`	`$this->mode =self::IN_HEAD;`
`602`	`598`	`}elseif (`
`603`		`- ($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`604`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) \|\|`
	`599`	`+ ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) \|\|`
`605`	`600`	`($token['type'] === HTML5_Tokenizer::COMMENT) \|\|`
`606`	`601`	`($token['type'] === HTML5_Tokenizer::STARTTAG && (`
`607`	`602`	`$token['name'] ==='link' \|\|$token['name'] ==='meta' \|\|`
`@@ -630,8 +625,7 @@ public function emitToken($token, $mode = null) {`
`630`	`625`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`631`	`626`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`632`	`627`	`or U+0020 SPACE */`
`633`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`634`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`628`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`635`	`629`	`/* Append the character to the current node. */`
`636`	`630`	`$this->insertText($token['data']);`
`637`	`631`
`@@ -702,6 +696,7 @@ public function emitToken($token, $mode = null) {`
`702`	`696`	`switch($token['type']) {`
`703`	`697`	`/* A character token */`
`704`	`698`	`case HTML5_Tokenizer::CHARACTER:`
	`699`	`+case HTML5_Tokenizer::SPACECHARACTER:`
`705`	`700`	`/* Reconstruct the active formatting elements, if any. */`
`706`	`701`	`$this->reconstructActiveFormattingElements();`
`707`	`702`
`@@ -711,7 +706,10 @@ public function emitToken($token, $mode = null) {`
`711`	`706`	`/* If the token is not one of U+0009 CHARACTER TABULATION,`
`712`	`707`	`* U+000A LINE FEED (LF), U+000C FORM FEED (FF), or U+0020`
`713`	`708`	`* SPACE, then set the frameset-ok flag to "not ok". */`
`714`		`-// YYY: not implemented`
	`709`	`+// i.e., if any of the characters is not whitespace`
	`710`	`+if (strlen($token['data']) !==strspn($token['data'], HTML5_Tokenizer::WHITESPACE)) {`
	`711`	`+$this->flag_frameset_ok =false;`
	`712`	`+ }`
`715`	`713`	`break;`
`716`	`714`
`717`	`715`	`/* A comment token */`
`@@ -1893,7 +1891,10 @@ public function emitToken($token, $mode = null) {`
`1893`	`1891`	`break;`
`1894`	`1892`
`1895`	`1893`	`caseself::IN_CDATA_RCDATA:`
`1896`		`-if ($token['type'] === HTML5_Tokenizer::CHARACTER) {`
	`1894`	`+if (`
	`1895`	`+$token['type'] === HTML5_Tokenizer::CHARACTER \|\|`
	`1896`	`+$token['type'] === HTML5_Tokenizer::SPACECHARACTER`
	`1897`	`+ ) {`
`1897`	`1898`	`$this->insertText($token['data']);`
`1898`	`1899`	`}elseif ($token['type'] === HTML5_Tokenizer::EOF) {`
`1899`	`1900`	`// parse error`
`@@ -1919,8 +1920,7 @@ public function emitToken($token, $mode = null) {`
`1919`	`1920`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`1920`	`1921`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`1921`	`1922`	`or U+0020 SPACE */`
`1922`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`1923`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) &&`
	`1923`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER &&`
`1924`	`1924`	`/* If the current table is tainted, then act as described in`
`1925`	`1925`	`* the "anything else" entry below. */`
`1926`	`1926`	`// Note: hsivonen has a test that fails due to this line`
`@@ -2142,8 +2142,7 @@ public function emitToken($token, $mode = null) {`
`2142`	`2142`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`2143`	`2143`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`2144`	`2144`	`or U+0020 SPACE */`
`2145`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2146`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`2145`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`2147`	`2146`	`/* Append the character to the current node. */`
`2148`	`2147`	`$this->insertText($token['data']);`
`2149`	`2148`
`@@ -2472,7 +2471,10 @@ public function emitToken($token, $mode = null) {`
`2472`	`2471`	`/* Handle the token as follows: */`
`2473`	`2472`
`2474`	`2473`	`/* A character token */`
`2475`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER) {`
	`2474`	`+if(`
	`2475`	`+$token['type'] === HTML5_Tokenizer::CHARACTER \|\|`
	`2476`	`+$token['type'] === HTML5_Tokenizer::SPACECHARACTER`
	`2477`	`+ ) {`
`2476`	`2478`	`/* Append the token's character to the current node. */`
`2477`	`2479`	`$this->insertText($token['data']);`
`2478`	`2480`
`@@ -2669,8 +2671,7 @@ public function emitToken($token, $mode = null) {`
`2669`	`2671`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`2670`	`2672`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`2671`	`2673`	`or U+0020 SPACE */`
`2672`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2673`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`2674`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`2674`	`2675`	`/* Process the token as it would be processed if the insertion mode`
`2675`	`2676`	`was "in body". */`
`2676`	`2677`	`$this->processWithRulesFor($token,self::IN_BODY);`
`@@ -2717,8 +2718,7 @@ public function emitToken($token, $mode = null) {`
`2717`	`2718`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`2718`	`2719`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`2719`	`2720`	`U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */`
`2720`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2721`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`2721`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`2722`	`2722`	`/* Append the character to the current node. */`
`2723`	`2723`	`$this->insertText($token['data']);`
`2724`	`2724`
`@@ -2790,8 +2790,7 @@ public function emitToken($token, $mode = null) {`
`2790`	`2790`	`/* A character token that is one of one of U+0009 CHARACTER TABULATION,`
`2791`	`2791`	`U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),`
`2792`	`2792`	`U+000D CARRIAGE RETURN (CR), or U+0020 SPACE */`
`2793`		`-if($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2794`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data'])) {`
	`2793`	`+if($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {`
`2795`	`2794`	`/* Append the character to the current node. */`
`2796`	`2795`	`$this->insertText($token['data']);`
`2797`	`2796`
`@@ -2836,9 +2835,8 @@ public function emitToken($token, $mode = null) {`
`2836`	`2835`	`$this->dom->appendChild($comment);`
`2837`	`2836`
`2838`	`2837`	`}elseif($token['type'] === HTML5_Tokenizer::DOCTYPE \|\|`
`2839`		`- ($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2840`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) \|\|`
`2841`		`- ($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html'))) {`
	`2838`	`+$token['type'] === HTML5_Tokenizer::SPACECHARACTER \|\|`
	`2839`	`+ ($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html')) {`
`2842`	`2840`	`$this->processWithRulesFor($token,self::IN_BODY);`
`2843`	`2841`
`2844`	`2842`	`/* An end-of-file token */`
`@@ -2860,9 +2858,8 @@ public function emitToken($token, $mode = null) {`
`2860`	`2858`	`$this->dom->appendChild($comment);`
`2861`	`2859`
`2862`	`2860`	`}elseif($token['type'] === HTML5_Tokenizer::DOCTYPE \|\|`
`2863`		`- ($token['type'] === HTML5_Tokenizer::CHARACTER &&`
`2864`		`-preg_match('/^[\t\n\x0b\x0c ]+$/',$token['data']) \|\|`
`2865`		`- ($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html'))) {`
	`2861`	`+$token['type'] === HTML5_Tokenizer::SPACECHARACTER \|\|`
	`2862`	`+ ($token['type'] === HTML5_Tokenizer::STARTTAG &&$token['name'] ==='html')) {`
`2866`	`2863`	`$this->processWithRulesFor($token,self::IN_BODY);`
`2867`	`2864`
`2868`	`2865`	`/* An end-of-file token */`

`‎tests/HTML5/TestableTokenizer.php`

Lines changed: 1 addition & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -64,6 +64,7 @@ protected function emitToken($token, $checkStream = true) {`
`64`	`64`	`$this->outputTokens[] =array('Comment',$token['data']);`
`65`	`65`	`break;`
`66`	`66`	`caseself::CHARACTER:`
	`67`	`+caseself::SPACECHARACTER:`
`67`	`68`	`if (count($this->outputTokens)) {`
`68`	`69`	`$old =array_pop($this->outputTokens);`
`69`	`70`	`if ($old[0] ==='Character') {`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit61d36a3

File tree

3 files changed

3 files changed

`‎library/HTML5/Tokenizer.php`

`‎library/HTML5/TreeConstructer.php`

`‎tests/HTML5/TestableTokenizer.php`

0 commit comments