31
31
// XERROR - with regards to parse errors
32
32
// XSCRIPT - with regards to scripting mode
33
33
// XENCODING - with regards to encoding (for reparsing tests)
34
+ // XDOM - DOM specific code (tagName is explicitly not marked).
35
+ // this is not (yet) in helper functions.
34
36
35
37
class HTML5_TreeBuilder {
36
38
public $ stack =array ();
@@ -70,6 +72,9 @@ class HTML5_TreeBuilder {
70
72
'p ' ,'param ' ,'plaintext ' ,'pre ' ,'script ' ,'select ' ,'spacer ' ,'style ' ,
71
73
'tbody ' ,'textarea ' ,'tfoot ' ,'thead ' ,'title ' ,'tr ' ,'ul ' ,'wbr ' );
72
74
75
+ private $ pendingTableCharacters ;
76
+ private $ pendingTableCharactersDirty ;
77
+
73
78
// Tree construction modes
74
79
const INITIAL =0 ;
75
80
const BEFORE_HTML =1 ;
@@ -80,19 +85,20 @@ class HTML5_TreeBuilder {
80
85
const IN_BODY =6 ;
81
86
const IN_CDATA_RCDATA =7 ;
82
87
const IN_TABLE =8 ;
83
- const IN_CAPTION =9 ;
84
- const IN_COLUMN_GROUP =10 ;
85
- const IN_TABLE_BODY =11 ;
86
- const IN_ROW =12 ;
87
- const IN_CELL =13 ;
88
- const IN_SELECT =14 ;
89
- const IN_SELECT_IN_TABLE =15 ;
90
- const IN_FOREIGN_CONTENT =16 ;
91
- const AFTER_BODY =17 ;
92
- const IN_FRAMESET =18 ;
93
- const AFTER_FRAMESET =19 ;
94
- const AFTER_AFTER_BODY =20 ;
95
- const AFTER_AFTER_FRAMESET =21 ;
88
+ const IN_TABLE_TEXT =9 ;
89
+ const IN_CAPTION =10 ;
90
+ const IN_COLUMN_GROUP =11 ;
91
+ const IN_TABLE_BODY =12 ;
92
+ const IN_ROW =13 ;
93
+ const IN_CELL =14 ;
94
+ const IN_SELECT =15 ;
95
+ const IN_SELECT_IN_TABLE =16 ;
96
+ const IN_FOREIGN_CONTENT =17 ;
97
+ const AFTER_BODY =18 ;
98
+ const IN_FRAMESET =19 ;
99
+ const AFTER_FRAMESET =20 ;
100
+ const AFTER_AFTER_BODY =21 ;
101
+ const AFTER_AFTER_FRAMESET =22 ;
96
102
97
103
/**
98
104
* Converts a magic number to a readable name. Use for debugging.
@@ -201,6 +207,7 @@ public function emitToken($token, $mode = null) {
201
207
* doctype attribute of the Document object. */
202
208
if (!isset ($ token ['public ' ]))$ token ['public ' ] =null ;
203
209
if (!isset ($ token ['system ' ]))$ token ['system ' ] =null ;
210
+ // XDOM
204
211
// Yes this is hacky. I'm kind of annoyed that I can't appendChild
205
212
// a doctype to DOMDocument. Maybe I haven't chanted the right
206
213
// syllables.
@@ -363,6 +370,7 @@ public function emitToken($token, $mode = null) {
363
370
}elseif ($ token ['type ' ] === HTML5_Tokenizer::COMMENT ) {
364
371
/* Append a Comment node to the Document object with the data
365
372
attribute set to the data given in the comment token. */
373
+ // XDOM
366
374
$ comment =$ this ->dom ->createComment ($ token ['data ' ]);
367
375
$ this ->dom ->appendChild ($ comment );
368
376
@@ -378,6 +386,7 @@ public function emitToken($token, $mode = null) {
378
386
/* Create an element for the token in the HTML namespace. Append it
379
387
* to the Document object. Put this element in the stack of open
380
388
* elements. */
389
+ // XDOM
381
390
$ html =$ this ->insertElement ($ token ,false );
382
391
$ this ->dom ->appendChild ($ html );
383
392
$ this ->stack [] =$ html ;
@@ -387,6 +396,7 @@ public function emitToken($token, $mode = null) {
387
396
}else {
388
397
/* Create an html element. Append it to the Document object. Put
389
398
* this element in the stack of open elements. */
399
+ // XDOM
390
400
$ html =$ this ->dom ->createElementNS (self ::NS_HTML ,'html ' );
391
401
$ this ->dom ->appendChild ($ html );
392
402
$ this ->stack [] =$ html ;
@@ -1744,6 +1754,7 @@ public function emitToken($token, $mode = null) {
1744
1754
* elements with an entry for the new element, and
1745
1755
* let node be the new element. */
1746
1756
// we don't know what the token is anymore
1757
+ // XDOM
1747
1758
$ clone =$ node ->cloneNode ();
1748
1759
$ a_pos =array_search ($ node ,$ this ->a_formatting ,true );
1749
1760
$ s_pos =array_search ($ node ,$ this ->stack ,true );
@@ -1753,10 +1764,12 @@ public function emitToken($token, $mode = null) {
1753
1764
1754
1765
/* 6.6 Insert last node into node, first removing
1755
1766
it from its previous parent node if any. */
1767
+ // XDOM
1756
1768
if ($ last_node ->parentNode !==null ) {
1757
1769
$ last_node ->parentNode ->removeChild ($ last_node );
1758
1770
}
1759
1771
1772
+ // XDOM
1760
1773
$ node ->appendChild ($ last_node );
1761
1774
1762
1775
/* 6.7 Let last node be node. */
@@ -1770,6 +1783,7 @@ public function emitToken($token, $mode = null) {
1770
1783
* whatever last node ended up being in the previous
1771
1784
* step, first removing it from its previous parent
1772
1785
* node if any. */
1786
+ // XDOM
1773
1787
if ($ last_node ->parentNode ) {// common step
1774
1788
$ last_node ->parentNode ->removeChild ($ last_node );
1775
1789
}
@@ -1780,23 +1794,27 @@ public function emitToken($token, $mode = null) {
1780
1794
* first removing it from its previous parent node if
1781
1795
* any. */
1782
1796
}else {
1797
+ // XDOM
1783
1798
$ common_ancestor ->appendChild ($ last_node );
1784
1799
}
1785
1800
1786
1801
/* 8. Create an element for the token for which the
1787
1802
* formatting element was created. */
1803
+ // XDOM
1788
1804
$ clone =$ formatting_element ->cloneNode ();
1789
1805
1790
1806
/* 9. Take all of the child nodes of the furthest
1791
1807
block and append them to the element created in the
1792
1808
last step. */
1809
+ // XDOM
1793
1810
while ($ furthest_block ->hasChildNodes ()) {
1794
1811
$ child =$ furthest_block ->firstChild ;
1795
1812
$ furthest_block ->removeChild ($ child );
1796
1813
$ clone ->appendChild ($ child );
1797
1814
}
1798
1815
1799
1816
/* 10. Append that clone to the furthest block. */
1817
+ // XDOM
1800
1818
$ furthest_block ->appendChild ($ clone );
1801
1819
1802
1820
/* 11. Remove the formatting element from the list
@@ -1940,17 +1958,21 @@ public function emitToken($token, $mode = null) {
1940
1958
case self ::IN_TABLE :
1941
1959
$ clear =array ('html ' ,'table ' );
1942
1960
1943
- /* A character token that is one of one of U+0009 CHARACTER TABULATION,
1944
- U+000A LINE FEED (LF), U+000B LINE TABULATION, U+000C FORM FEED (FF),
1945
- or U+0020 SPACE */
1946
- if ($ token ['type ' ] === HTML5_Tokenizer::SPACECHARACTER &&
1947
- /* If the current table is tainted, then act as described in
1948
- * the "anything else" entry below. */
1949
- // Note: hsivonen has a test that fails due to this line
1950
- // because he wants to convince Hixie not to do taint
1951
- !$ this ->currentTableIsTainted ()) {
1952
- /* Append the character to the current node. */
1953
- $ this ->insertText ($ token ['data ' ]);
1961
+ /* A character token */
1962
+ if ($ token ['type ' ] === HTML5_Tokenizer::CHARACTER ||
1963
+ $ token ['type ' ] === HTML5_Tokenizer::SPACECHARACTER ) {
1964
+ /* Let the pending table character tokens
1965
+ * be an empty list of tokens. */
1966
+ $ this ->pendingTableCharacters ="" ;
1967
+ $ this ->pendingTableCharactersDirty =false ;
1968
+ /* Let the original insertion mode be the current
1969
+ * insertion mode. */
1970
+ $ this ->original_mode =$ this ->mode ;
1971
+ /* Switch the insertion mode to
1972
+ * "in table text" and
1973
+ * reprocess the token. */
1974
+ $ this ->mode =self ::IN_TABLE_TEXT ;
1975
+ $ this ->emitToken ($ token );
1954
1976
1955
1977
/* A comment token */
1956
1978
}elseif ($ token ['type ' ] === HTML5_Tokenizer::COMMENT ) {
@@ -2096,6 +2118,57 @@ public function emitToken($token, $mode = null) {
2096
2118
}
2097
2119
break ;
2098
2120
2121
case self::IN_TABLE_TEXT:
    /* "In table text" insertion mode.
     *
     * Character tokens are accumulated in $this->pendingTableCharacters.
     * The first token that is not a character token flushes that buffer
     * and is then reprocessed in the insertion mode saved in
     * $this->original_mode. */

    /* A character token */
    if ($token['type'] === HTML5_Tokenizer::CHARACTER) {
        /* Append the character token to the pending table
         * character tokens list. */
        $this->pendingTableCharacters .= $token['data'];
        // A CHARACTER (as opposed to SPACECHARACTER) token marks the
        // buffer dirty, which selects the foster-parenting path below.
        $this->pendingTableCharactersDirty = true;
    } elseif ($token['type'] === HTML5_Tokenizer::SPACECHARACTER) {
        // Whitespace is buffered as well, but leaves the dirty flag alone.
        $this->pendingTableCharacters .= $token['data'];
    /* Anything else */
    } else {
        // The buffer is set to null after a flush, hence the is_string()
        // guard in addition to the empty-string check.
        if ($this->pendingTableCharacters !== '' && is_string($this->pendingTableCharacters)) {
            /* If any of the tokens in the pending table character tokens list
             * are character tokens that are not one of U+0009 CHARACTER
             * TABULATION, U+000A LINE FEED (LF), U+000C FORM FEED (FF), or
             * U+0020 SPACE, then reprocess those character tokens using the
             * rules given in the "anything else" entry in the "in table"
             * insertion mode. */
            if ($this->pendingTableCharactersDirty) {
                /* Parse error. Process the token using the rules for the
                 * "in body" insertion mode, except that if the current
                 * node is a table, tbody, tfoot, thead, or tr element,
                 * then, whenever a node would be inserted into the current
                 * node, it must instead be foster parented. */
                // XERROR
                $old = $this->foster_parent;
                $this->foster_parent = true;
                // The whole buffer is replayed as one synthetic character token.
                $text_token = array(
                    'type' => HTML5_Tokenizer::CHARACTER,
                    'data' => $this->pendingTableCharacters,
                );
                $this->processWithRulesFor($text_token, self::IN_BODY);
                // Restore whatever foster-parenting state was active before.
                $this->foster_parent = $old;

            /* Otherwise, insert the characters given by the pending table
             * character tokens list into the current node. */
            } else {
                $this->insertText($this->pendingTableCharacters);
            }
            // Reset both buffer fields. The flag reset previously targeted a
            // misspelled, undeclared property (pendingTableCharactersNull),
            // which left the declared dirty flag untouched and created a
            // dynamic property on the tree builder.
            $this->pendingTableCharacters = null;
            $this->pendingTableCharactersDirty = false;
        }

        /* Switch the insertion mode to the original insertion mode and
         * reprocess the token.
         */
        $this->mode = $this->original_mode;
        $this->emitToken($token);
    }
    break;
2171
+
2099
2172
case self ::IN_CAPTION :
2100
2173
/* An end tag whose tag name is "caption" */
2101
2174
if ($ token ['type ' ] === HTML5_Tokenizer::ENDTAG &&$ token ['name ' ] ==='caption ' ) {
@@ -2694,6 +2767,7 @@ public function emitToken($token, $mode = null) {
2694
2767
// XERROR: parse error
2695
2768
}elseif ($ token ['type ' ] === HTML5_Tokenizer::ENDTAG &&
2696
2769
$ token ['name ' ] ==='script ' &&end ($ this ->stack )->tagName ==='script ' &&
2770
+ // XDOM
2697
2771
end ($ this ->stack )->namespaceURI ===self ::NS_SVG ) {
2698
2772
array_pop ($ this ->stack );
2699
2773
// a bunch of script running mumbo jumbo
@@ -2702,20 +2776,23 @@ public function emitToken($token, $mode = null) {
2702
2776
((
2703
2777
$ token ['name ' ] !=='mglyph ' &&
2704
2778
$ token ['name ' ] !=='malignmark ' &&
2779
+ // XDOM
2705
2780
end ($ this ->stack )->namespaceURI ===self ::NS_MATHML &&
2706
2781
in_array (end ($ this ->stack )->tagName ,array ('mi ' ,'mo ' ,'mn ' ,'ms ' ,'mtext ' ))
2707
2782
) ||
2708
2783
(
2709
2784
$ token ['name ' ] ==='svg ' &&
2785
+ // XDOM
2710
2786
end ($ this ->stack )->namespaceURI ===self ::NS_MATHML &&
2711
2787
end ($ this ->stack )->tagName ==='annotation-xml '
2712
2788
) ||
2713
2789
(
2790
+ // XDOM
2714
2791
end ($ this ->stack )->namespaceURI ===self ::NS_SVG &&
2715
2792
in_array (end ($ this ->stack )->tagName ,array ('foreignObject ' ,'desc ' ,'title ' ))
2716
2793
) ||
2717
2794
(
2718
- // XSKETCHY
2795
+ // XSKETCHY && XDOM
2719
2796
end ($ this ->stack )->namespaceURI ===self ::NS_HTML
2720
2797
))
2721
2798
) ||$ token ['type ' ] === HTML5_Tokenizer::ENDTAG
@@ -2729,6 +2806,7 @@ public function emitToken($token, $mode = null) {
2729
2806
$ found =false ;
2730
2807
// this basically duplicates elementInScope()
2731
2808
for ($ i =count ($ this ->stack ) -1 ;$ i >=0 ;$ i --) {
2809
+ // XDOM
2732
2810
$ node =$ this ->stack [$ i ];
2733
2811
if ($ node ->namespaceURI !==self ::NS_HTML ) {
2734
2812
$ found =true ;
@@ -2756,6 +2834,7 @@ public function emitToken($token, $mode = null) {
2756
2834
// XERROR: parse error
2757
2835
do {
2758
2836
$ node =array_pop ($ this ->stack );
2837
+ // XDOM
2759
2838
}while ($ node ->namespaceURI !==self ::NS_HTML );
2760
2839
$ this ->stack [] =$ node ;
2761
2840
$ this ->mode =$ this ->secondary_mode ;
@@ -2799,6 +2878,7 @@ public function emitToken($token, $mode = null) {
2799
2878
'radialgradient ' =>'radialGradient ' ,
2800
2879
'textpath ' =>'textPath ' ,
2801
2880
);
2881
+ // XDOM
2802
2882
$ current =end ($ this ->stack );
2803
2883
if ($ current ->namespaceURI ===self ::NS_MATHML ) {
2804
2884
$ token =$ this ->adjustMathMLAttributes ($ token );
@@ -2835,6 +2915,7 @@ public function emitToken($token, $mode = null) {
2835
2915
/* Append a Comment node to the first element in the stack of open
2836
2916
elements (the html element), with the data attribute set to the
2837
2917
data given in the comment token. */
2918
+ // XDOM
2838
2919
$ comment =$ this ->dom ->createComment ($ token ['data ' ]);
2839
2920
$ this ->stack [0 ]->appendChild ($ comment );
2840
2921
@@ -2985,6 +3066,7 @@ public function emitToken($token, $mode = null) {
2985
3066
if ($ token ['type ' ] === HTML5_Tokenizer::COMMENT ) {
2986
3067
/* Append a Comment node to the Document object with the data
2987
3068
attribute set to the data given in the comment token. */
3069
+ // XDOM
2988
3070
$ comment =$ this ->dom ->createComment ($ token ['data ' ]);
2989
3071
$ this ->dom ->appendChild ($ comment );
2990
3072
@@ -3008,6 +3090,7 @@ public function emitToken($token, $mode = null) {
3008
3090
if ($ token ['type ' ] === HTML5_Tokenizer::COMMENT ) {
3009
3091
/* Append a Comment node to the Document object with the data
3010
3092
attribute set to the data given in the comment token. */
3093
+ // XDOM
3011
3094
$ comment =$ this ->dom ->createComment ($ token ['data ' ]);
3012
3095
$ this ->dom ->appendChild ($ comment );
3013
3096
@@ -3458,12 +3541,8 @@ private function getFosterParent() {
3458
3541
public function fosterParent ($ node ) {
3459
3542
$ foster_parent =$ this ->getFosterParent ();
3460
3543
$ table =$ this ->getCurrentTable ();// almost equivalent to last table element, except it can be html
3461
- /* When a node node is to be foster parented, the node node must be
3462
- * inserted into the foster parent element, and the current table must
3463
- * be marked as tainted. (Once the current table has been tainted,
3464
- * whitespace characters are inserted into the foster parent element
3465
- * instead of the current node.) */
3466
- $ table ->tainted =true ;
3544
+ /* When a node node is to be foster parented, the node node must be
3545
+ * inserted into the foster parent element. */
3467
3546
/* If the foster parent element is the parent element of the last table
3468
3547
* element in the stack of open elements, then node must be inserted
3469
3548
* immediately before the last table element in the stack of open