68 $this->cacheThreshold = $options[
'cacheThreshold'] ??
false;
93foreach ( $values as $k => $val ) {
96 [
'name', [ [
'@index', [ $k ] ] ] ],
97 [
'value', [ strval( $val ) ] ],
100 $store = [ [
'part', [
101 [
'name', [ strval( $k ) ] ],
103 [
'value', [ strval( $val ) ] ],
114if ( $this->disableLangConversion ) {
115// Language conversions are globally disabled; implicitly set flag 121 $this->cacheThreshold !==
false &&
122 strlen( $text ) >= $this->cacheThreshold &&
123 ( $flags & self::DOM_UNCACHED ) != self::DOM_UNCACHED
125 $domTreeJson = $this->wanCache->getWithSetCallback(
126 $this->wanCache->makeKey(
'preprocess-hash', sha1( $text ), $flags ),
127 $this->wanCache::TTL_DAY,
128function () use ( $text, $flags, &$domTreeArray ) {
129 $domTreeArray = $this->buildDomTreeArrayFromText( $text, $flags );
133 JSON_UNESCAPED_SLASHES | JSON_UNESCAPED_UNICODE
138 $domTreeArray ??= json_decode( $domTreeJson );
141 $domTreeArray ??= $this->buildDomTreeArrayFromText( $text, $flags );
151privatefunction buildDomTreeArrayFromText( $text, $flags ) {
156 $xmlishElements = $this->parser->getStripList();
157 $xmlishAllowMissingEndTag = [
'includeonly',
'noinclude',
'onlyinclude' ];
158 $enableOnlyinclude =
false;
159if ( $forInclusion ) {
160 $ignoredTags = [
'includeonly',
'/includeonly' ];
161 $ignoredElements = [
'noinclude' ];
162 $xmlishElements[] =
'noinclude';
163if ( str_contains( $text,
'<onlyinclude>' )
164 && str_contains( $text,
'</onlyinclude>' )
166 $enableOnlyinclude =
true;
169 $ignoredTags = [
'noinclude',
'/noinclude',
'onlyinclude',
'/onlyinclude' ];
170 $ignoredElements = [
'includeonly' ];
171 $xmlishElements[] =
'includeonly';
173 $xmlishRegex = implode(
'|', array_merge( $xmlishElements, $ignoredTags ) );
175// Use "A" modifier (anchored) instead of "^", because ^ doesn't work with an offset 176 $elementsRegex =
"~(?:$xmlishRegex)(?=\s|\/>|>)|!--~iA";
178 $stack =
new PPDStack_Hash;
180 $searchBase =
"[{<\n";
181if ( !$langConversionDisabled ) {
185// For fast reverse searches 186 $revText = strrev( $text );
187 $lengthText = strlen( $text );
189// Input pointer, starts out pointing to a pseudo-newline before the start 191// Current accumulator. See the doc comment for Preprocessor_Hash for the format. 192 $accum =& $stack->getAccum();
193// True to find equals signs in arguments 195// True to take notice of pipe characters 198// True if $i is inside a possible heading 200// True if there are no more greater-than (>) signs right of $i 202// Map of tag name => true if there are no more closing tags of given type right of $i 203 $noMoreClosingTag = [];
204// True to ignore all input up to the next <onlyinclude> 205 $findOnlyinclude = $enableOnlyinclude;
206// Do a line-start run without outputting an LF character 207 $fakeLineStart =
true;
210if ( $findOnlyinclude ) {
211// Ignore all input up to the next <onlyinclude> 212 $startPos = strpos( $text,
'<onlyinclude>', $i );
213if ( $startPos ===
false ) {
214// Ignored section runs to the end 215 $accum[] = [
'ignore', [ substr( $text, $i ) ] ];
218 $tagEndPos = $startPos + 13;
// past-the-end of <onlyinclude> 219 $accum[] = [
'ignore', [ substr( $text, $i, $tagEndPos - $i ) ] ];
221 $findOnlyinclude =
false;
224if ( $fakeLineStart ) {
225 $found =
'line-start';
228 # Find next opening brace, closing brace or pipe 229 $search = $searchBase;
230if ( $stack->top ===
false ) {
233 $currentClosing = $stack->top->close;
234 $search .= $currentClosing;
240// First equals will be for the template 244 # Output literal section, advance input counter 245 $literalLength = strcspn( $text, $search, $i );
246if ( $literalLength > 0 ) {
247 self::addLiteral( $accum, substr( $text, $i, $literalLength ) );
248 $i += $literalLength;
250if ( $i >= $lengthText ) {
251if ( $currentClosing ===
"\n" ) {
252// Do a past-the-end run to finish off the heading 260 $curChar = $curTwoChar = $text[$i];
261if ( $i + 1 < $lengthText ) {
262 $curTwoChar .= $text[$i + 1];
264if ( $curChar ===
'|' ) {
266 } elseif ( $curChar ===
'=' ) {
268 } elseif ( $curChar ===
'<' ) {
270 } elseif ( $curChar ===
"\n" ) {
274 $found =
'line-start';
276 } elseif ( $curTwoChar === $currentClosing ) {
278 $curChar = $curTwoChar;
279 } elseif ( $curChar === $currentClosing ) {
281 } elseif ( isset( $this->rules[$curTwoChar] ) ) {
282 $curChar = $curTwoChar;
284 $rule = $this->rules[$curChar];
285 } elseif ( isset( $this->rules[$curChar] ) ) {
287 $rule = $this->rules[$curChar];
289 # Some versions of PHP have a strcspn which stops on 290 # null characters; ignore these and continue. 291 # We also may get '-' and '}' characters here which 292 # don't match -{ or $currentClosing. Add these to 293 # output and continue. 294if ( $curChar ===
'-' || $curChar ===
'}' ) {
295 self::addLiteral( $accum, $curChar );
303if ( $found ===
'angle' ) {
304// Handle </onlyinclude> 305if ( $enableOnlyinclude
306 && substr_compare( $text,
'</onlyinclude>', $i, 14 ) === 0
308 $findOnlyinclude =
true;
312// Determine element name 313if ( !preg_match( $elementsRegex, $text,
$matches, 0, $i + 1 ) ) {
314// Element name missing or not listed 315 self::addLiteral( $accum,
'<' );
321if ( $name ===
'!--' ) {
322// To avoid leaving blank lines, when a sequence of 323// space-separated comments is both preceded and followed by 324// a newline (ignoring spaces), then 325// trim leading and trailing spaces and the trailing newline. 328 $endPos = strpos( $text,
'-->', $i + 4 );
329if ( $endPos ===
false ) {
330// Unclosed comment in input, runs to end 331 $inner = substr( $text, $i );
332 $accum[] = [
'comment', [ $inner ] ];
335// Search backwards for leading whitespace 336// $wsStart is the first char of the comment (first of the leading space or '<') 337 $wsStart = $i ? ( $i - strspn( $revText,
" \t", $lengthText - $i ) ) : 0;
339// $wsEnd will be the char *after* the comment (after last space or the '>' if there's no space) 340 $wsEnd = $endPos + 3;
// add length of --> 341// Search forwards for trailing whitespace 342 $wsEnd += strspn( $text,
" \t", $wsEnd );
344// Keep looking forward as long as we're finding more comments on the line 345 $comments = [ [ $wsStart, $wsEnd ] ];
346while ( substr_compare( $text,
'<!--', $wsEnd, 4 ) === 0 ) {
347 $c = strpos( $text,
'-->', $wsEnd + 4 );
351 $c += 3;
// add length of --> 352// Search forwards for trailing whitespace 353 $c += strspn( $text,
" \t", $c );
354 $comments[] = [ $wsEnd, $c ];
358// Eat the line if possible 359// TODO: This could theoretically be done if $wsStart === 0, i.e. for comments at 360// the overall start. That's not how Sanitizer::removeHTMLcomments() did it, but 361// it's a possible beneficial b/c break. 362if ( $wsStart > 0 && substr_compare( $text,
"\n", $wsStart - 1, 1 ) === 0
363 && substr_compare( $text,
"\n", $wsEnd, 1 ) === 0
365// Remove leading whitespace from the end of the accumulator 366 $wsLength = $i - $wsStart;
367 $endIndex = count( $accum ) - 1;
371 && is_string( $accum[$endIndex] )
372 && strspn( $accum[$endIndex],
" \t", -$wsLength ) === $wsLength
374 $accum[$endIndex] = substr( $accum[$endIndex], 0, -$wsLength );
377// Dump all but the last comment to the accumulator 378// $endPos includes the newline from the if above, want also eat that 379 [ $startPos, $endPos ] = array_pop( $comments );
380foreach ( $comments as [ $cStartPos, $cEndPos ] ) {
381// $cEndPos is the next char, no +1 needed to get correct length between start/end 382 $inner = substr( $text, $cStartPos, $cEndPos - $cStartPos );
383 $accum[] = [
'comment', [ $inner ] ];
386// Do a line-start run next time to look for headings after the comment 387 $fakeLineStart =
true;
389// No line to eat, just take the comment itself 395 $part = $stack->top->getCurrentPart();
396if ( $part->commentEnd !== $wsStart - 1 ) {
397 $part->visualEnd = $wsStart;
399// Else comments abutting, no change in visual end 400 $part->commentEnd = $endPos;
403 $inner = substr( $text, $startPos, $endPos - $startPos + 1 );
404 $accum[] = [
'comment', [ $inner ] ];
408 $attrStart = $i + strlen( $name ) + 1;
411 $tagEndPos = $noMoreGT ? false : strpos( $text,
'>', $attrStart );
412if ( $tagEndPos ===
false ) {
414// Disable tag search to prevent worst-case O(N^2) performance 416 self::addLiteral( $accum,
'<' );
421 $lowerName = strtolower( $name );
422// Handle ignored tags 423if ( in_array( $lowerName, $ignoredTags ) ) {
424 $accum[] = [
'ignore', [ substr( $text, $i, $tagEndPos - $i + 1 ) ] ];
430if ( $text[$tagEndPos - 1] ===
'/' ) {
432 $attrEnd = $tagEndPos - 1;
437 $attrEnd = $tagEndPos;
440 !isset( $noMoreClosingTag[$lowerName] ) &&
441 preg_match(
"/<\/" . preg_quote( $name,
'/' ) .
"\s*>/i",
442 $text,
$matches, PREG_OFFSET_CAPTURE, $tagEndPos + 1 )
444 [ $close, $closeTagStartPos ] =
$matches[0];
445 $inner = substr( $text, $tagEndPos + 1, $closeTagStartPos - $tagEndPos - 1 );
446 $i = $closeTagStartPos + strlen( $close );
449if ( in_array( $name, $xmlishAllowMissingEndTag ) ) {
450// Let it run out to the end of the text. 451 $inner = substr( $text, $tagEndPos + 1 );
455// Don't match the tag, treat opening tag as literal and resume parsing. 457 self::addLiteral( $accum, substr( $text, $tagStartPos, $tagEndPos + 1 - $tagStartPos ) );
458// Cache results, otherwise we have O(N^2) performance for input like <foo><foo><foo>... 459 $noMoreClosingTag[$lowerName] =
true;
464// <includeonly> and <noinclude> just become <ignore> tags 465if ( in_array( $lowerName, $ignoredElements ) ) {
466 $accum[] = [
'ignore', [ substr( $text, $tagStartPos, $i - $tagStartPos ) ] ];
470if ( $attrEnd <= $attrStart ) {
473// Note that the attr element contains the whitespace between name and attribute, 474// this is necessary for precise reconstruction during pre-save transform. 475 $attr = substr( $text, $attrStart, $attrEnd - $attrStart );
479 [
'name', [ $name ] ],
480 [
'attr', [ $attr ] ],
482if ( $inner !==
null ) {
483 $children[] = [
'inner', [ $inner ] ];
485if ( $close !==
null ) {
486 $children[] = [
'close', [ $close ] ];
488 $accum[] = [
'ext', $children ];
489 } elseif ( $found ===
'line-start' ) {
490// Is this the start of a heading? 491// Line break belongs before the heading element in any case 492if ( $fakeLineStart ) {
493 $fakeLineStart =
false;
495 self::addLiteral( $accum, $curChar );
499// Examine upto 6 characters 500 $count = strspn( $text,
'=', $i, min( $lengthText, 6 ) );
501if ( $count === 1 && $findEquals ) {
502// DWIM: This looks kind of like a name/value separator. 503// Let's let the equals handler have it and break the potential 504// heading. This is heuristic, but AFAICT the methods for 505// completely correct disambiguation are very complex. 506 } elseif ( $count > 0 ) {
510'parts' => [
new PPDPart_Hash( str_repeat(
'=', $count ) ) ],
514 $stack->push( $piece );
515 $accum =& $stack->getAccum();
516 $stackFlags = $stack->getFlags();
517if ( isset( $stackFlags[
'findEquals'] ) ) {
518 $findEquals = $stackFlags[
'findEquals'];
520if ( isset( $stackFlags[
'findPipe'] ) ) {
521 $findPipe = $stackFlags[
'findPipe'];
523if ( isset( $stackFlags[
'inHeading'] ) ) {
524 $inHeading = $stackFlags[
'inHeading'];
528 } elseif ( $found ===
'line-end' ) {
529 $piece = $stack->top;
530// A heading must be open, otherwise \n wouldn't have been in the search list 531// FIXME: Don't use assert() 532// phpcs:ignore MediaWiki.Usage.ForbiddenFunctions.assert 533 assert( $piece->open ===
"\n" );
534 $part = $piece->getCurrentPart();
535// Search back through the input to see if it has a proper close. 536// Do this using the reversed string since the other solutions 537// (end anchor, etc.) are inefficient. 538 $wsLength = strspn( $revText,
" \t", $lengthText - $i );
539 $searchStart = $i - $wsLength;
540if ( $part->commentEnd === $searchStart - 1 ) {
541// Comment found at line end 542// Search for equals signs before the comment 543 $searchStart = $part->visualEnd;
544 $searchStart -= strspn( $revText,
" \t", $lengthText - $searchStart );
546 $equalsLength = strspn( $revText,
'=', $lengthText - $searchStart );
547if ( $equalsLength > 0 ) {
548if ( $searchStart - $equalsLength === $piece->startPos ) {
549// This is just a single string of equals signs on its own line 550// Replicate the doHeadings behavior /={count}(.+)={count}/ 551// First find out how many equals signs there really are (don't stop at 6) 552if ( $equalsLength < 3 ) {
555 $count = min( 6, intval( ( $equalsLength - 1 ) / 2 ) );
558 $count = min( $equalsLength, $piece->count );
561// Normal match, output <h> 562 $element = [ [
'possible-h',
565 [
'@level', [ $count ] ],
566 [
'@i', [ $headingIndex++ ] ]
572// Single equals sign on its own line, count=0 576// No match, no <h>, just pass down the inner text 581 $accum =& $stack->getAccum();
582 $stackFlags = $stack->getFlags();
583if ( isset( $stackFlags[
'findEquals'] ) ) {
584 $findEquals = $stackFlags[
'findEquals'];
586if ( isset( $stackFlags[
'findPipe'] ) ) {
587 $findPipe = $stackFlags[
'findPipe'];
589if ( isset( $stackFlags[
'inHeading'] ) ) {
590 $inHeading = $stackFlags[
'inHeading'];
593// Append the result to the enclosing accumulator 594 array_splice( $accum, count( $accum ), 0, $element );
596// Note that we do NOT increment the input pointer. 597// This is because the closing linebreak could be the opening linebreak of 598// another heading. Infinite loops are avoided because the next iteration MUST 599// hit the heading open case above, which unconditionally increments the 601 } elseif ( $found ===
'open' ) {
602 # count opening brace characters 603 $curLen = strlen( $curChar );
605 # allow the final character to repeat 606 ? strspn( $text, $curChar[$curLen - 1], $i + 1 ) + 1
607 : strspn( $text, $curChar, $i );
610 $lineStart = ( $i === 0 ) ? $textStartsInSOLState : ( $text[$i - 1] ===
"\n" );
612if ( $curChar ===
"-{" && $count > $curLen ) {
613// -{ => {{ transition because rightmost wins 618 $rule = $this->rules[$curChar];
621 # we need to add to stack only if opening brace count is enough for one of the rules 622if ( $count >= $rule[
'min'] ) {
623 # Add it to the stack 626'close' => $rule[
'end'],
627'savedPrefix' => $savedPrefix,
629'lineStart' => $lineStart,
632 $stack->push( $piece );
633 $accum =& $stack->getAccum();
634 $stackFlags = $stack->getFlags();
635if ( isset( $stackFlags[
'findEquals'] ) ) {
636 $findEquals = $stackFlags[
'findEquals'];
638if ( isset( $stackFlags[
'findPipe'] ) ) {
639 $findPipe = $stackFlags[
'findPipe'];
641if ( isset( $stackFlags[
'inHeading'] ) ) {
642 $inHeading = $stackFlags[
'inHeading'];
645 # Add literal brace(s) 646 self::addLiteral( $accum, $savedPrefix . str_repeat( $curChar, $count ) );
649 } elseif ( $found ===
'close' ) {
651 $piece = $stack->top;
652'@phan-var PPDStackElement_Hash $piece';
653 # lets check if there are enough characters for closing brace 654 $maxCount = $piece->count;
655if ( $piece->close ===
'}-' && $curChar ===
'}' ) {
656 $maxCount--; # don
't try to match closing '-
' as a '}
' 658 $curLen = strlen( $curChar ); 661 : strspn( $text, $curChar, $i, $maxCount ); 663 # check for maximum matching characters (if there are 5 closing 664 # characters, we will probably need only 3 - depending on the rules) 665 $rule = $this->rules[$piece->open]; 666 if ( $count > $rule['max
'] ) { 667 # The specified maximum exists in the callback array, unless the caller 669 $matchingCount = $rule['max
']; 671 # Count is less than the maximum 672 # Skip any gaps in the callback array to find the true largest match 673 # Need to use array_key_exists not isset because the callback can be null 674 $matchingCount = $count; 675 while ( $matchingCount > 0 && !array_key_exists( $matchingCount, $rule['names
'] ) ) { 680 if ( $matchingCount <= 0 ) { 681 # No matching element found in callback array 682 # Output a literal closing brace and continue 683 $endText = substr( $text, $i, $count ); 684 self::addLiteral( $accum, $endText ); 688 // @phan-suppress-next-line PhanTypeArraySuspiciousNullable 689 $name = $rule['names
'][$matchingCount]; 690 if ( $name === null ) { 691 // No element, just literal text 692 $endText = substr( $text, $i, $matchingCount ); 693 $element = $piece->breakSyntax( $matchingCount ); 694 self::addLiteral( $element, $endText ); 697 $parts = $piece->parts; 698 $titleAccum = $parts[0]->out; 703 # The invocation is at the start of the line if lineStart is set in 704 # the stack, and all opening brackets are used up. 705 if ( $maxCount === $matchingCount && 707 $piece->savedPrefix === '' 709 $children[] = [ '@lineStart
', [ 1 ] ]; 711 $titleNode = [ 'title
', $titleAccum ]; 712 $children[] = $titleNode; 714 foreach ( $parts as $part ) { 715 if ( $part->eqpos !== null ) { 716 $equalsNode = $part->out[$part->eqpos]; 717 $nameNode = [ 'name
', array_slice( $part->out, 0, $part->eqpos ) ]; 718 $valueNode = [ 'value
', array_slice( $part->out, $part->eqpos + 1 ) ]; 719 $partNode = [ 'part
', [ $nameNode, $equalsNode, $valueNode ] ]; 720 $children[] = $partNode; 722 $nameNode = [ 'name
', [ [ '@index
', [ $argIndex++ ] ] ] ]; 723 $valueNode = [ 'value
', $part->out ]; 724 $partNode = [ 'part
', [ $nameNode, $valueNode ] ]; 725 $children[] = $partNode; 728 $element = [ [ $name, $children ] ]; 731 # Advance input pointer 732 $i += $matchingCount; 736 $accum =& $stack->getAccum(); 738 # Re-add the old stack element if it still has unmatched opening characters remaining 739 if ( $matchingCount < $piece->count ) { 740 $piece->parts = [ new PPDPart_Hash ]; 741 $piece->count -= $matchingCount; 742 # do we still qualify for any callback with remaining count? 743 $min = $this->rules[$piece->open]['min
']; 744 if ( $piece->count >= $min ) { 745 $stack->push( $piece ); 746 $accum =& $stack->getAccum(); 747 } elseif ( $piece->count === 1 && $piece->open === '{
' && $piece->savedPrefix === '-
' ) { 748 $piece->savedPrefix = ''; 751 $piece->close = $this->rules[$piece->open]['end
']; 752 $stack->push( $piece ); 753 $accum =& $stack->getAccum(); 755 $s = substr( $piece->open, 0, -1 ); 757 substr( $piece->open, -1 ), 758 $piece->count - strlen( $s ) 760 self::addLiteral( $accum, $piece->savedPrefix . $s ); 762 } elseif ( $piece->savedPrefix !== '' ) { 763 self::addLiteral( $accum, $piece->savedPrefix ); 766 $stackFlags = $stack->getFlags(); 767 if ( isset( $stackFlags['findEquals
'] ) ) { 768 $findEquals = $stackFlags['findEquals
']; 770 if ( isset( $stackFlags['findPipe
'] ) ) { 771 $findPipe = $stackFlags['findPipe
']; 773 if ( isset( $stackFlags['inHeading
'] ) ) { 774 $inHeading = $stackFlags['inHeading
']; 777 # Add XML element to the enclosing accumulator 778 array_splice( $accum, count( $accum ), 0, $element ); 779 } elseif ( $found === 'pipe
' ) { 780 $findEquals = true; // shortcut for getFlags() 782 $accum =& $stack->getAccum(); 784 } elseif ( $found === 'equals
' ) { 785 $findEquals = false; // shortcut for getFlags() 786 $accum[] = [ 'equals
', [ '=
' ] ]; 787 $stack->getCurrentPart()->eqpos = count( $accum ) - 1; 792 # Output any remaining unclosed brackets 793 foreach ( $stack->stack as $piece ) { 794 array_splice( $stack->rootAccum, count( $stack->rootAccum ), 0, $piece->breakSyntax() ); 797 # Enable top-level headings 798 foreach ( $stack->rootAccum as &$node ) { 799 if ( is_array( $node ) && $node[PPNode_Hash_Tree::NAME] === 'possible-h
' ) { 800 $node[PPNode_Hash_Tree::NAME] = 'h
'; 804 return [ [ 'root
', $stack->rootAccum ] ]; 807 private static function addLiteral( array &$accum, string $text ) { 808 $n = count( $accum ); 809 if ( $n && is_string( $accum[$n - 1] ) ) { 810 $accum[$n - 1] .= $text;