Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitd4df251

Browse files
author
Edward Z. Yang ext:(%22)
committed
Release 0.1.0.
1 parent45a220c commitd4df251

File tree

6 files changed

+88
-10
lines changed

6 files changed

+88
-10
lines changed

‎README

Lines changed: 37 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,15 +1,47 @@
1-
This implementation doesn't work yet. Bits and pieces are getting written
2-
at a time.
1+
html5lib - php flavour
32

4-
This is an implementation of the HTML5 specification for PHP. More friendly
5-
details forthcoming, but here are some notes:
3+
This is an implementation of the tokenization and tree-building parts
4+
of the HTML5 specification in PHP. Potential uses of this library
5+
can be found in web-scrapers and HTML filters.
6+
7+
Warning: This is a pre-alpha release, and as such, certain parts of
8+
this code are not up-to-snuff (e.g. error reporting and performance).
9+
However, the code is very close to spec and passes 100% of tests
10+
not related to parse errors. Nevertheless, expect to have to update
11+
your code on the next upgrade.
12+
13+
14+
Usage notes:
15+
16+
<?php
17+
require_once '/path/to/HTML5/Parser.php';
18+
$dom = HTML5_Parser::parse('<html><body>...');
19+
$nodelist = HTML5_Parser::parseFragment('<b>Boo</b><br>');
20+
$nodelist = HTML5_Parser::parseFragment('<td>Bar</td>', 'table');
21+
22+
23+
Documentation:
24+
25+
HTML5_Parser::parse($text)
26+
$text : HTML to parse
27+
return : DOMDocument of parsed document
28+
29+
HTML5_Parser::parseFragment($text, $context)
30+
$text : HTML to parse
31+
$context : String name of context element
32+
return : DOMNodeList of parsed fragment nodes
33+
34+
35+
Developer notes:
636

737
* To set up unit tests, you need to add a small stub file test-settings.php
838
that contains $simpletest_location = 'path/to/simpletest/'; This needs to
939
be version 1.1 (or, until that is released, SVN trunk) of SimpleTest.
1040

1141
* We don't want to ultimately use PHP's DOM because it is not tolerant
1242
of certain types of errors that HTML 5 allows (for example, an element
13-
"foo@bar"). But for now, we will, since it's much easier.
43+
"foo@bar"). But the current implementation uses it, since it's easy.
44+
Eventually, this html5lib implementation will get a version of SimpleTree,
45+
and may possibly start using that by default.
1446

1547
vim: et sw=4 sts=4

‎VERSION

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
0.1.0

‎library/HTML5/Parser.php

Lines changed: 31 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,36 @@
11
<?php
22

3+
require_oncedirname(__FILE__) .'/Data.php';
4+
require_oncedirname(__FILE__) .'/InputStream.php';
5+
require_oncedirname(__FILE__) .'/TreeBuilder.php';
6+
require_oncedirname(__FILE__) .'/Tokenizer.php';
7+
8+
/**
9+
* Outwards facing interface for HTML5.
10+
*/
311
class HTML5_Parser
412
{
5-
13+
/**
14+
* Parses a full HTML document.
15+
* @param $text HTML text to parse
16+
* @param $builder Custom builder implementation
17+
* @return Parsed HTML as DOMDocument
18+
*/
19+
staticpublicfunctionparse($text,$builder =null) {
20+
$tokenizer =newHTML5_Tokenizer($text,$builder);
21+
$tokenizer->parse();
22+
return$tokenizer->save();
23+
}
24+
/**
25+
* Parses an HTML fragment.
26+
* @param $text HTML text to parse
27+
* @param $context String name of context element to pretend parsing is in.
28+
* @param $builder Custom builder implementation
29+
* @return Parsed HTML as DOMNodeList
30+
*/
31+
staticpublicfunctionparseFragment($text,$context =null,$builder =null) {
32+
$tokenizer =newHTML5_Tokenizer($text,$builder);
33+
$tokenizer->parseFragment($context);
34+
return$tokenizer->save();
35+
}
636
}

‎library/HTML5/Tokenizer.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,9 @@ class HTML5_Tokenizer {
8484
/**
8585
* @param $data Data to parse
8686
*/
87-
publicfunction__construct($data) {
87+
publicfunction__construct($data,$builder =null) {
8888
$this->stream =newHTML5_InputStream($data);
89-
$this->tree =newHTML5_TreeBuilder;
89+
if (!$builder)$this->tree =newHTML5_TreeBuilder;
9090
$this->content_model =self::PCDATA;
9191
}
9292

‎library/HTML5/TreeBuilder.php

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -121,7 +121,7 @@ private function strConst($number) {
121121
constMARKER =300;
122122

123123
// Namespaces for foreign content
124-
constNS_HTML ='http://www.w3.org/1999/xhtml';
124+
constNS_HTML =null;// to prevent DOM from requiring NS on everything
125125
constNS_MATHML ='http://www.w3.org/1998/Math/MathML';
126126
constNS_SVG ='http://www.w3.org/2000/svg';
127127
constNS_XLINK ='http://www.w3.org/1999/xlink';
@@ -3516,8 +3516,8 @@ public function currentTableIsTainted() {
35163516
*/
35173517
publicfunctionsetupContext($context =null) {
35183518
$this->fragment =true;
3519-
$context =$this->dom->createElementNS(self::NS_HTML,$context);
35203519
if ($context) {
3520+
$context =$this->dom->createElementNS(self::NS_HTML,$context);
35213521
/* 4.1. Set the HTML parser's tokenization stage's content model
35223522
* flag according to the context element, as follows: */
35233523
switch ($context->tagName) {

‎tests/HTML5/ParserTest.php

Lines changed: 15 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,15 @@
1+
<?php
2+
3+
require_oncedirname(__FILE__) .'/../autorun.php';
4+
5+
class HTML5_ParserTestextends UnitTestCase
6+
{
7+
publicfunctiontestParse() {
8+
$result = HTML5_Parser::parse('<html><body></body></html>');
9+
$this->assertIsA($result,'DOMDocument');
10+
}
11+
publicfunctiontestParseFragment() {
12+
$result = HTML5_Parser::parseFragment('<b>asdf</b> foo');
13+
$this->assertIsA($result,'DOMNodeList');
14+
}
15+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp