66
77import sys
88import os
9+ import traceback
910from optparse import OptionParser
1011
1112from html5lib import html5parser ,sanitizer
@@ -48,10 +49,7 @@ def parse():
4849else :
4950tokenizer = HTMLTokenizer
5051
51- if opts .log :
52- html5parser .debug_log = True
53-
54- p = html5parser .HTMLParser (tree = treebuilder ,tokenizer = tokenizer )
52+ p = html5parser .HTMLParser (tree = treebuilder ,tokenizer = tokenizer ,debug = opts .log )
5553
5654if opts .fragment :
5755parseMethod = p .parseFragment
@@ -73,46 +71,54 @@ def parse():
7371elif opts .time :
7472import time
7573t0 = time .time ()
76- document = parseMethod ( f ,encoding = encoding )
74+ document = run ( parseMethod , f ,encoding )
7775t1 = time .time ()
7876printOutput (p ,document ,opts )
7977t2 = time .time ()
8078sys .stderr .write ("\n \n Run took: %fs (plus %fs to print the output)" % (t1 - t0 ,t2 - t1 ))
8179else :
82- document = parseMethod ( f ,encoding = encoding )
80+ document = run ( parseMethod , f ,encoding )
8381printOutput (p ,document ,opts )
8482
def run(parseMethod, f, encoding):
    """Invoke the parser on ``f`` and return the resulting document.

    ``parseMethod`` is called as ``parseMethod(f, encoding=encoding)``.
    If it raises an ordinary exception, the traceback is printed via
    ``traceback.print_exc()`` and ``None`` is returned instead of a
    document, so the caller can still emit whatever diagnostics it has.

    Only ``Exception`` subclasses are treated as parse failures:
    ``KeyboardInterrupt`` and ``SystemExit`` propagate so that the user
    can still abort a long-running parse.
    """
    try:
        return parseMethod(f, encoding=encoding)
    except Exception:
        # Report the failure but keep going; None signals "no document".
        traceback.print_exc()
        return None
90+
8591def printOutput (parser ,document ,opts ):
8692if opts .encoding :
8793print "Encoding:" ,parser .tokenizer .stream .charEncoding
8894
89- if opts .log :
90- for item in parser . log :
91- print item
92-
93- if opts .xml :
94- sys .stdout .write (document .toxml ("utf-8" ))
95- elif opts .tree :
96- if not hasattr (document ,'__getitem__' ):
97- document = [document ]
98- for fragment in document :
99- print parser .tree .testSerializer (fragment ).encode ("utf-8" )
100- elif opts .hilite :
101- sys .stdout .write (document .hilite ("utf-8" ))
102- elif opts .html :
103- kwargs = {}
104- for opt in serializer .HTMLSerializer .options :
105- try :
106- kwargs [opt ]= getattr (opts ,opt )
107- except :
108- pass
109- if not kwargs ['quote_char' ]:
110- del kwargs ['quote_char' ]
111-
112- tokens = treewalkers .getTreeWalker (opts .treebuilder )(document )
113- for text in serializer .HTMLSerializer (** kwargs ).serialize (tokens ,encoding = 'utf-8' ):
114- sys .stdout .write (text )
115- if not text .endswith ('\n ' ):sys .stdout .write ('\n ' )
95+ for item in parser .log :
96+ print item
97+
98+ if document is not None :
99+ if opts .xml :
100+ sys .stdout .write (document .toxml ("utf-8" ))
101+ elif opts .tree :
102+ if not hasattr (document ,'__getitem__' ):
103+ document = [document ]
104+ for fragment in document :
105+ print parser .tree .testSerializer (fragment ).encode ("utf-8" )
106+ elif opts .hilite :
107+ sys .stdout .write (document .hilite ("utf-8" ))
108+ elif opts .html :
109+ kwargs = {}
110+ for opt in serializer .HTMLSerializer .options :
111+ try :
112+ kwargs [opt ]= getattr (opts ,opt )
113+ except :
114+ pass
115+ if not kwargs ['quote_char' ]:
116+ del kwargs ['quote_char' ]
117+
118+ tokens = treewalkers .getTreeWalker (opts .treebuilder )(document )
119+ for text in serializer .HTMLSerializer (** kwargs ).serialize (tokens ,encoding = 'utf-8' ):
120+ sys .stdout .write (text )
121+ if not text .endswith ('\n ' ):sys .stdout .write ('\n ' )
116122if opts .error :
117123errList = []
118124for pos ,errorcode ,datavars in parser .errors :