Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit db43ce2

Browse files
committed
Get encoding tests passing, and test the pre-scan separately
1 parent 2816de7, commit db43ce2

File tree

3 files changed

+85
-49
lines changed

3 files changed

+85
-49
lines changed

‎html5lib/html5parser.py‎

Lines changed: 3 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -777,7 +777,9 @@ def startTagMeta(self, token):
777777
ifself.parser.tokenizer.stream.charEncoding[1]=="tentative":
778778
if"charset"inattributes:
779779
self.parser.tokenizer.stream.changeEncoding(attributes["charset"])
780-
elif"content"inattributes:
780+
elif ("content"inattributesand
781+
"http-equiv"inattributesand
782+
attributes["http-equiv"].lower()=="content-type"):
781783
# Encoding it as UTF-8 here is a hack, as really we should pass
782784
# the abstract Unicode string, and just use the
783785
# ContentAttrParser on that, but using UTF-8 allows all chars

‎html5lib/inputstream.py‎

Lines changed: 66 additions & 46 deletions
Original file line number | Diff line number | Diff line change
@@ -21,10 +21,10 @@ class BufferedIOBase(object):
2121
pass
2222

2323
#Non-unicode versions of constants for use in the pre-parser
24-
spaceCharactersBytes=frozenset([str(item)foriteminspaceCharacters])
25-
asciiLettersBytes=frozenset([str(item)foriteminasciiLetters])
26-
asciiUppercaseBytes=frozenset([str(item)foriteminasciiUppercase])
27-
spacesAngleBrackets=spaceCharactersBytes|frozenset([">","<"])
24+
spaceCharactersBytes=frozenset([item.encode("ascii")foriteminspaceCharacters])
25+
asciiLettersBytes=frozenset([item.encode("ascii")foriteminasciiLetters])
26+
asciiUppercaseBytes=frozenset([item.encode("ascii")foriteminasciiUppercase])
27+
spacesAngleBrackets=spaceCharactersBytes|frozenset([b">",b"<"])
2828

2929
invalid_unicode_re=re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")
3030

@@ -391,12 +391,14 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
391391
parseMeta - Look for a <meta> element containing encoding information
392392
393393
"""
394-
self.charEncoding= (codecName(encoding),"certain")
395-
396394
# Raw Stream - for unicode objects this will encode to utf-8 and set
397395
# self.charEncoding as appropriate
398396
self.rawStream=self.openStream(source)
399397

398+
HTMLUnicodeInputStream.__init__(self,self.rawStream)
399+
400+
self.charEncoding= (codecName(encoding),"certain")
401+
400402
# Encoding Information
401403
#Number of bytes to use when looking for a meta element with
402404
#encoding information
@@ -411,7 +413,7 @@ def __init__(self, source, encoding=None, parseMeta=True, chardet=True):
411413
self.charEncoding=self.detectEncoding(parseMeta,chardet)
412414

413415
#Call superclass
414-
HTMLUnicodeInputStream.__init__(self,self.rawStream)
416+
self.reset()
415417

416418
defreset(self):
417419
self.dataStream=codecs.getreader(self.charEncoding[0])(self.rawStream,
@@ -538,12 +540,13 @@ def detectEncodingMeta(self):
538540

539541
returnencoding
540542

541-
classEncodingBytes(str):
543+
classEncodingBytes(bytes):
542544
"""String-like object with an associated position and various extra methods
543545
If the position is ever greater than the string length then an exception is
544546
raised"""
545547
def__new__(self,value):
546-
returnstr.__new__(self,value.lower())
548+
assertisinstance(value,bytes)
549+
returnbytes.__new__(self,value.lower())
547550

548551
def__init__(self,value):
549552
self._position=-1
@@ -557,7 +560,7 @@ def __next__(self):
557560
raiseStopIteration
558561
elifp<0:
559562
raiseTypeError
560-
returnself[p]
563+
returnself[p:p+1]
561564

562565
defprevious(self):
563566
p=self._position
@@ -566,7 +569,7 @@ def previous(self):
566569
elifp<0:
567570
raiseTypeError
568571
self._position=p=p-1
569-
returnself[p]
572+
returnself[p:p+1]
570573

571574
defsetPosition(self,position):
572575
ifself._position>=len(self):
@@ -584,15 +587,15 @@ def getPosition(self):
584587
position=property(getPosition,setPosition)
585588

586589
defgetCurrentByte(self):
587-
returnself[self.position]
590+
returnself[self.position:self.position+1]
588591

589592
currentByte=property(getCurrentByte)
590593

591594
defskip(self,chars=spaceCharactersBytes):
592595
"""Skip past a list of characters"""
593596
p=self.position# use property for the error-checking
594597
whilep<len(self):
595-
c=self[p]
598+
c=self[p:p+1]
596599
ifcnotinchars:
597600
self._position=p
598601
returnc
@@ -603,7 +606,7 @@ def skip(self, chars=spaceCharactersBytes):
603606
defskipUntil(self,chars):
604607
p=self.position
605608
whilep<len(self):
606-
c=self[p]
609+
c=self[p:p+1]
607610
ifcinchars:
608611
self._position=p
609612
returnc
@@ -645,12 +648,12 @@ def __init__(self, data):
645648

646649
defgetEncoding(self):
647650
methodDispatch= (
648-
("<!--",self.handleComment),
649-
("<meta",self.handleMeta),
650-
("</",self.handlePossibleEndTag),
651-
("<!",self.handleOther),
652-
("<?",self.handleOther),
653-
("<",self.handlePossibleStartTag))
651+
(b"<!--",self.handleComment),
652+
(b"<meta",self.handleMeta),
653+
(b"</",self.handlePossibleEndTag),
654+
(b"<!",self.handleOther),
655+
(b"<?",self.handleOther),
656+
(b"<",self.handlePossibleStartTag))
654657
forbyteinself.data:
655658
keepParsing=True
656659
forkey,methodinmethodDispatch:
@@ -663,37 +666,48 @@ def getEncoding(self):
663666
break
664667
ifnotkeepParsing:
665668
break
666-
669+
667670
returnself.encoding
668671

669672
defhandleComment(self):
670673
"""Skip over comments"""
671-
returnself.data.jumpTo("-->")
674+
returnself.data.jumpTo(b"-->")
672675

673676
defhandleMeta(self):
674677
ifself.data.currentBytenotinspaceCharactersBytes:
675678
#if we have <meta not followed by a space so just keep going
676679
returnTrue
677680
#We have a valid meta element we want to search for attributes
681+
hasPragma=False
682+
pendingEncoding=None
678683
whileTrue:
679684
#Try to find the next attribute after the current position
680685
attr=self.getAttribute()
681686
ifattrisNone:
682687
returnTrue
683688
else:
684-
ifattr[0]=="charset":
689+
ifattr[0]==b"http-equiv":
690+
hasPragma=attr[1]==b"content-type"
691+
ifhasPragmaandpendingEncodingisnotNone:
692+
self.encoding=pendingEncoding
693+
returnFalse
694+
elifattr[0]==b"charset":
685695
tentativeEncoding=attr[1]
686696
codec=codecName(tentativeEncoding)
687697
ifcodecisnotNone:
688698
self.encoding=codec
689699
returnFalse
690-
elifattr[0]=="content":
700+
elifattr[0]==b"content":
691701
contentParser=ContentAttrParser(EncodingBytes(attr[1]))
692702
tentativeEncoding=contentParser.parse()
693-
codec=codecName(tentativeEncoding)
694-
ifcodecisnotNone:
695-
self.encoding=codec
696-
returnFalse
703+
iftentativeEncodingisnotNone:
704+
codec=codecName(tentativeEncoding)
705+
ifcodecisnotNone:
706+
ifhasPragma:
707+
self.encoding=codec
708+
returnFalse
709+
else:
710+
pendingEncoding=codec
697711

698712
defhandlePossibleStartTag(self):
699713
returnself.handlePossibleTag(False)
@@ -714,7 +728,7 @@ def handlePossibleTag(self, endTag):
714728
returnTrue
715729

716730
c=data.skipUntil(spacesAngleBrackets)
717-
ifc=="<":
731+
ifc==b"<":
718732
#return to the first step in the overall "two step" algorithm
719733
#reprocessing the < byte
720734
data.previous()
@@ -726,31 +740,31 @@ def handlePossibleTag(self, endTag):
726740
returnTrue
727741

728742
defhandleOther(self):
729-
returnself.data.jumpTo(">")
743+
returnself.data.jumpTo(b">")
730744

731745
defgetAttribute(self):
732746
"""Return a name,value pair for the next attribute in the stream,
733747
if one is found, or None"""
734748
data=self.data
735749
# Step 1 (skip chars)
736-
c=data.skip(spaceCharactersBytes|frozenset("/"))
750+
c=data.skip(spaceCharactersBytes|frozenset([b"/"]))
751+
assertcisNoneorlen(c)==1
737752
# Step 2
738-
ifcin (">",None):
753+
ifcin (b">",None):
739754
returnNone
740755
# Step 3
741756
attrName= []
742757
attrValue= []
743758
#Step 4 attribute name
744759
whileTrue:
745-
ifc=="="andattrName:
760+
ifc==b"="andattrName:
746761
break
747762
elifcinspaceCharactersBytes:
748763
#Step 6!
749764
c=data.skip()
750-
c=next(data)
751765
break
752-
elifcin ("/",">"):
753-
return"".join(attrName),""
766+
elifcin (b"/",b">"):
767+
returnb"".join(attrName),b""
754768
elifcinasciiUppercaseBytes:
755769
attrName.append(c.lower())
756770
elifc==None:
@@ -760,15 +774,15 @@ def getAttribute(self):
760774
#Step 5
761775
c=next(data)
762776
#Step 7
763-
ifc!="=":
777+
ifc!=b"=":
764778
data.previous()
765-
return"".join(attrName),""
779+
returnb"".join(attrName),b""
766780
#Step 8
767781
next(data)
768782
#Step 9
769783
c=data.skip()
770784
#Step 10
771-
ifcin ("'",'"'):
785+
ifcin (b"'",b'"'):
772786
#10.1
773787
quoteChar=c
774788
whileTrue:
@@ -777,15 +791,15 @@ def getAttribute(self):
777791
#10.3
778792
ifc==quoteChar:
779793
next(data)
780-
return"".join(attrName),"".join(attrValue)
794+
returnb"".join(attrName),b"".join(attrValue)
781795
#10.4
782796
elifcinasciiUppercaseBytes:
783797
attrValue.append(c.lower())
784798
#10.5
785799
else:
786800
attrValue.append(c)
787-
elifc==">":
788-
return"".join(attrName),""
801+
elifc==b">":
802+
returnb"".join(attrName),b""
789803
elifcinasciiUppercaseBytes:
790804
attrValue.append(c.lower())
791805
elifcisNone:
@@ -796,7 +810,7 @@ def getAttribute(self):
796810
whileTrue:
797811
c=next(data)
798812
ifcinspacesAngleBrackets:
799-
return"".join(attrName),"".join(attrValue)
813+
returnb"".join(attrName),b"".join(attrValue)
800814
elifcinasciiUppercaseBytes:
801815
attrValue.append(c.lower())
802816
elifcisNone:
@@ -807,21 +821,22 @@ def getAttribute(self):
807821

808822
classContentAttrParser(object):
809823
def__init__(self,data):
824+
assertisinstance(data,bytes)
810825
self.data=data
811826
defparse(self):
812827
try:
813828
#Check if the attr name is charset
814829
#otherwise return
815-
self.data.jumpTo("charset")
830+
self.data.jumpTo(b"charset")
816831
self.data.position+=1
817832
self.data.skip()
818-
ifnotself.data.currentByte=="=":
833+
ifnotself.data.currentByte==b"=":
819834
#If there is no = sign keep looking for attrs
820835
returnNone
821836
self.data.position+=1
822837
self.data.skip()
823838
#Look for an encoding between matching quote marks
824-
ifself.data.currentBytein ('"',"'"):
839+
ifself.data.currentBytein (b'"',b"'"):
825840
quoteMark=self.data.currentByte
826841
self.data.position+=1
827842
oldPosition=self.data.position
@@ -845,6 +860,11 @@ def parse(self):
845860
defcodecName(encoding):
846861
"""Return the python codec name corresponding to an encoding or None if the
847862
string doesn't correspond to a valid encoding."""
863+
ifisinstance(encoding,bytes):
864+
try:
865+
encoding=encoding.decode("ascii")
866+
exceptUnicodeDecodeError:
867+
returnNone
848868
ifencoding:
849869
canonicalName=ascii_punctuation_re.sub("",encoding).lower()
850870
returnencodings.get(canonicalName,None)

‎html5lib/tests/test_encoding.py‎

Lines changed: 16 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,7 +23,7 @@ def test_codec_name_c(self):
2323
deftest_codec_name_d(self):
2424
self.assertEqual(inputstream.codecName("ISO_8859--1"),"windows-1252")
2525

26-
defrunEncodingTest(data,encoding):
26+
defrunParserEncodingTest(data,encoding):
2727
p=HTMLParser()
2828
t=p.parse(data,useChardet=False)
2929
encoding=encoding.lower().decode("ascii")
@@ -33,13 +33,27 @@ def runEncodingTest(data, encoding):
3333
repr(p.tokenizer.stream.charEncoding[0])))
3434
assertencoding==p.tokenizer.stream.charEncoding[0],errorMessage
3535

36+
37+
defrunPreScanEncodingTest(data,encoding):
38+
stream=inputstream.HTMLBinaryInputStream(data,chardet=False)
39+
encoding=encoding.lower().decode("ascii")
40+
41+
iflen(data)>stream.numBytesMeta:
42+
return
43+
44+
errorMessage= ("Input:\n%s\nExpected:\n%s\nRecieved\n%s\n"%
45+
(data,repr(encoding),
46+
repr(stream.charEncoding[0])))
47+
assertencoding==stream.charEncoding[0],errorMessage
48+
3649
deftest_encoding():
3750
forfilenameinget_data_files("encoding"):
3851
test_name=os.path.basename(filename).replace('.dat',''). \
3952
replace('-','')
4053
tests=TestData(filename,b"data",encoding=None)
4154
foridx,testinenumerate(tests):
42-
yield (runEncodingTest,test[b'data'],test[b'encoding'])
55+
yield (runParserEncodingTest,test[b'data'],test[b'encoding'])
56+
yield (runPreScanEncodingTest,test[b'data'],test[b'encoding'])
4357

4458
try:
4559
importchardet

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp