Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f5f28de

Browse files
committed
Fix handling of numeric entity refs
1 parent 002347d, commit f5f28de

File tree

2 files changed

+64
-17
lines changed

2 files changed

+64
-17
lines changed

‎src/html5lib/constants.py‎

Lines changed: 38 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -878,6 +878,44 @@
878878
"zwnj;":u"\u200C"
879879
}
880880

881+
replacementCharacters= {
882+
0x0:u"\uFFFD",
883+
0x0d:u"\u000A",
884+
0x80:u"\u20AC",
885+
0x81:u"\u0081",
886+
0x81:u"\u0081",
887+
0x82:u"\u201A",
888+
0x83:u"\u0192",
889+
0x84:u"\u201E",
890+
0x85:u"\u2026",
891+
0x86:u"\u2020",
892+
0x87:u"\u2021",
893+
0x88:u"\u02C6",
894+
0x89:u"\u2030",
895+
0x8A:u"\u0160",
896+
0x8B:u"\u2039",
897+
0x8C:u"\u0152",
898+
0x8D:u"\u008D",
899+
0x8E:u"\u017D",
900+
0x8F:u"\u008F",
901+
0x90:u"\u0090",
902+
0x91:u"\u2018",
903+
0x92:u"\u2019",
904+
0x93:u"\u201C",
905+
0x94:u"\u201D",
906+
0x95:u"\u2022",
907+
0x96:u"\u2013",
908+
0x97:u"\u2014",
909+
0x98:u"\u02DC",
910+
0x99:u"\u2122",
911+
0x9A:u"\u0161",
912+
0x9B:u"\u203A",
913+
0x9C:u"\u0153",
914+
0x9D:u"\u009D",
915+
0x9E:u"\u017E",
916+
0x9F:u"\u0178",
917+
}
918+
881919
encodings= {
882920
'437':'cp437',
883921
'850':'cp850',

‎src/html5lib/tokenizer.py‎

Lines changed: 26 additions & 17 deletions
Original file line numberDiff line numberDiff line change
@@ -14,6 +14,7 @@
1414
from constants import asciiLowercase, asciiLetters, asciiUpper2Lower
1515
from constants import digits, hexDigits, EOF
1616
from constants import tokenTypes, tagTokenTypes
17+
from constants import replacementCharacters
1718

1819
from inputstream import HTMLInputStream
1920

@@ -96,29 +97,37 @@ def consumeNumberEntity(self, isHex):
9697
# Convert the set of characters consumed to an int.
9798
charAsInt=int("".join(charStack),radix)
9899

99-
if charAsInt==13:
100+
# Certain characters get replaced with others
101+
if charAsInt in replacementCharacters:
102+
char=replacementCharacters[charAsInt]
100103
self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":
101-
"incorrect-cr-newline-entity"})
102-
charAsInt=10
103-
elif 127<charAsInt<160:
104-
# If the integer is between 127 and 160 (so 128 and bigger and 159
105-
# and smaller) we need to do the "windows trick".
106-
self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":
107-
"illegal-windows-1252-entity"})
108-
109-
charAsInt=entitiesWindows1252[charAsInt-128]
110-
111-
# Certain characters get replaced with U+FFFD
112-
if ((charAsInt<=0x0008) or (charAsInt==0x000B) or (0x000E<=charAsInt<=0x001F)
113-
or (0x007F<=charAsInt<=0x009F)
114-
or (0xD800<=charAsInt<=0xDFFF) or (0xFDD0<=charAsInt<=0xFDEF)
115-
or (charAsInt&0xFFFE==0xFFFE)  # catch all U+?FFFE and U+?FFFF, where ? is 0..10
116-
or (0x10FFFF<charAsInt)):
104+
"illegal-codepoint-for-numeric-entity",
105+
"datavars": {"charAsInt":charAsInt}})
106+
elif ((0xD800<=charAsInt<=0xDFFF) or
107+
(charAsInt>0x10FFFF)):
117108
char=u"\uFFFD"
118109
self.tokenQueue.append({"type":tokenTypes["ParseError"],"data":
119110
"illegal-codepoint-for-numeric-entity",
120111
"datavars": {"charAsInt":charAsInt}})
121112
else:
113+
#Should speed up this check somehow (e.g. move the set to a constant)
114+
if ((0x0001<=charAsInt<=0x0008) or
115+
(0x000E<=charAsInt<=0x001F) or
116+
(0x007F<=charAsInt<=0x009F) or
117+
(0xFDD0<=charAsInt<=0xFDEF) or
118+
charAsInt in frozenset([0x000B,0xFFFE,0xFFFF,0x1FFFE,
119+
0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE,
120+
0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,
121+
0x5FFFF,0x6FFFE,0x6FFFF,0x7FFFE,
122+
0x7FFFF,0x8FFFE,0x8FFFF,0x9FFFE,
123+
0x9FFFF,0xAFFFE,0xAFFFF,0xBFFFE,
124+
0xBFFFF,0xCFFFE,0xCFFFF,0xDFFFE,
125+
0xDFFFF,0xEFFFE,0xEFFFF,0xFFFFE,
126+
0xFFFFF,0x10FFFE,0x10FFFF])):
127+
self.tokenQueue.append({"type":tokenTypes["ParseError"],
128+
"data":
129+
"illegal-codepoint-for-numeric-entity",
130+
"datavars": {"charAsInt":charAsInt}})
122131
try:
123132
# XXX We should have a separate function that does "int" to
124133
# "unicodestring" conversion since this doesn't always work

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp