Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Do not directly use isolated surrogates in unicode literals #150

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account-related emails.

Already on GitHub? Sign in to your account

Closed
jimbaker wants to merge 6 commits into html5lib:master from jimbaker:master
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions — .gitignore
View file
Open in desktop
Original file line number | Diff line number | Diff line change
@@ -1,6 +1,7 @@
# Because we never want compiled Python
__pycache__/
*.pyc
*.py$class

# Ignore stuff produced by distutils
/build/
Expand Down
42 changes: 32 additions & 10 deletions — html5lib/inputstream.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
from __future__ import absolute_import, division, unicode_literals
from six import text_type
from six import text_type, unichr
from six.moves import http_client

import codecs
Expand DownExpand Up@@ -28,7 +28,18 @@ class BufferedIOBase(object):
asciiUppercaseBytes = frozenset([item.encode("ascii") for item in asciiUppercase])
spacesAngleBrackets = spaceCharactersBytes | frozenset([b">", b"<"])

invalid_unicode_re = re.compile("[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uD800-\uDFFF\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF]")

invalid_unicode_template = "[\u0001-\u0008\u000B\u000E-\u001F\u007F-\u009F\uFDD0-\uFDEF\uFFFE\uFFFF\U0001FFFE\U0001FFFF\U0002FFFE\U0002FFFF\U0003FFFE\U0003FFFF\U0004FFFE\U0004FFFF\U0005FFFE\U0005FFFF\U0006FFFE\U0006FFFF\U0007FFFE\U0007FFFF\U0008FFFE\U0008FFFF\U0009FFFE\U0009FFFF\U000AFFFE\U000AFFFF\U000BFFFE\U000BFFFF\U000CFFFE\U000CFFFF\U000DFFFE\U000DFFFF\U000EFFFE\U000EFFFF\U000FFFFE\U000FFFFF\U0010FFFE\U0010FFFF%s]"

if utils.supports_lone_surrogates:
# Use one extra step of indirection and create surrogates with
# unichr. Not using this indirection would introduce an illegal
# unicode literal on platforms not supporting such lone
# surrogates.
invalid_unicode_re = re.compile(invalid_unicode_template % (
"%s-%s" % (unichr(0xD800), unichr(0xDFFF)),))
else:
invalid_unicode_re = re.compile(invalid_unicode_template % "")

non_bmp_invalid_codepoints = set([0x1FFFE, 0x1FFFF, 0x2FFFE, 0x2FFFF, 0x3FFFE,
0x3FFFF, 0x4FFFE, 0x4FFFF, 0x5FFFE, 0x5FFFF,
Expand DownExpand Up@@ -164,13 +175,23 @@ def __init__(self, source):

"""

# Craziness
if len("\U0010FFFF") == 1:
if not utils.supports_lone_surrogates:
# Such platforms will have already checked for such
# surrogate errors, so no need to do this checking.
self.reportCharacterErrors = None
self.replaceCharactersRegexp = None
elif len("\U0010FFFF") == 1:
self.reportCharacterErrors = self.characterErrorsUCS4
self.replaceCharactersRegexp = re.compile("[\uD800-\uDFFF]")
self.replaceCharactersRegexp = re.compile("[%s-%s]" % (
unichr(0xD800), unichr(0xDFFF)))
else:
self.reportCharacterErrors = self.characterErrorsUCS2
self.replaceCharactersRegexp = re.compile("([\uD800-\uDBFF](?![\uDC00-\uDFFF])|(?<![\uD800-\uDBFF])[\uDC00-\uDFFF])")
self.replaceCharactersRegexp = re.compile(
"([%s-%s](?![%s-%s])|(?<![%s-%s])[%s-%s])" % (
unichr(0xD800), unichr(0xDBFF),
unichr(0xDC00), unichr(0xDFFF),
unichr(0xD800), unichr(0xDBFF),
unichr(0xDC00), unichr(0xDFFF)))

# List of where new lines occur
self.newLines = [0]
Expand DownExpand Up@@ -265,11 +286,12 @@ def readChunk(self, chunkSize=None):
self._bufferedCharacter = data[-1]
data = data[:-1]

self.reportCharacterErrors(data)
if utils.supports_lone_surrogates:
self.reportCharacterErrors(data)

# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)
# Replace invalid characters
# Note U+0000 is dealt with in the tokenizer
data = self.replaceCharactersRegexp.sub("\ufffd", data)

data = data.replace("\r\n", "\n")
data = data.replace("\r", "\n")
Expand Down
25 changes: 23 additions & 2 deletions — html5lib/tests/test_tokenizer.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
Expand Up@@ -7,7 +7,7 @@
from .support import get_data_files

from html5lib.tokenizer import HTMLTokenizer
from html5lib import constants
from html5lib import constants, utils


class TokenizerTestParser(object):
Expand DownExpand Up@@ -122,9 +122,28 @@ def tokensMatch(expectedTokens, receivedTokens, ignoreErrorOrder,
return tokens["expected"] == tokens["received"]


_surrogateRe = re.compile(r"\\u(?P<codepoint>[0-9A-Fa-f]{4})")


def unescape(test):
def decode(inp):
return inp.encode("utf-8").decode("unicode-escape")
try:
return inp.encode("utf-8").decode("unicode-escape")
except UnicodeDecodeError:
possible_surrogate_match = _surrogateRe.search(inp)
if possible_surrogate_match and not utils.supports_lone_surrogates:
possible_surrogate = int(possible_surrogate_match.group("codepoint"), 16)
if possible_surrogate >= 0xD800 and possible_surrogate <= 0xDFFF:
# Not valid unicode input for platforms that do
# not have support for lone surrogates.
#
# NOTE it's not even possible to have such
# isolated surrogates in unicode input streams in
# such platforms (like Jython) - the decoding to
# unicode would have raised a similar
# UnicodeDecodeError.
return None
raise

test["input"] = decode(test["input"])
for token in test["output"]:
Expand DownExpand Up@@ -183,6 +202,8 @@ def testTokenizer():
test["initialStates"] = ["Data state"]
if 'doubleEscaped' in test:
test = unescape(test)
if test["input"] is None:
continue # Not valid input for this platform
for initialState in test["initialStates"]:
test["initialState"] = capitalize(initialState)
yield runTokenizerTest, test
14 changes: 13 additions & 1 deletion — html5lib/utils.py
View file
Open in desktop
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,6 @@
from __future__ import absolute_import, division, unicode_literals

import platform
from types import ModuleType

try:
Expand All@@ -9,7 +10,18 @@


__all__ = ["default_etree", "MethodDispatcher", "isSurrogatePair",
"surrogatePairToCodepoint", "moduleFactoryFactory"]
"surrogatePairToCodepoint", "moduleFactoryFactory",
"supports_lone_surrogates"]


# Platforms not supporting lone surrogates (\uD800-\uDFFF) should be
# added to the below test. In general this would be any platform using
# UTF-16 as its encoding of unicode strings, such as Jython. This is
# because UTF-16 itself is based on the use of such surrogates, and
# there is no mechanism to further escape such escapes.
#
# Otherwise we assume such support.
supports_lone_surrogates = platform.python_implementation() != "Jython"


class MethodDispatcher(dict):
Expand Down

[8]ページ先頭

©2009-2025 Movatter.jp