Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Google Code Issue 157: Add "escape invisible characters" option #38

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
gsnedders wants to merge 2 commits into master
base:master
Choose a base branch
Loading
fromgcode-157
Open
Show file tree
Hide file tree
Changes fromall commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
16 changes: 16 additions & 0 deletionshtml5lib/constants.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -4,6 +4,9 @@
import gettext
_ = gettext.gettext

from itertools import chain


EOF = None

E = {
Expand DownExpand Up@@ -3078,6 +3081,19 @@
prefixes["http://www.w3.org/1998/Math/MathML"] = "math"


# Code points of the characters treated as invisible.  Tab (0x9), LF (0xA),
# CR (0xD) and ASCII space (0x20) are not part of the set: the ASCII control
# ranges below step around them.
# NOTE(review): DEL and the C1 controls (0x7F-0x9F) are not included.
invisibleChars = frozenset(chain.from_iterable((
    # ASCII control characters
    range(0x00, 0x09),
    range(0x0B, 0x0D),
    range(0x0E, 0x20),
    # Fixed-width spaces, zero-width marks and bidi marks
    range(0x2000, 0x2010),
    # Line separator, paragraph separator and bidi control codes
    range(0x2028, 0x2030),
    # No-break space
    [0x00A0],
    # Mathematical space and word joiner
    [0x205F, 0x2060],
    # Ideographic space
    [0x3000],
    # Interlinear annotation characters
    [0xFFF9, 0xFFFA, 0xFFFB],
)))


class DataLossWarning(UserWarning):
    """Warning category derived from ``UserWarning``.

    NOTE(review): presumably issued when an operation has to drop or alter
    data -- confirm against the call sites, which are not visible here.
    """

Expand Down
10 changes: 9 additions & 1 deletionhtml5lib/serializer/htmlserializer.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -94,6 +94,7 @@ class HTMLSerializer(object):
# escaping options
escape_lt_in_attrs = False
escape_rcdata = False
escape_invisible = False
resolve_entities = True

# miscellaneous options
Expand All@@ -105,7 +106,8 @@ class HTMLSerializer(object):
"minimize_boolean_attributes", "use_trailing_solidus",
"space_before_trailing_solidus", "omit_optional_tags",
"strip_whitespace", "inject_meta_charset", "escape_lt_in_attrs",
"escape_rcdata", "resolve_entities", "sanitize")
"escape_rcdata", "escape_invisible", "resolve_entities",
"sanitize")

def __init__(self, **kwargs):
"""Initialize HTMLSerializer.
Expand All@@ -127,6 +129,10 @@ def __init__(self, **kwargs):
escape_rcdata=False|True
Whether to escape characters that need to be escaped within normal
elements within rcdata elements such as style.
escape_invisible=False|True|'numeric'|'named'
Whether to escape invisible characters (such as nbsp, fixed-width
spaces, and control codes). Uses named HTML escapes if 'named'
is specified, otherwise uses numeric codes.
resolve_entities=True|False
Whether to resolve named character entities that appear in the
source tree. The XML predefined entities < > & " '
Expand DownExpand Up@@ -160,6 +166,8 @@ def __init__(self, **kwargs):

def encode(self, string):
assert(isinstance(string, text_type))
if self.escape_invisible:
string = utils.escapeInvisible(string, self.escape_invisible == 'named')
if self.encoding:
return string.encode(self.encoding, unicode_encode_errors)
else:
Expand Down
28 changes: 28 additions & 0 deletionshtml5lib/utils.py
View file
Open in desktop
Original file line numberDiff line numberDiff line change
Expand Up@@ -2,6 +2,10 @@

from types import ModuleType

from six import text_type

from .constants import invisibleChars


class MethodDispatcher(dict):
"""Dict with 2 special properties:
Expand DownExpand Up@@ -71,3 +75,27 @@ def moduleFactory(baseModule, *args, **kwargs):
return mod

return moduleFactory


def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space.

    Every character of ``text`` whose code point is in ``invisibleChars``
    is replaced by an HTML character reference; all other characters are
    left untouched.

    :arg text: the text to escape; must be a ``text_type`` instance
    :arg useNamedEntities: if true, use a named reference (for example
        ``&nbsp;``) for code points that have an entry in the standard
        library's ``codepoint2name`` table and a numeric reference for the
        rest; if false, always use numeric references (``&#xHH;``)

    :returns: the escaped text, or ``text`` itself when it contains
        nothing to escape
    """
    # isinstance (not an exact type comparison) so text_type subclasses are
    # accepted, matching the check done by the serializer's encode().
    assert isinstance(text, text_type)

    # This algorithm is O(MN) for M len(text) and N num escapable, but it
    # leaves the text alone when N is zero (the common case) and N is
    # expected to be small (usually 1 or 2) in most other cases.
    escapable = set(c for c in text if ord(c) in invisibleChars)
    if not escapable:
        return text

    if useNamedEntities:
        # The entity table lives in a different module on Python 2 and 3.
        try:
            from html.entities import codepoint2name
        except ImportError:
            from htmlentitydefs import codepoint2name
    else:
        # An empty table makes every lookup miss, i.e. numeric-only output.
        codepoint2name = {}

    for c in escapable:
        codepoint = ord(c)
        name = codepoint2name.get(codepoint)
        if name:
            reference = "&%s;" % name
        else:
            reference = "&#x%X;" % codepoint
        # The reference consists of printable ASCII only, so a later pass
        # can never re-escape part of an earlier replacement.
        text = text.replace(c, reference)

    return text

[8]ページ先頭

©2009-2025 Movatter.jp