Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit f4f1fb8

Browse files
fantasaigsnedders
authored andcommitted
Google Code Issue 157: Add "escape invisible characters" option
Vaguely updated, but basically working.
1 parent 073d792, commit f4f1fb8

File tree

3 files changed

+51
-1
lines changed

3 files changed

+51
-1
lines changed

‎html5lib/constants.py

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,9 @@
44
importgettext
55
_=gettext.gettext
66

7+
fromitertoolsimportchain
8+
9+
710
EOF=None
811

912
E= {
@@ -3078,6 +3081,19 @@
30783081
prefixes["http://www.w3.org/1998/Math/MathML"]="math"
30793082

30803083

3084+
# Code points of characters that have no visible glyph and are candidates
# for escaping on serialization.  Tab (0x9), LF (0xA), CR (0xD) and the
# ASCII space (0x20) are deliberately left out.
invisibleChars = frozenset(chain(
    # ASCII (C0) control chars, skipping Tab, LF and CR
    range(0x0, 0x9), range(0xB, 0xD), range(0xE, 0x20),
    # NOTE: DEL (0x7F) and the C1 control chars (0x80-0x9F) are not
    # included in this set.
    # fixed-width spaces, zero-width marks, bidi marks
    range(0x2000, 0x2010),
    # LS, PS, bidi control codes
    range(0x2028, 0x2030),
    # nbsp, mathsp, ideosp, WJ, interlinear
    [0x00A0, 0x205F, 0x3000, 0x2060, 0xFFF9, 0xFFFA, 0xFFFB]
))
3095+
3096+
30813097
classDataLossWarning(UserWarning):
30823098
pass
30833099

‎html5lib/serializer/htmlserializer.py

Lines changed: 9 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,7 @@ class HTMLSerializer(object):
9494
# escaping options
9595
escape_lt_in_attrs=False
9696
escape_rcdata=False
97+
escape_invisible=False
9798
resolve_entities=True
9899

99100
# miscellaneous options
@@ -105,7 +106,8 @@ class HTMLSerializer(object):
105106
"minimize_boolean_attributes","use_trailing_solidus",
106107
"space_before_trailing_solidus","omit_optional_tags",
107108
"strip_whitespace","inject_meta_charset","escape_lt_in_attrs",
108-
"escape_rcdata","resolve_entities","sanitize")
109+
"escape_rcdata","escape_invisible","resolve_entities",
110+
"sanitize")
109111

110112
def__init__(self,**kwargs):
111113
"""Initialize HTMLSerializer.
@@ -127,6 +129,10 @@ def __init__(self, **kwargs):
127129
escape_rcdata=False|True
128130
Whether to escape characters that need to be escaped within normal
129131
elements within rcdata elements such as style.
132+
escape_invisible=False|True|'numeric'|'named'
133+
Whether to escape invisible characters (such as nbsp, fixed-width
134+
spaces, and control codes). Uses named HTML escapes if 'named'
135+
is specified, otherwise uses numeric codes.
130136
resolve_entities=True|False
131137
Whether to resolve named character entities that appear in the
132138
source tree. The XML predefined entities < > & " '
@@ -160,6 +166,8 @@ def __init__(self, **kwargs):
160166

161167
defencode(self,string):
162168
assert(isinstance(string,text_type))
169+
ifself.escape_invisible:
170+
text=utils.escapeInvisible(text,self.escape_invisible=='named')
163171
ifself.encoding:
164172
returnstring.encode(self.encoding,unicode_encode_errors)
165173
else:

‎html5lib/utils.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,8 @@
22

33
fromtypesimportModuleType
44

5+
from .constantsimportinvisibleChars
6+
57

68
classMethodDispatcher(dict):
79
"""Dict with 2 special properties:
@@ -71,3 +73,27 @@ def moduleFactory(baseModule, *args, **kwargs):
7173
returnmod
7274

7375
returnmoduleFactory
76+
77+
78+
def escapeInvisible(text, useNamedEntities=False):
    """Escape invisible characters other than Tab, LF, CR, and ASCII space

    Every character of ``text`` whose code point is in ``invisibleChars``
    is replaced by an HTML character reference.

    text is the unicode string to escape.

    useNamedEntities selects the reference style: when true, a named
    reference (``&name;``) is used for every code point that has an entry
    in the standard library's ``codepoint2name`` table and a hexadecimal
    numeric reference (``&#xHH;``) for the rest; when false, hexadecimal
    numeric references are used throughout.

    Returns the escaped string; ``text`` itself is returned unchanged when
    it contains no escapable character.
    """
    # type(u"") is the text type on both Python 2 (unicode) and Python 3
    # (str), so no extra import is needed for the check.
    assert isinstance(text, type(u""))

    # This algorithm is O(MN) for M len(text) and N num escapable
    # But it doesn't modify the text when N is zero (common case) and
    # N is expected to be small (usually 1 or 2) in most other cases.
    escapable = set(c for c in text if ord(c) in invisibleChars)
    if not escapable:
        return text

    if useNamedEntities:
        # The table moved between Python 2 and 3; import it lazily so the
        # numeric-only path never pays for it.
        try:
            from html.entities import codepoint2name
        except ImportError:  # Python 2
            from htmlentitydefs import codepoint2name
    else:
        # An empty table makes every lookup miss, i.e. always numeric.
        codepoint2name = {}

    for c in escapable:
        name = codepoint2name.get(ord(c))
        escape = "&%s;" % name if name else "&#x%X;" % ord(c)
        text = text.replace(c, escape)

    return text

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp