|
3 | 3 |
|
4 | 4 | importre |
5 | 5 |
|
| 6 | +fromcodecsimportregister_error,xmlcharrefreplace_errors |
| 7 | + |
6 | 8 | from ..constantsimportvoidElements,booleanAttributes,spaceCharacters |
7 | 9 | from ..constantsimportrcdataElements,entities,xmlEntities |
8 | 10 | from ..importutils |
|
21 | 23 | "\u2008\u2009\u200a\u2028\u2029\u202f\u205f" |
22 | 24 | "\u3000]") |
23 | 25 |
|
24 | | -try: |
25 | | -fromcodecsimportregister_error,xmlcharrefreplace_errors |
26 | | -exceptImportError: |
27 | | -unicode_encode_errors="strict" |
28 | | -else: |
29 | | -unicode_encode_errors="htmlentityreplace" |
30 | | - |
31 | | -encode_entity_map= {} |
32 | | -is_ucs4=len("\U0010FFFF")==1 |
33 | | -fork,vinlist(entities.items()): |
34 | | -# skip multi-character entities |
35 | | -if ((is_ucs4andlen(v)>1)or |
36 | | - (notis_ucs4andlen(v)>2)): |
37 | | -continue |
38 | | -ifv!="&": |
39 | | -iflen(v)==2: |
40 | | -v=utils.surrogatePairToCodepoint(v) |
41 | | -else: |
42 | | -v=ord(v) |
43 | | -ifvnotinencode_entity_mapork.islower(): |
44 | | -# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
45 | | -encode_entity_map[v]=k |
46 | | - |
47 | | -defhtmlentityreplace_errors(exc): |
48 | | -ifisinstance(exc, (UnicodeEncodeError,UnicodeTranslateError)): |
49 | | -res= [] |
50 | | -codepoints= [] |
51 | | -skip=False |
52 | | -fori,cinenumerate(exc.object[exc.start:exc.end]): |
53 | | -ifskip: |
54 | | -skip=False |
55 | | -continue |
56 | | -index=i+exc.start |
57 | | -ifutils.isSurrogatePair(exc.object[index:min([exc.end,index+2])]): |
58 | | -codepoint=utils.surrogatePairToCodepoint(exc.object[index:index+2]) |
59 | | -skip=True |
60 | | -else: |
61 | | -codepoint=ord(c) |
62 | | -codepoints.append(codepoint) |
63 | | -forcpincodepoints: |
64 | | -e=encode_entity_map.get(cp) |
65 | | -ife: |
66 | | -res.append("&") |
67 | | -res.append(e) |
68 | | -ifnote.endswith(";"): |
69 | | -res.append(";") |
70 | | -else: |
71 | | -res.append("&#x%s;"% (hex(cp)[2:])) |
72 | | -return ("".join(res),exc.end) |
73 | | -else: |
74 | | -returnxmlcharrefreplace_errors(exc) |
75 | 26 |
|
76 | | -register_error(unicode_encode_errors,htmlentityreplace_errors) |
| 27 | +encode_entity_map= {} |
| 28 | +is_ucs4=len("\U0010FFFF")==1 |
| 29 | +fork,vinlist(entities.items()): |
| 30 | +# skip multi-character entities |
| 31 | +if ((is_ucs4andlen(v)>1)or |
| 32 | + (notis_ucs4andlen(v)>2)): |
| 33 | +continue |
| 34 | +ifv!="&": |
| 35 | +iflen(v)==2: |
| 36 | +v=utils.surrogatePairToCodepoint(v) |
| 37 | +else: |
| 38 | +v=ord(v) |
| 39 | +ifvnotinencode_entity_mapork.islower(): |
| 40 | +# prefer &lt; over &LT; and similarly for &amp;, &gt;, etc. |
| 41 | +encode_entity_map[v]=k |
| 42 | + |
| 43 | + |
| 44 | +defhtmlentityreplace_errors(exc): |
| 45 | +ifisinstance(exc, (UnicodeEncodeError,UnicodeTranslateError)): |
| 46 | +res= [] |
| 47 | +codepoints= [] |
| 48 | +skip=False |
| 49 | +fori,cinenumerate(exc.object[exc.start:exc.end]): |
| 50 | +ifskip: |
| 51 | +skip=False |
| 52 | +continue |
| 53 | +index=i+exc.start |
| 54 | +ifutils.isSurrogatePair(exc.object[index:min([exc.end,index+2])]): |
| 55 | +codepoint=utils.surrogatePairToCodepoint(exc.object[index:index+2]) |
| 56 | +skip=True |
| 57 | +else: |
| 58 | +codepoint=ord(c) |
| 59 | +codepoints.append(codepoint) |
| 60 | +forcpincodepoints: |
| 61 | +e=encode_entity_map.get(cp) |
| 62 | +ife: |
| 63 | +res.append("&") |
| 64 | +res.append(e) |
| 65 | +ifnote.endswith(";"): |
| 66 | +res.append(";") |
| 67 | +else: |
| 68 | +res.append("&#x%s;"% (hex(cp)[2:])) |
| 69 | +return ("".join(res),exc.end) |
| 70 | +else: |
| 71 | +returnxmlcharrefreplace_errors(exc) |
77 | 72 |
|
78 | | -delregister_error |
| 73 | +register_error("htmlentityreplace",htmlentityreplace_errors) |
79 | 74 |
|
80 | 75 |
|
81 | 76 | classHTMLSerializer(object): |
@@ -168,7 +163,7 @@ def __init__(self, **kwargs): |
168 | 163 | defencode(self,string): |
169 | 164 | assert(isinstance(string,text_type)) |
170 | 165 | ifself.encoding: |
171 | | -returnstring.encode(self.encoding,unicode_encode_errors) |
| 166 | +returnstring.encode(self.encoding,"htmlentityreplace") |
172 | 167 | else: |
173 | 168 | returnstring |
174 | 169 |
|
|