|
34 | 34 | else: |
35 | 35 | invalid_unicode_re=re.compile(invalid_unicode_no_surrogate) |
36 | 36 |
|
37 | | -non_bmp_invalid_codepoints=set([0x1FFFE,0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE, |
38 | | -0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,0x5FFFF, |
39 | | -0x6FFFE,0x6FFFF,0x7FFFE,0x7FFFF,0x8FFFE, |
40 | | -0x8FFFF,0x9FFFE,0x9FFFF,0xAFFFE,0xAFFFF, |
41 | | -0xBFFFE,0xBFFFF,0xCFFFE,0xCFFFF,0xDFFFE, |
42 | | -0xDFFFF,0xEFFFE,0xEFFFF,0xFFFFE,0xFFFFF, |
43 | | -0x10FFFE,0x10FFFF]) |
| 37 | +non_bmp_invalid_codepoints={0x1FFFE,0x1FFFF,0x2FFFE,0x2FFFF,0x3FFFE, |
| 38 | +0x3FFFF,0x4FFFE,0x4FFFF,0x5FFFE,0x5FFFF, |
| 39 | +0x6FFFE,0x6FFFF,0x7FFFE,0x7FFFF,0x8FFFE, |
| 40 | +0x8FFFF,0x9FFFE,0x9FFFF,0xAFFFE,0xAFFFF, |
| 41 | +0xBFFFE,0xBFFFF,0xCFFFE,0xCFFFF,0xDFFFE, |
| 42 | +0xDFFFF,0xEFFFE,0xEFFFF,0xFFFFE,0xFFFFF, |
| 43 | +0x10FFFE,0x10FFFF} |
44 | 44 |
|
45 | 45 | ascii_punctuation_re=re.compile("[\u0009-\u000D\u0020-\u002F\u003A-\u0040\u005C\u005B-\u0060\u007B-\u007E]") |
46 | 46 |
|
|