1- #!/usr/bin/python2
1+ #!/usr/bin/python
22# -*- coding: utf-8 -*-
33#
44# This script builds unaccent.rules on standard output when given the
2323# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
2424# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
2525
26+ # BEGIN: Python 2/3 compatibility - remove when Python 2 compatibility dropped
27+ # The approach is to be Python3 compatible with Python2 "backports".
# The two __future__ imports request the print() function and unicode string
# literals, so the rest of the script can be written in Python 3 style.
28+ from __future__import print_function
29+ from __future__import unicode_literals
30+ import codecs
31+ import sys
32+
# The test below is true only when the interpreter's major version is 2 or
# lower. NOTE(review): indentation is not preserved in this view; judging by
# the comments, the stdout wrapping, the chr alias and the bytes() definition
# that follow are the Python 2 "backports" guarded by this test -- confirm
# against the file.
33+ if sys .version_info [0 ]<= 2 :
34+ # Encode stdout as UTF-8, so we can just print to it
# sys.stdout is replaced by a codecs UTF-8 stream writer wrapped around the
# original stream, so text passed to print() is encoded on the way out.
35+ sys .stdout = codecs .getwriter ('utf8' )(sys .stdout )
36+
37+ # Map Python 2's chr to unichr
# After this rebinding, the chr(...) calls elsewhere in the script resolve to
# unichr.
38+ chr = unichr
39+
40+ # Python 2 and 3 compatible bytes call
# bytes(source, encoding, errors) here returns source.encode(...) with the
# given encoding and error handler; the defaults are 'ascii' and 'strict'.
41+ def bytes (source ,encoding = 'ascii' ,errors = 'strict' ):
42+ return source .encode (encoding = encoding ,errors = errors )
43+ # END: Python 2/3 compatibility - remove when Python 2 compatibility dropped
2644
2745import re
2846import argparse
3957 (0x0391 ,0x03a9 ))# GREEK CAPITAL LETTER ALPHA, GREEK CAPITAL LETTER OMEGA
4058
4159def print_record (codepoint ,letter ):
# Prints one record: the character for the integer `codepoint`, followed by
# the tab-separator string literal, followed by `letter`.
# The "-" line is the earlier form, which built the character with unichr()
# and called .encode("UTF-8") explicitly; the "+" line replaces it with chr()
# and no explicit encode call.
42- print (unichr (codepoint )+ "\t " + letter ). encode ( "UTF-8" )
60+ print (chr (codepoint )+ "\t " + letter )
4361
4462class Codepoint :
4563def __init__ (self ,id ,general_category ,combining_ids ):
@@ -116,7 +134,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
116134charactersSet = set ()
117135
118136# RegEx to parse rules
119- rulePattern = re .compile (ur '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
137+ rulePattern = re .compile (r '^(?:(.)|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'|(.+)) ;' )
120138
121139# construct tree from XML
122140transliterationTree = ET .parse (latinAsciiFilePath )
@@ -134,7 +152,7 @@ def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
134152# Group 3: plain "trg" char. Empty if group 4 is not.
135153# Group 4: plain "trg" char between quotes. Empty if group 3 is not.
136154if matches is not None :
137- src = matches .group (1 )if matches .group (1 )is not None else matches .group (2 ).decode ('unicode-escape' )
155+ src = matches .group (1 )if matches .group (1 )is not None else bytes ( matches .group (2 ), 'UTF-8' ).decode ('unicode-escape' )
138156trg = matches .group (3 )if matches .group (3 )is not None else matches .group (4 )
139157
140158# "'" and """ are escaped
@@ -195,10 +213,10 @@ def main(args):
195213len (codepoint .combining_ids )> 1 :
196214if is_letter_with_marks (codepoint ,table ):
197215charactersSet .add ((codepoint .id ,
198- unichr (get_plain_letter (codepoint ,table ).id )))
216+ chr (get_plain_letter (codepoint ,table ).id )))
199217elif args .noLigaturesExpansion is False and is_ligature (codepoint ,table ):
200218charactersSet .add ((codepoint .id ,
201- "" .join (unichr (combining_codepoint .id )
219+ "" .join (chr (combining_codepoint .id )
202220for combining_codepoint \
203221in get_plain_letters (codepoint ,table ))))
204222