Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit9a206d0

Browse files
committed
Improve script generating unaccent rules
The script now uses the standard Unicode transliterator Latin-ASCII. Author: Leonard Benedetti
1 parent3aff33a commit9a206d0

File tree

2 files changed

+762
-56
lines changed

2 files changed

+762
-56
lines changed

‎contrib/unaccent/generate_unaccent_rules.py

Lines changed: 107 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -1,20 +1,33 @@
1-
#!/usr/bin/python
1+
#!/usr/bin/python2
2+
# -*- coding: utf-8 -*-
23
#
34
# This script builds unaccent.rules on standard output when given the
4-
# contents of UnicodeData.txt[1] on standard input. Optionally includes
5-
# ligature expansion, if --expand-ligatures is given on the command line.
5+
# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as
6+
# arguments. Ligature expansion and the Unicode CLDR Latin-ASCII
7+
# transliterator are enabled by default; both can be disabled with the
8+
# "--no-ligatures-expansion" command line option.
69
#
710
# The approach is to use the Unicode decomposition data to identify
811
# precomposed codepoints that are equivalent to a ligature of several
912
# letters, or a base letter with any number of diacritical marks.
10-
# There is also a small set of special cases for codepoints that we
11-
# traditionally support even though Unicode doesn't consider them to
12-
# be ligatures or letters with marks.
1313
#
14-
# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
14+
# This approach handles most letters with diacritical marks and some
15+
# ligatures. However, several characters (notably a majority of
16+
# ligatures) don't have a decomposition. To handle these cases, one can
17+
# use a standard Unicode transliterator available in the Common Locale Data
18+
# Repository (CLDR): Latin-ASCII. This transliterator maps Unicode
19+
# characters to their ASCII-range equivalents. Unless "--no-ligatures-expansion"
20+
# option is enabled, the XML file of this transliterator [2] -- given as a
21+
# command line argument -- will be parsed and used.
22+
#
23+
# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt
24+
# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml
25+
1526

1627
importre
28+
importargparse
1729
importsys
30+
importxml.etree.ElementTreeasET
1831

1932
defprint_record(codepoint,letter):
2033
print (unichr(codepoint)+"\t"+letter).encode("UTF-8")
@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):
6376
assert(is_ligature(codepoint,table))
6477
return [get_plain_letter(table[id],table)foridincodepoint.combining_ids]
6578

66-
defmain(expand_ligatures):
79+
def parse_cldr_latin_ascii_transliterator(latinAsciiFilePath):
    """Parse the CLDR Latin-ASCII transliterator XML file.

    Every "tRule" element found under "./transforms/transform" is matched
    against a simple "src <arrow> trg ;" pattern, where <arrow> is U+2192.
    Elements whose text does not start with such a rule, or that have no
    text at all, are skipped.

    Args:
        latinAsciiFilePath: location of Latin-ASCII.xml, passed unchanged
            to xml.etree.ElementTree.parse().

    Returns:
        A set of tuples (src, trg), where "src" is the code point (int) of
        the original character and "trg" is the substitute string.
    """
    # The regular expression captures four groups:
    #
    # Group 1: plain "src" char. None if group 2 matched.
    # Group 2: unicode-escaped "src" char (e.g. "\u0110"). None if group 1
    #          matched.
    #
    # Group 3: "trg" string written between single quotes. None if group 4
    #          matched.
    # Group 4: plain (unquoted) "trg" string. None if group 3 matched.
    #
    # A plain u"" literal (rather than ur"") keeps the pattern identical
    # while staying valid syntax on both Python 2 and Python 3.
    rule_pattern = re.compile(
        u"^(?:(.)|(\\\\u[0-9a-fA-F]{4})) \u2192 (?:'(.+)'|(.+)) ;")

    characters = set()

    # construct tree from XML
    root = ET.parse(latinAsciiFilePath).getroot()

    for rule in root.findall("./transforms/transform/tRule"):
        # An empty element (<tRule/>) has no text to match against.
        if rule.text is None:
            continue

        match = rule_pattern.search(rule.text)
        if match is None:
            continue

        plain_src, escaped_src, quoted_trg, plain_trg = match.groups()

        if plain_src is not None:
            src = plain_src
        else:
            # The pattern guarantees pure ASCII here ("\uXXXX"), so the
            # round trip through bytes is safe on Python 2 and Python 3.
            src = escaped_src.encode("ascii").decode("unicode_escape")

        trg = quoted_trg if quoted_trg is not None else plain_trg

        # "'" and '"' are escaped
        trg = trg.replace("\\'", "'").replace('\\"', '"')

        # the parser of unaccent only accepts non-whitespace characters
        # for "src" and "trg" (see unaccent.c)
        if not src.isspace() and not trg.isspace():
            characters.add((ord(src), trg))

    return characters
115+
116+
def special_cases():
    """Return the special cases which are not handled by other methods.

    Returns:
        A newly built set of tuples (src, trg), where "src" is the code
        point (int) of the original character and "trg" is the substitute
        string.
    """
    return {
        # Cyrillic
        (0x0401, u"\u0415"),  # CYRILLIC CAPITAL LETTER IO
        (0x0451, u"\u0435"),  # CYRILLIC SMALL LETTER IO

        # Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)
        (0x2103, u"\xb0C"),  # DEGREE CELSIUS
        (0x2109, u"\xb0F"),  # DEGREE FAHRENHEIT
        (0x2117, "(P)"),  # SOUND RECORDING COPYRIGHT
    }
130+
131+
defmain(args):
67132
# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
68133
decomposition_type_pattern=re.compile(" *<[^>]*> *")
69134

70135
table= {}
71136
all= []
72137

138+
# unordered set to ensure uniqueness
139+
charactersSet=set()
140+
141+
# read file UnicodeData.txt
142+
unicodeDataFile=open(args.unicodeDataFilePath,'r')
143+
73144
# read everything we need into memory
74-
forlineinsys.stdin.readlines():
145+
forlineinunicodeDataFile:
75146
fields=line.split(";")
76147
iflen(fields)>5:
77148
# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
@@ -89,35 +160,34 @@ def main(expand_ligatures):
89160
ifcodepoint.general_category.startswith('L')and \
90161
len(codepoint.combining_ids)>1:
91162
ifis_letter_with_marks(codepoint,table):
92-
print_record(codepoint.id,
93-
chr(get_plain_letter(codepoint,table).id))
94-
elifexpand_ligaturesandis_ligature(codepoint,table):
95-
print_record(codepoint.id,
163+
charactersSet.add((codepoint.id,
164+
chr(get_plain_letter(codepoint,table).id)))
165+
elifargs.noLigaturesExpansionisFalseandis_ligature(codepoint,table):
166+
charactersSet.add((codepoint.id,
96167
"".join(unichr(combining_codepoint.id)
97168
forcombining_codepoint \
98-
inget_plain_letters(codepoint,table)))
99-
100-
# some special cases
101-
print_record(0x00d8,"O")# LATIN CAPITAL LETTER O WITH STROKE
102-
print_record(0x00f8,"o")# LATIN SMALL LETTER O WITH STROKE
103-
print_record(0x0110,"D")# LATIN CAPITAL LETTER D WITH STROKE
104-
print_record(0x0111,"d")# LATIN SMALL LETTER D WITH STROKE
105-
print_record(0x0131,"i")# LATIN SMALL LETTER DOTLESS I
106-
print_record(0x0126,"H")# LATIN CAPITAL LETTER H WITH STROKE
107-
print_record(0x0127,"h")# LATIN SMALL LETTER H WITH STROKE
108-
print_record(0x0141,"L")# LATIN CAPITAL LETTER L WITH STROKE
109-
print_record(0x0142,"l")# LATIN SMALL LETTER L WITH STROKE
110-
print_record(0x0149,"'n")# LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
111-
print_record(0x0166,"T")# LATIN CAPITAL LETTER T WITH STROKE
112-
print_record(0x0167,"t")# LATIN SMALL LETTER t WITH STROKE
113-
print_record(0x0401,u"\u0415")# CYRILLIC CAPITAL LETTER IO
114-
print_record(0x0451,u"\u0435")# CYRILLIC SMALL LETTER IO
115-
ifexpand_ligatures:
116-
print_record(0x00c6,"AE")# LATIN CAPITAL LETTER AE
117-
print_record(0x00df,"ss")# LATIN SMALL LETTER SHARP S
118-
print_record(0x00e6,"ae")# LATIN SMALL LETTER AE
119-
print_record(0x0152,"OE")# LATIN CAPITAL LIGATURE OE
120-
print_record(0x0153,"oe")# LATIN SMALL LIGATURE OE
169+
inget_plain_letters(codepoint,table))))
170+
171+
# add CLDR Latin-ASCII characters
172+
ifnotargs.noLigaturesExpansion:
173+
charactersSet|=parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)
174+
charactersSet|=special_cases()
175+
176+
# sort for more convenient display
177+
charactersList=sorted(charactersSet,key=lambdacharacterPair:characterPair[0])
178+
179+
forcharacterPairincharactersList:
180+
print_record(characterPair[0],characterPair[1])
121181

122182
if __name__ == "__main__":
    # Command line interface: the UnicodeData.txt path is always required;
    # the Latin-ASCII.xml path is only required when ligature expansion
    # (and with it the CLDR transliterator) is left enabled.
    parser = argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')
    parser.add_argument("--unicode-data-file", help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.", type=str, required=True, dest='unicodeDataFilePath')
    parser.add_argument("--latin-ascii-file", help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.", type=str, dest='latinAsciiFilePath')
    parser.add_argument("--no-ligatures-expansion", help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and \"--latin-ascii-file\" argument is required. If this option is enabled, \"--latin-ascii-file\" argument is optional and ignored.", action="store_true", dest='noLigaturesExpansion')
    args = parser.parse_args()

    # The flag comes from action="store_true", so it is always a bool and
    # plain negation is equivalent to comparing against False.
    if not args.noLigaturesExpansion and args.latinAsciiFilePath is None:
        # Terminate the message with a newline so that the shell prompt
        # does not end up on the same line as the error.
        sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with \"--latin-ascii-file\" option or use \"--no-ligatures-expansion\" option. Use \"-h\" option for help.\n')
        sys.exit(1)

    main(args)

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp