Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 1bbd52c

Browse files
committed
Make unaccent handle all diacritics known to Unicode, and expand ligatures correctly
Add Python script for building unaccent.rules from Unicode data. Don't backpatch because unaccent changes may require tsvector/index rebuild. Thomas Munro <thomas.munro@enterprisedb.com>
1 parent 4aec498, commit 1bbd52c

File tree

2 files changed

+415
-66
lines changed

2 files changed

+415
-66
lines changed
Lines changed: 123 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,123 @@
1+
#!/usr/bin/python
2+
#
3+
# This script builds unaccent.rules on standard output when given the
4+
# contents of UnicodeData.txt[1] on standard input. Optionally includes
5+
# ligature expansion, if --expand-ligatures is given on the command line.
6+
#
7+
# The approach is to use the Unicode decomposition data to identify
8+
# precomposed codepoints that are equivalent to a ligature of several
9+
# letters, or a base letter with any number of diacritical marks.
10+
# There is also a small set of special cases for codepoints that we
11+
# traditionally support even though Unicode doesn't consider them to
12+
# be ligatures or letters with marks.
13+
#
14+
# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt
15+
16+
import re
import sys
18+
19+
def print_record(codepoint, letter):
    """Write one unaccent rule to standard output, encoded as UTF-8.

    The rule consists of the character whose code point is *codepoint*,
    a tab, the replacement text *letter*, and a trailing newline.
    """
    try:
        to_char = unichr  # Python 2: chr() only covers 0..255
    except NameError:
        to_char = chr  # Python 3: chr() already covers all of Unicode
    record = to_char(codepoint) + "\t" + letter + "\n"
    # Emit bytes so the output is UTF-8 regardless of the locale.  Python 3
    # exposes the underlying byte stream as sys.stdout.buffer; Python 2's
    # sys.stdout accepts byte strings directly.
    stream = getattr(sys.stdout, "buffer", sys.stdout)
    stream.write(record.encode("UTF-8"))
21+
22+
class Codepoint:
    """The parts of one UnicodeData.txt record that this script uses."""

    def __init__(self, id, general_category, combining_ids):
        # Integer value of the code point.
        self.id = id
        # General category string, e.g. "Lu" or "Mn".
        self.general_category = general_category
        # Integer code points listed in the decomposition mapping
        # (empty when the record has no decomposition).
        self.combining_ids = combining_ids
27+
28+
def is_plain_letter(codepoint):
    """Return true if codepoint represents a plain ASCII letter.

    Only the ``id`` attribute of *codepoint* is examined; it must fall in
    the range a-z or A-Z.
    """
    value = codepoint.id
    return ord('a') <= value <= ord('z') or ord('A') <= value <= ord('Z')
32+
33+
def is_mark(codepoint):
    """Return true for diacritical marks (combining codepoints).

    A codepoint counts as a mark when its general category is one of
    "Mn", "Me" or "Mc".
    """
    return codepoint.general_category in ("Mn", "Me", "Mc")
36+
37+
def is_letter_with_marks(codepoint, table):
    """Return true for plain letters combined with one or more marks.

    *table* maps integer code points to Codepoint objects.  Only the
    direct decomposition of *codepoint* is examined: its first element
    must itself be a plain ASCII letter and every following element must
    be a mark.  A codepoint whose base is another accented letter is
    therefore not recognized here.
    """
    # See http://www.unicode.org/reports/tr44/tr44-14.html#General_Category_Values
    parts = codepoint.combining_ids
    if len(parts) <= 1:
        # No decomposition, or a single-codepoint mapping: no marks.
        return False
    if not is_plain_letter(table[parts[0]]):
        return False
    return all(is_mark(table[i]) for i in parts[1:])
43+
44+
def is_letter(codepoint, table):
    """Return true for a letter with or without diacritical marks.

    That is, either a plain ASCII letter or a plain letter whose
    decomposition adds one or more marks; *table* maps integer code
    points to Codepoint objects and is used to resolve the decomposition.
    """
    return is_plain_letter(codepoint) or is_letter_with_marks(codepoint, table)
47+
48+
def get_plain_letter(codepoint, table):
    """Return the base codepoint without marks.

    For a letter with marks this is the Codepoint of the first element of
    its decomposition, looked up in *table*; a plain letter is returned
    unchanged.

    Raises ValueError if *codepoint* is neither a plain letter nor a
    letter with marks.
    """
    if is_letter_with_marks(codepoint, table):
        return table[codepoint.combining_ids[0]]
    if is_plain_letter(codepoint):
        return codepoint
    # The original raised a bare string here, which is not a valid
    # exception object; raise a real exception that names the offender.
    raise ValueError(
        "codepoint U+%04X is not a letter with or without marks"
        % codepoint.id)
56+
57+
def is_ligature(codepoint, table):
    """Return true for letters combined with letters.

    Every element of the codepoint's decomposition must be a letter, with
    or without marks.  Note that a codepoint with an empty decomposition
    also yields true (there is nothing to fail the check), so callers
    should test the decomposition length themselves.
    """
    return all(is_letter(table[i], table) for i in codepoint.combining_ids)
60+
61+
def get_plain_letters(codepoint, table):
    """Return a list of plain letters from a ligature.

    Each element of the ligature's decomposition is reduced to its base
    letter, so the result is a list of Codepoint objects for plain
    letters, in decomposition order.  *codepoint* must satisfy
    is_ligature().
    """
    assert is_ligature(codepoint, table)
    # Avoid naming the loop variable "id": it would shadow the builtin.
    return [get_plain_letter(table[part_id], table)
            for part_id in codepoint.combining_ids]
65+
66+
def main(expand_ligatures):
    """Read UnicodeData.txt on stdin and print unaccent rules on stdout.

    Every codepoint whose general category starts with 'L' and whose
    decomposition has more than one element is mapped to its base letter
    when it is a plain letter with marks.  When *expand_ligatures* is true,
    ligatures of letters are additionally mapped to their component
    letters.  A fixed list of special cases is printed afterwards.
    """
    # http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings
    decomposition_type_pattern = re.compile(" *<[^>]*> *")

    table = {}
    codepoints = []

    # read everything we need into memory
    for line in sys.stdin:
        fields = line.split(";")
        if len(fields) <= 5:
            # Too few fields to contain a decomposition mapping.
            continue
        # http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt
        general_category = fields[2]
        # Drop any "<type>" tag so only the hex code points remain.
        decomposition = decomposition_type_pattern.sub(' ', fields[5])
        codepoint_id = int(fields[0], 16)
        combining_ids = [int(s, 16)
                         for s in decomposition.split(" ") if s != ""]
        codepoint = Codepoint(codepoint_id, general_category, combining_ids)
        table[codepoint_id] = codepoint
        codepoints.append(codepoint)

    # walk through all the codepoints looking for interesting mappings
    for codepoint in codepoints:
        if not codepoint.general_category.startswith('L'):
            continue
        if len(codepoint.combining_ids) <= 1:
            continue
        if is_letter_with_marks(codepoint, table):
            print_record(codepoint.id,
                         chr(get_plain_letter(codepoint, table).id))
        elif expand_ligatures and is_ligature(codepoint, table):
            # The components are plain ASCII letters, so chr() is enough
            # here, just as in the branch above.
            print_record(codepoint.id,
                         "".join(chr(combining_codepoint.id)
                                 for combining_codepoint
                                 in get_plain_letters(codepoint, table)))

    # some special cases
    print_record(0x00d8, "O")  # LATIN CAPITAL LETTER O WITH STROKE
    print_record(0x00f8, "o")  # LATIN SMALL LETTER O WITH STROKE
    print_record(0x0110, "D")  # LATIN CAPITAL LETTER D WITH STROKE
    print_record(0x0111, "d")  # LATIN SMALL LETTER D WITH STROKE
    print_record(0x0131, "i")  # LATIN SMALL LETTER DOTLESS I
    print_record(0x0126, "H")  # LATIN CAPITAL LETTER H WITH STROKE
    print_record(0x0127, "h")  # LATIN SMALL LETTER H WITH STROKE
    print_record(0x0141, "L")  # LATIN CAPITAL LETTER L WITH STROKE
    print_record(0x0142, "l")  # LATIN SMALL LETTER L WITH STROKE
    print_record(0x0149, "'n")  # LATIN SMALL LETTER N PRECEDED BY APOSTROPHE
    print_record(0x0166, "T")  # LATIN CAPITAL LETTER T WITH STROKE
    print_record(0x0167, "t")  # LATIN SMALL LETTER T WITH STROKE
    print_record(0x0401, u"\u0415")  # CYRILLIC CAPITAL LETTER IO
    print_record(0x0451, u"\u0435")  # CYRILLIC SMALL LETTER IO
    if expand_ligatures:
        print_record(0x00c6, "AE")  # LATIN CAPITAL LETTER AE
        print_record(0x00df, "ss")  # LATIN SMALL LETTER SHARP S
        print_record(0x00e6, "ae")  # LATIN SMALL LETTER AE
        print_record(0x0152, "OE")  # LATIN CAPITAL LIGATURE OE
        print_record(0x0153, "oe")  # LATIN SMALL LIGATURE OE
121+
122+
if __name__ == "__main__":
    # Ligature expansion is enabled only when the sole command-line
    # argument is exactly --expand-ligatures; any other command line runs
    # without it.
    main(len(sys.argv) == 2 and sys.argv[1] == "--expand-ligatures")

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp