NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit9a206d0

committed

Improve script generating unaccent rules

Script now uses the standard Unicode transliterator Latin-ASCII. Author: Leonard Benedetti

1 parent3aff33a commit9a206d0Copy full SHA for 9a206d0

File tree

2 files changed

+762

-56

lines changed

contrib/unaccent
- generate_unaccent_rules.py
- unaccent.rules

2 files changed

+762

-56

lines changed

`‎contrib/unaccent/generate_unaccent_rules.py`

Lines changed: 107 additions & 37 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,20 +1,33 @@`
`1`		`-#!/usr/bin/python`
	`1`	`+#!/usr/bin/python2`
	`2`	`+# -- coding: utf-8 --`
`2`	`3`	`#`
`3`	`4`	`# This script builds unaccent.rules on standard output when given the`
`4`		`-# contents of UnicodeData.txt[1] on standard input. Optionally includes`
`5`		`-# ligature expansion, if --expand-ligatures is given on the command line.`
	`5`	`+# contents of UnicodeData.txt [1] and Latin-ASCII.xml [2] given as`
	`6`	`+# arguments. Ligature expansion and the Unicode CLDR Latin-ASCII`
	`7`	`+# transliterator are enabled by default; both can be disabled with`
	`8`	`+# the "--no-ligatures-expansion" command line option.`
`6`	`9`	`#`
`7`	`10`	`# The approach is to use the Unicode decomposition data to identify`
`8`	`11`	`# precomposed codepoints that are equivalent to a ligature of several`
`9`	`12`	`# letters, or a base letter with any number of diacritical marks.`
`10`		`-# There is also a small set of special cases for codepoints that we`
`11`		`-# traditionally support even though Unicode doesn't consider them to`
`12`		`-# be ligatures or letters with marks.`
`13`	`13`	`#`
`14`		`-# [1] http://unicode.org/Public/7.0.0/ucd/UnicodeData.txt`
	`14`	`+# This approach handles most letters with diacritical marks and some`
	`15`	`+# ligatures. However, several characters (notably a majority of`
	`16`	`+# ligatures) don't have decomposition. To handle all these cases, one can`
	`17`	`+# use a standard Unicode transliterator available in Common Locale Data`
	`18`	`+# Repository (CLDR): Latin-ASCII. This transliterator associates Unicode`
	`19`	`+# characters to ASCII-range equivalent. Unless "--no-ligatures-expansion"`
	`20`	`+# option is enabled, the XML file of this transliterator [2] -- given as a`
	`21`	`+# command line argument -- will be parsed and used.`
	`22`	`+#`
	`23`	`+# [1] http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt`
	`24`	`+# [2] http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml`
	`25`	`+`
`15`	`26`
`16`	`27`	`importre`
	`28`	`+importargparse`
`17`	`29`	`importsys`
	`30`	`+importxml.etree.ElementTreeasET`
`18`	`31`
`19`	`32`	`defprint_record(codepoint,letter):`
`20`	`33`	`print (unichr(codepoint)+"\t"+letter).encode("UTF-8")`
`@@ -63,15 +76,73 @@ def get_plain_letters(codepoint, table):`
`63`	`76`	`assert(is_ligature(codepoint,table))`
`64`	`77`	`return [get_plain_letter(table[id],table)foridincodepoint.combining_ids]`
`65`	`78`
`66`		`-defmain(expand_ligatures):`
	`79`	`+defparse_cldr_latin_ascii_transliterator(latinAsciiFilePath):`
	`80`	`+"""Parse the XML file and return a set of tuples (src, trg), where "src"`
	`81`	`+ is the original character and "trg" the substitute."""`
	`82`	`+charactersSet=set()`
	`83`	`+`
	`84`	`+# RegEx to parse rules`
	`85`	`+rulePattern=re.compile(ur'^(?:(.)\|(\\u[0-9a-fA-F]{4})) \u2192 (?:\'(.+)\'\|(.+)) ;')`
	`86`	`+`
	`87`	`+# construct tree from XML`
	`88`	`+transliterationTree=ET.parse(latinAsciiFilePath)`
	`89`	`+transliterationTreeRoot=transliterationTree.getroot()`
	`90`	`+`
	`91`	`+forruleintransliterationTreeRoot.findall("./transforms/transform/tRule"):`
	`92`	`+matches=rulePattern.search(rule.text)`
	`93`	`+`
	`94`	`+# The regular expression captures four groups corresponding`
	`95`	`+# to the characters.`
	`96`	`+#`
	`97`	`+# Group 1: plain "src" char. Empty if group 2 is not.`
	`98`	`+# Group 2: unicode-escaped "src" char (e.g. "\u0110"). Empty if group 1 is not.`
	`99`	`+#`
	`100`	`+# Group 3: "trg" char between quotes. Empty if group 4 is not.`
	`101`	`+# Group 4: plain "trg" char. Empty if group 3 is not.`
	`102`	`+ifmatchesisnotNone:`
	`103`	`+src=matches.group(1)ifmatches.group(1)isnotNoneelsematches.group(2).decode('unicode-escape')`
	`104`	`+trg=matches.group(3)ifmatches.group(3)isnotNoneelsematches.group(4)`
	`105`	`+`
	`106`	`+# "'" and """ are escaped`
	`107`	`+trg=trg.replace("\\'","'").replace('\\"','"')`
	`108`	`+`
	`109`	`+# the parser of unaccent only accepts non-whitespace characters`
	`110`	`+# for "src" and "trg" (see unaccent.c)`
	`111`	`+ifnotsrc.isspace()andnottrg.isspace():`
	`112`	`+charactersSet.add((ord(src),trg))`
	`113`	`+`
	`114`	`+returncharactersSet`
	`115`	`+`
	`116`	`+defspecial_cases():`
	`117`	`+"""Returns the special cases which are not handled by other methods"""`
	`118`	`+charactersSet=set()`
	`119`	`+`
	`120`	`+# Cyrillic`
	`121`	`+charactersSet.add((0x0401,u"\u0415"))# CYRILLIC CAPITAL LETTER IO`
	`122`	`+charactersSet.add((0x0451,u"\u0435"))# CYRILLIC SMALL LETTER IO`
	`123`	`+`
	`124`	`+# Symbols of "Letterlike Symbols" Unicode Block (U+2100 to U+214F)`
	`125`	`+charactersSet.add((0x2103,u"\xb0C"))# DEGREE CELSIUS`
	`126`	`+charactersSet.add((0x2109,u"\xb0F"))# DEGREE FAHRENHEIT`
	`127`	`+charactersSet.add((0x2117,"(P)"))# SOUND RECORDING COPYRIGHT`
	`128`	`+`
	`129`	`+returncharactersSet`
	`130`	`+`
	`131`	`+defmain(args):`
`67`	`132`	`# http://www.unicode.org/reports/tr44/tr44-14.html#Character_Decomposition_Mappings`
`68`	`133`	`decomposition_type_pattern=re.compile(" <[^>]> *")`
`69`	`134`
`70`	`135`	`table= {}`
`71`	`136`	`all= []`
`72`	`137`
	`138`	`+# unordered set to ensure uniqueness`
	`139`	`+charactersSet=set()`
	`140`	`+`
	`141`	`+# read file UnicodeData.txt`
	`142`	`+unicodeDataFile=open(args.unicodeDataFilePath,'r')`
	`143`	`+`
`73`	`144`	`# read everything we need into memory`
`74`		`-forlineinsys.stdin.readlines():`
	`145`	`+forlineinunicodeDataFile:`
`75`	`146`	`fields=line.split(";")`
`76`	`147`	`iflen(fields)>5:`
`77`	`148`	`# http://www.unicode.org/reports/tr44/tr44-14.html#UnicodeData.txt`
`@@ -89,35 +160,34 @@ def main(expand_ligatures):`
`89`	`160`	`ifcodepoint.general_category.startswith('L')and \`
`90`	`161`	`len(codepoint.combining_ids)>1:`
`91`	`162`	`ifis_letter_with_marks(codepoint,table):`
`92`		`-print_record(codepoint.id,`
`93`		`-chr(get_plain_letter(codepoint,table).id))`
`94`		`-elifexpand_ligaturesandis_ligature(codepoint,table):`
`95`		`-print_record(codepoint.id,`
	`163`	`+charactersSet.add((codepoint.id,`
	`164`	`+chr(get_plain_letter(codepoint,table).id)))`
	`165`	`+elifargs.noLigaturesExpansionisFalseandis_ligature(codepoint,table):`
	`166`	`+charactersSet.add((codepoint.id,`
`96`	`167`	`"".join(unichr(combining_codepoint.id)`
`97`	`168`	`forcombining_codepoint \`
`98`		`-inget_plain_letters(codepoint,table)))`
`99`		`-`
`100`		`-# some special cases`
`101`		`-print_record(0x00d8,"O")# LATIN CAPITAL LETTER O WITH STROKE`
`102`		`-print_record(0x00f8,"o")# LATIN SMALL LETTER O WITH STROKE`
`103`		`-print_record(0x0110,"D")# LATIN CAPITAL LETTER D WITH STROKE`
`104`		`-print_record(0x0111,"d")# LATIN SMALL LETTER D WITH STROKE`
`105`		`-print_record(0x0131,"i")# LATIN SMALL LETTER DOTLESS I`
`106`		`-print_record(0x0126,"H")# LATIN CAPITAL LETTER H WITH STROKE`
`107`		`-print_record(0x0127,"h")# LATIN SMALL LETTER H WITH STROKE`
`108`		`-print_record(0x0141,"L")# LATIN CAPITAL LETTER L WITH STROKE`
`109`		`-print_record(0x0142,"l")# LATIN SMALL LETTER L WITH STROKE`
`110`		`-print_record(0x0149,"'n")# LATIN SMALL LETTER N PRECEDED BY APOSTROPHE`
`111`		`-print_record(0x0166,"T")# LATIN CAPITAL LETTER T WITH STROKE`
`112`		`-print_record(0x0167,"t")# LATIN SMALL LETTER t WITH STROKE`
`113`		`-print_record(0x0401,u"\u0415")# CYRILLIC CAPITAL LETTER IO`
`114`		`-print_record(0x0451,u"\u0435")# CYRILLIC SMALL LETTER IO`
`115`		`-ifexpand_ligatures:`
`116`		`-print_record(0x00c6,"AE")# LATIN CAPITAL LETTER AE`
`117`		`-print_record(0x00df,"ss")# LATIN SMALL LETTER SHARP S`
`118`		`-print_record(0x00e6,"ae")# LATIN SMALL LETTER AE`
`119`		`-print_record(0x0152,"OE")# LATIN CAPITAL LIGATURE OE`
`120`		`-print_record(0x0153,"oe")# LATIN SMALL LIGATURE OE`
	`169`	`+inget_plain_letters(codepoint,table))))`
	`170`	`+`
	`171`	`+# add CLDR Latin-ASCII characters`
	`172`	`+ifnotargs.noLigaturesExpansion:`
	`173`	`+charactersSet\|=parse_cldr_latin_ascii_transliterator(args.latinAsciiFilePath)`
	`174`	`+charactersSet\|=special_cases()`
	`175`	`+`
	`176`	`+# sort for more convenient display`
	`177`	`+charactersList=sorted(charactersSet,key=lambdacharacterPair:characterPair[0])`
	`178`	`+`
	`179`	`+forcharacterPairincharactersList:`
	`180`	`+print_record(characterPair[0],characterPair[1])`
`121`	`181`
`122`	`182`	`if__name__=="__main__":`
`123`		`-main(len(sys.argv)==2andsys.argv[1]=="--expand-ligatures")`
	`183`	`+parser=argparse.ArgumentParser(description='This script builds unaccent.rules on standard output when given the contents of UnicodeData.txt and Latin-ASCII.xml given as arguments.')`
	`184`	`+parser.add_argument("--unicode-data-file",help="Path to formatted text file corresponding to UnicodeData.txt. See <http://unicode.org/Public/8.0.0/ucd/UnicodeData.txt>.",type=str,required=True,dest='unicodeDataFilePath')`
	`185`	`+parser.add_argument("--latin-ascii-file",help="Path to XML file from Unicode Common Locale Data Repository (CLDR) corresponding to Latin-ASCII transliterator (Latin-ASCII.xml). See <http://unicode.org/cldr/trac/export/12304/tags/release-28/common/transforms/Latin-ASCII.xml>.",type=str,dest='latinAsciiFilePath')`
	`186`	`+parser.add_argument("--no-ligatures-expansion",help="Do not expand ligatures and do not use Unicode CLDR Latin-ASCII transliterator. By default, this option is not enabled and\"--latin-ascii-file\" argument is required. If this option is enabled,\"--latin-ascii-file\" argument is optional and ignored.",action="store_true",dest='noLigaturesExpansion')`
	`187`	`+args=parser.parse_args()`
	`188`	`+`
	`189`	`+ifargs.noLigaturesExpansionisFalseandargs.latinAsciiFilePathisNone:`
	`190`	`+sys.stderr.write('You must specify the path to Latin-ASCII transliterator file with\"--latin-ascii-file\" option or use\"--no-ligatures-expansion\" option. Use\"-h\" option for help.')`
	`191`	`+sys.exit(1)`
	`192`	`+`
	`193`	`+main(args)`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit9a206d0

File tree

2 files changed

2 files changed

`‎contrib/unaccent/generate_unaccent_rules.py`

0 commit comments