|
| 1 | +#!/usr/bin/env python3 |
| 2 | + |
| 3 | +""" |
| 4 | +Unicode character finder utility: |
| 5 | +find characters based on words in their official names. |
| 6 | +
|
| 7 | +This can be used from the command line, just pass words as arguments. |
| 8 | +
|
| 9 | +Here is the ``main`` function which makes it happen:: |
| 10 | +
|
| 11 | + >>> main('rook') # doctest: +NORMALIZE_WHITESPACE |
| 12 | + U+2656 ♖ WHITE CHESS ROOK |
| 13 | + U+265C ♜ BLACK CHESS ROOK |
| 14 | + (2 matches for 'rook') |
| 15 | + >>> main('rook', 'black') # doctest: +NORMALIZE_WHITESPACE |
| 16 | + U+265C ♜ BLACK CHESS ROOK |
| 17 | + (1 match for 'rook black') |
| 18 | + >>> main('white bishop') # doctest: +NORMALIZE_WHITESPACE |
| 19 | + U+2657 ♗ WHITE CHESS BISHOP |
| 20 | + (1 match for 'white bishop') |
| 21 | + >>> main("jabberwocky's vest") |
| 22 | + (No match for "jabberwocky's vest") |
| 23 | +
|
| 24 | +
|
| 25 | +For exploring words that occur in the character names, there is the |
| 26 | +``word_report`` function:: |
| 27 | +
|
| 28 | + >>> index = UnicodeNameIndex(sample_chars) |
| 29 | + >>> index.word_report() |
| 30 | + 3 SIGN |
| 31 | + 2 A |
| 32 | + 2 EURO |
| 33 | + 2 LATIN |
| 34 | + 2 LETTER |
| 35 | + 1 CAPITAL |
| 36 | + 1 CURRENCY |
| 37 | + 1 DOLLAR |
| 38 | + 1 SMALL |
| 39 | + >>> index = UnicodeNameIndex() |
| 40 | + >>> index.word_report(10) |
| 41 | + 75821 CJK |
| 42 | + 75761 IDEOGRAPH |
| 43 | + 74656 UNIFIED |
| 44 | + 13196 SYLLABLE |
| 45 | + 11735 HANGUL |
| 46 | + 7616 LETTER |
| 47 | + 2232 WITH |
| 48 | + 2180 SIGN |
| 49 | + 2122 SMALL |
| 50 | + 1709 CAPITAL |
| 51 | +
|
| 52 | +Note: characters with names starting with 'CJK UNIFIED IDEOGRAPH' |
| 53 | +are indexed with those three words only, excluding the hexadecimal |
| 54 | +codepoint at the end of the name. |
| 55 | +
|
| 56 | +""" |
| 57 | + |
| 58 | +importsys |
| 59 | +importre |
| 60 | +importunicodedata |
| 61 | +importpickle |
| 62 | +importwarnings |
| 63 | +importitertools |
| 64 | +importfunctools |
| 65 | +fromcollectionsimportnamedtuple |
| 66 | + |
# Regex patterns are raw strings so that ``\w`` and ``\+`` reach the regex
# engine untouched instead of being parsed as (invalid) string escapes.
RE_WORD = re.compile(r'\w+')
RE_UNICODE_NAME = re.compile(r'^[A-Z0-9 -]+$')
RE_CODEPOINT = re.compile(r'U\+([0-9A-F]{4,6})')

# File used to cache the pickled index between runs.
INDEX_NAME = 'charfinder_index.pickle'
# An index is only written to disk when it holds more words than this.
MINIMUM_SAVE_LEN = 10000
# Names starting with these prefixes are indexed by the prefix words only.
CJK_UNI_PREFIX = 'CJK UNIFIED IDEOGRAPH'
CJK_CMP_PREFIX = 'CJK COMPATIBILITY IDEOGRAPH'

# Small character set used by the module doctests.
sample_chars = [
    '$',  # DOLLAR SIGN
    'A',  # LATIN CAPITAL LETTER A
    'a',  # LATIN SMALL LETTER A
    '\u20a0',  # EURO-CURRENCY SIGN
    '\u20ac',  # EURO SIGN
]
| 83 | + |
# Record describing one character: its 'U+XXXX' code, the character itself
# and its official Unicode name.
CharDescription = namedtuple('CharDescription', ['code_str', 'char', 'name'])

# Result of an index query: total number of matches plus the matching items.
QueryResult = namedtuple('QueryResult', ['count', 'items'])
| 87 | + |
| 88 | + |
def tokenize(text):
    """Lazily yield every ``RE_WORD`` match found in *text*, uppercased."""
    yield from (found.group().upper() for found in RE_WORD.finditer(text))
| 93 | + |
| 94 | + |
def query_type(text):
    """Classify a query string as 'CODEPOINT', 'NAME' or 'CHARACTERS'.

    The comparison is case-insensitive: *text* is uppercased first. A query
    containing ``U+`` is a code point query; otherwise one that matches
    ``RE_UNICODE_NAME`` is a name query; anything else is treated as
    literal characters.
    """
    normalized = text.upper()
    if normalized.find('U+') != -1:
        return 'CODEPOINT'
    if RE_UNICODE_NAME.match(normalized) is not None:
        return 'NAME'
    return 'CHARACTERS'
| 103 | + |
| 104 | + |
class UnicodeNameIndex:
    """Inverted index from words in Unicode character names to characters.

    ``self.index`` is a dict mapping each uppercased word to the set of
    characters whose name contains that word.
    """

    def __init__(self, chars=None):
        """Create the index; ``chars`` is passed on to ``load``."""
        self.load(chars)

    def load(self, chars=None):
        """Populate ``self.index``.

        When ``chars`` is None, a previously pickled index is read from
        ``INDEX_NAME`` if possible. Otherwise (or when no usable file is
        found) the index is built with ``build_index(chars)``.

        Only a freshly built index holding more than ``MINIMUM_SAVE_LEN``
        words is written back to ``INDEX_NAME``; an index that was just read
        from that file is not rewritten. A failure to save is reported as a
        warning, not an error.
        """
        self.index = None
        if chars is None:
            try:
                with open(INDEX_NAME, 'rb') as fp:
                    # NOTE(security): unpickling can execute arbitrary code;
                    # INDEX_NAME must only ever be a file this program wrote.
                    self.index = pickle.load(fp)
            except OSError:
                pass  # no readable cache file: build the index below
            except (pickle.UnpicklingError, EOFError) as exc:
                # truncated or corrupt cache: report it and rebuild
                warnings.warn('Could not load {!r}: {}'
                              .format(INDEX_NAME, exc))
        if self.index is not None:
            return  # loaded from disk, nothing new to save
        self.build_index(chars)
        if len(self.index) > MINIMUM_SAVE_LEN:
            try:
                self.save()
            except OSError as exc:
                warnings.warn('Could not save {!r}: {}'
                              .format(INDEX_NAME, exc))

    def save(self):
        """Pickle ``self.index`` into ``INDEX_NAME`` (may raise OSError)."""
        with open(INDEX_NAME, 'wb') as fp:
            pickle.dump(self.index, fp)

    def build_index(self, chars=None):
        """Build ``self.index`` from ``chars``.

        ``chars`` is an iterable of single characters; when None, every code
        point from 32 up to and including ``sys.maxunicode`` is scanned.
        Characters without a Unicode name are skipped.
        """
        if chars is None:
            # + 1 because range() excludes its upper bound
            chars = (chr(i) for i in range(32, sys.maxunicode + 1))
        index = {}
        for char in chars:
            try:
                name = unicodedata.name(char)
            except ValueError:  # character has no name
                continue
            # CJK ideographs are indexed by the prefix words only, leaving
            # out the hexadecimal code point at the end of the name.
            if name.startswith(CJK_UNI_PREFIX):
                name = CJK_UNI_PREFIX
            elif name.startswith(CJK_CMP_PREFIX):
                name = CJK_CMP_PREFIX

            for word in tokenize(name):
                index.setdefault(word, set()).add(char)

        self.index = index

    def word_rank(self, top=None):
        """Return ``(count, word)`` pairs, most frequent words first.

        Ties are ordered alphabetically by word. When ``top`` is not None
        the list is cut with the slice ``[:top]``.
        """
        ranked = sorted(
            ((len(chars), word) for word, chars in self.index.items()),
            key=lambda item: (-item[0], item[1]),
        )
        return ranked[:top]  # a None bound keeps the whole list

    def word_report(self, top=None):
        """Print the ``word_rank(top)`` listing, one word per line."""
        for postings, key in self.word_rank(top):
            print('{:5} {}'.format(postings, key))

    def find_chars(self, query, start=0, stop=None):
        """Find the characters whose names contain every word in ``query``.

        Returns a ``QueryResult`` whose ``count`` is the total number of
        matches and whose ``items`` iterates over the matches, in sorted
        order, limited to positions ``start`` to ``stop`` (``stop=None``
        means no upper limit). With an empty query or an unknown word the
        result is ``QueryResult(0, ())``.
        """
        result_sets = []
        for word in tokenize(query):
            chars = self.index.get(word)
            if chars is None:  # shortcut: no such word
                return QueryResult(0, ())
            result_sets.append(chars)

        if not result_sets:
            return QueryResult(0, ())

        # must sort to support start, stop
        result = sorted(set.intersection(*result_sets))
        return QueryResult(len(result),
                           itertools.islice(result, start, stop))

    def describe(self, char):
        """Return the ``CharDescription`` (code, char, name) of ``char``."""
        code_str = 'U+{:04X}'.format(ord(char))
        name = unicodedata.name(char)
        return CharDescription(code_str, char, name)

    def find_descriptions(self, query, start=0, stop=None):
        """Yield a ``CharDescription`` for each match of ``find_chars``."""
        yield from self.get_descriptions(
            self.find_chars(query, start, stop).items)

    def get_descriptions(self, chars):
        """Yield a ``CharDescription`` for each character in ``chars``."""
        for char in chars:
            yield self.describe(char)

    def describe_str(self, char):
        """Return the description of ``char`` as one tab-separated line."""
        return '{:7}\t{}\t{}'.format(*self.describe(char))

    def find_description_strs(self, query, start=0, stop=None):
        """Yield a ``describe_str`` line for each match of ``find_chars``."""
        for char in self.find_chars(query, start, stop).items:
            yield self.describe_str(char)

    @staticmethod  # not an instance method due to concurrency
    def status(query, counter):
        """Return a summary such as ``"2 matches for 'rook'"``."""
        if counter == 0:
            msg = 'No match'
        elif counter == 1:
            msg = '1 match'
        else:
            msg = '{} matches'.format(counter)
        return '{} for {!r}'.format(msg, query)
| 209 | + |
| 210 | + |
def main(*args):
    """Print every character matching the words in ``args``, then a summary.

    The arguments are joined with spaces into a single query; each match is
    printed on its own line, followed by a parenthesized status line.
    """
    finder = UnicodeNameIndex()
    query = ' '.join(args)
    shown = 0
    for line in finder.find_description_strs(query):
        print(line)
        shown += 1
    summary = finder.status(query, shown)
    print('({})'.format(summary))
| 218 | + |
# Command-line entry point: the arguments are the words to search for.
if __name__ == '__main__':
    words = sys.argv[1:]
    if not words:
        print('Usage: {} word1 [word2]...'.format(sys.argv[0]))
    else:
        main(*words)