|
1 | 1 | #!/usr/bin/perl
|
2 | 2 | #
|
3 |
| -# Generate a composition table, using Unicode data files as input |
| 3 | +# Generate a composition table and its lookup utilities, using Unicode data |
| 4 | +# files as input. |
4 | 5 | #
|
5 | 6 | # Input: UnicodeData.txt and CompositionExclusions.txt
|
6 |
| -# Output: unicode_norm_table.h |
| 7 | +# Output: unicode_norm_table.h and unicode_norm_hashfunc.h |
7 | 8 | #
|
8 | 9 | # Copyright (c) 2000-2020, PostgreSQL Global Development Group
|
9 | 10 |
|
10 | 11 | use strict;
|
11 | 12 | use warnings;
|
12 | 13 |
|
13 |
| -my$output_file ="unicode_norm_table.h"; |
| 14 | +use FindBin; |
| 15 | +use lib"$FindBin::RealBin/../../tools/"; |
| 16 | +use PerfectHash; |
| 17 | + |
| 18 | +my$output_table_file ="unicode_norm_table.h"; |
| 19 | +my$output_func_file ="unicode_norm_hashfunc.h"; |
14 | 20 |
|
15 | 21 | my$FH;
|
16 | 22 |
|
|
64 | 70 |
|
65 | 71 | my$num_characters =scalar@characters;
|
66 | 72 |
|
67 |
| -# Start writing out the output file |
68 |
| -openmy$OUTPUT,'>',$output_file |
69 |
| -ordie"Could not open output file$output_file:$!\n"; |
| 73 | +# Start writing out the output files |
| 74 | +openmy$OT,'>',$output_table_file |
| 75 | +ordie"Could not open output file$output_table_file:$!\n"; |
| 76 | +openmy$OF,'>',$output_func_file |
| 77 | +ordie"Could not open output file$output_func_file:$!\n"; |
70 | 78 |
|
71 |
| -print$OUTPUT<<HEADER; |
| 79 | +print$OT<<HEADER; |
72 | 80 | /*-------------------------------------------------------------------------
|
73 | 81 | *
|
74 | 82 | * unicode_norm_table.h
|
|
111 | 119 | {
|
112 | 120 | HEADER
|
113 | 121 |
|
| 122 | +print$OF<<HEADER; |
| 123 | +/*------------------------------------------------------------------------- |
| 124 | + * |
| 125 | + * unicode_norm_hashfunc.h |
| 126 | + * Perfect hash functions used for Unicode normalization |
| 127 | + * |
| 128 | + * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group |
| 129 | + * Portions Copyright (c) 1994, Regents of the University of California |
| 130 | + * |
| 131 | + * src/include/common/unicode_norm_hashfunc.h |
| 132 | + * |
| 133 | + *------------------------------------------------------------------------- |
| 134 | + */ |
| 135 | +
|
| 136 | +/* |
| 137 | + * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl, |
| 138 | + * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H |
| 139 | + * here. |
| 140 | + */ |
| 141 | +
|
| 142 | +#include "common/unicode_norm_table.h" |
| 143 | +
|
| 144 | +/* Typedef for perfect hash functions */ |
| 145 | +typedef int (*cp_hash_func) (const void *key); |
| 146 | +
|
| 147 | +/* Information for lookups with perfect hash functions */ |
| 148 | +typedef struct |
| 149 | +{ |
| 150 | +const pg_unicode_decomposition *decomps; |
| 151 | +cp_hash_funchash; |
| 152 | +intnum_decomps; |
| 153 | +} pg_unicode_decompinfo; |
| 154 | +
|
| 155 | +typedef struct |
| 156 | +{ |
| 157 | +const uint16*inverse_lookup; |
| 158 | +cp_hash_funchash; |
| 159 | +intnum_recomps; |
| 160 | +} pg_unicode_recompinfo; |
| 161 | +
|
| 162 | +HEADER |
| 163 | + |
114 | 164 | my$decomp_index = 0;
|
115 | 165 | my$decomp_string ="";
|
| 166 | +my@dec_cp_packed; |
| 167 | +my$main_index = 0; |
| 168 | +my@rec_info; |
116 | 169 |
|
117 | 170 | my$last_code =$characters[-1]->{code};
|
118 | 171 | foreachmy$char (@characters)
|
|
121 | 174 | my$class =$char->{class};
|
122 | 175 | my$decomp =$char->{decomp};
|
123 | 176 |
|
| 177 | +# Save the code point bytes as a string in network order. |
| 178 | +push@dec_cp_packed,pack('N',hex($char->{code})); |
| 179 | + |
124 | 180 | # The character decomposition mapping field in UnicodeData.txt is a list
|
125 | 181 | # of unicode codepoints, separated by space. But it can be prefixed with
|
126 | 182 | # so-called compatibility formatting tag, like "<compat>", or "<font>".
|
|
163 | 219 | {
|
164 | 220 | foreachmy$lcode (@composition_exclusion_codes)
|
165 | 221 | {
|
166 |
| -if ($lcodeeq$char->{code}) |
| 222 | +if ($lcodeeq$code) |
167 | 223 | {
|
168 | 224 | $flags .=" | DECOMP_NO_COMPOSE";
|
169 | 225 | $comment ="in exclusion list";
|
170 | 226 | last;
|
171 | 227 | }
|
172 | 228 | }
|
173 | 229 | }
|
| 230 | + |
| 231 | +# Save info for recomposeable codepoints. |
| 232 | +# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C |
| 233 | +# above! See also the inverse lookup in recompose_code() found in |
| 234 | +# src/common/unicode_norm.c. |
| 235 | +if (!($flags =~/DECOMP_COMPAT/ ||$flags =~/DECOMP_NO_COMPOSE/)) |
| 236 | +{ |
| 237 | +push@rec_info, |
| 238 | + { |
| 239 | +code=>$code, |
| 240 | +main_index=>$main_index, |
| 241 | +first=>$first_decomp, |
| 242 | +second=>$decomp_elts[0] |
| 243 | + }; |
| 244 | +} |
174 | 245 | }
|
175 | 246 |
|
176 | 247 | if ($decomp_size == 0)
|
177 | 248 | {
|
178 |
| -print$OUTPUT"\t{0x$code,$class, 0$flags, 0}"; |
| 249 | +print$OT"\t{0x$code,$class, 0$flags, 0}"; |
179 | 250 | }
|
180 | 251 | elsif ($decomp_size == 1 &&length($first_decomp) <= 4)
|
181 | 252 | {
|
182 | 253 |
|
183 | 254 | # The decomposition consists of a single codepoint, and it fits
|
184 | 255 | # in a uint16, so we can store it "inline" in the main table.
|
185 | 256 | $flags .=" | DECOMP_INLINE";
|
186 |
| -print$OUTPUT"\t{0x$code,$class, 1$flags, 0x$first_decomp}"; |
| 257 | +print$OT"\t{0x$code,$class, 1$flags, 0x$first_decomp}"; |
187 | 258 | }
|
188 | 259 | else
|
189 | 260 | {
|
190 |
| -print$OUTPUT |
191 |
| -"\t{0x$code,$class,$decomp_size$flags,$decomp_index}"; |
| 261 | +print$OT"\t{0x$code,$class,$decomp_size$flags,$decomp_index}"; |
192 | 262 |
|
193 | 263 | # Now save the decompositions into a dedicated area that will
|
194 | 264 | # be written afterwards. First build the entry dedicated to
|
|
205 | 275 | }
|
206 | 276 |
|
207 | 277 | # Print a comma after all items except the last one.
|
208 |
| -print$OUTPUT","unless ($codeeq$last_code); |
209 |
| -if ($commentne"") |
210 |
| -{ |
| 278 | +print$OT","unless ($codeeq$last_code); |
211 | 279 |
|
212 |
| -# If the line is wide already, indent the comment with one tab, |
213 |
| -# otherwise with two. This is to make the output match the way |
214 |
| -# pgindent would mangle it. (This is quite hacky. To do this |
215 |
| -# properly, we should actually track how long the line is so far, |
216 |
| -# but this works for now.) |
217 |
| -print$OUTPUT"\t"if ($decomp_index < 10); |
| 280 | +print$OT"\t/*$comment */"if ($commentne""); |
| 281 | +print$OT"\n"; |
218 | 282 |
|
219 |
| -print$OUTPUT"\t/*$comment */"if ($commentne""); |
220 |
| -} |
221 |
| -print$OUTPUT"\n"; |
| 283 | +$main_index++; |
222 | 284 | }
|
223 |
| -print$OUTPUT"\n};\n\n"; |
| 285 | +print$OT"\n};\n\n"; |
224 | 286 |
|
225 | 287 | # Print the array of decomposed codes.
|
226 |
| -print$OUTPUT<<HEADER; |
| 288 | +print$OT<<HEADER; |
227 | 289 | /* codepoints array */
|
228 | 290 | static const uint32 UnicodeDecomp_codepoints[$decomp_index] =
|
229 | 291 | {
|
230 | 292 | $decomp_string
|
231 | 293 | };
|
232 | 294 | HEADER
|
233 | 295 |
|
234 |
| -close$OUTPUT; |
| 296 | +# Emit the definition of the decomp hash function. |
| 297 | +my$dec_funcname ='Decomp_hash_func'; |
| 298 | +my$dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed, |
| 299 | +$dec_funcname,fixed_key_length=> 4); |
| 300 | +print$OF"/* Perfect hash function for decomposition */\n"; |
| 301 | +print$OF"static$dec_func\n"; |
| 302 | + |
| 303 | +# Emit the structure that wraps the hash lookup information into |
| 304 | +# one variable. |
| 305 | +print$OF<<HEADER; |
| 306 | +/* Hash lookup information for decomposition */ |
| 307 | +static const pg_unicode_decompinfo UnicodeDecompInfo = |
| 308 | +{ |
| 309 | +UnicodeDecompMain, |
| 310 | +$dec_funcname, |
| 311 | +$num_characters |
| 312 | +}; |
| 313 | +
|
| 314 | +HEADER |
| 315 | + |
| 316 | +# Find the lowest codepoint that decomposes to each recomposeable |
| 317 | +# code pair and create a mapping to it. |
| 318 | +my$recomp_string =""; |
| 319 | +my@rec_cp_packed; |
| 320 | +my%seenit; |
| 321 | +my$firstentry = 1; |
| 322 | +foreachmy$rec (sort recomp_sort@rec_info) |
| 323 | +{ |
| 324 | +# The hash key is formed by concatenating the bytes of the two |
| 325 | +# codepoints. See also recompose_code() in common/unicode_norm.c. |
| 326 | +my$hashkey = (hex($rec->{first}) << 32) |hex($rec->{second}); |
| 327 | + |
| 328 | +# We are only interested in the lowest code point that decomposes |
| 329 | +# to the given code pair. |
| 330 | +nextif$seenit{$hashkey}; |
| 331 | + |
| 332 | +# Save the hash key bytes in network order |
| 333 | +push@rec_cp_packed,pack('Q>',$hashkey); |
| 334 | + |
| 335 | +# Append inverse lookup element |
| 336 | +$recomp_string .=",\n"if !$firstentry; |
| 337 | +$recomp_string .=sprintf"\t/* U+%s+%s -> U+%s */%s", |
| 338 | +$rec->{first}, |
| 339 | +$rec->{second}, |
| 340 | +$rec->{code}, |
| 341 | +$rec->{main_index}; |
| 342 | + |
| 343 | +$seenit{$hashkey} = 1; |
| 344 | +$firstentry = 0; |
| 345 | +} |
| 346 | + |
| 347 | +# Emit the inverse lookup array containing indexes into UnicodeDecompMain. |
| 348 | +my$num_recomps =scalar@rec_cp_packed; |
| 349 | +print$OF<<HEADER; |
| 350 | +/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */ |
| 351 | +static const uint16 RecompInverseLookup[$num_recomps] = |
| 352 | +{ |
| 353 | +$recomp_string |
| 354 | +}; |
| 355 | +
|
| 356 | +HEADER |
| 357 | + |
| 358 | +# Emit the definition of the recomposition hash function. |
| 359 | +my$rec_funcname ='Recomp_hash_func'; |
| 360 | +my$rec_func = |
| 361 | + PerfectHash::generate_hash_function(\@rec_cp_packed,$rec_funcname, |
| 362 | +fixed_key_length=> 8); |
| 363 | +print$OF"/* Perfect hash function for recomposition */\n"; |
| 364 | +print$OF"static$rec_func\n"; |
| 365 | + |
| 366 | +# Emit the structure that wraps the hash lookup information into |
| 367 | +# one variable. |
| 368 | +print$OF<<HEADER; |
| 369 | +/* Hash lookup information for recomposition */ |
| 370 | +static const pg_unicode_recompinfo UnicodeRecompInfo = |
| 371 | +{ |
| 372 | +RecompInverseLookup, |
| 373 | +$rec_funcname, |
| 374 | +$num_recomps |
| 375 | +}; |
| 376 | +HEADER |
| 377 | + |
| 378 | +close$OT; |
| 379 | +close$OF; |
| 380 | + |
| 381 | +subrecomp_sort |
| 382 | +{ |
| 383 | +my$a1 =hex($a->{first}); |
| 384 | +my$b1 =hex($b->{first}); |
| 385 | + |
| 386 | +my$a2 =hex($a->{second}); |
| 387 | +my$b2 =hex($b->{second}); |
| 388 | + |
| 389 | +# First sort by the first code point |
| 390 | +return -1if$a1 <$b1; |
| 391 | +return 1if$a1 >$b1; |
| 392 | + |
| 393 | +# Then sort by the second code point |
| 394 | +return -1if$a2 <$b2; |
| 395 | +return 1if$a2 >$b2; |
| 396 | + |
| 397 | +# Finally sort by the code point that decomposes into first and |
| 398 | +# second ones. |
| 399 | +my$acode =hex($a->{code}); |
| 400 | +my$bcode =hex($b->{code}); |
| 401 | + |
| 402 | +return -1if$acode <$bcode; |
| 403 | +return -1if$acode >$bcode; |
| 404 | + |
| 405 | +die"found duplicate entries of recomposeable code pairs"; |
| 406 | +} |