NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit783f0cc

committed

Improve performance of Unicode {de,re}composition in the backend

This replaces the existing binary search with two perfect hash functionsfor the composition and the decomposition in the backend code, at thecost of slightly-larger binaries there (35kB in libpgcommon_srv.a). Perthe measurements done, this improves the speed of the recomposition anddecomposition by up to 30~40 times for the NFC and NFKC conversions,while all other operations get at least 40% faster. This is not as"good" as what libicu has, but it closes the gap a lot as per thefeedback from Daniel Verite.The decomposition table remains the same, getting used for the binarysearch in the frontend code, where we care more about the size of thelibraries like libpq over performance as this gets involved only in codepaths related to the SCRAM authentication. In consequence, note thatthe perfect hash function for the recomposition needs to use a newinverse lookup array back to to the existing decomposition table.The size of all frontend deliverables remains unchanged, even with--enable-debug, including libpq.Author: John NaylorReviewed-by: Michael Paquier, Tom LaneDiscussion:https://postgr.es/m/CAFBsxsHUuMFCt6-pU+oG-F1==CmEp8wR+O+bRouXWu6i8kXuqA@mail.gmail.com

1 parent7d6d6bc commit783f0ccCopy full SHA for 783f0cc

File tree

5 files changed

+3227

-44

lines changed

src
- common
  - unicode_norm.c
  - unicode
    - Makefile
    - generate-unicode_norm_table.pl
- include/common
  - unicode_norm_hashfunc.h
- tools/pgindent
  - exclude_file_patterns

5 files changed

+3227

-44

lines changed

`‎src/common/unicode/Makefile‎`

Lines changed: 3 additions & 1 deletion

Original file line number	Diff line number	Diff line change
`@@ -18,7 +18,7 @@ LIBS += $(PTHREAD_LIBS)`
`18`	`18`	`# By default, do nothing.`
`19`	`19`	`all:`
`20`	`20`
`21`		`-update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h`
	`21`	`+update-unicode: unicode_norm_table.h unicode_combining_table.h unicode_normprops_table.h unicode_norm_hashfunc.h`
`22`	`22`	`mv$^ ../../../src/include/common/`
`23`	`23`	`$(MAKE) normalization-check`
`24`	`24`
`@@ -30,6 +30,8 @@ UnicodeData.txt DerivedNormalizationProps.txt CompositionExclusions.txt Normaliz`
`30`	`30`
`31`	`31`	`# Generation of conversion tables used for string normalization with`
`32`	`32`	`# UTF-8 strings.`
	`33`	`+unicode_norm_hashfunc.h: unicode_norm_table.h`
	`34`	`+`
`33`	`35`	`unicode_norm_table.h: generate-unicode_norm_table.pl UnicodeData.txt CompositionExclusions.txt`
`34`	`36`	`$(PERL) generate-unicode_norm_table.pl`
`35`	`37`

`‎src/common/unicode/generate-unicode_norm_table.pl‎`

Lines changed: 199 additions & 27 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,16 +1,22 @@`
`1`	`1`	`#!/usr/bin/perl`
`2`	`2`	`#`
`3`		`-# Generate a composition table, using Unicode data files as input`
	`3`	`+# Generate a composition table and its lookup utilities, using Unicode data`
	`4`	`+# files as input.`
`4`	`5`	`#`
`5`	`6`	`# Input: UnicodeData.txt and CompositionExclusions.txt`
`6`		`-# Output: unicode_norm_table.h`
	`7`	`+# Output: unicode_norm_table.h and unicode_norm_hashfunc.h`
`7`	`8`	`#`
`8`	`9`	`# Copyright (c) 2000-2020, PostgreSQL Global Development Group`
`9`	`10`
`10`	`11`	`use strict;`
`11`	`12`	`use warnings;`
`12`	`13`
`13`		`-my$output_file ="unicode_norm_table.h";`
	`14`	`+use FindBin;`
	`15`	`+use lib"$FindBin::RealBin/../../tools/";`
	`16`	`+use PerfectHash;`
	`17`	`+`
	`18`	`+my$output_table_file ="unicode_norm_table.h";`
	`19`	`+my$output_func_file ="unicode_norm_hashfunc.h";`
`14`	`20`
`15`	`21`	`my$FH;`
`16`	`22`
`@@ -64,11 +70,13 @@`
`64`	`70`
`65`	`71`	`my$num_characters =scalar@characters;`
`66`	`72`
`67`		`-# Start writing out the output file`
`68`		`-openmy$OUTPUT,'>',$output_file`
`69`		`-ordie"Could not open output file$output_file:$!\n";`
	`73`	`+# Start writing out the output files`
	`74`	`+openmy$OT,'>',$output_table_file`
	`75`	`+ordie"Could not open output file$output_table_file:$!\n";`
	`76`	`+openmy$OF,'>',$output_func_file`
	`77`	`+ordie"Could not open output file$output_func_file:$!\n";`
`70`	`78`
`71`		`-print$OUTPUT<<HEADER;`
	`79`	`+print$OT<<HEADER;`
`72`	`80`	`/*-------------------------------------------------------------------------`
`73`	`81`	`*`
`74`	`82`	`* unicode_norm_table.h`
`@@ -111,8 +119,53 @@`
`111`	`119`	`{`
`112`	`120`	`HEADER`
`113`	`121`
	`122`	`+print$OF<<HEADER;`
	`123`	`+/*-------------------------------------------------------------------------`
	`124`	`+ *`
	`125`	`+ * unicode_norm_hashfunc.h`
	`126`	`+ * Perfect hash functions used for Unicode normalization`
	`127`	`+ *`
	`128`	`+ * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group`
	`129`	`+ * Portions Copyright (c) 1994, Regents of the University of California`
	`130`	`+ *`
	`131`	`+ * src/include/common/unicode_norm_hashfunc.h`
	`132`	`+ *`
	`133`	`+ *-------------------------------------------------------------------------`
	`134`	`+ */`
	`135`	`+`
	`136`	`+/*`
	`137`	`+ * File auto-generated by src/common/unicode/generate-unicode_norm_table.pl,`
	`138`	`+ * do not edit. There is deliberately not an #ifndef PG_UNICODE_NORM_HASHFUNC_H`
	`139`	`+ * here.`
	`140`	`+ */`
	`141`	`+`
	`142`	`+#include "common/unicode_norm_table.h"`
	`143`	`+`
	`144`	`+/* Typedef for perfect hash functions */`
	`145`	`+typedef int (cp_hash_func) (const void key);`
	`146`	`+`
	`147`	`+/* Information for lookups with perfect hash functions */`
	`148`	`+typedef struct`
	`149`	`+{`
	`150`	`+const pg_unicode_decomposition *decomps;`
	`151`	`+cp_hash_funchash;`
	`152`	`+intnum_decomps;`
	`153`	`+} pg_unicode_decompinfo;`
	`154`	`+`
	`155`	`+typedef struct`
	`156`	`+{`
	`157`	`+const uint16*inverse_lookup;`
	`158`	`+cp_hash_funchash;`
	`159`	`+intnum_recomps;`
	`160`	`+} pg_unicode_recompinfo;`
	`161`	`+`
	`162`	`+HEADER`
	`163`	`+`
`114`	`164`	`my$decomp_index = 0;`
`115`	`165`	`my$decomp_string ="";`
	`166`	`+my@dec_cp_packed;`
	`167`	`+my$main_index = 0;`
	`168`	`+my@rec_info;`
`116`	`169`
`117`	`170`	`my$last_code =$characters[-1]->{code};`
`118`	`171`	`foreachmy$char (@characters)`
`@@ -121,6 +174,9 @@`
`121`	`174`	`my$class =$char->{class};`
`122`	`175`	`my$decomp =$char->{decomp};`
`123`	`176`
	`177`	`+# Save the code point bytes as a string in network order.`
	`178`	`+push@dec_cp_packed,pack('N',hex($char->{code}));`
	`179`	`+`
`124`	`180`	`# The character decomposition mapping field in UnicodeData.txt is a list`
`125`	`181`	`# of unicode codepoints, separated by space. But it can be prefixed with`
`126`	`182`	`# so-called compatibility formatting tag, like "<compat>", or "<font>".`
`@@ -163,32 +219,46 @@`
`163`	`219`	`{`
`164`	`220`	`foreachmy$lcode (@composition_exclusion_codes)`
`165`	`221`	`{`
`166`		`-if ($lcodeeq$char->{code})`
	`222`	`+if ($lcodeeq$code)`
`167`	`223`	`{`
`168`	`224`	`$flags .=" \| DECOMP_NO_COMPOSE";`
`169`	`225`	`$comment ="in exclusion list";`
`170`	`226`	`last;`
`171`	`227`	`}`
`172`	`228`	`}`
`173`	`229`	`}`
	`230`	`+`
	`231`	`+# Save info for recomposeable codepoints.`
	`232`	`+# Note that this MUST match the macro DECOMPOSITION_NO_COMPOSE in C`
	`233`	`+# above! See also the inverse lookup in recompose_code() found in`
	`234`	`+# src/common/unicode_norm.c.`
	`235`	`+if (!($flags =~/DECOMP_COMPAT/ \|\|$flags =~/DECOMP_NO_COMPOSE/))`
	`236`	`+{`
	`237`	`+push@rec_info,`
	`238`	`+ {`
	`239`	`+code=>$code,`
	`240`	`+main_index=>$main_index,`
	`241`	`+first=>$first_decomp,`
	`242`	`+second=>$decomp_elts[0]`
	`243`	`+ };`
	`244`	`+}`
`174`	`245`	`}`
`175`	`246`
`176`	`247`	`if ($decomp_size == 0)`
`177`	`248`	`{`
`178`		`-print$OUTPUT"\t{0x$code,$class, 0$flags, 0}";`
	`249`	`+print$OT"\t{0x$code,$class, 0$flags, 0}";`
`179`	`250`	`}`
`180`	`251`	`elsif ($decomp_size == 1 &&length($first_decomp) <= 4)`
`181`	`252`	`{`
`182`	`253`
`183`	`254`	`# The decomposition consists of a single codepoint, and it fits`
`184`	`255`	`# in a uint16, so we can store it "inline" in the main table.`
`185`	`256`	`$flags .=" \| DECOMP_INLINE";`
`186`		`-print$OUTPUT"\t{0x$code,$class, 1$flags, 0x$first_decomp}";`
	`257`	`+print$OT"\t{0x$code,$class, 1$flags, 0x$first_decomp}";`
`187`	`258`	`}`
`188`	`259`	`else`
`189`	`260`	`{`
`190`		`-print$OUTPUT`
`191`		`-"\t{0x$code,$class,$decomp_size$flags,$decomp_index}";`
	`261`	`+print$OT"\t{0x$code,$class,$decomp_size$flags,$decomp_index}";`
`192`	`262`
`193`	`263`	`# Now save the decompositions into a dedicated area that will`
`194`	`264`	`# be written afterwards. First build the entry dedicated to`
`@@ -205,30 +275,132 @@`
`205`	`275`	`}`
`206`	`276`
`207`	`277`	`# Print a comma after all items except the last one.`
`208`		`-print$OUTPUT","unless ($codeeq$last_code);`
`209`		`-if ($commentne"")`
`210`		`-{`
	`278`	`+print$OT","unless ($codeeq$last_code);`
`211`	`279`
`212`		`-# If the line is wide already, indent the comment with one tab,`
`213`		`-# otherwise with two. This is to make the output match the way`
`214`		`-# pgindent would mangle it. (This is quite hacky. To do this`
`215`		`-# properly, we should actually track how long the line is so far,`
`216`		`-# but this works for now.)`
`217`		`-print$OUTPUT"\t"if ($decomp_index < 10);`
	`280`	`+print$OT"\t/$comment /"if ($commentne"");`
	`281`	`+print$OT"\n";`
`218`	`282`
`219`		`-print$OUTPUT"\t/$comment /"if ($commentne"");`
`220`		`-}`
`221`		`-print$OUTPUT"\n";`
	`283`	`+$main_index++;`
`222`	`284`	`}`
`223`		`-print$OUTPUT"\n};\n\n";`
	`285`	`+print$OT"\n};\n\n";`
`224`	`286`
`225`	`287`	`# Print the array of decomposed codes.`
`226`		`-print$OUTPUT<<HEADER;`
	`288`	`+print$OT<<HEADER;`
`227`	`289`	`/* codepoints array */`
`228`	`290`	`static const uint32 UnicodeDecomp_codepoints[$decomp_index] =`
`229`	`291`	`{`
`230`	`292`	`$decomp_string`
`231`	`293`	`};`
`232`	`294`	`HEADER`
`233`	`295`
`234`		`-close$OUTPUT;`
	`296`	`+# Emit the definition of the decomp hash function.`
	`297`	`+my$dec_funcname ='Decomp_hash_func';`
	`298`	`+my$dec_func = PerfectHash::generate_hash_function(\@dec_cp_packed,`
	`299`	`+$dec_funcname,fixed_key_length=> 4);`
	`300`	`+print$OF"/* Perfect hash function for decomposition */\n";`
	`301`	`+print$OF"static$dec_func\n";`
	`302`	`+`
	`303`	`+# Emit the structure that wraps the hash lookup information into`
	`304`	`+# one variable.`
	`305`	`+print$OF<<HEADER;`
	`306`	`+/* Hash lookup information for decomposition */`
	`307`	`+static const pg_unicode_decompinfo UnicodeDecompInfo =`
	`308`	`+{`
	`309`	`+UnicodeDecompMain,`
	`310`	`+$dec_funcname,`
	`311`	`+$num_characters`
	`312`	`+};`
	`313`	`+`
	`314`	`+HEADER`
	`315`	`+`
	`316`	`+# Find the lowest codepoint that decomposes to each recomposeable`
	`317`	`+# code pair and create a mapping to it.`
	`318`	`+my$recomp_string ="";`
	`319`	`+my@rec_cp_packed;`
	`320`	`+my%seenit;`
	`321`	`+my$firstentry = 1;`
	`322`	`+foreachmy$rec (sort recomp_sort@rec_info)`
	`323`	`+{`
	`324`	`+# The hash key is formed by concatenating the bytes of the two`
	`325`	`+# codepoints. See also recompose_code() in common/unicode_norm.c.`
	`326`	`+my$hashkey = (hex($rec->{first}) << 32) \|hex($rec->{second});`
	`327`	`+`
	`328`	`+# We are only interested in the lowest code point that decomposes`
	`329`	`+# to the given code pair.`
	`330`	`+nextif$seenit{$hashkey};`
	`331`	`+`
	`332`	`+# Save the hash key bytes in network order`
	`333`	`+push@rec_cp_packed,pack('Q>',$hashkey);`
	`334`	`+`
	`335`	`+# Append inverse lookup element`
	`336`	`+$recomp_string .=",\n"if !$firstentry;`
	`337`	`+$recomp_string .=sprintf"\t/* U+%s+%s -> U+%s */%s",`
	`338`	`+$rec->{first},`
	`339`	`+$rec->{second},`
	`340`	`+$rec->{code},`
	`341`	`+$rec->{main_index};`
	`342`	`+`
	`343`	`+$seenit{$hashkey} = 1;`
	`344`	`+$firstentry = 0;`
	`345`	`+}`
	`346`	`+`
	`347`	`+# Emit the inverse lookup array containing indexes into UnicodeDecompMain.`
	`348`	`+my$num_recomps =scalar@rec_cp_packed;`
	`349`	`+print$OF<<HEADER;`
	`350`	`+/* Inverse lookup array -- contains indexes into UnicodeDecompMain[] */`
	`351`	`+static const uint16 RecompInverseLookup[$num_recomps] =`
	`352`	`+{`
	`353`	`+$recomp_string`
	`354`	`+};`
	`355`	`+`
	`356`	`+HEADER`
	`357`	`+`
	`358`	`+# Emit the definition of the recomposition hash function.`
	`359`	`+my$rec_funcname ='Recomp_hash_func';`
	`360`	`+my$rec_func =`
	`361`	`+ PerfectHash::generate_hash_function(\@rec_cp_packed,$rec_funcname,`
	`362`	`+fixed_key_length=> 8);`
	`363`	`+print$OF"/* Perfect hash function for recomposition */\n";`
	`364`	`+print$OF"static$rec_func\n";`
	`365`	`+`
	`366`	`+# Emit the structure that wraps the hash lookup information into`
	`367`	`+# one variable.`
	`368`	`+print$OF<<HEADER;`
	`369`	`+/* Hash lookup information for recomposition */`
	`370`	`+static const pg_unicode_recompinfo UnicodeRecompInfo =`
	`371`	`+{`
	`372`	`+RecompInverseLookup,`
	`373`	`+$rec_funcname,`
	`374`	`+$num_recomps`
	`375`	`+};`
	`376`	`+HEADER`
	`377`	`+`
	`378`	`+close$OT;`
	`379`	`+close$OF;`
	`380`	`+`
	`381`	`+subrecomp_sort`
	`382`	`+{`
	`383`	`+my$a1 =hex($a->{first});`
	`384`	`+my$b1 =hex($b->{first});`
	`385`	`+`
	`386`	`+my$a2 =hex($a->{second});`
	`387`	`+my$b2 =hex($b->{second});`
	`388`	`+`
	`389`	`+# First sort by the first code point`
	`390`	`+return -1if$a1 <$b1;`
	`391`	`+return 1if$a1 >$b1;`
	`392`	`+`
	`393`	`+# Then sort by the second code point`
	`394`	`+return -1if$a2 <$b2;`
	`395`	`+return 1if$a2 >$b2;`
	`396`	`+`
	`397`	`+# Finally sort by the code point that decomposes into first and`
	`398`	`+# second ones.`
	`399`	`+my$acode =hex($a->{code});`
	`400`	`+my$bcode =hex($b->{code});`
	`401`	`+`
	`402`	`+return -1if$acode <$bcode;`
	`403`	`+return -1if$acode >$bcode;`
	`404`	`+`
	`405`	`+die"found duplicate entries of recomposeable code pairs";`
	`406`	`+}`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit783f0cc

File tree

5 files changed

5 files changed

`‎src/common/unicode/Makefile‎`

`‎src/common/unicode/generate-unicode_norm_table.pl‎`

0 commit comments