NotificationsYou must be signed in to change notification settings
Fork5.3k
Star19.4k

Commit c64d0cd

committed

Use perfect hashing, instead of binary search, for keyword lookup.

We've been speculating for a long time that hash-based keyword lookup ought to be faster than binary search, but up to now we hadn't found a suitable tool for generating the hash function. Joerg Sonnenberger provided the inspiration, and sample code, to show us that rolling our own generator wasn't a ridiculous idea. Hence, do that.

The method used here requires a lookup table of approximately 4 bytes per keyword, but that's less than what we saved in the predecessor commit afb0d07, so it's not a big problem. The time savings is indeed significant: preliminary testing suggests that the total time for raw parsing (flex + bison phases) drops by ~20%.

Patch by me, but it owes its existence to Joerg Sonnenberger; thanks also to John Naylor for review.

Discussion: https://postgr.es/m/20190103163340.GA15803@britannica.bec.de

1 parent 5d59a6c, commit c64d0cd (Copy full SHA for c64d0cd)

File tree

14 files changed

+516

-107

lines changed

src
- common
  - Makefile
  - kwlookup.c
- include
  - common
    - kwlookup.h
  - parser
    - kwlist.h
- interfaces/ecpg/preproc
- pl/plpgsql/src
- tools

14 files changed

+516

-107

lines changed

`‎src/common/Makefile‎`

Lines changed: 7 additions & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -63,6 +63,11 @@ OBJS_FRONTEND = $(OBJS_COMMON) fe_memutils.o file_utils.o restricted_token.o`
`63`	`63`	`OBJS_SHLIB =$(OBJS_FRONTEND:%.o=%_shlib.o)`
`64`	`64`	`OBJS_SRV =$(OBJS_COMMON:%.o=%_srv.o)`
`65`	`65`
	`66`	`+# where to find gen_keywordlist.pl and subsidiary files`
	`67`	`+TOOLSDIR =$(top_srcdir)/src/tools`
	`68`	`+GEN_KEYWORDLIST =$(PERL) -I$(TOOLSDIR)$(TOOLSDIR)/gen_keywordlist.pl`
	`69`	`+GEN_KEYWORDLIST_DEPS =$(TOOLSDIR)/gen_keywordlist.pl$(TOOLSDIR)/PerfectHash.pm`
	`70`	`+`
`66`	`71`	`all: libpgcommon.a libpgcommon_shlib.a libpgcommon_srv.a`
`67`	`72`
`68`	`73`	`distprep: kwlist_d.h`
`@@ -118,8 +123,8 @@ libpgcommon_srv.a: $(OBJS_SRV)`
`118`	`123`	`$(CC)$(CFLAGS)$(subst -DFRONTEND,,$(CPPFLAGS)) -c$< -o$@`
`119`	`124`
`120`	`125`	`# generate SQL keyword lookup table to be included into keywords*.o.`
`121`		`-kwlist_d.h:$(top_srcdir)/src/include/parser/kwlist.h$(top_srcdir)/src/tools/gen_keywordlist.pl`
`122`		`-$(PERL)$(top_srcdir)/src/tools/gen_keywordlist.pl --extern$<`
	`126`	`+kwlist_d.h:$(top_srcdir)/src/include/parser/kwlist.h$(GEN_KEYWORDLIST_DEPS)`
	`127`	`+$(GEN_KEYWORDLIST) --extern$<`
`123`	`128`
`124`	`129`	`# Dependencies of keywords*.o need to be managed explicitly to make sure`
`125`	`130`	`# that you don't get broken parsing code, even in a non-enable-depend build.`

`‎src/common/kwlookup.c‎`

Lines changed: 32 additions & 41 deletions

Original file line number	Diff line number	Diff line change
`@@ -35,60 +35,51 @@`
`35`	`35`	`* receive a different case-normalization mapping.`
`36`	`36`	`*/`
`37`	`37`	`int`
`38`		`-ScanKeywordLookup(constchar*text,`
	`38`	`+ScanKeywordLookup(constchar*str,`
`39`	`39`	`constScanKeywordList*keywords)`
`40`	`40`	`{`
`41`		`-intlen,`
`42`		`-i;`
`43`		`-charword[NAMEDATALEN];`
`44`		`-constchar*kw_string;`
`45`		`-constuint16*kw_offsets;`
`46`		`-constuint16*low;`
`47`		`-constuint16*high;`
`48`		`-`
`49`		`-len=strlen(text);`
	`41`	`+size_tlen;`
	`42`	`+inth;`
	`43`	`+constchar*kw;`
`50`	`44`
	`45`	`+/*`
	`46`	`+ * Reject immediately if too long to be any keyword. This saves useless`
	`47`	`+ * hashing and downcasing work on long strings.`
	`48`	`+ */`
	`49`	`+len=strlen(str);`
`51`	`50`	`if (len>keywords->max_kw_len)`
`52`		`-return-1;/* too long to be any keyword */`
`53`		`-`
`54`		`-/* We assume all keywords are shorter than NAMEDATALEN. */`
`55`		`-Assert(len<NAMEDATALEN);`
	`51`	`+return-1;`
`56`	`52`
`57`	`53`	`/*`
`58`		`- * Apply an ASCII-only downcasing. We must not use tolower() since it may`
`59`		`- * produce the wrong translation in some locales (eg, Turkish).`
	`54`	`+ * Compute the hash function. We assume it was generated to produce`
	`55`	`+ * case-insensitive results. Since it's a perfect hash, we need only`
	`56`	`+ * match to the specific keyword it identifies.`
`60`	`57`	`*/`
`61`		`-for (i=0;i<len;i++)`
`62`		`-{`
`63`		`-charch=text[i];`
	`58`	`+h=keywords->hash(str,len);`
`64`	`59`
`65`		`-if (ch >='A'&&ch <='Z')`
`66`		`-ch+='a'-'A';`
`67`		`-word[i]=ch;`
`68`		`-}`
`69`		`-word[len]='\0';`
	`60`	`+/* An out-of-range result implies no match */`
	`61`	`+if (h<0\|\|h >=keywords->num_keywords)`
	`62`	`+return-1;`
`70`	`63`
`71`	`64`	`/*`
`72`		`- * Now do a binary search using plain strcmp() comparison.`
	`65`	`+ * Compare character-by-character to see if we have a match, applying an`
	`66`	`+ * ASCII-only downcasing to the input characters. We must not use`
	`67`	`+ * tolower() since it may produce the wrong translation in some locales`
	`68`	`+ * (eg, Turkish).`
`73`	`69`	`*/`
`74`		`-kw_string=keywords->kw_string;`
`75`		`-kw_offsets=keywords->kw_offsets;`
`76`		`-low=kw_offsets;`
`77`		`-high=kw_offsets+ (keywords->num_keywords-1);`
`78`		`-while (low <=high)`
	`70`	`+kw=GetScanKeyword(h,keywords);`
	`71`	`+while (*str!='\0')`
`79`	`72`	`{`
`80`		`-constuint16*middle;`
`81`		`-intdifference;`
	`73`	`+charch=*str++;`
`82`	`74`
`83`		`-middle=low+ (high-low) /2;`
`84`		`-difference=strcmp(kw_string+*middle,word);`
`85`		`-if (difference==0)`
`86`		`-returnmiddle-kw_offsets;`
`87`		`-elseif (difference<0)`
`88`		`-low=middle+1;`
`89`		`-else`
`90`		`-high=middle-1;`
	`75`	`+if (ch >='A'&&ch <='Z')`
	`76`	`+ch+='a'-'A';`
	`77`	`+if (ch!=*kw++)`
	`78`	`+return-1;`
`91`	`79`	`}`
	`80`	`+if (*kw!='\0')`
	`81`	`+return-1;`
`92`	`82`
`93`		`-return-1;`
	`83`	`+/* Success! */`
	`84`	`+returnh;`
`94`	`85`	`}`

`‎src/include/common/kwlookup.h‎`

Lines changed: 4 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -14,6 +14,9 @@`
`14`	`14`	`#ifndefKWLOOKUP_H`
`15`	`15`	`#defineKWLOOKUP_H`
`16`	`16`
	`17`	`+/* Hash function used by ScanKeywordLookup */`
	`18`	`+typedefint (*ScanKeywordHashFunc) (constvoid*key,size_tkeylen);`
	`19`	`+`
`17`	`20`	`/*`
`18`	`21`	`* This struct contains the data needed by ScanKeywordLookup to perform a`
`19`	`22`	`* search within a set of keywords. The contents are typically generated by`
`@@ -23,6 +26,7 @@ typedef struct ScanKeywordList`
`23`	`26`	`{`
`24`	`27`	`constchar*kw_string;/* all keywords in order, separated by \0 */`
`25`	`28`	`constuint16*kw_offsets;/* offsets to the start of each keyword */`
	`29`	`+ScanKeywordHashFunchash;/* perfect hash function for keywords */`
`26`	`30`	`intnum_keywords;/* number of keywords */`
`27`	`31`	`intmax_kw_len;/* length of longest keyword */`
`28`	`32`	`}ScanKeywordList;`

`‎src/include/parser/kwlist.h‎`

Lines changed: 1 addition & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -21,8 +21,7 @@`
`21`	`21`	`/*`
`22`	`22`	`* List of keyword (name, token-value, category) entries.`
`23`	`23`	`*`
`24`		`- * !!WARNING!!: This list must be sorted by ASCII name, because binary`
`25`		`- * search is used to locate entries.`
	`24`	`+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.`
`26`	`25`	`*/`
`27`	`26`
`28`	`27`	`/* name, value, category */`

`‎src/interfaces/ecpg/preproc/Makefile‎`

Lines changed: 8 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -28,7 +28,10 @@ OBJS=preproc.o pgc.o type.o ecpg.o output.o parser.o \`
`28`	`28`	`keywords.o c_keywords.o ecpg_keywords.o typename.o descriptor.o variable.o\`
`29`	`29`	`$(WIN32RES)`
`30`	`30`
`31`		`-GEN_KEYWORDLIST =$(top_srcdir)/src/tools/gen_keywordlist.pl`
	`31`	`+# where to find gen_keywordlist.pl and subsidiary files`
	`32`	`+TOOLSDIR =$(top_srcdir)/src/tools`
	`33`	`+GEN_KEYWORDLIST =$(PERL) -I$(TOOLSDIR)$(TOOLSDIR)/gen_keywordlist.pl`
	`34`	`+GEN_KEYWORDLIST_DEPS =$(TOOLSDIR)/gen_keywordlist.pl$(TOOLSDIR)/PerfectHash.pm`
`32`	`35`
`33`	`36`	`# Suppress parallel build to avoid a bug in GNU make 3.82`
`34`	`37`	`# (see comments in ../Makefile)`
`@@ -56,11 +59,11 @@ preproc.y: ../../../backend/parser/gram.y parse.pl ecpg.addons ecpg.header ecpg.`
`56`	`59`	`$(PERL)$(srcdir)/check_rules.pl$(srcdir)$<`
`57`	`60`
`58`	`61`	`# generate keyword headers`
`59`		`-c_kwlist_d.h: c_kwlist.h$(GEN_KEYWORDLIST)`
`60`		`-$(PERL)$(GEN_KEYWORDLIST) --varname ScanCKeywords$<`
	`62`	`+c_kwlist_d.h: c_kwlist.h$(GEN_KEYWORDLIST_DEPS)`
	`63`	`+$(GEN_KEYWORDLIST) --varname ScanCKeywords --no-case-fold$<`
`61`	`64`
`62`		`-ecpg_kwlist_d.h: ecpg_kwlist.h$(GEN_KEYWORDLIST)`
`63`		`-$(PERL)$(GEN_KEYWORDLIST) --varname ScanECPGKeywords$<`
	`65`	`+ecpg_kwlist_d.h: ecpg_kwlist.h$(GEN_KEYWORDLIST_DEPS)`
	`66`	`+$(GEN_KEYWORDLIST) --varname ScanECPGKeywords$<`
`64`	`67`
`65`	`68`	`# Force these dependencies to be known even without dependency info built:`
`66`	`69`	`ecpg_keywords.oc_keywords.okeywords.opreproc.opgc.oparser.o: preproc.h`

`‎src/interfaces/ecpg/preproc/c_keywords.c‎`

Lines changed: 24 additions & 27 deletions

Original file line number	Diff line number	Diff line change
`@@ -9,8 +9,6 @@`
`9`	`9`	`*/`
`10`	`10`	`#include"postgres_fe.h"`
`11`	`11`
`12`		`-#include<ctype.h>`
`13`		`-`
`14`	`12`	`#include"preproc_extern.h"`
`15`	`13`	`#include"preproc.h"`
`16`	`14`
`@@ -32,39 +30,38 @@ static const uint16 ScanCKeywordTokens[] = {`
`32`	`30`	`*`
`33`	`31`	`* Returns the token value of the keyword, or -1 if no match.`
`34`	`32`	`*`
`35`		`- * Do a binary search using plain strcmp() comparison. This is much like`
	`33`	`+ * Do a hash search using plain strcmp() comparison. This is much like`
`36`	`34`	`* ScanKeywordLookup(), except we want case-sensitive matching.`
`37`	`35`	`*/`
`38`	`36`	`int`
`39`		`-ScanCKeywordLookup(constchar*text)`
	`37`	`+ScanCKeywordLookup(constchar*str)`
`40`	`38`	`{`
`41`		`-constchar*kw_string;`
`42`		`-constuint16*kw_offsets;`
`43`		`-constuint16*low;`
`44`		`-constuint16*high;`
	`39`	`+size_tlen;`
	`40`	`+inth;`
	`41`	`+constchar*kw;`
	`42`	`+`
	`43`	`+/*`
	`44`	`+ * Reject immediately if too long to be any keyword. This saves useless`
	`45`	`+ * hashing work on long strings.`
	`46`	`+ */`
	`47`	`+len=strlen(str);`
	`48`	`+if (len>ScanCKeywords.max_kw_len)`
	`49`	`+return-1;`
`45`	`50`
`46`		`-if (strlen(text)>ScanCKeywords.max_kw_len)`
`47`		`-return-1;/* too long to be any keyword */`
	`51`	`+/*`
	`52`	`+ * Compute the hash function. Since it's a perfect hash, we need only`
	`53`	`+ * match to the specific keyword it identifies.`
	`54`	`+ */`
	`55`	`+h=ScanCKeywords_hash_func(str,len);`
`48`	`56`
`49`		`-kw_string=ScanCKeywords.kw_string;`
`50`		`-kw_offsets=ScanCKeywords.kw_offsets;`
`51`		`-low=kw_offsets;`
`52`		`-high=kw_offsets+ (ScanCKeywords.num_keywords-1);`
	`57`	`+/* An out-of-range result implies no match */`
	`58`	`+if (h<0\|\|h >=ScanCKeywords.num_keywords)`
	`59`	`+return-1;`
`53`	`60`
`54`		`-while (low <=high)`
`55`		`-{`
`56`		`-constuint16*middle;`
`57`		`-intdifference;`
	`61`	`+kw=GetScanKeyword(h,&ScanCKeywords);`
`58`	`62`
`59`		`-middle=low+ (high-low) /2;`
`60`		`-difference=strcmp(kw_string+*middle,text);`
`61`		`-if (difference==0)`
`62`		`-returnScanCKeywordTokens[middle-kw_offsets];`
`63`		`-elseif (difference<0)`
`64`		`-low=middle+1;`
`65`		`-else`
`66`		`-high=middle-1;`
`67`		`-}`
	`63`	`+if (strcmp(kw,str)==0)`
	`64`	`+returnScanCKeywordTokens[h];`
`68`	`65`
`69`	`66`	`return-1;`
`70`	`67`	`}`

`‎src/interfaces/ecpg/preproc/c_kwlist.h‎`

Lines changed: 1 addition & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -20,8 +20,7 @@`
`20`	`20`	`/*`
`21`	`21`	`* List of (keyword-name, keyword-token-value) pairs.`
`22`	`22`	`*`
`23`		`- * !!WARNING!!: This list must be sorted by ASCII name, because binary`
`24`		`- * search is used to locate entries.`
	`23`	`+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.`
`25`	`24`	`*/`
`26`	`25`
`27`	`26`	`/* name, value */`

`‎src/interfaces/ecpg/preproc/ecpg_kwlist.h‎`

Lines changed: 1 addition & 2 deletions

Original file line number	Diff line number	Diff line change
`@@ -20,8 +20,7 @@`
`20`	`20`	`/*`
`21`	`21`	`* List of (keyword-name, keyword-token-value) pairs.`
`22`	`22`	`*`
`23`		`- * !!WARNING!!: This list must be sorted by ASCII name, because binary`
`24`		`- * search is used to locate entries.`
	`23`	`+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.`
`25`	`24`	`*/`
`26`	`25`
`27`	`26`	`/* name, value */`

`‎src/pl/plpgsql/src/Makefile‎`

Lines changed: 8 additions & 5 deletions

Original file line number	Diff line number	Diff line change
`@@ -29,7 +29,10 @@ REGRESS_OPTS = --dbname=$(PL_TESTDB)`
`29`	`29`	`REGRESS = plpgsql_call plpgsql_control plpgsql_domain plpgsql_record\`
`30`	`30`	`plpgsql_cache plpgsql_transaction plpgsql_trigger plpgsql_varprops`
`31`	`31`
`32`		`-GEN_KEYWORDLIST =$(top_srcdir)/src/tools/gen_keywordlist.pl`
	`32`	`+# where to find gen_keywordlist.pl and subsidiary files`
	`33`	`+TOOLSDIR =$(top_srcdir)/src/tools`
	`34`	`+GEN_KEYWORDLIST =$(PERL) -I$(TOOLSDIR)$(TOOLSDIR)/gen_keywordlist.pl`
	`35`	`+GEN_KEYWORDLIST_DEPS =$(TOOLSDIR)/gen_keywordlist.pl$(TOOLSDIR)/PerfectHash.pm`
`33`	`36`
`34`	`37`	`all: all-lib`
`35`	`38`
`@@ -76,11 +79,11 @@ plerrcodes.h: $(top_srcdir)/src/backend/utils/errcodes.txt generate-plerrcodes.p`
`76`	`79`	`$(PERL)$(srcdir)/generate-plerrcodes.pl$<>$@`
`77`	`80`
`78`	`81`	`# generate keyword headers for the scanner`
`79`		`-pl_reserved_kwlist_d.h: pl_reserved_kwlist.h$(GEN_KEYWORDLIST)`
`80`		`-$(PERL)$(GEN_KEYWORDLIST) --varname ReservedPLKeywords$<`
	`82`	`+pl_reserved_kwlist_d.h: pl_reserved_kwlist.h$(GEN_KEYWORDLIST_DEPS)`
	`83`	`+$(GEN_KEYWORDLIST) --varname ReservedPLKeywords$<`
`81`	`84`
`82`		`-pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h$(GEN_KEYWORDLIST)`
`83`		`-$(PERL)$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords$<`
	`85`	`+pl_unreserved_kwlist_d.h: pl_unreserved_kwlist.h$(GEN_KEYWORDLIST_DEPS)`
	`86`	`+$(GEN_KEYWORDLIST) --varname UnreservedPLKeywords$<`
`84`	`87`
`85`	`88`
`86`	`89`	`check: submake`

`‎src/pl/plpgsql/src/pl_reserved_kwlist.h‎`

Lines changed: 2 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -20,10 +20,9 @@`
`20`	`20`	`/*`
`21`	`21`	`* List of (keyword-name, keyword-token-value) pairs.`
`22`	`22`	`*`
`23`		`- * Be careful not to put the same word in both lists.`
	`23`	`+ * Be careful not to put the same word into pl_unreserved_kwlist.h.`
`24`	`24`	`*`
`25`		`- * !!WARNING!!: This list must be sorted by ASCII name, because binary`
`26`		`- * search is used to locate entries.`
	`25`	`+ * Note: gen_keywordlist.pl requires the entries to appear in ASCII order.`
`27`	`26`	`*/`
`28`	`27`
`29`	`28`	`/* name, value */`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit c64d0cd

File tree

14 files changed

14 files changed

`‎src/common/Makefile‎`

`‎src/common/kwlookup.c‎`

`‎src/include/common/kwlookup.h‎`

`‎src/include/parser/kwlist.h‎`

`‎src/interfaces/ecpg/preproc/Makefile‎`

`‎src/interfaces/ecpg/preproc/c_keywords.c‎`

`‎src/interfaces/ecpg/preproc/c_kwlist.h‎`

`‎src/interfaces/ecpg/preproc/ecpg_kwlist.h‎`

`‎src/pl/plpgsql/src/Makefile‎`

`‎src/pl/plpgsql/src/pl_reserved_kwlist.h‎`

0 commit comments