NotificationsYou must be signed in to change notification settings
Fork5
Star26

Commit e00f68e

committed

Add caching of ctype.h/wctype.h results in regc_locale.c.

While this doesn't save a huge amount of runtime, it still seems worth doing, especially since I realized that the data copying I did in my first draft was quite unnecessary. In this version, once we have the results cached, getting them back for re-use is really very cheap.

Also, remove the hard-wired limitation to not consider wctype.h results for character codes above 255. It turns out that we can't push the limit as far up as I'd originally hoped, because the regex colormap code is not efficient enough to cope very well with character classes containing many thousand letters, which a Unicode locale is entirely capable of producing. Still, we can push it up to U+7FF (which I chose as the limit of 2-byte UTF8 characters), which will at least make Eastern Europeans happy pending a better solution. Thus, this commit resolves the specific complaint in bug #6457, but not the more general issue that letters of non-western alphabets are mostly not recognized as matching [[:alpha:]].

1 parent 27af914, commit e00f68e. Copy full SHA for e00f68e

File tree

2 files changed

+260

-81

lines changed

src/backend/regex
- regc_locale.c
- regc_pg_locale.c

2 files changed

+260

-81

lines changed

`‎src/backend/regex/regc_locale.c`

Lines changed: 39 additions & 80 deletions

Original file line number	Diff line number	Diff line change
`@@ -350,6 +350,16 @@ static const struct cname`
`350`	`350`	`};`
`351`	`351`
`352`	`352`
	`353`	`+/*`
	`354`	`+ * We do not use the hard-wired Unicode classification tables that Tcl does.`
	`355`	`+ * This is because (a) we need to deal with other encodings besides Unicode,`
	`356`	`+ * and (b) we want to track the behavior of the libc locale routines as`
	`357`	`+ * closely as possible. For example, it wouldn't be unreasonable for a`
	`358`	`+ * locale to not consider every Unicode letter as a letter. So we build`
	`359`	`+ * character classification cvecs by asking libc, even for Unicode.`
	`360`	`+ */`
	`361`	`+`
	`362`	`+`
`353`	`363`	`/*`
`354`	`364`	`* element - map collating-element name to celt`
`355`	`365`	`*/`
`@@ -489,7 +499,11 @@ eclass(struct vars * v,/* context */`
`489`	`499`	`/*`
`490`	`500`	`* cclass - supply cvec for a character class`
`491`	`501`	`*`
`492`		`- * Must include case counterparts on request.`
	`502`	`+ * Must include case counterparts if "cases" is true.`
	`503`	`+ *`
	`504`	`+ * The returned cvec might be either a transient cvec gotten from getcvec(),`
	`505`	`+ * or a permanently cached one from pg_ctype_get_cache(). This is okay`
	`506`	`+ * because callers are not supposed to explicitly free the result either way.`
`493`	`507`	`*/`
`494`	`508`	`static struct cvec *`
`495`	`509`	`cclass(struct vars * v,/* context */`
`@@ -548,79 +562,54 @@ cclass(struct vars * v,/* context */`
`548`	`562`	`index= (int)CC_ALPHA;`
`549`	`563`
`550`	`564`	`/*`
`551`		`- * Now compute the character class contents.`
`552`		`- *`
`553`		`- * For the moment, assume that only char codes < 256 can be in these`
`554`		`- * classes.`
	`565`	`+ * Now compute the character class contents. For classes that are`
	`566`	`+ * based on the behavior of a <wctype.h> or <ctype.h> function, we use`
	`567`	`+ * pg_ctype_get_cache so that we can cache the results. Other classes`
	`568`	`+ * have definitions that are hard-wired here, and for those we just`
	`569`	`+ * construct a transient cvec on the fly.`
`555`	`570`	`*/`
`556`	`571`
`557`	`572`	`switch ((enumclasses)index)`
`558`	`573`	`{`
`559`	`574`	`caseCC_PRINT:`
`560`		`-cv=getcvec(v,UCHAR_MAX,0);`
`561`		`-if (cv)`
`562`		`-{`
`563`		`-for (i=0;i <=UCHAR_MAX;i++)`
`564`		`-{`
`565`		`-if (pg_wc_isprint((chr)i))`
`566`		`-addchr(cv, (chr)i);`
`567`		`-}`
`568`		`-}`
	`575`	`+cv=pg_ctype_get_cache(pg_wc_isprint);`
`569`	`576`	`break;`
`570`	`577`	`caseCC_ALNUM:`
`571`		`-cv=getcvec(v,UCHAR_MAX,0);`
`572`		`-if (cv)`
`573`		`-{`
`574`		`-for (i=0;i <=UCHAR_MAX;i++)`
`575`		`-{`
`576`		`-if (pg_wc_isalnum((chr)i))`
`577`		`-addchr(cv, (chr)i);`
`578`		`-}`
`579`		`-}`
	`578`	`+cv=pg_ctype_get_cache(pg_wc_isalnum);`
`580`	`579`	`break;`
`581`	`580`	`caseCC_ALPHA:`
`582`		`-cv=getcvec(v,UCHAR_MAX,0);`
`583`		`-if (cv)`
`584`		`-{`
`585`		`-for (i=0;i <=UCHAR_MAX;i++)`
`586`		`-{`
`587`		`-if (pg_wc_isalpha((chr)i))`
`588`		`-addchr(cv, (chr)i);`
`589`		`-}`
`590`		`-}`
	`581`	`+cv=pg_ctype_get_cache(pg_wc_isalpha);`
`591`	`582`	`break;`
`592`	`583`	`caseCC_ASCII:`
	`584`	`+/* hard-wired meaning */`
`593`	`585`	`cv=getcvec(v,0,1);`
`594`	`586`	`if (cv)`
`595`	`587`	`addrange(cv,0,0x7f);`
`596`	`588`	`break;`
`597`	`589`	`caseCC_BLANK:`
	`590`	`+/* hard-wired meaning */`
`598`	`591`	`cv=getcvec(v,2,0);`
`599`	`592`	`addchr(cv,'\t');`
`600`	`593`	`addchr(cv,' ');`
`601`	`594`	`break;`
`602`	`595`	`caseCC_CNTRL:`
	`596`	`+/* hard-wired meaning */`
`603`	`597`	`cv=getcvec(v,0,2);`
`604`	`598`	`addrange(cv,0x0,0x1f);`
`605`	`599`	`addrange(cv,0x7f,0x9f);`
`606`	`600`	`break;`
`607`	`601`	`caseCC_DIGIT:`
`608`		`-cv=getcvec(v,0,1);`
`609`		`-if (cv)`
`610`		`-addrange(cv, (chr)'0', (chr)'9');`
	`602`	`+cv=pg_ctype_get_cache(pg_wc_isdigit);`
`611`	`603`	`break;`
`612`	`604`	`caseCC_PUNCT:`
`613`		`-cv=getcvec(v,UCHAR_MAX,0);`
`614`		`-if (cv)`
`615`		`-{`
`616`		`-for (i=0;i <=UCHAR_MAX;i++)`
`617`		`-{`
`618`		`-if (pg_wc_ispunct((chr)i))`
`619`		`-addchr(cv, (chr)i);`
`620`		`-}`
`621`		`-}`
	`605`	`+cv=pg_ctype_get_cache(pg_wc_ispunct);`
`622`	`606`	`break;`
`623`	`607`	`caseCC_XDIGIT:`
	`608`	`+/*`
	`609`	`+ * It's not clear how to define this in non-western locales, and`
	`610`	`+ * even less clear that there's any particular use in trying.`
	`611`	`+ * So just hard-wire the meaning.`
	`612`	`+ */`
`624`	`613`	`cv=getcvec(v,0,3);`
`625`	`614`	`if (cv)`
`626`	`615`	`{`
`@@ -630,50 +619,20 @@ cclass(struct vars * v,/* context */`
`630`	`619`	`}`
`631`	`620`	`break;`
`632`	`621`	`caseCC_SPACE:`
`633`		`-cv=getcvec(v,UCHAR_MAX,0);`
`634`		`-if (cv)`
`635`		`-{`
`636`		`-for (i=0;i <=UCHAR_MAX;i++)`
`637`		`-{`
`638`		`-if (pg_wc_isspace((chr)i))`
`639`		`-addchr(cv, (chr)i);`
`640`		`-}`
`641`		`-}`
	`622`	`+cv=pg_ctype_get_cache(pg_wc_isspace);`
`642`	`623`	`break;`
`643`	`624`	`caseCC_LOWER:`
`644`		`-cv=getcvec(v,UCHAR_MAX,0);`
`645`		`-if (cv)`
`646`		`-{`
`647`		`-for (i=0;i <=UCHAR_MAX;i++)`
`648`		`-{`
`649`		`-if (pg_wc_islower((chr)i))`
`650`		`-addchr(cv, (chr)i);`
`651`		`-}`
`652`		`-}`
	`625`	`+cv=pg_ctype_get_cache(pg_wc_islower);`
`653`	`626`	`break;`
`654`	`627`	`caseCC_UPPER:`
`655`		`-cv=getcvec(v,UCHAR_MAX,0);`
`656`		`-if (cv)`
`657`		`-{`
`658`		`-for (i=0;i <=UCHAR_MAX;i++)`
`659`		`-{`
`660`		`-if (pg_wc_isupper((chr)i))`
`661`		`-addchr(cv, (chr)i);`
`662`		`-}`
`663`		`-}`
	`628`	`+cv=pg_ctype_get_cache(pg_wc_isupper);`
`664`	`629`	`break;`
`665`	`630`	`caseCC_GRAPH:`
`666`		`-cv=getcvec(v,UCHAR_MAX,0);`
`667`		`-if (cv)`
`668`		`-{`
`669`		`-for (i=0;i <=UCHAR_MAX;i++)`
`670`		`-{`
`671`		`-if (pg_wc_isgraph((chr)i))`
`672`		`-addchr(cv, (chr)i);`
`673`		`-}`
`674`		`-}`
	`631`	`+cv=pg_ctype_get_cache(pg_wc_isgraph);`
`675`	`632`	`break;`
`676`	`633`	`}`
	`634`	`+`
	`635`	`+/* If cv is NULL now, the reason must be "out of memory" */`
`677`	`636`	`if (cv==NULL)`
`678`	`637`	`ERR(REG_ESPACE);`
`679`	`638`	`returncv;`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commite00f68e

File tree

2 files changed

2 files changed

`‎src/backend/regex/regc_locale.c`

0 commit comments