Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit e00f68e

Browse files
committed
Add caching of ctype.h/wctype.h results in regc_locale.c.
While this doesn't save a huge amount of runtime, it still seems worth doing, especially since I realized that the data copying I did in my first draft was quite unnecessary. In this version, once we have the results cached, getting them back for re-use is really very cheap. Also, remove the hard-wired limitation to not consider wctype.h results for character codes above 255. It turns out that we can't push the limit as far up as I'd originally hoped, because the regex colormap code is not efficient enough to cope very well with character classes containing many thousand letters, which a Unicode locale is entirely capable of producing. Still, we can push it up to U+7FF (which I chose as the limit of 2-byte UTF8 characters), which will at least make Eastern Europeans happy pending a better solution. Thus, this commit resolves the specific complaint in bug #6457, but not the more general issue that letters of non-western alphabets are mostly not recognized as matching [[:alpha:]].
1 parent 27af914 commit e00f68e

File tree

2 files changed

+260
-81
lines changed

2 files changed

+260
-81
lines changed

‎src/backend/regex/regc_locale.c

Lines changed: 39 additions & 80 deletions
Original file line number | Diff line number | Diff line change
@@ -350,6 +350,16 @@ static const struct cname
350350
};
351351

352352

353+
/*
354+
* We do not use the hard-wired Unicode classification tables that Tcl does.
355+
* This is because (a) we need to deal with other encodings besides Unicode,
356+
* and (b) we want to track the behavior of the libc locale routines as
357+
* closely as possible. For example, it wouldn't be unreasonable for a
358+
* locale to not consider every Unicode letter as a letter. So we build
359+
* character classification cvecs by asking libc, even for Unicode.
360+
*/
361+
362+
353363
/*
354364
* element - map collating-element name to celt
355365
*/
@@ -489,7 +499,11 @@ eclass(struct vars * v,/* context */
489499
/*
490500
* cclass - supply cvec for a character class
491501
*
492-
* Must include case counterparts on request.
502+
* Must include case counterparts if "cases" is true.
503+
*
504+
* The returned cvec might be either a transient cvec gotten from getcvec(),
505+
* or a permanently cached one from pg_ctype_get_cache(). This is okay
506+
* because callers are not supposed to explicitly free the result either way.
493507
*/
494508
staticstructcvec*
495509
cclass(structvars*v,/* context */
@@ -548,79 +562,54 @@ cclass(struct vars * v,/* context */
548562
index= (int)CC_ALPHA;
549563

550564
/*
551-
* Now compute the character class contents.
552-
*
553-
* For the moment, assume that only char codes < 256 can be in these
554-
* classes.
565+
* Now compute the character class contents. For classes that are
566+
* based on the behavior of a <wctype.h> or <ctype.h> function, we use
567+
* pg_ctype_get_cache so that we can cache the results. Other classes
568+
* have definitions that are hard-wired here, and for those we just
569+
* construct a transient cvec on the fly.
555570
*/
556571

557572
switch ((enumclasses)index)
558573
{
559574
caseCC_PRINT:
560-
cv=getcvec(v,UCHAR_MAX,0);
561-
if (cv)
562-
{
563-
for (i=0;i <=UCHAR_MAX;i++)
564-
{
565-
if (pg_wc_isprint((chr)i))
566-
addchr(cv, (chr)i);
567-
}
568-
}
575+
cv=pg_ctype_get_cache(pg_wc_isprint);
569576
break;
570577
caseCC_ALNUM:
571-
cv=getcvec(v,UCHAR_MAX,0);
572-
if (cv)
573-
{
574-
for (i=0;i <=UCHAR_MAX;i++)
575-
{
576-
if (pg_wc_isalnum((chr)i))
577-
addchr(cv, (chr)i);
578-
}
579-
}
578+
cv=pg_ctype_get_cache(pg_wc_isalnum);
580579
break;
581580
caseCC_ALPHA:
582-
cv=getcvec(v,UCHAR_MAX,0);
583-
if (cv)
584-
{
585-
for (i=0;i <=UCHAR_MAX;i++)
586-
{
587-
if (pg_wc_isalpha((chr)i))
588-
addchr(cv, (chr)i);
589-
}
590-
}
581+
cv=pg_ctype_get_cache(pg_wc_isalpha);
591582
break;
592583
caseCC_ASCII:
584+
/* hard-wired meaning */
593585
cv=getcvec(v,0,1);
594586
if (cv)
595587
addrange(cv,0,0x7f);
596588
break;
597589
caseCC_BLANK:
590+
/* hard-wired meaning */
598591
cv=getcvec(v,2,0);
599592
addchr(cv,'\t');
600593
addchr(cv,' ');
601594
break;
602595
caseCC_CNTRL:
596+
/* hard-wired meaning */
603597
cv=getcvec(v,0,2);
604598
addrange(cv,0x0,0x1f);
605599
addrange(cv,0x7f,0x9f);
606600
break;
607601
caseCC_DIGIT:
608-
cv=getcvec(v,0,1);
609-
if (cv)
610-
addrange(cv, (chr)'0', (chr)'9');
602+
cv=pg_ctype_get_cache(pg_wc_isdigit);
611603
break;
612604
caseCC_PUNCT:
613-
cv=getcvec(v,UCHAR_MAX,0);
614-
if (cv)
615-
{
616-
for (i=0;i <=UCHAR_MAX;i++)
617-
{
618-
if (pg_wc_ispunct((chr)i))
619-
addchr(cv, (chr)i);
620-
}
621-
}
605+
cv=pg_ctype_get_cache(pg_wc_ispunct);
622606
break;
623607
caseCC_XDIGIT:
608+
/*
609+
* It's not clear how to define this in non-western locales, and
610+
* even less clear that there's any particular use in trying.
611+
* So just hard-wire the meaning.
612+
*/
624613
cv=getcvec(v,0,3);
625614
if (cv)
626615
{
@@ -630,50 +619,20 @@ cclass(struct vars * v,/* context */
630619
}
631620
break;
632621
caseCC_SPACE:
633-
cv=getcvec(v,UCHAR_MAX,0);
634-
if (cv)
635-
{
636-
for (i=0;i <=UCHAR_MAX;i++)
637-
{
638-
if (pg_wc_isspace((chr)i))
639-
addchr(cv, (chr)i);
640-
}
641-
}
622+
cv=pg_ctype_get_cache(pg_wc_isspace);
642623
break;
643624
caseCC_LOWER:
644-
cv=getcvec(v,UCHAR_MAX,0);
645-
if (cv)
646-
{
647-
for (i=0;i <=UCHAR_MAX;i++)
648-
{
649-
if (pg_wc_islower((chr)i))
650-
addchr(cv, (chr)i);
651-
}
652-
}
625+
cv=pg_ctype_get_cache(pg_wc_islower);
653626
break;
654627
caseCC_UPPER:
655-
cv=getcvec(v,UCHAR_MAX,0);
656-
if (cv)
657-
{
658-
for (i=0;i <=UCHAR_MAX;i++)
659-
{
660-
if (pg_wc_isupper((chr)i))
661-
addchr(cv, (chr)i);
662-
}
663-
}
628+
cv=pg_ctype_get_cache(pg_wc_isupper);
664629
break;
665630
caseCC_GRAPH:
666-
cv=getcvec(v,UCHAR_MAX,0);
667-
if (cv)
668-
{
669-
for (i=0;i <=UCHAR_MAX;i++)
670-
{
671-
if (pg_wc_isgraph((chr)i))
672-
addchr(cv, (chr)i);
673-
}
674-
}
631+
cv=pg_ctype_get_cache(pg_wc_isgraph);
675632
break;
676633
}
634+
635+
/* If cv is NULL now, the reason must be "out of memory" */
677636
if (cv==NULL)
678637
ERR(REG_ESPACE);
679638
returncv;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp