Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit d6061d2

Browse files
committed
Fix regex_fixed_prefix() to cope reasonably well with regex patterns of the
form '^(foo)$'. Before, these could never be optimized into indexscans. The recent changes to make psql and pg_dump generate such patterns (for \d commands and -t and related switches, respectively) therefore represented a big performance hit for people with large pg_class catalogs, as seen in recent gripe from Erik Jones. While at it, be more paranoid about case-sensitivity checking in multibyte encodings, and fix some other corner cases in which a regex might be interpreted too liberally.
1 parent 9c88830 commit d6061d2

File tree

3 files changed

+105
-46
lines changed

3 files changed

+105
-46
lines changed

‎src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@
88
*
99
*
1010
* IDENTIFICATION
11-
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.66 2006/10/04 00:29:59 momjian Exp $
11+
* $PostgreSQL: pgsql/src/backend/utils/adt/regexp.c,v 1.67 2007/01/03 22:39:26 tgl Exp $
1212
*
1313
*Alistair Crooks added the code for the regex caching
1414
*agc - cached the regular expressions used - there's a good chance
@@ -624,3 +624,12 @@ similar_escape(PG_FUNCTION_ARGS)
624624

625625
PG_RETURN_TEXT_P(result);
626626
}
627+
628+
/*
629+
* report whether regex_flavor is currently BASIC
630+
*/
631+
bool
632+
regex_flavor_is_basic(void)
633+
{
634+
return (regex_flavor==REG_BASIC);
635+
}

‎src/backend/utils/adt/selfuncs.c

Lines changed: 93 additions & 44 deletions
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@
1515
*
1616
*
1717
* IDENTIFICATION
18-
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $
18+
* $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
1919
*
2020
*-------------------------------------------------------------------------
2121
*/
@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
38053805
* These routines support analysis of LIKE and regular-expression patterns
38063806
* by the planner/optimizer. It's important that they agree with the
38073807
* regular-expression code in backend/regex/ and the LIKE code in
3808-
* backend/utils/adt/like.c.
3808+
* backend/utils/adt/like.c. Also, the computation of the fixed prefix
3809+
* must be conservative: if we report a string longer than the true fixed
3810+
* prefix, the query may produce actually wrong answers, rather than just
3811+
* getting a bad selectivity estimate!
38093812
*
38103813
* Note that the prefix-analysis functions are called from
38113814
* backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38373840
Oidtypeid=patt_const->consttype;
38383841
intpos,
38393842
match_pos;
3843+
boolis_multibyte= (pg_database_encoding_max_length()>1);
38403844

38413845
/* the right-hand const is type text or bytea */
38423846
Assert(typeid==BYTEAOID||typeid==TEXTOID);
@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38803884
}
38813885

38823886
/*
3883-
* XXXI suspect isalpha() is not an adequately locale-sensitive test
3884-
*for characters that can vary undercase folding?
3887+
* XXX In multibyte character sets, we can't trust isalpha, so assume
3888+
* any multibyte char is potentially case-varying.
38853889
*/
3886-
if (case_insensitive&&isalpha((unsignedchar)patt[pos]))
3887-
break;
3890+
if (case_insensitive)
3891+
{
3892+
if (is_multibyte&& (unsignedchar)patt[pos] >=0x80)
3893+
break;
3894+
if (isalpha((unsignedchar)patt[pos]))
3895+
break;
3896+
}
38883897

38893898
/*
38903899
* NOTE: this code used to think that %% meant a literal %, but
@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39293938
intpos,
39303939
match_pos,
39313940
prev_pos,
3932-
prev_match_pos,
3933-
paren_depth;
3941+
prev_match_pos;
3942+
boolhave_leading_paren;
39343943
char*patt;
39353944
char*rest;
39363945
Oidtypeid=patt_const->consttype;
3946+
boolis_basic=regex_flavor_is_basic();
3947+
boolis_multibyte= (pg_database_encoding_max_length()>1);
39373948

39383949
/*
39393950
* Should be unnecessary, there are no bytea regex operators defined. As
@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39483959
/* the right-hand const is type text for all of these */
39493960
patt=DatumGetCString(DirectFunctionCall1(textout,patt_const->constvalue));
39503961

3962+
/*
3963+
* Check for ARE director prefix. It's worth our trouble to recognize
3964+
* this because similar_escape() uses it.
3965+
*/
3966+
pos=0;
3967+
if (strncmp(patt,"***:",4)==0)
3968+
{
3969+
pos=4;
3970+
is_basic= false;
3971+
}
3972+
39513973
/* Pattern must be anchored left */
3952-
if (patt[0]!='^')
3974+
if (patt[pos]!='^')
39533975
{
39543976
rest=patt;
39553977

@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39583980

39593981
returnPattern_Prefix_None;
39603982
}
3983+
pos++;
39613984

39623985
/*
3963-
* If unquoted | is present at paren level 0 in pattern, then there are
3964-
* multiple alternatives for the start of the string.
3986+
* If '|' is present in pattern, then there may be multiple alternatives
3987+
* for the start of the string. (There are cases where this isn't so,
3988+
* for instance if the '|' is inside parens, but detecting that reliably
3989+
* is too hard.)
39653990
*/
3966-
paren_depth=0;
3967-
for (pos=1;patt[pos];pos++)
3991+
if (strchr(patt+pos,'|')!=NULL)
39683992
{
3969-
if (patt[pos]=='|'&&paren_depth==0)
3970-
{
3971-
rest=patt;
3993+
rest=patt;
39723994

3973-
*prefix_const=NULL;
3974-
*rest_const=string_to_const(rest,typeid);
3995+
*prefix_const=NULL;
3996+
*rest_const=string_to_const(rest,typeid);
39753997

3976-
returnPattern_Prefix_None;
3977-
}
3978-
elseif (patt[pos]=='(')
3979-
paren_depth++;
3980-
elseif (patt[pos]==')'&&paren_depth>0)
3981-
paren_depth--;
3982-
elseif (patt[pos]=='\\')
3983-
{
3984-
/* backslash quotes the next character */
3985-
pos++;
3986-
if (patt[pos]=='\0')
3987-
break;
3988-
}
3998+
returnPattern_Prefix_None;
39893999
}
39904000

39914001
/* OK, allocate space for pattern */
39924002
match=palloc(strlen(patt)+1);
39934003
prev_match_pos=match_pos=0;
39944004

3995-
/* note start at pos 1 to skip leading ^ */
3996-
for (prev_pos=pos=1;patt[pos];)
4005+
/*
4006+
* We special-case the syntax '^(...)$' because psql uses it. But beware:
4007+
* in BRE mode these parentheses are just ordinary characters. Also,
4008+
* sequences beginning "(?" are not what they seem, unless they're "(?:".
4009+
* (We should recognize that, too, because of similar_escape().)
4010+
*
4011+
* Note: it's a bit bogus to be depending on the current regex_flavor
4012+
* setting here, because the setting could change before the pattern is
4013+
* used. We minimize the risk by trusting the flavor as little as we can,
4014+
* but perhaps it would be a good idea to get rid of the "basic" setting.
4015+
*/
4016+
have_leading_paren= false;
4017+
if (patt[pos]=='('&& !is_basic&&
4018+
(patt[pos+1]!='?'||patt[pos+2]==':'))
4019+
{
4020+
have_leading_paren= true;
4021+
pos+= (patt[pos+1]!='?' ?1 :3);
4022+
}
4023+
4024+
/* Scan remainder of pattern */
4025+
prev_pos=pos;
4026+
while (patt[pos])
39974027
{
39984028
intlen;
39994029

40004030
/*
40014031
* Check for characters that indicate multiple possible matches here.
4002-
* XXX I suspect isalpha() is not an adequately locale-sensitive test
4003-
* for characters that can vary under case folding?
4032+
* Also, drop out at ')' or '$' so the termination test works right.
40044033
*/
40054034
if (patt[pos]=='.'||
40064035
patt[pos]=='('||
4036+
patt[pos]==')'||
40074037
patt[pos]=='['||
4008-
patt[pos]=='$'||
4009-
(case_insensitive&&isalpha((unsignedchar)patt[pos])))
4038+
patt[pos]=='^'||
4039+
patt[pos]=='$')
40104040
break;
40114041

40124042
/*
4013-
* In AREs, backslash followed by alphanumeric is an escape, not a
4014-
* quoted character. Must treat it as having multiple possible
4015-
* matches.
4043+
* XXX In multibyte character sets, we can't trust isalpha, so assume
4044+
* any multibyte char is potentially case-varying.
40164045
*/
4017-
if (patt[pos]=='\\'&&isalnum((unsignedchar)patt[pos+1]))
4018-
break;
4046+
if (case_insensitive)
4047+
{
4048+
if (is_multibyte&& (unsignedchar)patt[pos] >=0x80)
4049+
break;
4050+
if (isalpha((unsignedchar)patt[pos]))
4051+
break;
4052+
}
40194053

40204054
/*
40214055
* Check for quantifiers. Except for +, this means the preceding
40224056
* character is optional, so we must remove it from the prefix too!
4057+
* Note: in BREs, \{ is a quantifier.
40234058
*/
40244059
if (patt[pos]=='*'||
40254060
patt[pos]=='?'||
4026-
patt[pos]=='{')
4061+
patt[pos]=='{'||
4062+
(patt[pos]=='\\'&&patt[pos+1]=='{'))
40274063
{
40284064
match_pos=prev_match_pos;
40294065
pos=prev_pos;
@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40344070
pos=prev_pos;
40354071
break;
40364072
}
4073+
4074+
/*
4075+
* Normally, backslash quotes the next character. But in AREs,
4076+
* backslash followed by alphanumeric is an escape, not a quoted
4077+
* character. Must treat it as having multiple possible matches.
4078+
* In BREs, \( is a parenthesis, so don't trust that either.
4079+
* Note: since only ASCII alphanumerics are escapes, we don't have
4080+
* to be paranoid about multibyte here.
4081+
*/
40374082
if (patt[pos]=='\\')
40384083
{
4039-
/* backslash quotes the next character */
4084+
if (isalnum((unsignedchar)patt[pos+1])||patt[pos+1]=='(')
4085+
break;
40404086
pos++;
40414087
if (patt[pos]=='\0')
40424088
break;
@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40544100
match[match_pos]='\0';
40554101
rest=&patt[pos];
40564102

4103+
if (have_leading_paren&&patt[pos]==')')
4104+
pos++;
4105+
40574106
if (patt[pos]=='$'&&patt[pos+1]=='\0')
40584107
{
40594108
rest=&patt[pos+1];

‎src/include/utils/builtins.h

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
* Portions Copyright (c) 1996-2006, PostgreSQL Global Development Group
88
* Portions Copyright (c) 1994, Regents of the University of California
99
*
10-
* $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.283 2006/12/30 21:21:55 tgl Exp $
10+
* $PostgreSQL: pgsql/src/include/utils/builtins.h,v 1.284 2007/01/03 22:39:26 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -477,6 +477,7 @@ extern Datum textregexsubstr(PG_FUNCTION_ARGS);
477477
externDatumtextregexreplace_noopt(PG_FUNCTION_ARGS);
478478
externDatumtextregexreplace(PG_FUNCTION_ARGS);
479479
externDatumsimilar_escape(PG_FUNCTION_ARGS);
480+
externboolregex_flavor_is_basic(void);
480481

481482
/* regproc.c */
482483
externDatumregprocin(PG_FUNCTION_ARGS);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp