1515 *
1616 *
1717 * IDENTIFICATION
18- * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.216 2006/12/23 00:43:11 tgl Exp $
18+ * $PostgreSQL: pgsql/src/backend/utils/adt/selfuncs.c,v 1.217 2007/01/03 22:39:26 tgl Exp $
1919 *
2020 *-------------------------------------------------------------------------
2121 */
@@ -3805,7 +3805,10 @@ get_variable_maximum(PlannerInfo *root, VariableStatData *vardata,
38053805 * These routines support analysis of LIKE and regular-expression patterns
38063806 * by the planner/optimizer. It's important that they agree with the
38073807 * regular-expression code in backend/regex/ and the LIKE code in
3808- * backend/utils/adt/like.c.
3808+ * backend/utils/adt/like.c. Also, the computation of the fixed prefix
3809+ * must be conservative: if we report a string longer than the true fixed
3810+ * prefix, the query may actually produce wrong answers, rather than just
3811+ * getting a bad selectivity estimate!
38093812 *
38103813 * Note that the prefix-analysis functions are called from
38113814 * backend/optimizer/path/indxpath.c as well as from routines in this file.
@@ -3837,6 +3840,7 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38373840Oid typeid = patt_const -> consttype ;
38383841int pos ,
38393842match_pos ;
3843+ bool is_multibyte = (pg_database_encoding_max_length ()> 1 );
38403844
38413845/* the right-hand const is type text or bytea */
38423846Assert (typeid == BYTEAOID || typeid == TEXTOID );
@@ -3880,11 +3884,16 @@ like_fixed_prefix(Const *patt_const, bool case_insensitive,
38803884}
38813885
38823886/*
3883- * XXXI suspect isalpha() is not an adequately locale-sensitive test
3884- *for characters that can vary under case folding?
3887+ * XXX In multibyte character sets, we can't trust isalpha, so assume
3888+ * any multibyte char is potentially case-varying.
38853889 */
3886- if (case_insensitive && isalpha ((unsignedchar )patt [pos ]))
3887- break ;
3890+ if (case_insensitive )
3891+ {
3892+ if (is_multibyte && (unsignedchar )patt [pos ] >=0x80 )
3893+ break ;
3894+ if (isalpha ((unsignedchar )patt [pos ]))
3895+ break ;
3896+ }
38883897
38893898/*
38903899 * NOTE: this code used to think that %% meant a literal %, but
@@ -3929,11 +3938,13 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39293938int pos ,
39303939match_pos ,
39313940prev_pos ,
3932- prev_match_pos ,
3933- paren_depth ;
3941+ prev_match_pos ;
3942+ bool have_leading_paren ;
39343943char * patt ;
39353944char * rest ;
39363945Oid typeid = patt_const -> consttype ;
3946+ bool is_basic = regex_flavor_is_basic ();
3947+ bool is_multibyte = (pg_database_encoding_max_length ()> 1 );
39373948
39383949/*
39393950 * Should be unnecessary, there are no bytea regex operators defined. As
@@ -3948,8 +3959,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39483959/* the right-hand const is type text for all of these */
39493960patt = DatumGetCString (DirectFunctionCall1 (textout ,patt_const -> constvalue ));
39503961
3962+ /*
3963+ * Check for ARE director prefix. It's worth our trouble to recognize
3964+ * this because similar_escape() uses it.
3965+ */
3966+ pos = 0 ;
3967+ if (strncmp (patt ,"***:" ,4 )== 0 )
3968+ {
3969+ pos = 4 ;
3970+ is_basic = false;
3971+ }
3972+
39513973/* Pattern must be anchored left */
3952- if (patt [0 ]!= '^' )
3974+ if (patt [pos ]!= '^' )
39533975{
39543976rest = patt ;
39553977
@@ -3958,72 +3980,86 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
39583980
39593981return Pattern_Prefix_None ;
39603982}
3983+ pos ++ ;
39613984
39623985/*
3963- * If unquoted | is present at paren level 0 in pattern, then there are
3964- * multiple alternatives for the start of the string.
3986+ * If '|' is present in pattern, then there may be multiple alternatives
3987+ * for the start of the string. (There are cases where this isn't so,
3988+ * for instance if the '|' is inside parens, but detecting that reliably
3989+ * is too hard.)
39653990 */
3966- paren_depth = 0 ;
3967- for (pos = 1 ;patt [pos ];pos ++ )
3991+ if (strchr (patt + pos ,'|' )!= NULL )
39683992{
3969- if (patt [pos ]== '|' && paren_depth == 0 )
3970- {
3971- rest = patt ;
3993+ rest = patt ;
39723994
3973- * prefix_const = NULL ;
3974- * rest_const = string_to_const (rest ,typeid );
3995+ * prefix_const = NULL ;
3996+ * rest_const = string_to_const (rest ,typeid );
39753997
3976- return Pattern_Prefix_None ;
3977- }
3978- else if (patt [pos ]== '(' )
3979- paren_depth ++ ;
3980- else if (patt [pos ]== ')' && paren_depth > 0 )
3981- paren_depth -- ;
3982- else if (patt [pos ]== '\\' )
3983- {
3984- /* backslash quotes the next character */
3985- pos ++ ;
3986- if (patt [pos ]== '\0' )
3987- break ;
3988- }
3998+ return Pattern_Prefix_None ;
39893999}
39904000
39914001/* OK, allocate space for pattern */
39924002match = palloc (strlen (patt )+ 1 );
39934003prev_match_pos = match_pos = 0 ;
39944004
3995- /* note start at pos 1 to skip leading ^ */
3996- for (prev_pos = pos = 1 ;patt [pos ];)
4005+ /*
4006+ * We special-case the syntax '^(...)$' because psql uses it. But beware:
4007+ * in BRE mode these parentheses are just ordinary characters. Also,
4008+ * sequences beginning "(?" are not what they seem, unless they're "(?:".
4009+ * (We should recognize that, too, because of similar_escape().)
4010+ *
4011+ * Note: it's a bit bogus to be depending on the current regex_flavor
4012+ * setting here, because the setting could change before the pattern is
4013+ * used. We minimize the risk by trusting the flavor as little as we can,
4014+ * but perhaps it would be a good idea to get rid of the "basic" setting.
4015+ */
4016+ have_leading_paren = false;
4017+ if (patt [pos ]== '(' && !is_basic &&
4018+ (patt [pos + 1 ]!= '?' || patt [pos + 2 ]== ':' ))
4019+ {
4020+ have_leading_paren = true;
4021+ pos += (patt [pos + 1 ]!= '?' ?1 :3 );
4022+ }
4023+
4024+ /* Scan remainder of pattern */
4025+ prev_pos = pos ;
4026+ while (patt [pos ])
39974027{
39984028int len ;
39994029
40004030/*
40014031 * Check for characters that indicate multiple possible matches here.
4002- * XXX I suspect isalpha() is not an adequately locale-sensitive test
4003- * for characters that can vary under case folding?
4032+ * Also, drop out at ')' or '$' so the termination test works right.
40044033 */
40054034if (patt [pos ]== '.' ||
40064035patt [pos ]== '(' ||
4036+ patt [pos ]== ')' ||
40074037patt [pos ]== '[' ||
4008- patt [pos ]== '$ ' ||
4009- ( case_insensitive && isalpha (( unsigned char ) patt [pos ])) )
4038+ patt [pos ]== '^ ' ||
4039+ patt [pos ]== '$' )
40104040break ;
40114041
40124042/*
4013- * In AREs, backslash followed by alphanumeric is an escape, not a
4014- * quoted character. Must treat it as having multiple possible
4015- * matches.
4043+ * XXX In multibyte character sets, we can't trust isalpha, so assume
4044+ * any multibyte char is potentially case-varying.
40164045 */
4017- if (patt [pos ]== '\\' && isalnum ((unsignedchar )patt [pos + 1 ]))
4018- break ;
4046+ if (case_insensitive )
4047+ {
4048+ if (is_multibyte && (unsignedchar )patt [pos ] >=0x80 )
4049+ break ;
4050+ if (isalpha ((unsignedchar )patt [pos ]))
4051+ break ;
4052+ }
40194053
40204054/*
40214055 * Check for quantifiers. Except for +, this means the preceding
40224056 * character is optional, so we must remove it from the prefix too!
4057+ * Note: in BREs, \{ is a quantifier.
40234058 */
40244059if (patt [pos ]== '*' ||
40254060patt [pos ]== '?' ||
4026- patt [pos ]== '{' )
4061+ patt [pos ]== '{' ||
4062+ (patt [pos ]== '\\' && patt [pos + 1 ]== '{' ))
40274063{
40284064match_pos = prev_match_pos ;
40294065pos = prev_pos ;
@@ -4034,9 +4070,19 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40344070pos = prev_pos ;
40354071break ;
40364072}
4073+
4074+ /*
4075+ * Normally, backslash quotes the next character. But in AREs,
4076+ * backslash followed by alphanumeric is an escape, not a quoted
4077+ * character. Must treat it as having multiple possible matches.
4078+ * In BREs, \( is a parenthesis, so don't trust that either.
4079+ * Note: since only ASCII alphanumerics are escapes, we don't have
4080+ * to be paranoid about multibyte here.
4081+ */
40374082if (patt [pos ]== '\\' )
40384083{
4039- /* backslash quotes the next character */
4084+ if (isalnum ((unsignedchar )patt [pos + 1 ])|| patt [pos + 1 ]== '(' )
4085+ break ;
40404086pos ++ ;
40414087if (patt [pos ]== '\0' )
40424088break ;
@@ -4054,6 +4100,9 @@ regex_fixed_prefix(Const *patt_const, bool case_insensitive,
40544100match [match_pos ]= '\0' ;
40554101rest = & patt [pos ];
40564102
4103+ if (have_leading_paren && patt [pos ]== ')' )
4104+ pos ++ ;
4105+
40574106if (patt [pos ]== '$' && patt [pos + 1 ]== '\0' )
40584107{
40594108rest = & patt [pos + 1 ];