Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit628cbb5

Browse files
committed
Re-implement extraction of fixed prefixes from regular expressions.
To generate btree-indexable conditions from regex WHERE conditions (such asWHERE indexed_col ~ '^foo'), we need to be able to identify any fixedprefix that a regex might have; that is, find any string that must be aprefix of all strings satisfying the regex. We used to do that withentirely ad-hoc code that looked at the source text of the regex. Itdidn't know very much about regex syntax, which mostly meant that it wouldfail to identify some optimizable cases; but Viktor Rosenfeld reported thatit would produce actively wrong answers for quantified parenthesizedsubexpressions, such as '^(foo)?bar'. Rather than trying to extend thead-hoc code to cover this, let's get rid of it altogether in favor ofidentifying prefixes by examining the compiled form of a regex.To do this, I've added a new entry point "pg_regprefix" to the regex library;hopefully it is defined in a sufficiently general fashion that it can remainin the library when/if that code gets split out as a standalone project.Since this bug has been there for a very long time, this fix needs to getback-patched. However it depends on some other recent commits (particularlythe addition of wchar-to-database-encoding conversion), so I'll commit thisseparately and then go to work on back-porting the necessary fixes.
1 parent00dac60 commit628cbb5

File tree

11 files changed

+461
-178
lines changed

11 files changed

+461
-178
lines changed

‎src/backend/regex/Makefile

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@ subdir = src/backend/regex
1212
top_builddir = ../../..
1313
include$(top_builddir)/src/Makefile.global
1414

15-
OBJS = regcomp.o regerror.o regexec.o regfree.o
15+
OBJS = regcomp.o regerror.o regexec.o regfree.o regprefix.o
1616

1717
include$(top_srcdir)/src/backend/common.mk
1818

‎src/backend/regex/README

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,12 +7,13 @@ So this file is an attempt to reverse-engineer some docs.
77
General source-file layout
88
--------------------------
99

10-
There arefour separately-compilable source files, each exposing exactly
10+
There arefive separately-compilable source files, each exposing exactly
1111
one exported function:
1212
regcomp.c: pg_regcomp
1313
regexec.c: pg_regexec
1414
regerror.c: pg_regerror
1515
regfree.c: pg_regfree
16+
regprefix.c: pg_regprefix
1617
(The pg_ prefixes were added by the Postgres project to distinguish this
1718
library version from any similar one that might be present on a particular
1819
system. They'd need to be removed or replaced in any standalone version
@@ -44,6 +45,7 @@ regexec.cTop-level regex execution code
4445
rege_dfa.cDFA creation and execution
4546
regerror.cpg_regerror: generate text for a regex error code
4647
regfree.cpg_regfree: API to free a no-longer-needed regex_t
48+
regprefix.cCode for extracting a common prefix from a regex_t
4749

4850
The locale-specific code is concerned primarily with case-folding and with
4951
expanding locale-specific character classes, such as [[:alnum:]]. It

‎src/backend/regex/regc_color.c

Lines changed: 10 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,9 @@ initcm(struct vars * v,
6666
cd=cm->cd;/* cm->cd[WHITE] */
6767
cd->sub=NOSUB;
6868
cd->arcs=NULL;
69-
cd->flags=0;
69+
cd->firstchr=CHR_MIN;
7070
cd->nchrs=CHR_MAX-CHR_MIN+1;
71+
cd->flags=0;
7172

7273
/* upper levels of tree */
7374
for (t=&cm->tree[0],j=NBYTS-1;j>0;t=nextt,j--)
@@ -272,6 +273,7 @@ newcolor(struct colormap * cm)
272273
cd->nchrs=0;
273274
cd->sub=NOSUB;
274275
cd->arcs=NULL;
276+
cd->firstchr=CHR_MIN;/* in case never set otherwise */
275277
cd->flags=0;
276278
cd->block=NULL;
277279

@@ -371,6 +373,8 @@ subcolor(struct colormap * cm, chr c)
371373
if (co==sco)/* already in an open subcolor */
372374
returnco;/* rest is redundant */
373375
cm->cd[co].nchrs--;
376+
if (cm->cd[sco].nchrs==0)
377+
cm->cd[sco].firstchr=c;
374378
cm->cd[sco].nchrs++;
375379
setcolor(cm,c,sco);
376380
returnsco;
@@ -438,6 +442,11 @@ subrange(struct vars * v,
438442

439443
/*
440444
* subblock - allocate new subcolors for one tree block of chrs, fill in arcs
445+
*
446+
* Note: subcolors that are created during execution of this function
447+
* will not be given a useful value of firstchr; it'll be left as CHR_MIN.
448+
* For the current usage of firstchr in pg_regprefix, this does not matter
449+
* because such subcolors won't occur in the common prefix of a regex.
441450
*/
442451
staticvoid
443452
subblock(structvars*v,

‎src/backend/regex/regprefix.c

Lines changed: 259 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,259 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* regprefix.c
4+
* Extract a common prefix, if any, from a compiled regex.
5+
*
6+
*
7+
* Portions Copyright (c) 2012, PostgreSQL Global Development Group
8+
* Portions Copyright (c) 1998, 1999 Henry Spencer
9+
*
10+
* IDENTIFICATION
11+
* src/backend/regex/regprefix.c
12+
*
13+
*-------------------------------------------------------------------------
14+
*/
15+
16+
#include"regex/regguts.h"
17+
18+
19+
/*
20+
* forward declarations
21+
*/
22+
staticintfindprefix(structcnfa*cnfa,structcolormap*cm,
23+
chr*string,size_t*slength);
24+
25+
26+
/*
27+
* pg_regprefix - get common prefix for regular expression
28+
*
29+
* Returns one of:
30+
*REG_NOMATCH: there is no common prefix of strings matching the regex
31+
*REG_PREFIX: there is a common prefix of strings matching the regex
32+
*REG_EXACT: all strings satisfying the regex must match the same string
33+
*or a REG_XXX error code
34+
*
35+
* In the non-failure cases, *string is set to a malloc'd string containing
36+
* the common prefix or exact value, of length *slength (measured in chrs
37+
* not bytes!).
38+
*
39+
* This function does not analyze all complex cases (such as lookahead
40+
* constraints) exactly. Therefore it is possible that some strings matching
41+
* the reported prefix or exact-match string do not satisfy the regex. But
42+
* it should never be the case that a string satisfying the regex does not
43+
* match the reported prefix or exact-match string.
44+
*/
45+
int
46+
pg_regprefix(regex_t*re,
47+
chr**string,
48+
size_t*slength)
49+
{
50+
structguts*g;
51+
structcnfa*cnfa;
52+
intst;
53+
54+
/* sanity checks */
55+
if (string==NULL||slength==NULL)
56+
returnREG_INVARG;
57+
*string=NULL;/* initialize for failure cases */
58+
*slength=0;
59+
if (re==NULL||re->re_magic!=REMAGIC)
60+
returnREG_INVARG;
61+
if (re->re_csize!=sizeof(chr))
62+
returnREG_MIXED;
63+
64+
/* Initialize locale-dependent support */
65+
pg_set_regex_collation(re->re_collation);
66+
67+
/* setup */
68+
g= (structguts*)re->re_guts;
69+
if (g->info&REG_UIMPOSSIBLE)
70+
returnREG_NOMATCH;
71+
72+
/*
73+
* This implementation considers only the search NFA for the topmost regex
74+
* tree node. Therefore, constraints such as backrefs are not fully
75+
* applied, which is allowed per the function's API spec.
76+
*/
77+
assert(g->tree!=NULL);
78+
cnfa=&g->tree->cnfa;
79+
80+
/*
81+
* Since a correct NFA should never contain any exit-free loops, it should
82+
* not be possible for our traversal to return to a previously visited
83+
* NFA state. Hence we need at most nstates chrs in the output string.
84+
*/
85+
*string= (chr*)MALLOC(cnfa->nstates*sizeof(chr));
86+
if (*string==NULL)
87+
returnREG_ESPACE;
88+
89+
/* do it */
90+
st=findprefix(cnfa,&g->cmap,*string,slength);
91+
92+
assert(*slength <=cnfa->nstates);
93+
94+
/* clean up */
95+
if (st!=REG_PREFIX&&st!=REG_EXACT)
96+
{
97+
FREE(*string);
98+
*string=NULL;
99+
*slength=0;
100+
}
101+
102+
returnst;
103+
}
104+
105+
/*
106+
* findprefix - extract common prefix from cNFA
107+
*
108+
* Results are returned into the preallocated chr array string[], with
109+
* *slength (which must be preset to zero) incremented for each chr.
110+
*/
111+
staticint/* regprefix return code */
112+
findprefix(structcnfa*cnfa,
113+
structcolormap*cm,
114+
chr*string,
115+
size_t*slength)
116+
{
117+
intst;
118+
intnextst;
119+
colorthiscolor;
120+
chrc;
121+
structcarc*ca;
122+
123+
/*
124+
* The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125+
* anchored left. If we have both BOS and BOL, they must go to the
126+
* same next state.
127+
*/
128+
st=cnfa->pre;
129+
nextst=-1;
130+
for (ca=cnfa->states[st];ca->co!=COLORLESS;ca++)
131+
{
132+
if (ca->co==cnfa->bos[0]||ca->co==cnfa->bos[1])
133+
{
134+
if (nextst==-1)
135+
nextst=ca->to;
136+
elseif (nextst!=ca->to)
137+
returnREG_NOMATCH;
138+
}
139+
else
140+
returnREG_NOMATCH;
141+
}
142+
if (nextst==-1)
143+
returnREG_NOMATCH;
144+
145+
/*
146+
* Scan through successive states, stopping as soon as we find one with
147+
* more than one acceptable transition character (either multiple colors
148+
* on out-arcs, or a color with more than one member chr).
149+
*
150+
* We could find a state with multiple out-arcs that are all labeled with
151+
* the same singleton color; this comes from patterns like "^ab(cde|cxy)".
152+
* In that case we add the chr "c" to the output string but then exit the
153+
* loop with nextst == -1. This leaves a little bit on the table: if the
154+
* pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
155+
* to the prefix. But chasing multiple parallel state chains doesn't seem
156+
* worth the trouble.
157+
*/
158+
do
159+
{
160+
st=nextst;
161+
nextst=-1;
162+
thiscolor=COLORLESS;
163+
for (ca=cnfa->states[st];ca->co!=COLORLESS;ca++)
164+
{
165+
/* We ignore lookahead constraints */
166+
if (ca->co >=cnfa->ncolors)
167+
continue;
168+
/* We can also ignore BOS/BOL arcs */
169+
if (ca->co==cnfa->bos[0]||ca->co==cnfa->bos[1])
170+
continue;
171+
/* ... but EOS/EOL arcs terminate the search */
172+
if (ca->co==cnfa->eos[0]||ca->co==cnfa->eos[1])
173+
{
174+
thiscolor=COLORLESS;
175+
break;
176+
}
177+
if (thiscolor==COLORLESS)
178+
{
179+
/* First plain outarc */
180+
thiscolor=ca->co;
181+
nextst=ca->to;
182+
}
183+
elseif (thiscolor==ca->co)
184+
{
185+
/* Another plain outarc for same color */
186+
nextst=-1;
187+
}
188+
else
189+
{
190+
/* More than one plain outarc color terminates the search */
191+
thiscolor=COLORLESS;
192+
break;
193+
}
194+
}
195+
/* Done if we didn't find exactly one color on plain outarcs */
196+
if (thiscolor==COLORLESS)
197+
break;
198+
/* The color must be a singleton */
199+
if (cm->cd[thiscolor].nchrs!=1)
200+
break;
201+
202+
/*
203+
* Identify the color's sole member chr and add it to the prefix
204+
* string. In general the colormap data structure doesn't provide a
205+
* way to find color member chrs, except by trying GETCOLOR() on each
206+
* possible chr value, which won't do at all. However, for the cases
207+
* we care about it should be sufficient to test the "firstchr" value,
208+
* that is the first chr ever added to the color. There are cases
209+
* where this might no longer be a member of the color (so we do need
210+
* to test), but none of them are likely to arise for a character that
211+
* is a member of a common prefix. If we do hit such a corner case,
212+
* we just fall out without adding anything to the prefix string.
213+
*/
214+
c=cm->cd[thiscolor].firstchr;
215+
if (GETCOLOR(cm,c)!=thiscolor)
216+
break;
217+
218+
string[(*slength)++]=c;
219+
220+
/* Advance to next state, but only if we have a unique next state */
221+
}while (nextst!=-1);
222+
223+
/*
224+
* If we ended at a state that only has EOS/EOL outarcs leading to the
225+
* "post" state, then we have an exact-match string. Note this is true
226+
* even if the string is of zero length.
227+
*/
228+
nextst=-1;
229+
for (ca=cnfa->states[st];ca->co!=COLORLESS;ca++)
230+
{
231+
if (ca->co==cnfa->eos[0]||ca->co==cnfa->eos[1])
232+
{
233+
if (nextst==-1)
234+
nextst=ca->to;
235+
elseif (nextst!=ca->to)
236+
{
237+
nextst=-1;
238+
break;
239+
}
240+
}
241+
else
242+
{
243+
nextst=-1;
244+
break;
245+
}
246+
}
247+
if (nextst==cnfa->post)
248+
returnREG_EXACT;
249+
250+
/*
251+
* Otherwise, if we were unable to identify any prefix characters, say
252+
* NOMATCH --- the pattern is anchored left, but doesn't specify any
253+
* particular first character.
254+
*/
255+
if (*slength>0)
256+
returnREG_PREFIX;
257+
258+
returnREG_NOMATCH;
259+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp