Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit0e6aa87

Browse files
committed
Avoid determining regexp subexpression matches, when possible.
Identifying the precise match locations for parenthesized subexpressionsis a fairly expensive task given the way our regexp engine works, bothat regexp compile time (where we must create an optimized NFA for eachparenthesized subexpression) and at runtime (where determining exactmatch locations requires laborious search).Up to now we've made little attempt to optimize this situation. Thispatch identifies cases where we know at compile time that we won'tneed to know subexpression match locations, and teaches the regexpcompiler to not bother creating per-subexpression regexps forparenthesis pairs that are not referenced by backrefs elsewhere inthe regexp. (To preserve semantics, we obviously still have topin down the match locations of backref references.) Users couldhave obtained the same results before this by being careful towrite "non capturing" parentheses wherever possible, but few peoplebother with that.Discussion:https://postgr.es/m/2219936.1628115334@sss.pgh.pa.us
1 parent76ad244 commit0e6aa87

File tree

10 files changed

+154
-45
lines changed

10 files changed

+154
-45
lines changed

‎contrib/pg_trgm/trgm_regexp.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -541,9 +541,11 @@ createTrgmNFA(text *text_re, Oid collation,
541541
* Stage 1: Compile the regexp into a NFA, using the regexp library.
542542
*/
543543
#ifdefIGNORECASE
544-
RE_compile(&regex,text_re,REG_ADVANCED |REG_ICASE,collation);
544+
RE_compile(&regex,text_re,
545+
REG_ADVANCED |REG_NOSUB |REG_ICASE,collation);
545546
#else
546-
RE_compile(&regex,text_re,REG_ADVANCED,collation);
547+
RE_compile(&regex,text_re,
548+
REG_ADVANCED |REG_NOSUB,collation);
547549
#endif
548550

549551
/*

‎src/backend/regex/regc_lex.c

Lines changed: 1 addition & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -528,10 +528,7 @@ next(struct vars *v)
528528
}
529529
assert(NOTREACHED);
530530
}
531-
if (v->cflags&REG_NOSUB)
532-
RETV('(',0);/* all parens non-capturing */
533-
else
534-
RETV('(',1);
531+
RETV('(',1);
535532
break;
536533
caseCHR(')'):
537534
if (LASTTYPE('('))

‎src/backend/regex/regcomp.c

Lines changed: 54 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,7 @@ static struct subre *subre(struct vars *, int, int, struct state *, struct state
6565
staticvoidfreesubre(structvars*,structsubre*);
6666
staticvoidfreesubreandsiblings(structvars*,structsubre*);
6767
staticvoidfreesrnode(structvars*,structsubre*);
68-
staticvoidoptst(structvars*,structsubre*);
68+
staticvoidremovecaptures(structvars*,structsubre*);
6969
staticintnumst(structsubre*,int);
7070
staticvoidmarkst(structsubre*);
7171
staticvoidcleanst(structvars*);
@@ -431,7 +431,8 @@ pg_regcomp(regex_t *re,
431431
dumpst(v->tree,debug,1);
432432
}
433433
#endif
434-
optst(v,v->tree);
434+
if (v->cflags&REG_NOSUB)
435+
removecaptures(v,v->tree);
435436
v->ntree=numst(v->tree,1);
436437
markst(v->tree);
437438
cleanst(v);
@@ -1013,14 +1014,16 @@ parseqatom(struct vars *v,
10131014
break;
10141015
caseBACKREF:/* the Feature From The Black Lagoon */
10151016
INSIST(type!=LACON,REG_ESUBREG);
1016-
INSIST(v->nextvalue<v->nsubs,REG_ESUBREG);
1017-
INSIST(v->subs[v->nextvalue]!=NULL,REG_ESUBREG);
1017+
subno=v->nextvalue;
1018+
assert(subno>0);
1019+
INSIST(subno<v->nsubs,REG_ESUBREG);
1020+
NOERRN();
1021+
INSIST(v->subs[subno]!=NULL,REG_ESUBREG);
10181022
NOERRN();
1019-
assert(v->nextvalue>0);
10201023
atom=subre(v,'b',BACKR,lp,rp);
10211024
NOERRN();
1022-
subno=v->nextvalue;
10231025
atom->backno=subno;
1026+
v->subs[subno]->flags |=BRUSE;
10241027
EMPTYARC(lp,rp);/* temporarily, so there's something */
10251028
NEXT();
10261029
break;
@@ -2141,19 +2144,54 @@ freesrnode(struct vars *v,/* might be NULL */
21412144
}
21422145

21432146
/*
2144-
* optst - optimize a subRE subtree
2147+
* removecaptures - remove unnecessary capture subREs
2148+
*
2149+
* If the caller said that it doesn't care about subexpression match data,
2150+
* we may delete the "capture" markers on subREs that are not referenced
2151+
* by any backrefs, and then simplify anything that's become non-messy.
2152+
* Call this only if REG_NOSUB flag is set.
21452153
*/
21462154
staticvoid
2147-
optst(structvars*v,
2148-
structsubre*t)
2155+
removecaptures(structvars*v,
2156+
structsubre*t)
21492157
{
2158+
structsubre*t2;
2159+
2160+
assert(t!=NULL);
2161+
2162+
/*
2163+
* If this isn't itself a backref target, clear capno and tentatively
2164+
* clear CAP flag.
2165+
*/
2166+
if (!(t->flags&BRUSE))
2167+
{
2168+
t->capno=0;
2169+
t->flags &= ~CAP;
2170+
}
2171+
2172+
/* Now recurse to children */
2173+
for (t2=t->child;t2!=NULL;t2=t2->sibling)
2174+
{
2175+
removecaptures(v,t2);
2176+
/* Propagate child CAP flag back up, if it's still set */
2177+
if (t2->flags&CAP)
2178+
t->flags |=CAP;
2179+
}
2180+
21502181
/*
2151-
*DGP (2007-11-13): I assume it was the programmer'sintent to eventually
2152-
*come back and add code to optimize subRE trees, but the routine coded
2153-
*just spends effort traversing the tree and doing nothing. We can do
2154-
*nothing with less effort.
2182+
*If t now contains neither captures nor backrefs, there'sno longer any
2183+
*need to care where its sub-match boundaries are, so we can reduce it to
2184+
*a simple DFA node. (Note in particular that MIXED child greediness is
2185+
*not a hindrance here, so we don't use the MESSY() macro.)
21552186
*/
2156-
return;
2187+
if ((t->flags& (CAP |BACKR))==0)
2188+
{
2189+
if (t->child)
2190+
freesubreandsiblings(v,t->child);
2191+
t->child=NULL;
2192+
t->op='=';
2193+
t->flags &= ~MIXED;
2194+
}
21572195
}
21582196

21592197
/*
@@ -2501,6 +2539,8 @@ stdump(struct subre *t,
25012539
fprintf(f," hascapture");
25022540
if (t->flags&BACKR)
25032541
fprintf(f," hasbackref");
2542+
if (t->flags&BRUSE)
2543+
fprintf(f," isreferenced");
25042544
if (!(t->flags&INUSE))
25052545
fprintf(f," UNUSED");
25062546
if (t->latype!= (char)-1)

‎src/backend/regex/regexec.c

Lines changed: 25 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -213,12 +213,9 @@ pg_regexec(regex_t *re,
213213
returnREG_NOMATCH;
214214
backref= (v->g->info&REG_UBACKREF) ?1 :0;
215215
v->eflags=flags;
216-
if (v->g->cflags&REG_NOSUB)
217-
nmatch=0;/* override client */
218-
v->nmatch=nmatch;
219-
if (backref)
216+
if (backref&&nmatch <=v->g->nsub)
220217
{
221-
/* need work area */
218+
/* needlargerwork area */
222219
if (v->g->nsub+1 <=LOCALMAT)
223220
v->pmatch=mat;
224221
else
@@ -229,7 +226,17 @@ pg_regexec(regex_t *re,
229226
v->nmatch=v->g->nsub+1;
230227
}
231228
else
229+
{
230+
/* we can store results directly in caller's array */
232231
v->pmatch=pmatch;
232+
/* ensure any extra entries in caller's array are filled with -1 */
233+
if (nmatch>0)
234+
zapallsubs(pmatch,nmatch);
235+
/* then forget about extra entries, to avoid useless work in find() */
236+
if (nmatch>v->g->nsub+1)
237+
nmatch=v->g->nsub+1;
238+
v->nmatch=nmatch;
239+
}
233240
v->details=details;
234241
v->start= (chr*)string;
235242
v->search_start= (chr*)string+search_start;
@@ -290,12 +297,20 @@ pg_regexec(regex_t *re,
290297
else
291298
st=find(v,&v->g->tree->cnfa,&v->g->cmap);
292299

293-
/*copy (portion of)match vectorover if necessary */
294-
if (st==REG_OKAY&&v->pmatch!=pmatch&&nmatch>0)
300+
/*on success, ensure caller'smatch vectoris filled correctly */
301+
if (st==REG_OKAY&&nmatch>0)
295302
{
296-
zapallsubs(pmatch,nmatch);
297-
n= (nmatch<v->nmatch) ?nmatch :v->nmatch;
298-
memcpy(VS(pmatch),VS(v->pmatch),n*sizeof(regmatch_t));
303+
if (v->pmatch!=pmatch)
304+
{
305+
/* copy portion of match vector over from (larger) work area */
306+
assert(nmatch <=v->nmatch);
307+
memcpy(VS(pmatch),VS(v->pmatch),nmatch*sizeof(regmatch_t));
308+
}
309+
if (v->g->cflags&REG_NOSUB)
310+
{
311+
/* don't expose possibly-partial sub-match results to caller */
312+
zapallsubs(pmatch,nmatch);
313+
}
299314
}
300315

301316
/* clean up */
@@ -752,7 +767,6 @@ cdissect(struct vars *v,
752767
break;
753768
case'(':/* no-op capture node */
754769
assert(t->child!=NULL);
755-
assert(t->capno>0);
756770
er=cdissect(v,t->child,begin,end);
757771
break;
758772
default:

‎src/backend/utils/adt/jsonpath_gram.y

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -583,6 +583,14 @@ jspConvertRegexFlags(uint32 xflags)
583583
errmsg("XQuery\"x\" flag (expanded regular expressions) is not implemented")));
584584
}
585585

586+
/*
587+
* We'll never need sub-match details at execution. While
588+
* RE_compile_and_execute would set this flag anyway, force it on here to
589+
* ensure that the regex cache entries created by makeItemLikeRegex are
590+
* useful.
591+
*/
592+
cflags |= REG_NOSUB;
593+
586594
return cflags;
587595
}
588596

‎src/backend/utils/adt/regexp.c

Lines changed: 10 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -347,6 +347,10 @@ RE_compile_and_execute(text *text_re, char *dat, int dat_len,
347347
{
348348
regex_t*re;
349349

350+
/* Use REG_NOSUB if caller does not want sub-match details */
351+
if (nmatch<2)
352+
cflags |=REG_NOSUB;
353+
350354
/* Compile RE */
351355
re=RE_compile_and_cache(text_re,cflags,collation);
352356

@@ -1412,6 +1416,7 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
14121416
intorig_len;
14131417
pg_wchar*wide_str;
14141418
intwide_len;
1419+
intcflags;
14151420
regex_t*cpattern;
14161421
regmatch_t*pmatch;
14171422
intpmatch_len;
@@ -1430,7 +1435,10 @@ setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
14301435
wide_len=pg_mb2wchar_with_len(VARDATA_ANY(orig_str),wide_str,orig_len);
14311436

14321437
/* set up the compiled pattern */
1433-
cpattern=RE_compile_and_cache(pattern,re_flags->cflags,collation);
1438+
cflags=re_flags->cflags;
1439+
if (!use_subpatterns)
1440+
cflags |=REG_NOSUB;
1441+
cpattern=RE_compile_and_cache(pattern,cflags,collation);
14341442

14351443
/* do we want to remember subpatterns? */
14361444
if (use_subpatterns&&cpattern->re_nsub>0)
@@ -1952,7 +1960,7 @@ regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
19521960
if (case_insensitive)
19531961
cflags |=REG_ICASE;
19541962

1955-
re=RE_compile_and_cache(text_re,cflags,collation);
1963+
re=RE_compile_and_cache(text_re,cflags |REG_NOSUB,collation);
19561964

19571965
/* Examine it to see if there's a fixed prefix */
19581966
re_result=pg_regprefix(re,&str,&slen);

‎src/include/regex/regex.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -106,7 +106,7 @@ typedef struct
106106
#defineREG_QUOTE000004/* no special characters, none */
107107
#defineREG_NOSPECREG_QUOTE/* historical synonym */
108108
#defineREG_ICASE000010/* ignore case */
109-
#defineREG_NOSUB000020/*don'tcare about subexpressions */
109+
#defineREG_NOSUB000020/*caller doesn'tneed subexpr match data */
110110
#defineREG_EXPANDED000040/* expanded format, white space & comments */
111111
#defineREG_NLSTOP000100/* \n doesn't match . or [^ ] */
112112
#defineREG_NLANCH000200/* ^ matches after \n, $ before */

‎src/include/regex/regguts.h

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -477,13 +477,14 @@ struct subre
477477
#defineMIXED 04/* mixed preference below */
478478
#defineCAP 010/* capturing parens here or below */
479479
#defineBACKR 020/* back reference here or below */
480+
#defineBRUSE 040/* is referenced by a back reference */
480481
#defineINUSE 0100/* in use in final tree */
481-
#defineNOPROP 03/*bits whichmay not propagate up */
482+
#defineUPPROP (MIXED|CAP|BACKR)/*flags whichshould propagate up */
482483
#defineLMIX(f) ((f)<<2)/* LONGER -> MIXED */
483484
#defineSMIX(f) ((f)<<1)/* SHORTER -> MIXED */
484-
#defineUP(f) (((f)&~NOPROP) | (LMIX(f) & SMIX(f) & MIXED))
485+
#defineUP(f) (((f)&UPPROP) | (LMIX(f) & SMIX(f) & MIXED))
485486
#defineMESSY(f) ((f)&(MIXED|CAP|BACKR))
486-
#definePREF(f) ((f)&NOPROP)
487+
#definePREF(f) ((f)&(LONGER|SHORTER))
487488
#definePREF2(f1,f2) ((PREF(f1) != 0) ? PREF(f1) : PREF(f2))
488489
#defineCOMBINE(f1,f2) (UP((f1)|(f2)) | PREF2(f1, f2))
489490
charlatype;/* LATYPE code, if lookaround constraint */

‎src/test/modules/test_regex/expected/test_regex.out

Lines changed: 36 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -146,12 +146,12 @@ select * from test_regex('(a)e', 'ae', '-');
146146
{ae,a}
147147
(2 rows)
148148

149-
-- expectMatch4.2o(a)eae
150-
select * from test_regex('(a)e', 'ae', 'o');
151-
test_regex
152-
------------
153-
{0}
154-
{NULL}
149+
-- expectMatch4.2oPR(.)\1eabeaaeaae{}
150+
select * from test_regex('(.)\1e', 'abeaae', 'oPR');
151+
test_regex
152+
--------------------------------
153+
{1,REG_UBACKREF,REG_UNONPOSIX}
154+
{aae,NULL}
155155
(2 rows)
156156

157157
-- expectMatch4.3 b{\(a\)b}ababa
@@ -2658,6 +2658,20 @@ select * from test_regex('(\w+(?= )).*\1', 'abc abcd abd', 'HLRP');
26582658
{"abc abc",abc}
26592659
(2 rows)
26602660

2661+
-- exercise oversize-regmatch_t-array paths in regexec()
2662+
-- (that case is not reachable via test_regex, sadly)
2663+
select substring('fffoooooooooooooooooooooooooooooooo', '^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
2664+
substring
2665+
-----------
2666+
f
2667+
(1 row)
2668+
2669+
select regexp_split_to_array('abcxxxdefyyyghi', '((.))(\1\2)');
2670+
regexp_split_to_array
2671+
-----------------------
2672+
{abc,def,ghi}
2673+
(1 row)
2674+
26612675
-- doing 15 "octal escapes vs back references"
26622676
-- # initial zero is always octal
26632677
-- expectMatch15.1 MP"a\\010b""a\bb""a\bb"
@@ -3476,6 +3490,22 @@ select * from test_regex('((.))(\2){0}', 'xy', 'RPQ');
34763490
{x,x,x,NULL}
34773491
(2 rows)
34783492

3493+
-- expectMatch21.37 RP((.))(\2)xyyyyyyy
3494+
select * from test_regex('((.))(\2)', 'xyy', 'RP');
3495+
test_regex
3496+
--------------------------------
3497+
{3,REG_UBACKREF,REG_UNONPOSIX}
3498+
{yy,y,y,y}
3499+
(2 rows)
3500+
3501+
-- expectMatch21.38 oRP((.))(\2)xyyyy{}{}{}
3502+
select * from test_regex('((.))(\2)', 'xyy', 'oRP');
3503+
test_regex
3504+
--------------------------------
3505+
{3,REG_UBACKREF,REG_UNONPOSIX}
3506+
{yy,NULL,NULL,NULL}
3507+
(2 rows)
3508+
34793509
-- doing 22 "multicharacter collating elements"
34803510
-- # again ugh
34813511
-- MCCEs are not implemented in Postgres, so we skip all these tests

‎src/test/modules/test_regex/sql/test_regex.sql

Lines changed: 11 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -63,8 +63,8 @@ select * from test_regex('ab', 'ab', 'b');
6363

6464
-- expectMatch4.1 -(a)eaeaea
6565
select*from test_regex('(a)e','ae','-');
66-
-- expectMatch4.2o(a)eae
67-
select*from test_regex('(a)e','ae','o');
66+
-- expectMatch4.2oPR(.)\1eabeaaeaae{}
67+
select*from test_regex('(.)\1e','abeaae','oPR');
6868
-- expectMatch4.3 b{\(a\)b}ababa
6969
select*from test_regex('\(a\)b','ab','b');
7070
-- expectMatch4.4 -a((b)c)abcabcbcb
@@ -775,6 +775,11 @@ select * from test_regex('(^\w+).*\1', 'abc abc abc', 'LRP');
775775
select*from test_regex('(^\w+\M).*\1','abc abcd abd','LRP');
776776
select*from test_regex('(\w+(?= )).*\1','abc abcd abd','HLRP');
777777

778+
-- exercise oversize-regmatch_t-array paths in regexec()
779+
-- (that case is not reachable via test_regex, sadly)
780+
selectsubstring('fffoooooooooooooooooooooooooooooooo','^(.)\1(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)(.)');
781+
select regexp_split_to_array('abcxxxdefyyyghi','((.))(\1\2)');
782+
778783
-- doing 15 "octal escapes vs back references"
779784

780785
-- # initial zero is always octal
@@ -1011,6 +1016,10 @@ select * from test_regex('(a*)*', 'bc', 'N');
10111016
select*from test_regex(' TO (([a-z0-9._]+|"([^"]+|"")+")+)','asd TO foo','M');
10121017
-- expectMatch21.36 RPQ((.))(\2){0}xyxxx{}
10131018
select*from test_regex('((.))(\2){0}','xy','RPQ');
1019+
-- expectMatch21.37 RP((.))(\2)xyyyyyyy
1020+
select*from test_regex('((.))(\2)','xyy','RP');
1021+
-- expectMatch21.38 oRP((.))(\2)xyyyy{}{}{}
1022+
select*from test_regex('((.))(\2)','xyy','oRP');
10141023

10151024
-- doing 22 "multicharacter collating elements"
10161025
-- # again ugh

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp