Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit5227d99

Browse files
committed
Rethink regexp engine's backref-related compilation state.
I had committer's remorse almost immediately after pushingcb76fbd,upon finding that removing capturing subexpressions' subREs from thedata structure broke my proposed patch for REG_NOSUB optimization.Revert that data structure change. Instead, address the concernabout not changing capturing subREs' endpoints by not changing theendpoints. We don't need to, because the point of that bit was justto ensure that the atom has endpoints distinct from the outer statepair that we're stringing the branch between. We already madesuitable states in the parenthesized-subexpression case, so theadditional ones were just useless overhead. This seems moreunderstandable than Spencer's original coding, and it ought to bea shade faster too by saving a few state creations and arc changes.(I actually see a couple percent improvement on Jacobson's webcorpus, though that's barely above the noise floor so I wouldn'tput much stock in that result.)Also, fix the logic added byea1268f to ensure that the subRErecorded in v->subs[subno] is exactly the one with capno == subno.Spencer's original coding recorded the child subRE of the capturenode, which is okay so far as having the right endpoint states isconcerned, but as ofcb76fbd the capturing subRE itself alwayshas those endpoints too. I think the inconsistency is confusingfor the REG_NOSUB optimization.As before, backpatch to v14.Discussion:https://postgr.es/m/0203588E-E609-43AF-9F4F-902854231EE7@enterprisedb.com
1 parent5e6ad63 commit5227d99

File tree

1 file changed

+47
-37
lines changed

1 file changed

+47
-37
lines changed

‎src/backend/regex/regcomp.c

Lines changed: 47 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -233,13 +233,6 @@ static intcmp(const chr *, const chr *, size_t);
233233
staticintcasecmp(constchr*,constchr*,size_t);
234234

235235

236-
/* info we need during compilation about a known capturing subexpression */
237-
structsubinfo
238-
{
239-
structstate*left;/* left end of its sub-NFA */
240-
structstate*right;/* right end of its sub-NFA */
241-
};
242-
243236
/* internal variables, bundled for easy passing around */
244237
structvars
245238
{
@@ -252,10 +245,10 @@ struct vars
252245
intnexttype;/* type of next token */
253246
chrnextvalue;/* value (if any) of next token */
254247
intlexcon;/* lexical context type (see regc_lex.c) */
255-
intnsubexp;/*number of known capturing subexpressions */
256-
structsubinfo*subs;/*info about known capturing subexpressions */
257-
size_tnsubs;/*allocatedlength of subs[] vector */
258-
structsubinfosub10[10];/* initial vector, enough for most */
248+
intnsubexp;/*subexpression count */
249+
structsubre**subs;/*subRE pointer vector */
250+
size_tnsubs;/* length of vector */
251+
structsubre*sub10[10];/* initial vector, enough for most */
259252
structnfa*nfa;/* the NFA */
260253
structcolormap*cm;/* character color map */
261254
colornlcolor;/* color of newline */
@@ -375,7 +368,7 @@ pg_regcomp(regex_t *re,
375368
v->subs=v->sub10;
376369
v->nsubs=10;
377370
for (j=0;j<v->nsubs;j++)
378-
v->subs[j].left=v->subs[j].right=NULL;
371+
v->subs[j]=NULL;
379372
v->nfa=NULL;
380373
v->cm=NULL;
381374
v->nlcolor=COLORLESS;
@@ -511,35 +504,35 @@ pg_regcomp(regex_t *re,
511504
}
512505

513506
/*
514-
* moresubs - enlargecapturing-subexpressions vector
507+
* moresubs - enlargesubRE vector
515508
*/
516509
staticvoid
517510
moresubs(structvars*v,
518511
intwanted)/* want enough room for this one */
519512
{
520-
structsubinfo*p;
513+
structsubre**p;
521514
size_tn;
522515

523516
assert(wanted>0&& (size_t)wanted >=v->nsubs);
524517
n= (size_t)wanted*3 /2+1;
525518

526519
if (v->subs==v->sub10)
527520
{
528-
p= (structsubinfo*)MALLOC(n*sizeof(structsubinfo));
521+
p= (structsubre**)MALLOC(n*sizeof(structsubre*));
529522
if (p!=NULL)
530523
memcpy(VS(p),VS(v->subs),
531-
v->nsubs*sizeof(structsubinfo));
524+
v->nsubs*sizeof(structsubre*));
532525
}
533526
else
534-
p= (structsubinfo*)REALLOC(v->subs,n*sizeof(structsubinfo));
527+
p= (structsubre**)REALLOC(v->subs,n*sizeof(structsubre*));
535528
if (p==NULL)
536529
{
537530
ERR(REG_ESPACE);
538531
return;
539532
}
540533
v->subs=p;
541534
for (p=&v->subs[v->nsubs];v->nsubs<n;p++,v->nsubs++)
542-
p->left=p->right=NULL;
535+
*p=NULL;
543536
assert(v->nsubs==n);
544537
assert((size_t)wanted<v->nsubs);
545538
}
@@ -988,6 +981,7 @@ parseqatom(struct vars *v,
988981
s=newstate(v->nfa);
989982
s2=newstate(v->nfa);
990983
NOERRN();
984+
/* We may not need these arcs, but keep things connected for now */
991985
EMPTYARC(lp,s);
992986
EMPTYARC(s2,rp);
993987
NOERRN();
@@ -997,10 +991,6 @@ parseqatom(struct vars *v,
997991
NOERRN();
998992
if (cap)
999993
{
1000-
/* save the sub-NFA's endpoints for future backrefs to use */
1001-
assert(v->subs[subno].left==NULL);
1002-
v->subs[subno].left=s;
1003-
v->subs[subno].right=s2;
1004994
if (atom->capno==0)
1005995
{
1006996
/* normal case: just mark the atom as capturing */
@@ -1016,13 +1006,15 @@ parseqatom(struct vars *v,
10161006
t->child=atom;
10171007
atom=t;
10181008
}
1009+
assert(v->subs[subno]==NULL);
1010+
v->subs[subno]=atom;
10191011
}
10201012
/* postpone everything else pending possible {0} */
10211013
break;
10221014
caseBACKREF:/* the Feature From The Black Lagoon */
10231015
INSIST(type!=LACON,REG_ESUBREG);
10241016
INSIST(v->nextvalue<v->nsubs,REG_ESUBREG);
1025-
INSIST(v->subs[v->nextvalue].left!=NULL,REG_ESUBREG);
1017+
INSIST(v->subs[v->nextvalue]!=NULL,REG_ESUBREG);
10261018
NOERRN();
10271019
assert(v->nextvalue>0);
10281020
atom=subre(v,'b',BACKR,lp,rp);
@@ -1097,7 +1089,7 @@ parseqatom(struct vars *v,
10971089
if (atom!=NULL)
10981090
freesubre(v,atom);
10991091
if (atomtype=='(')
1100-
v->subs[subno].left=v->subs[subno].right=NULL;
1092+
v->subs[subno]=NULL;
11011093
delsub(v->nfa,lp,rp);
11021094
EMPTYARC(lp,rp);
11031095
returntop;
@@ -1130,30 +1122,48 @@ parseqatom(struct vars *v,
11301122
NOERRN();
11311123
}
11321124

1125+
/*
1126+
* For what follows, we need the atom to have its own begin/end states
1127+
* that are distinct from lp/rp, so that we can wrap iteration structure
1128+
* around it. The parenthesized-atom case above already made suitable
1129+
* states (and we don't want to modify a capturing subre, since it's
1130+
* already recorded in v->subs[]). Otherwise, we need more states.
1131+
*/
1132+
if (atom->begin==lp||atom->end==rp)
1133+
{
1134+
s=newstate(v->nfa);
1135+
s2=newstate(v->nfa);
1136+
NOERRN();
1137+
moveouts(v->nfa,lp,s);
1138+
moveins(v->nfa,rp,s2);
1139+
atom->begin=s;
1140+
atom->end=s2;
1141+
}
1142+
else
1143+
{
1144+
/* The atom's OK, but we must temporarily disconnect it from lp/rp */
1145+
/* (this removes the EMPTY arcs we made above) */
1146+
delsub(v->nfa,lp,atom->begin);
1147+
delsub(v->nfa,atom->end,rp);
1148+
}
1149+
11331150
/*----------
11341151
* Prepare a general-purpose state skeleton.
11351152
*
11361153
* In the no-backrefs case, we want this:
11371154
*
1138-
* [lp] ---> [s] ---prefix--->[begin]---atom---> [end] ---rest---> [rp]
1155+
* [lp] ---> [s] ---prefix---> ---atom---> ---rest---> [rp]
11391156
*
1140-
* where prefix is some repetitions of atom. In the general case we need
1157+
* where prefix is some repetitions of atom, and "rest" is the remainder
1158+
* of the branch. In the general case we need:
11411159
*
11421160
* [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
11431161
*
1144-
* where the iterator wraps around[begin] ---atom---> [end]
1162+
* where the iterator wraps aroundtheatom.
11451163
*
11461164
* We make the s state here for both cases; s2 is made below if needed
11471165
*----------
11481166
*/
1149-
s=newstate(v->nfa);/* first, new endpoints for the atom */
1150-
s2=newstate(v->nfa);
1151-
NOERRN();
1152-
moveouts(v->nfa,lp,s);
1153-
moveins(v->nfa,rp,s2);
1154-
NOERRN();
1155-
atom->begin=s;
1156-
atom->end=s2;
11571167
s=newstate(v->nfa);/* set up starting state */
11581168
NOERRN();
11591169
EMPTYARC(lp,s);
@@ -1190,14 +1200,14 @@ parseqatom(struct vars *v,
11901200
{
11911201
assert(atom->begin->nouts==1);/* just the EMPTY */
11921202
delsub(v->nfa,atom->begin,atom->end);
1193-
assert(v->subs[subno].left!=NULL);
1203+
assert(v->subs[subno]!=NULL);
11941204

11951205
/*
11961206
* And here's why the recursion got postponed: it must wait until the
11971207
* skeleton is filled in, because it may hit a backref that wants to
11981208
* copy the filled-in skeleton.
11991209
*/
1200-
dupnfa(v->nfa,v->subs[subno].left,v->subs[subno].right,
1210+
dupnfa(v->nfa,v->subs[subno]->begin,v->subs[subno]->end,
12011211
atom->begin,atom->end);
12021212
NOERRN();
12031213

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp