Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit08c0d6a

Browse files
committed
Invent "rainbow" arcs within the regex engine.
Some regular expression constructs, most notably the "." match-anythingmetacharacter, produce a sheaf of parallel NFA arcs covering allpossible colors (that is, character equivalence classes). We can makea noticeable improvement in the space and time needed to process largeregexes by replacing such cases with a single arc bearing the specialcolor code "RAINBOW". This requires only minor additional complicationin places such as pull() and push().Callers of pg_reg_getoutarcs() must now be prepared for the possibilityof seeing a RAINBOW arc. For the one known user, contrib/pg_trgm,that's a net benefit since it cuts the number of arcs to be dealt with,and the handling isn't any different than for other colors that containtoo many characters to be dealt with individually.This is part of a patch series that in total reduces the regex engine'sruntime by about a factor of four on a large corpus of real-world regexes.Patch by me, reviewed by Joel JacobsonDiscussion:https://postgr.es/m/1340281.1613018383@sss.pgh.pa.us
1 parent1766118 commit08c0d6a

File tree

10 files changed

+177
-37
lines changed

10 files changed

+177
-37
lines changed

‎contrib/pg_trgm/trgm_regexp.c

Lines changed: 18 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -282,8 +282,8 @@ typedef struct
282282
typedefintTrgmColor;
283283

284284
/* We assume that colors returned by the regexp engine cannot be these: */
285-
#defineCOLOR_UNKNOWN(-1)
286-
#defineCOLOR_BLANK(-2)
285+
#defineCOLOR_UNKNOWN(-3)
286+
#defineCOLOR_BLANK(-4)
287287

288288
typedefstruct
289289
{
@@ -780,7 +780,8 @@ getColorInfo(regex_t *regex, TrgmNFA *trgmNFA)
780780
palloc0(colorsCount*sizeof(TrgmColorInfo));
781781

782782
/*
783-
* Loop over colors, filling TrgmColorInfo about each.
783+
* Loop over colors, filling TrgmColorInfo about each. Note we include
784+
* WHITE (0) even though we know it'll be reported as non-expandable.
784785
*/
785786
for (i=0;i<colorsCount;i++)
786787
{
@@ -1098,9 +1099,9 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10981099
/* Add enter key to this state */
10991100
addKeyToQueue(trgmNFA,&destKey);
11001101
}
1101-
else
1102+
elseif (arc->co >=0)
11021103
{
1103-
/* Regular color */
1104+
/* Regular color(including WHITE)*/
11041105
TrgmColorInfo*colorInfo=&trgmNFA->colorInfo[arc->co];
11051106

11061107
if (colorInfo->expandable)
@@ -1156,6 +1157,14 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
11561157
addKeyToQueue(trgmNFA,&destKey);
11571158
}
11581159
}
1160+
else
1161+
{
1162+
/* RAINBOW: treat as unexpandable color */
1163+
destKey.prefix.colors[0]=COLOR_UNKNOWN;
1164+
destKey.prefix.colors[1]=COLOR_UNKNOWN;
1165+
destKey.nstate=arc->to;
1166+
addKeyToQueue(trgmNFA,&destKey);
1167+
}
11591168
}
11601169

11611170
pfree(arcs);
@@ -1216,10 +1225,10 @@ addArcs(TrgmNFA *trgmNFA, TrgmState *state)
12161225
/*
12171226
* Ignore non-expandable colors; addKey already handled the case.
12181227
*
1219-
* We need no special check for begin/end pseudocolors here. We
1220-
* don't need to do any processing for them, and they will be
1221-
* marked non-expandable since the regex engine will have reported
1222-
* them that way.
1228+
* We need no special check forWHITE orbegin/end pseudocolors
1229+
*here. Wedon't need to do any processing for them, and they
1230+
*will bemarked non-expandable since the regex engine will have
1231+
*reportedthem that way.
12231232
*/
12241233
if (!colorInfo->expandable)
12251234
continue;

‎src/backend/regex/README

Lines changed: 28 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -261,6 +261,18 @@ and the NFA has these arcs:
261261
states 4 -> 5 on color 2 ("x" only)
262262
which can be seen to be a correct representation of the regex.
263263

264+
There is one more complexity, which is how to handle ".", that is a
265+
match-anything atom. We used to do that by generating a "rainbow"
266+
of arcs of all live colors between the two NFA states before and after
267+
the dot. That's expensive in itself when there are lots of colors,
268+
and it also typically adds lots of follow-on arc-splitting work for the
269+
color splitting logic. Now we handle this case by generating a single arc
270+
labeled with the special color RAINBOW, meaning all colors. Such arcs
271+
never need to be split, so they help keep NFAs small in this common case.
272+
(Note: this optimization doesn't help in REG_NLSTOP mode, where "." is
273+
not supposed to match newline. In that case we still handle "." by
274+
generating an almost-rainbow of all colors except newline's color.)
275+
264276
Given this summary, we can see we need the following operations for
265277
colors:
266278

@@ -349,18 +361,20 @@ The possible arc types are:
349361

350362
PLAIN arcs, which specify matching of any character of a given "color"
351363
(see above). These are dumped as "[color_number]->to_state".
364+
In addition there can be "rainbow" PLAIN arcs, which are dumped as
365+
"[*]->to_state".
352366

353367
EMPTY arcs, which specify a no-op transition to another state. These
354368
are dumped as "->to_state".
355369

356370
AHEAD constraints, which represent a "next character must be of this
357371
color" constraint. AHEAD differs from a PLAIN arc in that the input
358372
character is not consumed when crossing the arc. These are dumped as
359-
">color_number>->to_state".
373+
">color_number>->to_state", or possibly ">*>->to_state".
360374

361375
BEHIND constraints, which represent a "previous character must be of
362376
this color" constraint, which likewise consumes no input. These are
363-
dumped as "<color_number<->to_state".
377+
dumped as "<color_number<->to_state", or possibly "<*<->to_state".
364378

365379
'^' arcs, which specify a beginning-of-input constraint. These are
366380
dumped as "^0->to_state" or "^1->to_state" for beginning-of-string and
@@ -396,14 +410,20 @@ substring, or an imaginary following EOS character if the substring is at
396410
the end of the input.
397411
3. If the NFA is (or can be) in the goal state at this point, it matches.
398412

413+
This definition is necessary to support regexes that begin or end with
414+
constraints such as \m and \M, which imply requirements on the adjacent
415+
character if any. The executor implements that by checking if the
416+
adjacent character (or BOS/BOL/EOS/EOL pseudo-character) is of the
417+
right color, and it does that in the same loop that checks characters
418+
within the match.
419+
399420
So one can mentally execute an untransformed NFA by taking ^ and $ as
400421
ordinary constraints that match at start and end of input; but plain
401422
arcs out of the start state should be taken as matches for the character
402423
before the target substring, and similarly, plain arcs leading to the
403424
post state are matches for the character after the target substring.
404-
This definition is necessary to support regexes that begin or end with
405-
constraints such as \m and \M, which imply requirements on the adjacent
406-
character if any. NFAs for simple unanchored patterns will usually have
407-
pre-state outarcs for all possible character colors as well as BOS and
408-
BOL, and post-state inarcs for all possible character colors as well as
409-
EOS and EOL, so that the executor's behavior will work.
425+
After the optimize() transformation, there are explicit arcs mentioning
426+
BOS/BOL/EOS/EOL adjacent to the pre-state and post-state. So a finished
427+
NFA for a pattern without anchors or adjacent-character constraints will
428+
have pre-state outarcs for RAINBOW (all possible character colors) as well
429+
as BOS and BOL, and likewise post-state inarcs for RAINBOW, EOS, and EOL.

‎src/backend/regex/regc_color.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -977,6 +977,7 @@ colorchain(struct colormap *cm,
977977
{
978978
structcolordesc*cd=&cm->cd[a->co];
979979

980+
assert(a->co >=0);
980981
if (cd->arcs!=NULL)
981982
cd->arcs->colorchainRev=a;
982983
a->colorchain=cd->arcs;
@@ -994,6 +995,7 @@ uncolorchain(struct colormap *cm,
994995
structcolordesc*cd=&cm->cd[a->co];
995996
structarc*aa=a->colorchainRev;
996997

998+
assert(a->co >=0);
997999
if (aa==NULL)
9981000
{
9991001
assert(cd->arcs==a);
@@ -1012,6 +1014,9 @@ uncolorchain(struct colormap *cm,
10121014

10131015
/*
10141016
* rainbow - add arcs of all full colors (but one) between specified states
1017+
*
1018+
* If there isn't an exception color, we now generate just a single arc
1019+
* labeled RAINBOW, saving lots of arc-munging later on.
10151020
*/
10161021
staticvoid
10171022
rainbow(structnfa*nfa,
@@ -1025,6 +1030,13 @@ rainbow(struct nfa *nfa,
10251030
structcolordesc*end=CDEND(cm);
10261031
colorco;
10271032

1033+
if (but==COLORLESS)
1034+
{
1035+
newarc(nfa,type,RAINBOW,from,to);
1036+
return;
1037+
}
1038+
1039+
/* Gotta do it the hard way. Skip subcolors, pseudocolors, and "but" */
10281040
for (cd=cm->cd,co=0;cd<end&& !CISERR();cd++,co++)
10291041
if (!UNUSEDCOLOR(cd)&&cd->sub!=co&&co!=but&&
10301042
!(cd->flags&PSEUDO))
@@ -1034,13 +1046,16 @@ rainbow(struct nfa *nfa,
10341046
/*
10351047
* colorcomplement - add arcs of complementary colors
10361048
*
1049+
* We add arcs of all colors that are not pseudocolors and do not match
1050+
* any of the "of" state's PLAIN outarcs.
1051+
*
10371052
* The calling sequence ought to be reconciled with cloneouts().
10381053
*/
10391054
staticvoid
10401055
colorcomplement(structnfa*nfa,
10411056
structcolormap*cm,
10421057
inttype,
1043-
structstate*of,/* complements of this guy's PLAIN outarcs */
1058+
structstate*of,
10441059
structstate*from,
10451060
structstate*to)
10461061
{
@@ -1049,6 +1064,11 @@ colorcomplement(struct nfa *nfa,
10491064
colorco;
10501065

10511066
assert(of!=from);
1067+
1068+
/* A RAINBOW arc matches all colors, making the complement empty */
1069+
if (findarc(of,PLAIN,RAINBOW)!=NULL)
1070+
return;
1071+
10521072
for (cd=cm->cd,co=0;cd<end&& !CISERR();cd++,co++)
10531073
if (!UNUSEDCOLOR(cd)&& !(cd->flags&PSEUDO))
10541074
if (findarc(of,PLAIN,co)==NULL)

‎src/backend/regex/regc_nfa.c

Lines changed: 74 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -271,6 +271,11 @@ destroystate(struct nfa *nfa,
271271
*
272272
* This function checks to make sure that no duplicate arcs are created.
273273
* In general we never want duplicates.
274+
*
275+
* However: in principle, a RAINBOW arc is redundant with any plain arc
276+
* (unless that arc is for a pseudocolor). But we don't try to recognize
277+
* that redundancy, either here or in allied operations such as moveins().
278+
* The pseudocolor consideration makes that more costly than it seems worth.
274279
*/
275280
staticvoid
276281
newarc(structnfa*nfa,
@@ -1170,6 +1175,9 @@ copyouts(struct nfa *nfa,
11701175

11711176
/*
11721177
* cloneouts - copy out arcs of a state to another state pair, modifying type
1178+
*
1179+
* This is only used to convert PLAIN arcs to AHEAD/BEHIND arcs, which share
1180+
* the same interpretation of "co". It wouldn't be sensible with LACONs.
11731181
*/
11741182
staticvoid
11751183
cloneouts(structnfa*nfa,
@@ -1181,9 +1189,13 @@ cloneouts(struct nfa *nfa,
11811189
structarc*a;
11821190

11831191
assert(old!=from);
1192+
assert(type==AHEAD||type==BEHIND);
11841193

11851194
for (a=old->outs;a!=NULL;a=a->outchain)
1195+
{
1196+
assert(a->type==PLAIN);
11861197
newarc(nfa,type,a->co,from,to);
1198+
}
11871199
}
11881200

11891201
/*
@@ -1597,7 +1609,7 @@ pull(struct nfa *nfa,
15971609
for (a=from->ins;a!=NULL&& !NISERR();a=nexta)
15981610
{
15991611
nexta=a->inchain;
1600-
switch (combine(con,a))
1612+
switch (combine(nfa,con,a))
16011613
{
16021614
caseINCOMPATIBLE:/* destroy the arc */
16031615
freearc(nfa,a);
@@ -1624,6 +1636,10 @@ pull(struct nfa *nfa,
16241636
cparc(nfa,a,s,to);
16251637
freearc(nfa,a);
16261638
break;
1639+
caseREPLACEARC:/* replace arc's color */
1640+
newarc(nfa,a->type,con->co,a->from,to);
1641+
freearc(nfa,a);
1642+
break;
16271643
default:
16281644
assert(NOTREACHED);
16291645
break;
@@ -1764,7 +1780,7 @@ push(struct nfa *nfa,
17641780
for (a=to->outs;a!=NULL&& !NISERR();a=nexta)
17651781
{
17661782
nexta=a->outchain;
1767-
switch (combine(con,a))
1783+
switch (combine(nfa,con,a))
17681784
{
17691785
caseINCOMPATIBLE:/* destroy the arc */
17701786
freearc(nfa,a);
@@ -1791,6 +1807,10 @@ push(struct nfa *nfa,
17911807
cparc(nfa,a,from,s);
17921808
freearc(nfa,a);
17931809
break;
1810+
caseREPLACEARC:/* replace arc's color */
1811+
newarc(nfa,a->type,con->co,from,a->to);
1812+
freearc(nfa,a);
1813+
break;
17941814
default:
17951815
assert(NOTREACHED);
17961816
break;
@@ -1810,9 +1830,11 @@ push(struct nfa *nfa,
18101830
* #def INCOMPATIBLE1// destroys arc
18111831
* #def SATISFIED2// constraint satisfied
18121832
* #def COMPATIBLE3// compatible but not satisfied yet
1833+
* #def REPLACEARC4// replace arc's color with constraint color
18131834
*/
18141835
staticint
1815-
combine(structarc*con,
1836+
combine(structnfa*nfa,
1837+
structarc*con,
18161838
structarc*a)
18171839
{
18181840
#defineCA(ct,at) (((ct)<<CHAR_BIT) | (at))
@@ -1827,14 +1849,46 @@ combine(struct arc *con,
18271849
caseCA(BEHIND,PLAIN):
18281850
if (con->co==a->co)
18291851
returnSATISFIED;
1852+
if (con->co==RAINBOW)
1853+
{
1854+
/* con is satisfied unless arc's color is a pseudocolor */
1855+
if (!(nfa->cm->cd[a->co].flags&PSEUDO))
1856+
returnSATISFIED;
1857+
}
1858+
elseif (a->co==RAINBOW)
1859+
{
1860+
/* con is incompatible if it's for a pseudocolor */
1861+
if (nfa->cm->cd[con->co].flags&PSEUDO)
1862+
returnINCOMPATIBLE;
1863+
/* otherwise, constraint constrains arc to be only its color */
1864+
returnREPLACEARC;
1865+
}
18301866
returnINCOMPATIBLE;
18311867
break;
18321868
caseCA('^','^'):/* collision, similar constraints */
18331869
caseCA('$','$'):
1834-
caseCA(AHEAD,AHEAD):
1870+
if (con->co==a->co)/* true duplication */
1871+
returnSATISFIED;
1872+
returnINCOMPATIBLE;
1873+
break;
1874+
caseCA(AHEAD,AHEAD):/* collision, similar constraints */
18351875
caseCA(BEHIND,BEHIND):
18361876
if (con->co==a->co)/* true duplication */
18371877
returnSATISFIED;
1878+
if (con->co==RAINBOW)
1879+
{
1880+
/* con is satisfied unless arc's color is a pseudocolor */
1881+
if (!(nfa->cm->cd[a->co].flags&PSEUDO))
1882+
returnSATISFIED;
1883+
}
1884+
elseif (a->co==RAINBOW)
1885+
{
1886+
/* con is incompatible if it's for a pseudocolor */
1887+
if (nfa->cm->cd[con->co].flags&PSEUDO)
1888+
returnINCOMPATIBLE;
1889+
/* otherwise, constraint constrains arc to be only its color */
1890+
returnREPLACEARC;
1891+
}
18381892
returnINCOMPATIBLE;
18391893
break;
18401894
caseCA('^',BEHIND):/* collision, dissimilar constraints */
@@ -2895,6 +2949,7 @@ compact(struct nfa *nfa,
28952949
break;
28962950
caseLACON:
28972951
assert(s->no!=cnfa->pre);
2952+
assert(a->co >=0);
28982953
ca->co= (color) (cnfa->ncolors+a->co);
28992954
ca->to=a->to->no;
29002955
ca++;
@@ -3068,13 +3123,22 @@ dumparc(struct arc *a,
30683123
switch (a->type)
30693124
{
30703125
casePLAIN:
3071-
fprintf(f,"[%ld]", (long)a->co);
3126+
if (a->co==RAINBOW)
3127+
fprintf(f,"[*]");
3128+
else
3129+
fprintf(f,"[%ld]", (long)a->co);
30723130
break;
30733131
caseAHEAD:
3074-
fprintf(f,">%ld>", (long)a->co);
3132+
if (a->co==RAINBOW)
3133+
fprintf(f,">*>");
3134+
else
3135+
fprintf(f,">%ld>", (long)a->co);
30753136
break;
30763137
caseBEHIND:
3077-
fprintf(f,"<%ld<", (long)a->co);
3138+
if (a->co==RAINBOW)
3139+
fprintf(f,"<*<");
3140+
else
3141+
fprintf(f,"<%ld<", (long)a->co);
30783142
break;
30793143
caseLACON:
30803144
fprintf(f,":%ld:", (long)a->co);
@@ -3161,7 +3225,9 @@ dumpcstate(int st,
31613225
pos=1;
31623226
for (ca=cnfa->states[st];ca->co!=COLORLESS;ca++)
31633227
{
3164-
if (ca->co<cnfa->ncolors)
3228+
if (ca->co==RAINBOW)
3229+
fprintf(f,"\t[*]->%d",ca->to);
3230+
elseif (ca->co<cnfa->ncolors)
31653231
fprintf(f,"\t[%ld]->%d", (long)ca->co,ca->to);
31663232
else
31673233
fprintf(f,"\t:%ld:->%d", (long) (ca->co-cnfa->ncolors),ca->to);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp