Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit78a843f

Browse files
committed
Improve regex compiler's arc moving/copying logic.
The functions moveins(), copyins(), moveouts(), copyouts() arerequired to preserve the invariant that there are no duplicate arcs inthe regex's NFA. Spencer's original implementation of them was O(N^2)since it checked separately for a match to each source arc. In commit579840c I improved that by adding sort/merge logic to be used ifmore than a few arcs are to be moved/copied. However, I now realizethat that missed a bet. At many call sites, the target state is newlymade and cannot have any existing in-arcs (respectively out-arcs)that could be duplicates. So spending any cycles at all on checkingfor duplicates is wasted effort; in these cases we can just blindlymove/copy all the source arcs. Add code paths to do that.It turns out that for copyins()/copyouts(), *all* the call sites havethis property, making all the "improved" logic in them flat outunreachable. Perhaps we'll need the full capability again someday,so I just #ifdef'd those paths out rather than removing them entirely.In passing, add a few test cases to improve code coverage in thisarea as well as in regc_locale.c/regc_pg_locale.c.Discussion:https://postgr.es/m/810272.1629064063@sss.pgh.pa.us
1 parentf576de1 commit78a843f

File tree

5 files changed

+338
-4
lines changed

5 files changed

+338
-4
lines changed

‎src/backend/regex/regc_nfa.c

Lines changed: 62 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -777,6 +777,10 @@ sortouts_cmp(const void *a, const void *b)
777777
* However, if we have a whole lot of arcs to deal with, retail duplicate
778778
* checks become too slow. In that case we proceed by sorting and merging
779779
* the arc lists, and then we can indeed just update the arcs in-place.
780+
*
781+
* On the other hand, it's also true that this is frequently called with
782+
* a brand-new newState that has no existing in-arcs. In that case,
783+
* de-duplication is unnecessary, so we can just blindly move all the arcs.
780784
*/
781785
staticvoid
782786
moveins(structnfa*nfa,
@@ -785,7 +789,18 @@ moveins(struct nfa *nfa,
785789
{
786790
assert(oldState!=newState);
787791

788-
if (!BULK_ARC_OP_USE_SORT(oldState->nins,newState->nins))
792+
if (newState->nins==0)
793+
{
794+
/* No need for de-duplication */
795+
structarc*a;
796+
797+
while ((a=oldState->ins)!=NULL)
798+
{
799+
createarc(nfa,a->type,a->co,a->from,newState);
800+
freearc(nfa,a);
801+
}
802+
}
803+
elseif (!BULK_ARC_OP_USE_SORT(oldState->nins,newState->nins))
789804
{
790805
/* With not too many arcs, just do them one at a time */
791806
structarc*a;
@@ -869,15 +884,30 @@ moveins(struct nfa *nfa,
869884

870885
/*
871886
* copyins - copy in arcs of a state to another state
887+
*
888+
* The comments for moveins() apply here as well. However, in current
889+
* usage, this is *only* called with brand-new target states, so that
890+
* only the "no need for de-duplication" code path is ever reached.
891+
* We keep the rest #ifdef'd out in case it's needed in the future.
872892
*/
873893
staticvoid
874894
copyins(structnfa*nfa,
875895
structstate*oldState,
876896
structstate*newState)
877897
{
878898
assert(oldState!=newState);
899+
assert(newState->nins==0);/* see comment above */
900+
901+
if (newState->nins==0)
902+
{
903+
/* No need for de-duplication */
904+
structarc*a;
879905

880-
if (!BULK_ARC_OP_USE_SORT(oldState->nins,newState->nins))
906+
for (a=oldState->ins;a!=NULL;a=a->inchain)
907+
createarc(nfa,a->type,a->co,a->from,newState);
908+
}
909+
#ifdefNOT_USED/* see comment above */
910+
elseif (!BULK_ARC_OP_USE_SORT(oldState->nins,newState->nins))
881911
{
882912
/* With not too many arcs, just do them one at a time */
883913
structarc*a;
@@ -944,6 +974,7 @@ copyins(struct nfa *nfa,
944974
createarc(nfa,a->type,a->co,a->from,newState);
945975
}
946976
}
977+
#endif/* NOT_USED */
947978
}
948979

949980
/*
@@ -1058,7 +1089,18 @@ moveouts(struct nfa *nfa,
10581089
{
10591090
assert(oldState!=newState);
10601091

1061-
if (!BULK_ARC_OP_USE_SORT(oldState->nouts,newState->nouts))
1092+
if (newState->nouts==0)
1093+
{
1094+
/* No need for de-duplication */
1095+
structarc*a;
1096+
1097+
while ((a=oldState->outs)!=NULL)
1098+
{
1099+
createarc(nfa,a->type,a->co,newState,a->to);
1100+
freearc(nfa,a);
1101+
}
1102+
}
1103+
elseif (!BULK_ARC_OP_USE_SORT(oldState->nouts,newState->nouts))
10621104
{
10631105
/* With not too many arcs, just do them one at a time */
10641106
structarc*a;
@@ -1142,15 +1184,27 @@ moveouts(struct nfa *nfa,
11421184

11431185
/*
11441186
* copyouts - copy out arcs of a state to another state
1187+
*
1188+
* See comments for copyins()
11451189
*/
11461190
staticvoid
11471191
copyouts(structnfa*nfa,
11481192
structstate*oldState,
11491193
structstate*newState)
11501194
{
11511195
assert(oldState!=newState);
1196+
assert(newState->nouts==0);/* see comment above */
1197+
1198+
if (newState->nouts==0)
1199+
{
1200+
/* No need for de-duplication */
1201+
structarc*a;
11521202

1153-
if (!BULK_ARC_OP_USE_SORT(oldState->nouts,newState->nouts))
1203+
for (a=oldState->outs;a!=NULL;a=a->outchain)
1204+
createarc(nfa,a->type,a->co,newState,a->to);
1205+
}
1206+
#ifdefNOT_USED/* see comment above */
1207+
elseif (!BULK_ARC_OP_USE_SORT(oldState->nouts,newState->nouts))
11541208
{
11551209
/* With not too many arcs, just do them one at a time */
11561210
structarc*a;
@@ -1217,6 +1271,7 @@ copyouts(struct nfa *nfa,
12171271
createarc(nfa,a->type,a->co,newState,a->to);
12181272
}
12191273
}
1274+
#endif/* NOT_USED */
12201275
}
12211276

12221277
/*
@@ -1975,6 +2030,7 @@ combine(struct nfa *nfa,
19752030
elseif (a->co==RAINBOW)
19762031
{
19772032
/* con is incompatible if it's for a pseudocolor */
2033+
/* (this is hypothetical; we make no such constraints today) */
19782034
if (nfa->cm->cd[con->co].flags&PSEUDO)
19792035
returnINCOMPATIBLE;
19802036
/* otherwise, constraint constrains arc to be only its color */
@@ -2001,6 +2057,7 @@ combine(struct nfa *nfa,
20012057
elseif (a->co==RAINBOW)
20022058
{
20032059
/* con is incompatible if it's for a pseudocolor */
2060+
/* (this is hypothetical; we make no such constraints today) */
20042061
if (nfa->cm->cd[con->co].flags&PSEUDO)
20052062
returnINCOMPATIBLE;
20062063
/* otherwise, constraint constrains arc to be only its color */
@@ -3562,6 +3619,7 @@ carc_cmp(const void *a, const void *b)
35623619
return-1;
35633620
if (aa->to>bb->to)
35643621
return+1;
3622+
/* This is unreached, since there should be no duplicate arcs now: */
35653623
return0;
35663624
}
35673625

‎src/test/modules/test_regex/expected/test_regex.out

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -937,6 +937,34 @@ select * from test_regex('a[[=x=]]', 'az', '+Lb');
937937
{0,REG_ULOCALE}
938938
(1 row)
939939

940+
-- expectMatch9.9b &iL{a[[=Y=]]}ayay
941+
select * from test_regex('a[[=Y=]]', 'ay', 'iL');
942+
test_regex
943+
-----------------
944+
{0,REG_ULOCALE}
945+
{ay}
946+
(2 rows)
947+
948+
select * from test_regex('a[[=Y=]]', 'ay', 'iLb');
949+
test_regex
950+
-----------------
951+
{0,REG_ULOCALE}
952+
{ay}
953+
(2 rows)
954+
955+
-- expectNomatch9.9c &L{a[[=Y=]]}ay
956+
select * from test_regex('a[[=Y=]]', 'ay', 'L');
957+
test_regex
958+
-----------------
959+
{0,REG_ULOCALE}
960+
(1 row)
961+
962+
select * from test_regex('a[[=Y=]]', 'ay', 'Lb');
963+
test_regex
964+
-----------------
965+
{0,REG_ULOCALE}
966+
(1 row)
967+
940968
-- expectError9.10 &{a[0-[=x=]]}ERANGE
941969
select * from test_regex('a[0-[=x=]]', '', '');
942970
ERROR: invalid regular expression: invalid character range
@@ -2932,6 +2960,34 @@ select * from test_regex('a[^b-d]', 'aC', 'iMb');
29322960
{0,REG_UUNPORT}
29332961
(1 row)
29342962

2963+
-- expectMatch19.6 &iM{a[B-Z]}aCaC
2964+
select * from test_regex('a[B-Z]', 'aC', 'iM');
2965+
test_regex
2966+
-----------------
2967+
{0,REG_UUNPORT}
2968+
{aC}
2969+
(2 rows)
2970+
2971+
select * from test_regex('a[B-Z]', 'aC', 'iMb');
2972+
test_regex
2973+
-----------------
2974+
{0,REG_UUNPORT}
2975+
{aC}
2976+
(2 rows)
2977+
2978+
-- expectNomatch19.7 &iM{a[^B-Z]}aC
2979+
select * from test_regex('a[^B-Z]', 'aC', 'iM');
2980+
test_regex
2981+
-----------------
2982+
{0,REG_UUNPORT}
2983+
(1 row)
2984+
2985+
select * from test_regex('a[^B-Z]', 'aC', 'iMb');
2986+
test_regex
2987+
-----------------
2988+
{0,REG_UUNPORT}
2989+
(1 row)
2990+
29352991
-- doing 20 "directors and embedded options"
29362992
-- expectError20.1 &***?BADPAT
29372993
select * from test_regex('***?', '', '');
@@ -3850,6 +3906,14 @@ select * from test_regex('^([^/]+?)(?:/([^/]+?))(?:/([^/]+?))?$', 'foo/bar/baz',
38503906
{foo/bar/baz,foo,bar,baz}
38513907
(2 rows)
38523908

3909+
-- expectMatch24.14 PRT{^(.+?)(?:/(.+?))(?:/(.+?)\3)?$}{foo/bar/baz/quux}{foo/bar/baz/quux}{foo}{bar/baz/quux}{}
3910+
select * from test_regex('^(.+?)(?:/(.+?))(?:/(.+?)\3)?$', 'foo/bar/baz/quux', 'PRT');
3911+
test_regex
3912+
----------------------------------------------
3913+
{3,REG_UBACKREF,REG_UNONPOSIX,REG_USHORTEST}
3914+
{foo/bar/baz/quux,foo,bar/baz/quux,NULL}
3915+
(2 rows)
3916+
38533917
-- doing 25 "mixed quantifiers"
38543918
-- # this is very incomplete as yet
38553919
-- # should include |
@@ -4926,3 +4990,59 @@ select * from test_regex('(\Y)+', 'foo', 'LNP');
49264990
{"",""}
49274991
(2 rows)
49284992

4993+
-- and now, tests not from either Spencer or the Tcl project
4994+
-- These cases exercise additional code paths in pushfwd()/push()/combine()
4995+
select * from test_regex('a\Y(?=45)', 'a45', 'HLP');
4996+
test_regex
4997+
-----------------------------------------------
4998+
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
4999+
{a}
5000+
(2 rows)
5001+
5002+
select * from test_regex('a(?=.)c', 'ac', 'HP');
5003+
test_regex
5004+
-----------------------------------
5005+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
5006+
{ac}
5007+
(2 rows)
5008+
5009+
select * from test_regex('a(?=.).*(?=3)3*', 'azz33', 'HP');
5010+
test_regex
5011+
-----------------------------------
5012+
{0,REG_ULOOKAROUND,REG_UNONPOSIX}
5013+
{azz33}
5014+
(2 rows)
5015+
5016+
select * from test_regex('a(?=\w)\w*(?=.).*', 'az3%', 'HLP');
5017+
test_regex
5018+
-----------------------------------------------
5019+
{0,REG_ULOOKAROUND,REG_UNONPOSIX,REG_ULOCALE}
5020+
{az3%}
5021+
(2 rows)
5022+
5023+
-- These exercise the bulk-arc-movement paths in moveins() and moveouts();
5024+
-- you may need to make them longer if you change BULK_ARC_OP_USE_SORT()
5025+
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ(?:\w|a|b|c|d|e|f|0|1|2|3|4|5|6|Q)',
5026+
'ABCDEFGHIJKLMNOPQRSTUVWXYZ3', 'LP');
5027+
test_regex
5028+
-------------------------------
5029+
{0,REG_UNONPOSIX,REG_ULOCALE}
5030+
{ABCDEFGHIJKLMNOPQRSTUVWXYZ3}
5031+
(2 rows)
5032+
5033+
select * from test_regex('ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789(\Y\Y)+',
5034+
'ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789Z', 'LP');
5035+
test_regex
5036+
-------------------------------------------
5037+
{1,REG_UNONPOSIX,REG_ULOCALE}
5038+
{ABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789,""}
5039+
(2 rows)
5040+
5041+
select * from test_regex('((x|xabcdefghijklmnopqrstuvwxyz0123456789)x*|[^y]z)$',
5042+
'az', '');
5043+
test_regex
5044+
--------------
5045+
{2}
5046+
{az,az,NULL}
5047+
(2 rows)
5048+

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp