Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit53b5a8c

Browse files
committed
Fix contrib/pg_trgm's extraction of trigrams from regular expressions.
The logic for removing excess trigrams from the result was faulty.It intends to avoid merging the initial and final states of the NFA,which is necessary, but in testing whether removal of a specific trigramwould cause that, it failed to consider the combined effects of all thestate merges that that trigram's removal would cause. This could resultin a broken final graph that would never match anything, leading to GINor GiST indexscans not finding anything.To fix, add a "tentParent" field that is used only within this loop,and set it to show state merges that we are tentatively going to do.While examining a particular arc, we must chase up through tentParentlinks as well as regular parent links (the former can only appear atopthe latter), and we must account for state init/fin flag merges thathaven't actually been done yet.To simplify the latter, combine the separate init and fin bool fieldsinto a bitmap flags field. I also chose to get rid of the "children"state list, which seems entirely inessential.Per bug #14563 from Alexey Isayko, which the added test cases are based on.Back-patch to 9.3 where this code was added.Report:https://postgr.es/m/20170222111446.1256.67547@wrigleys.postgresql.orgDiscussion:https://postgr.es/m/8816.1487787594@sss.pgh.pa.us
1 parent3f613c6 commit53b5a8c

File tree

3 files changed

+91
-37
lines changed

3 files changed

+91
-37
lines changed

‎contrib/pg_trgm/expected/pg_trgm.out

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1194,6 +1194,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
11941194
0.5 | qwertyu0987
11951195
(2 rows)
11961196

1197+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
1198+
count
1199+
-------
1200+
1000
1201+
(1 row)
1202+
11971203
create index trgm_idx on test_trgm using gist (t gist_trgm_ops);
11981204
set enable_seqscan=off;
11991205
select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu0988' order by sml desc, t;
@@ -2338,6 +2344,12 @@ select t <-> 'q0987wertyu0988', t from test_trgm order by t <-> 'q0987wertyu0988
23382344
0.5 | qwertyu0987
23392345
(2 rows)
23402346

2347+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
2348+
count
2349+
-------
2350+
1000
2351+
(1 row)
2352+
23412353
drop index trgm_idx;
23422354
create index trgm_idx on test_trgm using gin (t gin_trgm_ops);
23432355
set enable_seqscan=off;
@@ -3467,6 +3479,12 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
34673479
qwertyu0988 | 0.333333
34683480
(1 row)
34693481

3482+
select count(*) from test_trgm where t ~ '[qwerty]{2}-?[qwerty]{2}';
3483+
count
3484+
-------
3485+
1000
3486+
(1 row)
3487+
34703488
create table test2(t text);
34713489
insert into test2 values ('abcdef');
34723490
insert into test2 values ('quark');

‎contrib/pg_trgm/sql/pg_trgm.sql

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,6 +21,7 @@ select t,similarity(t,'qwertyu0988') as sml from test_trgm where t % 'qwertyu098
2121
select t,similarity(t,'gwertyu0988')as smlfrom test_trgmwhere t %'gwertyu0988'order by smldesc, t;
2222
select t,similarity(t,'gwertyu1988')as smlfrom test_trgmwhere t %'gwertyu1988'order by smldesc, t;
2323
select t<->'q0987wertyu0988', tfrom test_trgmorder by t<->'q0987wertyu0988'limit2;
24+
selectcount(*)from test_trgmwhere t ~'[qwerty]{2}-?[qwerty]{2}';
2425

2526
createindextrgm_idxon test_trgm using gist (t gist_trgm_ops);
2627
set enable_seqscan=off;
@@ -31,6 +32,7 @@ select t,similarity(t,'gwertyu1988') as sml from test_trgm where t % 'gwertyu198
3132
explain (costs off)
3233
select t<->'q0987wertyu0988', tfrom test_trgmorder by t<->'q0987wertyu0988'limit2;
3334
select t<->'q0987wertyu0988', tfrom test_trgmorder by t<->'q0987wertyu0988'limit2;
35+
selectcount(*)from test_trgmwhere t ~'[qwerty]{2}-?[qwerty]{2}';
3436

3537
dropindex trgm_idx;
3638
createindextrgm_idxon test_trgm using gin (t gin_trgm_ops);
@@ -39,6 +41,7 @@ set enable_seqscan=off;
3941
select t,similarity(t,'qwertyu0988')as smlfrom test_trgmwhere t %'qwertyu0988'order by smldesc, t;
4042
select t,similarity(t,'gwertyu0988')as smlfrom test_trgmwhere t %'gwertyu0988'order by smldesc, t;
4143
select t,similarity(t,'gwertyu1988')as smlfrom test_trgmwhere t %'gwertyu1988'order by smldesc, t;
44+
selectcount(*)from test_trgmwhere t ~'[qwerty]{2}-?[qwerty]{2}';
4245

4346
createtabletest2(ttext);
4447
insert into test2values ('abcdef');

‎contrib/pg_trgm/trgm_regexp.c

Lines changed: 70 additions & 37 deletions
Original file line numberDiff line numberDiff line change
@@ -287,21 +287,22 @@ typedef struct
287287
*arcs - outgoing arcs of this state (List of TrgmArc)
288288
*enterKeys - enter keys reachable from this state without reading any
289289
* predictable trigram (List of TrgmStateKey)
290-
*fin - flag indicating this state is final
291-
*init - flag indicating this state is initial
290+
*flags - flag bits
292291
*parent - parent state, if this state has been merged into another
293-
*children -child states (states that have been merged into this one)
292+
*tentParent -planned parent state, if considering a merge
294293
*number - number of this state (used at the packaging stage)
295294
*/
295+
#defineTSTATE_INIT0x01/* flag indicating this state is initial */
296+
#defineTSTATE_FIN0x02/* flag indicating this state is final */
297+
296298
typedefstructTrgmState
297299
{
298300
TrgmStateKeystateKey;/* hashtable key: must be first field */
299301
List*arcs;
300302
List*enterKeys;
301-
boolfin;
302-
boolinit;
303+
intflags;
303304
structTrgmState*parent;
304-
List*children;
305+
structTrgmState*tentParent;
305306
intnumber;
306307
}TrgmState;
307308

@@ -569,7 +570,7 @@ createTrgmNFAInternal(regex_t *regex, TrgmPackedGraph **graph,
569570
* get from the initial state to the final state without reading any
570571
* predictable trigram.
571572
*/
572-
if (trgmNFA.initState->fin)
573+
if (trgmNFA.initState->flags&TSTATE_FIN)
573574
returnNULL;
574575

575576
/*
@@ -896,7 +897,7 @@ transformGraph(TrgmNFA *trgmNFA)
896897
initkey.nstate=pg_reg_getinitialstate(trgmNFA->regex);
897898

898899
initstate=getState(trgmNFA,&initkey);
899-
initstate->init= true;
900+
initstate->flags |=TSTATE_INIT;
900901
trgmNFA->initState=initstate;
901902

902903
/*
@@ -914,7 +915,7 @@ transformGraph(TrgmNFA *trgmNFA)
914915
* actual processing.
915916
*/
916917
if (trgmNFA->overflowed)
917-
state->fin= true;
918+
state->flags |=TSTATE_FIN;
918919
else
919920
processState(trgmNFA,state);
920921

@@ -939,7 +940,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
939940
* queue is empty. But we can quit if the state gets marked final.
940941
*/
941942
addKey(trgmNFA,state,&state->stateKey);
942-
while (trgmNFA->keysQueue!=NIL&& !state->fin)
943+
while (trgmNFA->keysQueue!=NIL&& !(state->flags&TSTATE_FIN))
943944
{
944945
TrgmStateKey*key= (TrgmStateKey*)linitial(trgmNFA->keysQueue);
945946

@@ -951,7 +952,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
951952
* Add outgoing arcs only if state isn't final (we have no interest in
952953
* outgoing arcs if we already match)
953954
*/
954-
if (!state->fin)
955+
if (!(state->flags&TSTATE_FIN))
955956
addArcs(trgmNFA,state);
956957
}
957958

@@ -960,7 +961,7 @@ processState(TrgmNFA *trgmNFA, TrgmState *state)
960961
* whether this should result in any further enter keys being added.
961962
* If so, add those keys to keysQueue so that processState will handle them.
962963
*
963-
* If the enter key is for the NFA's final state,set state->fin = TRUE.
964+
* If the enter key is for the NFA's final state,mark state as TSTATE_FIN.
964965
* This situation means that we can reach the final state from this expanded
965966
* state without reading any predictable trigram, so we must consider this
966967
* state as an accepting one.
@@ -1030,7 +1031,7 @@ addKey(TrgmNFA *trgmNFA, TrgmState *state, TrgmStateKey *key)
10301031
/* If state is now known final, mark it and we're done */
10311032
if (key->nstate==pg_reg_getfinalstate(trgmNFA->regex))
10321033
{
1033-
state->fin= true;
1034+
state->flags |=TSTATE_FIN;
10341035
return;
10351036
}
10361037

@@ -1356,10 +1357,9 @@ getState(TrgmNFA *trgmNFA, TrgmStateKey *key)
13561357
/* New state: initialize and queue it */
13571358
state->arcs=NIL;
13581359
state->enterKeys=NIL;
1359-
state->init= false;
1360-
state->fin= false;
1360+
state->flags=0;
13611361
state->parent=NULL;
1362-
state->children=NIL;
1362+
state->tentParent=NULL;
13631363
state->number=-1;
13641364

13651365
trgmNFA->queue=lappend(trgmNFA->queue,state);
@@ -1538,20 +1538,60 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15381538
TrgmArcInfo*arcInfo= (TrgmArcInfo*)lfirst(cell);
15391539
TrgmState*source=arcInfo->source,
15401540
*target=arcInfo->target;
1541+
intsource_flags,
1542+
target_flags;
15411543

15421544
/* examine parent states, if any merging has already happened */
15431545
while (source->parent)
15441546
source=source->parent;
15451547
while (target->parent)
15461548
target=target->parent;
15471549

1548-
if ((source->init||target->init)&&
1549-
(source->fin||target->fin))
1550+
/* we must also consider merges we are planning right now */
1551+
source_flags=source->flags;
1552+
while (source->tentParent)
1553+
{
1554+
source=source->tentParent;
1555+
source_flags |=source->flags;
1556+
}
1557+
target_flags=target->flags;
1558+
while (target->tentParent)
1559+
{
1560+
target=target->tentParent;
1561+
target_flags |=target->flags;
1562+
}
1563+
1564+
/* would fully-merged state have both INIT and FIN set? */
1565+
if (((source_flags |target_flags)& (TSTATE_INIT |TSTATE_FIN))==
1566+
(TSTATE_INIT |TSTATE_FIN))
15501567
{
15511568
canRemove= false;
15521569
break;
15531570
}
1571+
1572+
/* ok so far, so remember planned merge */
1573+
if (source!=target)
1574+
target->tentParent=source;
15541575
}
1576+
1577+
/* We must clear all the tentParent fields before continuing */
1578+
foreach(cell,trgmInfo->arcs)
1579+
{
1580+
TrgmArcInfo*arcInfo= (TrgmArcInfo*)lfirst(cell);
1581+
TrgmState*target=arcInfo->target;
1582+
TrgmState*ttarget;
1583+
1584+
while (target->parent)
1585+
target=target->parent;
1586+
1587+
while ((ttarget=target->tentParent)!=NULL)
1588+
{
1589+
target->tentParent=NULL;
1590+
target=ttarget;
1591+
}
1592+
}
1593+
1594+
/* Now, move on if we can't drop this trigram */
15551595
if (!canRemove)
15561596
continue;
15571597

@@ -1567,7 +1607,12 @@ selectColorTrigrams(TrgmNFA *trgmNFA)
15671607
while (target->parent)
15681608
target=target->parent;
15691609
if (source!=target)
1610+
{
15701611
mergeStates(source,target);
1612+
/* Assert we didn't merge initial and final states */
1613+
Assert((source->flags& (TSTATE_INIT |TSTATE_FIN))!=
1614+
(TSTATE_INIT |TSTATE_FIN));
1615+
}
15711616
}
15721617

15731618
/* Mark trigram unexpanded, and update totalTrgmCount */
@@ -1709,27 +1754,15 @@ fillTrgm(trgm *ptrgm, trgm_mb_char s[3])
17091754
staticvoid
17101755
mergeStates(TrgmState*state1,TrgmState*state2)
17111756
{
1712-
ListCell*cell;
1713-
17141757
Assert(state1!=state2);
17151758
Assert(!state1->parent);
17161759
Assert(!state2->parent);
17171760

1718-
/* state1 absorbs state2's init/fin flags */
1719-
state1->init |=state2->init;
1720-
state1->fin |=state2->fin;
1761+
/* state1 absorbs state2's flags */
1762+
state1->flags |=state2->flags;
17211763

1722-
/* state2, and all its children, become children of state1 */
1723-
foreach(cell,state2->children)
1724-
{
1725-
TrgmState*state= (TrgmState*)lfirst(cell);
1726-
1727-
state->parent=state1;
1728-
}
1764+
/* state2, and indirectly all its children, become children of state1 */
17291765
state2->parent=state1;
1730-
state1->children=list_concat(state1->children,state2->children);
1731-
state1->children=lappend(state1->children,state2);
1732-
state2->children=NIL;
17331766
}
17341767

17351768
/*
@@ -1798,9 +1831,9 @@ packGraph(TrgmNFA *trgmNFA, MemoryContext rcontext)
17981831

17991832
if (state->number<0)
18001833
{
1801-
if (state->init)
1834+
if (state->flags&TSTATE_INIT)
18021835
state->number=0;
1803-
elseif (state->fin)
1836+
elseif (state->flags&TSTATE_FIN)
18041837
state->number=1;
18051838
else
18061839
{
@@ -2064,9 +2097,9 @@ printTrgmNFA(TrgmNFA *trgmNFA)
20642097
ListCell*cell;
20652098

20662099
appendStringInfo(&buf,"s%p", (void*)state);
2067-
if (state->fin)
2100+
if (state->flags&TSTATE_FIN)
20682101
appendStringInfo(&buf," [shape = doublecircle]");
2069-
if (state->init)
2102+
if (state->flags&TSTATE_INIT)
20702103
initstate=state;
20712104
appendStringInfo(&buf," [label = \"%d\"]",state->stateKey.nstate);
20722105
appendStringInfo(&buf,";\n");

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp