Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit c9b0c67

Browse files
committed
Fix default text search parser's ts_headline code for phrase queries.
This code could produce very poor results when asked to highlight a string based on a query using phrase-match operators. The root cause is that hlCover(), which is supposed to find a minimal substring that matches the query, was written assuming that word position is not significant. I'm only 95% convinced that its algorithm was correct even for plain AND/OR queries; but it definitely fails completely for phrase matches, causing it to possibly not identify a cover string at all. Hence, rewrite hlCover() with a less-tense algorithm that just tries all the possible substrings, earlier and shorter ones first. (This is not as bad as it sounds performance-wise, because all of the string matching has been done already: the repeated tsquery match checks boil down to pointer comparisons.) Unfortunately, since that approach produces more candidate cover strings than before, it also exposes that there were bugs in the heuristics in mark_hl_words() for selecting a best cover string. Fixes there include: * Do not apply the ShortWord filter to words that appear in the query. * Remove a misguided optimization for quickly rejecting a cover. * Fix order-of-operation bug that could cause computation of a wrong figure of merit (poslen) when shortening a cover. * Change the preference rule so that candidate headlines that do not include their whole cover string (after MaxWords trimming) are lowest priority, since they may not actually satisfy the user's query. This results in some changes in existing regression test cases, but they all seem reasonable. Note in particular that the tests involving strings like "1 2 3" were previously being affected by the ShortWord filter, masking the normal matching behavior. Per bug #16345 from Augustinas Jokubauskas; the new test cases are based on that example. Back-patch to 9.6 where phrase search was added to tsquery. Discussion: https://postgr.es/m/16345-2e0cf5cddbdcd3b4@postgresql.org
1 parent b10f8bb commit c9b0c67

File tree

3 files changed

+141
-86
lines changed

3 files changed

+141
-86
lines changed

‎src/backend/tsearch/wparser_def.c

Lines changed: 99 additions & 74 deletions
Original file line numberDiff line numberDiff line change
@@ -1942,9 +1942,10 @@ prsd_end(PG_FUNCTION_ARGS)
19421942
#defineINTERESTINGWORD(j) \
19431943
(prs->words[j].item && !prs->words[j].repeated)
19441944

1945-
/* Don't want to end at a non-word or a short word */
1945+
/* Don't want to end at a non-word or a short word, unless interesting */
19461946
#defineBADENDPOINT(j) \
1947-
(NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword)
1947+
((NOENDTOKEN(prs->words[j].type) || prs->words[j].len <= shortword) && \
1948+
!INTERESTINGWORD(j))
19481949

19491950
typedefstruct
19501951
{
@@ -2003,75 +2004,97 @@ checkcondition_HL(void *opaque, QueryOperand *val, ExecPhraseData *data)
20032004
return false;
20042005
}
20052006

2006-
2007-
staticbool
2008-
hlCover(HeadlineParsedText*prs,TSQueryquery,int*p,int*q)
2007+
/*
2008+
* hlFirstIndex: find first index >= pos containing any word used in query
2009+
*
2010+
* Returns -1 if no such index
2011+
*/
2012+
staticint
2013+
hlFirstIndex(HeadlineParsedText*prs,TSQueryquery,intpos)
20092014
{
2010-
inti,
2011-
j;
2012-
QueryItem*item=GETQUERY(query);
2013-
intpos=*p;
2014-
2015-
*q=-1;
2016-
*p=INT_MAX;
2015+
inti;
20172016

2018-
for (j=0;j<query->size;j++)
2017+
/* For each word ... */
2018+
for (i=pos;i<prs->curwords;i++)
20192019
{
2020-
if (item->type!=QI_VAL)
2020+
/* ... scan the query to see if this word matches any operand */
2021+
QueryItem*item=GETQUERY(query);
2022+
intj;
2023+
2024+
for (j=0;j<query->size;j++)
20212025
{
2026+
if (item->type==QI_VAL&&
2027+
prs->words[i].item==&item->qoperand)
2028+
returni;
20222029
item++;
2023-
continue;
2024-
}
2025-
for (i=pos;i<prs->curwords;i++)
2026-
{
2027-
if (prs->words[i].item==&item->qoperand)
2028-
{
2029-
if (i>*q)
2030-
*q=i;
2031-
break;
2032-
}
20332030
}
2034-
item++;
20352031
}
2032+
return-1;
2033+
}
20362034

2037-
if (*q<0)
2038-
return false;
2035+
/*
2036+
* hlCover: try to find a substring of prs' word list that satisfies query
2037+
*
2038+
* At entry, *p must be the first word index to consider (initialize this to
2039+
* zero, or to the next index after a previous successful search).
2040+
*
2041+
* On success, sets *p to first word index and *q to last word index of the
2042+
* cover substring, and returns true.
2043+
*
2044+
* The result is a minimal cover, in the sense that both *p and *q will be
2045+
* words used in the query.
2046+
*/
2047+
staticbool
2048+
hlCover(HeadlineParsedText*prs,TSQueryquery,int*p,int*q)
2049+
{
2050+
intpmin,
2051+
pmax,
2052+
nextpmin,
2053+
nextpmax;
2054+
hlCheckch;
20392055

2040-
item=GETQUERY(query);
2041-
for (j=0;j<query->size;j++)
2056+
/*
2057+
* We look for the earliest, shortest substring of prs->words that
2058+
* satisfies the query. Both the pmin and pmax indices must be words
2059+
* appearing in the query; there's no point in trying endpoints in between
2060+
* such points.
2061+
*/
2062+
pmin=hlFirstIndex(prs,query,*p);
2063+
while (pmin >=0)
20422064
{
2043-
if (item->type!=QI_VAL)
2065+
/* This useless assignment just keeps stupider compilers quiet */
2066+
nextpmin=-1;
2067+
/* Consider substrings starting at pmin */
2068+
ch.words=&(prs->words[pmin]);
2069+
/* Consider the length-one substring first, then longer substrings */
2070+
pmax=pmin;
2071+
do
20442072
{
2045-
item++;
2046-
continue;
2047-
}
2048-
for (i=*q;i >=pos;i--)
2049-
{
2050-
if (prs->words[i].item==&item->qoperand)
2073+
/* Try to match query against pmin .. pmax substring */
2074+
ch.len=pmax-pmin+1;
2075+
if (TS_execute(GETQUERY(query),&ch,
2076+
TS_EXEC_EMPTY,checkcondition_HL))
20512077
{
2052-
if (i<*p)
2053-
*p=i;
2054-
break;
2078+
*p=pmin;
2079+
*q=pmax;
2080+
return true;
20552081
}
2056-
}
2057-
item++;
2058-
}
2059-
2060-
if (*p <=*q)
2061-
{
2062-
hlCheckch;
2082+
/* Nope, so advance pmax to next feasible endpoint */
2083+
nextpmax=hlFirstIndex(prs,query,pmax+1);
20632084

2064-
ch.words=&(prs->words[*p]);
2065-
ch.len=*q-*p+1;
2066-
if (TS_execute(GETQUERY(query),&ch,TS_EXEC_EMPTY,checkcondition_HL))
2067-
return true;
2068-
else
2069-
{
2070-
(*p)++;
2071-
returnhlCover(prs,query,p,q);
2085+
/*
2086+
* If this is our first advance past pmin, then the result is also
2087+
* the next feasible value of pmin; remember it to save a
2088+
* redundant search.
2089+
*/
2090+
if (pmax==pmin)
2091+
nextpmin=nextpmax;
2092+
pmax=nextpmax;
20722093
}
2094+
while (pmax >=0);
2095+
/* No luck here, so try next feasible startpoint */
2096+
pmin=nextpmin;
20732097
}
2074-
20752098
return false;
20762099
}
20772100

@@ -2357,11 +2380,12 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23572380
intbestb=-1,
23582381
beste=-1;
23592382
intbestlen=-1;
2383+
boolbestcover= false;
23602384
intpose,
23612385
posb,
23622386
poslen,
23632387
curlen;
2364-
2388+
boolposcover;
23652389
inti;
23662390

23672391
if (!highlightall)
@@ -2387,14 +2411,6 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
23872411
pose=i;
23882412
}
23892413

2390-
/* XXX this optimization seems unnecessary and wrong */
2391-
if (poslen<bestlen&& !BADENDPOINT(beste))
2392-
{
2393-
/* better cover already found, so try next cover */
2394-
p++;
2395-
continue;
2396-
}
2397-
23982414
if (curlen<max_words)
23992415
{
24002416
/*
@@ -2449,29 +2465,38 @@ mark_hl_words(HeadlineParsedText *prs, TSQuery query, bool highlightall,
24492465
i=q;
24502466
for (;curlen>min_words;i--)
24512467
{
2468+
if (!BADENDPOINT(i))
2469+
break;
24522470
if (!NONWORDTOKEN(prs->words[i].type))
24532471
curlen--;
24542472
if (INTERESTINGWORD(i))
24552473
poslen--;
2456-
pose=i;
2457-
if (!BADENDPOINT(i))
2458-
break;
2474+
pose=i-1;
24592475
}
24602476
}
24612477

24622478
/*
2463-
* Adopt this headline if it's the first, or if it has more
2464-
* interesting words and isn't ending at a bad endpoint, or if it
2465-
* replaces a bad endpoint with a good one (XXX even if it has
2466-
* fewer interesting words? Really?)
2479+
* Check whether the proposed headline includes the original
2480+
* cover; it might not if we trimmed it due to max_words.
2481+
*/
2482+
poscover= (posb <=p&&pose >=q);
2483+
2484+
/*
2485+
* Adopt this headline if it's better than the last one, giving
2486+
* highest priority to headlines including the cover, then to
2487+
* headlines with more interesting words, then to headlines with
2488+
* good stopping points. (Since bestlen is initially -1, we will
2489+
* certainly adopt the first headline.)
24672490
*/
2468-
if (bestlen<0||
2469-
(poslen>bestlen&& !BADENDPOINT(pose))||
2470-
(!BADENDPOINT(pose)&&BADENDPOINT(beste)))
2491+
if (poscover>bestcover||
2492+
(poscover==bestcover&&poslen>bestlen)||
2493+
(poscover==bestcover&&poslen==bestlen&&
2494+
!BADENDPOINT(pose)&&BADENDPOINT(beste)))
24712495
{
24722496
bestb=posb;
24732497
beste=pose;
24742498
bestlen=poslen;
2499+
bestcover=poscover;
24752500
}
24762501

24772502
/* move p to generate the next cover */

‎src/test/regress/expected/tsearch.out

Lines changed: 31 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -1301,12 +1301,12 @@ Water, water, every where,
13011301
Nor any drop to drink.
13021302
S. T. Coleridge (1772-1834)
13031303
', phraseto_tsquery('english', 'painted Ocean'));
1304-
ts_headline
1305-
----------------------------------
1306-
<b>painted</b><b>Ocean</b>. +
1307-
Water, water, every where+
1308-
And all the boards did shrink;+
1309-
Water, water, every
1304+
ts_headline
1305+
---------------------------------------
1306+
<b>painted</b>Ship +
1307+
Upon a <b>painted</b> <b>Ocean</b>.+
1308+
Water, water, every where+
1309+
And all the boards did shrink
13101310
(1 row)
13111311

13121312
SELECT ts_headline('english', '
@@ -1328,6 +1328,15 @@ S. T. Coleridge (1772-1834)
13281328
And all the boards
13291329
(1 row)
13301330

1331+
SELECT ts_headline('english',
1332+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
1333+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
1334+
'MaxWords=100, MinWords=1');
1335+
ts_headline
1336+
-------------------------------------------------------------------------------
1337+
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
1338+
(1 row)
1339+
13311340
SELECT ts_headline('english', '
13321341
<html>
13331342
<!-- some comment -->
@@ -1364,15 +1373,15 @@ SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=2, MinWords
13641373
(1 row)
13651374

13661375
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 & 3', 'MaxWords=4, MinWords=1');
1367-
ts_headline
1368-
------------------------------
1369-
<b>1</b> 2 <b>3</b> <b>1</b>
1376+
ts_headline
1377+
---------------------
1378+
<b>1</b> 2 <b>3</b>
13701379
(1 row)
13711380

13721381
SELECT ts_headline('simple', '1 2 3 1 3'::text, '1 <-> 3', 'MaxWords=4, MinWords=1');
1373-
ts_headline
1374-
-------------------
1375-
<b>1</b> <b>3</b>
1382+
ts_headline
1383+
----------------------------
1384+
<b>3</b> <b>1</b> <b>3</b>
13761385
(1 row)
13771386

13781387
--Check if headline fragments work
@@ -1467,6 +1476,16 @@ S. T. Coleridge (1772-1834)
14671476
S. T. <b>Coleridge</b>
14681477
(1 row)
14691478

1479+
--Fragments with phrase search
1480+
SELECT ts_headline('english',
1481+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
1482+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
1483+
'MaxFragments=100, MaxWords=100, MinWords=1');
1484+
ts_headline
1485+
-------------------------------------------------------------------------------
1486+
<b>Lorem</b> ipsum <b>urna</b>. Nullam nullam <b>ullamcorper</b> <b>urna</b>
1487+
(1 row)
1488+
14701489
--Rewrite sub system
14711490
CREATE TABLE test_tsquery (txtkeyword TEXT, txtsample TEXT);
14721491
\set ECHO none

‎src/test/regress/sql/tsearch.sql

Lines changed: 11 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -384,6 +384,11 @@ Water, water, every where,
384384
S. T. Coleridge (1772-1834)
385385
', phraseto_tsquery('english','idle as a painted Ship'));
386386

387+
SELECT ts_headline('english',
388+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
389+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
390+
'MaxWords=100, MinWords=1');
391+
387392
SELECT ts_headline('english','
388393
<html>
389394
<!-- some comment -->
@@ -454,6 +459,12 @@ Water, water, every where,
454459
S. T. Coleridge (1772-1834)
455460
', to_tsquery('english','Coleridge & stuck'),'MaxFragments=2,FragmentDelimiter=***');
456461

462+
--Fragments with phrase search
463+
SELECT ts_headline('english',
464+
'Lorem ipsum urna. Nullam nullam ullamcorper urna.',
465+
to_tsquery('english','Lorem') && phraseto_tsquery('english','ullamcorper urna'),
466+
'MaxFragments=100, MaxWords=100, MinWords=1');
467+
457468
--Rewrite sub system
458469

459470
CREATETABLEtest_tsquery (txtkeywordTEXT, txtsampleTEXT);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp