Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit97532f7

Browse files
committed
Add some knowledge about prefix matches to tsmatchsel(). It's not terribly
bright, but it beats assuming that a prefix match behaves identically to anexact match, which is what the code was doing before :-(. Noted whileexperimenting with Artur Dobrowski's example.
1 parentd4fe61b commit97532f7

File tree

1 file changed

+108
-54
lines changed

1 file changed

+108
-54
lines changed

‎src/backend/tsearch/ts_selfuncs.c

Lines changed: 108 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@
77
*
88
*
99
* IDENTIFICATION
10-
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $
1111
*
1212
*-------------------------------------------------------------------------
1313
*/
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
257257
*
258258
* 1 - select(oper) in NOT nodes
259259
*
260-
* freq[val] in VAL nodes, if the value is in MCELEM
260+
* histogram-based estimation in prefix VAL nodes
261+
*
262+
* freq[val] in exact VAL nodes, if the value is in MCELEM
261263
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
262264
*
263265
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
264266
* binary search for determining freq[MCELEM].
265267
*
266268
* If we don't have stats for the tsvector, we still use this logic,
267-
* except wealwaysuseDEFAULT_TS_MATCH_SELfor VAL nodes. This case
268-
*is signaledby lookup == NULL.
269+
* except we usedefault estimatesfor VAL nodes. This case is signaled
270+
* by lookup == NULL.
269271
*/
270272
staticSelectivity
271273
tsquery_opr_selec(QueryItem*item,char*operand,
272274
TextFreq*lookup,intlength,float4minfreq)
273275
{
274-
LexemeKeykey;
275-
TextFreq*searchres;
276-
Selectivityselec,
277-
s1,
278-
s2;
276+
Selectivityselec;
279277

280278
/* since this function recurses, it could be driven to stack overflow */
281279
check_stack_depth();
282280

283281
if (item->type==QI_VAL)
284282
{
285283
QueryOperand*oper= (QueryOperand*)item;
286-
287-
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
288-
if (lookup==NULL)
289-
return (Selectivity)DEFAULT_TS_MATCH_SEL;
284+
LexemeKeykey;
290285

291286
/*
292287
* Prepare the key for bsearch().
293288
*/
294289
key.lexeme=operand+oper->distance;
295290
key.length=oper->length;
296291

297-
searchres= (TextFreq*)bsearch(&key,lookup,length,
298-
sizeof(TextFreq),
299-
compare_lexeme_textfreq);
300-
301-
if (searchres)
292+
if (oper->prefix)
302293
{
294+
/* Prefix match, ie the query item is lexeme:* */
295+
Selectivitymatched,
296+
allmcvs;
297+
inti;
298+
299+
/*
300+
* Our strategy is to scan through the MCV list and add up the
301+
* frequencies of the ones that match the prefix, thereby
302+
* assuming that the MCVs are representative of the whole lexeme
303+
* population in this respect. Compare histogram_selectivity().
304+
*
305+
* This is only a good plan if we have a pretty fair number of
306+
* MCVs available; we set the threshold at 100. If no stats or
307+
* insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4.
308+
*/
309+
if (lookup==NULL||length<100)
310+
return (Selectivity) (DEFAULT_TS_MATCH_SEL*4);
311+
312+
matched=allmcvs=0;
313+
for (i=0;i<length;i++)
314+
{
315+
TextFreq*t=lookup+i;
316+
inttlen=VARSIZE_ANY_EXHDR(t->element);
317+
318+
if (tlen >=key.length&&
319+
strncmp(key.lexeme,VARDATA_ANY(t->element),
320+
key.length)==0)
321+
matched+=t->frequency;
322+
allmcvs+=t->frequency;
323+
}
324+
325+
if (allmcvs>0)/* paranoia about zero divide */
326+
selec=matched /allmcvs;
327+
else
328+
selec= (Selectivity) (DEFAULT_TS_MATCH_SEL*4);
329+
303330
/*
304-
*The element is in MCELEM. Return precise selectivity (or at
305-
*least as precise as ANALYZE could find out).
331+
*In any case, never believe that a prefix match has selectivity
332+
*less than DEFAULT_TS_MATCH_SEL.
306333
*/
307-
return (Selectivity)searchres->frequency;
334+
selec=Max(DEFAULT_TS_MATCH_SEL,selec);
308335
}
309336
else
310337
{
311-
/*
312-
* The element is not in MCELEM. Punt, but assume that the
313-
* selectivity cannot be more than minfreq / 2.
314-
*/
315-
return (Selectivity)Min(DEFAULT_TS_MATCH_SEL,minfreq /2);
338+
/* Regular exact lexeme match */
339+
TextFreq*searchres;
340+
341+
/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */
342+
if (lookup==NULL)
343+
return (Selectivity)DEFAULT_TS_MATCH_SEL;
344+
345+
searchres= (TextFreq*)bsearch(&key,lookup,length,
346+
sizeof(TextFreq),
347+
compare_lexeme_textfreq);
348+
349+
if (searchres)
350+
{
351+
/*
352+
* The element is in MCELEM. Return precise selectivity (or
353+
* at least as precise as ANALYZE could find out).
354+
*/
355+
selec=searchres->frequency;
356+
}
357+
else
358+
{
359+
/*
360+
* The element is not in MCELEM. Punt, but assume that the
361+
* selectivity cannot be more than minfreq / 2.
362+
*/
363+
selec=Min(DEFAULT_TS_MATCH_SEL,minfreq /2);
364+
}
316365
}
317366
}
318-
319-
/* Current TSQuery node is an operator */
320-
switch (item->qoperator.oper)
367+
else
321368
{
322-
caseOP_NOT:
323-
selec=1.0-tsquery_opr_selec(item+1,operand,
324-
lookup,length,minfreq);
325-
break;
326-
327-
caseOP_AND:
328-
s1=tsquery_opr_selec(item+1,operand,
329-
lookup,length,minfreq);
330-
s2=tsquery_opr_selec(item+item->qoperator.left,operand,
331-
lookup,length,minfreq);
332-
selec=s1*s2;
333-
break;
334-
335-
caseOP_OR:
336-
s1=tsquery_opr_selec(item+1,operand,
337-
lookup,length,minfreq);
338-
s2=tsquery_opr_selec(item+item->qoperator.left,operand,
339-
lookup,length,minfreq);
340-
selec=s1+s2-s1*s2;
341-
break;
342-
343-
default:
344-
elog(ERROR,"unrecognized operator: %d",item->qoperator.oper);
345-
selec=0;/* keep compiler quiet */
346-
break;
369+
/* Current TSQuery node is an operator */
370+
Selectivitys1,
371+
s2;
372+
373+
switch (item->qoperator.oper)
374+
{
375+
caseOP_NOT:
376+
selec=1.0-tsquery_opr_selec(item+1,operand,
377+
lookup,length,minfreq);
378+
break;
379+
380+
caseOP_AND:
381+
s1=tsquery_opr_selec(item+1,operand,
382+
lookup,length,minfreq);
383+
s2=tsquery_opr_selec(item+item->qoperator.left,operand,
384+
lookup,length,minfreq);
385+
selec=s1*s2;
386+
break;
387+
388+
caseOP_OR:
389+
s1=tsquery_opr_selec(item+1,operand,
390+
lookup,length,minfreq);
391+
s2=tsquery_opr_selec(item+item->qoperator.left,operand,
392+
lookup,length,minfreq);
393+
selec=s1+s2-s1*s2;
394+
break;
395+
396+
default:
397+
elog(ERROR,"unrecognized operator: %d",item->qoperator.oper);
398+
selec=0;/* keep compiler quiet */
399+
break;
400+
}
347401
}
348402

349403
/* Clamp intermediate results to stay sane despite roundoff error */

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp