|
7 | 7 | *
|
8 | 8 | *
|
9 | 9 | * IDENTIFICATION
|
10 |
| - * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ |
| 10 | + * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $ |
11 | 11 | *
|
12 | 12 | *-------------------------------------------------------------------------
|
13 | 13 | */
|
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem,
|
257 | 257 | *
|
258 | 258 | * 1 - select(oper) in NOT nodes
|
259 | 259 | *
|
260 |
| - * freq[val] in VAL nodes, if the value is in MCELEM |
| 260 | + * histogram-based estimation in prefix VAL nodes |
| 261 | + * |
| 262 | + * freq[val] in exact VAL nodes, if the value is in MCELEM |
261 | 263 | * min(freq[MCELEM]) / 2 in VAL nodes, if it is not
|
262 | 264 | *
|
263 | 265 | * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
|
264 | 266 | * binary search for determining freq[MCELEM].
|
265 | 267 | *
|
266 | 268 | * If we don't have stats for the tsvector, we still use this logic,
|
267 |
| - * except wealwaysuseDEFAULT_TS_MATCH_SELfor VAL nodes. This case |
268 |
| - *is signaledby lookup == NULL. |
| 269 | + * except we usedefault estimatesfor VAL nodes. This case is signaled |
| 270 | + * by lookup == NULL. |
269 | 271 | */
|
270 | 272 | staticSelectivity
|
271 | 273 | tsquery_opr_selec(QueryItem*item,char*operand,
|
272 | 274 | TextFreq*lookup,intlength,float4minfreq)
|
273 | 275 | {
|
274 |
| -LexemeKeykey; |
275 |
| -TextFreq*searchres; |
276 |
| -Selectivityselec, |
277 |
| -s1, |
278 |
| -s2; |
| 276 | +Selectivityselec; |
279 | 277 |
|
280 | 278 | /* since this function recurses, it could be driven to stack overflow */
|
281 | 279 | check_stack_depth();
|
282 | 280 |
|
283 | 281 | if (item->type==QI_VAL)
|
284 | 282 | {
|
285 | 283 | QueryOperand*oper= (QueryOperand*)item;
|
286 |
| - |
287 |
| -/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
288 |
| -if (lookup==NULL) |
289 |
| -return (Selectivity)DEFAULT_TS_MATCH_SEL; |
| 284 | +LexemeKeykey; |
290 | 285 |
|
291 | 286 | /*
|
292 | 287 | * Prepare the key for bsearch().
|
293 | 288 | */
|
294 | 289 | key.lexeme=operand+oper->distance;
|
295 | 290 | key.length=oper->length;
|
296 | 291 |
|
297 |
| -searchres= (TextFreq*)bsearch(&key,lookup,length, |
298 |
| -sizeof(TextFreq), |
299 |
| -compare_lexeme_textfreq); |
300 |
| - |
301 |
| -if (searchres) |
| 292 | +if (oper->prefix) |
302 | 293 | {
|
| 294 | +/* Prefix match, ie the query item is lexeme:* */ |
| 295 | +Selectivitymatched, |
| 296 | +allmcvs; |
| 297 | +inti; |
| 298 | + |
| 299 | +/* |
| 300 | + * Our strategy is to scan through the MCV list and add up the |
| 301 | + * frequencies of the ones that match the prefix, thereby |
| 302 | + * assuming that the MCVs are representative of the whole lexeme |
| 303 | + * population in this respect. Compare histogram_selectivity(). |
| 304 | + * |
| 305 | + * This is only a good plan if we have a pretty fair number of |
| 306 | + * MCVs available; we set the threshold at 100. If no stats or |
| 307 | + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. |
| 308 | + */ |
| 309 | +if (lookup==NULL||length<100) |
| 310 | +return (Selectivity) (DEFAULT_TS_MATCH_SEL*4); |
| 311 | + |
| 312 | +matched=allmcvs=0; |
| 313 | +for (i=0;i<length;i++) |
| 314 | +{ |
| 315 | +TextFreq*t=lookup+i; |
| 316 | +inttlen=VARSIZE_ANY_EXHDR(t->element); |
| 317 | + |
| 318 | +if (tlen >=key.length&& |
| 319 | +strncmp(key.lexeme,VARDATA_ANY(t->element), |
| 320 | +key.length)==0) |
| 321 | +matched+=t->frequency; |
| 322 | +allmcvs+=t->frequency; |
| 323 | +} |
| 324 | + |
| 325 | +if (allmcvs>0)/* paranoia about zero divide */ |
| 326 | +selec=matched /allmcvs; |
| 327 | +else |
| 328 | +selec= (Selectivity) (DEFAULT_TS_MATCH_SEL*4); |
| 329 | + |
303 | 330 | /*
|
304 |
| - *The element is in MCELEM. Return precise selectivity (or at |
305 |
| - *least as precise as ANALYZE could find out). |
| 331 | + *In any case, never believe that a prefix match has selectivity |
| 332 | + *less than DEFAULT_TS_MATCH_SEL. |
306 | 333 | */
|
307 |
| -return (Selectivity)searchres->frequency; |
| 334 | +selec=Max(DEFAULT_TS_MATCH_SEL,selec); |
308 | 335 | }
|
309 | 336 | else
|
310 | 337 | {
|
311 |
| -/* |
312 |
| - * The element is not in MCELEM. Punt, but assume that the |
313 |
| - * selectivity cannot be more than minfreq / 2. |
314 |
| - */ |
315 |
| -return (Selectivity)Min(DEFAULT_TS_MATCH_SEL,minfreq /2); |
| 338 | +/* Regular exact lexeme match */ |
| 339 | +TextFreq*searchres; |
| 340 | + |
| 341 | +/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
| 342 | +if (lookup==NULL) |
| 343 | +return (Selectivity)DEFAULT_TS_MATCH_SEL; |
| 344 | + |
| 345 | +searchres= (TextFreq*)bsearch(&key,lookup,length, |
| 346 | +sizeof(TextFreq), |
| 347 | +compare_lexeme_textfreq); |
| 348 | + |
| 349 | +if (searchres) |
| 350 | +{ |
| 351 | +/* |
| 352 | + * The element is in MCELEM. Return precise selectivity (or |
| 353 | + * at least as precise as ANALYZE could find out). |
| 354 | + */ |
| 355 | +selec=searchres->frequency; |
| 356 | +} |
| 357 | +else |
| 358 | +{ |
| 359 | +/* |
| 360 | + * The element is not in MCELEM. Punt, but assume that the |
| 361 | + * selectivity cannot be more than minfreq / 2. |
| 362 | + */ |
| 363 | +selec=Min(DEFAULT_TS_MATCH_SEL,minfreq /2); |
| 364 | +} |
316 | 365 | }
|
317 | 366 | }
|
318 |
| - |
319 |
| -/* Current TSQuery node is an operator */ |
320 |
| -switch (item->qoperator.oper) |
| 367 | +else |
321 | 368 | {
|
322 |
| -caseOP_NOT: |
323 |
| -selec=1.0-tsquery_opr_selec(item+1,operand, |
324 |
| -lookup,length,minfreq); |
325 |
| -break; |
326 |
| - |
327 |
| -caseOP_AND: |
328 |
| -s1=tsquery_opr_selec(item+1,operand, |
329 |
| -lookup,length,minfreq); |
330 |
| -s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
331 |
| -lookup,length,minfreq); |
332 |
| -selec=s1*s2; |
333 |
| -break; |
334 |
| - |
335 |
| -caseOP_OR: |
336 |
| -s1=tsquery_opr_selec(item+1,operand, |
337 |
| -lookup,length,minfreq); |
338 |
| -s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
339 |
| -lookup,length,minfreq); |
340 |
| -selec=s1+s2-s1*s2; |
341 |
| -break; |
342 |
| - |
343 |
| -default: |
344 |
| -elog(ERROR,"unrecognized operator: %d",item->qoperator.oper); |
345 |
| -selec=0;/* keep compiler quiet */ |
346 |
| -break; |
| 369 | +/* Current TSQuery node is an operator */ |
| 370 | +Selectivitys1, |
| 371 | +s2; |
| 372 | + |
| 373 | +switch (item->qoperator.oper) |
| 374 | +{ |
| 375 | +caseOP_NOT: |
| 376 | +selec=1.0-tsquery_opr_selec(item+1,operand, |
| 377 | +lookup,length,minfreq); |
| 378 | +break; |
| 379 | + |
| 380 | +caseOP_AND: |
| 381 | +s1=tsquery_opr_selec(item+1,operand, |
| 382 | +lookup,length,minfreq); |
| 383 | +s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
| 384 | +lookup,length,minfreq); |
| 385 | +selec=s1*s2; |
| 386 | +break; |
| 387 | + |
| 388 | +caseOP_OR: |
| 389 | +s1=tsquery_opr_selec(item+1,operand, |
| 390 | +lookup,length,minfreq); |
| 391 | +s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
| 392 | +lookup,length,minfreq); |
| 393 | +selec=s1+s2-s1*s2; |
| 394 | +break; |
| 395 | + |
| 396 | +default: |
| 397 | +elog(ERROR,"unrecognized operator: %d",item->qoperator.oper); |
| 398 | +selec=0;/* keep compiler quiet */ |
| 399 | +break; |
| 400 | +} |
347 | 401 | }
|
348 | 402 |
|
349 | 403 | /* Clamp intermediate results to stay sane despite roundoff error */
|
|