|
7 | 7 | * |
8 | 8 | * |
9 | 9 | * IDENTIFICATION |
10 | | - * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.8 2010/07/31 03:27:40 tgl Exp $ |
| 10 | + * $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.9 2010/08/01 21:31:08 tgl Exp $ |
11 | 11 | * |
12 | 12 | *------------------------------------------------------------------------- |
13 | 13 | */ |
@@ -257,93 +257,147 @@ mcelem_tsquery_selec(TSQuery query, Datum *mcelem, int nmcelem, |
257 | 257 | * |
258 | 258 | * 1 - select(oper) in NOT nodes |
259 | 259 | * |
260 | | - * freq[val] in VAL nodes, if the value is in MCELEM |
| 260 | + * histogram-based estimation in prefix VAL nodes |
| 261 | + * |
| 262 | + * freq[val] in exact VAL nodes, if the value is in MCELEM |
261 | 263 | * min(freq[MCELEM]) / 2 in VAL nodes, if it is not |
262 | 264 | * |
263 | 265 | * The MCELEM array is already sorted (see ts_typanalyze.c), so we can use |
264 | 266 | * binary search for determining freq[MCELEM]. |
265 | 267 | * |
266 | 268 | * If we don't have stats for the tsvector, we still use this logic, |
267 | | - * except wealwaysuseDEFAULT_TS_MATCH_SELfor VAL nodes. This case |
268 | | - *is signaledby lookup == NULL. |
| 269 | + * except we usedefault estimatesfor VAL nodes. This case is signaled |
| 270 | + * by lookup == NULL. |
269 | 271 | */ |
270 | 272 | staticSelectivity |
271 | 273 | tsquery_opr_selec(QueryItem*item,char*operand, |
272 | 274 | TextFreq*lookup,intlength,float4minfreq) |
273 | 275 | { |
274 | | -LexemeKeykey; |
275 | | -TextFreq*searchres; |
276 | | -Selectivityselec, |
277 | | -s1, |
278 | | -s2; |
| 276 | +Selectivityselec; |
279 | 277 |
|
280 | 278 | /* since this function recurses, it could be driven to stack overflow */ |
281 | 279 | check_stack_depth(); |
282 | 280 |
|
283 | 281 | if (item->type==QI_VAL) |
284 | 282 | { |
285 | 283 | QueryOperand*oper= (QueryOperand*)item; |
286 | | - |
287 | | -/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
288 | | -if (lookup==NULL) |
289 | | -return (Selectivity)DEFAULT_TS_MATCH_SEL; |
| 284 | +LexemeKeykey; |
290 | 285 |
|
291 | 286 | /* |
292 | 287 | * Prepare the key for bsearch(). |
293 | 288 | */ |
294 | 289 | key.lexeme=operand+oper->distance; |
295 | 290 | key.length=oper->length; |
296 | 291 |
|
297 | | -searchres= (TextFreq*)bsearch(&key,lookup,length, |
298 | | -sizeof(TextFreq), |
299 | | -compare_lexeme_textfreq); |
300 | | - |
301 | | -if (searchres) |
| 292 | +if (oper->prefix) |
302 | 293 | { |
| 294 | +/* Prefix match, ie the query item is lexeme:* */ |
| 295 | +Selectivitymatched, |
| 296 | +allmcvs; |
| 297 | +inti; |
| 298 | + |
| 299 | +/* |
| 300 | + * Our strategy is to scan through the MCV list and add up the |
| 301 | + * frequencies of the ones that match the prefix, thereby |
| 302 | + * assuming that the MCVs are representative of the whole lexeme |
| 303 | + * population in this respect. Compare histogram_selectivity(). |
| 304 | + * |
| 305 | + * This is only a good plan if we have a pretty fair number of |
| 306 | + * MCVs available; we set the threshold at 100. If no stats or |
| 307 | + * insufficient stats, arbitrarily use DEFAULT_TS_MATCH_SEL*4. |
| 308 | + */ |
| 309 | +if (lookup==NULL||length<100) |
| 310 | +return (Selectivity) (DEFAULT_TS_MATCH_SEL*4); |
| 311 | + |
| 312 | +matched=allmcvs=0; |
| 313 | +for (i=0;i<length;i++) |
| 314 | +{ |
| 315 | +TextFreq*t=lookup+i; |
| 316 | +inttlen=VARSIZE_ANY_EXHDR(t->element); |
| 317 | + |
| 318 | +if (tlen >=key.length&& |
| 319 | +strncmp(key.lexeme,VARDATA_ANY(t->element), |
| 320 | +key.length)==0) |
| 321 | +matched+=t->frequency; |
| 322 | +allmcvs+=t->frequency; |
| 323 | +} |
| 324 | + |
| 325 | +if (allmcvs>0)/* paranoia about zero divide */ |
| 326 | +selec=matched /allmcvs; |
| 327 | +else |
| 328 | +selec= (Selectivity) (DEFAULT_TS_MATCH_SEL*4); |
| 329 | + |
303 | 330 | /* |
304 | | - *The element is in MCELEM. Return precise selectivity (or at |
305 | | - *least as precise as ANALYZE could find out). |
| 331 | + *In any case, never believe that a prefix match has selectivity |
| 332 | + *less than DEFAULT_TS_MATCH_SEL. |
306 | 333 | */ |
307 | | -return (Selectivity)searchres->frequency; |
| 334 | +selec=Max(DEFAULT_TS_MATCH_SEL,selec); |
308 | 335 | } |
309 | 336 | else |
310 | 337 | { |
311 | | -/* |
312 | | - * The element is not in MCELEM. Punt, but assume that the |
313 | | - * selectivity cannot be more than minfreq / 2. |
314 | | - */ |
315 | | -return (Selectivity)Min(DEFAULT_TS_MATCH_SEL,minfreq /2); |
| 338 | +/* Regular exact lexeme match */ |
| 339 | +TextFreq*searchres; |
| 340 | + |
| 341 | +/* If no stats for the variable, use DEFAULT_TS_MATCH_SEL */ |
| 342 | +if (lookup==NULL) |
| 343 | +return (Selectivity)DEFAULT_TS_MATCH_SEL; |
| 344 | + |
| 345 | +searchres= (TextFreq*)bsearch(&key,lookup,length, |
| 346 | +sizeof(TextFreq), |
| 347 | +compare_lexeme_textfreq); |
| 348 | + |
| 349 | +if (searchres) |
| 350 | +{ |
| 351 | +/* |
| 352 | + * The element is in MCELEM. Return precise selectivity (or |
| 353 | + * at least as precise as ANALYZE could find out). |
| 354 | + */ |
| 355 | +selec=searchres->frequency; |
| 356 | +} |
| 357 | +else |
| 358 | +{ |
| 359 | +/* |
| 360 | + * The element is not in MCELEM. Punt, but assume that the |
| 361 | + * selectivity cannot be more than minfreq / 2. |
| 362 | + */ |
| 363 | +selec=Min(DEFAULT_TS_MATCH_SEL,minfreq /2); |
| 364 | +} |
316 | 365 | } |
317 | 366 | } |
318 | | - |
319 | | -/* Current TSQuery node is an operator */ |
320 | | -switch (item->qoperator.oper) |
| 367 | +else |
321 | 368 | { |
322 | | -caseOP_NOT: |
323 | | -selec=1.0-tsquery_opr_selec(item+1,operand, |
324 | | -lookup,length,minfreq); |
325 | | -break; |
326 | | - |
327 | | -caseOP_AND: |
328 | | -s1=tsquery_opr_selec(item+1,operand, |
329 | | -lookup,length,minfreq); |
330 | | -s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
331 | | -lookup,length,minfreq); |
332 | | -selec=s1*s2; |
333 | | -break; |
334 | | - |
335 | | -caseOP_OR: |
336 | | -s1=tsquery_opr_selec(item+1,operand, |
337 | | -lookup,length,minfreq); |
338 | | -s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
339 | | -lookup,length,minfreq); |
340 | | -selec=s1+s2-s1*s2; |
341 | | -break; |
342 | | - |
343 | | -default: |
344 | | -elog(ERROR,"unrecognized operator: %d",item->qoperator.oper); |
345 | | -selec=0;/* keep compiler quiet */ |
346 | | -break; |
| 369 | +/* Current TSQuery node is an operator */ |
| 370 | +Selectivitys1, |
| 371 | +s2; |
| 372 | + |
| 373 | +switch (item->qoperator.oper) |
| 374 | +{ |
| 375 | +caseOP_NOT: |
| 376 | +selec=1.0-tsquery_opr_selec(item+1,operand, |
| 377 | +lookup,length,minfreq); |
| 378 | +break; |
| 379 | + |
| 380 | +caseOP_AND: |
| 381 | +s1=tsquery_opr_selec(item+1,operand, |
| 382 | +lookup,length,minfreq); |
| 383 | +s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
| 384 | +lookup,length,minfreq); |
| 385 | +selec=s1*s2; |
| 386 | +break; |
| 387 | + |
| 388 | +caseOP_OR: |
| 389 | +s1=tsquery_opr_selec(item+1,operand, |
| 390 | +lookup,length,minfreq); |
| 391 | +s2=tsquery_opr_selec(item+item->qoperator.left,operand, |
| 392 | +lookup,length,minfreq); |
| 393 | +selec=s1+s2-s1*s2; |
| 394 | +break; |
| 395 | + |
| 396 | +default: |
| 397 | +elog(ERROR,"unrecognized operator: %d",item->qoperator.oper); |
| 398 | +selec=0;/* keep compiler quiet */ |
| 399 | +break; |
| 400 | +} |
347 | 401 | } |
348 | 402 |
|
349 | 403 | /* Clamp intermediate results to stay sane despite roundoff error */ |
|