Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit4e57668

Browse files
committed
Create a selectivity estimation function for the text search @@ operator.
Jan Urbanski
1 parente2b7d0c commit4e57668

File tree

9 files changed

+463
-30
lines changed

9 files changed

+463
-30
lines changed

‎doc/src/sgml/catalogs.sgml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.174 2008/09/15 18:43:41 tgl Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/catalogs.sgml,v 2.175 2008/09/19 19:03:40 tgl Exp $ -->
22
<!--
33
Documentation of the system catalogs, directed toward PostgreSQL developers
44
-->
@@ -6664,6 +6664,9 @@
66646664
A list of the frequencies of the most common values or elements,
66656665
i.e., number of occurrences of each divided by total number of rows.
66666666
(NULL when <structfield>most_common_vals</structfield> is.)
6667+
For some datatypes such as <type>tsvector</>, it can also store some
6668+
additional information, making it longer than the
6669+
<structfield>most_common_vals</> array.
66676670
</entry>
66686671
</row>
66696672

‎src/backend/tsearch/Makefile

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,7 @@
44
#
55
# Copyright (c) 2006-2008, PostgreSQL Global Development Group
66
#
7-
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.7 2008/07/14 00:51:45 tgl Exp $
7+
# $PostgreSQL: pgsql/src/backend/tsearch/Makefile,v 1.8 2008/09/19 19:03:40 tgl Exp $
88
#
99
#-------------------------------------------------------------------------
1010
subdir = src/backend/tsearch
@@ -19,7 +19,7 @@ DICTFILES=synonym_sample.syn thesaurus_sample.ths hunspell_sample.affix \
1919
OBJS = ts_locale.o ts_parse.o wparser.o wparser_def.o dict.o\
2020
dict_simple.o dict_synonym.o dict_thesaurus.o\
2121
dict_ispell.o regis.o spell.o\
22-
to_tsany.o ts_typanalyze.o ts_utils.o
22+
to_tsany.ots_selfuncs.ots_typanalyze.o ts_utils.o
2323

2424
include$(top_srcdir)/src/backend/common.mk
2525

‎src/backend/tsearch/ts_selfuncs.c

Lines changed: 363 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,363 @@
1+
/*-------------------------------------------------------------------------
2+
*
3+
* ts_selfuncs.c
4+
* Selectivity estimation functions for text search operators.
5+
*
6+
* Portions Copyright (c) 1996-2008, PostgreSQL Global Development Group
7+
*
8+
*
9+
* IDENTIFICATION
10+
* $PostgreSQL: pgsql/src/backend/tsearch/ts_selfuncs.c,v 1.1 2008/09/19 19:03:40 tgl Exp $
11+
*
12+
*-------------------------------------------------------------------------
13+
*/
14+
#include"postgres.h"
15+
16+
#include"catalog/pg_statistic.h"
17+
#include"catalog/pg_type.h"
18+
#include"miscadmin.h"
19+
#include"nodes/nodes.h"
20+
#include"tsearch/ts_type.h"
21+
#include"utils/lsyscache.h"
22+
#include"utils/selfuncs.h"
23+
#include"utils/syscache.h"
24+
25+
26+
/*
27+
* The default text search selectivity is chosen to be small enough to
28+
* encourage indexscans for typical table densities. See selfuncs.h and
29+
* DEFAULT_EQ_SEL for details.
30+
*/
31+
#defineDEFAULT_TS_MATCH_SEL 0.005
32+
33+
/* lookup table type for binary searching through MCELEMs */
34+
typedefstruct
35+
{
36+
text*element;
37+
float4frequency;
38+
}TextFreq;
39+
40+
/* type of keys for bsearch'ing through an array of TextFreqs */
41+
typedefstruct
42+
{
43+
char*lexeme;
44+
intlength;
45+
}LexemeKey;
46+
47+
staticSelectivitytsquerysel(VariableStatData*vardata,Datumconstval);
48+
staticSelectivitymcelem_tsquery_selec(TSQueryquery,
49+
Datum*mcelem,intnmcelem,
50+
float4*numbers,intnnumbers);
51+
staticSelectivitytsquery_opr_selec(QueryItem*item,char*operand,
52+
TextFreq*lookup,intlength,float4minfreq);
53+
staticintcompare_lexeme_textfreq(constvoid*e1,constvoid*e2);
54+
55+
56+
/*
57+
*tsmatchsel -- Selectivity of "@@"
58+
*
59+
* restriction selectivity function for tsvector @@ tsquery and
60+
* tsquery @@ tsvector
61+
*/
62+
Datum
63+
tsmatchsel(PG_FUNCTION_ARGS)
64+
{
65+
PlannerInfo*root= (PlannerInfo*)PG_GETARG_POINTER(0);
66+
#ifdefNOT_USED
67+
Oidoperator=PG_GETARG_OID(1);
68+
#endif
69+
List*args= (List*)PG_GETARG_POINTER(2);
70+
intvarRelid=PG_GETARG_INT32(3);
71+
VariableStatDatavardata;
72+
Node*other;
73+
boolvaronleft;
74+
Selectivityselec;
75+
76+
/*
77+
* If expression is not variable = something or something = variable, then
78+
* punt and return a default estimate.
79+
*/
80+
if (!get_restriction_variable(root,args,varRelid,
81+
&vardata,&other,&varonleft))
82+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
83+
84+
/*
85+
* Can't do anything useful if the something is not a constant, either.
86+
*/
87+
if (!IsA(other,Const))
88+
{
89+
ReleaseVariableStats(vardata);
90+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
91+
}
92+
93+
/*
94+
* The "@@" operator is strict, so we can cope with NULL right away
95+
*/
96+
if (((Const*)other)->constisnull)
97+
{
98+
ReleaseVariableStats(vardata);
99+
PG_RETURN_FLOAT8(0.0);
100+
}
101+
102+
/*
103+
* OK, there's a Var and a Const we're dealing with here. We need the Var
104+
* to be a TSVector (or else we don't have any useful statistic for it).
105+
* We have to check this because the Var might be the TSQuery not the
106+
* TSVector.
107+
*/
108+
if (vardata.vartype==TSVECTOROID)
109+
{
110+
/* tsvector @@ tsquery or the other way around */
111+
Assert(((Const*)other)->consttype==TSQUERYOID);
112+
113+
selec=tsquerysel(&vardata, ((Const*)other)->constvalue);
114+
}
115+
else
116+
{
117+
/* The Var is something we don't have useful statistics for */
118+
selec=DEFAULT_TS_MATCH_SEL;
119+
}
120+
121+
ReleaseVariableStats(vardata);
122+
123+
CLAMP_PROBABILITY(selec);
124+
125+
PG_RETURN_FLOAT8((float8)selec);
126+
}
127+
128+
129+
/*
130+
*tsmatchjoinsel -- join selectivity of "@@"
131+
*
132+
* join selectivity function for tsvector @@ tsquery and tsquery @@ tsvector
133+
*/
134+
Datum
135+
tsmatchjoinsel(PG_FUNCTION_ARGS)
136+
{
137+
/* for the moment we just punt */
138+
PG_RETURN_FLOAT8(DEFAULT_TS_MATCH_SEL);
139+
}
140+
141+
142+
/*
143+
* @@ selectivity for tsvector var vs tsquery constant
144+
*/
145+
staticSelectivity
146+
tsquerysel(VariableStatData*vardata,Datumconstval)
147+
{
148+
Selectivityselec;
149+
150+
if (HeapTupleIsValid(vardata->statsTuple))
151+
{
152+
TSQueryquery;
153+
Form_pg_statisticstats;
154+
Datum*values;
155+
intnvalues;
156+
float4*numbers;
157+
intnnumbers;
158+
159+
/* The caller made sure the const is a TSQuery, so get it now */
160+
query=DatumGetTSQuery(constval);
161+
162+
stats= (Form_pg_statistic)GETSTRUCT(vardata->statsTuple);
163+
164+
/* MCELEM will be an array of TEXT elements for a tsvector column */
165+
if (get_attstatsslot(vardata->statsTuple,
166+
TEXTOID,-1,
167+
STATISTIC_KIND_MCELEM,InvalidOid,
168+
&values,&nvalues,
169+
&numbers,&nnumbers))
170+
{
171+
/*
172+
* There is a most-common-elements slot for the tsvector Var, so
173+
* use that.
174+
*/
175+
selec=mcelem_tsquery_selec(query,values,nvalues,
176+
numbers,nnumbers);
177+
free_attstatsslot(TEXTOID,values,nvalues,numbers,nnumbers);
178+
}
179+
else
180+
{
181+
/* No most-common-elements info, so we must punt */
182+
selec= (Selectivity)DEFAULT_TS_MATCH_SEL;
183+
}
184+
}
185+
else
186+
{
187+
/* No stats at all, so we must punt */
188+
selec= (Selectivity)DEFAULT_TS_MATCH_SEL;
189+
}
190+
191+
returnselec;
192+
}
193+
194+
/*
195+
* Extract data from the pg_statistic arrays into useful format.
196+
*/
197+
staticSelectivity
198+
mcelem_tsquery_selec(TSQueryquery,Datum*mcelem,intnmcelem,
199+
float4*numbers,intnnumbers)
200+
{
201+
float4minfreq;
202+
TextFreq*lookup;
203+
Selectivityselec;
204+
inti;
205+
206+
/*
207+
* There should be two more Numbers than Values, because the last two
208+
* cells are taken for minimal and maximal frequency. Punt if not.
209+
*/
210+
if (nnumbers!=nmcelem+2)
211+
returnDEFAULT_TS_MATCH_SEL;
212+
213+
/*
214+
* Transpose the data into a single array so we can use bsearch().
215+
*/
216+
lookup= (TextFreq*)palloc(sizeof(TextFreq)*nmcelem);
217+
for (i=0;i<nmcelem;i++)
218+
{
219+
/*
220+
* The text Datums came from an array, so it cannot be compressed
221+
* or stored out-of-line -- it's safe to use VARSIZE_ANY*.
222+
*/
223+
Assert(!VARATT_IS_COMPRESSED(mcelem[i])&& !VARATT_IS_EXTERNAL(mcelem[i]));
224+
lookup[i].element= (text*)DatumGetPointer(mcelem[i]);
225+
lookup[i].frequency=numbers[i];
226+
}
227+
228+
/*
229+
* Grab the lowest frequency. compute_tsvector_stats() stored it for us in
230+
* the one before the last cell of the Numbers array. See ts_typanalyze.c
231+
*/
232+
minfreq=numbers[nnumbers-2];
233+
234+
selec=tsquery_opr_selec(GETQUERY(query),GETOPERAND(query),lookup,
235+
nmcelem,minfreq);
236+
237+
pfree(lookup);
238+
239+
returnselec;
240+
}
241+
242+
/*
243+
* Traverse the tsquery in preorder, calculating selectivity as:
244+
*
245+
* selec(left_oper) * selec(right_oper) in AND nodes,
246+
*
247+
* selec(left_oper) + selec(right_oper) -
248+
* selec(left_oper) * selec(right_oper) in OR nodes,
249+
*
250+
* 1 - select(oper) in NOT nodes
251+
*
252+
* freq[val] in VAL nodes, if the value is in MCELEM
253+
* min(freq[MCELEM]) / 2 in VAL nodes, if it is not
254+
*
255+
*
256+
* The MCELEM array is already sorted (see ts_typanalyze.c), so we can use
257+
* binary search for determining freq[MCELEM].
258+
*/
259+
staticSelectivity
260+
tsquery_opr_selec(QueryItem*item,char*operand,
261+
TextFreq*lookup,intlength,float4minfreq)
262+
{
263+
LexemeKeykey;
264+
TextFreq*searchres;
265+
Selectivityselec,s1,s2;
266+
267+
/* since this function recurses, it could be driven to stack overflow */
268+
check_stack_depth();
269+
270+
if (item->type==QI_VAL)
271+
{
272+
QueryOperand*oper= (QueryOperand*)item;
273+
274+
/*
275+
* Prepare the key for bsearch().
276+
*/
277+
key.lexeme=operand+oper->distance;
278+
key.length=oper->length;
279+
280+
searchres= (TextFreq*)bsearch(&key,lookup,length,
281+
sizeof(TextFreq),
282+
compare_lexeme_textfreq);
283+
284+
if (searchres)
285+
{
286+
/*
287+
* The element is in MCELEM. Return precise selectivity (or at
288+
* least as precise as ANALYZE could find out).
289+
*/
290+
return (Selectivity)searchres->frequency;
291+
}
292+
else
293+
{
294+
/*
295+
* The element is not in MCELEM. Punt, but assert that the
296+
* selectivity cannot be more than minfreq / 2.
297+
*/
298+
return (Selectivity)Min(DEFAULT_TS_MATCH_SEL,minfreq /2);
299+
}
300+
}
301+
302+
/* Current TSQuery node is an operator */
303+
switch (item->operator.oper)
304+
{
305+
caseOP_NOT:
306+
selec=1.0-tsquery_opr_selec(item+1,operand,
307+
lookup,length,minfreq);
308+
break;
309+
310+
caseOP_AND:
311+
s1=tsquery_opr_selec(item+1,operand,
312+
lookup,length,minfreq);
313+
s2=tsquery_opr_selec(item+item->operator.left,operand,
314+
lookup,length,minfreq);
315+
selec=s1*s2;
316+
break;
317+
318+
caseOP_OR:
319+
s1=tsquery_opr_selec(item+1,operand,
320+
lookup,length,minfreq);
321+
s2=tsquery_opr_selec(item+item->operator.left,operand,
322+
lookup,length,minfreq);
323+
selec=s1+s2-s1*s2;
324+
break;
325+
326+
default:
327+
elog(ERROR,"unrecognized operator: %d",item->operator.oper);
328+
selec=0;/* keep compiler quiet */
329+
break;
330+
}
331+
332+
/* Clamp intermediate results to stay sane despite roundoff error */
333+
CLAMP_PROBABILITY(selec);
334+
335+
returnselec;
336+
}
337+
338+
/*
339+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
340+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
341+
* how ANALYZE code sorted data before storing it in a statistic tuple.
342+
* See ts_typanalyze.c for details.
343+
*/
344+
staticint
345+
compare_lexeme_textfreq(constvoid*e1,constvoid*e2)
346+
{
347+
constLexemeKey*key= (constLexemeKey*)e1;
348+
constTextFreq*t= (constTextFreq*)e2;
349+
intlen1,
350+
len2;
351+
352+
len1=key->length;
353+
len2=VARSIZE_ANY_EXHDR(t->element);
354+
355+
/* Compare lengths first, possibly avoiding a strncmp call */
356+
if (len1>len2)
357+
return1;
358+
elseif (len1<len2)
359+
return-1;
360+
361+
/* Fall back on byte-for-byte comparison */
362+
returnstrncmp(key->lexeme,VARDATA_ANY(t->element),len1);
363+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp