Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit90b1638

Browse files
author
Alexander Korotkov
committed
Implement estimate_idf().
1 parent86f185f commit90b1638

File tree

2 files changed

+200
-11
lines changed

2 files changed

+200
-11
lines changed

‎src/rum.h

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1013,5 +1013,6 @@ extern Datum FunctionCall10Coll(FmgrInfo *flinfo, Oid collation,
10131013
externchar*TFIDFSource;
10141014
externboolcheck_tf_idf_source(char**newval,void**extra,GucSourcesource);
10151015
externvoidassign_tf_idf_source(constchar*newval,void*extra);
1016+
externfloat4estimate_idf(char*lexeme,intlength);
10161017

10171018
#endif/* __RUM_H__ */

‎src/tf_idf.c

Lines changed: 199 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -10,14 +10,48 @@
1010
#include"postgres.h"
1111

1212
#include"catalog/namespace.h"
13+
#include"catalog/pg_statistic.h"
1314
#include"catalog/pg_type.h"
1415
#include"utils/builtins.h"
1516
#include"utils/lsyscache.h"
17+
#include"utils/memutils.h"
18+
#include"utils/syscache.h"
1619
#include"utils/varlena.h"
1720

1821
#include"rum.h"
1922

20-
char*TFIDFSource;
23+
/* lookup table type for binary searching through MCELEMs */
24+
typedefstruct
25+
{
26+
text*element;
27+
float4frequency;
28+
}TextFreq;
29+
30+
/*
 * Search key used when bsearch'ing an array of TextFreqs: a lexeme given as
 * a pointer plus an explicit byte length.
 */
typedef struct
{
	char	   *lexeme;			/* lexeme bytes */
	int			length;			/* number of bytes in lexeme */
} LexemeKey;
36+
37+
typedefstruct
38+
{
39+
TextFreq*lookup;
40+
intnmcelem;
41+
float4minfreq;
42+
}MCelemStats;
43+
44+
typedefstruct
45+
{
46+
OidrelId;
47+
AttrNumberattrno;
48+
}RelAttrInfo;
49+
50+
char*TFIDFSource;
51+
staticRelAttrInfoTFIDFSourceParsed;
52+
staticboolTDIDFLoaded= false;
53+
staticMemoryContextTFIDFContext=NULL;
54+
staticMCelemStatsTDIDFStats;
2155

2256
#defineEXIT_CHECK_TF_IDF_SOURCE(error) \
2357
do { \
@@ -29,18 +63,24 @@ char *TFIDFSource;
2963
return false; \
3064
} while (false);
3165

66+
/* Local helpers, defined further down in this file */
static void load_tf_idf_source(void);
static void check_load_tf_idf_source(void);
static void forget_tf_idf_stats(void);
static int	compare_lexeme_textfreq(const void *e1, const void *e2);
70+
3271
bool
3372
check_tf_idf_source(char**newval,void**extra,GucSourcesource)
3473
{
35-
char*rawname;
36-
char*attname;
37-
List*namelist;
38-
OidnamespaceId;
39-
OidrelId;
40-
Relationrel=NULL;
41-
TupleDesctupDesc;
42-
AttrNumberattrno;
43-
inti;
74+
char*rawname;
75+
char*attname;
76+
List*namelist;
77+
OidnamespaceId;
78+
OidrelId;
79+
Relationrel=NULL;
80+
TupleDesctupDesc;
81+
AttrNumberattrno;
82+
inti;
83+
RelAttrInfo*myextra;
4484

4585
/* Need a modifiable copy of string */
4686
rawname=pstrdup(*newval);
@@ -107,6 +147,11 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
107147
if (tupDesc->attrs[attrno-1]->atttypid!=TSVECTOROID)
108148
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
109149

150+
myextra= (RelAttrInfo*)malloc(sizeof(RelAttrInfo));
151+
myextra->relId=relId;
152+
myextra->attrno=attrno;
153+
*extra= (void*)myextra;
154+
110155
pfree(rawname);
111156
list_free(namelist);
112157
RelationClose(rel);
@@ -117,5 +162,148 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
117162
void
118163
assign_tf_idf_source(constchar*newval,void*extra)
119164
{
165+
RelAttrInfo*myextra= (RelAttrInfo*)extra;
166+
167+
TFIDFSourceParsed=*myextra;
168+
forget_tf_idf_stats();
169+
}
170+
171+
staticvoid
172+
load_tf_idf_source(void)
173+
{
174+
HeapTuplestatsTuple;
175+
AttStatsSlotsslot;
176+
MemoryContextoldContext;
177+
inti;
178+
179+
if (!TFIDFContext)
180+
TFIDFContext=AllocSetContextCreate(TopMemoryContext,
181+
"Memory context for TF/IDF statistics",
182+
ALLOCSET_DEFAULT_SIZES);
183+
184+
statsTuple=SearchSysCache3(STATRELATTINH,
185+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
186+
Int16GetDatum(TFIDFSourceParsed.attrno),
187+
BoolGetDatum(true));
188+
189+
if (!statsTuple)
190+
statsTuple=SearchSysCache3(STATRELATTINH,
191+
ObjectIdGetDatum(TFIDFSourceParsed.relId),
192+
Int16GetDatum(TFIDFSourceParsed.attrno),
193+
BoolGetDatum(false));
194+
195+
MemoryContextReset(TFIDFContext);
196+
TDIDFLoaded= false;
197+
198+
oldContext=MemoryContextSwitchTo(TFIDFContext);
199+
200+
if (!statsTuple
201+
|| !get_attstatsslot(&sslot,statsTuple,
202+
STATISTIC_KIND_MCELEM,InvalidOid,
203+
ATTSTATSSLOT_VALUES |ATTSTATSSLOT_NUMBERS)
204+
||sslot.nnumbers!=sslot.nvalues+2)
205+
{
206+
ereport(ERROR,
207+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
208+
errmsg("statistics for TD/IDF is not found"),
209+
errhint("consider running ANALYZE")));
210+
}
211+
212+
TDIDFStats.nmcelem=sslot.nvalues;
213+
TDIDFStats.minfreq=sslot.numbers[sslot.nnumbers-2];
214+
/*
215+
* Transpose the data into a single array so we can use bsearch().
216+
*/
217+
TDIDFStats.lookup= (TextFreq*)palloc(sizeof(TextFreq)*TDIDFStats.nmcelem);
218+
for (i=0;i<TDIDFStats.nmcelem;i++)
219+
{
220+
/*
221+
* The text Datums came from an array, so it cannot be compressed or
222+
* stored out-of-line -- it's safe to use VARSIZE_ANY*.
223+
*/
224+
Assert(!VARATT_IS_COMPRESSED(sslot.values[i])&& !VARATT_IS_EXTERNAL(sslot.values[i]));
225+
TDIDFStats.lookup[i].element= (text*)DatumGetPointer(sslot.values[i]);
226+
TDIDFStats.lookup[i].frequency=sslot.numbers[i];
227+
}
120228

121-
}
229+
MemoryContextSwitchTo(oldContext);
230+
231+
ReleaseSysCache(statsTuple);
232+
}
233+
234+
staticvoid
235+
check_load_tf_idf_source(void)
236+
{
237+
if (!TDIDFLoaded)
238+
load_tf_idf_source();
239+
}
240+
241+
staticvoid
242+
forget_tf_idf_stats(void)
243+
{
244+
MemoryContextReset(TFIDFContext);
245+
TDIDFLoaded= false;
246+
}
247+
248+
/*
249+
* bsearch() comparator for a lexeme (non-NULL terminated string with length)
250+
* and a TextFreq. Use length, then byte-for-byte comparison, because that's
251+
* how ANALYZE code sorted data before storing it in a statistic tuple.
252+
* See ts_typanalyze.c for details.
253+
*/
254+
staticint
255+
compare_lexeme_textfreq(constvoid*e1,constvoid*e2)
256+
{
257+
constLexemeKey*key= (constLexemeKey*)e1;
258+
constTextFreq*t= (constTextFreq*)e2;
259+
intlen1,
260+
len2;
261+
262+
len1=key->length;
263+
len2=VARSIZE_ANY_EXHDR(t->element);
264+
265+
/* Compare lengths first, possibly avoiding a strncmp call */
266+
if (len1>len2)
267+
return1;
268+
elseif (len1<len2)
269+
return-1;
270+
271+
/* Fall back on byte-for-byte comparison */
272+
returnstrncmp(key->lexeme,VARDATA_ANY(t->element),len1);
273+
}
274+
275+
float4
276+
estimate_idf(char*lexeme,intlength)
277+
{
278+
TextFreq*searchres;
279+
LexemeKeykey;
280+
float4selec;
281+
282+
check_load_tf_idf_source();
283+
284+
key.lexeme=lexeme;
285+
key.length=length;
286+
287+
searchres= (TextFreq*)bsearch(&key,TDIDFStats.lookup,TDIDFStats.nmcelem,
288+
sizeof(TextFreq),
289+
compare_lexeme_textfreq);
290+
291+
if (searchres)
292+
{
293+
/*
294+
* The element is in MCELEM. Return precise selectivity (or
295+
* at least as precise as ANALYZE could find out).
296+
*/
297+
selec=searchres->frequency;
298+
}
299+
else
300+
{
301+
/*
302+
* The element is not in MCELEM. Punt, but assume that the
303+
* selectivity cannot be more than minfreq / 2.
304+
*/
305+
selec=TDIDFStats.minfreq /2;
306+
}
307+
308+
return1.0f /selec;
309+
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp