Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 3bb8244

Browse files
author
Alexander Korotkov
committed
Add IDF to scoring (very basic).
1 parent 90b1638 commit 3bb8244

File tree

2 files changed

+88
-11
lines changed

2 files changed

+88
-11
lines changed

‎src/rum_ts_utils.c

Lines changed: 45 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -101,18 +101,25 @@ typedef struct
101101

102102
typedefstruct
103103
{
104-
booloperandexist;
104+
booloperandexist;
105105
WordEntryPospos;
106106
}
107107
QueryRepresentationOperand;
108108

109+
typedefstruct
110+
{
111+
float4idf;
112+
boolidfloaded;
113+
}QueryRepresentationIDF;
114+
109115
typedefstruct
110116
{
111117
TSQueryquery;
112118
/* Used in rum_tsquery_distance() */
113119
int*map_item_operand;
114120

115121
QueryRepresentationOperand*operandData;
122+
QueryRepresentationIDF*operandIdf;
116123
intlength;
117124
}QueryRepresentation;
118125

@@ -140,6 +147,7 @@ static WordEntryPosVector POSNULL = {
140147
#defineRANK_NORM_UNIQ0x08
141148
#defineRANK_NORM_LOGUNIQ0x10
142149
#defineRANK_NORM_RDIVRPLUS10x20
150+
#defineRANK_NORM_IDF0x40
143151
#defineDEF_NORM_METHODRANK_NO_NORM
144152

145153
#defineQR_GET_OPERAND(q,v)\
@@ -1229,6 +1237,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12291237
{
12301238
doubleCpos=0.0;
12311239
doubleInvSum=0.0;
1240+
doubleIdf=0.0;
12321241
intnNoise;
12331242
DocRepresentation*ptr=ext.begin;
12341243
/* Added by SK */
@@ -1278,13 +1287,43 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12781287

12791288
/* Compute the number of query terms in the cover */
12801289
for (i=0;i<qr->length;i++)
1290+
{
12811291
if (qr->operandData[i].operandexist)
1282-
nitems++;
1292+
{
1293+
if (method&RANK_NORM_IDF)
1294+
{
1295+
if (!qr->operandIdf[i].idfloaded)
1296+
{
1297+
QueryOperand*oper= (QueryOperand*) (GETQUERY(qr->query)+i);
1298+
qr->operandIdf[i].idf=
1299+
estimate_idf(
1300+
GETOPERAND(qr->query)+oper->distance,
1301+
oper->length
1302+
);
1303+
qr->operandIdf[i].idfloaded= true;
1304+
}
1305+
1306+
Idf+=qr->operandIdf[i].idf;
1307+
}
1308+
else
1309+
{
1310+
nitems++;
1311+
}
1312+
}
1313+
}
12831314

12841315
Cpos= ((double) (ext.end-ext.begin+1)) /InvSum;
12851316

1286-
if (nitems>0)
1287-
Cpos *=nitems;
1317+
if (method&RANK_NORM_IDF)
1318+
{
1319+
if (Idf >=1.0)
1320+
Cpos *=Idf;
1321+
}
1322+
else
1323+
{
1324+
if (nitems>0)
1325+
Cpos *=nitems;
1326+
}
12881327

12891328
/*
12901329
* if doc are big enough then ext.q may be equal to ext.p due to limit
@@ -1369,6 +1408,8 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method)
13691408
qr.query=query;
13701409
qr.map_item_operand=NULL;
13711410
qr.operandData=palloc0(sizeof(qr.operandData[0])*query->size);
1411+
if (method&RANK_NORM_IDF)
1412+
qr.operandIdf=palloc0(sizeof(qr.operandIdf[0])*query->size);
13721413
qr.length=query->size;
13731414

13741415
doc=get_docrep(txt,&qr,&doclen);

‎src/tf_idf.c

Lines changed: 43 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -12,6 +12,7 @@
1212
#include"catalog/namespace.h"
1313
#include"catalog/pg_statistic.h"
1414
#include"catalog/pg_type.h"
15+
#include"nodes/nodeFuncs.h"
1516
#include"utils/builtins.h"
1617
#include"utils/lsyscache.h"
1718
#include"utils/memutils.h"
@@ -20,6 +21,12 @@
2021

2122
#include"rum.h"
2223

24+
/*
25+
* FIXME:
26+
* * cache IDF
27+
* * handle prefix search
28+
*/
29+
2330
/* lookup table type for binary searching through MCELEMs */
2431
typedefstruct
2532
{
@@ -77,7 +84,6 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
7784
OidnamespaceId;
7885
OidrelId;
7986
Relationrel=NULL;
80-
TupleDesctupDesc;
8187
AttrNumberattrno;
8288
inti;
8389
RelAttrInfo*myextra;
@@ -119,17 +125,27 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
119125
EXIT_CHECK_TF_IDF_SOURCE("relation not found");
120126

121127
rel=RelationIdGetRelation(relId);
122-
tupDesc=rel->rd_att;
123128
if (rel->rd_rel->relkind==RELKIND_INDEX)
124129
{
130+
intexprnum=0;
131+
125132
attrno=pg_atoi(attname,sizeof(attrno),10);
126133
if (attrno <=0||attrno>rel->rd_index->indnatts)
127134
EXIT_CHECK_TF_IDF_SOURCE("wrong index attribute number");
128135
if (rel->rd_index->indkey.values[attrno-1]!=InvalidAttrNumber)
129136
EXIT_CHECK_TF_IDF_SOURCE("regular indexed column is specified");
137+
for (i=0;i<attrno-1;i++)
138+
{
139+
if (rel->rd_index->indkey.values[i]==InvalidAttrNumber)
140+
exprnum++;
141+
}
142+
if (exprType((Node*)list_nth(rel->rd_indexprs,exprnum))!=TSVECTOROID)
143+
EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type");
130144
}
131145
else
132146
{
147+
TupleDesctupDesc=rel->rd_att;
148+
133149
attrno=InvalidAttrNumber;
134150
for (i=0;i<tupDesc->natts;i++)
135151
{
@@ -139,13 +155,12 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
139155
break;
140156
}
141157
}
142-
143158
if (attrno==InvalidAttrNumber)
144159
EXIT_CHECK_TF_IDF_SOURCE("attribute not found");
160+
if (tupDesc->attrs[attrno-1]->atttypid!=TSVECTOROID)
161+
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
145162
}
146163

147-
if (tupDesc->attrs[attrno-1]->atttypid!=TSVECTOROID)
148-
EXIT_CHECK_TF_IDF_SOURCE("attribute should be of tsvector type");
149164

150165
myextra= (RelAttrInfo*)malloc(sizeof(RelAttrInfo));
151166
myextra->relId=relId;
@@ -164,7 +179,16 @@ assign_tf_idf_source(const char *newval, void *extra)
164179
{
165180
RelAttrInfo*myextra= (RelAttrInfo*)extra;
166181

167-
TFIDFSourceParsed=*myextra;
182+
if (myextra)
183+
{
184+
TFIDFSourceParsed=*myextra;
185+
}
186+
else
187+
{
188+
TFIDFSourceParsed.relId=InvalidOid;
189+
TFIDFSourceParsed.attrno=InvalidAttrNumber;
190+
}
191+
168192
forget_tf_idf_stats();
169193
}
170194

@@ -181,6 +205,15 @@ load_tf_idf_source(void)
181205
"Memory context for TF/IDF statistics",
182206
ALLOCSET_DEFAULT_SIZES);
183207

208+
if (!OidIsValid(TFIDFSourceParsed.relId)
209+
||TFIDFSourceParsed.attrno==InvalidAttrNumber)
210+
{
211+
ereport(ERROR,
212+
(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
213+
errmsg("statistics for TD/IDF is not defined"),
214+
errhint("consider setting tf_idf_source GUC")));
215+
}
216+
184217
statsTuple=SearchSysCache3(STATRELATTINH,
185218
ObjectIdGetDatum(TFIDFSourceParsed.relId),
186219
Int16GetDatum(TFIDFSourceParsed.attrno),
@@ -228,6 +261,8 @@ load_tf_idf_source(void)
228261

229262
MemoryContextSwitchTo(oldContext);
230263

264+
TDIDFLoaded= true;
265+
231266
ReleaseSysCache(statsTuple);
232267
}
233268

@@ -241,7 +276,8 @@ check_load_tf_idf_source(void)
241276
staticvoid
242277
forget_tf_idf_stats(void)
243278
{
244-
MemoryContextReset(TFIDFContext);
279+
if (TFIDFContext)
280+
MemoryContextReset(TFIDFContext);
245281
TDIDFLoaded= false;
246282
}
247283

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp