Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 581daf5

Browse files
author
Alexander Korotkov
committed
Better IDF calculation.
1 parent 3bb8244, commit 581daf5

File tree

2 files changed

+20
-45
lines changed

2 files changed

+20
-45
lines changed

‎src/rum_ts_utils.c

Lines changed: 17 additions & 43 deletions
Original file line number | Diff line number | Diff line change
@@ -96,6 +96,7 @@ typedef struct
9696
}key;
9797
}data;
9898
uint8wclass;
99+
float4idf;
99100
int32pos;
100101
}DocRepresentation;
101102

@@ -106,20 +107,13 @@ typedef struct
106107
}
107108
QueryRepresentationOperand;
108109

109-
typedefstruct
110-
{
111-
float4idf;
112-
boolidfloaded;
113-
}QueryRepresentationIDF;
114-
115110
typedefstruct
116111
{
117112
TSQueryquery;
118113
/* Used in rum_tsquery_distance() */
119114
int*map_item_operand;
120115

121116
QueryRepresentationOperand*operandData;
122-
QueryRepresentationIDF*operandIdf;
123117
intlength;
124118
}QueryRepresentation;
125119

@@ -1098,7 +1092,7 @@ find_wordentry(TSVector t, TSQuery q, QueryOperand *item, int32 *nitem)
10981092
}
10991093

11001094
staticDocRepresentation*
1101-
get_docrep(TSVectortxt,QueryRepresentation*qr,uint32*doclen)
1095+
get_docrep(TSVectortxt,QueryRepresentation*qr,uint32*doclen,boolload_idf)
11021096
{
11031097
QueryItem*item=GETQUERY(qr->query);
11041098
WordEntry*entry,
@@ -1134,6 +1128,8 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen)
11341128

11351129
while (entry-firstentry<nitem)
11361130
{
1131+
float4idf;
1132+
11371133
if (entry->haspos)
11381134
{
11391135
dimt=POSDATALEN(txt,entry);
@@ -1187,12 +1183,18 @@ get_docrep(TSVector txt, QueryRepresentation *qr, uint32 *doclen)
11871183

11881184
}
11891185
}
1186+
1187+
if (load_idf)
1188+
idf=estimate_idf(STRPTR(txt)+entry->pos,entry->len);
1189+
else
1190+
idf=1.0f;
11901191
}
11911192
else
11921193
{
11931194
doc[cur].data.item.nitem=doc[cur-1].data.item.nitem;
11941195
doc[cur].data.item.item=doc[cur-1].data.item.item;
11951196
}
1197+
doc[cur].idf=idf;
11961198
doc[cur].pos=WEP_GETPOS(post[j]);
11971199
doc[cur].wclass=WEP_GETWEIGHT(post[j]);
11981200
cur++;
@@ -1256,6 +1258,7 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12561258
/* For rum_tsquery_distance() */
12571259
else
12581260
new_cover_key+= (int)(uintptr_t)ptr->data.key.item_first;
1261+
Idf+=ptr->idf;
12591262
ptr++;
12601263
}
12611264

@@ -1287,43 +1290,16 @@ calc_score_docr(float4 *arrdata, DocRepresentation *doc, uint32 doclen,
12871290

12881291
/* Compute the number of query terms in the cover */
12891292
for (i=0;i<qr->length;i++)
1290-
{
12911293
if (qr->operandData[i].operandexist)
1292-
{
1293-
if (method&RANK_NORM_IDF)
1294-
{
1295-
if (!qr->operandIdf[i].idfloaded)
1296-
{
1297-
QueryOperand*oper= (QueryOperand*) (GETQUERY(qr->query)+i);
1298-
qr->operandIdf[i].idf=
1299-
estimate_idf(
1300-
GETOPERAND(qr->query)+oper->distance,
1301-
oper->length
1302-
);
1303-
qr->operandIdf[i].idfloaded= true;
1304-
}
1305-
1306-
Idf+=qr->operandIdf[i].idf;
1307-
}
1308-
else
1309-
{
1310-
nitems++;
1311-
}
1312-
}
1313-
}
1294+
nitems++;
13141295

13151296
Cpos= ((double) (ext.end-ext.begin+1)) /InvSum;
13161297

1298+
if (nitems>0)
1299+
Cpos *=nitems;
1300+
13171301
if (method&RANK_NORM_IDF)
1318-
{
1319-
if (Idf >=1.0)
1320-
Cpos *=Idf;
1321-
}
1322-
else
1323-
{
1324-
if (nitems>0)
1325-
Cpos *=nitems;
1326-
}
1302+
Cpos *=Idf;
13271303

13281304
/*
13291305
* if doc are big enough then ext.q may be equal to ext.p due to limit
@@ -1408,11 +1384,9 @@ calc_score(float4 *arrdata, TSVector txt, TSQuery query, int method)
14081384
qr.query=query;
14091385
qr.map_item_operand=NULL;
14101386
qr.operandData=palloc0(sizeof(qr.operandData[0])*query->size);
1411-
if (method&RANK_NORM_IDF)
1412-
qr.operandIdf=palloc0(sizeof(qr.operandIdf[0])*query->size);
14131387
qr.length=query->size;
14141388

1415-
doc=get_docrep(txt,&qr,&doclen);
1389+
doc=get_docrep(txt,&qr,&doclen, (method&RANK_NORM_IDF) ? true : false);
14161390
if (!doc)
14171391
{
14181392
pfree(qr.operandData);

‎src/tf_idf.c

Lines changed: 3 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -23,8 +23,8 @@
2323

2424
/*
2525
* FIXME:
26-
* * cache IDF
27-
* *handle prefix search
26+
* * cache IDF for ts_query (non-prefix search?)
27+
* *calculate IDF from RUM index
2828
*/
2929

3030
/* lookup table type for binary searching through MCELEMs */
@@ -139,6 +139,7 @@ check_tf_idf_source(char **newval, void **extra, GucSource source)
139139
if (rel->rd_index->indkey.values[i]==InvalidAttrNumber)
140140
exprnum++;
141141
}
142+
RelationGetIndexExpressions(rel);
142143
if (exprType((Node*)list_nth(rel->rd_indexprs,exprnum))!=TSVECTOROID)
143144
EXIT_CHECK_TF_IDF_SOURCE("indexed expression should be of tsvector type");
144145
}

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp