Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit38c4fe8

Browse files
committed
Significantly improve ranking:
1) rank_cd now use weight of lexemes2) rank_cd and rank can use any combination of normalization methods: no normalization normalization by log(length of document) -----/------- by length of document -----/------- by number of unique word in document -----/------- by log(number of unique word in document) -----/------- by number of covers (only rank_cd)Improve cover's search.TODO: changes in documentation
1 parent85fa9f5 commit38c4fe8

File tree

3 files changed

+161
-92
lines changed

3 files changed

+161
-92
lines changed

‎contrib/tsearch2/expected/tsearch2.out

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -2315,9 +2315,9 @@ An hour of storm to place
23152315
The sculpture of these granite seams,
23162316
Upon a woman s face. E. J. Pratt (1882 1964)
23172317
'), to_tsquery('sea&thousand&years'));
2318-
rank_cd
2319-
---------
2320-
1.2
2318+
rank_cd
2319+
-----------
2320+
0.0555556
23212321
(1 row)
23222322

23232323
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2329,9 +2329,9 @@ An hour of storm to place
23292329
The sculpture of these granite seams,
23302330
Upon a woman s face. E. J. Pratt (1882 1964)
23312331
'), to_tsquery('granite&sea'));
2332-
rank_cd
2333-
----------
2334-
0.880303
2332+
rank_cd
2333+
-----------
2334+
0.0238095
23352335
(1 row)
23362336

23372337
select rank_cd(to_tsvector('Erosion It took the sea a thousand years,
@@ -2345,7 +2345,7 @@ Upon a woman s face. E. J. Pratt (1882 1964)
23452345
'), to_tsquery('sea'));
23462346
rank_cd
23472347
---------
2348-
2
2348+
0.2
23492349
(1 row)
23502350

23512351
select get_covers(to_tsvector('Erosion It took the sea a thousand years,

‎contrib/tsearch2/rank.c

Lines changed: 152 additions & 83 deletions
Original file line numberDiff line numberDiff line change
@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
4141

4242
#definewpos(wep)( w[ WEP_GETWEIGHT(wep) ] )
4343

44-
#defineDEF_NORM_METHOD 0
44+
#defineRANK_NO_NORM0x00
45+
#defineRANK_NORM_LOGLENGTH 0x01
46+
#defineRANK_NORM_LENGTH 0x02
47+
#defineRANK_NORM_EXTDIST0x04
48+
#defineRANK_NORM_UNIQ0x08
49+
#defineRANK_NORM_LOGUNIQ0x10
50+
#defineDEF_NORM_METHOD RANK_NO_NORM
4551

4652
staticfloatcalc_rank_or(float*w,tsvector*t,QUERYTYPE*q);
4753
staticfloatcalc_rank_and(float*w,tsvector*t,QUERYTYPE*q);
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
328334
if (res<0)
329335
res=1e-20;
330336

331-
switch (method)
332-
{
333-
case0:
334-
break;
335-
case1:
336-
res /=log((float) (cnt_length(t)+1)) /log(2.0);
337-
break;
338-
case2:
339-
len=cnt_length(t);
340-
if (len>0)
341-
res /= (float)len;
342-
break;
343-
default:
344-
/* internal error */
345-
elog(ERROR,"unrecognized normalization method: %d",method);
337+
if ( (method&RANK_NORM_LOGLENGTH)&&t->size>0 )
338+
res /=log((double) (cnt_length(t)+1)) /log(2.0);
339+
340+
if (method&RANK_NORM_LENGTH ) {
341+
len=cnt_length(t);
342+
if (len>0 )
343+
res /= (float)len;
346344
}
347345

346+
if ( (method&RANK_NORM_UNIQ)&&t->size>0 )
347+
res /= (float)(t->size );
348+
349+
if ( (method&RANK_NORM_LOGUNIQ)&&t->size>0 )
350+
res /=log((double) (t->size+1)) /log(2.0);
351+
348352
returnres;
349353
}
350354

@@ -420,6 +424,7 @@ typedef struct
420424
ITEM**item;
421425
int16nitem;
422426
boolneedfree;
427+
uint8wclass;
423428
int32pos;
424429
}DocRepresentation;
425430

@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
452457
}
453458
}
454459

460+
typedefstruct {
461+
intpos;
462+
intp;
463+
intq;
464+
DocRepresentation*begin;
465+
DocRepresentation*end;
466+
}Extention;
467+
468+
455469
staticbool
456-
Cover(DocRepresentation*doc,intlen,QUERYTYPE*query,int*pos,int*p,int*q)
470+
Cover(DocRepresentation*doc,intlen,QUERYTYPE*query,Extention*ext)
457471
{
458472
DocRepresentation*ptr;
459-
intlastpos=*pos;
473+
intlastpos=ext->pos;
460474
inti;
461475
boolfound= false;
462476

463477
reset_istrue_flag(query);
464478

465-
*p=0x7fffffff;
466-
*q=0;
467-
ptr=doc+*pos;
479+
ext->p=0x7fffffff;
480+
ext->q=0;
481+
ptr=doc+ext->pos;
468482

469483
/* find upper bound of cover from current position, move up */
470484
while (ptr-doc<len)
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
473487
ptr->item[i]->istrue=1;
474488
if (TS_execute(GETQUERY(query),NULL, false,checkcondition_ITEM))
475489
{
476-
if (ptr->pos>*q)
490+
if (ptr->pos>ext->q)
477491
{
478-
*q=ptr->pos;
492+
ext->q=ptr->pos;
493+
ext->end=ptr;
479494
lastpos=ptr-doc;
480495
found= true;
481496
}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
498513
ptr->item[i]->istrue=1;
499514
if (TS_execute(GETQUERY(query),NULL, true,checkcondition_ITEM))
500515
{
501-
if (ptr->pos<*p)
502-
*p=ptr->pos;
516+
if (ptr->pos<ext->p) {
517+
ext->begin=ptr;
518+
ext->p=ptr->pos;
519+
}
503520
break;
504521
}
505522
ptr--;
506523
}
507524

508-
if (*p <=*q)
525+
if (ext->p <=ext->q)
509526
{
510527
/*
511528
* set position for next try to next lexeme after begining of founded
512529
* cover
513530
*/
514-
*pos= (ptr-doc)+1;
531+
ext->pos= (ptr-doc)+1;
515532
return true;
516533
}
517534

518-
(*pos)++;
519-
returnCover(doc,len,query,pos,p,q);
535+
ext->pos++;
536+
returnCover(doc,len,query,ext);
520537
}
521538

522539
staticDocRepresentation*
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
593610
doc[cur].item=doc[cur-1].item;
594611
}
595612
doc[cur].pos=WEP_GETPOS(post[j]);
613+
doc[cur].wclass=WEP_GETWEIGHT(post[j]);
596614
cur++;
597615
}
598616
}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
610628
returnNULL;
611629
}
612630

613-
614-
Datum
615-
rank_cd(PG_FUNCTION_ARGS)
616-
{
617-
intK=PG_GETARG_INT32(0);
618-
tsvector*txt= (tsvector*)PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
619-
QUERYTYPE*query= (QUERYTYPE*)PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
620-
intmethod=DEF_NORM_METHOD;
631+
staticfloat4
632+
calc_rank_cd(float4*arrdata,tsvector*txt,QUERYTYPE*query,intmethod) {
621633
DocRepresentation*doc;
622-
floatres=0.0;
623-
intp=0,
624-
q=0,
625-
len,
626-
cur,
634+
intlen,
627635
i,
628636
doclen=0;
637+
Extentionext;
638+
doubleWdoc=0.0;
639+
doubleinvws[lengthof(weights)];
640+
doubleSumDist=0.0,PrevExtPos=0.0,CurExtPos=0.0;
641+
intNExtent=0;
629642

630-
doc=get_docrep(txt,query,&doclen);
631-
if (!doc)
643+
for (i=0;i<lengthof(weights);i++)
632644
{
633-
PG_FREE_IF_COPY(txt,1);
634-
PG_FREE_IF_COPY(query,2);
635-
PG_RETURN_FLOAT4(0.0);
645+
invws[i]= ((double)((arrdata[i] >=0) ?arrdata[i] :weights[i]));
646+
if (invws[i]>1.0)
647+
ereport(ERROR,
648+
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
649+
errmsg("weight out of range")));
650+
invws[i]=1.0/invws[i];
636651
}
637652

638-
cur=0;
639-
if (K <=0)
640-
K=4;
641-
while (Cover(doc,doclen,query,&cur,&p,&q))
642-
res+= (q-p+1>K) ? ((float)K) / ((float) (q-p+1)) :1.0;
653+
doc=get_docrep(txt,query,&doclen);
654+
if (!doc)
655+
return0.0;
643656

644-
if (PG_NARGS()==4)
645-
method=PG_GETARG_INT32(3);
657+
MemSet(&ext,0,sizeof(Extention) );
658+
while (Cover(doc,doclen,query,&ext)) {
659+
doubleCpos=0.0;
660+
doubleInvSum=0.0;
661+
DocRepresentation*ptr=ext.begin;
646662

647-
switch (method)
648-
{
649-
case0:
650-
break;
651-
case1:
652-
res /=log((float) (cnt_length(txt)+1));
653-
break;
654-
case2:
655-
len=cnt_length(txt);
656-
if (len>0)
657-
res /= (float)len;
658-
break;
659-
default:
660-
/* internal error */
661-
elog(ERROR,"unrecognized normalization method: %d",method);
663+
while (ptr<=ext.end ) {
664+
InvSum+=invws[ptr->wclass ];
665+
ptr++;
666+
}
667+
668+
Cpos= ((double)(ext.end-ext.begin+1 )) /InvSum;
669+
Wdoc+=Cpos / ( (double)((1+ (ext.q-ext.p)- (ext.end-ext.begin) )) );
670+
671+
CurExtPos= ((double)(ext.q+ext.p))/2.0;
672+
if (NExtent>0&&CurExtPos>PrevExtPos/* prevent devision by zero in a case of multiple lexize */ )
673+
SumDist+=1.0/(CurExtPos-PrevExtPos );
674+
675+
PrevExtPos=CurExtPos;
676+
NExtent++;
677+
}
678+
679+
if ( (method&RANK_NORM_LOGLENGTH)&&txt->size>0 )
680+
Wdoc /=log((double) (cnt_length(txt)+1));
681+
682+
if (method&RANK_NORM_LENGTH ) {
683+
len=cnt_length(txt);
684+
if (len>0 )
685+
Wdoc /= (double)len;
662686
}
663687

688+
if ( (method&RANK_NORM_EXTDIST)&&SumDist>0 )
689+
Wdoc /= ((double)NExtent) /SumDist;
690+
691+
if ( (method&RANK_NORM_UNIQ)&&txt->size>0 )
692+
Wdoc /= (double)(txt->size );
693+
694+
if ( (method&RANK_NORM_LOGUNIQ)&&txt->size>0 )
695+
Wdoc /=log((double) (txt->size+1)) /log(2.0);
696+
664697
for (i=0;i<doclen;i++)
665698
if (doc[i].needfree)
666699
pfree(doc[i].item);
667700
pfree(doc);
701+
702+
return (float4)Wdoc;
703+
}
704+
705+
Datum
706+
rank_cd(PG_FUNCTION_ARGS)
707+
{
708+
ArrayType*win= (ArrayType*)PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
709+
tsvector*txt= (tsvector*)PG_DETOAST_DATUM(PG_GETARG_DATUM(1));
710+
QUERYTYPE*query= (QUERYTYPE*)PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(2));
711+
intmethod=DEF_NORM_METHOD;
712+
float4res;
713+
714+
if (ARR_NDIM(win)!=1)
715+
ereport(ERROR,
716+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
717+
errmsg("array of weight must be one-dimensional")));
718+
719+
if (ARRNELEMS(win)<lengthof(weights))
720+
ereport(ERROR,
721+
(errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
722+
errmsg("array of weight is too short")));
723+
724+
if (ARR_HASNULL(win))
725+
ereport(ERROR,
726+
(errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
727+
errmsg("array of weight must not contain nulls")));
728+
729+
if (PG_NARGS()==4)
730+
method=PG_GETARG_INT32(3);
731+
732+
res=calc_rank_cd( (float4*)ARR_DATA_PTR(win),txt,query,method);
733+
734+
PG_FREE_IF_COPY(win,0);
668735
PG_FREE_IF_COPY(txt,1);
669736
PG_FREE_IF_COPY(query,2);
670737

@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
675742
Datum
676743
rank_cd_def(PG_FUNCTION_ARGS)
677744
{
678-
PG_RETURN_DATUM(DirectFunctionCall4(
679-
rank_cd,
680-
Int32GetDatum(-1),
681-
PG_GETARG_DATUM(0),
682-
PG_GETARG_DATUM(1),
683-
(PG_NARGS()==3) ?PG_GETARG_DATUM(2) :Int32GetDatum(DEF_NORM_METHOD)
684-
));
745+
tsvector*txt= (tsvector*)PG_DETOAST_DATUM(PG_GETARG_DATUM(0));
746+
QUERYTYPE*query= (QUERYTYPE*)PG_DETOAST_DATUM_COPY(PG_GETARG_DATUM(1));
747+
float4res;
748+
749+
res=calc_rank_cd(weights,txt,query, (PG_NARGS()==3) ?PG_GETARG_DATUM(2) :DEF_NORM_METHOD);
750+
751+
PG_FREE_IF_COPY(txt,1);
752+
PG_FREE_IF_COPY(query,2);
753+
754+
PG_RETURN_FLOAT4(res);
685755
}
686756

687757
/**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
721791
text*out;
722792
char*cptr;
723793
DocRepresentation*doc;
724-
intpos=0,
725-
p,
726-
q,
727-
olddwpos=0;
794+
intolddwpos=0;
728795
intncover=1;
796+
Extentionext;
729797

730798
doc=get_docrep(txt,query,&rlen);
731799

@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
765833
}
766834
qsort((void*)dw,dlen,sizeof(DocWord),compareDocWord);
767835

768-
while (Cover(doc,rlen,query,&pos,&p,&q))
836+
MemSet(&ext,0,sizeof(Extention) );
837+
while (Cover(doc,rlen,query,&ext))
769838
{
770839
dwptr=dw+olddwpos;
771-
while (dwptr->pos<p&&dwptr-dw<dlen)
840+
while (dwptr->pos<ext.p&&dwptr-dw<dlen)
772841
dwptr++;
773842
olddwpos=dwptr-dw;
774843
dwptr->start=ncover;
775-
while (dwptr->pos<q+1&&dwptr-dw<dlen)
844+
while (dwptr->pos<ext.q+1&&dwptr-dw<dlen)
776845
dwptr++;
777846
(dwptr-1)->finish=ncover;
778847
len+=4/* {}+two spaces */+2*16/* numbers */ ;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp