@@ -41,7 +41,13 @@ static float weights[] = {0.1, 0.2, 0.4, 1.0};
4141
4242#define wpos (wep )( w[ WEP_GETWEIGHT(wep) ] )
4343
44- #define DEF_NORM_METHOD 0
44+ #define RANK_NO_NORM 0x00 /* no bit set: none of the normalization divisions is applied */
45+ #define RANK_NORM_LOGLENGTH 0x01 /* divide the rank by a logarithm of cnt_length() + 1 */
46+ #define RANK_NORM_LENGTH 0x02 /* divide the rank by cnt_length() when it is positive */
47+ #define RANK_NORM_EXTDIST 0x04 /* calc_rank_cd only: divide by NExtent / SumDist */
48+ #define RANK_NORM_UNIQ 0x08 /* divide the rank by the tsvector's size field */
49+ #define RANK_NORM_LOGUNIQ 0x10 /* divide the rank by log2 of (size field + 1) */
50+ #define DEF_NORM_METHOD RANK_NO_NORM /* method used when the caller supplies none */
4551
4652static float calc_rank_or (float * w ,tsvector * t ,QUERYTYPE * q );
4753static float calc_rank_and (float * w ,tsvector * t ,QUERYTYPE * q );
@@ -328,23 +334,21 @@ calc_rank(float *w, tsvector * t, QUERYTYPE * q, int4 method)
328334if (res < 0 )
329335res = 1e-20 ;
330336
331- switch (method )
332- {
333- case 0 :
334- break ;
335- case 1 :
336- res /=log ((float ) (cnt_length (t )+ 1 )) /log (2.0 );
337- break ;
338- case 2 :
339- len = cnt_length (t );
340- if (len > 0 )
341- res /= (float )len ;
342- break ;
343- default :
344- /* internal error */
345- elog (ERROR ,"unrecognized normalization method: %d" ,method );
337+ if ( (method & RANK_NORM_LOGLENGTH )&& t -> size > 0 )
338+ res /=log ((double ) (cnt_length (t )+ 1 )) /log (2.0 );
339+
340+ if (method & RANK_NORM_LENGTH ) {
341+ len = cnt_length (t );
342+ if (len > 0 )
343+ res /= (float )len ;
346344}
347345
346+ if ( (method & RANK_NORM_UNIQ )&& t -> size > 0 )
347+ res /= (float )(t -> size );
348+
349+ if ( (method & RANK_NORM_LOGUNIQ )&& t -> size > 0 )
350+ res /=log ((double ) (t -> size + 1 )) /log (2.0 );
351+
348352return res ;
349353}
350354
@@ -420,6 +424,7 @@ typedef struct
420424ITEM * * item ;
421425int16 nitem ;
422426bool needfree ;
427+ uint8 wclass ;
423428int32 pos ;
424429}DocRepresentation ;
425430
@@ -452,19 +457,28 @@ reset_istrue_flag(QUERYTYPE * query)
452457}
453458}
454459
460+ typedef struct { /* search state and result handed to Cover() */
461+ int pos ; /* index into the doc array at which the next Cover() scan starts */
462+ int p ; /* smallest ->pos value of the cover that was found */
463+ int q ; /* largest ->pos value of the cover that was found */
464+ DocRepresentation * begin ; /* doc entry that supplied p */
465+ DocRepresentation * end ; /* doc entry that supplied q */
466+ }Extention ;
467+
468+
455469static bool
456- Cover (DocRepresentation * doc ,int len ,QUERYTYPE * query ,int * pos , int * p , int * q )
470+ Cover (DocRepresentation * doc ,int len ,QUERYTYPE * query ,Extention * ext )
457471{
458472DocRepresentation * ptr ;
459- int lastpos = * pos ;
473+ int lastpos = ext -> pos ;
460474int i ;
461475bool found = false;
462476
463477reset_istrue_flag (query );
464478
465- * p = 0x7fffffff ;
466- * q = 0 ;
467- ptr = doc + * pos ;
479+ ext -> p = 0x7fffffff ;
480+ ext -> q = 0 ;
481+ ptr = doc + ext -> pos ;
468482
469483/* find upper bound of cover from current position, move up */
470484while (ptr - doc < len )
@@ -473,9 +487,10 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
473487ptr -> item [i ]-> istrue = 1 ;
474488if (TS_execute (GETQUERY (query ),NULL , false,checkcondition_ITEM ))
475489{
476- if (ptr -> pos > * q )
490+ if (ptr -> pos > ext -> q )
477491{
478- * q = ptr -> pos ;
492+ ext -> q = ptr -> pos ;
493+ ext -> end = ptr ;
479494lastpos = ptr - doc ;
480495found = true;
481496}
@@ -498,25 +513,27 @@ Cover(DocRepresentation * doc, int len, QUERYTYPE * query, int *pos, int *p, int
498513ptr -> item [i ]-> istrue = 1 ;
499514if (TS_execute (GETQUERY (query ),NULL , true,checkcondition_ITEM ))
500515{
501- if (ptr -> pos < * p )
502- * p = ptr -> pos ;
516+ if (ptr -> pos < ext -> p ) {
517+ ext -> begin = ptr ;
518+ ext -> p = ptr -> pos ;
519+ }
503520break ;
504521}
505522ptr -- ;
506523}
507524
508- if (* p <=* q )
525+ if (ext -> p <=ext -> q )
509526{
510527/*
511528 * set position for next try to next lexeme after begining of founded
512529 * cover
513530 */
514- * pos = (ptr - doc )+ 1 ;
531+ ext -> pos = (ptr - doc )+ 1 ;
515532return true;
516533}
517534
518- ( * pos ) ++ ;
519- return Cover (doc ,len ,query ,pos , p , q );
535+ ext -> pos ++ ;
536+ return Cover (doc ,len ,query ,ext );
520537}
521538
522539static DocRepresentation *
@@ -593,6 +610,7 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
593610doc [cur ].item = doc [cur - 1 ].item ;
594611}
595612doc [cur ].pos = WEP_GETPOS (post [j ]);
613+ doc [cur ].wclass = WEP_GETWEIGHT (post [j ]);
596614cur ++ ;
597615}
598616}
@@ -610,61 +628,110 @@ get_docrep(tsvector * txt, QUERYTYPE * query, int *doclen)
610628return NULL ;
611629}
612630
613-
614- Datum
615- rank_cd (PG_FUNCTION_ARGS )
616- {
617- int K = PG_GETARG_INT32 (0 );
618- tsvector * txt = (tsvector * )PG_DETOAST_DATUM (PG_GETARG_DATUM (1 ));
619- QUERYTYPE * query = (QUERYTYPE * )PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (2 ));
620- int method = DEF_NORM_METHOD ;
631+ static float4
632+ calc_rank_cd (float4 * arrdata ,tsvector * txt ,QUERYTYPE * query ,int method ) { /* cover-density rank of txt for query; arrdata holds lengthof(weights) class weights, method is a RANK_NORM_* bit mask */
621633DocRepresentation * doc ;
622- float res = 0.0 ;
623- int p = 0 ,
624- q = 0 ,
625- len ,
626- cur ,
634+ int len ,
627635i ,
628636doclen = 0 ;
637+ Extention ext ;
638+ double Wdoc = 0.0 ;
639+ double invws [lengthof (weights )];
640+ double SumDist = 0.0 ,PrevExtPos = 0.0 ,CurExtPos = 0.0 ;
641+ int NExtent = 0 ;
629642
630- doc = get_docrep (txt ,query ,& doclen );
631- if (!doc )
643+ for (i = 0 ;i < lengthof (weights );i ++ )
632644{
633- PG_FREE_IF_COPY (txt ,1 );
634- PG_FREE_IF_COPY (query ,2 );
635- PG_RETURN_FLOAT4 (0.0 );
645+ invws [i ]= ((double )((arrdata [i ] >=0 ) ?arrdata [i ] :weights [i ])); /* a negative entry selects the built-in default weight */
646+ if (invws [i ]> 1.0 )
647+ ereport (ERROR ,
648+ (errcode (ERRCODE_INVALID_PARAMETER_VALUE ),
649+ errmsg ("weight out of range" )));
650+ invws [i ]= 1.0 /invws [i ]; /* stored inverted because the cover loop sums 1/weight; NOTE(review): a weight of exactly 0 is not rejected above */
636651}
637652
638- cur = 0 ;
639- if (K <=0 )
640- K = 4 ;
641- while (Cover (doc ,doclen ,query ,& cur ,& p ,& q ))
642- res += (q - p + 1 > K ) ? ((float )K ) / ((float ) (q - p + 1 )) :1.0 ;
653+ doc = get_docrep (txt ,query ,& doclen );
654+ if (!doc )
655+ return 0.0 ; /* no document representation: rank is zero */
643656
644- if (PG_NARGS ()== 4 )
645- method = PG_GETARG_INT32 (3 );
657+ MemSet (& ext ,0 ,sizeof (Extention ) ); /* first Cover() scan starts at doc[0] */
658+ while (Cover (doc ,doclen ,query ,& ext )) {
659+ double Cpos = 0.0 ,InvSum = 0.0 ;
660+ int nNoise ;
661+ DocRepresentation * ptr = ext .begin ;
646662
647- switch (method )
648- {
649- case 0 :
650- break ;
651- case 1 :
652- res /=log ((float ) (cnt_length (txt )+ 1 ));
653- break ;
654- case 2 :
655- len = cnt_length (txt );
656- if (len > 0 )
657- res /= (float )len ;
658- break ;
659- default :
660- /* internal error */
661- elog (ERROR ,"unrecognized normalization method: %d" ,method );
663+ while (ptr <=ext .end ) { /* sum inverted weights over every doc entry of the cover */
664+ InvSum += invws [ptr -> wclass ];
665+ ptr ++ ;
666+ }
667+ nNoise = (ext .q - ext .p )- (int )(ext .end - ext .begin ); /* position span minus number of matched entries */
668+ if (nNoise < 0 )nNoise = ((int )(ext .end - ext .begin ))/2 ; /* entries sharing a position make this negative, which would zero or negate the divisor below */
669+ Cpos = ((double )(ext .end - ext .begin + 1 )) /InvSum ;
670+ Wdoc += Cpos / ((double )(1 + nNoise ));
671+ CurExtPos = ((double )(ext .q + ext .p ))/2.0 ;
672+ if (NExtent > 0 && CurExtPos > PrevExtPos /* prevent division by zero when consecutive covers have the same midpoint */ )
673+ SumDist += 1.0 /(CurExtPos - PrevExtPos );
674+ 
675+ PrevExtPos = CurExtPos ;
676+ NExtent ++ ;
677+ }
678+ 
679+ if ( (method & RANK_NORM_LOGLENGTH )&& txt -> size > 0 )
680+ Wdoc /=log ((double ) (cnt_length (txt )+ 1 ));
681+ 
682+ if (method & RANK_NORM_LENGTH ) {
683+ len = cnt_length (txt );
684+ if (len > 0 )
685+ Wdoc /= (double )len ;
662686}
663687
688+ if ( (method & RANK_NORM_EXTDIST )&& SumDist > 0 )
689+ Wdoc /= ((double )NExtent ) /SumDist ;
690+ 
691+ if ( (method & RANK_NORM_UNIQ )&& txt -> size > 0 )
692+ Wdoc /= (double )(txt -> size );
693+ 
694+ if ( (method & RANK_NORM_LOGUNIQ )&& txt -> size > 0 )
695+ Wdoc /=log ((double ) (txt -> size + 1 )) /log (2.0 );
696+ 
664697for (i = 0 ;i < doclen ;i ++ )
665698if (doc [i ].needfree )
666699pfree (doc [i ].item );
667700pfree (doc );
701+ 
702+ return (float4 )Wdoc ;
703+ }
704+
705+ Datum
706+ rank_cd (PG_FUNCTION_ARGS )
707+ {
708+ ArrayType * win = (ArrayType * )PG_DETOAST_DATUM (PG_GETARG_DATUM (0 ));
709+ tsvector * txt = (tsvector * )PG_DETOAST_DATUM (PG_GETARG_DATUM (1 ));
710+ QUERYTYPE * query = (QUERYTYPE * )PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (2 ));
711+ int method = DEF_NORM_METHOD ;
712+ float4 res ;
713+
714+ if (ARR_NDIM (win )!= 1 )
715+ ereport (ERROR ,
716+ (errcode (ERRCODE_ARRAY_SUBSCRIPT_ERROR ),
717+ errmsg ("array of weight must be one-dimensional" )));
718+
719+ if (ARRNELEMS (win )< lengthof (weights ))
720+ ereport (ERROR ,
721+ (errcode (ERRCODE_ARRAY_SUBSCRIPT_ERROR ),
722+ errmsg ("array of weight is too short" )));
723+
724+ if (ARR_HASNULL (win ))
725+ ereport (ERROR ,
726+ (errcode (ERRCODE_NULL_VALUE_NOT_ALLOWED ),
727+ errmsg ("array of weight must not contain nulls" )));
728+
729+ if (PG_NARGS ()== 4 )
730+ method = PG_GETARG_INT32 (3 );
731+
732+ res = calc_rank_cd ( (float4 * )ARR_DATA_PTR (win ),txt ,query ,method );
733+
734+ PG_FREE_IF_COPY (win ,0 );
668735PG_FREE_IF_COPY (txt ,1 );
669736PG_FREE_IF_COPY (query ,2 );
670737
@@ -675,13 +742,16 @@ rank_cd(PG_FUNCTION_ARGS)
675742Datum
676743rank_cd_def (PG_FUNCTION_ARGS )
677744{
678- PG_RETURN_DATUM (DirectFunctionCall4 (
679- rank_cd ,
680- Int32GetDatum (-1 ),
681- PG_GETARG_DATUM (0 ),
682- PG_GETARG_DATUM (1 ),
683- (PG_NARGS ()== 3 ) ?PG_GETARG_DATUM (2 ) :Int32GetDatum (DEF_NORM_METHOD )
684- ));
745+ tsvector * txt = (tsvector * )PG_DETOAST_DATUM (PG_GETARG_DATUM (0 ));
746+ QUERYTYPE * query = (QUERYTYPE * )PG_DETOAST_DATUM_COPY (PG_GETARG_DATUM (1 ));
747+ float4 res ;
748+ /* rank with the built-in weights table; an optional third argument supplies the normalization method as an int4 */
749+ res = calc_rank_cd (weights ,txt ,query , (PG_NARGS ()== 3 ) ?PG_GETARG_INT32 (2 ) :DEF_NORM_METHOD );
750+ /* txt is argument 0 and query is argument 1 of this function, so the copy checks must name those slots */
751+ PG_FREE_IF_COPY (txt ,0 );
752+ PG_FREE_IF_COPY (query ,1 );
753+ 
754+ PG_RETURN_FLOAT4 (res );
685755}
686756
687757/**************debug*************/
@@ -721,11 +791,9 @@ get_covers(PG_FUNCTION_ARGS)
721791text * out ;
722792char * cptr ;
723793DocRepresentation * doc ;
724- int pos = 0 ,
725- p ,
726- q ,
727- olddwpos = 0 ;
794+ int olddwpos = 0 ;
728795int ncover = 1 ;
796+ Extention ext ;
729797
730798doc = get_docrep (txt ,query ,& rlen );
731799
@@ -765,14 +833,15 @@ get_covers(PG_FUNCTION_ARGS)
765833}
766834qsort ((void * )dw ,dlen ,sizeof (DocWord ),compareDocWord );
767835
768- while (Cover (doc ,rlen ,query ,& pos ,& p ,& q ))
836+ MemSet (& ext ,0 ,sizeof (Extention ) );
837+ while (Cover (doc ,rlen ,query ,& ext ))
769838{
770839dwptr = dw + olddwpos ;
771- while (dwptr -> pos < p && dwptr - dw < dlen )
840+ while (dwptr -> pos < ext . p && dwptr - dw < dlen )
772841dwptr ++ ;
773842olddwpos = dwptr - dw ;
774843dwptr -> start = ncover ;
775- while (dwptr -> pos < q + 1 && dwptr - dw < dlen )
844+ while (dwptr -> pos < ext . q + 1 && dwptr - dw < dlen )
776845dwptr ++ ;
777846(dwptr - 1 )-> finish = ncover ;
778847len += 4 /* {}+two spaces */ + 2 * 16 /* numbers */ ;