|
| 1 | +/*------------------------------------------------------------------------- |
| 2 | + * |
| 3 | + * rum_ts_utils.c |
| 4 | + *various support functions |
| 5 | + * |
| 6 | + * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group |
| 7 | + * |
| 8 | + *------------------------------------------------------------------------- |
| 9 | + */ |
| 10 | + |
| 11 | +#include"postgres.h" |
| 12 | + |
| 13 | +#include"catalog/pg_type.h" |
| 14 | +#include"tsearch/ts_type.h" |
| 15 | +#include"tsearch/ts_utils.h" |
| 16 | + |
| 17 | +#include"rum.h" |
| 18 | + |
| 19 | +#include<math.h> |
| 20 | + |
/* SQL-callable entry points defined in this file */
PG_FUNCTION_INFO_V1(gin_tsquery_pre_consistent);
PG_FUNCTION_INFO_V1(gin_tsquery_distance);
PG_FUNCTION_INFO_V1(gin_tsvector_config);
| 24 | + |
| 25 | +staticfloatcalc_rank_and(float*w,Datum*addInfo,bool*addInfoIsNull, |
| 26 | +intsize); |
| 27 | +staticfloatcalc_rank_or(float*w,Datum*addInfo,bool*addInfoIsNull, |
| 28 | +intsize); |
| 29 | + |
| 30 | +typedefstruct |
| 31 | +{ |
| 32 | +QueryItem*first_item; |
| 33 | +bool*check; |
| 34 | +int*map_item_operand; |
| 35 | +bool*need_recheck; |
| 36 | +}GinChkVal; |
| 37 | + |
| 38 | +staticbool |
| 39 | +checkcondition_gin(void*checkval,QueryOperand*val,ExecPhraseData*data) |
| 40 | +{ |
| 41 | +GinChkVal*gcv= (GinChkVal*)checkval; |
| 42 | +intj; |
| 43 | + |
| 44 | +/* if any val requiring a weight is used, set recheck flag */ |
| 45 | +if (val->weight!=0) |
| 46 | +*(gcv->need_recheck)= true; |
| 47 | + |
| 48 | +/* convert item's number to corresponding entry's (operand's) number */ |
| 49 | +j=gcv->map_item_operand[((QueryItem*)val)-gcv->first_item]; |
| 50 | + |
| 51 | +/* return presence of current entry in indexed value */ |
| 52 | +returngcv->check[j]; |
| 53 | +} |
| 54 | + |
| 55 | +Datum |
| 56 | +gin_tsquery_pre_consistent(PG_FUNCTION_ARGS) |
| 57 | +{ |
| 58 | +bool*check= (bool*)PG_GETARG_POINTER(0); |
| 59 | + |
| 60 | +TSQueryquery=PG_GETARG_TSQUERY(2); |
| 61 | + |
| 62 | +Pointer*extra_data= (Pointer*)PG_GETARG_POINTER(4); |
| 63 | +boolrecheck; |
| 64 | +boolres= FALSE; |
| 65 | + |
| 66 | +if (query->size>0) |
| 67 | +{ |
| 68 | +QueryItem*item; |
| 69 | +GinChkValgcv; |
| 70 | + |
| 71 | +/* |
| 72 | + * check-parameter array has one entry for each value (operand) in the |
| 73 | + * query. |
| 74 | + */ |
| 75 | +gcv.first_item=item=GETQUERY(query); |
| 76 | +gcv.check=check; |
| 77 | +gcv.map_item_operand= (int*) (extra_data[0]); |
| 78 | +gcv.need_recheck=&recheck; |
| 79 | + |
| 80 | +res=TS_execute(GETQUERY(query), |
| 81 | +&gcv, |
| 82 | + false, |
| 83 | +checkcondition_gin); |
| 84 | +} |
| 85 | + |
| 86 | +PG_RETURN_BOOL(res); |
| 87 | +} |
| 88 | + |
| 89 | +staticfloatweights[]= {0.1f,0.2f,0.4f,1.0f}; |
| 90 | + |
| 91 | +#definewpos(wep)( w[ WEP_GETWEIGHT(wep) ] ) |
| 92 | +/* A dummy WordEntryPos array to use when haspos is false */ |
| 93 | +staticWordEntryPosVectorPOSNULL= { |
| 94 | +1,/* Number of elements that follow */ |
| 95 | +{0} |
| 96 | +}; |
| 97 | + |
| 98 | +#defineLOWERMASK 0x1F |
| 99 | + |
| 100 | +/* |
| 101 | + * Returns a weight of a word collocation |
| 102 | + */ |
| 103 | +staticfloat4 |
| 104 | +word_distance(int32w) |
| 105 | +{ |
| 106 | +if (w>100) |
| 107 | +return1e-30f; |
| 108 | + |
| 109 | +return1.0 / (1.005+0.05*exp(((float4)w) /1.5-2)); |
| 110 | +} |
| 111 | + |
| 112 | +staticchar* |
| 113 | +decompress_pos(char*ptr,uint16*pos) |
| 114 | +{ |
| 115 | +inti; |
| 116 | +uint8v; |
| 117 | +uint16delta=0; |
| 118 | + |
| 119 | +i=0; |
| 120 | +while (true) |
| 121 | +{ |
| 122 | +v=*ptr; |
| 123 | +ptr++; |
| 124 | +if (v&HIGHBIT) |
| 125 | +{ |
| 126 | +delta |= (v& (~HIGHBIT)) <<i; |
| 127 | +} |
| 128 | +else |
| 129 | +{ |
| 130 | +delta |= (v&LOWERMASK) <<i; |
| 131 | +*pos+=delta; |
| 132 | +WEP_SETWEIGHT(*pos,v >>5); |
| 133 | +returnptr; |
| 134 | +} |
| 135 | +i+=7; |
| 136 | +} |
| 137 | +} |
| 138 | + |
| 139 | +staticint |
| 140 | +count_pos(char*ptr,intlen) |
| 141 | +{ |
| 142 | +intcount=0,i; |
| 143 | +for (i=0;i<len;i++) |
| 144 | +{ |
| 145 | +if (!(ptr[i]&HIGHBIT)) |
| 146 | +count++; |
| 147 | +} |
| 148 | +returncount; |
| 149 | +} |
| 150 | + |
| 151 | +staticfloat |
| 152 | +calc_rank_and(float*w,Datum*addInfo,bool*addInfoIsNull,intsize) |
| 153 | +{ |
| 154 | +inti, |
| 155 | +k, |
| 156 | +l, |
| 157 | +p; |
| 158 | +WordEntryPospost, |
| 159 | +ct; |
| 160 | +int32dimt, |
| 161 | +lenct, |
| 162 | +dist; |
| 163 | +floatres=-1.0; |
| 164 | +char*ptrt,*ptrc; |
| 165 | + |
| 166 | +if (size<2) |
| 167 | +{ |
| 168 | +returncalc_rank_or(w,addInfo,addInfoIsNull,size); |
| 169 | +} |
| 170 | +WEP_SETPOS(POSNULL.pos[0],MAXENTRYPOS-1); |
| 171 | + |
| 172 | +for (i=0;i<size;i++) |
| 173 | +{ |
| 174 | +if (!addInfoIsNull[i]) |
| 175 | +{ |
| 176 | +dimt=count_pos(VARDATA_ANY(addInfo[i]),VARSIZE_ANY_EXHDR(addInfo[i])); |
| 177 | +ptrt= (char*)VARDATA_ANY(addInfo[i]); |
| 178 | +} |
| 179 | +else |
| 180 | +{ |
| 181 | +dimt=POSNULL.npos; |
| 182 | +ptrt= (char*)POSNULL.pos; |
| 183 | +} |
| 184 | +for (k=0;k<i;k++) |
| 185 | +{ |
| 186 | +if (!addInfoIsNull[k]) |
| 187 | +lenct=count_pos(VARDATA_ANY(addInfo[k]),VARSIZE_ANY_EXHDR(addInfo[k])); |
| 188 | +else |
| 189 | +lenct=POSNULL.npos; |
| 190 | +post=0; |
| 191 | +for (l=0;l<dimt;l++) |
| 192 | +{ |
| 193 | +ptrt=decompress_pos(ptrt,&post); |
| 194 | +ct=0; |
| 195 | +if (!addInfoIsNull[k]) |
| 196 | +ptrc= (char*)VARDATA_ANY(addInfo[k]); |
| 197 | +else |
| 198 | +ptrc= (char*)POSNULL.pos; |
| 199 | +for (p=0;p<lenct;p++) |
| 200 | +{ |
| 201 | +ptrc=decompress_pos(ptrc,&ct); |
| 202 | +dist=Abs((int)WEP_GETPOS(post)- (int)WEP_GETPOS(ct)); |
| 203 | +if (dist|| (dist==0&& (ptrt== (char*)POSNULL.pos||ptrc== (char*)POSNULL.pos))) |
| 204 | +{ |
| 205 | +floatcurw; |
| 206 | + |
| 207 | +if (!dist) |
| 208 | +dist=MAXENTRYPOS; |
| 209 | +curw=sqrt(wpos(post)*wpos(ct)*word_distance(dist)); |
| 210 | +res= (res<0) ?curw :1.0- (1.0-res)* (1.0-curw); |
| 211 | +} |
| 212 | +} |
| 213 | +} |
| 214 | +} |
| 215 | + |
| 216 | +} |
| 217 | +returnres; |
| 218 | +} |
| 219 | + |
| 220 | +staticfloat |
| 221 | +calc_rank_or(float*w,Datum*addInfo,bool*addInfoIsNull,intsize) |
| 222 | +{ |
| 223 | +WordEntryPospost; |
| 224 | +int32dimt, |
| 225 | +j, |
| 226 | +i; |
| 227 | +floatres=0.0; |
| 228 | +char*ptrt; |
| 229 | + |
| 230 | +for (i=0;i<size;i++) |
| 231 | +{ |
| 232 | +floatresj, |
| 233 | +wjm; |
| 234 | +int32jm; |
| 235 | + |
| 236 | +if (!addInfoIsNull[i]) |
| 237 | +{ |
| 238 | +dimt=count_pos(VARDATA_ANY(addInfo[i]),VARSIZE_ANY_EXHDR(addInfo[i])); |
| 239 | +ptrt= (char*)VARDATA_ANY(addInfo[i]); |
| 240 | +} |
| 241 | +else |
| 242 | +{ |
| 243 | +dimt=POSNULL.npos; |
| 244 | +ptrt= (char*)POSNULL.pos; |
| 245 | +} |
| 246 | + |
| 247 | +resj=0.0; |
| 248 | +wjm=-1.0; |
| 249 | +jm=0; |
| 250 | +post=0; |
| 251 | +for (j=0;j<dimt;j++) |
| 252 | +{ |
| 253 | +ptrt=decompress_pos(ptrt,&post); |
| 254 | +resj=resj+wpos(post) / ((j+1)* (j+1)); |
| 255 | +if (wpos(post)>wjm) |
| 256 | +{ |
| 257 | +wjm=wpos(post); |
| 258 | +jm=j; |
| 259 | +} |
| 260 | +} |
| 261 | +/* |
| 262 | +limit (sum(i/i^2),i->inf) = pi^2/6 |
| 263 | +resj = sum(wi/i^2),i=1,noccurence, |
| 264 | +wi - should be sorted desc, |
| 265 | +don't sort for now, just choose maximum weight. This should be corrected |
| 266 | +Oleg Bartunov |
| 267 | +*/ |
| 268 | +res=res+ (wjm+resj-wjm / ((jm+1)* (jm+1))) /1.64493406685; |
| 269 | + |
| 270 | +} |
| 271 | +if (size>0) |
| 272 | +res=res /size; |
| 273 | +returnres; |
| 274 | +} |
| 275 | + |
| 276 | +staticfloat |
| 277 | +calc_rank(float*w,TSQueryq,Datum*addInfo,bool*addInfoIsNull,intsize) |
| 278 | +{ |
| 279 | +QueryItem*item=GETQUERY(q); |
| 280 | +floatres=0.0; |
| 281 | + |
| 282 | +if (!size|| !q->size) |
| 283 | +return0.0; |
| 284 | + |
| 285 | +/* XXX: What about NOT? */ |
| 286 | +res= (item->type==QI_OPR&&item->qoperator.oper==OP_AND) ? |
| 287 | +calc_rank_and(w,addInfo,addInfoIsNull,size) :calc_rank_or(w,addInfo,addInfoIsNull,size); |
| 288 | + |
| 289 | +if (res<0) |
| 290 | +res=1e-20f; |
| 291 | + |
| 292 | +returnres; |
| 293 | +} |
| 294 | + |
| 295 | +Datum |
| 296 | +gin_tsquery_distance(PG_FUNCTION_ARGS) |
| 297 | +{ |
| 298 | +/* bool *check = (bool *) PG_GETARG_POINTER(0); */ |
| 299 | + |
| 300 | +/* StrategyNumber strategy = PG_GETARG_UINT16(1); */ |
| 301 | +TSQueryquery=PG_GETARG_TSQUERY(2); |
| 302 | + |
| 303 | +int32nkeys=PG_GETARG_INT32(3); |
| 304 | +/* Pointer *extra_data = (Pointer *) PG_GETARG_POINTER(4); */ |
| 305 | +Datum*addInfo= (Datum*)PG_GETARG_POINTER(8); |
| 306 | +bool*addInfoIsNull= (bool*)PG_GETARG_POINTER(9); |
| 307 | +float8res; |
| 308 | + |
| 309 | +res=1.0 / (float8)calc_rank(weights,query,addInfo,addInfoIsNull,nkeys); |
| 310 | + |
| 311 | +PG_RETURN_FLOAT8(res); |
| 312 | +} |
| 313 | + |
| 314 | +Datum |
| 315 | +gin_tsvector_config(PG_FUNCTION_ARGS) |
| 316 | +{ |
| 317 | +GinConfig*config= (GinConfig*)PG_GETARG_POINTER(0); |
| 318 | +config->addInfoTypeOid=BYTEAOID; |
| 319 | +PG_RETURN_VOID(); |
| 320 | +} |