Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 22505f4

Browse files
committed
Add thesaurus dictionary which can replace N>0 lexemes by M>0 lexemes.
It required some changes in the lexize algorithm, but the interface with dictionaries stays compatible with old dictionaries. Funded by Georgia Public Library Service and LibLime, Inc.
1 parent 3b7ed9b commit 22505f4

File tree

13 files changed

+1257
-129
lines changed

13 files changed

+1257
-129
lines changed

‎contrib/tsearch2/Makefile

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,13 +1,13 @@
1-
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.14 2006/05/02 11:28:54 teodor Exp $
1+
# $PostgreSQL: pgsql/contrib/tsearch2/Makefile,v 1.15 2006/05/31 14:05:31 teodor Exp $
22

33
MODULE_big = tsearch2
44
OBJS = dict_ex.o dict.o snmap.o stopword.o common.o prs_dcfg.o\
5-
dict_snowball.o dict_ispell.o dict_syn.o\
5+
dict_snowball.o dict_ispell.o dict_syn.odict_thesaurus.o\
66
wparser.o wparser_def.o\
77
ts_cfg.o tsvector.o query_cleanup.o crc32.o query.o gistidx.o\
88
tsvector_op.o rank.o ts_stat.o\
99
query_util.o query_support.o query_rewrite.o query_gist.o\
10-
ts_locale.o ginidx.o
10+
ts_locale.ots_lexize.oginidx.o
1111

1212
SUBDIRS := snowball ispell wordparser
1313
SUBDIROBJS :=$(SUBDIRS:%=%/SUBSYS.o)
@@ -16,7 +16,7 @@ OBJS+= $(SUBDIROBJS)
1616

1717
PG_CPPFLAGS = -I$(srcdir)/snowball -I$(srcdir)/ispell -I$(srcdir)/wordparser
1818

19-
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8
19+
DATA = stopword/english.stop stopword/russian.stop stopword/russian.stop.utf8 thesaurus
2020
DATA_built = tsearch2.sql untsearch2.sql
2121
DOCS = README.tsearch2
2222
REGRESS = tsearch2

‎contrib/tsearch2/common.c

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@
55
#include"catalog/pg_proc.h"
66
#include"catalog/pg_namespace.h"
77
#include"utils/syscache.h"
8+
#include"miscadmin.h"
89

910
#include"ts_cfg.h"
1011
#include"dict.h"
@@ -163,3 +164,23 @@ get_oidnamespace(Oid funcoid)
163164

164165
returnnspoid;
165166
}
167+
168+
/*
 * to_absfilename
 *
 * If filename is already an absolute path (per is_absolute_path), it is
 * returned unchanged.  Otherwise it is taken as relative to the share
 * directory obtained from get_share_path(my_exec_path, ...), and a newly
 * palloc'd string "<sharepath><delim><filename>" is returned instead.
 *
 * The return value may therefore be either the caller's own pointer or a
 * fresh allocation; nothing in the result tells the two cases apart.
 */
169+
char*
170+
to_absfilename(char*filename) {
171+
if (!is_absolute_path(filename)) {
172+
charsharepath[MAXPGPATH];
173+
char*absfn;
174+
/* path separator: backslash when WIN32 is defined, slash otherwise */
#ifdefWIN32
175+
chardelim='\\';
176+
#else
177+
chardelim='/';
178+
#endif
179+
get_share_path(my_exec_path,sharepath);
180+
/* +2: one byte for the delimiter and one for the terminating NUL */
absfn=palloc(strlen(sharepath)+strlen(filename)+2);
181+
sprintf(absfn,"%s%c%s",sharepath,delim,filename);
182+
filename=absfn;
183+
}
184+
185+
returnfilename;
186+
}

‎contrib/tsearch2/common.h

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@ text *mtextdup(text *in);
1616

1717
inttext_cmp(text*a,text*b);
1818

19+
char*to_absfilename(char*filename);
20+
1921
#defineNEXTVAL(x) ( (text*)( (char*)(x) + INTALIGN( VARSIZE(x) ) ) )
2022
#defineARRNELEMS(x) ArrayGetNItems( ARR_NDIM(x), ARR_DIMS(x))
2123

‎contrib/tsearch2/dict.c

Lines changed: 50 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.11 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.c,v 1.12 2006/05/31 14:05:31 teodor Exp $ */
22

33
/*
44
* interface functions to dictionary
@@ -50,16 +50,19 @@ init_dict(Oid id, DictInfo * dict)
5050
Datumopt;
5151
Oidoid=InvalidOid;
5252

53+
/* setup dictlexize method */
54+
oid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0],SPI_tuptable->tupdesc,3,&isnull));
55+
if (isnull||oid==InvalidOid)
56+
ts_error(ERROR,"Null dict_lexize for dictonary %d",id);
57+
fmgr_info_cxt(oid,&(dict->lexize_info),TopMemoryContext);
58+
59+
/* setup and call dictinit method, optionally */
5360
oid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0],SPI_tuptable->tupdesc,1,&isnull));
5461
if (!(isnull||oid==InvalidOid))
5562
{
5663
opt=SPI_getbinval(SPI_tuptable->vals[0],SPI_tuptable->tupdesc,2,&isnull);
5764
dict->dictionary= (void*)DatumGetPointer(OidFunctionCall1(oid,opt));
5865
}
59-
oid=DatumGetObjectId(SPI_getbinval(SPI_tuptable->vals[0],SPI_tuptable->tupdesc,3,&isnull));
60-
if (isnull||oid==InvalidOid)
61-
ts_error(ERROR,"Null dict_lexize for dictonary %d",id);
62-
fmgr_info_cxt(oid,&(dict->lexize_info),TopMemoryContext);
6366
dict->dict_id=id;
6467
}
6568
else
@@ -98,6 +101,29 @@ comparedict(const void *a, const void *b)
98101
return (((DictInfo*)a)->dict_id< ((DictInfo*)b)->dict_id) ?-1 :1;
99102
}
100103

104+
/*
 * insertdict
 *
 * Initialize the dictionary with the given id via init_dict() and append
 * it to the global DList, then re-sort DList.list with comparedict.
 *
 * When the array is full (len == reallen) it is first grown with realloc:
 * the capacity is doubled, or set to 16 if it was 0.  A failed realloc is
 * reported through ts_error(ERROR, "No memory").
 *
 * Because of the qsort, the position of the new entry is not known to the
 * caller; it has to be looked up again afterwards.
 */
staticvoid
105+
insertdict(Oidid) {
106+
DictInfonewdict;
107+
108+
if (DList.len==DList.reallen)
109+
{
110+
DictInfo*tmp;
111+
intreallen= (DList.reallen) ?2*DList.reallen :16;
112+
113+
/* keep the old pointer until realloc is known to have succeeded */
tmp= (DictInfo*)realloc(DList.list,sizeof(DictInfo)*reallen);
114+
if (!tmp)
115+
ts_error(ERROR,"No memory");
116+
DList.reallen=reallen;
117+
DList.list=tmp;
118+
}
119+
/*
 * The entry is built in a local variable and only then copied into the
 * list.  NOTE(review): presumably this keeps DList untouched if
 * init_dict() raises an error -- confirm.
 */
init_dict(id,&newdict);
120+
121+
DList.list[DList.len]=newdict;
122+
DList.len++;
123+
124+
qsort(DList.list,DList.len,sizeof(DictInfo),comparedict);
125+
}
126+
101127
DictInfo*
102128
finddict(Oidid)
103129
{
@@ -117,23 +143,8 @@ finddict(Oid id)
117143
returnDList.last_dict;
118144
}
119145

120-
/* last chance */
121-
if (DList.len==DList.reallen)
122-
{
123-
DictInfo*tmp;
124-
intreallen= (DList.reallen) ?2*DList.reallen :16;
125-
126-
tmp= (DictInfo*)realloc(DList.list,sizeof(DictInfo)*reallen);
127-
if (!tmp)
128-
ts_error(ERROR,"No memory");
129-
DList.reallen=reallen;
130-
DList.list=tmp;
131-
}
132-
DList.last_dict=&(DList.list[DList.len]);
133-
init_dict(id,DList.last_dict);
134-
135-
DList.len++;
136-
qsort(DList.list,DList.len,sizeof(DictInfo),comparedict);
146+
/* insert new dictionary */
147+
insertdict(id);
137148
returnfinddict(id);/* qsort changed order!! */ ;
138149
}
139150

@@ -190,17 +201,32 @@ lexize(PG_FUNCTION_ARGS)
190201
*ptr;
191202
Datum*da;
192203
ArrayType*a;
204+
DictSubStatedstate= { false, false,NULL };
193205

194206
SET_FUNCOID();
195207
dict=finddict(PG_GETARG_OID(0));
196208

197209
ptr=res= (TSLexeme*)DatumGetPointer(
198-
FunctionCall3(&(dict->lexize_info),
210+
FunctionCall4(&(dict->lexize_info),
211+
PointerGetDatum(dict->dictionary),
212+
PointerGetDatum(VARDATA(in)),
213+
Int32GetDatum(VARSIZE(in)-VARHDRSZ),
214+
PointerGetDatum(&dstate)
215+
)
216+
);
217+
218+
if (dstate.getnext) {
219+
dstate.isend= true;
220+
ptr=res= (TSLexeme*)DatumGetPointer(
221+
FunctionCall4(&(dict->lexize_info),
199222
PointerGetDatum(dict->dictionary),
200223
PointerGetDatum(VARDATA(in)),
201-
Int32GetDatum(VARSIZE(in)-VARHDRSZ)
224+
Int32GetDatum(VARSIZE(in)-VARHDRSZ),
225+
PointerGetDatum(&dstate)
202226
)
203227
);
228+
}
229+
204230
PG_FREE_IF_COPY(in,1);
205231
if (!res)
206232
{

‎contrib/tsearch2/dict.h

Lines changed: 55 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -1,9 +1,10 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.6 2006/03/11 04:38:30 momjian Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict.h,v 1.7 2006/05/31 14:05:31 teodor Exp $ */
22

33
#ifndef__DICT_H__
44
#define__DICT_H__
55
#include"postgres.h"
66
#include"fmgr.h"
7+
#include"ts_cfg.h"
78

89
typedefstruct
910
{
@@ -29,6 +30,11 @@ DictInfo *finddict(Oid id);
2930
Oidname2id_dict(text*name);
3031
voidreset_dict(void);
3132

33+
/*
 * State exchanged between a caller and a dictionary's lexize method;
 * lexize() passes a pointer to it as the fourth argument of that method.
 */
typedefstruct {
34+
boolisend;/* in: tells the lexize method that the end of the text has been reached */
35+
boolgetnext;/* out: set by the dictionary when it wants the next lexeme */
36+
void*private;/* internal dict state kept between calls with getnext == true */
37+
}DictSubState;
3238

3339
/* simple parser of cfg string */
3440
typedefstruct
@@ -45,17 +51,61 @@ typedef struct
4551
/*
4652
* number of the variant of a split word. For example, the word 'fotballklubber'
4753
* (Norwegian) has two variants of splitting: ( fotball, klubb ) and ( fot,
48-
* ball, klubb ). So, dictionary should return: nvariantlexeme 1
49-
* fotball 1 klubb 2 fot 2 ball 2 klubb
50-
*
54+
* ball, klubb ). So, dictionary should return:
55+
* nvariant    lexeme
56+
*    1        fotball
57+
*    1        klubb
58+
*    2        fot
59+
*    2        ball
60+
*    2        klubb
5161
*/
5262
uint16nvariant;
5363

54-
/* currently unused */
5564
uint16flags;
5665

5766
/* C-string */
5867
char*lexeme;
5968
}TSLexeme;
6069

70+
#defineTSL_ADDPOS0x01
71+
72+
73+
/*
74+
* Lexize subsystem
75+
*/
76+
77+
/*
 * Node of a singly linked list (linked through `next`).  The type, lemm
 * and lenlemm fields mirror the arguments of LexizeAddLemm(), so
 * presumably one node is created per lexeme added there -- confirm.
 */
typedefstructParsedLex {
78+
/* NOTE(review): presumably the token type reported by the parser -- confirm */
inttype;
79+
/* lexeme text; its length is carried separately in lenlemm */
char*lemm;
80+
intlenlemm;
81+
/* NOTE(review): meaning is not visible in this header -- see the lexize implementation */
boolresfollow;
82+
/* next node in the list */
structParsedLex*next;
83+
}ParsedLex;
84+
85+
/* Head and tail pointers of a list of ParsedLex nodes. */
typedefstructListParsedLex {
86+
ParsedLex*head;
87+
ParsedLex*tail;
88+
}ListParsedLex;
89+
90+
/*
 * Working state of the lexize subsystem; a pointer to it is taken by
 * LexizeInit(), LexizeAddLemm() and LexizeExec().
 */
typedefstruct {
91+
/* configuration in use; LexizeInit() receives a TSCfgInfo pointer */
TSCfgInfo*cfg;
92+
/* NOTE(review): presumably the dictionary currently being consulted -- confirm */
OidcurDictId;
93+
/* NOTE(review): presumably the position of that dictionary in the configuration's list -- confirm */
intposDict;
94+
/* NOTE(review): presumably the sub-state handed to the current dictionary's lexize method -- confirm */
DictSubStatedictState;
95+
ParsedLex*curSub;
96+
ListParsedLextowork;/* current list of lexemes to work on */
97+
ListParsedLexwaste;/* list of lexemes that have already been lexized */
98+
99+
/* fields to store the last variant to lexize (basically for a thesaurus
100+
or a similar dictionary, which wants several lexemes) */
101+
102+
ParsedLex*lastRes;
103+
TSLexeme*tmpRes;
104+
}LexizeData;
105+
106+
107+
voidLexizeInit(LexizeData*ld,TSCfgInfo*cfg);
108+
voidLexizeAddLemm(LexizeData*ld,inttype,char*lemm,intlenlemm);
109+
TSLexeme*LexizeExec(LexizeData*ld,ParsedLex**correspondLexem);
110+
61111
#endif

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp