Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit 92bcb5a

Browse files
committed
Allow words in a substitution to be marked as not to be lexized.
Docs will be submitted somewhat later; for now they are at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
1 parent 63e464a, commit 92bcb5a

File tree

2 files changed

+69
-30
lines changed

2 files changed

+69
-30
lines changed

‎contrib/tsearch2/dict_thesaurus.c

Lines changed: 60 additions & 23 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1+
/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
22

33
/*
44
* thesaurus
@@ -13,6 +13,11 @@
1313
#include"common.h"
1414
#include"ts_locale.h"
1515

16+
/*
17+
* Temporarily we use TSLexeme.flags for internal use...
18+
*/
19+
#defineDT_USEASIS0x1000
20+
1621
typedefstructLexemeInfo {
1722
uint16idsubst;/* entry's number in DictThesaurus->subst */
1823
uint16posinsubst;/* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
9499
}
95100

96101
staticvoid
97-
addWrd(DictThesaurus*d,char*b,char*e,uint16idsubst,uint16nwrd,uint16posinsubst ) {
102+
addWrd(DictThesaurus*d,char*b,char*e,uint16idsubst,uint16nwrd,uint16posinsubst,booluseasis ) {
98103
staticintnres=0;
99104
staticintntres=0;
100105
TheSubstitute*ptr;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138143
ptr->res[nres ].lexeme[e-b]='\0';
139144

140145
ptr->res[nres ].nvariant=nwrd;
141-
ptr->res[nres ].flags=TSL_ADDPOS;
146+
if (useasis )
147+
ptr->res[nres ].flags=DT_USEASIS;
148+
else
149+
ptr->res[nres ].flags=0;
142150

143151
ptr->res[++nres ].lexeme=NULL;
144152
}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154162
charstr[BUFSIZ];
155163
intlineno=0;
156164
uint16idsubst=0;
165+
booluseasis=false;
157166

158167
fh=fopen(to_absfilename(filename),"r");
159168
if (!fh)
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196205
state=TR_WAITLEX;
197206
}
198207
}elseif (state==TR_WAITSUBS ) {
199-
if ( !t_isspace(ptr) ) {
208+
if (t_iseq(ptr,'*') ) {
209+
useasis= true;
210+
state=TR_INSUBS;
211+
beginwrd=ptr+pg_mblen(ptr);
212+
}elseif (t_iseq(ptr,'\\') ) {
213+
useasis= false;
214+
state=TR_INSUBS;
215+
beginwrd=ptr+pg_mblen(ptr);
216+
}elseif ( !t_isspace(ptr) ) {
217+
useasis= false;
200218
beginwrd=ptr;
201219
state=TR_INSUBS;
202220
}
203221
}elseif (state==TR_INSUBS ) {
204222
if (t_isspace(ptr) ) {
205-
addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst );
223+
if (ptr==beginwrd )
224+
elog(ERROR,"Thesaurus: Unexpected end of line or lexeme at %d line",lineno);
225+
addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst,useasis );
206226
state=TR_WAITSUBS;
207227
}
208228
}else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211231
ptr+=pg_mblen(ptr);
212232
}
213233

214-
if (state==TR_INSUBS )
215-
addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst );
234+
if (state==TR_INSUBS ) {
235+
if (ptr==beginwrd )
236+
elog(ERROR,"Thesaurus: Unexpected end of line or lexeme at %d line",lineno);
237+
addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst,useasis );
238+
}
216239

217240
idsubst++;
218241

@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319342
elog(ERROR,"Out of memory");
320343

321344
for(i=0;i<d->nwrds;i++) {
322-
TSLexeme*ptr= (TSLexeme*)DatumGetPointer(
345+
TSLexeme*ptr;
346+
347+
ptr= (TSLexeme*)DatumGetPointer(
323348
FunctionCall4(
324349
&(d->subdict.lexize_info),
325350
PointerGetDatum(d->subdict.dictionary),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331356

332357
if ( !(ptr&&ptr->lexeme) ) {
333358
if ( !ptr )
334-
elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary",d->wrds[i].lexeme);
359+
elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",
360+
d->wrds[i].lexeme,d->wrds[i].entries->idsubst+1);
335361
else
336-
elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word",d->wrds[i].lexeme);
362+
elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",
363+
d->wrds[i].lexeme,d->wrds[i].entries->idsubst+1);
337364

338365
newwrds=addCompiledLexeme(newwrds,&nnw,&tnm,NULL,d->wrds[i].entries,0);
339366
}else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413440
inptr=rem;
414441

415442
while(inptr&&inptr->lexeme ) {
416-
TSLexeme*reml,*lexized= (TSLexeme*)DatumGetPointer(
417-
FunctionCall4(
418-
&(d->subdict.lexize_info),
419-
PointerGetDatum(d->subdict.dictionary),
420-
PointerGetDatum(inptr->lexeme),
421-
Int32GetDatum(strlen(inptr->lexeme)),
422-
PointerGetDatum(NULL)
423-
)
424-
);
443+
TSLexeme*lexized,tmplex[2];
444+
445+
if (inptr->flags&DT_USEASIS ) {/* do not lexize */
446+
tmplex[0]=*inptr;
447+
tmplex[0].flags=0;
448+
tmplex[1].lexeme=NULL;
449+
lexized=tmplex;
450+
}else {
451+
lexized= (TSLexeme*)DatumGetPointer(
452+
FunctionCall4(
453+
&(d->subdict.lexize_info),
454+
PointerGetDatum(d->subdict.dictionary),
455+
PointerGetDatum(inptr->lexeme),
456+
Int32GetDatum(strlen(inptr->lexeme)),
457+
PointerGetDatum(NULL)
458+
)
459+
);
460+
}
425461

426-
reml=lexized;
427462
if (lexized&&lexized->lexeme ) {
428463
inttoset= (lexized->lexeme&&outptr!=d->subst[i].res ) ? (outptr-d->subst[i].res) :-1;
429464

@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447482

448483
if (toset>0)
449484
d->subst[i].res[toset].flags |=TSL_ADDPOS;
485+
}elseif (lexized ) {
486+
elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)",inptr->lexeme,i+1);
450487
}else {
451-
elog(NOTICE,"Thesaurus: word '%s' isn't recognizedby subdictionary or it's a stop-word, ignored",inptr->lexeme);
488+
elog(ERROR,"Thesaurus: word '%s'in substitionisn't recognized(rule %d)",inptr->lexeme,i+1);
452489
}
453490

454491
if (inptr->lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457494
}
458495

459496
if (outptr==d->subst[i].res )
460-
elog(ERROR,"Thesaurus: all words in subsitutionaren't recognized by subdictionary");
497+
elog(ERROR,"Thesaurus: all words in subsitutionare stop word (rule %d)",i+1);
461498

462499
d->subst[i].reslen=outptr-d->subst[i].res;
463500

@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717754

718755
infos= (LexemeInfo**)palloc(sizeof(LexemeInfo*)*nlex);
719756
for(i=0;i<nlex;i++)
720-
if ( (infos[i]=findTheLexeme(d,basevar[i].lexeme))==NULL )
757+
if ( (infos[i]=findTheLexeme(d,basevar[i].lexeme))==NULL )
721758
break;
722759

723760
if (i<nlex ) {

‎contrib/tsearch2/thesaurus

Lines changed: 9 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -1,14 +1,16 @@
11
#
22
# Thesaurus config file. Character ':' splits
3-
# string to part:
4-
# to be substituted string
5-
# substituting string
3+
# the string into parts, for example:
4+
# sample-words : substitute-words
65
#
6+
# Any substitute-word can be marked with a preceding '*' character,
7+
# which means that the word is not lexized
8+
# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary
79

8-
#one two three : 123
9-
#one two : 12
10-
#one : 1
11-
#two : 2
10+
#one two three :*123
11+
#one two :*12
12+
#one :*1
13+
#two :*2
1214

1315
#foo bar : blah blah
1416
#f bar : fbar

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp