1
- /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1
+ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
2
2
3
3
/*
4
4
* thesaurus
13
13
#include "common.h"
14
14
#include "ts_locale.h"
15
15
16
+ /*
17
+ * Temporarily, we use TSLexeme.flags for internal use...
18
+ */
19
+ #define DT_USEASIS 0x1000
20
+
16
21
typedef struct LexemeInfo {
17
22
uint16 idsubst ;/* entry's number in DictThesaurus->subst */
18
23
uint16 posinsubst ;/* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
94
99
}
95
100
96
101
static void
97
- addWrd (DictThesaurus * d ,char * b ,char * e ,uint16 idsubst ,uint16 nwrd ,uint16 posinsubst ) {
102
+ addWrd (DictThesaurus * d ,char * b ,char * e ,uint16 idsubst ,uint16 nwrd ,uint16 posinsubst , bool useasis ) {
98
103
static int nres = 0 ;
99
104
static int ntres = 0 ;
100
105
TheSubstitute * ptr ;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138
143
ptr -> res [nres ].lexeme [e - b ]= '\0' ;
139
144
140
145
ptr -> res [nres ].nvariant = nwrd ;
141
- ptr -> res [nres ].flags = TSL_ADDPOS ;
146
+ if (useasis )
147
+ ptr -> res [nres ].flags = DT_USEASIS ;
148
+ else
149
+ ptr -> res [nres ].flags = 0 ;
142
150
143
151
ptr -> res [++ nres ].lexeme = NULL ;
144
152
}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154
162
char str [BUFSIZ ];
155
163
int lineno = 0 ;
156
164
uint16 idsubst = 0 ;
165
+ bool useasis = false;
157
166
158
167
fh = fopen (to_absfilename (filename ),"r" );
159
168
if (!fh )
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196
205
state = TR_WAITLEX ;
197
206
}
198
207
}else if (state == TR_WAITSUBS ) {
199
- if ( !t_isspace (ptr ) ) {
208
+ if (t_iseq (ptr ,'*' ) ) {
209
+ useasis = true;
210
+ state = TR_INSUBS ;
211
+ beginwrd = ptr + pg_mblen (ptr );
212
+ }else if (t_iseq (ptr ,'\\' ) ) {
213
+ useasis = false;
214
+ state = TR_INSUBS ;
215
+ beginwrd = ptr + pg_mblen (ptr );
216
+ }else if ( !t_isspace (ptr ) ) {
217
+ useasis = false;
200
218
beginwrd = ptr ;
201
219
state = TR_INSUBS ;
202
220
}
203
221
}else if (state == TR_INSUBS ) {
204
222
if (t_isspace (ptr ) ) {
205
- addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst );
223
+ if (ptr == beginwrd )
224
+ elog (ERROR ,"Thesaurus: Unexpected end of line or lexeme at %d line" ,lineno );
225
+ addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst ,useasis );
206
226
state = TR_WAITSUBS ;
207
227
}
208
228
}else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211
231
ptr += pg_mblen (ptr );
212
232
}
213
233
214
- if (state == TR_INSUBS )
215
- addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst );
234
+ if (state == TR_INSUBS ) {
235
+ if (ptr == beginwrd )
236
+ elog (ERROR ,"Thesaurus: Unexpected end of line or lexeme at %d line" ,lineno );
237
+ addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst ,useasis );
238
+ }
216
239
217
240
idsubst ++ ;
218
241
@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319
342
elog (ERROR ,"Out of memory" );
320
343
321
344
for (i = 0 ;i < d -> nwrds ;i ++ ) {
322
- TSLexeme * ptr = (TSLexeme * )DatumGetPointer (
345
+ TSLexeme * ptr ;
346
+
347
+ ptr = (TSLexeme * )DatumGetPointer (
323
348
FunctionCall4 (
324
349
& (d -> subdict .lexize_info ),
325
350
PointerGetDatum (d -> subdict .dictionary ),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331
356
332
357
if ( !(ptr && ptr -> lexeme ) ) {
333
358
if ( !ptr )
334
- elog (ERROR ,"Thesaurus: word '%s' isn't recognized by subdictionary" ,d -> wrds [i ].lexeme );
359
+ elog (ERROR ,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)" ,
360
+ d -> wrds [i ].lexeme ,d -> wrds [i ].entries -> idsubst + 1 );
335
361
else
336
- elog (NOTICE ,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word" ,d -> wrds [i ].lexeme );
362
+ elog (NOTICE ,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)" ,
363
+ d -> wrds [i ].lexeme ,d -> wrds [i ].entries -> idsubst + 1 );
337
364
338
365
newwrds = addCompiledLexeme (newwrds ,& nnw ,& tnm ,NULL ,d -> wrds [i ].entries ,0 );
339
366
}else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413
440
inptr = rem ;
414
441
415
442
while (inptr && inptr -> lexeme ) {
416
- TSLexeme * reml ,* lexized = (TSLexeme * )DatumGetPointer (
417
- FunctionCall4 (
418
- & (d -> subdict .lexize_info ),
419
- PointerGetDatum (d -> subdict .dictionary ),
420
- PointerGetDatum (inptr -> lexeme ),
421
- Int32GetDatum (strlen (inptr -> lexeme )),
422
- PointerGetDatum (NULL )
423
- )
424
- );
443
+ TSLexeme * lexized ,tmplex [2 ];
444
+
445
+ if (inptr -> flags & DT_USEASIS ) {/* do not lexize */
446
+ tmplex [0 ]= * inptr ;
447
+ tmplex [0 ].flags = 0 ;
448
+ tmplex [1 ].lexeme = NULL ;
449
+ lexized = tmplex ;
450
+ }else {
451
+ lexized = (TSLexeme * )DatumGetPointer (
452
+ FunctionCall4 (
453
+ & (d -> subdict .lexize_info ),
454
+ PointerGetDatum (d -> subdict .dictionary ),
455
+ PointerGetDatum (inptr -> lexeme ),
456
+ Int32GetDatum (strlen (inptr -> lexeme )),
457
+ PointerGetDatum (NULL )
458
+ )
459
+ );
460
+ }
425
461
426
- reml = lexized ;
427
462
if (lexized && lexized -> lexeme ) {
428
463
int toset = (lexized -> lexeme && outptr != d -> subst [i ].res ) ? (outptr - d -> subst [i ].res ) :-1 ;
429
464
@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447
482
448
483
if (toset > 0 )
449
484
d -> subst [i ].res [toset ].flags |=TSL_ADDPOS ;
485
+ }else if (lexized ) {
486
+ elog (NOTICE ,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)" ,inptr -> lexeme ,i + 1 );
450
487
}else {
451
- elog (NOTICE ,"Thesaurus: word '%s' isn't recognizedby subdictionary or it's a stop-word, ignored " ,inptr -> lexeme );
488
+ elog (ERROR ,"Thesaurus: word '%s'in substition isn't recognized(rule %d) " ,inptr -> lexeme , i + 1 );
452
489
}
453
490
454
491
if (inptr -> lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457
494
}
458
495
459
496
if (outptr == d -> subst [i ].res )
460
- elog (ERROR ,"Thesaurus: all words in subsitutionaren't recognized by subdictionary" );
497
+ elog (ERROR ,"Thesaurus: all words in subsitutionare stop word (rule %d)" , i + 1 );
461
498
462
499
d -> subst [i ].reslen = outptr - d -> subst [i ].res ;
463
500
@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717
754
718
755
infos = (LexemeInfo * * )palloc (sizeof (LexemeInfo * )* nlex );
719
756
for (i = 0 ;i < nlex ;i ++ )
720
- if ( (infos [i ]= findTheLexeme (d ,basevar [i ].lexeme ))== NULL )
757
+ if ( (infos [i ]= findTheLexeme (d ,basevar [i ].lexeme ))== NULL )
721
758
break ;
722
759
723
760
if (i < nlex ) {