1- /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */
1+ /* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */
22
33/*
44 * thesaurus
1313#include "common.h"
1414#include "ts_locale.h"
1515
16+ /*
17+ * Temporarily we use TSLexeme.flags for internal use...
18+ */
19+ #define DT_USEASIS 0x1000
20+
1621typedef struct LexemeInfo {
1722uint16 idsubst ;/* entry's number in DictThesaurus->subst */
1823uint16 posinsubst ;/* pos info in entry */
@@ -94,7 +99,7 @@ newLexeme( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 posinsubst
9499}
95100
96101static void
97- addWrd (DictThesaurus * d ,char * b ,char * e ,uint16 idsubst ,uint16 nwrd ,uint16 posinsubst ) {
102+ addWrd (DictThesaurus * d ,char * b ,char * e ,uint16 idsubst ,uint16 nwrd ,uint16 posinsubst , bool useasis ) {
98103static int nres = 0 ;
99104static int ntres = 0 ;
100105TheSubstitute * ptr ;
@@ -138,7 +143,10 @@ addWrd( DictThesaurus *d, char *b, char *e, uint16 idsubst, uint16 nwrd, uint16
138143ptr -> res [nres ].lexeme [e - b ]= '\0' ;
139144
140145ptr -> res [nres ].nvariant = nwrd ;
141- ptr -> res [nres ].flags = TSL_ADDPOS ;
146+ if (useasis )
147+ ptr -> res [nres ].flags = DT_USEASIS ;
148+ else
149+ ptr -> res [nres ].flags = 0 ;
142150
143151ptr -> res [++ nres ].lexeme = NULL ;
144152}
@@ -154,6 +162,7 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
154162char str [BUFSIZ ];
155163int lineno = 0 ;
156164uint16 idsubst = 0 ;
165+ bool useasis = false;
157166
158167fh = fopen (to_absfilename (filename ),"r" );
159168if (!fh )
@@ -196,13 +205,24 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
196205state = TR_WAITLEX ;
197206}
198207}else if (state == TR_WAITSUBS ) {
199- if ( !t_isspace (ptr ) ) {
208+ if (t_iseq (ptr ,'*' ) ) {
209+ useasis = true;
210+ state = TR_INSUBS ;
211+ beginwrd = ptr + pg_mblen (ptr );
212+ }else if (t_iseq (ptr ,'\\' ) ) {
213+ useasis = false;
214+ state = TR_INSUBS ;
215+ beginwrd = ptr + pg_mblen (ptr );
216+ }else if ( !t_isspace (ptr ) ) {
217+ useasis = false;
200218beginwrd = ptr ;
201219state = TR_INSUBS ;
202220}
203221}else if (state == TR_INSUBS ) {
204222if (t_isspace (ptr ) ) {
205- addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst );
223+ if (ptr == beginwrd )
224+ elog (ERROR ,"Thesaurus: Unexpected end of line or lexeme at %d line" ,lineno );
225+ addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst ,useasis );
206226state = TR_WAITSUBS ;
207227}
208228}else
@@ -211,8 +231,11 @@ thesaurusRead( char *filename, DictThesaurus *d ) {
211231ptr += pg_mblen (ptr );
212232}
213233
214- if (state == TR_INSUBS )
215- addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst );
234+ if (state == TR_INSUBS ) {
235+ if (ptr == beginwrd )
236+ elog (ERROR ,"Thesaurus: Unexpected end of line or lexeme at %d line" ,lineno );
237+ addWrd (d ,beginwrd ,ptr ,idsubst ,nwrd ++ ,posinsubst ,useasis );
238+ }
216239
217240idsubst ++ ;
218241
@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {
319342elog (ERROR ,"Out of memory" );
320343
321344for (i = 0 ;i < d -> nwrds ;i ++ ) {
322- TSLexeme * ptr = (TSLexeme * )DatumGetPointer (
345+ TSLexeme * ptr ;
346+
347+ ptr = (TSLexeme * )DatumGetPointer (
323348FunctionCall4 (
324349& (d -> subdict .lexize_info ),
325350PointerGetDatum (d -> subdict .dictionary ),
@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {
331356
332357if ( !(ptr && ptr -> lexeme ) ) {
333358if ( !ptr )
334- elog (ERROR ,"Thesaurus: word '%s' isn't recognized by subdictionary" ,d -> wrds [i ].lexeme );
359+ elog (ERROR ,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)" ,
360+ d -> wrds [i ].lexeme ,d -> wrds [i ].entries -> idsubst + 1 );
335361else
336- elog (NOTICE ,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word" ,d -> wrds [i ].lexeme );
362+ elog (NOTICE ,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)" ,
363+ d -> wrds [i ].lexeme ,d -> wrds [i ].entries -> idsubst + 1 );
337364
338365newwrds = addCompiledLexeme (newwrds ,& nnw ,& tnm ,NULL ,d -> wrds [i ].entries ,0 );
339366}else {
@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {
413440inptr = rem ;
414441
415442while (inptr && inptr -> lexeme ) {
416- TSLexeme * reml ,* lexized = (TSLexeme * )DatumGetPointer (
417- FunctionCall4 (
418- & (d -> subdict .lexize_info ),
419- PointerGetDatum (d -> subdict .dictionary ),
420- PointerGetDatum (inptr -> lexeme ),
421- Int32GetDatum (strlen (inptr -> lexeme )),
422- PointerGetDatum (NULL )
423- )
424- );
443+ TSLexeme * lexized ,tmplex [2 ];
444+
445+ if (inptr -> flags & DT_USEASIS ) {/* do not lexize */
446+ tmplex [0 ]= * inptr ;
447+ tmplex [0 ].flags = 0 ;
448+ tmplex [1 ].lexeme = NULL ;
449+ lexized = tmplex ;
450+ }else {
451+ lexized = (TSLexeme * )DatumGetPointer (
452+ FunctionCall4 (
453+ & (d -> subdict .lexize_info ),
454+ PointerGetDatum (d -> subdict .dictionary ),
455+ PointerGetDatum (inptr -> lexeme ),
456+ Int32GetDatum (strlen (inptr -> lexeme )),
457+ PointerGetDatum (NULL )
458+ )
459+ );
460+ }
425461
426- reml = lexized ;
427462if (lexized && lexized -> lexeme ) {
428463int toset = (lexized -> lexeme && outptr != d -> subst [i ].res ) ? (outptr - d -> subst [i ].res ) :-1 ;
429464
@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {
447482
448483if (toset > 0 )
449484d -> subst [i ].res [toset ].flags |=TSL_ADDPOS ;
485+ }else if (lexized ) {
486+ elog (NOTICE ,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)" ,inptr -> lexeme ,i + 1 );
450487}else {
451- elog (NOTICE ,"Thesaurus: word '%s' isn't recognizedby subdictionary or it's a stop-word, ignored " ,inptr -> lexeme );
488+ elog (ERROR ,"Thesaurus: word '%s'in substition isn't recognized(rule %d) " ,inptr -> lexeme , i + 1 );
452489}
453490
454491if (inptr -> lexeme )
@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {
457494}
458495
459496if (outptr == d -> subst [i ].res )
460- elog (ERROR ,"Thesaurus: all words in subsitutionaren't recognized by subdictionary" );
497+ elog (ERROR ,"Thesaurus: all words in subsitutionare stop word (rule %d)" , i + 1 );
461498
462499d -> subst [i ].reslen = outptr - d -> subst [i ].res ;
463500
@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)
717754
718755infos = (LexemeInfo * * )palloc (sizeof (LexemeInfo * )* nlex );
719756for (i = 0 ;i < nlex ;i ++ )
720- if ( (infos [i ]= findTheLexeme (d ,basevar [i ].lexeme ))== NULL )
757+ if ( (infos [i ]= findTheLexeme (d ,basevar [i ].lexeme ))== NULL )
721758break ;
722759
723760if (i < nlex ) {