NotificationsYou must be signed in to change notification settings
Fork5
Star27

Commit92bcb5a

committed

Allow words in a substitution to be marked so that they are not lexized.

Docs will be submitted later; for now they are at http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary

1 parent63e464a commit92bcb5aCopy full SHA for 92bcb5a

File tree

2 files changed

+69

-30

lines changed

contrib/tsearch2
- dict_thesaurus.c
- thesaurus

2 files changed

+69

-30

lines changed

`‎contrib/tsearch2/dict_thesaurus.c‎`

Lines changed: 60 additions & 23 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.4 2006/06/02 18:03:06 teodor Exp $ */`
	`1`	`+/* $PostgreSQL: pgsql/contrib/tsearch2/dict_thesaurus.c,v 1.5 2006/06/06 16:25:55 teodor Exp $ */`
`2`	`2`
`3`	`3`	`/*`
`4`	`4`	`* thesaurus`
`@@ -13,6 +13,11 @@`
`13`	`13`	`#include"common.h"`
`14`	`14`	`#include"ts_locale.h"`
`15`	`15`
	`16`	`+/*`
	`17`	`+ * Temporarily we use TSLexeme.flags for internal use...`
	`18`	`+ */`
	`19`	`+#defineDT_USEASIS0x1000`
	`20`	`+`
`16`	`21`	`typedefstructLexemeInfo {`
`17`	`22`	`uint16idsubst;/* entry's number in DictThesaurus->subst */`
`18`	`23`	`uint16posinsubst;/* pos info in entry */`
`@@ -94,7 +99,7 @@ newLexeme( DictThesaurus d, char b, char *e, uint16 idsubst, uint16 posinsubst`
`94`	`99`	`}`
`95`	`100`
`96`	`101`	`staticvoid`
`97`		`-addWrd(DictThesaurusd,charb,char*e,uint16idsubst,uint16nwrd,uint16posinsubst ) {`
	`102`	`+addWrd(DictThesaurusd,charb,char*e,uint16idsubst,uint16nwrd,uint16posinsubst,booluseasis ) {`
`98`	`103`	`staticintnres=0;`
`99`	`104`	`staticintntres=0;`
`100`	`105`	`TheSubstitute*ptr;`
`@@ -138,7 +143,10 @@ addWrd( DictThesaurus d, char b, char *e, uint16 idsubst, uint16 nwrd, uint16`
`138`	`143`	`ptr->res[nres ].lexeme[e-b]='\0';`
`139`	`144`
`140`	`145`	`ptr->res[nres ].nvariant=nwrd;`
`141`		`-ptr->res[nres ].flags=TSL_ADDPOS;`
	`146`	`+if (useasis )`
	`147`	`+ptr->res[nres ].flags=DT_USEASIS;`
	`148`	`+else`
	`149`	`+ptr->res[nres ].flags=0;`
`142`	`150`
`143`	`151`	`ptr->res[++nres ].lexeme=NULL;`
`144`	`152`	`}`
`@@ -154,6 +162,7 @@ thesaurusRead( char filename, DictThesaurus d ) {`
`154`	`162`	`charstr[BUFSIZ];`
`155`	`163`	`intlineno=0;`
`156`	`164`	`uint16idsubst=0;`
	`165`	`+booluseasis=false;`
`157`	`166`
`158`	`167`	`fh=fopen(to_absfilename(filename),"r");`
`159`	`168`	`if (!fh)`
`@@ -196,13 +205,24 @@ thesaurusRead( char filename, DictThesaurus d ) {`
`196`	`205`	`state=TR_WAITLEX;`
`197`	`206`	`}`
`198`	`207`	`}elseif (state==TR_WAITSUBS ) {`
`199`		`-if ( !t_isspace(ptr) ) {`
	`208`	`+if (t_iseq(ptr,'*') ) {`
	`209`	`+useasis= true;`
	`210`	`+state=TR_INSUBS;`
	`211`	`+beginwrd=ptr+pg_mblen(ptr);`
	`212`	`+}elseif (t_iseq(ptr,'\\') ) {`
	`213`	`+useasis= false;`
	`214`	`+state=TR_INSUBS;`
	`215`	`+beginwrd=ptr+pg_mblen(ptr);`
	`216`	`+}elseif ( !t_isspace(ptr) ) {`
	`217`	`+useasis= false;`
`200`	`218`	`beginwrd=ptr;`
`201`	`219`	`state=TR_INSUBS;`
`202`	`220`	`}`
`203`	`221`	`}elseif (state==TR_INSUBS ) {`
`204`	`222`	`if (t_isspace(ptr) ) {`
`205`		`-addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst );`
	`223`	`+if (ptr==beginwrd )`
	`224`	`+elog(ERROR,"Thesaurus: Unexpected end of line or lexeme at %d line",lineno);`
	`225`	`+addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst,useasis );`
`206`	`226`	`state=TR_WAITSUBS;`
`207`	`227`	`}`
`208`	`228`	`}else`
`@@ -211,8 +231,11 @@ thesaurusRead( char filename, DictThesaurus d ) {`
`211`	`231`	`ptr+=pg_mblen(ptr);`
`212`	`232`	`}`
`213`	`233`
`214`		`-if (state==TR_INSUBS )`
`215`		`-addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst );`
	`234`	`+if (state==TR_INSUBS ) {`
	`235`	`+if (ptr==beginwrd )`
	`236`	`+elog(ERROR,"Thesaurus: Unexpected end of line or lexeme at %d line",lineno);`
	`237`	`+addWrd(d,beginwrd,ptr,idsubst,nwrd++,posinsubst,useasis );`
	`238`	`+}`
`216`	`239`
`217`	`240`	`idsubst++;`
`218`	`241`
`@@ -319,7 +342,9 @@ compileTheLexeme(DictThesaurus *d) {`
`319`	`342`	`elog(ERROR,"Out of memory");`
`320`	`343`
`321`	`344`	`for(i=0;i<d->nwrds;i++) {`
`322`		`-TSLexemeptr= (TSLexeme)DatumGetPointer(`
	`345`	`+TSLexeme*ptr;`
	`346`	`+`
	`347`	`+ptr= (TSLexeme*)DatumGetPointer(`
`323`	`348`	`FunctionCall4(`
`324`	`349`	`&(d->subdict.lexize_info),`
`325`	`350`	`PointerGetDatum(d->subdict.dictionary),`
`@@ -331,9 +356,11 @@ compileTheLexeme(DictThesaurus *d) {`
`331`	`356`
`332`	`357`	`if ( !(ptr&&ptr->lexeme) ) {`
`333`	`358`	`if ( !ptr )`
`334`		`-elog(ERROR,"Thesaurus: word '%s' isn't recognized by subdictionary",d->wrds[i].lexeme);`
	`359`	`+elog(ERROR,"Thesaurus: word-sample '%s' isn't recognized by subdictionary (rule %d)",`
	`360`	`+d->wrds[i].lexeme,d->wrds[i].entries->idsubst+1);`
`335`	`361`	`else`
`336`		`-elog(NOTICE,"Thesaurus: word '%s' is recognized as stop-word, assign any stop-word",d->wrds[i].lexeme);`
	`362`	`+elog(NOTICE,"Thesaurus: word-sample '%s' is recognized as stop-word, assign any stop-word (rule %d)",`
	`363`	`+d->wrds[i].lexeme,d->wrds[i].entries->idsubst+1);`
`337`	`364`
`338`	`365`	`newwrds=addCompiledLexeme(newwrds,&nnw,&tnm,NULL,d->wrds[i].entries,0);`
`339`	`366`	`}else {`
`@@ -413,17 +440,25 @@ compileTheSubstitute(DictThesaurus *d) {`
`413`	`440`	`inptr=rem;`
`414`	`441`
`415`	`442`	`while(inptr&&inptr->lexeme ) {`
`416`		`-TSLexemereml,lexized= (TSLexeme*)DatumGetPointer(`
`417`		`-FunctionCall4(`
`418`		`-&(d->subdict.lexize_info),`
`419`		`-PointerGetDatum(d->subdict.dictionary),`
`420`		`-PointerGetDatum(inptr->lexeme),`
`421`		`-Int32GetDatum(strlen(inptr->lexeme)),`
`422`		`-PointerGetDatum(NULL)`
`423`		`-)`
`424`		`-);`
	`443`	`+TSLexeme*lexized,tmplex[2];`
	`444`	`+`
	`445`	`+if (inptr->flags&DT_USEASIS ) {/* do not lexize */`
	`446`	`+tmplex[0]=*inptr;`
	`447`	`+tmplex[0].flags=0;`
	`448`	`+tmplex[1].lexeme=NULL;`
	`449`	`+lexized=tmplex;`
	`450`	`+}else {`
	`451`	`+lexized= (TSLexeme*)DatumGetPointer(`
	`452`	`+FunctionCall4(`
	`453`	`+&(d->subdict.lexize_info),`
	`454`	`+PointerGetDatum(d->subdict.dictionary),`
	`455`	`+PointerGetDatum(inptr->lexeme),`
	`456`	`+Int32GetDatum(strlen(inptr->lexeme)),`
	`457`	`+PointerGetDatum(NULL)`
	`458`	`+)`
	`459`	`+);`
	`460`	`+}`
`425`	`461`
`426`		`-reml=lexized;`
`427`	`462`	`if (lexized&&lexized->lexeme ) {`
`428`	`463`	`inttoset= (lexized->lexeme&&outptr!=d->subst[i].res ) ? (outptr-d->subst[i].res) :-1;`
`429`	`464`
`@@ -447,8 +482,10 @@ compileTheSubstitute(DictThesaurus *d) {`
`447`	`482`
`448`	`483`	`if (toset>0)`
`449`	`484`	`d->subst[i].res[toset].flags \|=TSL_ADDPOS;`
	`485`	`+}elseif (lexized ) {`
	`486`	`+elog(NOTICE,"Thesaurus: word '%s' in substition is a stop-word, ignored (rule %d)",inptr->lexeme,i+1);`
`450`	`487`	`}else {`
`451`		`-elog(NOTICE,"Thesaurus: word '%s' isn't recognizedby subdictionary or it's a stop-word, ignored",inptr->lexeme);`
	`488`	`+elog(ERROR,"Thesaurus: word '%s'in substitionisn't recognized(rule %d)",inptr->lexeme,i+1);`
`452`	`489`	`}`
`453`	`490`
`454`	`491`	`if (inptr->lexeme )`
`@@ -457,7 +494,7 @@ compileTheSubstitute(DictThesaurus *d) {`
`457`	`494`	`}`
`458`	`495`
`459`	`496`	`if (outptr==d->subst[i].res )`
`460`		`-elog(ERROR,"Thesaurus: all words in subsitutionaren't recognized by subdictionary");`
	`497`	`+elog(ERROR,"Thesaurus: all words in subsitutionare stop word (rule %d)",i+1);`
`461`	`498`
`462`	`499`	`d->subst[i].reslen=outptr-d->subst[i].res;`
`463`	`500`
`@@ -717,7 +754,7 @@ thesaurus_lexize(PG_FUNCTION_ARGS)`
`717`	`754`
`718`	`755`	`infos= (LexemeInfo*)palloc(sizeof(LexemeInfo)*nlex);`
`719`	`756`	`for(i=0;i<nlex;i++)`
`720`		`-if ( (infos[i]=findTheLexeme(d,basevar[i].lexeme))==NULL )`
	`757`	`+if ( (infos[i]=findTheLexeme(d,basevar[i].lexeme))==NULL )`
`721`	`758`	`break;`
`722`	`759`
`723`	`760`	`if (i<nlex ) {`

`‎contrib/tsearch2/thesaurus‎`

Lines changed: 9 additions & 7 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,14 +1,16 @@`
`1`	`1`	`#`
`2`	`2`	`# Thesaurus config file. Character ':' splits`
`3`		`-# string to part:`
`4`		`-# to be substituted string`
`5`		`-# substituting string`
	`3`	`+# string into parts, example:`
	`4`	`+# sample-words : substitute-words`
`6`	`5`	`#`
	`6`	`+# Any substitute-word can be marked with a preceding '*' character,`
	`7`	`+# which means this word is not lexized`
	`8`	`+# Docs: http://www.sai.msu.su/~megera/oddmuse/index.cgi/Thesaurus_dictionary`
`7`	`9`
`8`		`-#one two three : 123`
`9`		`-#one two : 12`
`10`		`-#one : 1`
`11`		`-#two : 2`
	`10`	`+#one two three :*123`
	`11`	`+#one two :*12`
	`12`	`+#one :*1`
	`13`	`+#two :*2`
`12`	`14`
`13`	`15`	`#foo bar : blah blah`
`14`	`16`	`#f bar : fbar`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit92bcb5a

File tree

2 files changed

2 files changed

`‎contrib/tsearch2/dict_thesaurus.c‎`

`‎contrib/tsearch2/thesaurus‎`

0 commit comments