NotificationsYou must be signed in to change notification settings
Fork6
Star31

Commit1b24887

committed

Allow multi-character source strings in contrib/unaccent.

This could be useful in languages where diacritic signs are represented as separate characters; more generally it supports using unaccent dictionaries for substring substitutions beyond narrowly conceived "diacritic removal". In any case, since the rule-file parser doesn't complain about multi-character source strings, it behooves us to do something unsurprising with them.

1 parent 97c40ce, commit 1b24887 (Copy full SHA for 1b24887)

File tree

2 files changed

+67

-32

lines changed

contrib/unaccent
- unaccent.c
doc/src/sgml
- unaccent.sgml

2 files changed

+67

-32

lines changed

`‎contrib/unaccent/unaccent.c‎`

Lines changed: 59 additions & 32 deletions

Original file line number	Diff line number	Diff line change
`@@ -23,9 +23,16 @@`
`23`	`23`	`PG_MODULE_MAGIC;`
`24`	`24`
`25`	`25`	`/*`
`26`		`- * Unaccent dictionary uses a trie to find a character to replace. Each node of`
`27`		`- * the trie is an array of 256 TrieChar structs (n-th element of array`
`28`		`- * corresponds to byte)`
	`26`	`+ * An unaccent dictionary uses a trie to find a string to replace. Each node`
	`27`	`+ * of the trie is an array of 256 TrieChar structs; the N-th element of the`
	`28`	`+ * array corresponds to next byte value N. That element can contain both a`
	`29`	`+ * replacement string (to be used if the source string ends with this byte)`
	`30`	`+ * and a link to another trie node (to be followed if there are more bytes).`
	`31`	`+ *`
	`32`	`+ * Note that the trie search logic pays no attention to multibyte character`
	`33`	`+ * boundaries. This is OK as long as both the data entered into the trie and`
	`34`	`+ * the data we're trying to look up are validly encoded; no partial-character`
	`35`	`+ * matches will occur.`
`29`	`36`	`*/`
`30`	`37`	`typedefstructTrieChar`
`31`	`38`	`{`
`@@ -36,34 +43,38 @@ typedef struct TrieChar`
`36`	`43`
`37`	`44`	`/*`
`38`	`45`	`* placeChar - put str into trie's structure, byte by byte.`
	`46`	`+ *`
	`47`	`+ * If node is NULL, we need to make a new node, which will be returned;`
	`48`	`+ * otherwise the return value is the same as node.`
`39`	`49`	`*/`
`40`	`50`	`staticTrieChar*`
`41`		`-placeChar(TrieChar *node, unsigned char *str, int lenstr, char *replaceTo, int replacelen)`
	`51`	`+placeChar(TrieChar *node, const unsigned char *str, int lenstr,`
	`52`	`+const char *replaceTo, int replacelen)`
`42`	`53`	`{`
`43`	`54`	`TrieChar*curnode;`
`44`	`55`
`45`	`56`	`if (!node)`
`46`		`-{`
`47`		`-node=palloc(sizeof(TrieChar)*256);`
`48`		`-memset(node,0,sizeof(TrieChar)*256);`
`49`		`-}`
	`57`	`+node = (TrieChar *) palloc0(sizeof(TrieChar) * 256);`
	`58`	`+`
	`59`	`+Assert(lenstr>0);/* else str[0] doesn't exist */`
`50`	`60`
`51`	`61`	`curnode=node+*str;`
`52`	`62`
`53`		`-if (lenstr==1)`
	`63`	`+if (lenstr<=1)`
`54`	`64`	`{`
`55`	`65`	`if (curnode->replaceTo)`
`56`		`-elog(WARNING,"duplicateTO argument, usefirst one");`
	`66`	`+elog(WARNING,"duplicatesource strings,first one will be used");`
`57`	`67`	`else`
`58`	`68`	`{`
`59`	`69`	`curnode->replacelen=replacelen;`
`60`		`-curnode->replaceTo=palloc(replacelen);`
	`70`	`+curnode->replaceTo=(char*)palloc(replacelen);`
`61`	`71`	`memcpy(curnode->replaceTo,replaceTo,replacelen);`
`62`	`72`	`}`
`63`	`73`	`}`
`64`	`74`	`else`
`65`	`75`	`{`
`66`		`-curnode->nextChar=placeChar(curnode->nextChar,str+1,lenstr-1,replaceTo,replacelen);`
	`76`	`+curnode->nextChar=placeChar(curnode->nextChar,str+1,lenstr-1,`
	`77`	`+replaceTo,replacelen);`
`67`	`78`	`}`
`68`	`79`
`69`	`80`	`returnnode;`
`@@ -213,23 +224,35 @@ initTrie(char *filename)`
`213`	`224`	`}`
`214`	`225`
`215`	`226`	`/*`
`216`		`- * findReplaceTo - find multibyte character in trie`
	`227`	`+ * findReplaceTo - find longest possible match in trie`
	`228`	`+ *`
	`229`	`+ * On success, returns pointer to ending subnode, plus length of matched`
	`230`	`+ * source string in *p_matchlen. On failure, returns NULL.`
`217`	`231`	`*/`
`218`	`232`	`staticTrieChar*`
`219`		`-findReplaceTo(TrieChar *node, unsigned char *src, int srclen)`
	`233`	`+findReplaceTo(TrieChar *node, const unsigned char *src, int srclen,`
	`234`	`+int *p_matchlen)`
`220`	`235`	`{`
`221`		`-while (node)`
	`236`	`+TrieChar*result=NULL;`
	`237`	`+intmatchlen=0;`
	`238`	`+`
	`239`	`+*p_matchlen = 0; /* prevent uninitialized-variable warnings */`
	`240`	`+`
	`241`	`+while (node&&matchlen<srclen)`
`222`	`242`	`{`
`223`		`-node=node+*src;`
`224`		`-if (srclen==1)`
`225`		`-returnnode;`
	`243`	`+node=node+src[matchlen];`
	`244`	`+matchlen++;`
	`245`	`+`
	`246`	`+if (node->replaceTo)`
	`247`	`+{`
	`248`	`+result=node;`
	`249`	`+*p_matchlen=matchlen;`
	`250`	`+}`
`226`	`251`
`227`		`-src++;`
`228`		`-srclen--;`
`229`	`252`	`node=node->nextChar;`
`230`	`253`	`}`
`231`	`254`
`232`		`-returnNULL;`
	`255`	`+returnresult;`
`233`	`256`	`}`
`234`	`257`
`235`	`258`	`PG_FUNCTION_INFO_V1(unaccent_init);`
`@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)`
`280`	`303`	`TrieChar *rootTrie = (TrieChar *) PG_GETARG_POINTER(0);`
`281`	`304`	`char *srcchar = (char *) PG_GETARG_POINTER(1);`
`282`	`305`	`int32len=PG_GETARG_INT32(2);`
`283`		`-char*srcstart,`
	`306`	`+char*srcstart=srcchar,`
`284`	`307`	`*trgchar=NULL;`
`285`		`-intcharlen;`
`286`	`308`	`TSLexeme*res=NULL;`
`287`		`-TrieChar*node;`
`288`	`309`
`289`		`-srcstart=srcchar;`
`290`		`-while (srcchar-srcstart<len)`
	`310`	`+while (len>0)`
`291`	`311`	`{`
`292`		`-charlen=pg_mblen(srcchar);`
	`312`	`+TrieChar*node;`
	`313`	`+intmatchlen;`
`293`	`314`
`294`		`-node=findReplaceTo(rootTrie, (unsignedchar*)srcchar,charlen);`
	`315`	`+node=findReplaceTo(rootTrie, (unsignedchar*)srcchar,len,`
	`316`	`+&matchlen);`
`295`	`317`	`if (node&&node->replaceTo)`
`296`	`318`	`{`
`297`	`319`	`if (!res)`
`@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)`
`309`	`331`	`memcpy(trgchar,node->replaceTo,node->replacelen);`
`310`	`332`	`trgchar+=node->replacelen;`
`311`	`333`	`}`
`312`		`-elseif (res)`
	`334`	`+else`
`313`	`335`	`{`
`314`		`-memcpy(trgchar,srcchar,charlen);`
`315`		`-trgchar+=charlen;`
	`336`	`+matchlen=pg_mblen(srcchar);`
	`337`	`+if (res)`
	`338`	`+{`
	`339`	`+memcpy(trgchar,srcchar,matchlen);`
	`340`	`+trgchar+=matchlen;`
	`341`	`+}`
`316`	`342`	`}`
`317`	`343`
`318`		`-srcchar+=charlen;`
	`344`	`+srcchar+=matchlen;`
	`345`	`+len-=matchlen;`
`319`	`346`	`}`
`320`	`347`
`321`	`348`	`if (res)`

`‎doc/src/sgml/unaccent.sgml‎`

Lines changed: 8 additions & 0 deletions

Original file line number	Diff line number	Diff line change
`@@ -70,6 +70,14 @@`
`70`	`70`	`</para>`
`71`	`71`	`</listitem>`
`72`	`72`
	`73`	`+ <listitem>`
	`74`	`+ <para>`
	`75`	`+ Actually, each <quote>character</> can be any string not containing`
	`76`	`+ whitespace, so <filename>unaccent</> dictionaries could be used for`
	`77`	`+ other sorts of substring substitutions besides diacritic removal.`
	`78`	`+ </para>`
	`79`	`+ </listitem>`
	`80`	`+`
`73`	`81`	`<listitem>`
`74`	`82`	`<para>`
`75`	`83`	`As with other <productname>PostgreSQL</> text search configuration files,`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit1b24887

File tree

2 files changed

2 files changed

`‎contrib/unaccent/unaccent.c‎`

`‎doc/src/sgml/unaccent.sgml‎`

0 commit comments