Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit1b24887

Browse files
committed
Allow multi-character source strings in contrib/unaccent.
This could be useful in languages where diacritic signs are represented as separate characters; more generally it supports using unaccent dictionaries for substring substitutions beyond narrowly conceived "diacritic removal". In any case, since the rule-file parser doesn't complain about multi-character source strings, it behooves us to do something unsurprising with them.
1 parent 97c40ce, commit 1b24887

File tree

2 files changed

+67
-32
lines changed

2 files changed

+67
-32
lines changed

‎contrib/unaccent/unaccent.c

Lines changed: 59 additions & 32 deletions
Original file line number | Diff line number | Diff line change
@@ -23,9 +23,16 @@
2323
PG_MODULE_MAGIC;
2424

2525
/*
26-
* Unaccent dictionary uses a trie to find a character to replace. Each node of
27-
* the trie is an array of 256 TrieChar structs (n-th element of array
28-
* corresponds to byte)
26+
* An unaccent dictionary uses a trie to find a string to replace. Each node
27+
* of the trie is an array of 256 TrieChar structs; the N-th element of the
28+
* array corresponds to next byte value N. That element can contain both a
29+
* replacement string (to be used if the source string ends with this byte)
30+
* and a link to another trie node (to be followed if there are more bytes).
31+
*
32+
* Note that the trie search logic pays no attention to multibyte character
33+
* boundaries. This is OK as long as both the data entered into the trie and
34+
* the data we're trying to look up are validly encoded; no partial-character
35+
* matches will occur.
2936
*/
3037
typedefstructTrieChar
3138
{
@@ -36,34 +43,38 @@ typedef struct TrieChar
3643

3744
/*
3845
* placeChar - put str into trie's structure, byte by byte.
46+
*
47+
* If node is NULL, we need to make a new node, which will be returned;
48+
* otherwise the return value is the same as node.
3949
*/
4050
staticTrieChar*
41-
placeChar(TrieChar*node,unsignedchar*str,intlenstr,char*replaceTo,intreplacelen)
51+
placeChar(TrieChar*node,constunsignedchar*str,intlenstr,
52+
constchar*replaceTo,intreplacelen)
4253
{
4354
TrieChar*curnode;
4455

4556
if (!node)
46-
{
47-
node=palloc(sizeof(TrieChar)*256);
48-
memset(node,0,sizeof(TrieChar)*256);
49-
}
57+
node= (TrieChar*)palloc0(sizeof(TrieChar)*256);
58+
59+
Assert(lenstr>0);/* else str[0] doesn't exist */
5060

5161
curnode=node+*str;
5262

53-
if (lenstr==1)
63+
if (lenstr<=1)
5464
{
5565
if (curnode->replaceTo)
56-
elog(WARNING,"duplicateTO argument, usefirst one");
66+
elog(WARNING,"duplicatesource strings,first one will be used");
5767
else
5868
{
5969
curnode->replacelen=replacelen;
60-
curnode->replaceTo=palloc(replacelen);
70+
curnode->replaceTo=(char*)palloc(replacelen);
6171
memcpy(curnode->replaceTo,replaceTo,replacelen);
6272
}
6373
}
6474
else
6575
{
66-
curnode->nextChar=placeChar(curnode->nextChar,str+1,lenstr-1,replaceTo,replacelen);
76+
curnode->nextChar=placeChar(curnode->nextChar,str+1,lenstr-1,
77+
replaceTo,replacelen);
6778
}
6879

6980
returnnode;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213224
}
214225

215226
/*
216-
* findReplaceTo - find multibyte character in trie
227+
* findReplaceTo - find longest possible match in trie
228+
*
229+
* On success, returns pointer to ending subnode, plus length of matched
230+
* source string in *p_matchlen. On failure, returns NULL.
217231
*/
218232
staticTrieChar*
219-
findReplaceTo(TrieChar*node,unsignedchar*src,intsrclen)
233+
findReplaceTo(TrieChar*node,constunsignedchar*src,intsrclen,
234+
int*p_matchlen)
220235
{
221-
while (node)
236+
TrieChar*result=NULL;
237+
intmatchlen=0;
238+
239+
*p_matchlen=0;/* prevent uninitialized-variable warnings */
240+
241+
while (node&&matchlen<srclen)
222242
{
223-
node=node+*src;
224-
if (srclen==1)
225-
returnnode;
243+
node=node+src[matchlen];
244+
matchlen++;
245+
246+
if (node->replaceTo)
247+
{
248+
result=node;
249+
*p_matchlen=matchlen;
250+
}
226251

227-
src++;
228-
srclen--;
229252
node=node->nextChar;
230253
}
231254

232-
returnNULL;
255+
returnresult;
233256
}
234257

235258
PG_FUNCTION_INFO_V1(unaccent_init);
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280303
TrieChar*rootTrie= (TrieChar*)PG_GETARG_POINTER(0);
281304
char*srcchar= (char*)PG_GETARG_POINTER(1);
282305
int32len=PG_GETARG_INT32(2);
283-
char*srcstart,
306+
char*srcstart=srcchar,
284307
*trgchar=NULL;
285-
intcharlen;
286308
TSLexeme*res=NULL;
287-
TrieChar*node;
288309

289-
srcstart=srcchar;
290-
while (srcchar-srcstart<len)
310+
while (len>0)
291311
{
292-
charlen=pg_mblen(srcchar);
312+
TrieChar*node;
313+
intmatchlen;
293314

294-
node=findReplaceTo(rootTrie, (unsignedchar*)srcchar,charlen);
315+
node=findReplaceTo(rootTrie, (unsignedchar*)srcchar,len,
316+
&matchlen);
295317
if (node&&node->replaceTo)
296318
{
297319
if (!res)
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309331
memcpy(trgchar,node->replaceTo,node->replacelen);
310332
trgchar+=node->replacelen;
311333
}
312-
elseif (res)
334+
else
313335
{
314-
memcpy(trgchar,srcchar,charlen);
315-
trgchar+=charlen;
336+
matchlen=pg_mblen(srcchar);
337+
if (res)
338+
{
339+
memcpy(trgchar,srcchar,matchlen);
340+
trgchar+=matchlen;
341+
}
316342
}
317343

318-
srcchar+=charlen;
344+
srcchar+=matchlen;
345+
len-=matchlen;
319346
}
320347

321348
if (res)

‎doc/src/sgml/unaccent.sgml

Lines changed: 8 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -70,6 +70,14 @@
7070
</para>
7171
</listitem>
7272

73+
<listitem>
74+
<para>
75+
Actually, each <quote>character</> can be any string not containing
76+
whitespace, so <filename>unaccent</> dictionaries could be used for
77+
other sorts of substring substitutions besides diacritic removal.
78+
</para>
79+
</listitem>
80+
7381
<listitem>
7482
<para>
7583
As with other <productname>PostgreSQL</> text search configuration files,

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp