2323PG_MODULE_MAGIC ;
2424
/*
 * An unaccent dictionary uses a trie to find a string to replace.  Each node
 * of the trie is an array of 256 TrieChar structs; the N-th element of the
 * array corresponds to next byte value N.  That element can contain both a
 * replacement string (to be used if the source string ends with this byte)
 * and a link to another trie node (to be followed if there are more bytes).
 *
 * Note that the trie search logic pays no attention to multibyte character
 * boundaries.  This is OK as long as both the data entered into the trie and
 * the data we're trying to look up are validly encoded; no partial-character
 * matches will occur.
 */
3037typedef struct TrieChar
3138{
@@ -36,34 +43,38 @@ typedef struct TrieChar
3643
3744/*
3845 * placeChar - put str into trie's structure, byte by byte.
46+ *
47+ * If node is NULL, we need to make a new node, which will be returned;
48+ * otherwise the return value is the same as node.
3949 */
4050static TrieChar *
41- placeChar (TrieChar * node ,unsignedchar * str ,int lenstr ,char * replaceTo ,int replacelen )
51+ placeChar (TrieChar * node ,const unsignedchar * str ,int lenstr ,
52+ const char * replaceTo ,int replacelen )
4253{
4354TrieChar * curnode ;
4455
4556if (!node )
46- {
47- node = palloc (sizeof (TrieChar )* 256 );
48- memset (node ,0 ,sizeof (TrieChar )* 256 );
49- }
57+ node = (TrieChar * )palloc0 (sizeof (TrieChar )* 256 );
58+
59+ Assert (lenstr > 0 );/* else str[0] doesn't exist */
5060
5161curnode = node + * str ;
5262
53- if (lenstr = =1 )
63+ if (lenstr < =1 )
5464{
5565if (curnode -> replaceTo )
56- elog (WARNING ,"duplicateTO argument, use first one" );
66+ elog (WARNING ,"duplicatesource strings, first one will be used " );
5767else
5868{
5969curnode -> replacelen = replacelen ;
60- curnode -> replaceTo = palloc (replacelen );
70+ curnode -> replaceTo = ( char * ) palloc (replacelen );
6171memcpy (curnode -> replaceTo ,replaceTo ,replacelen );
6272}
6373}
6474else
6575{
66- curnode -> nextChar = placeChar (curnode -> nextChar ,str + 1 ,lenstr - 1 ,replaceTo ,replacelen );
76+ curnode -> nextChar = placeChar (curnode -> nextChar ,str + 1 ,lenstr - 1 ,
77+ replaceTo ,replacelen );
6778}
6879
6980return node ;
@@ -213,23 +224,35 @@ initTrie(char *filename)
213224}
214225
215226/*
216- * findReplaceTo - find multibyte character in trie
227+ * findReplaceTo - find longest possible match in trie
228+ *
229+ * On success, returns pointer to ending subnode, plus length of matched
230+ * source string in *p_matchlen. On failure, returns NULL.
217231 */
218232static TrieChar *
219- findReplaceTo (TrieChar * node ,unsignedchar * src ,int srclen )
233+ findReplaceTo (TrieChar * node ,const unsignedchar * src ,int srclen ,
234+ int * p_matchlen )
220235{
221- while (node )
236+ TrieChar * result = NULL ;
237+ int matchlen = 0 ;
238+
239+ * p_matchlen = 0 ;/* prevent uninitialized-variable warnings */
240+
241+ while (node && matchlen < srclen )
222242{
223- node = node + * src ;
224- if (srclen == 1 )
225- return node ;
243+ node = node + src [matchlen ];
244+ matchlen ++ ;
245+
246+ if (node -> replaceTo )
247+ {
248+ result = node ;
249+ * p_matchlen = matchlen ;
250+ }
226251
227- src ++ ;
228- srclen -- ;
229252node = node -> nextChar ;
230253}
231254
232- return NULL ;
255+ return result ;
233256}
234257
235258PG_FUNCTION_INFO_V1 (unaccent_init );
@@ -280,18 +303,17 @@ unaccent_lexize(PG_FUNCTION_ARGS)
280303TrieChar * rootTrie = (TrieChar * )PG_GETARG_POINTER (0 );
281304char * srcchar = (char * )PG_GETARG_POINTER (1 );
282305int32 len = PG_GETARG_INT32 (2 );
283- char * srcstart ,
306+ char * srcstart = srcchar ,
284307* trgchar = NULL ;
285- int charlen ;
286308TSLexeme * res = NULL ;
287- TrieChar * node ;
288309
289- srcstart = srcchar ;
290- while (srcchar - srcstart < len )
310+ while (len > 0 )
291311{
292- charlen = pg_mblen (srcchar );
312+ TrieChar * node ;
313+ int matchlen ;
293314
294- node = findReplaceTo (rootTrie , (unsignedchar * )srcchar ,charlen );
315+ node = findReplaceTo (rootTrie , (unsignedchar * )srcchar ,len ,
316+ & matchlen );
295317if (node && node -> replaceTo )
296318{
297319if (!res )
@@ -309,13 +331,18 @@ unaccent_lexize(PG_FUNCTION_ARGS)
309331memcpy (trgchar ,node -> replaceTo ,node -> replacelen );
310332trgchar += node -> replacelen ;
311333}
312- else if ( res )
334+ else
313335{
314- memcpy (trgchar ,srcchar ,charlen );
315- trgchar += charlen ;
336+ matchlen = pg_mblen (srcchar );
337+ if (res )
338+ {
339+ memcpy (trgchar ,srcchar ,matchlen );
340+ trgchar += matchlen ;
341+ }
316342}
317343
318- srcchar += charlen ;
344+ srcchar += matchlen ;
345+ len -= matchlen ;
319346}
320347
321348if (res )