Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit57d9aef

Browse files
committed
Teach levenshtein() about multi-byte characters.
Based on a patch by, and further ideas from, Alexander Korotkov.
1 parentad17ff9 commit57d9aef

File tree

2 files changed

+122
-22
lines changed

2 files changed

+122
-22
lines changed

‎contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 118 additions & 19 deletions
Original file line numberDiff line numberDiff line change
@@ -5,7 +5,7 @@
55
*
66
* Joe Conway <mail@joeconway.com>
77
*
8-
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/2920:11:48 rhaas Exp $
8+
* $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $
99
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010
* ALL RIGHTS RESERVED;
1111
*
@@ -50,6 +50,7 @@
5050
#include<ctype.h>
5151

5252
#include"fmgr.h"
53+
#include"mb/pg_wchar.h"
5354
#include"utils/builtins.h"
5455

5556
PG_MODULE_MAGIC;
@@ -183,6 +184,18 @@ getcode(char c)
183184
/* These prevent GH from becoming F */
184185
#defineNOGHTOF(c)(getcode(c) & 16)/* BDH */
185186

187+
/* Faster than memcmp(), for this use case. */
188+
staticboolinline
189+
rest_of_char_same(constchar*s1,constchar*s2,intlen)
190+
{
191+
while (len>0)
192+
{
193+
len--;
194+
if (s1[len]!=s2[len])
195+
return false;
196+
}
197+
return true;
198+
}
186199

187200
/*
188201
* levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
195208
intins_c,intdel_c,intsub_c)
196209
{
197210
intm,
198-
n;
211+
n,
212+
s_bytes,
213+
t_bytes;
199214
int*prev;
200215
int*curr;
216+
int*s_char_len=NULL;
201217
inti,
202218
j;
203-
constchar*x;
219+
constchar*s_data;
220+
constchar*t_data;
204221
constchar*y;
205222

206-
m=VARSIZE_ANY_EXHDR(s);
207-
n=VARSIZE_ANY_EXHDR(t);
223+
/* Extract a pointer to the actual character data. */
224+
s_data=VARDATA_ANY(s);
225+
t_data=VARDATA_ANY(t);
226+
227+
/* Determine length of each string in bytes and characters. */
228+
s_bytes=VARSIZE_ANY_EXHDR(s);
229+
t_bytes=VARSIZE_ANY_EXHDR(t);
230+
m=pg_mbstrlen_with_len(s_data,s_bytes);
231+
n=pg_mbstrlen_with_len(t_data,t_bytes);
208232

209233
/*
210234
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
226250
errmsg("argument exceeds the maximum length of %d bytes",
227251
MAX_LEVENSHTEIN_STRLEN)));
228252

253+
/*
254+
* In order to avoid calling pg_mblen() repeatedly on each character in s,
255+
* we cache all the lengths before starting the main loop -- but if all the
256+
* characters in both strings are single byte, then we skip this and use
257+
* a fast-path in the main loop. If only one string contains multi-byte
258+
* characters, we still build the array, so that the fast-path needn't
259+
* deal with the case where the array hasn't been initialized.
260+
*/
261+
if (m!=s_bytes||n!=t_bytes)
262+
{
263+
inti;
264+
constchar*cp=s_data;
265+
266+
s_char_len= (int*)palloc((m+1)*sizeof(int));
267+
for (i=0;i<m;++i)
268+
{
269+
s_char_len[i]=pg_mblen(cp);
270+
cp+=s_char_len[i];
271+
}
272+
s_char_len[i]=0;
273+
}
274+
229275
/* One more cell for initialization column and row. */
230276
++m;
231277
++n;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
244290
prev[i]=i*del_c;
245291

246292
/* Loop through rows of the notional array */
247-
for (y=VARDATA_ANY(t),j=1;j<n;y++,j++)
293+
for (y=t_data,j=1;j<n;j++)
248294
{
249295
int*temp;
296+
constchar*x=s_data;
297+
inty_char_len=n!=t_bytes+1 ?pg_mblen(y) :1;
250298

251299
/*
252300
* First cell must increment sequentially, as we're on the j'th row of
253301
* the (m+1)x(n+1) array.
254302
*/
255303
curr[0]=j*ins_c;
256304

257-
for (x=VARDATA_ANY(s),i=1;i<m;x++,i++)
305+
/*
306+
* This inner loop is critical to performance, so we include a
307+
* fast-path to handle the (fairly common) case where no multibyte
308+
* characters are in the mix. The fast-path is entitled to assume
309+
* that if s_char_len is not initialized then BOTH strings contain
310+
* only single-byte characters.
311+
*/
312+
if (s_char_len!=NULL)
258313
{
259-
intins;
260-
intdel;
261-
intsub;
262-
263-
/* Calculate costs for probable operations. */
264-
ins=prev[i]+ins_c;/* Insertion*/
265-
del=curr[i-1]+del_c;/* Deletion*/
266-
sub=prev[i-1]+ ((*x==*y) ?0 :sub_c);/* Substitution */
267-
268-
/* Take the one with minimum cost. */
269-
curr[i]=Min(ins,del);
270-
curr[i]=Min(curr[i],sub);
314+
for (i=1;i<m;i++)
315+
{
316+
intins;
317+
intdel;
318+
intsub;
319+
intx_char_len=s_char_len[i-1];
320+
321+
/*
322+
* Calculate costs for insertion, deletion, and substitution.
323+
*
324+
* When calculating cost for substitution, we compare the last
325+
* character of each possibly-multibyte character first,
326+
* because that's enough to rule out most mis-matches. If we
327+
* get past that test, then we compare the lengths and the
328+
* remaining bytes.
329+
*/
330+
ins=prev[i]+ins_c;
331+
del=curr[i-1]+del_c;
332+
if (x[x_char_len-1]==y[y_char_len-1]
333+
&&x_char_len==y_char_len&&
334+
(x_char_len==1||rest_of_char_same(x,y,x_char_len)))
335+
sub=prev[i-1];
336+
else
337+
sub=prev[i-1]+sub_c;
338+
339+
/* Take the one with minimum cost. */
340+
curr[i]=Min(ins,del);
341+
curr[i]=Min(curr[i],sub);
342+
343+
/* Point to next character. */
344+
x+=x_char_len;
345+
}
346+
}
347+
else
348+
{
349+
for (i=1;i<m;i++)
350+
{
351+
intins;
352+
intdel;
353+
intsub;
354+
355+
/* Calculate costs for insertion, deletion, and substitution. */
356+
ins=prev[i]+ins_c;
357+
del=curr[i-1]+del_c;
358+
sub=prev[i-1]+ ((*x==*y) ?0 :sub_c);
359+
360+
/* Take the one with minimum cost. */
361+
curr[i]=Min(ins,del);
362+
curr[i]=Min(curr[i],sub);
363+
364+
/* Point to next character. */
365+
x++;
366+
}
271367
}
272368

273369
/* Swap current row with previous row. */
274370
temp=curr;
275371
curr=prev;
276372
prev=temp;
373+
374+
/* Point to next character. */
375+
y+=y_char_len;
277376
}
278377

279378
/*

‎doc/src/sgml/fuzzystrmatch.sgml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->
1+
<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->
22

33
<sect1 id="fuzzystrmatch">
44
<title>fuzzystrmatch</title>
@@ -14,8 +14,9 @@
1414

1515
<caution>
1616
<para>
17-
At present, <filename>fuzzystrmatch</> does not work well with
18-
multi-byte encodings (such as UTF-8).
17+
At present, the <function>soundex</>, <function>metaphone</>,
18+
<function>dmetaphone</>, and <function>dmetaphone_alt</> functions do
19+
not work well with multi-byte encodings (such as UTF-8).
1920
</para>
2021
</caution>
2122

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp