55 *
66 * Joe Conway <mail@joeconway.com>
77 *
8- * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/29 20:11:48 rhaas Exp $
8+ * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23: 20:23 rhaas Exp $
99 * Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010 * ALL RIGHTS RESERVED;
1111 *
5050#include <ctype.h>
5151
5252#include "fmgr.h"
53+ #include "mb/pg_wchar.h"
5354#include "utils/builtins.h"
5455
5556PG_MODULE_MAGIC ;
@@ -183,6 +184,18 @@ getcode(char c)
183184/* These prevent GH from becoming F */
184185#define NOGHTOF (c )(getcode(c) & 16)/* BDH */
185186
187+ /* Faster than memcmp(), for this use case. */
188+ static bool inline
189+ rest_of_char_same (const char * s1 ,const char * s2 ,int len )
190+ {
191+ while (len > 0 )
192+ {
193+ len -- ;
194+ if (s1 [len ]!= s2 [len ])
195+ return false;
196+ }
197+ return true;
198+ }
186199
187200/*
188201 * levenshtein_internal - Calculates Levenshtein distance metric
@@ -195,16 +208,27 @@ levenshtein_internal(text *s, text *t,
195208int ins_c ,int del_c ,int sub_c )
196209{
197210int m ,
198- n ;
211+ n ,
212+ s_bytes ,
213+ t_bytes ;
199214int * prev ;
200215int * curr ;
216+ int * s_char_len = NULL ;
201217int i ,
202218j ;
203- const char * x ;
219+ const char * s_data ;
220+ const char * t_data ;
204221const char * y ;
205222
206- m = VARSIZE_ANY_EXHDR (s );
207- n = VARSIZE_ANY_EXHDR (t );
223+ /* Extract a pointer to the actual character data. */
224+ s_data = VARDATA_ANY (s );
225+ t_data = VARDATA_ANY (t );
226+
227+ /* Determine length of each string in bytes and characters. */
228+ s_bytes = VARSIZE_ANY_EXHDR (s );
229+ t_bytes = VARSIZE_ANY_EXHDR (t );
230+ m = pg_mbstrlen_with_len (s_data ,s_bytes );
231+ n = pg_mbstrlen_with_len (t_data ,t_bytes );
208232
209233/*
210234 * We can transform an empty s into t with n insertions, or a non-empty t
@@ -226,6 +250,28 @@ levenshtein_internal(text *s, text *t,
226250errmsg ("argument exceeds the maximum length of %d bytes" ,
227251MAX_LEVENSHTEIN_STRLEN )));
228252
253+ /*
254+ * In order to avoid calling pg_mblen() repeatedly on each character in s,
255+ * we cache all the lengths before starting the main loop -- but if all the
256+ * characters in both strings are single byte, then we skip this and use
257+ * a fast-path in the main loop. If only one string contains multi-byte
258+ * characters, we still build the array, so that the fast-path needn't
259+ * deal with the case where the array hasn't been initialized.
260+ */
261+ if (m != s_bytes || n != t_bytes )
262+ {
263+ int i ;
264+ const char * cp = s_data ;
265+
266+ s_char_len = (int * )palloc ((m + 1 )* sizeof (int ));
267+ for (i = 0 ;i < m ;++ i )
268+ {
269+ s_char_len [i ]= pg_mblen (cp );
270+ cp += s_char_len [i ];
271+ }
272+ s_char_len [i ]= 0 ;
273+ }
274+
229275/* One more cell for initialization column and row. */
230276++ m ;
231277++ n ;
@@ -244,36 +290,89 @@ levenshtein_internal(text *s, text *t,
244290prev [i ]= i * del_c ;
245291
246292/* Loop through rows of the notional array */
247- for (y = VARDATA_ANY ( t ) ,j = 1 ;j < n ;y ++ , j ++ )
293+ for (y = t_data ,j = 1 ;j < n ;j ++ )
248294{
249295int * temp ;
296+ const char * x = s_data ;
297+ int y_char_len = n != t_bytes + 1 ?pg_mblen (y ) :1 ;
250298
251299/*
252300 * First cell must increment sequentially, as we're on the j'th row of
253301 * the (m+1)x(n+1) array.
254302 */
255303curr [0 ]= j * ins_c ;
256304
257- for (x = VARDATA_ANY (s ),i = 1 ;i < m ;x ++ ,i ++ )
305+ /*
306+ * This inner loop is critical to performance, so we include a
307+ * fast-path to handle the (fairly common) case where no multibyte
308+ * characters are in the mix. The fast-path is entitled to assume
309+ * that if s_char_len is not initialized then BOTH strings contain
310+ * only single-byte characters.
311+ */
312+ if (s_char_len != NULL )
258313{
259- int ins ;
260- int del ;
261- int sub ;
262-
263- /* Calculate costs for probable operations. */
264- ins = prev [i ]+ ins_c ;/* Insertion*/
265- del = curr [i - 1 ]+ del_c ;/* Deletion*/
266- sub = prev [i - 1 ]+ ((* x == * y ) ?0 :sub_c );/* Substitution */
267-
268- /* Take the one with minimum cost. */
269- curr [i ]= Min (ins ,del );
270- curr [i ]= Min (curr [i ],sub );
314+ for (i = 1 ;i < m ;i ++ )
315+ {
316+ int ins ;
317+ int del ;
318+ int sub ;
319+ int x_char_len = s_char_len [i - 1 ];
320+
321+ /*
322+ * Calculate costs for insertion, deletion, and substitution.
323+ *
324+ * When calculating cost for substitution, we compare the last
325+ * character of each possibly-multibyte character first,
326+ * because that's enough to rule out most mis-matches. If we
327+ * get past that test, then we compare the lengths and the
328+ * remaining bytes.
329+ */
330+ ins = prev [i ]+ ins_c ;
331+ del = curr [i - 1 ]+ del_c ;
332+ if (x [x_char_len - 1 ]== y [y_char_len - 1 ]
333+ && x_char_len == y_char_len &&
334+ (x_char_len == 1 || rest_of_char_same (x ,y ,x_char_len )))
335+ sub = prev [i - 1 ];
336+ else
337+ sub = prev [i - 1 ]+ sub_c ;
338+
339+ /* Take the one with minimum cost. */
340+ curr [i ]= Min (ins ,del );
341+ curr [i ]= Min (curr [i ],sub );
342+
343+ /* Point to next character. */
344+ x += x_char_len ;
345+ }
346+ }
347+ else
348+ {
349+ for (i = 1 ;i < m ;i ++ )
350+ {
351+ int ins ;
352+ int del ;
353+ int sub ;
354+
355+ /* Calculate costs for insertion, deletion, and substitution. */
356+ ins = prev [i ]+ ins_c ;
357+ del = curr [i - 1 ]+ del_c ;
358+ sub = prev [i - 1 ]+ ((* x == * y ) ?0 :sub_c );
359+
360+ /* Take the one with minimum cost. */
361+ curr [i ]= Min (ins ,del );
362+ curr [i ]= Min (curr [i ],sub );
363+
364+ /* Point to next character. */
365+ x ++ ;
366+ }
271367}
272368
273369/* Swap current row with previous row. */
274370temp = curr ;
275371curr = prev ;
276372prev = temp ;
373+
374+ /* Point to next character. */
375+ y += y_char_len ;
277376}
278377
279378/*