NotificationsYou must be signed in to change notification settings
Fork5
Star26

Commit57d9aef

committed

Teach levenshtein() about multi-byte characters.

Based on a patch by, and further ideas from, Alexander Korotkov.

1 parentad17ff9 commit57d9aefCopy full SHA for 57d9aef

File tree

2 files changed

+122

-22

lines changed

contrib/fuzzystrmatch
- fuzzystrmatch.c
doc/src/sgml
- fuzzystrmatch.sgml

2 files changed

+122

-22

lines changed

`‎contrib/fuzzystrmatch/fuzzystrmatch.c`

Lines changed: 118 additions & 19 deletions

Original file line number	Diff line number	Diff line change
`@@ -5,7 +5,7 @@`
`5`	`5`	`*`
`6`	`6`	`* Joe Conway <mail@joeconway.com>`
`7`	`7`	`*`
`8`		`- * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.33 2010/07/2920:11:48 rhaas Exp $`
	`8`	`+ * $PostgreSQL: pgsql/contrib/fuzzystrmatch/fuzzystrmatch.c,v 1.34 2010/08/02 23:20:23 rhaas Exp $`
`9`	`9`	`* Copyright (c) 2001-2010, PostgreSQL Global Development Group`
`10`	`10`	`* ALL RIGHTS RESERVED;`
`11`	`11`	`*`
`@@ -50,6 +50,7 @@`
`50`	`50`	`#include<ctype.h>`
`51`	`51`
`52`	`52`	`#include"fmgr.h"`
	`53`	`+#include"mb/pg_wchar.h"`
`53`	`54`	`#include"utils/builtins.h"`
`54`	`55`
`55`	`56`	`PG_MODULE_MAGIC;`
`@@ -183,6 +184,18 @@ getcode(char c)`
`183`	`184`	`/* These prevent GH from becoming F */`
`184`	`185`	`#defineNOGHTOF(c)(getcode(c) & 16)/* BDH */`
`185`	`186`
	`187`	`+/* Faster than memcmp(), for this use case. */`
	`188`	`+staticboolinline`
	`189`	`+rest_of_char_same(constchars1,constchars2,intlen)`
	`190`	`+{`
	`191`	`+while (len>0)`
	`192`	`+{`
	`193`	`+len--;`
	`194`	`+if (s1[len]!=s2[len])`
	`195`	`+return false;`
	`196`	`+}`
	`197`	`+return true;`
	`198`	`+}`
`186`	`199`
`187`	`200`	`/*`
`188`	`201`	`* levenshtein_internal - Calculates Levenshtein distance metric`
`@@ -195,16 +208,27 @@ levenshtein_internal(text s, text t,`
`195`	`208`	`intins_c,intdel_c,intsub_c)`
`196`	`209`	`{`
`197`	`210`	`intm,`
`198`		`-n;`
	`211`	`+n,`
	`212`	`+s_bytes,`
	`213`	`+t_bytes;`
`199`	`214`	`int*prev;`
`200`	`215`	`int*curr;`
	`216`	`+int*s_char_len=NULL;`
`201`	`217`	`inti,`
`202`	`218`	`j;`
`203`		`-constchar*x;`
	`219`	`+constchar*s_data;`
	`220`	`+constchar*t_data;`
`204`	`221`	`constchar*y;`
`205`	`222`
`206`		`-m=VARSIZE_ANY_EXHDR(s);`
`207`		`-n=VARSIZE_ANY_EXHDR(t);`
	`223`	`+/* Extract a pointer to the actual character data. */`
	`224`	`+s_data=VARDATA_ANY(s);`
	`225`	`+t_data=VARDATA_ANY(t);`
	`226`	`+`
	`227`	`+/* Determine length of each string in bytes and characters. */`
	`228`	`+s_bytes=VARSIZE_ANY_EXHDR(s);`
	`229`	`+t_bytes=VARSIZE_ANY_EXHDR(t);`
	`230`	`+m=pg_mbstrlen_with_len(s_data,s_bytes);`
	`231`	`+n=pg_mbstrlen_with_len(t_data,t_bytes);`
`208`	`232`
`209`	`233`	`/*`
`210`	`234`	`* We can transform an empty s into t with n insertions, or a non-empty t`
`@@ -226,6 +250,28 @@ levenshtein_internal(text s, text t,`
`226`	`250`	`errmsg("argument exceeds the maximum length of %d bytes",`
`227`	`251`	`MAX_LEVENSHTEIN_STRLEN)));`
`228`	`252`
	`253`	`+/*`
	`254`	`+ * In order to avoid calling pg_mblen() repeatedly on each character in s,`
	`255`	`+ * we cache all the lengths before starting the main loop -- but if all the`
	`256`	`+ * characters in both strings are single byte, then we skip this and use`
	`257`	`+ * a fast-path in the main loop. If only one string contains multi-byte`
	`258`	`+ * characters, we still build the array, so that the fast-path needn't`
	`259`	`+ * deal with the case where the array hasn't been initialized.`
	`260`	`+ */`
	`261`	`+if (m!=s_bytes\|\|n!=t_bytes)`
	`262`	`+{`
	`263`	`+inti;`
	`264`	`+constchar*cp=s_data;`
	`265`	`+`
	`266`	`+s_char_len= (int)palloc((m+1)sizeof(int));`
	`267`	`+for (i=0;i<m;++i)`
	`268`	`+{`
	`269`	`+s_char_len[i]=pg_mblen(cp);`
	`270`	`+cp+=s_char_len[i];`
	`271`	`+}`
	`272`	`+s_char_len[i]=0;`
	`273`	`+}`
	`274`	`+`
`229`	`275`	`/* One more cell for initialization column and row. */`
`230`	`276`	`++m;`
`231`	`277`	`++n;`
`@@ -244,36 +290,89 @@ levenshtein_internal(text s, text t,`
`244`	`290`	`prev[i]=i*del_c;`
`245`	`291`
`246`	`292`	`/* Loop through rows of the notional array */`
`247`		`-for (y=VARDATA_ANY(t),j=1;j<n;y++,j++)`
	`293`	`+for (y=t_data,j=1;j<n;j++)`
`248`	`294`	`{`
`249`	`295`	`int*temp;`
	`296`	`+constchar*x=s_data;`
	`297`	`+inty_char_len=n!=t_bytes+1 ?pg_mblen(y) :1;`
`250`	`298`
`251`	`299`	`/*`
`252`	`300`	`* First cell must increment sequentially, as we're on the j'th row of`
`253`	`301`	`* the (m+1)x(n+1) array.`
`254`	`302`	`*/`
`255`	`303`	`curr[0]=j*ins_c;`
`256`	`304`
`257`		`-for (x=VARDATA_ANY(s),i=1;i<m;x++,i++)`
	`305`	`+/*`
	`306`	`+ * This inner loop is critical to performance, so we include a`
	`307`	`+ * fast-path to handle the (fairly common) case where no multibyte`
	`308`	`+ * characters are in the mix. The fast-path is entitled to assume`
	`309`	`+ * that if s_char_len is not initialized then BOTH strings contain`
	`310`	`+ * only single-byte characters.`
	`311`	`+ */`
	`312`	`+if (s_char_len!=NULL)`
`258`	`313`	`{`
`259`		`-intins;`
`260`		`-intdel;`
`261`		`-intsub;`
`262`		`-`
`263`		`-/* Calculate costs for probable operations. */`
`264`		`-ins=prev[i]+ins_c;/* Insertion*/`
`265`		`-del=curr[i-1]+del_c;/* Deletion*/`
`266`		`-sub=prev[i-1]+ ((x==y) ?0 :sub_c);/* Substitution */`
`267`		`-`
`268`		`-/* Take the one with minimum cost. */`
`269`		`-curr[i]=Min(ins,del);`
`270`		`-curr[i]=Min(curr[i],sub);`
	`314`	`+for (i=1;i<m;i++)`
	`315`	`+{`
	`316`	`+intins;`
	`317`	`+intdel;`
	`318`	`+intsub;`
	`319`	`+intx_char_len=s_char_len[i-1];`
	`320`	`+`
	`321`	`+/*`
	`322`	`+ * Calculate costs for insertion, deletion, and substitution.`
	`323`	`+ *`
	`324`	`+ * When calculating cost for substitution, we compare the last`
	`325`	`+ * character of each possibly-multibyte character first,`
	`326`	`+ * because that's enough to rule out most mis-matches. If we`
	`327`	`+ * get past that test, then we compare the lengths and the`
	`328`	`+ * remaining bytes.`
	`329`	`+ */`
	`330`	`+ins=prev[i]+ins_c;`
	`331`	`+del=curr[i-1]+del_c;`
	`332`	`+if (x[x_char_len-1]==y[y_char_len-1]`
	`333`	`+&&x_char_len==y_char_len&&`
	`334`	`+(x_char_len==1\|\|rest_of_char_same(x,y,x_char_len)))`
	`335`	`+sub=prev[i-1];`
	`336`	`+else`
	`337`	`+sub=prev[i-1]+sub_c;`
	`338`	`+`
	`339`	`+/* Take the one with minimum cost. */`
	`340`	`+curr[i]=Min(ins,del);`
	`341`	`+curr[i]=Min(curr[i],sub);`
	`342`	`+`
	`343`	`+/* Point to next character. */`
	`344`	`+x+=x_char_len;`
	`345`	`+}`
	`346`	`+}`
	`347`	`+else`
	`348`	`+{`
	`349`	`+for (i=1;i<m;i++)`
	`350`	`+{`
	`351`	`+intins;`
	`352`	`+intdel;`
	`353`	`+intsub;`
	`354`	`+`
	`355`	`+/* Calculate costs for insertion, deletion, and substitution. */`
	`356`	`+ins=prev[i]+ins_c;`
	`357`	`+del=curr[i-1]+del_c;`
	`358`	`+sub=prev[i-1]+ ((x==y) ?0 :sub_c);`
	`359`	`+`
	`360`	`+/* Take the one with minimum cost. */`
	`361`	`+curr[i]=Min(ins,del);`
	`362`	`+curr[i]=Min(curr[i],sub);`
	`363`	`+`
	`364`	`+/* Point to next character. */`
	`365`	`+x++;`
	`366`	`+}`
`271`	`367`	`}`
`272`	`368`
`273`	`369`	`/* Swap current row with previous row. */`
`274`	`370`	`temp=curr;`
`275`	`371`	`curr=prev;`
`276`	`372`	`prev=temp;`
	`373`	`+`
	`374`	`+/* Point to next character. */`
	`375`	`+y+=y_char_len;`
`277`	`376`	`}`
`278`	`377`
`279`	`378`	`/*`

`‎doc/src/sgml/fuzzystrmatch.sgml`

Lines changed: 4 additions & 3 deletions

Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.6 2010/07/29 19:34:40 petere Exp $ -->`
	`1`	`+<!-- $PostgreSQL: pgsql/doc/src/sgml/fuzzystrmatch.sgml,v 1.7 2010/08/02 23:20:23 rhaas Exp $ -->`
`2`	`2`
`3`	`3`	`<sect1 id="fuzzystrmatch">`
`4`	`4`	`<title>fuzzystrmatch</title>`
`@@ -14,8 +14,9 @@`
`14`	`14`
`15`	`15`	`<caution>`
`16`	`16`	`<para>`
`17`		`- At present, <filename>fuzzystrmatch</> does not work well with`
`18`		`- multi-byte encodings (such as UTF-8).`
	`17`	`+ At present, the <function>soundex</>, <function>metaphone</>,`
	`18`	`+ <function>dmetaphone</>, and <function>dmetaphone_alt</> functions do`
	`19`	`+ not work well with multi-byte encodings (such as UTF-8).`
`19`	`20`	`</para>`
`20`	`21`	`</caution>`
`21`	`22`

0 commit comments

Comments

(0)

Movatterモバイル変換

Navigation Menu

Search code, repositories, users, issues, pull requests...

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

Uh oh!

Commit57d9aef

File tree

2 files changed

2 files changed

`‎contrib/fuzzystrmatch/fuzzystrmatch.c`

`‎doc/src/sgml/fuzzystrmatch.sgml`

0 commit comments