Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commitc0828b7

Browse files
committed
Move the guts of our Levenshtein implementation into core.
The hope is that we can use this to produce better diagnostics insome cases.Peter Geoghegan, reviewed by Michael Paquier, with some furtherchanges by me.
1 parent1d69ae4 commitc0828b7

File tree

6 files changed

+122
-83
lines changed

6 files changed

+122
-83
lines changed

‎contrib/fuzzystrmatch/Makefile

Lines changed: 0 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,3 @@ top_builddir = ../..
1717
include$(top_builddir)/src/Makefile.global
1818
include$(top_srcdir)/contrib/contrib-global.mk
1919
endif
20-
21-
# levenshtein.c is #included by fuzzystrmatch.c
22-
fuzzystrmatch.o: fuzzystrmatch.c levenshtein.c

‎contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 57 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -154,23 +154,6 @@ getcode(char c)
154154
/* These prevent GH from becoming F */
155155
#defineNOGHTOF(c)(getcode(c) & 16)/* BDH */
156156

157-
/* Faster than memcmp(), for this use case. */
158-
staticinlinebool
159-
rest_of_char_same(constchar*s1,constchar*s2,intlen)
160-
{
161-
while (len>0)
162-
{
163-
len--;
164-
if (s1[len]!=s2[len])
165-
return false;
166-
}
167-
return true;
168-
}
169-
170-
#include"levenshtein.c"
171-
#defineLEVENSHTEIN_LESS_EQUAL
172-
#include"levenshtein.c"
173-
174157
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
175158
Datum
176159
levenshtein_with_costs(PG_FUNCTION_ARGS)
@@ -180,8 +163,20 @@ levenshtein_with_costs(PG_FUNCTION_ARGS)
180163
intins_c=PG_GETARG_INT32(2);
181164
intdel_c=PG_GETARG_INT32(3);
182165
intsub_c=PG_GETARG_INT32(4);
183-
184-
PG_RETURN_INT32(levenshtein_internal(src,dst,ins_c,del_c,sub_c));
166+
constchar*s_data;
167+
constchar*t_data;
168+
ints_bytes,
169+
t_bytes;
170+
171+
/* Extract a pointer to the actual character data */
172+
s_data=VARDATA_ANY(src);
173+
t_data=VARDATA_ANY(dst);
174+
/* Determine length of each string in bytes and characters */
175+
s_bytes=VARSIZE_ANY_EXHDR(src);
176+
t_bytes=VARSIZE_ANY_EXHDR(dst);
177+
178+
PG_RETURN_INT32(varstr_levenshtein(s_data,s_bytes,t_data,t_bytes,ins_c,
179+
del_c,sub_c));
185180
}
186181

187182

@@ -191,8 +186,20 @@ levenshtein(PG_FUNCTION_ARGS)
191186
{
192187
text*src=PG_GETARG_TEXT_PP(0);
193188
text*dst=PG_GETARG_TEXT_PP(1);
194-
195-
PG_RETURN_INT32(levenshtein_internal(src,dst,1,1,1));
189+
constchar*s_data;
190+
constchar*t_data;
191+
ints_bytes,
192+
t_bytes;
193+
194+
/* Extract a pointer to the actual character data */
195+
s_data=VARDATA_ANY(src);
196+
t_data=VARDATA_ANY(dst);
197+
/* Determine length of each string in bytes and characters */
198+
s_bytes=VARSIZE_ANY_EXHDR(src);
199+
t_bytes=VARSIZE_ANY_EXHDR(dst);
200+
201+
PG_RETURN_INT32(varstr_levenshtein(s_data,s_bytes,t_data,t_bytes,1,1,
202+
1));
196203
}
197204

198205

@@ -206,8 +213,21 @@ levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
206213
intdel_c=PG_GETARG_INT32(3);
207214
intsub_c=PG_GETARG_INT32(4);
208215
intmax_d=PG_GETARG_INT32(5);
209-
210-
PG_RETURN_INT32(levenshtein_less_equal_internal(src,dst,ins_c,del_c,sub_c,max_d));
216+
constchar*s_data;
217+
constchar*t_data;
218+
ints_bytes,
219+
t_bytes;
220+
221+
/* Extract a pointer to the actual character data */
222+
s_data=VARDATA_ANY(src);
223+
t_data=VARDATA_ANY(dst);
224+
/* Determine length of each string in bytes and characters */
225+
s_bytes=VARSIZE_ANY_EXHDR(src);
226+
t_bytes=VARSIZE_ANY_EXHDR(dst);
227+
228+
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data,s_bytes,t_data,
229+
t_bytes,ins_c,del_c,
230+
sub_c,max_d));
211231
}
212232

213233

@@ -218,8 +238,20 @@ levenshtein_less_equal(PG_FUNCTION_ARGS)
218238
text*src=PG_GETARG_TEXT_PP(0);
219239
text*dst=PG_GETARG_TEXT_PP(1);
220240
intmax_d=PG_GETARG_INT32(2);
221-
222-
PG_RETURN_INT32(levenshtein_less_equal_internal(src,dst,1,1,1,max_d));
241+
constchar*s_data;
242+
constchar*t_data;
243+
ints_bytes,
244+
t_bytes;
245+
246+
/* Extract a pointer to the actual character data */
247+
s_data=VARDATA_ANY(src);
248+
t_data=VARDATA_ANY(dst);
249+
/* Determine length of each string in bytes and characters */
250+
s_bytes=VARSIZE_ANY_EXHDR(src);
251+
t_bytes=VARSIZE_ANY_EXHDR(dst);
252+
253+
PG_RETURN_INT32(varstr_levenshtein_less_equal(s_data,s_bytes,t_data,
254+
t_bytes,1,1,1,max_d));
223255
}
224256

225257

‎src/backend/utils/adt/Makefile

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,4 +38,6 @@ OBJS = acl.o arrayfuncs.o array_selfuncs.o array_typanalyze.o \
3838

3939
like.o: like.c like_match.c
4040

41+
varlena.o: varlena.c levenshtein.c
42+
4143
include$(top_srcdir)/src/backend/common.mk

‎contrib/fuzzystrmatch/levenshtein.crenamed to‎src/backend/utils/adt/levenshtein.c

Lines changed: 37 additions & 54 deletions
Original file line numberDiff line numberDiff line change
@@ -1,41 +1,34 @@
1-
/*
1+
/*-------------------------------------------------------------------------
2+
*
23
* levenshtein.c
4+
* Levenshtein distance implementation.
35
*
4-
*Functions for "fuzzy" comparison of strings
6+
*Original author: Joe Conway <mail@joeconway.com>
57
*
6-
* Joe Conway <mail@joeconway.com>
8+
* This file is included by varlena.c twice, to provide matching code for (1)
9+
* Levenshtein distance with custom costings, and (2) Levenshtein distance with
10+
* custom costings and a "max" value above which exact distances are not
11+
* interesting. Before the inclusion, we rely on the presence of the inline
12+
* function rest_of_char_same().
13+
*
14+
* Written based on a description of the algorithm by Michael Gilleland found
15+
* at http://www.merriampark.com/ld.htm. Also looked at levenshtein.c in the
16+
* PHP 4.0.6 distribution for inspiration. Configurable penalty costs
17+
* extension is introduced by Volkan YAZICI <volkan.yazici@gmail.com.
718
*
819
* Copyright (c) 2001-2014, PostgreSQL Global Development Group
9-
* ALL RIGHTS RESERVED;
1020
*
11-
* levenshtein()
12-
* -------------
13-
* Written based on a description of the algorithm by Michael Gilleland
14-
* found at http://www.merriampark.com/ld.htm
15-
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
16-
* inspiration.
17-
* Configurable penalty costs extension is introduced by Volkan
18-
* YAZICI <volkan.yazici@gmail.com>.
19-
*/
20-
21-
/*
22-
* External declarations for exported functions
21+
* IDENTIFICATION
22+
*src/backend/utils/adt/levenshtein.c
23+
*
24+
*-------------------------------------------------------------------------
2325
*/
24-
#ifdefLEVENSHTEIN_LESS_EQUAL
25-
staticintlevenshtein_less_equal_internal(text*s,text*t,
26-
intins_c,intdel_c,intsub_c,intmax_d);
27-
#else
28-
staticintlevenshtein_internal(text*s,text*t,
29-
intins_c,intdel_c,intsub_c);
30-
#endif
31-
3226
#defineMAX_LEVENSHTEIN_STRLEN255
3327

34-
3528
/*
36-
* Calculates Levenshtein distance metric between suppliedstrings. Generally
37-
* (1, 1, 1) penalty costs suffices for common cases, but your mileage may
38-
* vary.
29+
* Calculates Levenshtein distance metric between suppliedcsrings, which are
30+
*not necessarily null-terminated. Generally(1, 1, 1) penalty costs suffices
31+
*for common cases, but your mileage mayvary.
3932
*
4033
* One way to compute Levenshtein distance is to incrementally construct
4134
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
@@ -63,30 +56,27 @@ static int levenshtein_internal(text *s, text *t,
6356
* identify the portion of the matrix close to the diagonal which can still
6457
* affect the final answer.
6558
*/
66-
staticint
59+
int
6760
#ifdefLEVENSHTEIN_LESS_EQUAL
68-
levenshtein_less_equal_internal(text*s,text*t,
69-
intins_c,intdel_c,intsub_c,intmax_d)
61+
varstr_levenshtein_less_equal(constchar*source,intslen,constchar*target,
62+
inttlen,intins_c,intdel_c,intsub_c,
63+
intmax_d)
7064
#else
71-
levenshtein_internal(text*s,text*t,
72-
intins_c,intdel_c,intsub_c)
65+
varstr_levenshtein(constchar*source,intslen,constchar*target,inttlen,
66+
intins_c,intdel_c,intsub_c)
7367
#endif
7468
{
7569
intm,
76-
n,
77-
s_bytes,
78-
t_bytes;
70+
n;
7971
int*prev;
8072
int*curr;
8173
int*s_char_len=NULL;
8274
inti,
8375
j;
84-
constchar*s_data;
85-
constchar*t_data;
8676
constchar*y;
8777

8878
/*
89-
* Forlevenshtein_less_equal_internal, we have real variables called
79+
* Forvarstr_levenshtein_less_equal, we have real variables called
9080
* start_column and stop_column; otherwise it's just short-hand for 0 and
9181
* m.
9282
*/
@@ -105,15 +95,8 @@ levenshtein_internal(text *s, text *t,
10595
#defineSTOP_COLUMN m
10696
#endif
10797

108-
/* Extract a pointer to the actual character data. */
109-
s_data=VARDATA_ANY(s);
110-
t_data=VARDATA_ANY(t);
111-
112-
/* Determine length of each string in bytes and characters. */
113-
s_bytes=VARSIZE_ANY_EXHDR(s);
114-
t_bytes=VARSIZE_ANY_EXHDR(t);
115-
m=pg_mbstrlen_with_len(s_data,s_bytes);
116-
n=pg_mbstrlen_with_len(t_data,t_bytes);
98+
m=pg_mbstrlen_with_len(source,slen);
99+
n=pg_mbstrlen_with_len(target,tlen);
117100

118101
/*
119102
* We can transform an empty s into t with n insertions, or a non-empty t
@@ -193,10 +176,10 @@ levenshtein_internal(text *s, text *t,
193176
* multi-byte characters, we still build the array, so that the fast-path
194177
* needn't deal with the case where the array hasn't been initialized.
195178
*/
196-
if (m!=s_bytes||n!=t_bytes)
179+
if (m!=slen||n!=tlen)
197180
{
198181
inti;
199-
constchar*cp=s_data;
182+
constchar*cp=source;
200183

201184
s_char_len= (int*)palloc((m+1)*sizeof(int));
202185
for (i=0;i<m;++i)
@@ -223,11 +206,11 @@ levenshtein_internal(text *s, text *t,
223206
prev[i]=i*del_c;
224207

225208
/* Loop through rows of the notional array */
226-
for (y=t_data,j=1;j<n;j++)
209+
for (y=target,j=1;j<n;j++)
227210
{
228211
int*temp;
229-
constchar*x=s_data;
230-
inty_char_len=n!=t_bytes+1 ?pg_mblen(y) :1;
212+
constchar*x=source;
213+
inty_char_len=n!=tlen+1 ?pg_mblen(y) :1;
231214

232215
#ifdefLEVENSHTEIN_LESS_EQUAL
233216

@@ -384,7 +367,7 @@ levenshtein_internal(text *s, text *t,
384367
prev[start_column]=max_d+1;
385368
curr[start_column]=max_d+1;
386369
if (start_column!=0)
387-
s_data+= (s_char_len!=NULL) ?s_char_len[start_column-1] :1;
370+
source+= (s_char_len!=NULL) ?s_char_len[start_column-1] :1;
388371
start_column++;
389372
}
390373

‎src/backend/utils/adt/varlena.c

Lines changed: 21 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1546,7 +1546,6 @@ varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
15461546
returnresult;
15471547
}
15481548

1549-
15501549
/* text_cmp()
15511550
* Internal comparison function for text strings.
15521551
* Returns -1, 0 or 1
@@ -4747,3 +4746,24 @@ text_format_nv(PG_FUNCTION_ARGS)
47474746
{
47484747
returntext_format(fcinfo);
47494748
}
4749+
4750+
/*
4751+
* Helper function for Levenshtein distance functions. Faster than memcmp(),
4752+
* for this use case.
4753+
*/
4754+
staticinlinebool
4755+
rest_of_char_same(constchar*s1,constchar*s2,intlen)
4756+
{
4757+
while (len>0)
4758+
{
4759+
len--;
4760+
if (s1[len]!=s2[len])
4761+
return false;
4762+
}
4763+
return true;
4764+
}
4765+
4766+
/* Expand each Levenshtein distance variant */
4767+
#include"levenshtein.c"
4768+
#defineLEVENSHTEIN_LESS_EQUAL
4769+
#include"levenshtein.c"

‎src/include/utils/builtins.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -786,6 +786,11 @@ extern Datum textoverlay_no_len(PG_FUNCTION_ARGS);
786786
externDatumname_text(PG_FUNCTION_ARGS);
787787
externDatumtext_name(PG_FUNCTION_ARGS);
788788
externintvarstr_cmp(char*arg1,intlen1,char*arg2,intlen2,Oidcollid);
789+
externintvarstr_levenshtein(constchar*source,intslen,constchar*target,
790+
inttlen,intins_c,intdel_c,intsub_c);
791+
externintvarstr_levenshtein_less_equal(constchar*source,intslen,
792+
constchar*target,inttlen,intins_c,
793+
intdel_c,intsub_c,intmax_d);
789794
externList*textToQualifiedNameList(text*textval);
790795
externboolSplitIdentifierString(char*rawstring,charseparator,
791796
List**namelist);

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp