Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit604ab08

Browse files
committed
Add levenshtein_less_equal, optimized version for small distances.
Alexander Korotkov, heavily revised by me.
1 parent262c1a4 commit604ab08

File tree

5 files changed

+459
-213
lines changed

5 files changed

+459
-213
lines changed

‎contrib/fuzzystrmatch/fuzzystrmatch.c

Lines changed: 31 additions & 213 deletions
Original file line numberDiff line numberDiff line change
@@ -9,15 +9,6 @@
99
* Copyright (c) 2001-2010, PostgreSQL Global Development Group
1010
* ALL RIGHTS RESERVED;
1111
*
12-
* levenshtein()
13-
* -------------
14-
* Written based on a description of the algorithm by Michael Gilleland
15-
* found at http://www.merriampark.com/ld.htm
16-
* Also looked at levenshtein.c in the PHP 4.0.6 distribution for
17-
* inspiration.
18-
* Configurable penalty costs extension is introduced by Volkan
19-
* YAZICI <volkan.yazici@gmail.com>.
20-
*
2112
* metaphone()
2213
* -----------
2314
* Modified for PostgreSQL by Joe Conway.
@@ -61,6 +52,8 @@ PG_MODULE_MAGIC;
6152
*/
6253
externDatumlevenshtein_with_costs(PG_FUNCTION_ARGS);
6354
externDatumlevenshtein(PG_FUNCTION_ARGS);
55+
externDatumlevenshtein_less_equal_with_costs(PG_FUNCTION_ARGS);
56+
externDatumlevenshtein_less_equal(PG_FUNCTION_ARGS);
6457
externDatummetaphone(PG_FUNCTION_ARGS);
6558
externDatumsoundex(PG_FUNCTION_ARGS);
6659
externDatumdifference(PG_FUNCTION_ARGS);
@@ -85,16 +78,6 @@ soundex_code(char letter)
8578
returnletter;
8679
}
8780

88-
89-
/*
90-
* Levenshtein
91-
*/
92-
#defineMAX_LEVENSHTEIN_STRLEN255
93-
94-
staticintlevenshtein_internal(text*s,text*t,
95-
intins_c,intdel_c,intsub_c);
96-
97-
9881
/*
9982
* Metaphone
10083
*/
@@ -197,224 +180,59 @@ rest_of_char_same(const char *s1, const char *s2, int len)
197180
return true;
198181
}
199182

200-
/*
201-
* levenshtein_internal - Calculates Levenshtein distance metric
202-
* between supplied strings. Generally
203-
* (1, 1, 1) penalty costs suffices common
204-
* cases, but your mileage may vary.
205-
*/
206-
staticint
207-
levenshtein_internal(text*s,text*t,
208-
intins_c,intdel_c,intsub_c)
209-
{
210-
intm,
211-
n,
212-
s_bytes,
213-
t_bytes;
214-
int*prev;
215-
int*curr;
216-
int*s_char_len=NULL;
217-
inti,
218-
j;
219-
constchar*s_data;
220-
constchar*t_data;
221-
constchar*y;
222-
223-
/* Extract a pointer to the actual character data. */
224-
s_data=VARDATA_ANY(s);
225-
t_data=VARDATA_ANY(t);
226-
227-
/* Determine length of each string in bytes and characters. */
228-
s_bytes=VARSIZE_ANY_EXHDR(s);
229-
t_bytes=VARSIZE_ANY_EXHDR(t);
230-
m=pg_mbstrlen_with_len(s_data,s_bytes);
231-
n=pg_mbstrlen_with_len(t_data,t_bytes);
232-
233-
/*
234-
* We can transform an empty s into t with n insertions, or a non-empty t
235-
* into an empty s with m deletions.
236-
*/
237-
if (!m)
238-
returnn*ins_c;
239-
if (!n)
240-
returnm*del_c;
241-
242-
/*
243-
* For security concerns, restrict excessive CPU+RAM usage. (This
244-
* implementation uses O(m) memory and has O(mn) complexity.)
245-
*/
246-
if (m>MAX_LEVENSHTEIN_STRLEN||
247-
n>MAX_LEVENSHTEIN_STRLEN)
248-
ereport(ERROR,
249-
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
250-
errmsg("argument exceeds the maximum length of %d bytes",
251-
MAX_LEVENSHTEIN_STRLEN)));
252-
253-
/*
254-
* In order to avoid calling pg_mblen() repeatedly on each character in s,
255-
* we cache all the lengths before starting the main loop -- but if all the
256-
* characters in both strings are single byte, then we skip this and use
257-
* a fast-path in the main loop. If only one string contains multi-byte
258-
* characters, we still build the array, so that the fast-path needn't
259-
* deal with the case where the array hasn't been initialized.
260-
*/
261-
if (m!=s_bytes||n!=t_bytes)
262-
{
263-
inti;
264-
constchar*cp=s_data;
265-
266-
s_char_len= (int*)palloc((m+1)*sizeof(int));
267-
for (i=0;i<m;++i)
268-
{
269-
s_char_len[i]=pg_mblen(cp);
270-
cp+=s_char_len[i];
271-
}
272-
s_char_len[i]=0;
273-
}
274-
275-
/* One more cell for initialization column and row. */
276-
++m;
277-
++n;
278-
279-
/*
280-
* One way to compute Levenshtein distance is to incrementally construct
281-
* an (m+1)x(n+1) matrix where cell (i, j) represents the minimum number
282-
* of operations required to transform the first i characters of s into
283-
* the first j characters of t. The last column of the final row is the
284-
* answer.
285-
*
286-
* We use that algorithm here with some modification. In lieu of holding
287-
* the entire array in memory at once, we'll just use two arrays of size
288-
* m+1 for storing accumulated values. At each step one array represents
289-
* the "previous" row and one is the "current" row of the notional large
290-
* array.
291-
*/
292-
prev= (int*)palloc(2*m*sizeof(int));
293-
curr=prev+m;
294-
295-
/*
296-
* To transform the first i characters of s into the first 0 characters
297-
* of t, we must perform i deletions.
298-
*/
299-
for (i=0;i<m;i++)
300-
prev[i]=i*del_c;
301-
302-
/* Loop through rows of the notional array */
303-
for (y=t_data,j=1;j<n;j++)
304-
{
305-
int*temp;
306-
constchar*x=s_data;
307-
inty_char_len=n!=t_bytes+1 ?pg_mblen(y) :1;
308-
309-
/*
310-
* To transform the first 0 characters of s into the first j
311-
* characters of t, we must perform j insertions.
312-
*/
313-
curr[0]=j*ins_c;
314-
315-
/*
316-
* This inner loop is critical to performance, so we include a
317-
* fast-path to handle the (fairly common) case where no multibyte
318-
* characters are in the mix. The fast-path is entitled to assume
319-
* that if s_char_len is not initialized then BOTH strings contain
320-
* only single-byte characters.
321-
*/
322-
if (s_char_len!=NULL)
323-
{
324-
for (i=1;i<m;i++)
325-
{
326-
intins;
327-
intdel;
328-
intsub;
329-
intx_char_len=s_char_len[i-1];
330-
331-
/*
332-
* Calculate costs for insertion, deletion, and substitution.
333-
*
334-
* When calculating cost for substitution, we compare the last
335-
* character of each possibly-multibyte character first,
336-
* because that's enough to rule out most mis-matches. If we
337-
* get past that test, then we compare the lengths and the
338-
* remaining bytes.
339-
*/
340-
ins=prev[i]+ins_c;
341-
del=curr[i-1]+del_c;
342-
if (x[x_char_len-1]==y[y_char_len-1]
343-
&&x_char_len==y_char_len&&
344-
(x_char_len==1||rest_of_char_same(x,y,x_char_len)))
345-
sub=prev[i-1];
346-
else
347-
sub=prev[i-1]+sub_c;
348-
349-
/* Take the one with minimum cost. */
350-
curr[i]=Min(ins,del);
351-
curr[i]=Min(curr[i],sub);
352-
353-
/* Point to next character. */
354-
x+=x_char_len;
355-
}
356-
}
357-
else
358-
{
359-
for (i=1;i<m;i++)
360-
{
361-
intins;
362-
intdel;
363-
intsub;
183+
#include"levenshtein.c"
184+
#defineLEVENSHTEIN_LESS_EQUAL
185+
#include"levenshtein.c"
364186

365-
/* Calculate costs for insertion, deletion, and substitution. */
366-
ins=prev[i]+ins_c;
367-
del=curr[i-1]+del_c;
368-
sub=prev[i-1]+ ((*x==*y) ?0 :sub_c);
369-
370-
/* Take the one with minimum cost. */
371-
curr[i]=Min(ins,del);
372-
curr[i]=Min(curr[i],sub);
187+
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
188+
Datum
189+
levenshtein_with_costs(PG_FUNCTION_ARGS)
190+
{
191+
text*src=PG_GETARG_TEXT_PP(0);
192+
text*dst=PG_GETARG_TEXT_PP(1);
193+
intins_c=PG_GETARG_INT32(2);
194+
intdel_c=PG_GETARG_INT32(3);
195+
intsub_c=PG_GETARG_INT32(4);
373196

374-
/* Point to next character. */
375-
x++;
376-
}
377-
}
197+
PG_RETURN_INT32(levenshtein_internal(src,dst,ins_c,del_c,sub_c));
198+
}
378199

379-
/* Swap current row with previous row. */
380-
temp=curr;
381-
curr=prev;
382-
prev=temp;
383200

384-
/* Point to next character. */
385-
y+=y_char_len;
386-
}
201+
PG_FUNCTION_INFO_V1(levenshtein);
202+
Datum
203+
levenshtein(PG_FUNCTION_ARGS)
204+
{
205+
text*src=PG_GETARG_TEXT_PP(0);
206+
text*dst=PG_GETARG_TEXT_PP(1);
387207

388-
/*
389-
* Because the final value was swapped from the previous row to the
390-
* current row, that's where we'll find it.
391-
*/
392-
returnprev[m-1];
208+
PG_RETURN_INT32(levenshtein_internal(src,dst,1,1,1));
393209
}
394210

395211

396-
PG_FUNCTION_INFO_V1(levenshtein_with_costs);
212+
PG_FUNCTION_INFO_V1(levenshtein_less_equal_with_costs);
397213
Datum
398-
levenshtein_with_costs(PG_FUNCTION_ARGS)
214+
levenshtein_less_equal_with_costs(PG_FUNCTION_ARGS)
399215
{
400216
text*src=PG_GETARG_TEXT_PP(0);
401217
text*dst=PG_GETARG_TEXT_PP(1);
402218
intins_c=PG_GETARG_INT32(2);
403219
intdel_c=PG_GETARG_INT32(3);
404220
intsub_c=PG_GETARG_INT32(4);
221+
intmax_d=PG_GETARG_INT32(5);
405222

406-
PG_RETURN_INT32(levenshtein_internal(src,dst,ins_c,del_c,sub_c));
223+
PG_RETURN_INT32(levenshtein_less_equal_internal(src,dst,ins_c,del_c,sub_c,max_d));
407224
}
408225

409226

410-
PG_FUNCTION_INFO_V1(levenshtein);
227+
PG_FUNCTION_INFO_V1(levenshtein_less_equal);
411228
Datum
412-
levenshtein(PG_FUNCTION_ARGS)
229+
levenshtein_less_equal(PG_FUNCTION_ARGS)
413230
{
414231
text*src=PG_GETARG_TEXT_PP(0);
415232
text*dst=PG_GETARG_TEXT_PP(1);
233+
intmax_d=PG_GETARG_INT32(2);
416234

417-
PG_RETURN_INT32(levenshtein_internal(src,dst,1,1,1));
235+
PG_RETURN_INT32(levenshtein_less_equal_internal(src,dst,1,1,1,max_d));
418236
}
419237

420238

‎contrib/fuzzystrmatch/fuzzystrmatch.sql.in

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,14 @@ CREATE OR REPLACE FUNCTION levenshtein (text,text,int,int,int) RETURNS int
1111
AS 'MODULE_PATHNAME','levenshtein_with_costs'
1212
LANGUAGE C IMMUTABLE STRICT;
1313

14+
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int) RETURNS int
15+
AS 'MODULE_PATHNAME','levenshtein_less_equal'
16+
LANGUAGE C IMMUTABLE STRICT;
17+
18+
CREATE OR REPLACE FUNCTION levenshtein_less_equal (text,text,int,int,int,int) RETURNS int
19+
AS 'MODULE_PATHNAME','levenshtein_less_equal_with_costs'
20+
LANGUAGE C IMMUTABLE STRICT;
21+
1422
CREATE OR REPLACE FUNCTION metaphone (text,int) RETURNS text
1523
AS 'MODULE_PATHNAME','metaphone'
1624
LANGUAGE C IMMUTABLE STRICT;

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp