Movatterモバイル変換


[0]ホーム

URL:


Skip to content

Navigation Menu

Sign in
Appearance settings

Search code, repositories, users, issues, pull requests...

Provide feedback

We read every piece of feedback, and take your input very seriously.

Saved searches

Use saved searches to filter your results more quickly

Sign up
Appearance settings

Commit b87b52b

Browse files
committed
Support of multibyte encoding for pg_trgm
1 parent e4ffd14 commit b87b52b

File tree

3 files changed

+161
-88
lines changed

3 files changed

+161
-88
lines changed

‎contrib/pg_trgm/trgm.h

Lines changed: 9 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.9 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm.h,v 1.10 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#ifndef__TRGM_H__
55
#define__TRGM_H__
@@ -31,7 +31,14 @@ typedef char trgm[3];
3131
*(((char*)(a))+2) = *(((char*)(b))+2);\
3232
} while(0);
3333

34-
#defineTRGMINT(a) ( (*(((char*)(a))+2)<<16)+(*(((char*)(a))+1)<<8)+*(((char*)(a))+0) )
34+
uint32trgm2int(trgm*ptr);
35+
36+
#ifdefKEEPONLYALNUM
37+
#defineISPRINTABLECHAR(a)( isascii( *(unsigned char*)(a) ) && (isalnum( *(unsigned char*)(a) ) || *(unsigned char*)(a)==' ') )
38+
#else
39+
#defineISPRINTABLECHAR(a)( isascii( *(unsigned char*)(a) ) && isprint( *(unsigned char*)(a) ) )
40+
#endif
41+
#defineISPRINTABLETRGM(t)( ISPRINTABLECHAR( ((char*)t) ) && ISPRINTABLECHAR( ((char*)t)+1 ) && ISPRINTABLECHAR( ((char*)t)+2 ) )
3542

3643
typedefstruct
3744
{

‎contrib/pg_trgm/trgm_gin.c

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -1,5 +1,5 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.5 2008/07/11 11:56:48 teodor Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_gin.c,v 1.6 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include"trgm.h"
55

@@ -42,7 +42,7 @@ gin_extract_trgm(PG_FUNCTION_ARGS)
4242
ptr=GETARR(trg);
4343
while (ptr-GETARR(trg)<ARRNELEM(trg))
4444
{
45-
item=TRGMINT(ptr);
45+
item=trgm2int(ptr);
4646
entries[i++]=Int32GetDatum(item);
4747

4848
ptr++;

‎contrib/pg_trgm/trgm_op.c

Lines changed: 150 additions & 84 deletions
Original file line number | Diff line number | Diff line change
@@ -1,10 +1,11 @@
11
/*
2-
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $
2+
* $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $
33
*/
44
#include"trgm.h"
55
#include<ctype.h>
66
#include"utils/array.h"
77
#include"catalog/pg_type.h"
8+
#include"tsearch/ts_locale.h"
89

910
PG_MODULE_MAGIC;
1011

@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
3132
PG_RETURN_FLOAT4(trgm_limit);
3233
}
3334

34-
#defineWORDWAIT0
35-
#defineINWORD1
36-
3735
staticint
3836
comp_trgm(constvoid*a,constvoid*b)
3937
{
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
6058
returncurend+1-a;
6159
}
6260

61+
#ifdefKEEPONLYALNUM
62+
#defineiswordchr(c)(t_isalpha(c) || t_isdigit(c))
63+
#else
64+
#defineiswordchr(c)(!t_isspace(c))
65+
#endif
66+
67+
/*
68+
* Finds first word in string, returns pointer to the word,
69+
* endword points to the character after word
70+
*/
71+
staticchar*
72+
find_word(char*str,intlenstr,char**endword,int*charlen)
73+
{
74+
char*beginword=str;
75+
76+
while(beginword-str<lenstr&& !iswordchr(beginword) )
77+
beginword+=pg_mblen(beginword);
78+
79+
if (beginword-str >=lenstr)
80+
returnNULL;
81+
82+
*endword=beginword;
83+
*charlen=0;
84+
while(*endword-str<lenstr&&iswordchr(*endword) )
85+
{
86+
*endword+=pg_mblen(*endword);
87+
(*charlen)++;
88+
}
89+
90+
returnbeginword;
91+
}
92+
93+
#ifdefUSE_WIDE_UPPER_LOWER
94+
staticvoid
95+
cnt_trigram(trgm*tptr,char*str,intbytelen)
96+
{
97+
if (bytelen==3 )
98+
{
99+
CPTRGM(tptr,str);
100+
}
101+
else
102+
{
103+
pg_crc32crc;
104+
105+
INIT_CRC32(crc);
106+
COMP_CRC32(crc,str,bytelen);
107+
FIN_CRC32(crc);
108+
109+
/*
110+
* use only 3 upper bytes from crc, hope, it's
111+
* good enough hashing
112+
*/
113+
CPTRGM(tptr,&crc);
114+
}
115+
}
116+
#endif
117+
118+
/*
119+
* Adds trigrams from words (already padded).
120+
*/
121+
statictrgm*
122+
make_trigrams(trgm*tptr,char*str,intbytelen,intcharlen )
123+
{
124+
char*ptr=str;
125+
126+
if (charlen<3 )
127+
returntptr;
128+
129+
#ifdefUSE_WIDE_UPPER_LOWER
130+
if (pg_database_encoding_max_length()>1)
131+
{
132+
intlenfirst=pg_mblen(str),
133+
lenmiddle=pg_mblen(str+lenfirst),
134+
lenlast=pg_mblen(str+lenfirst+lenmiddle);
135+
136+
while( (ptr-str)+lenfirst+lenmiddle+lenlast <=bytelen )
137+
{
138+
cnt_trigram(tptr,ptr,lenfirst+lenmiddle+lenlast);
139+
140+
ptr+=lenfirst;
141+
tptr++;
142+
143+
lenfirst=lenmiddle;
144+
lenmiddle=lenlast;
145+
lenlast=pg_mblen(ptr+lenfirst+lenmiddle);
146+
}
147+
}
148+
else
149+
#endif
150+
{
151+
Assert(bytelen==charlen );
152+
153+
while (ptr-str<bytelen-2/* number of trigrams = strlen - 2 */ )
154+
{
155+
CPTRGM(tptr,ptr);
156+
ptr++;
157+
tptr++;
158+
}
159+
}
160+
161+
returntptr;
162+
}
63163

64164
TRGM*
65165
generate_trgm(char*str,intslen)
66166
{
67167
TRGM*trg;
68-
char*buf,
69-
*sptr,
70-
*bufptr;
168+
char*buf;
71169
trgm*tptr;
72-
intstate=WORDWAIT;
73-
intwl,
74-
len;
170+
intlen,
171+
charlen,
172+
bytelen;
173+
char*bword,*eword;
75174

76175
trg= (TRGM*)palloc(TRGMHDRSIZE+sizeof(trgm)* (slen /2+1)*3);
77176
trg->flag=ARRKEY;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
83182
tptr=GETARR(trg);
84183

85184
buf=palloc(sizeof(char)* (slen+4));
86-
sptr=str;
87185

88186
if (LPADDING>0)
89187
{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
92190
*(buf+1)=' ';
93191
}
94192

95-
bufptr=buf+LPADDING;
96-
while (sptr-str<slen)
193+
eword=str;
194+
while( (bword=find_word(eword,slen- (eword-str),&eword,&charlen))!=NULL )
97195
{
98-
if (state==WORDWAIT)
99-
{
100-
if (
101-
#ifdefKEEPONLYALNUM
102-
isalnum((unsignedchar)*sptr)
103-
#else
104-
!isspace((unsignedchar)*sptr)
105-
#endif
106-
)
107-
{
108-
*bufptr=*sptr;/* start put word in buffer */
109-
bufptr++;
110-
state=INWORD;
111-
if (sptr-str==slen-1/* last char */ )
112-
gotogettrg;
113-
}
114-
}
115-
else
116-
{
117-
if (
118-
#ifdefKEEPONLYALNUM
119-
!isalnum((unsignedchar)*sptr)
196+
#ifdefIGNORECASE
197+
bword=lowerstr_with_len(bword,eword-bword);
198+
bytelen=strlen(bword);
120199
#else
121-
isspace((unsignedchar)*sptr)
200+
bytelen=eword-bword;
122201
#endif
123-
)
124-
{
125-
gettrg:
126-
/* word in buffer, so count trigrams */
127-
*bufptr=' ';
128-
*(bufptr+1)=' ';
129-
wl=bufptr- (buf+LPADDING)-2+LPADDING+RPADDING;
130-
if (wl <=0)
131-
{
132-
bufptr=buf+LPADDING;
133-
state=WORDWAIT;
134-
sptr++;
135-
continue;
136-
}
202+
203+
memcpy(buf+LPADDING,bword,bytelen);
137204

138205
#ifdefIGNORECASE
139-
do
140-
{/* lower word */
141-
intwwl=bufptr-buf;
142-
143-
bufptr=buf+LPADDING;
144-
while (bufptr-buf<wwl)
145-
{
146-
*bufptr=tolower((unsignedchar)*bufptr);
147-
bufptr++;
148-
}
149-
}while (0);
206+
pfree(bword);
150207
#endif
151-
bufptr=buf;
152-
/* set trigrams */
153-
while (bufptr-buf<wl)
154-
{
155-
CPTRGM(tptr,bufptr);
156-
bufptr++;
157-
tptr++;
158-
}
159-
bufptr=buf+LPADDING;
160-
state=WORDWAIT;
161-
}
162-
else
163-
{
164-
*bufptr=*sptr;/* put in buffer */
165-
bufptr++;
166-
if (sptr-str==slen-1)
167-
gotogettrg;
168-
}
169-
}
170-
sptr++;
208+
buf[LPADDING+bytelen]=' ';
209+
buf[LPADDING+bytelen+1]=' ';
210+
211+
/*
212+
* count trigrams
213+
*/
214+
tptr=make_trigrams(tptr,buf,bytelen+LPADDING+RPADDING,
215+
charlen+LPADDING+RPADDING );
171216
}
172217

173218
pfree(buf);
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
186231
returntrg;
187232
}
188233

234+
uint32
235+
trgm2int(trgm*ptr)
236+
{
237+
uint32val=0;
238+
239+
val |=*( ((unsignedchar*)ptr) );
240+
val <<=8;
241+
val |=*( ((unsignedchar*)ptr)+1 );
242+
val <<=8;
243+
val |=*( ((unsignedchar*)ptr)+2 );
244+
245+
returnval;
246+
}
189247

190248
PG_FUNCTION_INFO_V1(show_trgm);
191249
Datumshow_trgm(PG_FUNCTION_ARGS);
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
204262

205263
for (i=0,ptr=GETARR(trg);i<ARRNELEM(trg);i++,ptr++)
206264
{
207-
text*item= (text*)palloc(VARHDRSZ+3);
265+
text*item= (text*)palloc(VARHDRSZ+Max(12,pg_database_encoding_max_length()*3));
208266

209-
SET_VARSIZE(item,VARHDRSZ+3);
210-
CPTRGM(VARDATA(item),ptr);
267+
if (pg_database_encoding_max_length()>1&& !ISPRINTABLETRGM(ptr) )
268+
{
269+
snprintf(VARDATA(item),12,"0x%06x",trgm2int(ptr));
270+
SET_VARSIZE(item,VARHDRSZ+strlen(VARDATA(item)));
271+
}
272+
else
273+
{
274+
SET_VARSIZE(item,VARHDRSZ+3);
275+
CPTRGM(VARDATA(item),ptr);
276+
}
211277
d[i]=PointerGetDatum(item);
212278
}
213279

0 commit comments

Comments
 (0)

[8]ページ先頭

©2009-2025 Movatter.jp