11/*
2- * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.10 2008/05/17 01:28:21 adunstan Exp $
2+ * $PostgreSQL: pgsql/contrib/pg_trgm/trgm_op.c,v 1.11 2008/11/12 13:43:54 teodor Exp $
33 */
44#include "trgm.h"
55#include <ctype.h>
66#include "utils/array.h"
77#include "catalog/pg_type.h"
8+ #include "tsearch/ts_locale.h"
89
910PG_MODULE_MAGIC ;
1011
@@ -31,9 +32,6 @@ show_limit(PG_FUNCTION_ARGS)
3132PG_RETURN_FLOAT4 (trgm_limit );
3233}
3334
34- #define WORDWAIT 0
35- #define INWORD 1
36-
3735static int
3836comp_trgm (const void * a ,const void * b )
3937{
@@ -60,18 +58,119 @@ unique_array(trgm * a, int len)
6058return curend + 1 - a ;
6159}
6260
/*
 * iswordchr(c) tests whether c counts as part of a word.  The argument is
 * handed unchanged to the t_is* helpers.
 *
 * With KEEPONLYALNUM defined only letters and digits qualify; without it,
 * anything that is not whitespace does.
 */
#ifndef KEEPONLYALNUM
#define iswordchr(c)	(!t_isspace(c))
#else
#define iswordchr(c)	(t_isalpha(c) || t_isdigit(c))
#endif
66+
67+ /*
68+ * Finds first word in string, returns pointer to the word,
69+ * endword points to the character after word
70+ */
71+ static char *
72+ find_word (char * str ,int lenstr ,char * * endword ,int * charlen )
73+ {
74+ char * beginword = str ;
75+
76+ while (beginword - str < lenstr && !iswordchr (beginword ) )
77+ beginword += pg_mblen (beginword );
78+
79+ if (beginword - str >=lenstr )
80+ return NULL ;
81+
82+ * endword = beginword ;
83+ * charlen = 0 ;
84+ while (* endword - str < lenstr && iswordchr (* endword ) )
85+ {
86+ * endword += pg_mblen (* endword );
87+ (* charlen )++ ;
88+ }
89+
90+ return beginword ;
91+ }
92+
#ifdef USE_WIDE_UPPER_LOWER
/*
 * Store a single trigram at tptr, built from the bytelen bytes at str.
 *
 * A sequence of exactly three bytes is copied verbatim.  Any other length
 * is run through CRC32 and the trigram is taken from the resulting
 * checksum instead.
 */
static void
cnt_trigram(trgm *tptr, char *str, int bytelen)
{
	pg_crc32	hash;

	if (bytelen == 3)
	{
		CPTRGM(tptr, str);
		return;
	}

	INIT_CRC32(hash);
	COMP_CRC32(hash, str, bytelen);
	FIN_CRC32(hash);

	/*
	 * Only part of the CRC fits into a trigram (the original note calls it
	 * the 3 upper bytes); hopefully that is still a good enough hash.
	 */
	CPTRGM(tptr, &hash);
}
#endif
117+
118+ /*
119+ * Adds trigramm from words (already padded).
120+ */
121+ static trgm *
122+ make_trigrams (trgm * tptr ,char * str ,int bytelen ,int charlen )
123+ {
124+ char * ptr = str ;
125+
126+ if (charlen < 3 )
127+ return tptr ;
128+
129+ #ifdef USE_WIDE_UPPER_LOWER
130+ if (pg_database_encoding_max_length ()> 1 )
131+ {
132+ int lenfirst = pg_mblen (str ),
133+ lenmiddle = pg_mblen (str + lenfirst ),
134+ lenlast = pg_mblen (str + lenfirst + lenmiddle );
135+
136+ while ( (ptr - str )+ lenfirst + lenmiddle + lenlast <=bytelen )
137+ {
138+ cnt_trigram (tptr ,ptr ,lenfirst + lenmiddle + lenlast );
139+
140+ ptr += lenfirst ;
141+ tptr ++ ;
142+
143+ lenfirst = lenmiddle ;
144+ lenmiddle = lenlast ;
145+ lenlast = pg_mblen (ptr + lenfirst + lenmiddle );
146+ }
147+ }
148+ else
149+ #endif
150+ {
151+ Assert (bytelen == charlen );
152+
153+ while (ptr - str < bytelen - 2 /* number of trigrams = strlen - 2 */ )
154+ {
155+ CPTRGM (tptr ,ptr );
156+ ptr ++ ;
157+ tptr ++ ;
158+ }
159+ }
160+
161+ return tptr ;
162+ }
63163
64164TRGM *
65165generate_trgm (char * str ,int slen )
66166{
67167TRGM * trg ;
68- char * buf ,
69- * sptr ,
70- * bufptr ;
168+ char * buf ;
71169trgm * tptr ;
72- int state = WORDWAIT ;
73- int wl ,
74- len ;
170+ int len ,
171+ charlen ,
172+ bytelen ;
173+ char * bword ,* eword ;
75174
76175trg = (TRGM * )palloc (TRGMHDRSIZE + sizeof (trgm )* (slen /2 + 1 )* 3 );
77176trg -> flag = ARRKEY ;
@@ -83,7 +182,6 @@ generate_trgm(char *str, int slen)
83182tptr = GETARR (trg );
84183
85184buf = palloc (sizeof (char )* (slen + 4 ));
86- sptr = str ;
87185
88186if (LPADDING > 0 )
89187{
@@ -92,82 +190,29 @@ generate_trgm(char *str, int slen)
92190* (buf + 1 )= ' ' ;
93191}
94192
95- bufptr = buf + LPADDING ;
96- while ( sptr - str < slen )
193+ eword = str ;
194+ while ( ( bword = find_word ( eword , slen - ( eword - str ), & eword , & charlen )) != NULL )
97195{
98- if (state == WORDWAIT )
99- {
100- if (
101- #ifdef KEEPONLYALNUM
102- isalnum ((unsignedchar )* sptr )
103- #else
104- !isspace ((unsignedchar )* sptr )
105- #endif
106- )
107- {
108- * bufptr = * sptr ;/* start put word in buffer */
109- bufptr ++ ;
110- state = INWORD ;
111- if (sptr - str == slen - 1 /* last char */ )
112- gotogettrg ;
113- }
114- }
115- else
116- {
117- if (
118- #ifdef KEEPONLYALNUM
119- !isalnum ((unsignedchar )* sptr )
196+ #ifdef IGNORECASE
197+ bword = lowerstr_with_len (bword ,eword - bword );
198+ bytelen = strlen (bword );
120199#else
121- isspace (( unsigned char ) * sptr )
200+ bytelen = eword - bword ;
122201#endif
123- )
124- {
125- gettrg :
126- /* word in buffer, so count trigrams */
127- * bufptr = ' ' ;
128- * (bufptr + 1 )= ' ' ;
129- wl = bufptr - (buf + LPADDING )- 2 + LPADDING + RPADDING ;
130- if (wl <=0 )
131- {
132- bufptr = buf + LPADDING ;
133- state = WORDWAIT ;
134- sptr ++ ;
135- continue ;
136- }
202+
203+ memcpy (buf + LPADDING ,bword ,bytelen );
137204
138205#ifdef IGNORECASE
139- do
140- {/* lower word */
141- int wwl = bufptr - buf ;
142-
143- bufptr = buf + LPADDING ;
144- while (bufptr - buf < wwl )
145- {
146- * bufptr = tolower ((unsignedchar )* bufptr );
147- bufptr ++ ;
148- }
149- }while (0 );
206+ pfree (bword );
150207#endif
151- bufptr = buf ;
152- /* set trigrams */
153- while (bufptr - buf < wl )
154- {
155- CPTRGM (tptr ,bufptr );
156- bufptr ++ ;
157- tptr ++ ;
158- }
159- bufptr = buf + LPADDING ;
160- state = WORDWAIT ;
161- }
162- else
163- {
164- * bufptr = * sptr ;/* put in buffer */
165- bufptr ++ ;
166- if (sptr - str == slen - 1 )
167- gotogettrg ;
168- }
169- }
170- sptr ++ ;
208+ buf [LPADDING + bytelen ]= ' ' ;
209+ buf [LPADDING + bytelen + 1 ]= ' ' ;
210+
211+ /*
212+ * count trigrams
213+ */
214+ tptr = make_trigrams (tptr ,buf ,bytelen + LPADDING + RPADDING ,
215+ charlen + LPADDING + RPADDING );
171216}
172217
173218pfree (buf );
@@ -186,6 +231,19 @@ generate_trgm(char *str, int slen)
186231return trg ;
187232}
188233
234+ uint32
235+ trgm2int (trgm * ptr )
236+ {
237+ uint32 val = 0 ;
238+
239+ val |=* ( ((unsignedchar * )ptr ) );
240+ val <<=8 ;
241+ val |=* ( ((unsignedchar * )ptr )+ 1 );
242+ val <<=8 ;
243+ val |=* ( ((unsignedchar * )ptr )+ 2 );
244+
245+ return val ;
246+ }
189247
190248PG_FUNCTION_INFO_V1 (show_trgm );
191249Datum show_trgm (PG_FUNCTION_ARGS );
@@ -204,10 +262,18 @@ show_trgm(PG_FUNCTION_ARGS)
204262
205263for (i = 0 ,ptr = GETARR (trg );i < ARRNELEM (trg );i ++ ,ptr ++ )
206264{
207- text * item = (text * )palloc (VARHDRSZ + 3 );
265+ text * item = (text * )palloc (VARHDRSZ + Max ( 12 , pg_database_encoding_max_length () * 3 ) );
208266
209- SET_VARSIZE (item ,VARHDRSZ + 3 );
210- CPTRGM (VARDATA (item ),ptr );
267+ if (pg_database_encoding_max_length ()> 1 && !ISPRINTABLETRGM (ptr ) )
268+ {
269+ snprintf (VARDATA (item ),12 ,"0x%06x" ,trgm2int (ptr ));
270+ SET_VARSIZE (item ,VARHDRSZ + strlen (VARDATA (item )));
271+ }
272+ else
273+ {
274+ SET_VARSIZE (item ,VARHDRSZ + 3 );
275+ CPTRGM (VARDATA (item ),ptr );
276+ }
211277d [i ]= PointerGetDatum (item );
212278}
213279