77 *
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.5 2007/10/21 22:29:56 tgl Exp $
10+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector.c,v 1.6 2007/10/23 00:51:23 tgl Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
2222
/*
 * Working representation of one lexeme while tsvector input is being parsed:
 * the WordEntry that is eventually copied into the output array, together
 * with the lexeme's position list, which is kept in a separate array until
 * the final tsvector is assembled.
 */
2323typedef struct
2424{
25- WordEntry entry ;/* should be first ! */
25+ WordEntry entry ;/* must be first! */
/* positions attached to this lexeme; only consulted when entry.haspos is set */
2626WordEntryPos * pos ;
2727int poslen ;/* number of elements in pos */
2828}WordEntryIN ;
2929
30+
31+ /* Compare two WordEntryPos values for qsort */
3032static int
3133comparePos (const void * a ,const void * b )
3234{
33- int apos = WEP_GETPOS (* (WordEntryPos * )a );
34- int bpos = WEP_GETPOS (* (WordEntryPos * )b );
35+ int apos = WEP_GETPOS (* (const WordEntryPos * )a );
36+ int bpos = WEP_GETPOS (* (const WordEntryPos * )b );
3537
3638if (apos == bpos )
3739return 0 ;
@@ -53,17 +55,18 @@ uniquePos(WordEntryPos * a, int l)
5355if (l <=1 )
5456return l ;
5557
56- res = a ;
5758qsort ((void * )a ,l ,sizeof (WordEntryPos ),comparePos );
5859
60+ res = a ;
5961ptr = a + 1 ;
6062while (ptr - a < l )
6163{
6264if (WEP_GETPOS (* ptr )!= WEP_GETPOS (* res ))
6365{
6466res ++ ;
6567* res = * ptr ;
66- if (res - a >=MAXNUMPOS - 1 || WEP_GETPOS (* res )== MAXENTRYPOS - 1 )
68+ if (res - a >=MAXNUMPOS - 1 ||
69+ WEP_GETPOS (* res )== MAXENTRYPOS - 1 )
6770break ;
6871}
6972else if (WEP_GETWEIGHT (* ptr )> WEP_GETWEIGHT (* res ))
@@ -74,12 +77,13 @@ uniquePos(WordEntryPos * a, int l)
7477return res + 1 - a ;
7578}
7679
80+ /* Compare two WordEntryIN values for qsort */
7781static int
7882compareentry (const void * va ,const void * vb ,void * arg )
7983{
84+ const WordEntryIN * a = (const WordEntryIN * )va ;
85+ const WordEntryIN * b = (const WordEntryIN * )vb ;
8086char * BufferStr = (char * )arg ;
81- WordEntryIN * a = (WordEntryIN * )va ;
82- WordEntryIN * b = (WordEntryIN * )vb ;
8387
8488if (a -> entry .len == b -> entry .len )
8589{
@@ -91,82 +95,78 @@ compareentry(const void *va, const void *vb, void *arg)
9195return (a -> entry .len > b -> entry .len ) ?1 :-1 ;
9296}
9397
98+ /*
99+ * Sort an array of WordEntryIN, remove duplicates.
100+ * *outbuflen receives the amount of space needed for strings and positions.
101+ */
/*
 * Parameters:
 *   a         - array of l entries; reordered and compacted in place
 *   l         - number of entries in a (asserted to be at least 1)
 *   buf       - buffer holding the lexeme text; each entry.pos is an offset
 *               into it, and it is also passed to compareentry as its arg
 *   outbuflen - output: bytes needed for lexeme strings plus position data
 *
 * Returns the number of distinct entries left at the front of a.
 * Entries with the same length and bytes are merged into one; their
 * position arrays are concatenated and de-duplicated with uniquePos().
 */
94102static int
95103uniqueentry (WordEntryIN * a ,int l ,char * buf ,int * outbuflen )
96104{
105+ int buflen ;
97106WordEntryIN * ptr ,
98107* res ;
99108

100109Assert (l >=1 );
101110
102- if (l == 1 )
103- {
104- if (a -> entry .haspos )
105- {
106- a -> poslen = uniquePos (a -> pos ,a -> poslen );
107- * outbuflen = SHORTALIGN (a -> entry .len )+ (a -> poslen + 1 )* sizeof (WordEntryPos );
108- }
109- else
110- * outbuflen = a -> entry .len ;
/*
 * A single entry needs no sorting; it skips the merge loop below (its
 * condition is false at once) and is sized by the "last item" code.
 */
111+ if (l > 1 )
112+ qsort_arg ((void * )a ,l ,sizeof (WordEntryIN ),compareentry ,
113+ (void * )buf );
111114
112- return l ;
113- }
115+ buflen = 0 ;
114116res = a ;
115-
116117ptr = a + 1 ;
117- qsort_arg ((void * )a ,l ,sizeof (WordEntryIN ),compareentry , (void * )buf );
118-
119118while (ptr - a < l )
120119{
/* entries are distinct unless both the length and the bytes match */
121120if (!(ptr -> entry .len == res -> entry .len &&
122- strncmp (& buf [ptr -> entry .pos ],& buf [res -> entry .pos ],res -> entry .len )== 0 ))
121+ strncmp (& buf [ptr -> entry .pos ],& buf [res -> entry .pos ],
122+ res -> entry .len )== 0 ))
123123{
124+ /* done accumulating data into *res, count space needed */
125+ buflen += res -> entry .len ;
124126if (res -> entry .haspos )
125127{
126- * outbuflen += SHORTALIGN (res -> entry .len );
127128res -> poslen = uniquePos (res -> pos ,res -> poslen );
128- * outbuflen += res -> poslen * sizeof (WordEntryPos );
/*
 * Mirror the layout tsvectorin writes: the running offset is
 * short-aligned, then a uint16 position count, then the positions.
 */
129+ buflen = SHORTALIGN (buflen );
130+ buflen += res -> poslen * sizeof (WordEntryPos )+ sizeof (uint16 );
129131}
130- else
131- * outbuflen += res -> entry .len ;
132132res ++ ;
133133memcpy (res ,ptr ,sizeof (WordEntryIN ));
134134}
135135else if (ptr -> entry .haspos )
136136{
137137if (res -> entry .haspos )
138138{
139+ /* append ptr's positions to res's positions */
139140int newlen = ptr -> poslen + res -> poslen ;
140141
141- /* Append res to pos */
142-
143- res -> pos = (WordEntryPos * )repalloc (res -> pos ,newlen * sizeof (WordEntryPos ));
144- memcpy (& res -> pos [res -> poslen ],
145- ptr -> pos ,ptr -> poslen * sizeof (WordEntryPos ));
142+ res -> pos = (WordEntryPos * )
143+ repalloc (res -> pos ,newlen * sizeof (WordEntryPos ));
144+ memcpy (& res -> pos [res -> poslen ],ptr -> pos ,
145+ ptr -> poslen * sizeof (WordEntryPos ));
146146res -> poslen = newlen ;
147147pfree (ptr -> pos );
148148}
149149else
150150{
151+ /* just give ptr's positions to res */
151152res -> entry .haspos = 1 ;
152153res -> pos = ptr -> pos ;
/* take the length too, so the later uniquePos() call sees the adopted array's size */
154+ res -> poslen = ptr -> poslen ;
153155}
154156}
155157ptr ++ ;
156158}
157159
158- /* add last item */
159-
160+ /* count space needed for last item */
161+ buflen += res -> entry . len ;
160162if (res -> entry .haspos )
161163{
162- * outbuflen += SHORTALIGN (res -> entry .len );
163-
164164res -> poslen = uniquePos (res -> pos ,res -> poslen );
165- * outbuflen += res -> poslen * sizeof (WordEntryPos );
/* same accounting as for the entries finished inside the loop */
165+ buflen = SHORTALIGN (buflen );
166+ buflen += res -> poslen * sizeof (WordEntryPos )+ sizeof (uint16 );
166167}
167- else
168- * outbuflen += res -> entry .len ;
169168
/* assign rather than accumulate, so the caller's previous value of *outbuflen is irrelevant */
169+ * outbuflen = buflen ;
170170return res + 1 - a ;
171171}
172172
@@ -193,6 +193,8 @@ tsvectorin(PG_FUNCTION_ARGS)
193193int toklen ;
194194WordEntryPos * pos ;
195195int poslen ;
196+ char * strbuf ;
197+ int stroff ;
196198
197199/*
198200 * Tokens are appended to tmpbuf, cur is a pointer
@@ -212,27 +214,26 @@ tsvectorin(PG_FUNCTION_ARGS)
212214
213215while (gettoken_tsvector (state ,& token ,& toklen ,& pos ,& poslen ,NULL ))
214216{
215-
216217if (toklen >=MAXSTRLEN )
217218ereport (ERROR ,
218- (errcode (ERRCODE_SYNTAX_ERROR ),
219+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
219220errmsg ("word is too long (%ld bytes, max %ld bytes)" ,
220221(long )toklen ,
221- (long )MAXSTRLEN )));
222-
222+ (long ) (MAXSTRLEN - 1 ))));
223223
224224if (cur - tmpbuf > MAXSTRPOS )
225225ereport (ERROR ,
226- (errcode (ERRCODE_SYNTAX_ERROR ),
227- errmsg ("position value is toolarge " )));
226+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
227+ errmsg ("string is toolong for tsvector " )));
228228
229229/*
230230 * Enlarge buffers if needed
231231 */
232232if (len >=arrlen )
233233{
234234arrlen *=2 ;
235- arr = (WordEntryIN * )repalloc ((void * )arr ,sizeof (WordEntryIN )* arrlen );
235+ arr = (WordEntryIN * )
236+ repalloc ((void * )arr ,sizeof (WordEntryIN )* arrlen );
236237}
237238while ((cur - tmpbuf )+ toklen >=buflen )
238239{
@@ -254,7 +255,11 @@ tsvectorin(PG_FUNCTION_ARGS)
254255arr [len ].poslen = poslen ;
255256}
256257else
258+ {
257259arr [len ].entry .haspos = 0 ;
260+ arr [len ].pos = NULL ;
261+ arr [len ].poslen = 0 ;
262+ }
258263len ++ ;
259264}
260265
@@ -264,40 +269,45 @@ tsvectorin(PG_FUNCTION_ARGS)
264269len = uniqueentry (arr ,len ,tmpbuf ,& buflen );
265270else
266271buflen = 0 ;
272+
273+ if (buflen > MAXSTRPOS )
274+ ereport (ERROR ,
275+ (errcode (ERRCODE_PROGRAM_LIMIT_EXCEEDED ),
276+ errmsg ("string is too long for tsvector" )));
277+
267278totallen = CALCDATASIZE (len ,buflen );
268279in = (TSVector )palloc0 (totallen );
269-
270280SET_VARSIZE (in ,totallen );
271281in -> size = len ;
272- cur = STRPTR (in );
273282inarr = ARRPTR (in );
283+ strbuf = STRPTR (in );
284+ stroff = 0 ;
274285for (i = 0 ;i < len ;i ++ )
275286{
276- memcpy (( void * ) cur , ( void * ) & tmpbuf [arr [i ].entry .pos ],arr [i ].entry .len );
277- arr [i ].entry .pos = cur - STRPTR ( in ) ;
278- cur += SHORTALIGN ( arr [i ].entry .len ) ;
287+ memcpy (strbuf + stroff , & tmpbuf [arr [i ].entry .pos ],arr [i ].entry .len );
288+ arr [i ].entry .pos = stroff ;
289+ stroff += arr [i ].entry .len ;
279290if (arr [i ].entry .haspos )
280291{
281- uint16 tmplen ;
282-
283- if (arr [i ].poslen > 0xFFFF )
292+ if (arr [i ].poslen > 0xFFFF )
284293elog (ERROR ,"positions array too long" );
285294
286- tmplen = (uint16 )arr [i ].poslen ;
287-
288- /* Copy length to output struct */
289- memcpy (cur ,& tmplen ,sizeof (uint16 ));
290- cur += sizeof (uint16 );
295+ /* Copy number of positions */
296+ stroff = SHORTALIGN (stroff );
297+ * (uint16 * ) (strbuf + stroff )= (uint16 )arr [i ].poslen ;
298+ stroff += sizeof (uint16 );
291299
292300/* Copy positions */
293- memcpy (cur ,arr [i ].pos ,( arr [i ].poslen ) * sizeof (WordEntryPos ));
294- cur += arr [i ].poslen * sizeof (WordEntryPos );
301+ memcpy (strbuf + stroff ,arr [i ].pos ,arr [i ].poslen * sizeof (WordEntryPos ));
302+ stroff += arr [i ].poslen * sizeof (WordEntryPos );
295303
296304pfree (arr [i ].pos );
297305}
298306inarr [i ]= arr [i ].entry ;
299307}
300308
309+ Assert ((strbuf + stroff - (char * )in )== totallen );
310+
301311PG_RETURN_TSVECTOR (in );
302312}
303313
@@ -495,11 +505,12 @@ tsvectorrecv(PG_FUNCTION_ARGS)
495505
496506datalen += lex_len ;
497507
498- if (i > 0 && WordEntryCMP (& vec -> entries [i ],& vec -> entries [i - 1 ],STRPTR (vec )) <=0 )
508+ if (i > 0 && WordEntryCMP (& vec -> entries [i ],
509+ & vec -> entries [i - 1 ],
510+ STRPTR (vec )) <=0 )
499511elog (ERROR ,"lexemes are misordered" );
500512
501513/* Receive positions */
502-
503514if (npos > 0 )
504515{
505516uint16 j ;