77 *
88 *
99 * IDENTIFICATION
10- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $
10+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29:56 tgl Exp $
1111 *
1212 *-------------------------------------------------------------------------
1313 */
2020#include "tsearch/ts_utils.h"
2121#include "utils/memutils.h"
2222
23+
24+ /*
25+ * Private state of tsvector parser. Note that tsquery also uses this code to
26+ * parse its input, hence the boolean flags. The two flags are both true or
27+ * both false in current usage, but we keep them separate for clarity.
28+ * is_tsquery affects *only* the content of error messages.
29+ */
2330struct TSVectorParseStateData
2431{
25- char * prsbuf ;
26- char * word ;/* buffer to hold the current word */
27- int len ;/* size in bytes allocated for 'word' */
28- bool oprisdelim ;
32+ char * prsbuf ;/* next input character */
33+ char * bufstart ;/* whole string (used only for errors) */
34+ char * word ;/* buffer to hold the current word */
35+ int len ;/* size in bytes allocated for 'word' */
36+ int eml ;/* max bytes per character */
37+ bool oprisdelim ;/* treat ! | & ( ) as delimiters? (the set tested by ISOPERATOR) */
38+ bool is_tsquery ;/* say "tsquery" not "tsvector" in errors? */
2939};
3040
41+
3142/*
3243 * Initializes parser for the input string. If oprisdelim is set, the
3344 * following characters are treated as delimiters in addition to whitespace:
3445 * ! | & ( )
3546 */
3647TSVectorParseState
37- init_tsvector_parser (char * input ,bool oprisdelim )
48+ init_tsvector_parser (char * input ,bool oprisdelim , bool is_tsquery )
3849{
3950TSVectorParseState state ;
4051
4152state = (TSVectorParseState )palloc (sizeof (struct TSVectorParseStateData ));
4253state -> prsbuf = input ;
54+ state -> bufstart = input ; /* start of the input, quoted in error messages */
4355state -> len = 32 ; /* initial size of 'word'; RESIZEPRSBUF doubles it on demand */
4456state -> word = (char * )palloc (state -> len );
57+ state -> eml = pg_database_encoding_max_length (); /* fetched once here; RESIZEPRSBUF reads state->eml */
4558state -> oprisdelim = oprisdelim ;
59+ state -> is_tsquery = is_tsquery ; /* only selects the wording of error messages */
4660
4761return state ;
4862}
4963
5064/*
51- * Reinitializes parser for parsing 'input', instead of previous input.
65+ * Reinitializes parser to parse 'input', instead of previous input.
5266 */
5367void
5468reset_tsvector_parser (TSVectorParseState state ,char * input )
@@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state)
6680pfree (state );
6781}
6882
83+ /* double the size of 'word' when it cannot hold one more character of up to state->eml bytes; reads the caller's local 'curpos' and re-bases it onto the reallocated buffer */
6984#define RESIZEPRSBUF \
7085do { \
71- if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
86+ int clen = curpos - state->word; \
87+ if ( clen + state->eml >= state->len ) \
7288{ \
73- int clen = curpos - state->word; \
7489state->len *= 2; \
75- state->word = (char*) repalloc( (void*) state->word, state->len ); \
90+ state->word = (char *) repalloc(state->word, state->len); \
7691curpos = state->word + clen; \
7792} \
7893} while (0)
7994
80-
8195#define ISOPERATOR (x )( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) ) /* true when x points at a one-byte character that is one of ! & | ( ) */
8296
83- /* Fills the output parameters, and returns true */
97+ /* Fills gettoken_tsvector's output parameters, and returns true */
8498#define RETURN_TOKEN \
8599do { \
86100if (pos_ptr != NULL) \
@@ -111,18 +125,34 @@ do { \
111125#define WAITPOSDELIM 7
112126#define WAITCHARCMPLX 8
113127
128+ #define PRSSYNTAXERROR prssyntaxerror(state) /* shorthand; expands using the caller's local 'state' */
129+
130+ static void
131+ prssyntaxerror (TSVectorParseState state ) /* raise a syntax error that quotes the whole input string */
132+ {
133+ ereport (ERROR ,
134+ (errcode (ERRCODE_SYNTAX_ERROR ),
135+ state -> is_tsquery ? /* flag only picks which type name appears in the message */
136+ errmsg ("syntax error in tsquery: \"%s\"" ,state -> bufstart ) :
137+ errmsg ("syntax error in tsvector: \"%s\"" ,state -> bufstart )));
138+ }
139+
140+
114141/*
115- * Get next token from string being parsed. Returns false if
116- * end of input string is reached, otherwise strval, lenval, pos_ptr
117- * and poslen output parameters are filled in:
142+ * Get next token from string being parsed. Returns true if successful,
143+ * false if end of input string is reached. On success, these output
144+ * parameters are filled in:
118145 *
119- * *strval token
120- * *lenval length of *strval
146+ * *strval pointer to token
147+ * *lenval length of *strval
121148 * *pos_ptr pointer to a palloc'd array of positions and weights
122149 * associated with the token. If the caller is not interested
123150 * in the information, NULL can be supplied. Otherwise
124151 * the caller is responsible for pfreeing the array.
125152 * *poslen number of elements in *pos_ptr
153+ * *endptr scan resumption point
154+ *
155+ * Pass NULL for unwanted output parameters.
126156 */
127157bool
128158gettoken_tsvector (TSVectorParseState state ,
@@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state,
155185oldstate = WAITENDWORD ;
156186}
157187else if (state -> oprisdelim && ISOPERATOR (state -> prsbuf ))
158- ereport (ERROR ,
159- (errcode (ERRCODE_SYNTAX_ERROR ),
160- errmsg ("syntax error in tsvector" )));
188+ PRSSYNTAXERROR ;
161189else if (!t_isspace (state -> prsbuf ))
162190{
163191COPYCHAR (curpos ,state -> prsbuf );
@@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state,
170198if (* (state -> prsbuf )== '\0' )
171199ereport (ERROR ,
172200(errcode (ERRCODE_SYNTAX_ERROR ),
173- errmsg ("there is no escaped character" )));
201+ errmsg ("there is no escaped character: \"%s\"" ,
202+ state -> bufstart )));
174203else
175204{
176205RESIZEPRSBUF ;
@@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state,
192221{
193222RESIZEPRSBUF ;
194223if (curpos == state -> word )
195- ereport (ERROR ,
196- (errcode (ERRCODE_SYNTAX_ERROR ),
197- errmsg ("syntax error in tsvector" )));
224+ PRSSYNTAXERROR ;
198225* (curpos )= '\0' ;
199226RETURN_TOKEN ;
200227}
201228else if (t_iseq (state -> prsbuf ,':' ))
202229{
203230if (curpos == state -> word )
204- ereport (ERROR ,
205- (errcode (ERRCODE_SYNTAX_ERROR ),
206- errmsg ("syntax error in tsvector" )));
231+ PRSSYNTAXERROR ;
207232* (curpos )= '\0' ;
208233if (state -> oprisdelim )
209234RETURN_TOKEN ;
@@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state,
229254oldstate = WAITENDCMPLX ;
230255}
231256else if (* (state -> prsbuf )== '\0' )
232- ereport (ERROR ,
233- (errcode (ERRCODE_SYNTAX_ERROR ),
234- errmsg ("syntax error in tsvector" )));
257+ PRSSYNTAXERROR ;
235258else
236259{
237260RESIZEPRSBUF ;
@@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
253276RESIZEPRSBUF ;
254277* (curpos )= '\0' ;
255278if (curpos == state -> word )
256- ereport (ERROR ,
257- (errcode (ERRCODE_SYNTAX_ERROR ),
258- errmsg ("syntax error in tsvector" )));
279+ PRSSYNTAXERROR ;
259280if (state -> oprisdelim )
260281{
261282/* state->prsbuf+=pg_mblen(state->prsbuf); */
@@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state,
290311}
291312npos ++ ;
292313WEP_SETPOS (pos [npos - 1 ],LIMITPOS (atoi (state -> prsbuf )));
314+ /* we cannot get here in tsquery, so no need for 2 errmsgs */
293315if (WEP_GETPOS (pos [npos - 1 ])== 0 )
294316ereport (ERROR ,
295317(errcode (ERRCODE_SYNTAX_ERROR ),
296- errmsg ("wrong position info in tsvector" )));
318+ errmsg ("wrong position info in tsvector: \"%s\"" ,
319+ state -> bufstart )));
297320WEP_SETWEIGHT (pos [npos - 1 ],0 );
298321statecode = WAITPOSDELIM ;
299322}
300323else
301- ereport (ERROR ,
302- (errcode (ERRCODE_SYNTAX_ERROR ),
303- errmsg ("syntax error in tsvector" )));
324+ PRSSYNTAXERROR ;
304325}
305326else if (statecode == WAITPOSDELIM )
306327{
@@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state,
309330else if (t_iseq (state -> prsbuf ,'a' )|| t_iseq (state -> prsbuf ,'A' )|| t_iseq (state -> prsbuf ,'*' ))
310331{
311332if (WEP_GETWEIGHT (pos [npos - 1 ]))
312- ereport (ERROR ,
313- (errcode (ERRCODE_SYNTAX_ERROR ),
314- errmsg ("syntax error in tsvector" )));
333+ PRSSYNTAXERROR ;
315334WEP_SETWEIGHT (pos [npos - 1 ],3 );
316335}
317336else if (t_iseq (state -> prsbuf ,'b' )|| t_iseq (state -> prsbuf ,'B' ))
318337{
319338if (WEP_GETWEIGHT (pos [npos - 1 ]))
320- ereport (ERROR ,
321- (errcode (ERRCODE_SYNTAX_ERROR ),
322- errmsg ("syntax error in tsvector" )));
339+ PRSSYNTAXERROR ;
323340WEP_SETWEIGHT (pos [npos - 1 ],2 );
324341}
325342else if (t_iseq (state -> prsbuf ,'c' )|| t_iseq (state -> prsbuf ,'C' ))
326343{
327344if (WEP_GETWEIGHT (pos [npos - 1 ]))
328- ereport (ERROR ,
329- (errcode (ERRCODE_SYNTAX_ERROR ),
330- errmsg ("syntax error in tsvector" )));
345+ PRSSYNTAXERROR ;
331346WEP_SETWEIGHT (pos [npos - 1 ],1 );
332347}
333348else if (t_iseq (state -> prsbuf ,'d' )|| t_iseq (state -> prsbuf ,'D' ))
334349{
335350if (WEP_GETWEIGHT (pos [npos - 1 ]))
336- ereport (ERROR ,
337- (errcode (ERRCODE_SYNTAX_ERROR ),
338- errmsg ("syntax error in tsvector" )));
351+ PRSSYNTAXERROR ;
339352WEP_SETWEIGHT (pos [npos - 1 ],0 );
340353}
341354else if (t_isspace (state -> prsbuf )||
342355* (state -> prsbuf )== '\0' )
343356RETURN_TOKEN ;
344357else if (!t_isdigit (state -> prsbuf ))
345- ereport (ERROR ,
346- (errcode (ERRCODE_SYNTAX_ERROR ),
347- errmsg ("syntax error in tsvector" )));
358+ PRSSYNTAXERROR ;
348359}
349360else /* internal error */
350361elog (ERROR ,"internal error in gettoken_tsvector" );