7
7
*
8
8
*
9
9
* IDENTIFICATION
10
- * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.1 2007/09/07 15:09:56 teodor Exp $
10
+ * $PostgreSQL: pgsql/src/backend/utils/adt/tsvector_parser.c,v 1.2 2007/10/21 22:29:56 tgl Exp $
11
11
*
12
12
*-------------------------------------------------------------------------
13
13
*/
20
20
#include "tsearch/ts_utils.h"
21
21
#include "utils/memutils.h"
22
22
23
+
24
+ /*
25
+ * Private state of tsvector parser. Note that tsquery also uses this code to
26
+ * parse its input, hence the boolean flags. The two flags are both true or
27
+ * both false in current usage, but we keep them separate for clarity.
28
+ * is_tsquery affects *only* the content of error messages.
29
+ */
23
30
struct TSVectorParseStateData
24
31
{
25
- char * prsbuf ;
26
- char * word ;/* buffer to hold the current word */
27
- int len ;/* size in bytes allocated for 'word' */
28
- bool oprisdelim ;
32
+ char * prsbuf ;/* next input character */
33
+ char * bufstart ;/* whole string (used only for errors) */
34
+ char * word ;/* buffer to hold the current word */
35
+ int len ;/* size in bytes allocated for 'word' */
36
+ int eml ;/* max bytes per character */
37
+ bool oprisdelim ;/* treat ! | * ( ) as delimiters? */
38
+ bool is_tsquery ;/* say "tsquery" not "tsvector" in errors? */
29
39
};
30
40
41
+
31
42
/*
32
43
* Initializes parser for the input string. If oprisdelim is set, the
33
44
* following characters are treated as delimiters in addition to whitespace:
34
45
* ! | & ( )
35
46
*/
36
47
TSVectorParseState
37
- init_tsvector_parser (char * input ,bool oprisdelim )
48
+ init_tsvector_parser (char * input ,bool oprisdelim , bool is_tsquery )
38
49
{
39
50
TSVectorParseState state ;
40
51
41
52
state = (TSVectorParseState )palloc (sizeof (struct TSVectorParseStateData ));
42
53
state -> prsbuf = input ;
54
+ state -> bufstart = input ;
43
55
state -> len = 32 ;
44
56
state -> word = (char * )palloc (state -> len );
57
+ state -> eml = pg_database_encoding_max_length ();
45
58
state -> oprisdelim = oprisdelim ;
59
+ state -> is_tsquery = is_tsquery ;
46
60
47
61
return state ;
48
62
}
49
63
50
64
/*
51
- * Reinitializes parser for parsing 'input', instead of previous input.
65
+ * Reinitializes parser to parse 'input', instead of previous input.
52
66
*/
53
67
void
54
68
reset_tsvector_parser (TSVectorParseState state ,char * input )
@@ -66,21 +80,21 @@ close_tsvector_parser(TSVectorParseState state)
66
80
pfree (state );
67
81
}
68
82
83
+ /* increase the size of 'word' if needed to hold one more character */
69
84
#define RESIZEPRSBUF \
70
85
do { \
71
- if ( curpos - state->word + pg_database_encoding_max_length() >= state->len ) \
86
+ int clen = curpos - state->word; \
87
+ if ( clen + state->eml >= state->len ) \
72
88
{ \
73
- int clen = curpos - state->word; \
74
89
state->len *= 2; \
75
- state->word = (char*) repalloc( (void*) state->word, state->len ); \
90
+ state->word = (char *) repalloc(state->word, state->len); \
76
91
curpos = state->word + clen; \
77
92
} \
78
93
} while (0)
79
94
80
-
81
95
#define ISOPERATOR (x )( pg_mblen(x)==1 && ( *(x)=='!' || *(x)=='&' || *(x)=='|' || *(x)=='(' || *(x)==')' ) )
82
96
83
- /* Fills the output parameters, and returns true */
97
+ /* Fills gettoken_tsvector's output parameters, and returns true */
84
98
#define RETURN_TOKEN \
85
99
do { \
86
100
if (pos_ptr != NULL) \
@@ -111,18 +125,34 @@ do { \
111
125
#define WAITPOSDELIM 7
112
126
#define WAITCHARCMPLX 8
113
127
128
+ #define PRSSYNTAXERROR prssyntaxerror(state)
129
+
130
+ static void
131
+ prssyntaxerror (TSVectorParseState state )
132
+ {
133
+ ereport (ERROR ,
134
+ (errcode (ERRCODE_SYNTAX_ERROR ),
135
+ state -> is_tsquery ?
136
+ errmsg ("syntax error in tsquery: \"%s\"" ,state -> bufstart ) :
137
+ errmsg ("syntax error in tsvector: \"%s\"" ,state -> bufstart )));
138
+ }
139
+
140
+
114
141
/*
115
- * Get next token from string being parsed. Returns false if
116
- * end of input string is reached, otherwise strval, lenval, pos_ptr
117
- * and poslen output parameters are filled in:
142
+ * Get next token from string being parsed. Returns true if successful,
143
+ * false if end of input string is reached. On success, these output
144
+ * parameters are filled in:
118
145
*
119
- * *strval token
120
- * *lenval length of *strval
146
+ * *strval pointer to token
147
+ * *lenval length of *strval
121
148
* *pos_ptr pointer to a palloc'd array of positions and weights
122
149
* associated with the token. If the caller is not interested
123
150
* in the information, NULL can be supplied. Otherwise
124
151
* the caller is responsible for pfreeing the array.
125
152
* *poslen number of elements in *pos_ptr
153
+ * *endptr scan resumption point
154
+ *
155
+ * Pass NULL for unwanted output parameters.
126
156
*/
127
157
bool
128
158
gettoken_tsvector (TSVectorParseState state ,
@@ -155,9 +185,7 @@ gettoken_tsvector(TSVectorParseState state,
155
185
oldstate = WAITENDWORD ;
156
186
}
157
187
else if (state -> oprisdelim && ISOPERATOR (state -> prsbuf ))
158
- ereport (ERROR ,
159
- (errcode (ERRCODE_SYNTAX_ERROR ),
160
- errmsg ("syntax error in tsvector" )));
188
+ PRSSYNTAXERROR ;
161
189
else if (!t_isspace (state -> prsbuf ))
162
190
{
163
191
COPYCHAR (curpos ,state -> prsbuf );
@@ -170,7 +198,8 @@ gettoken_tsvector(TSVectorParseState state,
170
198
if (* (state -> prsbuf )== '\0' )
171
199
ereport (ERROR ,
172
200
(errcode (ERRCODE_SYNTAX_ERROR ),
173
- errmsg ("there is no escaped character" )));
201
+ errmsg ("there is no escaped character: \"%s\"" ,
202
+ state -> bufstart )));
174
203
else
175
204
{
176
205
RESIZEPRSBUF ;
@@ -192,18 +221,14 @@ gettoken_tsvector(TSVectorParseState state,
192
221
{
193
222
RESIZEPRSBUF ;
194
223
if (curpos == state -> word )
195
- ereport (ERROR ,
196
- (errcode (ERRCODE_SYNTAX_ERROR ),
197
- errmsg ("syntax error in tsvector" )));
224
+ PRSSYNTAXERROR ;
198
225
* (curpos )= '\0' ;
199
226
RETURN_TOKEN ;
200
227
}
201
228
else if (t_iseq (state -> prsbuf ,':' ))
202
229
{
203
230
if (curpos == state -> word )
204
- ereport (ERROR ,
205
- (errcode (ERRCODE_SYNTAX_ERROR ),
206
- errmsg ("syntax error in tsvector" )));
231
+ PRSSYNTAXERROR ;
207
232
* (curpos )= '\0' ;
208
233
if (state -> oprisdelim )
209
234
RETURN_TOKEN ;
@@ -229,9 +254,7 @@ gettoken_tsvector(TSVectorParseState state,
229
254
oldstate = WAITENDCMPLX ;
230
255
}
231
256
else if (* (state -> prsbuf )== '\0' )
232
- ereport (ERROR ,
233
- (errcode (ERRCODE_SYNTAX_ERROR ),
234
- errmsg ("syntax error in tsvector" )));
257
+ PRSSYNTAXERROR ;
235
258
else
236
259
{
237
260
RESIZEPRSBUF ;
@@ -253,9 +276,7 @@ gettoken_tsvector(TSVectorParseState state,
253
276
RESIZEPRSBUF ;
254
277
* (curpos )= '\0' ;
255
278
if (curpos == state -> word )
256
- ereport (ERROR ,
257
- (errcode (ERRCODE_SYNTAX_ERROR ),
258
- errmsg ("syntax error in tsvector" )));
279
+ PRSSYNTAXERROR ;
259
280
if (state -> oprisdelim )
260
281
{
261
282
/* state->prsbuf+=pg_mblen(state->prsbuf); */
@@ -290,17 +311,17 @@ gettoken_tsvector(TSVectorParseState state,
290
311
}
291
312
npos ++ ;
292
313
WEP_SETPOS (pos [npos - 1 ],LIMITPOS (atoi (state -> prsbuf )));
314
+ /* we cannot get here in tsquery, so no need for 2 errmsgs */
293
315
if (WEP_GETPOS (pos [npos - 1 ])== 0 )
294
316
ereport (ERROR ,
295
317
(errcode (ERRCODE_SYNTAX_ERROR ),
296
- errmsg ("wrong position info in tsvector" )));
318
+ errmsg ("wrong position info in tsvector: \"%s\"" ,
319
+ state -> bufstart )));
297
320
WEP_SETWEIGHT (pos [npos - 1 ],0 );
298
321
statecode = WAITPOSDELIM ;
299
322
}
300
323
else
301
- ereport (ERROR ,
302
- (errcode (ERRCODE_SYNTAX_ERROR ),
303
- errmsg ("syntax error in tsvector" )));
324
+ PRSSYNTAXERROR ;
304
325
}
305
326
else if (statecode == WAITPOSDELIM )
306
327
{
@@ -309,42 +330,32 @@ gettoken_tsvector(TSVectorParseState state,
309
330
else if (t_iseq (state -> prsbuf ,'a' )|| t_iseq (state -> prsbuf ,'A' )|| t_iseq (state -> prsbuf ,'*' ))
310
331
{
311
332
if (WEP_GETWEIGHT (pos [npos - 1 ]))
312
- ereport (ERROR ,
313
- (errcode (ERRCODE_SYNTAX_ERROR ),
314
- errmsg ("syntax error in tsvector" )));
333
+ PRSSYNTAXERROR ;
315
334
WEP_SETWEIGHT (pos [npos - 1 ],3 );
316
335
}
317
336
else if (t_iseq (state -> prsbuf ,'b' )|| t_iseq (state -> prsbuf ,'B' ))
318
337
{
319
338
if (WEP_GETWEIGHT (pos [npos - 1 ]))
320
- ereport (ERROR ,
321
- (errcode (ERRCODE_SYNTAX_ERROR ),
322
- errmsg ("syntax error in tsvector" )));
339
+ PRSSYNTAXERROR ;
323
340
WEP_SETWEIGHT (pos [npos - 1 ],2 );
324
341
}
325
342
else if (t_iseq (state -> prsbuf ,'c' )|| t_iseq (state -> prsbuf ,'C' ))
326
343
{
327
344
if (WEP_GETWEIGHT (pos [npos - 1 ]))
328
- ereport (ERROR ,
329
- (errcode (ERRCODE_SYNTAX_ERROR ),
330
- errmsg ("syntax error in tsvector" )));
345
+ PRSSYNTAXERROR ;
331
346
WEP_SETWEIGHT (pos [npos - 1 ],1 );
332
347
}
333
348
else if (t_iseq (state -> prsbuf ,'d' )|| t_iseq (state -> prsbuf ,'D' ))
334
349
{
335
350
if (WEP_GETWEIGHT (pos [npos - 1 ]))
336
- ereport (ERROR ,
337
- (errcode (ERRCODE_SYNTAX_ERROR ),
338
- errmsg ("syntax error in tsvector" )));
351
+ PRSSYNTAXERROR ;
339
352
WEP_SETWEIGHT (pos [npos - 1 ],0 );
340
353
}
341
354
else if (t_isspace (state -> prsbuf )||
342
355
* (state -> prsbuf )== '\0' )
343
356
RETURN_TOKEN ;
344
357
else if (!t_isdigit (state -> prsbuf ))
345
- ereport (ERROR ,
346
- (errcode (ERRCODE_SYNTAX_ERROR ),
347
- errmsg ("syntax error in tsvector" )));
358
+ PRSSYNTAXERROR ;
348
359
}
349
360
else /* internal error */
350
361
elog (ERROR ,"internal error in gettoken_tsvector" );