Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsvector_parser.c
4 : * Parser for tsvector
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/utils/adt/tsvector_parser.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 :
15 : #include "postgres.h"
16 :
17 : #include "tsearch/ts_locale.h"
18 : #include "tsearch/ts_utils.h"
19 :
20 :
/*
 * Private state of tsvector parser.  Note that tsquery also uses this code to
 * parse its input, hence the boolean flags.  The two flags are both true or
 * both false in current usage, but we keep them separate for clarity.
 * is_tsquery affects *only* the content of error messages.
 *
 * 'word' is a scratch buffer owned by the parser: every token handed back by
 * gettoken_tsvector() points into it, and it is enlarged on demand.
 */
struct TSVectorParseStateData
{
	char	   *prsbuf;			/* next input character */
	char	   *bufstart;		/* whole string (used only for errors) */
	char	   *word;			/* buffer to hold the current word */
	int			len;			/* size in bytes allocated for 'word' */
	int			eml;			/* max bytes per character */
	bool		oprisdelim;		/* treat ! & | ( ) < as delimiters (see
								 * ISOPERATOR), and end a token at ':'
								 * without parsing position info? */
	bool		is_tsquery;		/* say "tsquery" not "tsvector" in errors? */
};
37 :
38 :
39 : /*
40 : * Initializes parser for the input string. If oprisdelim is set, the
41 : * following characters are treated as delimiters in addition to whitespace:
42 : * ! | & ( )
43 : */
44 : TSVectorParseState
45 1028 : init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
46 : {
47 : TSVectorParseState state;
48 :
49 1028 : state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
50 1028 : state->prsbuf = input;
51 1028 : state->bufstart = input;
52 1028 : state->len = 32;
53 1028 : state->word = (char *) palloc(state->len);
54 1028 : state->eml = pg_database_encoding_max_length();
55 1028 : state->oprisdelim = oprisdelim;
56 1028 : state->is_tsquery = is_tsquery;
57 :
58 1028 : return state;
59 : }
60 :
61 : /*
62 : * Reinitializes parser to parse 'input', instead of previous input.
63 : */
64 : void
65 927 : reset_tsvector_parser(TSVectorParseState state, char *input)
66 : {
67 927 : state->prsbuf = input;
68 927 : }
69 :
70 : /*
71 : * Shuts down a tsvector parser.
72 : */
73 : void
74 1028 : close_tsvector_parser(TSVectorParseState state)
75 : {
76 1028 : pfree(state->word);
77 1028 : pfree(state);
78 1028 : }
79 :
/*
 * Ensure 'word' can take one more character: if fewer than state->eml + 1
 * bytes remain after the write position, double the buffer.  Expects the
 * locals 'state' and 'curpos' to be in scope; curpos is re-pointed into the
 * (possibly moved) buffer at the same offset.
 */
#define RESIZEPRSBUF \
do { \
	int used_bytes = curpos - state->word; \
	if ( state->len <= used_bytes + state->eml ) \
	{ \
		state->len += state->len; \
		state->word = (char *) repalloc(state->word, state->len); \
		curpos = state->word + used_bytes; \
	} \
} while (0)
91 :
/*
 * True if the character at x is a single-byte operator character:
 *		! & | ( ) <
 * ('<' is included because the phrase operator begins with it.)
 * Note that x is evaluated more than once.
 */
#define ISOPERATOR(x) \
	( pg_mblen(x) == 1 && \
	  ( *(x) == '<' || \
		*(x) == '(' || \
		*(x) == ')' || \
		*(x) == '|' || \
		*(x) == '&' || \
		*(x) == '!' ) )
101 :
/*
 * Fills gettoken_tsvector's output parameters, and returns true.
 *
 * Expects gettoken_tsvector's parameters and its locals 'state', 'curpos',
 * 'pos' and 'npos' to be in scope.  Every output pointer may be NULL:
 *	- if pos_ptr is NULL the collected position array (if any) is freed here,
 *	  otherwise ownership of the array passes to the caller;
 *	- poslen is checked on its own, so a caller that passes pos_ptr but not
 *	  poslen no longer causes a NULL dereference.
 * *strval points into state->word, the parser's scratch buffer.
 */
#define RETURN_TOKEN \
do { \
	if (pos_ptr != NULL) \
	{ \
		*pos_ptr = pos; \
		if (poslen != NULL) \
			*poslen = npos; \
	} \
	else if (pos != NULL) \
		pfree(pos); \
	\
	if (strval != NULL) \
		*strval = state->word; \
	if (lenval != NULL) \
		*lenval = curpos - state->word; \
	if (endptr != NULL) \
		*endptr = state->prsbuf; \
	return true; \
} while(0)
121 :
122 :
/*
 * State codes used in gettoken_tsvector.  Zero is deliberately unused: the
 * function initializes 'oldstate' to 0 and asserts it is nonzero before
 * returning to it.
 */
enum
{
	WAITWORD = 1,				/* skipping to the start of a token */
	WAITENDWORD = 2,			/* inside an unquoted word */
	WAITNEXTCHAR = 3,			/* just saw a backslash */
	WAITENDCMPLX = 4,			/* inside a quoted word */
	WAITPOSINFO = 5,			/* after a quoted word, looking for ':' */
	INPOSINFO = 6,				/* expecting a position number */
	WAITPOSDELIM = 7,			/* after a position number */
	WAITCHARCMPLX = 8			/* just saw a quote inside a quoted word */
};
132 :
133 : #define PRSSYNTAXERROR prssyntaxerror(state)
134 :
135 : static void
136 0 : prssyntaxerror(TSVectorParseState state)
137 : {
138 0 : ereport(ERROR,
139 : (errcode(ERRCODE_SYNTAX_ERROR),
140 : state->is_tsquery ?
141 : errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
142 : errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
143 : }
144 :
145 :
146 : /*
147 : * Get next token from string being parsed. Returns true if successful,
148 : * false if end of input string is reached. On success, these output
149 : * parameters are filled in:
150 : *
151 : * *strval pointer to token
152 : * *lenval length of *strval
153 : * *pos_ptr pointer to a palloc'd array of positions and weights
154 : * associated with the token. If the caller is not interested
155 : * in the information, NULL can be supplied. Otherwise
156 : * the caller is responsible for pfreeing the array.
157 : * *poslen number of elements in *pos_ptr
158 : * *endptr scan resumption point
159 : *
160 : * Pass NULL for unwanted output parameters.
161 : */
162 : bool
163 31669 : gettoken_tsvector(TSVectorParseState state,
164 : char **strval, int *lenval,
165 : WordEntryPos **pos_ptr, int *poslen,
166 : char **endptr)
167 : {
168 31669 : int oldstate = 0;
169 31669 : char *curpos = state->word;
170 31669 : int statecode = WAITWORD;
171 :
172 : /*
173 : * pos is for collecting the comma delimited list of positions followed by
174 : * the actual token.
175 : */
176 31669 : WordEntryPos *pos = NULL;
177 31669 : int npos = 0; /* elements of pos used */
178 31669 : int posalen = 0; /* allocated size of pos */
179 :
180 : while (1)
181 : {
182 125206 : if (statecode == WAITWORD)
183 : {
184 61238 : if (*(state->prsbuf) == '\0')
185 614 : return false;
186 60624 : else if (t_iseq(state->prsbuf, '\''))
187 25 : statecode = WAITENDCMPLX;
188 60599 : else if (t_iseq(state->prsbuf, '\\'))
189 : {
190 1 : statecode = WAITNEXTCHAR;
191 1 : oldstate = WAITENDWORD;
192 : }
193 60598 : else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
194 0 : PRSSYNTAXERROR;
195 60598 : else if (!t_isspace(state->prsbuf))
196 : {
197 31029 : COPYCHAR(curpos, state->prsbuf);
198 31029 : curpos += pg_mblen(state->prsbuf);
199 31029 : statecode = WAITENDWORD;
200 : }
201 : }
202 63968 : else if (statecode == WAITNEXTCHAR)
203 : {
204 27 : if (*(state->prsbuf) == '\0')
205 0 : ereport(ERROR,
206 : (errcode(ERRCODE_SYNTAX_ERROR),
207 : errmsg("there is no escaped character: \"%s\"",
208 : state->bufstart)));
209 : else
210 : {
211 27 : RESIZEPRSBUF;
212 27 : COPYCHAR(curpos, state->prsbuf);
213 27 : curpos += pg_mblen(state->prsbuf);
214 27 : Assert(oldstate != 0);
215 27 : statecode = oldstate;
216 : }
217 : }
218 63941 : else if (statecode == WAITENDWORD)
219 : {
220 62760 : if (t_iseq(state->prsbuf, '\\'))
221 : {
222 12 : statecode = WAITNEXTCHAR;
223 12 : oldstate = WAITENDWORD;
224 : }
225 94991 : else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
226 33725 : (state->oprisdelim && ISOPERATOR(state->prsbuf)))
227 : {
228 30736 : RESIZEPRSBUF;
229 30736 : if (curpos == state->word)
230 0 : PRSSYNTAXERROR;
231 30736 : *(curpos) = '\0';
232 30736 : RETURN_TOKEN;
233 : }
234 32012 : else if (t_iseq(state->prsbuf, ':'))
235 : {
236 294 : if (curpos == state->word)
237 0 : PRSSYNTAXERROR;
238 294 : *(curpos) = '\0';
239 294 : if (state->oprisdelim)
240 62 : RETURN_TOKEN;
241 : else
242 232 : statecode = INPOSINFO;
243 : }
244 : else
245 : {
246 31718 : RESIZEPRSBUF;
247 31718 : COPYCHAR(curpos, state->prsbuf);
248 31718 : curpos += pg_mblen(state->prsbuf);
249 : }
250 : }
251 1181 : else if (statecode == WAITENDCMPLX)
252 : {
253 162 : if (t_iseq(state->prsbuf, '\''))
254 : {
255 25 : statecode = WAITCHARCMPLX;
256 : }
257 137 : else if (t_iseq(state->prsbuf, '\\'))
258 : {
259 14 : statecode = WAITNEXTCHAR;
260 14 : oldstate = WAITENDCMPLX;
261 : }
262 123 : else if (*(state->prsbuf) == '\0')
263 0 : PRSSYNTAXERROR;
264 : else
265 : {
266 123 : RESIZEPRSBUF;
267 123 : COPYCHAR(curpos, state->prsbuf);
268 123 : curpos += pg_mblen(state->prsbuf);
269 : }
270 : }
271 1019 : else if (statecode == WAITCHARCMPLX)
272 : {
273 25 : if (t_iseq(state->prsbuf, '\''))
274 : {
275 0 : RESIZEPRSBUF;
276 0 : COPYCHAR(curpos, state->prsbuf);
277 0 : curpos += pg_mblen(state->prsbuf);
278 0 : statecode = WAITENDCMPLX;
279 : }
280 : else
281 : {
282 25 : RESIZEPRSBUF;
283 25 : *(curpos) = '\0';
284 25 : if (curpos == state->word)
285 0 : PRSSYNTAXERROR;
286 25 : if (state->oprisdelim)
287 : {
288 : /* state->prsbuf+=pg_mblen(state->prsbuf); */
289 12 : RETURN_TOKEN;
290 : }
291 : else
292 13 : statecode = WAITPOSINFO;
293 13 : continue; /* recheck current character */
294 : }
295 : }
296 994 : else if (statecode == WAITPOSINFO)
297 : {
298 13 : if (t_iseq(state->prsbuf, ':'))
299 0 : statecode = INPOSINFO;
300 : else
301 13 : RETURN_TOKEN;
302 : }
303 981 : else if (statecode == INPOSINFO)
304 : {
305 336 : if (t_isdigit(state->prsbuf))
306 : {
307 336 : if (posalen == 0)
308 : {
309 232 : posalen = 4;
310 232 : pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
311 232 : npos = 0;
312 : }
313 104 : else if (npos + 1 >= posalen)
314 : {
315 20 : posalen *= 2;
316 20 : pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
317 : }
318 336 : npos++;
319 336 : WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
320 : /* we cannot get here in tsquery, so no need for 2 errmsgs */
321 336 : if (WEP_GETPOS(pos[npos - 1]) == 0)
322 0 : ereport(ERROR,
323 : (errcode(ERRCODE_SYNTAX_ERROR),
324 : errmsg("wrong position info in tsvector: \"%s\"",
325 : state->bufstart)));
326 336 : WEP_SETWEIGHT(pos[npos - 1], 0);
327 336 : statecode = WAITPOSDELIM;
328 : }
329 : else
330 0 : PRSSYNTAXERROR;
331 : }
332 645 : else if (statecode == WAITPOSDELIM)
333 : {
334 645 : if (t_iseq(state->prsbuf, ','))
335 104 : statecode = INPOSINFO;
336 541 : else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
337 : {
338 60 : if (WEP_GETWEIGHT(pos[npos - 1]))
339 0 : PRSSYNTAXERROR;
340 60 : WEP_SETWEIGHT(pos[npos - 1], 3);
341 : }
342 481 : else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
343 : {
344 37 : if (WEP_GETWEIGHT(pos[npos - 1]))
345 0 : PRSSYNTAXERROR;
346 37 : WEP_SETWEIGHT(pos[npos - 1], 2);
347 : }
348 444 : else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
349 : {
350 46 : if (WEP_GETWEIGHT(pos[npos - 1]))
351 0 : PRSSYNTAXERROR;
352 46 : WEP_SETWEIGHT(pos[npos - 1], 1);
353 : }
354 398 : else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
355 : {
356 19 : if (WEP_GETWEIGHT(pos[npos - 1]))
357 0 : PRSSYNTAXERROR;
358 19 : WEP_SETWEIGHT(pos[npos - 1], 0);
359 : }
360 567 : else if (t_isspace(state->prsbuf) ||
361 188 : *(state->prsbuf) == '\0')
362 232 : RETURN_TOKEN;
363 147 : else if (!t_isdigit(state->prsbuf))
364 0 : PRSSYNTAXERROR;
365 : }
366 : else /* internal error */
367 0 : elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
368 : statecode);
369 :
370 : /* get next char */
371 93524 : state->prsbuf += pg_mblen(state->prsbuf);
372 93537 : }
373 : }
|