Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * tsquery_cleanup.c
4 : * Cleanup query from NOT values and/or stopword
5 : * Utility functions to correct work.
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/tsquery_cleanup.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "postgres.h"
17 :
18 : #include "tsearch/ts_utils.h"
19 : #include "miscadmin.h"
20 :
21 : typedef struct NODE
22 : {
23 : struct NODE *left;
24 : struct NODE *right;
25 : QueryItem *valnode;
26 : } NODE;
27 :
28 : /*
29 : * make query tree from plain view of query
30 : */
31 : static NODE *
32 301 : maketree(QueryItem *in)
33 : {
34 301 : NODE *node = (NODE *) palloc(sizeof(NODE));
35 :
36 : /* since this function recurses, it could be driven to stack overflow. */
37 301 : check_stack_depth();
38 :
39 301 : node->valnode = in;
40 301 : node->right = node->left = NULL;
41 301 : if (in->type == QI_OPR)
42 : {
43 129 : node->right = maketree(in + 1);
44 129 : if (in->qoperator.oper != OP_NOT)
45 124 : node->left = maketree(in + in->qoperator.left);
46 : }
47 301 : return node;
48 : }
49 :
50 : /*
51 : * Internal state for plaintree and plainnode
52 : */
53 : typedef struct
54 : {
55 : QueryItem *ptr;
56 : int len; /* allocated size of ptr */
57 : int cur; /* number of elements in ptr */
58 : } PLAINTREE;
59 :
60 : static void
61 152 : plainnode(PLAINTREE *state, NODE *node)
62 : {
63 : /* since this function recurses, it could be driven to stack overflow. */
64 152 : check_stack_depth();
65 :
66 152 : if (state->cur == state->len)
67 : {
68 0 : state->len *= 2;
69 0 : state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
70 : }
71 152 : memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
72 152 : if (node->valnode->type == QI_VAL)
73 98 : state->cur++;
74 54 : else if (node->valnode->qoperator.oper == OP_NOT)
75 : {
76 4 : state->ptr[state->cur].qoperator.left = 1;
77 4 : state->cur++;
78 4 : plainnode(state, node->right);
79 : }
80 : else
81 : {
82 50 : int cur = state->cur;
83 :
84 50 : state->cur++;
85 50 : plainnode(state, node->right);
86 50 : state->ptr[cur].qoperator.left = state->cur - cur;
87 50 : plainnode(state, node->left);
88 : }
89 152 : pfree(node);
90 152 : }
91 :
92 : /*
93 : * make plain view of tree from a NODE-tree representation
94 : */
95 : static QueryItem *
96 48 : plaintree(NODE *root, int *len)
97 : {
98 : PLAINTREE pl;
99 :
100 48 : pl.cur = 0;
101 48 : pl.len = 16;
102 48 : if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
103 : {
104 48 : pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
105 48 : plainnode(&pl, root);
106 : }
107 : else
108 0 : pl.ptr = NULL;
109 48 : *len = pl.cur;
110 48 : return pl.ptr;
111 : }
112 :
113 : static void
114 1 : freetree(NODE *node)
115 : {
116 : /* since this function recurses, it could be driven to stack overflow. */
117 1 : check_stack_depth();
118 :
119 1 : if (!node)
120 1 : return;
121 1 : if (node->left)
122 0 : freetree(node->left);
123 1 : if (node->right)
124 0 : freetree(node->right);
125 1 : pfree(node);
126 : }
127 :
128 : /*
129 : * clean tree for ! operator.
130 : * It's useful for debug, but in
131 : * other case, such view is used with search in index.
132 : * Operator ! always return TRUE
133 : */
134 : static NODE *
135 0 : clean_NOT_intree(NODE *node)
136 : {
137 : /* since this function recurses, it could be driven to stack overflow. */
138 0 : check_stack_depth();
139 :
140 0 : if (node->valnode->type == QI_VAL)
141 0 : return node;
142 :
143 0 : if (node->valnode->qoperator.oper == OP_NOT)
144 : {
145 0 : freetree(node);
146 0 : return NULL;
147 : }
148 :
149 : /* operator & or | */
150 0 : if (node->valnode->qoperator.oper == OP_OR)
151 : {
152 0 : if ((node->left = clean_NOT_intree(node->left)) == NULL ||
153 0 : (node->right = clean_NOT_intree(node->right)) == NULL)
154 : {
155 0 : freetree(node);
156 0 : return NULL;
157 : }
158 : }
159 : else
160 : {
161 0 : NODE *res = node;
162 :
163 0 : Assert(node->valnode->qoperator.oper == OP_AND ||
164 : node->valnode->qoperator.oper == OP_PHRASE);
165 :
166 0 : node->left = clean_NOT_intree(node->left);
167 0 : node->right = clean_NOT_intree(node->right);
168 0 : if (node->left == NULL && node->right == NULL)
169 : {
170 0 : pfree(node);
171 0 : res = NULL;
172 : }
173 0 : else if (node->left == NULL)
174 : {
175 0 : res = node->right;
176 0 : pfree(node);
177 : }
178 0 : else if (node->right == NULL)
179 : {
180 0 : res = node->left;
181 0 : pfree(node);
182 : }
183 0 : return res;
184 : }
185 0 : return node;
186 : }
187 :
188 : QueryItem *
189 0 : clean_NOT(QueryItem *ptr, int *len)
190 : {
191 0 : NODE *root = maketree(ptr);
192 :
193 0 : return plaintree(clean_NOT_intree(root), len);
194 : }
195 :
196 :
197 : /*
198 : * Remove QI_VALSTOP (stopword) nodes from query tree.
199 : *
200 : * Returns NULL if the query degenerates to nothing. Input must not be NULL.
201 : *
202 : * When we remove a phrase operator due to removing one or both of its
203 : * arguments, we might need to adjust the distance of a parent phrase
204 : * operator. For example, 'a' is a stopword, so:
205 : * (b <-> a) <-> c should become b <2> c
206 : * b <-> (a <-> c) should become b <2> c
207 : * (b <-> (a <-> a)) <-> c should become b <3> c
208 : * b <-> ((a <-> a) <-> c) should become b <3> c
209 : * To handle that, we define two output parameters:
210 : * ladd: amount to add to a phrase distance to the left of this node
211 : * radd: amount to add to a phrase distance to the right of this node
212 : * We need two outputs because we could need to bubble up adjustments to two
213 : * different parent phrase operators. Consider
214 : * w <-> (((a <-> x) <2> (y <3> a)) <-> z)
215 : * After we've removed the two a's and are considering the <2> node (which is
216 : * now just x <2> y), we have an ladd distance of 1 that needs to propagate
217 : * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
218 : * propagate to the rightmost <->, so that we'll end up with
219 : * w <2> ((x <2> y) <4> z)
220 : * Near the bottom of the tree, we may have subtrees consisting only of
221 : * stopwords. The distances of any phrase operators within such a subtree are
222 : * summed and propagated to both ladd and radd, since we don't know which side
223 : * of the lowest surviving phrase operator we are in. The rule is that any
224 : * subtree that degenerates to NULL must return equal values of ladd and radd,
225 : * and the parent node dealing with it should incorporate only one of those.
226 : *
227 : * Currently, we only implement this adjustment for adjacent phrase operators.
228 : * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
229 : * isn't ideal, but there is no way to represent the really desired semantics
230 : * without some redesign of the tsquery structure. Certainly it would not be
231 : * any better to convert that to 'x <2> (y | z)'. Since this is such a weird
232 : * corner case, let it go for now. But we can fix it in cases where the
233 : * intervening non-phrase operator also gets removed, for example
234 : * '((x <-> a) | a) <-> y' will become 'x <2> y'.
235 : */
236 : static NODE *
237 301 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
238 : {
239 : /* since this function recurses, it could be driven to stack overflow. */
240 301 : check_stack_depth();
241 :
242 : /* default output parameters indicate no change in parent distance */
243 301 : *ladd = *radd = 0;
244 :
245 301 : if (node->valnode->type == QI_VAL)
246 98 : return node;
247 203 : else if (node->valnode->type == QI_VALSTOP)
248 : {
249 74 : pfree(node);
250 74 : return NULL;
251 : }
252 :
253 129 : Assert(node->valnode->type == QI_OPR);
254 :
255 129 : if (node->valnode->qoperator.oper == OP_NOT)
256 : {
257 : /* NOT doesn't change pattern width, so just report child distances */
258 5 : node->right = clean_stopword_intree(node->right, ladd, radd);
259 5 : if (!node->right)
260 : {
261 1 : freetree(node);
262 1 : return NULL;
263 : }
264 : }
265 : else
266 : {
267 124 : NODE *res = node;
268 : bool isphrase;
269 : int ndistance,
270 : lladd,
271 : lradd,
272 : rladd,
273 : rradd;
274 :
275 : /* First, recurse */
276 124 : node->left = clean_stopword_intree(node->left, &lladd, &lradd);
277 124 : node->right = clean_stopword_intree(node->right, &rladd, &rradd);
278 :
279 : /* Check if current node is OP_PHRASE, get its distance */
280 124 : isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
281 124 : ndistance = isphrase ? node->valnode->qoperator.distance : 0;
282 :
283 124 : if (node->left == NULL && node->right == NULL)
284 : {
285 : /*
286 : * When we collapse out a phrase node entirely, propagate its own
287 : * distance into both *ladd and *radd; it is the responsibility of
288 : * the parent node to count it only once. Also, for a phrase
289 : * node, distances coming from children are summed and propagated
290 : * up to parent (we assume lladd == lradd and rladd == rradd, else
291 : * rule was broken at a lower level). But if this isn't a phrase
292 : * node, take the larger of the two child distances; that
293 : * corresponds to what TS_execute will do in non-stopword cases.
294 : */
295 0 : if (isphrase)
296 0 : *ladd = *radd = lladd + ndistance + rladd;
297 : else
298 0 : *ladd = *radd = Max(lladd, rladd);
299 0 : freetree(node);
300 0 : return NULL;
301 : }
302 124 : else if (node->left == NULL)
303 : {
304 : /* Removing this operator and left subnode */
305 : /* lladd and lradd are equal/redundant, don't count both */
306 34 : if (isphrase)
307 : {
308 : /* operator's own distance must propagate to left */
309 27 : *ladd = lladd + ndistance + rladd;
310 27 : *radd = rradd;
311 : }
312 : else
313 : {
314 : /* at non-phrase op, just forget the left subnode entirely */
315 7 : *ladd = rladd;
316 7 : *radd = rradd;
317 : }
318 34 : res = node->right;
319 34 : pfree(node);
320 : }
321 90 : else if (node->right == NULL)
322 : {
323 : /* Removing this operator and right subnode */
324 : /* rladd and rradd are equal/redundant, don't count both */
325 40 : if (isphrase)
326 : {
327 : /* operator's own distance must propagate to right */
328 36 : *ladd = lladd;
329 36 : *radd = lradd + ndistance + rradd;
330 : }
331 : else
332 : {
333 : /* at non-phrase op, just forget the right subnode entirely */
334 4 : *ladd = lladd;
335 4 : *radd = lradd;
336 : }
337 40 : res = node->left;
338 40 : pfree(node);
339 : }
340 50 : else if (isphrase)
341 : {
342 : /* Absorb appropriate corrections at this level */
343 43 : node->valnode->qoperator.distance += lradd + rladd;
344 : /* Propagate up any unaccounted-for corrections */
345 43 : *ladd = lladd;
346 43 : *radd = rradd;
347 : }
348 : else
349 : {
350 : /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
351 : }
352 :
353 124 : return res;
354 : }
355 4 : return node;
356 : }
357 :
358 : /*
359 : * Number of elements in query tree
360 : */
361 : static int32
362 152 : calcstrlen(NODE *node)
363 : {
364 152 : int32 size = 0;
365 :
366 152 : if (node->valnode->type == QI_VAL)
367 : {
368 98 : size = node->valnode->qoperand.length + 1;
369 : }
370 : else
371 : {
372 54 : Assert(node->valnode->type == QI_OPR);
373 :
374 54 : size = calcstrlen(node->right);
375 54 : if (node->valnode->qoperator.oper != OP_NOT)
376 50 : size += calcstrlen(node->left);
377 : }
378 :
379 152 : return size;
380 : }
381 :
382 : /*
383 : * Remove QI_VALSTOP (stopword) nodes from TSQuery.
384 : */
385 : TSQuery
386 48 : cleanup_tsquery_stopwords(TSQuery in)
387 : {
388 : int32 len,
389 : lenstr,
390 : commonlen,
391 : i;
392 : NODE *root;
393 : int ladd,
394 : radd;
395 : TSQuery out;
396 : QueryItem *items;
397 : char *operands;
398 :
399 48 : if (in->size == 0)
400 0 : return in;
401 :
402 : /* eliminate stop words */
403 48 : root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
404 48 : if (root == NULL)
405 : {
406 0 : ereport(NOTICE,
407 : (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
408 0 : out = palloc(HDRSIZETQ);
409 0 : out->size = 0;
410 0 : SET_VARSIZE(out, HDRSIZETQ);
411 0 : return out;
412 : }
413 :
414 : /*
415 : * Build TSQuery from plain view
416 : */
417 :
418 48 : lenstr = calcstrlen(root);
419 48 : items = plaintree(root, &len);
420 48 : commonlen = COMPUTESIZE(len, lenstr);
421 :
422 48 : out = palloc(commonlen);
423 48 : SET_VARSIZE(out, commonlen);
424 48 : out->size = len;
425 :
426 48 : memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
427 :
428 48 : items = GETQUERY(out);
429 48 : operands = GETOPERAND(out);
430 200 : for (i = 0; i < out->size; i++)
431 : {
432 152 : QueryOperand *op = (QueryOperand *) &items[i];
433 :
434 152 : if (op->type != QI_VAL)
435 54 : continue;
436 :
437 98 : memcpy(operands, GETOPERAND(in) + op->distance, op->length);
438 98 : operands[op->length] = '\0';
439 98 : op->distance = operands - GETOPERAND(out);
440 98 : operands += op->length + 1;
441 : }
442 :
443 48 : return out;
444 : }
|