LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsquery_cleanup.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 115 161 71.4 %
Date: 2017-09-29 13:40:31 Functions: 7 9 77.8 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsquery_cleanup.c
       4             :  *   Cleanup query from NOT values and/or stopword
       5             :  *   Utility functions to correct work.
       6             :  *
       7             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/tsquery_cleanup.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "postgres.h"
      17             : 
      18             : #include "tsearch/ts_utils.h"
      19             : #include "miscadmin.h"
      20             : 
      21             : typedef struct NODE
      22             : {
      23             :     struct NODE *left;
      24             :     struct NODE *right;
      25             :     QueryItem  *valnode;
      26             : } NODE;
      27             : 
      28             : /*
      29             :  * make query tree from plain view of query
      30             :  */
      31             : static NODE *
      32         301 : maketree(QueryItem *in)
      33             : {
      34         301 :     NODE       *node = (NODE *) palloc(sizeof(NODE));
      35             : 
      36             :     /* since this function recurses, it could be driven to stack overflow. */
      37         301 :     check_stack_depth();
      38             : 
      39         301 :     node->valnode = in;
      40         301 :     node->right = node->left = NULL;
      41         301 :     if (in->type == QI_OPR)
      42             :     {
      43         129 :         node->right = maketree(in + 1);
      44         129 :         if (in->qoperator.oper != OP_NOT)
      45         124 :             node->left = maketree(in + in->qoperator.left);
      46             :     }
      47         301 :     return node;
      48             : }
      49             : 
      50             : /*
      51             :  * Internal state for plaintree and plainnode
      52             :  */
      53             : typedef struct
      54             : {
      55             :     QueryItem  *ptr;
      56             :     int         len;            /* allocated size of ptr */
      57             :     int         cur;            /* number of elements in ptr */
      58             : } PLAINTREE;
      59             : 
      60             : static void
      61         152 : plainnode(PLAINTREE *state, NODE *node)
      62             : {
      63             :     /* since this function recurses, it could be driven to stack overflow. */
      64         152 :     check_stack_depth();
      65             : 
      66         152 :     if (state->cur == state->len)
      67             :     {
      68           0 :         state->len *= 2;
      69           0 :         state->ptr = (QueryItem *) repalloc((void *) state->ptr, state->len * sizeof(QueryItem));
      70             :     }
      71         152 :     memcpy((void *) &(state->ptr[state->cur]), (void *) node->valnode, sizeof(QueryItem));
      72         152 :     if (node->valnode->type == QI_VAL)
      73          98 :         state->cur++;
      74          54 :     else if (node->valnode->qoperator.oper == OP_NOT)
      75             :     {
      76           4 :         state->ptr[state->cur].qoperator.left = 1;
      77           4 :         state->cur++;
      78           4 :         plainnode(state, node->right);
      79             :     }
      80             :     else
      81             :     {
      82          50 :         int         cur = state->cur;
      83             : 
      84          50 :         state->cur++;
      85          50 :         plainnode(state, node->right);
      86          50 :         state->ptr[cur].qoperator.left = state->cur - cur;
      87          50 :         plainnode(state, node->left);
      88             :     }
      89         152 :     pfree(node);
      90         152 : }
      91             : 
      92             : /*
      93             :  * make plain view of tree from a NODE-tree representation
      94             :  */
      95             : static QueryItem *
      96          48 : plaintree(NODE *root, int *len)
      97             : {
      98             :     PLAINTREE   pl;
      99             : 
     100          48 :     pl.cur = 0;
     101          48 :     pl.len = 16;
     102          48 :     if (root && (root->valnode->type == QI_VAL || root->valnode->type == QI_OPR))
     103             :     {
     104          48 :         pl.ptr = (QueryItem *) palloc(pl.len * sizeof(QueryItem));
     105          48 :         plainnode(&pl, root);
     106             :     }
     107             :     else
     108           0 :         pl.ptr = NULL;
     109          48 :     *len = pl.cur;
     110          48 :     return pl.ptr;
     111             : }
     112             : 
     113             : static void
     114           1 : freetree(NODE *node)
     115             : {
     116             :     /* since this function recurses, it could be driven to stack overflow. */
     117           1 :     check_stack_depth();
     118             : 
     119           1 :     if (!node)
     120           1 :         return;
     121           1 :     if (node->left)
     122           0 :         freetree(node->left);
     123           1 :     if (node->right)
     124           0 :         freetree(node->right);
     125           1 :     pfree(node);
     126             : }
     127             : 
     128             : /*
     129             :  * clean tree for ! operator.
     130             :  * It's useful for debug, but in
     131             :  * other case, such view is used with search in index.
     132             :  * Operator ! always return TRUE
     133             :  */
     134             : static NODE *
     135           0 : clean_NOT_intree(NODE *node)
     136             : {
     137             :     /* since this function recurses, it could be driven to stack overflow. */
     138           0 :     check_stack_depth();
     139             : 
     140           0 :     if (node->valnode->type == QI_VAL)
     141           0 :         return node;
     142             : 
     143           0 :     if (node->valnode->qoperator.oper == OP_NOT)
     144             :     {
     145           0 :         freetree(node);
     146           0 :         return NULL;
     147             :     }
     148             : 
     149             :     /* operator & or | */
     150           0 :     if (node->valnode->qoperator.oper == OP_OR)
     151             :     {
     152           0 :         if ((node->left = clean_NOT_intree(node->left)) == NULL ||
     153           0 :             (node->right = clean_NOT_intree(node->right)) == NULL)
     154             :         {
     155           0 :             freetree(node);
     156           0 :             return NULL;
     157             :         }
     158             :     }
     159             :     else
     160             :     {
     161           0 :         NODE       *res = node;
     162             : 
     163           0 :         Assert(node->valnode->qoperator.oper == OP_AND ||
     164             :                node->valnode->qoperator.oper == OP_PHRASE);
     165             : 
     166           0 :         node->left = clean_NOT_intree(node->left);
     167           0 :         node->right = clean_NOT_intree(node->right);
     168           0 :         if (node->left == NULL && node->right == NULL)
     169             :         {
     170           0 :             pfree(node);
     171           0 :             res = NULL;
     172             :         }
     173           0 :         else if (node->left == NULL)
     174             :         {
     175           0 :             res = node->right;
     176           0 :             pfree(node);
     177             :         }
     178           0 :         else if (node->right == NULL)
     179             :         {
     180           0 :             res = node->left;
     181           0 :             pfree(node);
     182             :         }
     183           0 :         return res;
     184             :     }
     185           0 :     return node;
     186             : }
     187             : 
     188             : QueryItem *
     189           0 : clean_NOT(QueryItem *ptr, int *len)
     190             : {
     191           0 :     NODE       *root = maketree(ptr);
     192             : 
     193           0 :     return plaintree(clean_NOT_intree(root), len);
     194             : }
     195             : 
     196             : 
     197             : /*
     198             :  * Remove QI_VALSTOP (stopword) nodes from query tree.
     199             :  *
     200             :  * Returns NULL if the query degenerates to nothing.  Input must not be NULL.
     201             :  *
     202             :  * When we remove a phrase operator due to removing one or both of its
     203             :  * arguments, we might need to adjust the distance of a parent phrase
     204             :  * operator.  For example, 'a' is a stopword, so:
     205             :  *      (b <-> a) <-> c  should become  b <2> c
     206             :  *      b <-> (a <-> c)  should become  b <2> c
     207             :  *      (b <-> (a <-> a)) <-> c  should become    b <3> c
     208             :  *      b <-> ((a <-> a) <-> c)  should become    b <3> c
     209             :  * To handle that, we define two output parameters:
     210             :  *      ladd: amount to add to a phrase distance to the left of this node
     211             :  *      radd: amount to add to a phrase distance to the right of this node
     212             :  * We need two outputs because we could need to bubble up adjustments to two
     213             :  * different parent phrase operators.  Consider
     214             :  *      w <-> (((a <-> x) <2> (y <3> a)) <-> z)
     215             :  * After we've removed the two a's and are considering the <2> node (which is
     216             :  * now just x <2> y), we have an ladd distance of 1 that needs to propagate
     217             :  * up to the topmost (leftmost) <->, and an radd distance of 3 that needs to
     218             :  * propagate to the rightmost <->, so that we'll end up with
     219             :  *      w <2> ((x <2> y) <4> z)
     220             :  * Near the bottom of the tree, we may have subtrees consisting only of
     221             :  * stopwords.  The distances of any phrase operators within such a subtree are
     222             :  * summed and propagated to both ladd and radd, since we don't know which side
     223             :  * of the lowest surviving phrase operator we are in.  The rule is that any
     224             :  * subtree that degenerates to NULL must return equal values of ladd and radd,
     225             :  * and the parent node dealing with it should incorporate only one of those.
     226             :  *
     227             :  * Currently, we only implement this adjustment for adjacent phrase operators.
     228             :  * Thus for example 'x <-> ((a <-> y) | z)' will become 'x <-> (y | z)', which
     229             :  * isn't ideal, but there is no way to represent the really desired semantics
     230             :  * without some redesign of the tsquery structure.  Certainly it would not be
     231             :  * any better to convert that to 'x <2> (y | z)'.  Since this is such a weird
     232             :  * corner case, let it go for now.  But we can fix it in cases where the
     233             :  * intervening non-phrase operator also gets removed, for example
     234             :  * '((x <-> a) | a) <-> y' will become 'x <2> y'.
     235             :  */
     236             : static NODE *
     237         301 : clean_stopword_intree(NODE *node, int *ladd, int *radd)
     238             : {
     239             :     /* since this function recurses, it could be driven to stack overflow. */
     240         301 :     check_stack_depth();
     241             : 
     242             :     /* default output parameters indicate no change in parent distance */
     243         301 :     *ladd = *radd = 0;
     244             : 
     245         301 :     if (node->valnode->type == QI_VAL)
     246          98 :         return node;
     247         203 :     else if (node->valnode->type == QI_VALSTOP)
     248             :     {
     249          74 :         pfree(node);
     250          74 :         return NULL;
     251             :     }
     252             : 
     253         129 :     Assert(node->valnode->type == QI_OPR);
     254             : 
     255         129 :     if (node->valnode->qoperator.oper == OP_NOT)
     256             :     {
     257             :         /* NOT doesn't change pattern width, so just report child distances */
     258           5 :         node->right = clean_stopword_intree(node->right, ladd, radd);
     259           5 :         if (!node->right)
     260             :         {
     261           1 :             freetree(node);
     262           1 :             return NULL;
     263             :         }
     264             :     }
     265             :     else
     266             :     {
     267         124 :         NODE       *res = node;
     268             :         bool        isphrase;
     269             :         int         ndistance,
     270             :                     lladd,
     271             :                     lradd,
     272             :                     rladd,
     273             :                     rradd;
     274             : 
     275             :         /* First, recurse */
     276         124 :         node->left = clean_stopword_intree(node->left, &lladd, &lradd);
     277         124 :         node->right = clean_stopword_intree(node->right, &rladd, &rradd);
     278             : 
     279             :         /* Check if current node is OP_PHRASE, get its distance */
     280         124 :         isphrase = (node->valnode->qoperator.oper == OP_PHRASE);
     281         124 :         ndistance = isphrase ? node->valnode->qoperator.distance : 0;
     282             : 
     283         124 :         if (node->left == NULL && node->right == NULL)
     284             :         {
     285             :             /*
     286             :              * When we collapse out a phrase node entirely, propagate its own
     287             :              * distance into both *ladd and *radd; it is the responsibility of
     288             :              * the parent node to count it only once.  Also, for a phrase
     289             :              * node, distances coming from children are summed and propagated
     290             :              * up to parent (we assume lladd == lradd and rladd == rradd, else
     291             :              * rule was broken at a lower level).  But if this isn't a phrase
     292             :              * node, take the larger of the two child distances; that
     293             :              * corresponds to what TS_execute will do in non-stopword cases.
     294             :              */
     295           0 :             if (isphrase)
     296           0 :                 *ladd = *radd = lladd + ndistance + rladd;
     297             :             else
     298           0 :                 *ladd = *radd = Max(lladd, rladd);
     299           0 :             freetree(node);
     300           0 :             return NULL;
     301             :         }
     302         124 :         else if (node->left == NULL)
     303             :         {
     304             :             /* Removing this operator and left subnode */
     305             :             /* lladd and lradd are equal/redundant, don't count both */
     306          34 :             if (isphrase)
     307             :             {
     308             :                 /* operator's own distance must propagate to left */
     309          27 :                 *ladd = lladd + ndistance + rladd;
     310          27 :                 *radd = rradd;
     311             :             }
     312             :             else
     313             :             {
     314             :                 /* at non-phrase op, just forget the left subnode entirely */
     315           7 :                 *ladd = rladd;
     316           7 :                 *radd = rradd;
     317             :             }
     318          34 :             res = node->right;
     319          34 :             pfree(node);
     320             :         }
     321          90 :         else if (node->right == NULL)
     322             :         {
     323             :             /* Removing this operator and right subnode */
     324             :             /* rladd and rradd are equal/redundant, don't count both */
     325          40 :             if (isphrase)
     326             :             {
     327             :                 /* operator's own distance must propagate to right */
     328          36 :                 *ladd = lladd;
     329          36 :                 *radd = lradd + ndistance + rradd;
     330             :             }
     331             :             else
     332             :             {
     333             :                 /* at non-phrase op, just forget the right subnode entirely */
     334           4 :                 *ladd = lladd;
     335           4 :                 *radd = lradd;
     336             :             }
     337          40 :             res = node->left;
     338          40 :             pfree(node);
     339             :         }
     340          50 :         else if (isphrase)
     341             :         {
     342             :             /* Absorb appropriate corrections at this level */
     343          43 :             node->valnode->qoperator.distance += lradd + rladd;
     344             :             /* Propagate up any unaccounted-for corrections */
     345          43 :             *ladd = lladd;
     346          43 :             *radd = rradd;
     347             :         }
     348             :         else
     349             :         {
     350             :             /* We're keeping a non-phrase operator, so ladd/radd remain 0 */
     351             :         }
     352             : 
     353         124 :         return res;
     354             :     }
     355           4 :     return node;
     356             : }
     357             : 
     358             : /*
     359             :  * Number of elements in query tree
     360             :  */
     361             : static int32
     362         152 : calcstrlen(NODE *node)
     363             : {
     364         152 :     int32       size = 0;
     365             : 
     366         152 :     if (node->valnode->type == QI_VAL)
     367             :     {
     368          98 :         size = node->valnode->qoperand.length + 1;
     369             :     }
     370             :     else
     371             :     {
     372          54 :         Assert(node->valnode->type == QI_OPR);
     373             : 
     374          54 :         size = calcstrlen(node->right);
     375          54 :         if (node->valnode->qoperator.oper != OP_NOT)
     376          50 :             size += calcstrlen(node->left);
     377             :     }
     378             : 
     379         152 :     return size;
     380             : }
     381             : 
     382             : /*
     383             :  * Remove QI_VALSTOP (stopword) nodes from TSQuery.
     384             :  */
     385             : TSQuery
     386          48 : cleanup_tsquery_stopwords(TSQuery in)
     387             : {
     388             :     int32       len,
     389             :                 lenstr,
     390             :                 commonlen,
     391             :                 i;
     392             :     NODE       *root;
     393             :     int         ladd,
     394             :                 radd;
     395             :     TSQuery     out;
     396             :     QueryItem  *items;
     397             :     char       *operands;
     398             : 
     399          48 :     if (in->size == 0)
     400           0 :         return in;
     401             : 
     402             :     /* eliminate stop words */
     403          48 :     root = clean_stopword_intree(maketree(GETQUERY(in)), &ladd, &radd);
     404          48 :     if (root == NULL)
     405             :     {
     406           0 :         ereport(NOTICE,
     407             :                 (errmsg("text-search query contains only stop words or doesn't contain lexemes, ignored")));
     408           0 :         out = palloc(HDRSIZETQ);
     409           0 :         out->size = 0;
     410           0 :         SET_VARSIZE(out, HDRSIZETQ);
     411           0 :         return out;
     412             :     }
     413             : 
     414             :     /*
     415             :      * Build TSQuery from plain view
     416             :      */
     417             : 
     418          48 :     lenstr = calcstrlen(root);
     419          48 :     items = plaintree(root, &len);
     420          48 :     commonlen = COMPUTESIZE(len, lenstr);
     421             : 
     422          48 :     out = palloc(commonlen);
     423          48 :     SET_VARSIZE(out, commonlen);
     424          48 :     out->size = len;
     425             : 
     426          48 :     memcpy(GETQUERY(out), items, len * sizeof(QueryItem));
     427             : 
     428          48 :     items = GETQUERY(out);
     429          48 :     operands = GETOPERAND(out);
     430         200 :     for (i = 0; i < out->size; i++)
     431             :     {
     432         152 :         QueryOperand *op = (QueryOperand *) &items[i];
     433             : 
     434         152 :         if (op->type != QI_VAL)
     435          54 :             continue;
     436             : 
     437          98 :         memcpy(operands, GETOPERAND(in) + op->distance, op->length);
     438          98 :         operands[op->length] = '\0';
     439          98 :         op->distance = operands - GETOPERAND(out);
     440          98 :         operands += op->length + 1;
     441             :     }
     442             : 
     443          48 :     return out;
     444             : }

Generated by: LCOV version 1.11