LCOV - code coverage report
Current view: top level - src/backend/utils/adt - tsvector_parser.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 120 141 85.1 %
Date: 2017-09-29 13:40:31 Functions: 4 5 80.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * tsvector_parser.c
       4             :  *    Parser for tsvector
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  *
       8             :  *
       9             :  * IDENTIFICATION
      10             :  *    src/backend/utils/adt/tsvector_parser.c
      11             :  *
      12             :  *-------------------------------------------------------------------------
      13             :  */
      14             : 
#include "postgres.h"

#include <limits.h>

#include "tsearch/ts_locale.h"
#include "tsearch/ts_utils.h"
      19             : 
      20             : 
      21             : /*
      22             :  * Private state of tsvector parser.  Note that tsquery also uses this code to
      23             :  * parse its input, hence the boolean flags.  The two flags are both true or
      24             :  * both false in current usage, but we keep them separate for clarity.
      25             :  * is_tsquery affects *only* the content of error messages.
      26             :  */
      27             : struct TSVectorParseStateData
      28             : {
      29             :     char       *prsbuf;         /* next input character */
      30             :     char       *bufstart;       /* whole string (used only for errors) */
      31             :     char       *word;           /* buffer to hold the current word */
      32             :     int         len;            /* size in bytes allocated for 'word' */
      33             :     int         eml;            /* max bytes per character */
      34             :     bool        oprisdelim;     /* treat ! | * ( ) as delimiters? */
      35             :     bool        is_tsquery;     /* say "tsquery" not "tsvector" in errors? */
      36             : };
      37             : 
      38             : 
      39             : /*
      40             :  * Initializes parser for the input string. If oprisdelim is set, the
      41             :  * following characters are treated as delimiters in addition to whitespace:
      42             :  * ! | & ( )
      43             :  */
      44             : TSVectorParseState
      45        1028 : init_tsvector_parser(char *input, bool oprisdelim, bool is_tsquery)
      46             : {
      47             :     TSVectorParseState state;
      48             : 
      49        1028 :     state = (TSVectorParseState) palloc(sizeof(struct TSVectorParseStateData));
      50        1028 :     state->prsbuf = input;
      51        1028 :     state->bufstart = input;
      52        1028 :     state->len = 32;
      53        1028 :     state->word = (char *) palloc(state->len);
      54        1028 :     state->eml = pg_database_encoding_max_length();
      55        1028 :     state->oprisdelim = oprisdelim;
      56        1028 :     state->is_tsquery = is_tsquery;
      57             : 
      58        1028 :     return state;
      59             : }
      60             : 
      61             : /*
      62             :  * Reinitializes parser to parse 'input', instead of previous input.
      63             :  */
      64             : void
      65         927 : reset_tsvector_parser(TSVectorParseState state, char *input)
      66             : {
      67         927 :     state->prsbuf = input;
      68         927 : }
      69             : 
      70             : /*
      71             :  * Shuts down a tsvector parser.
      72             :  */
      73             : void
      74        1028 : close_tsvector_parser(TSVectorParseState state)
      75             : {
      76        1028 :     pfree(state->word);
      77        1028 :     pfree(state);
      78        1028 : }
      79             : 
      80             : /* increase the size of 'word' if needed to hold one more character */
      81             : #define RESIZEPRSBUF \
      82             : do { \
      83             :     int clen = curpos - state->word; \
      84             :     if ( clen + state->eml >= state->len ) \
      85             :     { \
      86             :         state->len *= 2; \
      87             :         state->word = (char *) repalloc(state->word, state->len); \
      88             :         curpos = state->word + clen; \
      89             :     } \
      90             : } while (0)
      91             : 
      92             : /* phrase operator begins with '<' */
      93             : #define ISOPERATOR(x) \
      94             :     ( pg_mblen(x) == 1 && ( *(x) == '!' ||  \
      95             :                             *(x) == '&' ||  \
      96             :                             *(x) == '|' ||  \
      97             :                             *(x) == '(' ||  \
      98             :                             *(x) == ')' ||  \
      99             :                             *(x) == '<'      \
     100             :                           ) )
     101             : 
     102             : /* Fills gettoken_tsvector's output parameters, and returns true */
     103             : #define RETURN_TOKEN \
     104             : do { \
     105             :     if (pos_ptr != NULL) \
     106             :     { \
     107             :         *pos_ptr = pos; \
     108             :         *poslen = npos; \
     109             :     } \
     110             :     else if (pos != NULL) \
     111             :         pfree(pos); \
     112             :     \
     113             :     if (strval != NULL) \
     114             :         *strval = state->word; \
     115             :     if (lenval != NULL) \
     116             :         *lenval = curpos - state->word; \
     117             :     if (endptr != NULL) \
     118             :         *endptr = state->prsbuf; \
     119             :     return true; \
     120             : } while(0)
     121             : 
     122             : 
     123             : /* State codes used in gettoken_tsvector */
     124             : #define WAITWORD        1
     125             : #define WAITENDWORD     2
     126             : #define WAITNEXTCHAR    3
     127             : #define WAITENDCMPLX    4
     128             : #define WAITPOSINFO     5
     129             : #define INPOSINFO       6
     130             : #define WAITPOSDELIM    7
     131             : #define WAITCHARCMPLX   8
     132             : 
     133             : #define PRSSYNTAXERROR prssyntaxerror(state)
     134             : 
     135             : static void
     136           0 : prssyntaxerror(TSVectorParseState state)
     137             : {
     138           0 :     ereport(ERROR,
     139             :             (errcode(ERRCODE_SYNTAX_ERROR),
     140             :              state->is_tsquery ?
     141             :              errmsg("syntax error in tsquery: \"%s\"", state->bufstart) :
     142             :              errmsg("syntax error in tsvector: \"%s\"", state->bufstart)));
     143             : }
     144             : 
     145             : 
     146             : /*
     147             :  * Get next token from string being parsed. Returns true if successful,
     148             :  * false if end of input string is reached.  On success, these output
     149             :  * parameters are filled in:
     150             :  *
     151             :  * *strval      pointer to token
     152             :  * *lenval      length of *strval
     153             :  * *pos_ptr     pointer to a palloc'd array of positions and weights
     154             :  *              associated with the token. If the caller is not interested
     155             :  *              in the information, NULL can be supplied. Otherwise
     156             :  *              the caller is responsible for pfreeing the array.
     157             :  * *poslen      number of elements in *pos_ptr
     158             :  * *endptr      scan resumption point
     159             :  *
     160             :  * Pass NULL for unwanted output parameters.
     161             :  */
     162             : bool
     163       31669 : gettoken_tsvector(TSVectorParseState state,
     164             :                   char **strval, int *lenval,
     165             :                   WordEntryPos **pos_ptr, int *poslen,
     166             :                   char **endptr)
     167             : {
     168       31669 :     int         oldstate = 0;
     169       31669 :     char       *curpos = state->word;
     170       31669 :     int         statecode = WAITWORD;
     171             : 
     172             :     /*
     173             :      * pos is for collecting the comma delimited list of positions followed by
     174             :      * the actual token.
     175             :      */
     176       31669 :     WordEntryPos *pos = NULL;
     177       31669 :     int         npos = 0;       /* elements of pos used */
     178       31669 :     int         posalen = 0;    /* allocated size of pos */
     179             : 
     180             :     while (1)
     181             :     {
     182      125206 :         if (statecode == WAITWORD)
     183             :         {
     184       61238 :             if (*(state->prsbuf) == '\0')
     185         614 :                 return false;
     186       60624 :             else if (t_iseq(state->prsbuf, '\''))
     187          25 :                 statecode = WAITENDCMPLX;
     188       60599 :             else if (t_iseq(state->prsbuf, '\\'))
     189             :             {
     190           1 :                 statecode = WAITNEXTCHAR;
     191           1 :                 oldstate = WAITENDWORD;
     192             :             }
     193       60598 :             else if (state->oprisdelim && ISOPERATOR(state->prsbuf))
     194           0 :                 PRSSYNTAXERROR;
     195       60598 :             else if (!t_isspace(state->prsbuf))
     196             :             {
     197       31029 :                 COPYCHAR(curpos, state->prsbuf);
     198       31029 :                 curpos += pg_mblen(state->prsbuf);
     199       31029 :                 statecode = WAITENDWORD;
     200             :             }
     201             :         }
     202       63968 :         else if (statecode == WAITNEXTCHAR)
     203             :         {
     204          27 :             if (*(state->prsbuf) == '\0')
     205           0 :                 ereport(ERROR,
     206             :                         (errcode(ERRCODE_SYNTAX_ERROR),
     207             :                          errmsg("there is no escaped character: \"%s\"",
     208             :                                 state->bufstart)));
     209             :             else
     210             :             {
     211          27 :                 RESIZEPRSBUF;
     212          27 :                 COPYCHAR(curpos, state->prsbuf);
     213          27 :                 curpos += pg_mblen(state->prsbuf);
     214          27 :                 Assert(oldstate != 0);
     215          27 :                 statecode = oldstate;
     216             :             }
     217             :         }
     218       63941 :         else if (statecode == WAITENDWORD)
     219             :         {
     220       62760 :             if (t_iseq(state->prsbuf, '\\'))
     221             :             {
     222          12 :                 statecode = WAITNEXTCHAR;
     223          12 :                 oldstate = WAITENDWORD;
     224             :             }
     225       94991 :             else if (t_isspace(state->prsbuf) || *(state->prsbuf) == '\0' ||
     226       33725 :                      (state->oprisdelim && ISOPERATOR(state->prsbuf)))
     227             :             {
     228       30736 :                 RESIZEPRSBUF;
     229       30736 :                 if (curpos == state->word)
     230           0 :                     PRSSYNTAXERROR;
     231       30736 :                 *(curpos) = '\0';
     232       30736 :                 RETURN_TOKEN;
     233             :             }
     234       32012 :             else if (t_iseq(state->prsbuf, ':'))
     235             :             {
     236         294 :                 if (curpos == state->word)
     237           0 :                     PRSSYNTAXERROR;
     238         294 :                 *(curpos) = '\0';
     239         294 :                 if (state->oprisdelim)
     240          62 :                     RETURN_TOKEN;
     241             :                 else
     242         232 :                     statecode = INPOSINFO;
     243             :             }
     244             :             else
     245             :             {
     246       31718 :                 RESIZEPRSBUF;
     247       31718 :                 COPYCHAR(curpos, state->prsbuf);
     248       31718 :                 curpos += pg_mblen(state->prsbuf);
     249             :             }
     250             :         }
     251        1181 :         else if (statecode == WAITENDCMPLX)
     252             :         {
     253         162 :             if (t_iseq(state->prsbuf, '\''))
     254             :             {
     255          25 :                 statecode = WAITCHARCMPLX;
     256             :             }
     257         137 :             else if (t_iseq(state->prsbuf, '\\'))
     258             :             {
     259          14 :                 statecode = WAITNEXTCHAR;
     260          14 :                 oldstate = WAITENDCMPLX;
     261             :             }
     262         123 :             else if (*(state->prsbuf) == '\0')
     263           0 :                 PRSSYNTAXERROR;
     264             :             else
     265             :             {
     266         123 :                 RESIZEPRSBUF;
     267         123 :                 COPYCHAR(curpos, state->prsbuf);
     268         123 :                 curpos += pg_mblen(state->prsbuf);
     269             :             }
     270             :         }
     271        1019 :         else if (statecode == WAITCHARCMPLX)
     272             :         {
     273          25 :             if (t_iseq(state->prsbuf, '\''))
     274             :             {
     275           0 :                 RESIZEPRSBUF;
     276           0 :                 COPYCHAR(curpos, state->prsbuf);
     277           0 :                 curpos += pg_mblen(state->prsbuf);
     278           0 :                 statecode = WAITENDCMPLX;
     279             :             }
     280             :             else
     281             :             {
     282          25 :                 RESIZEPRSBUF;
     283          25 :                 *(curpos) = '\0';
     284          25 :                 if (curpos == state->word)
     285           0 :                     PRSSYNTAXERROR;
     286          25 :                 if (state->oprisdelim)
     287             :                 {
     288             :                     /* state->prsbuf+=pg_mblen(state->prsbuf); */
     289          12 :                     RETURN_TOKEN;
     290             :                 }
     291             :                 else
     292          13 :                     statecode = WAITPOSINFO;
     293          13 :                 continue;       /* recheck current character */
     294             :             }
     295             :         }
     296         994 :         else if (statecode == WAITPOSINFO)
     297             :         {
     298          13 :             if (t_iseq(state->prsbuf, ':'))
     299           0 :                 statecode = INPOSINFO;
     300             :             else
     301          13 :                 RETURN_TOKEN;
     302             :         }
     303         981 :         else if (statecode == INPOSINFO)
     304             :         {
     305         336 :             if (t_isdigit(state->prsbuf))
     306             :             {
     307         336 :                 if (posalen == 0)
     308             :                 {
     309         232 :                     posalen = 4;
     310         232 :                     pos = (WordEntryPos *) palloc(sizeof(WordEntryPos) * posalen);
     311         232 :                     npos = 0;
     312             :                 }
     313         104 :                 else if (npos + 1 >= posalen)
     314             :                 {
     315          20 :                     posalen *= 2;
     316          20 :                     pos = (WordEntryPos *) repalloc(pos, sizeof(WordEntryPos) * posalen);
     317             :                 }
     318         336 :                 npos++;
     319         336 :                 WEP_SETPOS(pos[npos - 1], LIMITPOS(atoi(state->prsbuf)));
     320             :                 /* we cannot get here in tsquery, so no need for 2 errmsgs */
     321         336 :                 if (WEP_GETPOS(pos[npos - 1]) == 0)
     322           0 :                     ereport(ERROR,
     323             :                             (errcode(ERRCODE_SYNTAX_ERROR),
     324             :                              errmsg("wrong position info in tsvector: \"%s\"",
     325             :                                     state->bufstart)));
     326         336 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     327         336 :                 statecode = WAITPOSDELIM;
     328             :             }
     329             :             else
     330           0 :                 PRSSYNTAXERROR;
     331             :         }
     332         645 :         else if (statecode == WAITPOSDELIM)
     333             :         {
     334         645 :             if (t_iseq(state->prsbuf, ','))
     335         104 :                 statecode = INPOSINFO;
     336         541 :             else if (t_iseq(state->prsbuf, 'a') || t_iseq(state->prsbuf, 'A') || t_iseq(state->prsbuf, '*'))
     337             :             {
     338          60 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     339           0 :                     PRSSYNTAXERROR;
     340          60 :                 WEP_SETWEIGHT(pos[npos - 1], 3);
     341             :             }
     342         481 :             else if (t_iseq(state->prsbuf, 'b') || t_iseq(state->prsbuf, 'B'))
     343             :             {
     344          37 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     345           0 :                     PRSSYNTAXERROR;
     346          37 :                 WEP_SETWEIGHT(pos[npos - 1], 2);
     347             :             }
     348         444 :             else if (t_iseq(state->prsbuf, 'c') || t_iseq(state->prsbuf, 'C'))
     349             :             {
     350          46 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     351           0 :                     PRSSYNTAXERROR;
     352          46 :                 WEP_SETWEIGHT(pos[npos - 1], 1);
     353             :             }
     354         398 :             else if (t_iseq(state->prsbuf, 'd') || t_iseq(state->prsbuf, 'D'))
     355             :             {
     356          19 :                 if (WEP_GETWEIGHT(pos[npos - 1]))
     357           0 :                     PRSSYNTAXERROR;
     358          19 :                 WEP_SETWEIGHT(pos[npos - 1], 0);
     359             :             }
     360         567 :             else if (t_isspace(state->prsbuf) ||
     361         188 :                      *(state->prsbuf) == '\0')
     362         232 :                 RETURN_TOKEN;
     363         147 :             else if (!t_isdigit(state->prsbuf))
     364           0 :                 PRSSYNTAXERROR;
     365             :         }
     366             :         else                    /* internal error */
     367           0 :             elog(ERROR, "unrecognized state in gettoken_tsvector: %d",
     368             :                  statecode);
     369             : 
     370             :         /* get next char */
     371       93524 :         state->prsbuf += pg_mblen(state->prsbuf);
     372       93537 :     }
     373             : }

Generated by: LCOV version 1.11