LCOV - code coverage report
Current view: top level - src/backend/regex - regprefix.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 68 80 85.0 %
Date: 2017-09-29 13:40:31 Functions: 2 2 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regprefix.c
       4             :  *    Extract a common prefix, if any, from a compiled regex.
       5             :  *
       6             :  *
       7             :  * Portions Copyright (c) 2012-2017, PostgreSQL Global Development Group
       8             :  * Portions Copyright (c) 1998, 1999 Henry Spencer
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/regex/regprefix.c
      12             :  *
      13             :  *-------------------------------------------------------------------------
      14             :  */
      15             : 
      16             : #include "regex/regguts.h"
      17             : 
      18             : 
      19             : /*
      20             :  * forward declarations
      21             :  */
      22             : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
      23             :            chr *string, size_t *slength);
      24             : 
      25             : 
      26             : /*
      27             :  * pg_regprefix - get common prefix for regular expression
      28             :  *
      29             :  * Returns one of:
      30             :  *  REG_NOMATCH: there is no common prefix of strings matching the regex
      31             :  *  REG_PREFIX: there is a common prefix of strings matching the regex
      32             :  *  REG_EXACT: all strings satisfying the regex must match the same string
      33             :  *  or a REG_XXX error code
      34             :  *
      35             :  * In the non-failure cases, *string is set to a malloc'd string containing
      36             :  * the common prefix or exact value, of length *slength (measured in chrs
      37             :  * not bytes!).
      38             :  *
      39             :  * This function does not analyze all complex cases (such as lookaround
      40             :  * constraints) exactly.  Therefore it is possible that some strings matching
      41             :  * the reported prefix or exact-match string do not satisfy the regex.  But
      42             :  * it should never be the case that a string satisfying the regex does not
      43             :  * match the reported prefix or exact-match string.
      44             :  */
      45             : int
      46        1256 : pg_regprefix(regex_t *re,
      47             :              chr **string,
      48             :              size_t *slength)
      49             : {
      50             :     struct guts *g;
      51             :     struct cnfa *cnfa;
      52             :     int         st;
      53             : 
      54             :     /* sanity checks */
      55        1256 :     if (string == NULL || slength == NULL)
      56           0 :         return REG_INVARG;
      57        1256 :     *string = NULL;             /* initialize for failure cases */
      58        1256 :     *slength = 0;
      59        1256 :     if (re == NULL || re->re_magic != REMAGIC)
      60           0 :         return REG_INVARG;
      61        1256 :     if (re->re_csize != sizeof(chr))
      62           0 :         return REG_MIXED;
      63             : 
      64             :     /* Initialize locale-dependent support */
      65        1256 :     pg_set_regex_collation(re->re_collation);
      66             : 
      67             :     /* setup */
      68        1256 :     g = (struct guts *) re->re_guts;
      69        1256 :     if (g->info & REG_UIMPOSSIBLE)
      70           0 :         return REG_NOMATCH;
      71             : 
      72             :     /*
      73             :      * This implementation considers only the search NFA for the topmost regex
      74             :      * tree node.  Therefore, constraints such as backrefs are not fully
      75             :      * applied, which is allowed per the function's API spec.
      76             :      */
      77        1256 :     assert(g->tree != NULL);
      78        1256 :     cnfa = &g->tree->cnfa;
      79             : 
      80             :     /*
      81             :      * Since a correct NFA should never contain any exit-free loops, it should
      82             :      * not be possible for our traversal to return to a previously visited NFA
      83             :      * state.  Hence we need at most nstates chrs in the output string.
      84             :      */
      85        1256 :     *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
      86        1256 :     if (*string == NULL)
      87           0 :         return REG_ESPACE;
      88             : 
      89             :     /* do it */
      90        1256 :     st = findprefix(cnfa, &g->cmap, *string, slength);
      91             : 
      92        1256 :     assert(*slength <= cnfa->nstates);
      93             : 
      94             :     /* clean up */
      95        1256 :     if (st != REG_PREFIX && st != REG_EXACT)
      96             :     {
      97          46 :         FREE(*string);
      98          46 :         *string = NULL;
      99          46 :         *slength = 0;
     100             :     }
     101             : 
     102        1256 :     return st;
     103             : }
     104             : 
     105             : /*
     106             :  * findprefix - extract common prefix from cNFA
     107             :  *
     108             :  * Results are returned into the preallocated chr array string[], with
     109             :  * *slength (which must be preset to zero) incremented for each chr.
     110             :  */
     111             : static int                      /* regprefix return code */
     112        1256 : findprefix(struct cnfa *cnfa,
     113             :            struct colormap *cm,
     114             :            chr *string,
     115             :            size_t *slength)
     116             : {
     117             :     int         st;
     118             :     int         nextst;
     119             :     color       thiscolor;
     120             :     chr         c;
     121             :     struct carc *ca;
     122             : 
     123             :     /*
     124             :      * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
     125             :      * anchored left.  If we have both BOS and BOL, they must go to the same
     126             :      * next state.
     127             :      */
     128        1256 :     st = cnfa->pre;
     129        1256 :     nextst = -1;
     130        2471 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     131             :     {
     132        1258 :         if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     133             :         {
     134        2432 :             if (nextst == -1)
     135        1215 :                 nextst = ca->to;
     136           2 :             else if (nextst != ca->to)
     137           2 :                 return REG_NOMATCH;
     138             :         }
     139             :         else
     140          41 :             return REG_NOMATCH;
     141             :     }
     142        1213 :     if (nextst == -1)
     143           0 :         return REG_NOMATCH;
     144             : 
     145             :     /*
     146             :      * Scan through successive states, stopping as soon as we find one with
     147             :      * more than one acceptable transition character (either multiple colors
     148             :      * on out-arcs, or a color with more than one member chr).
     149             :      *
     150             :      * We could find a state with multiple out-arcs that are all labeled with
     151             :      * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
     152             :      * In that case we add the chr "c" to the output string but then exit the
     153             :      * loop with nextst == -1.  This leaves a little bit on the table: if the
     154             :      * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
     155             :      * to the prefix.  But chasing multiple parallel state chains doesn't seem
     156             :      * worth the trouble.
     157             :      */
     158             :     do
     159             :     {
     160       13865 :         st = nextst;
     161       13865 :         nextst = -1;
     162       13865 :         thiscolor = COLORLESS;
     163       26622 :         for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     164             :         {
     165             :             /* We can ignore BOS/BOL arcs */
     166       13955 :             if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
     167           0 :                 continue;
     168             :             /* ... but EOS/EOL arcs terminate the search, as do LACONs */
     169       26798 :             if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
     170       12843 :                 ca->co >= cnfa->ncolors)
     171             :             {
     172        1116 :                 thiscolor = COLORLESS;
     173        1116 :                 break;
     174             :             }
     175       12839 :             if (thiscolor == COLORLESS)
     176             :             {
     177             :                 /* First plain outarc */
     178       12753 :                 thiscolor = ca->co;
     179       12753 :                 nextst = ca->to;
     180             :             }
     181          86 :             else if (thiscolor == ca->co)
     182             :             {
     183             :                 /* Another plain outarc for same color */
     184           4 :                 nextst = -1;
     185             :             }
     186             :             else
     187             :             {
     188             :                 /* More than one plain outarc color terminates the search */
     189          82 :                 thiscolor = COLORLESS;
     190          82 :                 break;
     191             :             }
     192             :         }
     193             :         /* Done if we didn't find exactly one color on plain outarcs */
     194       13865 :         if (thiscolor == COLORLESS)
     195        1198 :             break;
     196             :         /* The color must be a singleton */
     197       12667 :         if (cm->cd[thiscolor].nschrs != 1)
     198          11 :             break;
     199             :         /* Must not have any high-color-map entries */
     200       12656 :         if (cm->cd[thiscolor].nuchrs != 0)
     201           0 :             break;
     202             : 
     203             :         /*
     204             :          * Identify the color's sole member chr and add it to the prefix
     205             :          * string.  In general the colormap data structure doesn't provide a
     206             :          * way to find color member chrs, except by trying GETCOLOR() on each
     207             :          * possible chr value, which won't do at all.  However, for the cases
     208             :          * we care about it should be sufficient to test the "firstchr" value,
     209             :          * that is the first chr ever added to the color.  There are cases
     210             :          * where this might no longer be a member of the color (so we do need
     211             :          * to test), but none of them are likely to arise for a character that
     212             :          * is a member of a common prefix.  If we do hit such a corner case,
     213             :          * we just fall out without adding anything to the prefix string.
     214             :          */
     215       12656 :         c = cm->cd[thiscolor].firstchr;
     216       12656 :         if (GETCOLOR(cm, c) != thiscolor)
     217           0 :             break;
     218             : 
     219       12656 :         string[(*slength)++] = c;
     220             : 
     221             :         /* Advance to next state, but only if we have a unique next state */
     222       12656 :     } while (nextst != -1);
     223             : 
     224             :     /*
     225             :      * If we ended at a state that only has EOS/EOL outarcs leading to the
     226             :      * "post" state, then we have an exact-match string.  Note this is true
     227             :      * even if the string is of zero length.
     228             :      */
     229        1213 :     nextst = -1;
     230        2325 :     for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
     231             :     {
     232        1213 :         if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
     233             :         {
     234        2224 :             if (nextst == -1)
     235        1112 :                 nextst = ca->to;
     236           0 :             else if (nextst != ca->to)
     237             :             {
     238           0 :                 nextst = -1;
     239           0 :                 break;
     240             :             }
     241             :         }
     242             :         else
     243             :         {
     244         101 :             nextst = -1;
     245         101 :             break;
     246             :         }
     247             :     }
     248        1213 :     if (nextst == cnfa->post)
     249        1112 :         return REG_EXACT;
     250             : 
     251             :     /*
     252             :      * Otherwise, if we were unable to identify any prefix characters, say
     253             :      * NOMATCH --- the pattern is anchored left, but doesn't specify any
     254             :      * particular first character.
     255             :      */
     256         101 :     if (*slength > 0)
     257          98 :         return REG_PREFIX;
     258             : 
     259           3 :     return REG_NOMATCH;
     260             : }

Generated by: LCOV version 1.11