|           Line data    Source code 
       1             : /*
       2             :  * re_*exec and friends - match REs
       3             :  *
       4             :  * Copyright (c) 1998, 1999 Henry Spencer.  All rights reserved.
       5             :  *
       6             :  * Development of this software was funded, in part, by Cray Research Inc.,
       7             :  * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
       8             :  * Corporation, none of whom are responsible for the results.  The author
       9             :  * thanks all of them.
      10             :  *
      11             :  * Redistribution and use in source and binary forms -- with or without
      12             :  * modification -- are permitted for any purpose, provided that
      13             :  * redistributions in source form retain this entire copyright notice and
      14             :  * indicate the origin and nature of any modifications.
      15             :  *
      16             :  * I'd appreciate being given credit for this package in the documentation
      17             :  * of software which uses it, but that is not a requirement.
      18             :  *
      19             :  * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
      20             :  * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
      21             :  * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.  IN NO EVENT SHALL
      22             :  * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
      23             :  * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
      24             :  * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
      25             :  * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
      26             :  * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
      27             :  * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
      28             :  * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
      29             :  *
      30             :  * src/backend/regex/regexec.c
      31             :  *
      32             :  */
      33             : 
      34             : #include "regex/regguts.h"
      35             : 
      36             : 
      37             : 
      38             : /* lazy-DFA representation */
      39             : struct arcp
      40             : {                               /* "pointer" to an outarc */
      41             :     struct sset *ss;
      42             :     color       co;
      43             : };
      44             : 
      45             : struct sset
      46             : {                               /* state set */
      47             :     unsigned   *states;         /* pointer to bitvector */
      48             :     unsigned    hash;           /* hash of bitvector */
      49             : #define  HASH(bv, nw)    (((nw) == 1) ? *(bv) : hash(bv, nw))
      50             : #define  HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
      51             :         memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
      52             :     int         flags;
      53             : #define  STARTER     01         /* the initial state set */
      54             : #define  POSTSTATE   02         /* includes the goal state */
      55             : #define  LOCKED      04         /* locked in cache */
      56             : #define  NOPROGRESS  010        /* zero-progress state set */
      57             :     struct arcp ins;            /* chain of inarcs pointing here */
      58             :     chr        *lastseen;       /* last entered on arrival here */
      59             :     struct sset **outs;         /* outarc vector indexed by color */
      60             :     struct arcp *inchain;       /* chain-pointer vector for outarcs */
      61             : };
      62             : 
      63             : struct dfa
      64             : {
      65             :     int         nssets;         /* size of cache */
      66             :     int         nssused;        /* how many entries occupied yet */
      67             :     int         nstates;        /* number of states */
      68             :     int         ncolors;        /* length of outarc and inchain vectors */
      69             :     int         wordsper;       /* length of state-set bitvectors */
      70             :     struct sset *ssets;         /* state-set cache */
      71             :     unsigned   *statesarea;     /* bitvector storage */
      72             :     unsigned   *work;           /* pointer to work area within statesarea */
      73             :     struct sset **outsarea;     /* outarc-vector storage */
      74             :     struct arcp *incarea;       /* inchain storage */
      75             :     struct cnfa *cnfa;
      76             :     struct colormap *cm;
      77             :     chr        *lastpost;       /* location of last cache-flushed success */
      78             :     chr        *lastnopr;       /* location of last cache-flushed NOPROGRESS */
      79             :     struct sset *search;        /* replacement-search-pointer memory */
      80             :     int         cptsmalloced;   /* were the areas individually malloced? */
      81             :     char       *mallocarea;     /* self, or master malloced area, or NULL */
      82             : };
      83             : 
      84             : #define WORK    1               /* number of work bitvectors needed */
      85             : 
      86             : /* setup for non-malloc allocation for small cases */
      87             : #define FEWSTATES   20          /* must be less than UBITS */
      88             : #define FEWCOLORS   15
      89             : struct smalldfa
      90             : {
      91             :     struct dfa  dfa;
      92             :     struct sset ssets[FEWSTATES * 2];
      93             :     unsigned    statesarea[FEWSTATES * 2 + WORK];
      94             :     struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
      95             :     struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
      96             : };
      97             : 
      98             : #define DOMALLOC    ((struct smalldfa *)NULL)   /* force malloc */
      99             : 
     100             : 
     101             : 
     102             : /* internal variables, bundled for easy passing around */
     103             : struct vars
     104             : {
     105             :     regex_t    *re;
     106             :     struct guts *g;
     107             :     int         eflags;         /* copies of arguments */
     108             :     size_t      nmatch;
     109             :     regmatch_t *pmatch;
     110             :     rm_detail_t *details;
     111             :     chr        *start;          /* start of string */
     112             :     chr        *search_start;   /* search start of string */
     113             :     chr        *stop;           /* just past end of string */
     114             :     int         err;            /* error code if any (0 none) */
     115             :     struct dfa **subdfas;       /* per-tree-subre DFAs */
     116             :     struct dfa **ladfas;        /* per-lacon-subre DFAs */
     117             :     struct sset **lblastcss;    /* per-lacon-subre lookbehind restart data */
     118             :     chr       **lblastcp;       /* per-lacon-subre lookbehind restart data */
     119             :     struct smalldfa dfa1;
     120             :     struct smalldfa dfa2;
     121             : };
     122             : 
     123             : #define VISERR(vv)  ((vv)->err != 0) /* have we seen an error yet? */
     124             : #define ISERR() VISERR(v)
     125             : #define VERR(vv,e)  ((vv)->err = ((vv)->err ? (vv)->err : (e)))
     126             : #define ERR(e)  VERR(v, e)      /* record an error */
     127             : #define NOERR() {if (ISERR()) return v->err;}    /* if error seen, return it */
     128             : #define OFF(p)  ((p) - v->start)
     129             : #define LOFF(p) ((long)OFF(p))
     130             : 
     131             : 
     132             : 
     133             : /*
     134             :  * forward declarations
     135             :  */
     136             : /* === regexec.c === */
     137             : static struct dfa *getsubdfa(struct vars *, struct subre *);
     138             : static struct dfa *getladfa(struct vars *, int);
     139             : static int  find(struct vars *, struct cnfa *, struct colormap *);
     140             : static int  cfind(struct vars *, struct cnfa *, struct colormap *);
     141             : static int  cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
     142             : static void zapallsubs(regmatch_t *, size_t);
     143             : static void zaptreesubs(struct vars *, struct subre *);
     144             : static void subset(struct vars *, struct subre *, chr *, chr *);
     145             : static int  cdissect(struct vars *, struct subre *, chr *, chr *);
     146             : static int  ccondissect(struct vars *, struct subre *, chr *, chr *);
     147             : static int  crevcondissect(struct vars *, struct subre *, chr *, chr *);
     148             : static int  cbrdissect(struct vars *, struct subre *, chr *, chr *);
     149             : static int  caltdissect(struct vars *, struct subre *, chr *, chr *);
     150             : static int  citerdissect(struct vars *, struct subre *, chr *, chr *);
     151             : static int  creviterdissect(struct vars *, struct subre *, chr *, chr *);
     152             : 
     153             : /* === rege_dfa.c === */
     154             : static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
     155             : static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
     156             : static int  matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
     157             : static chr *lastcold(struct vars *, struct dfa *);
     158             : static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
     159             : static void freedfa(struct dfa *);
     160             : static unsigned hash(unsigned *, int);
     161             : static struct sset *initialize(struct vars *, struct dfa *, chr *);
     162             : static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *);
     163             : static int  lacon(struct vars *, struct cnfa *, chr *, color);
     164             : static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
     165             : static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
     166             : 
     167             : 
     168             : /*
     169             :  * pg_regexec - match regular expression
     170             :  */
     171             : int
     172       32302 : pg_regexec(regex_t *re,
     173             :            const chr *string,
     174             :            size_t len,
     175             :            size_t search_start,
     176             :            rm_detail_t *details,
     177             :            size_t nmatch,
     178             :            regmatch_t pmatch[],
     179             :            int flags)
     180             : {
     181             :     struct vars var;
     182       32302 :     register struct vars *v = &var;
     183             :     int         st;
     184             :     size_t      n;
     185             :     size_t      i;
     186             :     int         backref;
     187             : 
     188             : #define  LOCALMAT    20
     189             :     regmatch_t  mat[LOCALMAT];
     190             : 
     191             : #define  LOCALDFAS   40
     192             :     struct dfa *subdfas[LOCALDFAS];
     193             : 
     194             :     /* sanity checks */
     195       32302 :     if (re == NULL || string == NULL || re->re_magic != REMAGIC)
     196           0 :         return REG_INVARG;
     197       32302 :     if (re->re_csize != sizeof(chr))
     198           0 :         return REG_MIXED;
     199             : 
     200             :     /* Initialize locale-dependent support */
     201       32302 :     pg_set_regex_collation(re->re_collation);
     202             : 
     203             :     /* setup */
     204       32302 :     v->re = re;
     205       32302 :     v->g = (struct guts *) re->re_guts;
     206       32302 :     if ((v->g->cflags & REG_EXPECT) && details == NULL)
     207           0 :         return REG_INVARG;
     208       32302 :     if (v->g->info & REG_UIMPOSSIBLE)
     209           3 :         return REG_NOMATCH;
     210       32299 :     backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
     211       32299 :     v->eflags = flags;
     212       32299 :     if (v->g->cflags & REG_NOSUB)
     213           0 :         nmatch = 0;             /* override client */
     214       32299 :     v->nmatch = nmatch;
     215       32299 :     if (backref)
     216             :     {
     217             :         /* need work area */
     218          19 :         if (v->g->nsub + 1 <= LOCALMAT)
     219          19 :             v->pmatch = mat;
     220             :         else
     221           0 :             v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
     222             :                                               sizeof(regmatch_t));
     223          19 :         if (v->pmatch == NULL)
     224           0 :             return REG_ESPACE;
     225          19 :         v->nmatch = v->g->nsub + 1;
     226             :     }
     227             :     else
     228       32280 :         v->pmatch = pmatch;
     229       32299 :     v->details = details;
     230       32299 :     v->start = (chr *) string;
     231       32299 :     v->search_start = (chr *) string + search_start;
     232       32299 :     v->stop = (chr *) string + len;
     233       32299 :     v->err = 0;
     234       32299 :     v->subdfas = NULL;
     235       32299 :     v->ladfas = NULL;
     236       32299 :     v->lblastcss = NULL;
     237       32299 :     v->lblastcp = NULL;
     238             :     /* below this point, "goto cleanup" will behave sanely */
     239             : 
     240       32299 :     assert(v->g->ntree >= 0);
     241       32299 :     n = (size_t) v->g->ntree;
     242       32299 :     if (n <= LOCALDFAS)
     243       32297 :         v->subdfas = subdfas;
     244             :     else
     245             :     {
     246           2 :         v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     247           2 :         if (v->subdfas == NULL)
     248             :         {
     249           0 :             st = REG_ESPACE;
     250           0 :             goto cleanup;
     251             :         }
     252             :     }
     253      105140 :     for (i = 0; i < n; i++)
     254       72841 :         v->subdfas[i] = NULL;
     255             : 
     256       32299 :     assert(v->g->nlacons >= 0);
     257       32299 :     n = (size_t) v->g->nlacons;
     258       32299 :     if (n > 0)
     259             :     {
     260         206 :         v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
     261         206 :         if (v->ladfas == NULL)
     262             :         {
     263           0 :             st = REG_ESPACE;
     264           0 :             goto cleanup;
     265             :         }
     266         626 :         for (i = 0; i < n; i++)
     267         420 :             v->ladfas[i] = NULL;
     268         206 :         v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
     269         206 :         v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
     270         206 :         if (v->lblastcss == NULL || v->lblastcp == NULL)
     271             :         {
     272           0 :             st = REG_ESPACE;
     273           0 :             goto cleanup;
     274             :         }
     275         626 :         for (i = 0; i < n; i++)
     276             :         {
     277         420 :             v->lblastcss[i] = NULL;
     278         420 :             v->lblastcp[i] = NULL;
     279             :         }
     280             :     }
     281             : 
     282             :     /* do it */
     283       32299 :     assert(v->g->tree != NULL);
     284       32299 :     if (backref)
     285          19 :         st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
     286             :     else
     287       32280 :         st = find(v, &v->g->tree->cnfa, &v->g->cmap);
     288             : 
     289             :     /* copy (portion of) match vector over if necessary */
     290       32299 :     if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
     291             :     {
     292           2 :         zapallsubs(pmatch, nmatch);
     293           2 :         n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
     294           2 :         memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
     295             :     }
     296             : 
     297             :     /* clean up */
     298             : cleanup:
     299       32299 :     if (v->pmatch != pmatch && v->pmatch != mat)
     300           0 :         FREE(v->pmatch);
     301       32299 :     if (v->subdfas != NULL)
     302             :     {
     303       32299 :         n = (size_t) v->g->ntree;
     304      105140 :         for (i = 0; i < n; i++)
     305             :         {
     306       72841 :             if (v->subdfas[i] != NULL)
     307         386 :                 freedfa(v->subdfas[i]);
     308             :         }
     309       32299 :         if (v->subdfas != subdfas)
     310           2 :             FREE(v->subdfas);
     311             :     }
     312       32299 :     if (v->ladfas != NULL)
     313             :     {
     314         206 :         n = (size_t) v->g->nlacons;
     315         626 :         for (i = 0; i < n; i++)
     316             :         {
     317         420 :             if (v->ladfas[i] != NULL)
     318          12 :                 freedfa(v->ladfas[i]);
     319             :         }
     320         206 :         FREE(v->ladfas);
     321             :     }
     322       32299 :     if (v->lblastcss != NULL)
     323         206 :         FREE(v->lblastcss);
     324       32299 :     if (v->lblastcp != NULL)
     325         206 :         FREE(v->lblastcp);
     326             : 
     327       32299 :     return st;
     328             : }
     329             : 
     330             : /*
     331             :  * getsubdfa - create or re-fetch the DFA for a tree subre node
     332             :  *
     333             :  * We only need to create the DFA once per overall regex execution.
     334             :  * The DFA will be freed by the cleanup step in pg_regexec().
     335             :  */
     336             : static struct dfa *
     337         709 : getsubdfa(struct vars *v,
     338             :           struct subre *t)
     339             : {
     340         709 :     if (v->subdfas[t->id] == NULL)
     341             :     {
     342         386 :         v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
     343         386 :         if (ISERR())
     344           0 :             return NULL;
     345             :     }
     346         709 :     return v->subdfas[t->id];
     347             : }
     348             : 
     349             : /*
     350             :  * getladfa - create or re-fetch the DFA for a LACON subre node
     351             :  *
     352             :  * Same as above, but for LACONs.
     353             :  */
     354             : static struct dfa *
     355          26 : getladfa(struct vars *v,
     356             :          int n)
     357             : {
     358          26 :     assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
     359             : 
     360          26 :     if (v->ladfas[n] == NULL)
     361             :     {
     362          12 :         struct subre *sub = &v->g->lacons[n];
     363             : 
     364          12 :         v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
     365          12 :         if (ISERR())
     366           0 :             return NULL;
     367             :     }
     368          26 :     return v->ladfas[n];
     369             : }
     370             : 
     371             : /*
     372             :  * find - find a match for the main NFA (no-complications case)
     373             :  */
     374             : static int
     375       32280 : find(struct vars *v,
     376             :      struct cnfa *cnfa,
     377             :      struct colormap *cm)
     378             : {
     379             :     struct dfa *s;
     380             :     struct dfa *d;
     381             :     chr        *begin;
     382       32280 :     chr        *end = NULL;
     383             :     chr        *cold;
     384             :     chr        *open;           /* open and close of range of possible starts */
     385             :     chr        *close;
     386             :     int         hitend;
     387       32280 :     int         shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
     388             : 
     389             :     /* first, a shot with the search RE */
     390       32280 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     391       32280 :     assert(!(ISERR() && s != NULL));
     392       32280 :     NOERR();
     393             :     MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
     394       32280 :     cold = NULL;
     395       32280 :     close = shortest(v, s, v->search_start, v->search_start, v->stop,
     396             :                      &cold, (int *) NULL);
     397       32280 :     freedfa(s);
     398       32280 :     NOERR();
     399       32280 :     if (v->g->cflags & REG_EXPECT)
     400             :     {
     401           0 :         assert(v->details != NULL);
     402           0 :         if (cold != NULL)
     403           0 :             v->details->rm_extend.rm_so = OFF(cold);
     404             :         else
     405           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     406           0 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     407             :     }
     408       32280 :     if (close == NULL)          /* not found */
     409       29910 :         return REG_NOMATCH;
     410        2370 :     if (v->nmatch == 0)          /* found, don't need exact location */
     411        2090 :         return REG_OKAY;
     412             : 
     413             :     /* find starting point and match */
     414         280 :     assert(cold != NULL);
     415         280 :     open = cold;
     416         280 :     cold = NULL;
     417             :     MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
     418         280 :     d = newdfa(v, cnfa, cm, &v->dfa1);
     419         280 :     assert(!(ISERR() && d != NULL));
     420         280 :     NOERR();
     421         282 :     for (begin = open; begin <= close; begin++)
     422             :     {
     423             :         MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
     424         282 :         if (shorter)
     425           1 :             end = shortest(v, d, begin, begin, v->stop,
     426             :                            (chr **) NULL, &hitend);
     427             :         else
     428         281 :             end = longest(v, d, begin, v->stop, &hitend);
     429         282 :         if (ISERR())
     430             :         {
     431           0 :             freedfa(d);
     432           0 :             return v->err;
     433             :         }
     434         282 :         if (hitend && cold == NULL)
     435          45 :             cold = begin;
     436         282 :         if (end != NULL)
     437         280 :             break;              /* NOTE BREAK OUT */
     438             :     }
     439         280 :     assert(end != NULL);        /* search RE succeeded so loop should */
     440         280 :     freedfa(d);
     441             : 
     442             :     /* and pin down details */
     443         280 :     assert(v->nmatch > 0);
     444         280 :     v->pmatch[0].rm_so = OFF(begin);
     445         280 :     v->pmatch[0].rm_eo = OFF(end);
     446         280 :     if (v->g->cflags & REG_EXPECT)
     447             :     {
     448           0 :         if (cold != NULL)
     449           0 :             v->details->rm_extend.rm_so = OFF(cold);
     450             :         else
     451           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     452           0 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     453             :     }
     454         280 :     if (v->nmatch == 1)          /* no need for submatches */
     455         253 :         return REG_OKAY;
     456             : 
     457             :     /* find submatches */
     458          27 :     zapallsubs(v->pmatch, v->nmatch);
     459          27 :     return cdissect(v, v->g->tree, begin, end);
     460             : }
     461             : 
     462             : /*
     463             :  * cfind - find a match for the main NFA (with complications)
     464             :  */
     465             : static int
     466          19 : cfind(struct vars *v,
     467             :       struct cnfa *cnfa,
     468             :       struct colormap *cm)
     469             : {
     470             :     struct dfa *s;
     471             :     struct dfa *d;
     472             :     chr        *cold;
     473             :     int         ret;
     474             : 
     475          19 :     s = newdfa(v, &v->g->search, cm, &v->dfa1);
     476          19 :     NOERR();
     477          19 :     d = newdfa(v, cnfa, cm, &v->dfa2);
     478          19 :     if (ISERR())
     479             :     {
     480           0 :         assert(d == NULL);
     481           0 :         freedfa(s);
     482           0 :         return v->err;
     483             :     }
     484             : 
     485          19 :     ret = cfindloop(v, cnfa, cm, d, s, &cold);
     486             : 
     487          19 :     freedfa(d);
     488          19 :     freedfa(s);
     489          19 :     NOERR();
     490          19 :     if (v->g->cflags & REG_EXPECT)
     491             :     {
     492           0 :         assert(v->details != NULL);
     493           0 :         if (cold != NULL)
     494           0 :             v->details->rm_extend.rm_so = OFF(cold);
     495             :         else
     496           0 :             v->details->rm_extend.rm_so = OFF(v->stop);
     497           0 :         v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
     498             :     }
     499          19 :     return ret;
     500             : }
     501             : 
     502             : /*
     503             :  * cfindloop - the heart of cfind
     504             :  */
     505             : static int
     506          19 : cfindloop(struct vars *v,
     507             :           struct cnfa *cnfa,
     508             :           struct colormap *cm,
     509             :           struct dfa *d,
     510             :           struct dfa *s,
     511             :           chr **coldp)          /* where to put coldstart pointer */
     512             : {
     513             :     chr        *begin;
     514             :     chr        *end;
     515             :     chr        *cold;
     516             :     chr        *open;           /* open and close of range of possible starts */
     517             :     chr        *close;
     518             :     chr        *estart;
     519             :     chr        *estop;
     520             :     int         er;
     521          19 :     int         shorter = v->g->tree->flags & SHORTER;
     522             :     int         hitend;
     523             : 
     524          19 :     assert(d != NULL && s != NULL);
     525          19 :     cold = NULL;
     526          19 :     close = v->search_start;
     527             :     do
     528             :     {
     529             :         /* Search with the search RE for match range at/beyond "close" */
     530             :         MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
     531          19 :         close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
     532          19 :         if (ISERR())
     533             :         {
     534           0 :             *coldp = cold;
     535           0 :             return v->err;
     536             :         }
     537          19 :         if (close == NULL)
     538           1 :             break;              /* no more possible match anywhere */
     539          18 :         assert(cold != NULL);
     540          18 :         open = cold;
     541          18 :         cold = NULL;
     542             :         /* Search for matches starting between "open" and "close" inclusive */
     543             :         MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
     544          78 :         for (begin = open; begin <= close; begin++)
     545             :         {
     546             :             MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
     547          70 :             estart = begin;
     548          70 :             estop = v->stop;
     549             :             for (;;)
     550             :             {
     551             :                 /* Here we use the top node's detailed RE */
     552          98 :                 if (shorter)
     553          32 :                     end = shortest(v, d, begin, estart,
     554             :                                    estop, (chr **) NULL, &hitend);
     555             :                 else
     556          66 :                     end = longest(v, d, begin, estop,
     557             :                                   &hitend);
     558          98 :                 if (ISERR())
     559             :                 {
     560           0 :                     *coldp = cold;
     561           0 :                     return v->err;
     562             :                 }
     563          98 :                 if (hitend && cold == NULL)
     564          15 :                     cold = begin;
     565          98 :                 if (end == NULL)
     566          54 :                     break;      /* no match with this begin point, try next */
     567             :                 MDEBUG(("tentative end %ld\n", LOFF(end)));
     568             :                 /* Dissect the potential match to see if it really matches */
     569          44 :                 zapallsubs(v->pmatch, v->nmatch);
     570          44 :                 er = cdissect(v, v->g->tree, begin, end);
     571          44 :                 if (er == REG_OKAY)
     572             :                 {
     573          10 :                     if (v->nmatch > 0)
     574             :                     {
     575          10 :                         v->pmatch[0].rm_so = OFF(begin);
     576          10 :                         v->pmatch[0].rm_eo = OFF(end);
     577             :                     }
     578          10 :                     *coldp = cold;
     579          10 :                     return REG_OKAY;
     580             :                 }
     581          34 :                 if (er != REG_NOMATCH)
     582             :                 {
     583           0 :                     ERR(er);
     584           0 :                     *coldp = cold;
     585           0 :                     return er;
     586             :                 }
     587             :                 /* Try next longer/shorter match with same begin point */
     588          34 :                 if (shorter)
     589             :                 {
     590          27 :                     if (end == estop)
     591           4 :                         break;  /* no more, so try next begin point */
     592          23 :                     estart = end + 1;
     593             :                 }
     594             :                 else
     595             :                 {
     596           7 :                     if (end == begin)
     597           2 :                         break;  /* no more, so try next begin point */
     598           5 :                     estop = end - 1;
     599             :                 }
     600          28 :             }                   /* end loop over endpoint positions */
     601             :         }                       /* end loop over beginning positions */
     602             : 
     603             :         /*
     604             :          * If we get here, there is no possible match starting at or before
     605             :          * "close", so consider matches beyond that.  We'll do a fresh search
     606             :          * with the search RE to find a new promising match range.
     607             :          */
     608           8 :         close++;
     609           8 :     } while (close < v->stop);
     610             : 
     611           9 :     *coldp = cold;
     612           9 :     return REG_NOMATCH;
     613             : }
     614             : 
     615             : /*
     616             :  * zapallsubs - initialize all subexpression matches to "no match"
     617             :  */
     618             : static void
     619          73 : zapallsubs(regmatch_t *p,
     620             :            size_t n)
     621             : {
     622             :     size_t      i;
     623             : 
     624         279 :     for (i = n - 1; i > 0; i--)
     625             :     {
     626         206 :         p[i].rm_so = -1;
     627         206 :         p[i].rm_eo = -1;
     628             :     }
     629          73 : }
     630             : 
     631             : /*
     632             :  * zaptreesubs - initialize subexpressions within subtree to "no match"
     633             :  */
     634             : static void
     635        1773 : zaptreesubs(struct vars *v,
     636             :             struct subre *t)
     637             : {
     638        1773 :     if (t->op == '(')
     639             :     {
     640         195 :         int         n = t->subno;
     641             : 
     642         195 :         assert(n > 0);
     643         195 :         if ((size_t) n < v->nmatch)
     644             :         {
     645         191 :             v->pmatch[n].rm_so = -1;
     646         191 :             v->pmatch[n].rm_eo = -1;
     647             :         }
     648             :     }
     649             : 
     650        1773 :     if (t->left != NULL)
     651         738 :         zaptreesubs(v, t->left);
     652        1773 :     if (t->right != NULL)
     653         473 :         zaptreesubs(v, t->right);
     654        1773 : }
     655             : 
     656             : /*
     657             :  * subset - set subexpression match data for a successful subre
     658             :  */
     659             : static void
     660          97 : subset(struct vars *v,
     661             :        struct subre *sub,
     662             :        chr *begin,
     663             :        chr *end)
     664             : {
     665          97 :     int         n = sub->subno;
     666             : 
     667          97 :     assert(n > 0);
     668          97 :     if ((size_t) n >= v->nmatch)
     669         100 :         return;
     670             : 
     671             :     MDEBUG(("setting %d\n", n));
     672          94 :     v->pmatch[n].rm_so = OFF(begin);
     673          94 :     v->pmatch[n].rm_eo = OFF(end);
     674             : }
     675             : 
     676             : /*
     677             :  * cdissect - check backrefs and determine subexpression matches
     678             :  *
     679             :  * cdissect recursively processes a subre tree to check matching of backrefs
     680             :  * and/or identify submatch boundaries for capture nodes.  The proposed match
     681             :  * runs from "begin" to "end" (not including "end"), and we are basically
     682             :  * "dissecting" it to see where the submatches are.
     683             :  *
     684             :  * Before calling any level of cdissect, the caller must have run the node's
     685             :  * DFA and found that the proposed substring satisfies the DFA.  (We make
     686             :  * the caller do that because in concatenation and iteration nodes, it's
     687             :  * much faster to check all the substrings against the child DFAs before we
     688             :  * recurse.)  Also, caller must have cleared subexpression match data via
     689             :  * zaptreesubs (or zapallsubs at the top level).
     690             :  */
     691             : static int                      /* regexec return code */
     692         850 : cdissect(struct vars *v,
     693             :          struct subre *t,
     694             :          chr *begin,            /* beginning of relevant substring */
     695             :          chr *end)              /* end of same */
     696             : {
     697             :     int         er;
     698             : 
     699         850 :     assert(t != NULL);
     700             :     MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
     701             : 
     702             :     /* handy place to check for operation cancel */
     703         850 :     if (CANCEL_REQUESTED(v->re))
     704           0 :         return REG_CANCEL;
     705             :     /* ... and stack overrun */
     706         850 :     if (STACK_TOO_DEEP(v->re))
     707           0 :         return REG_ETOOBIG;
     708             : 
     709         850 :     switch (t->op)
     710             :     {
     711             :         case '=':               /* terminal node */
     712         312 :             assert(t->left == NULL && t->right == NULL);
     713         312 :             er = REG_OKAY;      /* no action, parent did the work */
     714         312 :             break;
     715             :         case 'b':               /* back reference */
     716          54 :             assert(t->left == NULL && t->right == NULL);
     717          54 :             er = cbrdissect(v, t, begin, end);
     718          54 :             break;
     719             :         case '.':               /* concatenation */
     720         346 :             assert(t->left != NULL && t->right != NULL);
     721         346 :             if (t->left->flags & SHORTER) /* reverse scan */
     722          52 :                 er = crevcondissect(v, t, begin, end);
     723             :             else
     724         294 :                 er = ccondissect(v, t, begin, end);
     725         346 :             break;
     726             :         case '|':               /* alternation */
     727           3 :             assert(t->left != NULL);
     728           3 :             er = caltdissect(v, t, begin, end);
     729           3 :             break;
     730             :         case '*':               /* iteration */
     731          12 :             assert(t->left != NULL);
     732          12 :             if (t->left->flags & SHORTER) /* reverse scan */
     733           1 :                 er = creviterdissect(v, t, begin, end);
     734             :             else
     735          11 :                 er = citerdissect(v, t, begin, end);
     736          12 :             break;
     737             :         case '(':               /* capturing */
     738         123 :             assert(t->left != NULL && t->right == NULL);
     739         123 :             assert(t->subno > 0);
     740         123 :             er = cdissect(v, t->left, begin, end);
     741         123 :             if (er == REG_OKAY)
     742          97 :                 subset(v, t, begin, end);
     743         123 :             break;
     744             :         default:
     745           0 :             er = REG_ASSERT;
     746           0 :             break;
     747             :     }
     748             : 
     749             :     /*
     750             :      * We should never have a match failure unless backrefs lurk below;
     751             :      * otherwise, either caller failed to check the DFA, or there's some
     752             :      * inconsistency between the DFA and the node's innards.
     753             :      */
     754         850 :     assert(er != REG_NOMATCH || (t->flags & BACKR));
     755             : 
     756         850 :     return er;
     757             : }
     758             : 
     759             : /*
     760             :  * ccondissect - dissect match for concatenation node
     761             :  */
     762             : static int                      /* regexec return code */
     763         294 : ccondissect(struct vars *v,
     764             :             struct subre *t,
     765             :             chr *begin,         /* beginning of relevant substring */
     766             :             chr *end)           /* end of same */
     767             : {
     768             :     struct dfa *d;
     769             :     struct dfa *d2;
     770             :     chr        *mid;
     771             :     int         er;
     772             : 
     773         294 :     assert(t->op == '.');
     774         294 :     assert(t->left != NULL && t->left->cnfa.nstates > 0);
     775         294 :     assert(t->right != NULL && t->right->cnfa.nstates > 0);
     776         294 :     assert(!(t->left->flags & SHORTER));
     777             : 
     778         294 :     d = getsubdfa(v, t->left);
     779         294 :     NOERR();
     780         294 :     d2 = getsubdfa(v, t->right);
     781         294 :     NOERR();
     782             :     MDEBUG(("cconcat %d\n", t->id));
     783             : 
     784             :     /* pick a tentative midpoint */
     785         294 :     mid = longest(v, d, begin, end, (int *) NULL);
     786         294 :     NOERR();
     787         294 :     if (mid == NULL)
     788           0 :         return REG_NOMATCH;
     789             :     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
     790             : 
     791             :     /* iterate until satisfaction or failure */
     792             :     for (;;)
     793             :     {
     794             :         /* try this midpoint on for size */
     795         402 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     796             :         {
     797         297 :             er = cdissect(v, t->left, begin, mid);
     798         297 :             if (er == REG_OKAY)
     799             :             {
     800         250 :                 er = cdissect(v, t->right, mid, end);
     801         250 :                 if (er == REG_OKAY)
     802             :                 {
     803             :                     /* satisfaction */
     804             :                     MDEBUG(("successful\n"));
     805         147 :                     return REG_OKAY;
     806             :                 }
     807             :             }
     808         150 :             if (er != REG_NOMATCH)
     809           0 :                 return er;
     810             :         }
     811         255 :         NOERR();
     812             : 
     813             :         /* that midpoint didn't work, find a new one */
     814         255 :         if (mid == begin)
     815             :         {
     816             :             /* all possibilities exhausted */
     817             :             MDEBUG(("%d no midpoint\n", t->id));
     818          61 :             return REG_NOMATCH;
     819             :         }
     820         194 :         mid = longest(v, d, begin, mid - 1, (int *) NULL);
     821         194 :         NOERR();
     822         194 :         if (mid == NULL)
     823             :         {
     824             :             /* failed to find a new one */
     825             :             MDEBUG(("%d failed midpoint\n", t->id));
     826          86 :             return REG_NOMATCH;
     827             :         }
     828             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     829         108 :         zaptreesubs(v, t->left);
     830         108 :         zaptreesubs(v, t->right);
     831         108 :     }
     832             : 
     833             :     /* can't get here */
     834             :     return REG_ASSERT;
     835             : }
     836             : 
     837             : /*
     838             :  * crevcondissect - dissect match for concatenation node, shortest-first
     839             :  */
     840             : static int                      /* regexec return code */
     841          52 : crevcondissect(struct vars *v,
     842             :                struct subre *t,
     843             :                chr *begin,      /* beginning of relevant substring */
     844             :                chr *end)        /* end of same */
     845             : {
     846             :     struct dfa *d;
     847             :     struct dfa *d2;
     848             :     chr        *mid;
     849             :     int         er;
     850             : 
     851          52 :     assert(t->op == '.');
     852          52 :     assert(t->left != NULL && t->left->cnfa.nstates > 0);
     853          52 :     assert(t->right != NULL && t->right->cnfa.nstates > 0);
     854          52 :     assert(t->left->flags & SHORTER);
     855             : 
     856          52 :     d = getsubdfa(v, t->left);
     857          52 :     NOERR();
     858          52 :     d2 = getsubdfa(v, t->right);
     859          52 :     NOERR();
     860             :     MDEBUG(("crevcon %d\n", t->id));
     861             : 
     862             :     /* pick a tentative midpoint */
     863          52 :     mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
     864          52 :     NOERR();
     865          52 :     if (mid == NULL)
     866           0 :         return REG_NOMATCH;
     867             :     MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
     868             : 
     869             :     /* iterate until satisfaction or failure */
     870             :     for (;;)
     871             :     {
     872             :         /* try this midpoint on for size */
     873         216 :         if (longest(v, d2, mid, end, (int *) NULL) == end)
     874             :         {
     875          52 :             er = cdissect(v, t->left, begin, mid);
     876          52 :             if (er == REG_OKAY)
     877             :             {
     878          36 :                 er = cdissect(v, t->right, mid, end);
     879          36 :                 if (er == REG_OKAY)
     880             :                 {
     881             :                     /* satisfaction */
     882             :                     MDEBUG(("successful\n"));
     883           9 :                     return REG_OKAY;
     884             :                 }
     885             :             }
     886          43 :             if (er != REG_NOMATCH)
     887           0 :                 return er;
     888             :         }
     889         207 :         NOERR();
     890             : 
     891             :         /* that midpoint didn't work, find a new one */
     892         207 :         if (mid == end)
     893             :         {
     894             :             /* all possibilities exhausted */
     895             :             MDEBUG(("%d no midpoint\n", t->id));
     896          43 :             return REG_NOMATCH;
     897             :         }
     898         164 :         mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
     899         164 :         NOERR();
     900         164 :         if (mid == NULL)
     901             :         {
     902             :             /* failed to find a new one */
     903             :             MDEBUG(("%d failed midpoint\n", t->id));
     904           0 :             return REG_NOMATCH;
     905             :         }
     906             :         MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
     907         164 :         zaptreesubs(v, t->left);
     908         164 :         zaptreesubs(v, t->right);
     909         164 :     }
     910             : 
     911             :     /* can't get here */
     912             :     return REG_ASSERT;
     913             : }
     914             : 
     915             : /*
     916             :  * cbrdissect - dissect match for backref node
     917             :  */
     918             : static int                      /* regexec return code */
     919          54 : cbrdissect(struct vars *v,
     920             :            struct subre *t,
     921             :            chr *begin,          /* beginning of relevant substring */
     922             :            chr *end)            /* end of same */
     923             : {
     924          54 :     int         n = t->subno;
     925             :     size_t      numreps;
     926             :     size_t      tlen;
     927             :     size_t      brlen;
     928             :     chr        *brstring;
     929             :     chr        *p;
     930          54 :     int         min = t->min;
     931          54 :     int         max = t->max;
     932             : 
     933          54 :     assert(t != NULL);
     934          54 :     assert(t->op == 'b');
     935          54 :     assert(n >= 0);
     936          54 :     assert((size_t) n < v->nmatch);
     937             : 
     938             :     MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
     939             : 
     940             :     /* get the backreferenced string */
     941          54 :     if (v->pmatch[n].rm_so == -1)
     942           2 :         return REG_NOMATCH;
     943          52 :     brstring = v->start + v->pmatch[n].rm_so;
     944          52 :     brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
     945             : 
     946             :     /* special cases for zero-length strings */
     947          52 :     if (brlen == 0)
     948             :     {
     949             :         /*
     950             :          * matches only if target is zero length, but any number of
     951             :          * repetitions can be considered to be present
     952             :          */
     953           2 :         if (begin == end && min <= max)
     954             :         {
     955             :             MDEBUG(("cbackref matched trivially\n"));
     956           2 :             return REG_OKAY;
     957             :         }
     958           0 :         return REG_NOMATCH;
     959             :     }
     960          50 :     if (begin == end)
     961             :     {
     962             :         /* matches only if zero repetitions are okay */
     963           1 :         if (min == 0)
     964             :         {
     965             :             MDEBUG(("cbackref matched trivially\n"));
     966           1 :             return REG_OKAY;
     967             :         }
     968           0 :         return REG_NOMATCH;
     969             :     }
     970             : 
     971             :     /*
     972             :      * check target length to see if it could possibly be an allowed number of
     973             :      * repetitions of brstring
     974             :      */
     975          49 :     assert(end > begin);
     976          49 :     tlen = end - begin;
     977          49 :     if (tlen % brlen != 0)
     978           6 :         return REG_NOMATCH;
     979          43 :     numreps = tlen / brlen;
     980          43 :     if (numreps < min || (numreps > max && max != DUPINF))
     981           0 :         return REG_NOMATCH;
     982             : 
     983             :     /* okay, compare the actual string contents */
     984          43 :     p = begin;
     985         102 :     while (numreps-- > 0)
     986             :     {
     987          48 :         if ((*v->g->compare) (brstring, p, brlen) != 0)
     988          32 :             return REG_NOMATCH;
     989          16 :         p += brlen;
     990             :     }
     991             : 
     992             :     MDEBUG(("cbackref matched\n"));
     993          11 :     return REG_OKAY;
     994             : }
     995             : 
     996             : /*
     997             :  * caltdissect - dissect match for alternation node
     998             :  */
     999             : static int                      /* regexec return code */
    1000           3 : caltdissect(struct vars *v,
    1001             :             struct subre *t,
    1002             :             chr *begin,         /* beginning of relevant substring */
    1003             :             chr *end)           /* end of same */
    1004             : {
    1005             :     struct dfa *d;
    1006             :     int         er;
    1007             : 
    1008             :     /* We loop, rather than tail-recurse, to handle a chain of alternatives */
    1009          10 :     while (t != NULL)
    1010             :     {
    1011           5 :         assert(t->op == '|');
    1012           5 :         assert(t->left != NULL && t->left->cnfa.nstates > 0);
    1013             : 
    1014             :         MDEBUG(("calt n%d\n", t->id));
    1015             : 
    1016           5 :         d = getsubdfa(v, t->left);
    1017           5 :         NOERR();
    1018           5 :         if (longest(v, d, begin, end, (int *) NULL) == end)
    1019             :         {
    1020             :             MDEBUG(("calt matched\n"));
    1021           3 :             er = cdissect(v, t->left, begin, end);
    1022           3 :             if (er != REG_NOMATCH)
    1023           1 :                 return er;
    1024             :         }
    1025           4 :         NOERR();
    1026             : 
    1027           4 :         t = t->right;
    1028             :     }
    1029             : 
    1030           2 :     return REG_NOMATCH;
    1031             : }
    1032             : 
    1033             : /*
    1034             :  * citerdissect - dissect match for iteration node
    1035             :  */
    1036             : static int                      /* regexec return code */
    1037          11 : citerdissect(struct vars *v,
    1038             :              struct subre *t,
    1039             :              chr *begin,        /* beginning of relevant substring */
    1040             :              chr *end)          /* end of same */
    1041             : {
    1042             :     struct dfa *d;
    1043             :     chr       **endpts;
    1044             :     chr        *limit;
    1045             :     int         min_matches;
    1046             :     size_t      max_matches;
    1047             :     int         nverified;
    1048             :     int         k;
    1049             :     int         i;
    1050             :     int         er;
    1051             : 
    1052          11 :     assert(t->op == '*');
    1053          11 :     assert(t->left != NULL && t->left->cnfa.nstates > 0);
    1054          11 :     assert(!(t->left->flags & SHORTER));
    1055          11 :     assert(begin <= end);
    1056             : 
    1057             :     /*
    1058             :      * For the moment, assume the minimum number of matches is 1.  If zero
    1059             :      * matches are allowed, and the target string is empty, we are allowed to
    1060             :      * match regardless of the contents of the iter node --- but we would
    1061             :      * prefer to match once, so that capturing parens get set.  (An example of
    1062             :      * the concern here is a pattern like "()*\1", which historically this
    1063             :      * code has allowed to succeed.)  Therefore, we deal with the zero-matches
    1064             :      * case at the bottom, after failing to find any other way to match.
    1065             :      */
    1066          11 :     min_matches = t->min;
    1067          11 :     if (min_matches <= 0)
    1068           2 :         min_matches = 1;
    1069             : 
    1070             :     /*
    1071             :      * We need workspace to track the endpoints of each sub-match.  Normally
    1072             :      * we consider only nonzero-length sub-matches, so there can be at most
    1073             :      * end-begin of them.  However, if min is larger than that, we will also
    1074             :      * consider zero-length sub-matches in order to find enough matches.
    1075             :      *
    1076             :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1077             :      * sub-match endpoints in endpts[1..max_matches].
    1078             :      */
    1079          11 :     max_matches = end - begin;
    1080          11 :     if (max_matches > t->max && t->max != DUPINF)
    1081           0 :         max_matches = t->max;
    1082          11 :     if (max_matches < min_matches)
    1083           2 :         max_matches = min_matches;
    1084          11 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1085          11 :     if (endpts == NULL)
    1086           0 :         return REG_ESPACE;
    1087          11 :     endpts[0] = begin;
    1088             : 
    1089          11 :     d = getsubdfa(v, t->left);
    1090          11 :     if (ISERR())
    1091             :     {
    1092           0 :         FREE(endpts);
    1093           0 :         return v->err;
    1094             :     }
    1095             :     MDEBUG(("citer %d\n", t->id));
    1096             : 
    1097             :     /*
    1098             :      * Our strategy is to first find a set of sub-match endpoints that are
    1099             :      * valid according to the child node's DFA, and then recursively dissect
    1100             :      * each sub-match to confirm validity.  If any validity check fails,
    1101             :      * backtrack the last sub-match and try again.  And, when we next try for
    1102             :      * a validity check, we need not recheck any successfully verified
    1103             :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1104             :      * how many sub-matches are currently known okay.
    1105             :      */
    1106             : 
    1107             :     /* initialize to consider first sub-match */
    1108          11 :     nverified = 0;
    1109          11 :     k = 1;
    1110          11 :     limit = end;
    1111             : 
    1112             :     /* iterate until satisfaction or failure */
    1113         112 :     while (k > 0)
    1114             :     {
    1115             :         /* try to find an endpoint for the k'th sub-match */
    1116          93 :         endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
    1117          93 :         if (ISERR())
    1118             :         {
    1119           0 :             FREE(endpts);
    1120           0 :             return v->err;
    1121             :         }
    1122          93 :         if (endpts[k] == NULL)
    1123             :         {
    1124             :             /* no match possible, so see if we can shorten previous one */
    1125          43 :             k--;
    1126          43 :             goto backtrack;
    1127             :         }
    1128             :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1129             :                 t->id, k, LOFF(endpts[k])));
    1130             : 
    1131             :         /* k'th sub-match can no longer be considered verified */
    1132          50 :         if (nverified >= k)
    1133           2 :             nverified = k - 1;
    1134             : 
    1135          50 :         if (endpts[k] != end)
    1136             :         {
    1137             :             /* haven't reached end yet, try another iteration if allowed */
    1138          37 :             if (k >= max_matches)
    1139             :             {
    1140             :                 /* must try to shorten some previous match */
    1141           0 :                 k--;
    1142           0 :                 goto backtrack;
    1143             :             }
    1144             : 
    1145             :             /* reject zero-length match unless necessary to achieve min */
    1146          37 :             if (endpts[k] == endpts[k - 1] &&
    1147           0 :                 (k >= min_matches || min_matches - k < end - endpts[k]))
    1148             :                 goto backtrack;
    1149             : 
    1150          37 :             k++;
    1151          37 :             limit = end;
    1152          37 :             continue;
    1153             :         }
    1154             : 
    1155             :         /*
    1156             :          * We've identified a way to divide the string into k sub-matches that
    1157             :          * works so far as the child DFA can tell.  If k is an allowed number
    1158             :          * of matches, start the slow part: recurse to verify each sub-match.
    1159             :          * We always have k <= max_matches, needn't check that.
    1160             :          */
    1161          13 :         if (k < min_matches)
    1162           0 :             goto backtrack;
    1163             : 
    1164             :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1165             : 
    1166          40 :         for (i = nverified + 1; i <= k; i++)
    1167             :         {
    1168          17 :             zaptreesubs(v, t->left);
    1169          17 :             er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
    1170          17 :             if (er == REG_OKAY)
    1171             :             {
    1172           7 :                 nverified = i;
    1173           7 :                 continue;
    1174             :             }
    1175          10 :             if (er == REG_NOMATCH)
    1176          10 :                 break;
    1177             :             /* oops, something failed */
    1178           0 :             FREE(endpts);
    1179           0 :             return er;
    1180             :         }
    1181             : 
    1182          13 :         if (i > k)
    1183             :         {
    1184             :             /* satisfaction */
    1185             :             MDEBUG(("%d successful\n", t->id));
    1186           3 :             FREE(endpts);
    1187           3 :             return REG_OKAY;
    1188             :         }
    1189             : 
    1190             :         /* match failed to verify, so backtrack */
    1191             : 
    1192             : backtrack:
    1193             : 
    1194             :         /*
    1195             :          * Must consider shorter versions of the current sub-match.  However,
    1196             :          * we'll only ask for a zero-length match if necessary.
    1197             :          */
    1198         106 :         while (k > 0)
    1199             :         {
    1200          45 :             chr        *prev_end = endpts[k - 1];
    1201             : 
    1202          45 :             if (endpts[k] > prev_end)
    1203             :             {
    1204          45 :                 limit = endpts[k] - 1;
    1205          45 :                 if (limit > prev_end ||
    1206           0 :                     (k < min_matches && min_matches - k >= end - prev_end))
    1207             :                 {
    1208             :                     /* break out of backtrack loop, continue the outer one */
    1209             :                     break;
    1210             :                 }
    1211             :             }
    1212             :             /* can't shorten k'th sub-match any more, consider previous one */
    1213           0 :             k--;
    1214             :         }
    1215             :     }
    1216             : 
    1217             :     /* all possibilities exhausted */
    1218           8 :     FREE(endpts);
    1219             : 
    1220             :     /*
    1221             :      * Now consider the possibility that we can match to a zero-length string
    1222             :      * by using zero repetitions.
    1223             :      */
    1224           8 :     if (t->min == 0 && begin == end)
    1225             :     {
    1226             :         MDEBUG(("%d allowing zero matches\n", t->id));
    1227           1 :         return REG_OKAY;
    1228             :     }
    1229             : 
    1230             :     MDEBUG(("%d failed\n", t->id));
    1231           7 :     return REG_NOMATCH;
    1232             : }
    1233             : 
    1234             : /*
    1235             :  * creviterdissect - dissect match for iteration node, shortest-first
    1236             :  */
    1237             : static int                      /* regexec return code */
    1238           1 : creviterdissect(struct vars *v,
    1239             :                 struct subre *t,
    1240             :                 chr *begin,     /* beginning of relevant substring */
    1241             :                 chr *end)       /* end of same */
    1242             : {
    1243             :     struct dfa *d;
    1244             :     chr       **endpts;
    1245             :     chr        *limit;
    1246             :     int         min_matches;
    1247             :     size_t      max_matches;
    1248             :     int         nverified;
    1249             :     int         k;
    1250             :     int         i;
    1251             :     int         er;
    1252             : 
    1253           1 :     assert(t->op == '*');
    1254           1 :     assert(t->left != NULL && t->left->cnfa.nstates > 0);
    1255           1 :     assert(t->left->flags & SHORTER);
    1256           1 :     assert(begin <= end);
    1257             : 
    1258             :     /*
    1259             :      * If zero matches are allowed, and target string is empty, just declare
    1260             :      * victory.  OTOH, if target string isn't empty, zero matches can't work
    1261             :      * so we pretend the min is 1.
    1262             :      */
    1263           1 :     min_matches = t->min;
    1264           1 :     if (min_matches <= 0)
    1265             :     {
    1266           1 :         if (begin == end)
    1267           0 :             return REG_OKAY;
    1268           1 :         min_matches = 1;
    1269             :     }
    1270             : 
    1271             :     /*
    1272             :      * We need workspace to track the endpoints of each sub-match.  Normally
    1273             :      * we consider only nonzero-length sub-matches, so there can be at most
    1274             :      * end-begin of them.  However, if min is larger than that, we will also
    1275             :      * consider zero-length sub-matches in order to find enough matches.
    1276             :      *
    1277             :      * For convenience, endpts[0] contains the "begin" pointer and we store
    1278             :      * sub-match endpoints in endpts[1..max_matches].
    1279             :      */
    1280           1 :     max_matches = end - begin;
    1281           1 :     if (max_matches > t->max && t->max != DUPINF)
    1282           1 :         max_matches = t->max;
    1283           1 :     if (max_matches < min_matches)
    1284           0 :         max_matches = min_matches;
    1285           1 :     endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
    1286           1 :     if (endpts == NULL)
    1287           0 :         return REG_ESPACE;
    1288           1 :     endpts[0] = begin;
    1289             : 
    1290           1 :     d = getsubdfa(v, t->left);
    1291           1 :     if (ISERR())
    1292             :     {
    1293           0 :         FREE(endpts);
    1294           0 :         return v->err;
    1295             :     }
    1296             :     MDEBUG(("creviter %d\n", t->id));
    1297             : 
    1298             :     /*
    1299             :      * Our strategy is to first find a set of sub-match endpoints that are
    1300             :      * valid according to the child node's DFA, and then recursively dissect
    1301             :      * each sub-match to confirm validity.  If any validity check fails,
    1302             :      * backtrack the last sub-match and try again.  And, when we next try for
    1303             :      * a validity check, we need not recheck any successfully verified
    1304             :      * sub-matches that we didn't move the endpoints of.  nverified remembers
    1305             :      * how many sub-matches are currently known okay.
    1306             :      */
    1307             : 
    1308             :     /* initialize to consider first sub-match */
    1309           1 :     nverified = 0;
    1310           1 :     k = 1;
    1311           1 :     limit = begin;
    1312             : 
    1313             :     /* iterate until satisfaction or failure */
    1314           2 :     while (k > 0)
    1315             :     {
    1316             :         /* disallow zero-length match unless necessary to achieve min */
    1317           1 :         if (limit == endpts[k - 1] &&
    1318           1 :             limit != end &&
    1319           0 :             (k >= min_matches || min_matches - k < end - limit))
    1320           1 :             limit++;
    1321             : 
    1322             :         /* if this is the last allowed sub-match, it must reach to the end */
    1323           1 :         if (k >= max_matches)
    1324           1 :             limit = end;
    1325             : 
    1326             :         /* try to find an endpoint for the k'th sub-match */
    1327           1 :         endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
    1328             :                              (chr **) NULL, (int *) NULL);
    1329           1 :         if (ISERR())
    1330             :         {
    1331           0 :             FREE(endpts);
    1332           0 :             return v->err;
    1333             :         }
    1334           1 :         if (endpts[k] == NULL)
    1335             :         {
    1336             :             /* no match possible, so see if we can lengthen previous one */
    1337           0 :             k--;
    1338           0 :             goto backtrack;
    1339             :         }
    1340             :         MDEBUG(("%d: working endpoint %d: %ld\n",
    1341             :                 t->id, k, LOFF(endpts[k])));
    1342             : 
    1343             :         /* k'th sub-match can no longer be considered verified */
    1344           1 :         if (nverified >= k)
    1345           0 :             nverified = k - 1;
    1346             : 
    1347           1 :         if (endpts[k] != end)
    1348             :         {
    1349             :             /* haven't reached end yet, try another iteration if allowed */
    1350           0 :             if (k >= max_matches)
    1351             :             {
    1352             :                 /* must try to lengthen some previous match */
    1353           0 :                 k--;
    1354           0 :                 goto backtrack;
    1355             :             }
    1356             : 
    1357           0 :             k++;
    1358           0 :             limit = endpts[k - 1];
    1359           0 :             continue;
    1360             :         }
    1361             : 
    1362             :         /*
    1363             :          * We've identified a way to divide the string into k sub-matches that
    1364             :          * works so far as the child DFA can tell.  If k is an allowed number
    1365             :          * of matches, start the slow part: recurse to verify each sub-match.
    1366             :          * We always have k <= max_matches, needn't check that.
    1367             :          */
    1368           1 :         if (k < min_matches)
    1369           0 :             goto backtrack;
    1370             : 
    1371             :         MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
    1372             : 
    1373           4 :         for (i = nverified + 1; i <= k; i++)
    1374             :         {
    1375           1 :             zaptreesubs(v, t->left);
    1376           1 :             er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
    1377           1 :             if (er == REG_OKAY)
    1378             :             {
    1379           1 :                 nverified = i;
    1380           1 :                 continue;
    1381             :             }
    1382           0 :             if (er == REG_NOMATCH)
    1383           0 :                 break;
    1384             :             /* oops, something failed */
    1385           0 :             FREE(endpts);
    1386           0 :             return er;
    1387             :         }
    1388             : 
    1389           1 :         if (i > k)
    1390             :         {
    1391             :             /* satisfaction */
    1392             :             MDEBUG(("%d successful\n", t->id));
    1393           1 :             FREE(endpts);
    1394           1 :             return REG_OKAY;
    1395             :         }
    1396             : 
    1397             :         /* match failed to verify, so backtrack */
    1398             : 
    1399             : backtrack:
    1400             : 
    1401             :         /*
    1402             :          * Must consider longer versions of the current sub-match.
    1403             :          */
    1404           0 :         while (k > 0)
    1405             :         {
    1406           0 :             if (endpts[k] < end)
    1407             :             {
    1408           0 :                 limit = endpts[k] + 1;
    1409             :                 /* break out of backtrack loop, continue the outer one */
    1410           0 :                 break;
    1411             :             }
    1412             :             /* can't lengthen k'th sub-match any more, consider previous one */
    1413           0 :             k--;
    1414             :         }
    1415             :     }
    1416             : 
    1417             :     /* all possibilities exhausted */
    1418             :     MDEBUG(("%d failed\n", t->id));
    1419           0 :     FREE(endpts);
    1420           0 :     return REG_NOMATCH;
    1421             : }
    1422             : 
    1423             : 
    1424             : 
    1425             : #include "rege_dfa.c"
 |