Line data Source code
1 : /*
2 : * re_*exec and friends - match REs
3 : *
4 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
5 : *
6 : * Development of this software was funded, in part, by Cray Research Inc.,
7 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
8 : * Corporation, none of whom are responsible for the results. The author
9 : * thanks all of them.
10 : *
11 : * Redistribution and use in source and binary forms -- with or without
12 : * modification -- are permitted for any purpose, provided that
13 : * redistributions in source form retain this entire copyright notice and
14 : * indicate the origin and nature of any modifications.
15 : *
16 : * I'd appreciate being given credit for this package in the documentation
17 : * of software which uses it, but that is not a requirement.
18 : *
19 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
20 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
21 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
22 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
23 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
24 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
25 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
26 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
27 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
28 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
29 : *
30 : * src/backend/regex/regexec.c
31 : *
32 : */
33 :
34 : #include "regex/regguts.h"
35 :
36 :
37 :
38 : /* lazy-DFA representation */
39 : struct arcp
40 : { /* "pointer" to an outarc */
41 : struct sset *ss;
42 : color co;
43 : };
44 :
45 : struct sset
46 : { /* state set */
47 : unsigned *states; /* pointer to bitvector */
48 : unsigned hash; /* hash of bitvector */
49 : #define HASH(bv, nw) (((nw) == 1) ? *(bv) : hash(bv, nw))
50 : #define HIT(h,bv,ss,nw) ((ss)->hash == (h) && ((nw) == 1 || \
51 : memcmp(VS(bv), VS((ss)->states), (nw)*sizeof(unsigned)) == 0))
52 : int flags;
53 : #define STARTER 01 /* the initial state set */
54 : #define POSTSTATE 02 /* includes the goal state */
55 : #define LOCKED 04 /* locked in cache */
56 : #define NOPROGRESS 010 /* zero-progress state set */
57 : struct arcp ins; /* chain of inarcs pointing here */
58 : chr *lastseen; /* last entered on arrival here */
59 : struct sset **outs; /* outarc vector indexed by color */
60 : struct arcp *inchain; /* chain-pointer vector for outarcs */
61 : };
62 :
63 : struct dfa
64 : {
65 : int nssets; /* size of cache */
66 : int nssused; /* how many entries occupied yet */
67 : int nstates; /* number of states */
68 : int ncolors; /* length of outarc and inchain vectors */
69 : int wordsper; /* length of state-set bitvectors */
70 : struct sset *ssets; /* state-set cache */
71 : unsigned *statesarea; /* bitvector storage */
72 : unsigned *work; /* pointer to work area within statesarea */
73 : struct sset **outsarea; /* outarc-vector storage */
74 : struct arcp *incarea; /* inchain storage */
75 : struct cnfa *cnfa;
76 : struct colormap *cm;
77 : chr *lastpost; /* location of last cache-flushed success */
78 : chr *lastnopr; /* location of last cache-flushed NOPROGRESS */
79 : struct sset *search; /* replacement-search-pointer memory */
80 : int cptsmalloced; /* were the areas individually malloced? */
81 : char *mallocarea; /* self, or master malloced area, or NULL */
82 : };
83 :
84 : #define WORK 1 /* number of work bitvectors needed */
85 :
86 : /* setup for non-malloc allocation for small cases */
87 : #define FEWSTATES 20 /* must be less than UBITS */
88 : #define FEWCOLORS 15
89 : struct smalldfa
90 : {
91 : struct dfa dfa;
92 : struct sset ssets[FEWSTATES * 2];
93 : unsigned statesarea[FEWSTATES * 2 + WORK];
94 : struct sset *outsarea[FEWSTATES * 2 * FEWCOLORS];
95 : struct arcp incarea[FEWSTATES * 2 * FEWCOLORS];
96 : };
97 :
98 : #define DOMALLOC ((struct smalldfa *)NULL) /* force malloc */
99 :
100 :
101 :
102 : /* internal variables, bundled for easy passing around */
103 : struct vars
104 : {
105 : regex_t *re;
106 : struct guts *g;
107 : int eflags; /* copies of arguments */
108 : size_t nmatch;
109 : regmatch_t *pmatch;
110 : rm_detail_t *details;
111 : chr *start; /* start of string */
112 : chr *search_start; /* search start of string */
113 : chr *stop; /* just past end of string */
114 : int err; /* error code if any (0 none) */
115 : struct dfa **subdfas; /* per-tree-subre DFAs */
116 : struct dfa **ladfas; /* per-lacon-subre DFAs */
117 : struct sset **lblastcss; /* per-lacon-subre lookbehind restart data */
118 : chr **lblastcp; /* per-lacon-subre lookbehind restart data */
119 : struct smalldfa dfa1;
120 : struct smalldfa dfa2;
121 : };
122 :
123 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
124 : #define ISERR() VISERR(v)
125 : #define VERR(vv,e) ((vv)->err = ((vv)->err ? (vv)->err : (e)))
126 : #define ERR(e) VERR(v, e) /* record an error */
127 : #define NOERR() {if (ISERR()) return v->err;} /* if error seen, return it */
128 : #define OFF(p) ((p) - v->start)
129 : #define LOFF(p) ((long)OFF(p))
130 :
131 :
132 :
133 : /*
134 : * forward declarations
135 : */
136 : /* === regexec.c === */
137 : static struct dfa *getsubdfa(struct vars *, struct subre *);
138 : static struct dfa *getladfa(struct vars *, int);
139 : static int find(struct vars *, struct cnfa *, struct colormap *);
140 : static int cfind(struct vars *, struct cnfa *, struct colormap *);
141 : static int cfindloop(struct vars *, struct cnfa *, struct colormap *, struct dfa *, struct dfa *, chr **);
142 : static void zapallsubs(regmatch_t *, size_t);
143 : static void zaptreesubs(struct vars *, struct subre *);
144 : static void subset(struct vars *, struct subre *, chr *, chr *);
145 : static int cdissect(struct vars *, struct subre *, chr *, chr *);
146 : static int ccondissect(struct vars *, struct subre *, chr *, chr *);
147 : static int crevcondissect(struct vars *, struct subre *, chr *, chr *);
148 : static int cbrdissect(struct vars *, struct subre *, chr *, chr *);
149 : static int caltdissect(struct vars *, struct subre *, chr *, chr *);
150 : static int citerdissect(struct vars *, struct subre *, chr *, chr *);
151 : static int creviterdissect(struct vars *, struct subre *, chr *, chr *);
152 :
153 : /* === rege_dfa.c === */
154 : static chr *longest(struct vars *, struct dfa *, chr *, chr *, int *);
155 : static chr *shortest(struct vars *, struct dfa *, chr *, chr *, chr *, chr **, int *);
156 : static int matchuntil(struct vars *, struct dfa *, chr *, struct sset **, chr **);
157 : static chr *lastcold(struct vars *, struct dfa *);
158 : static struct dfa *newdfa(struct vars *, struct cnfa *, struct colormap *, struct smalldfa *);
159 : static void freedfa(struct dfa *);
160 : static unsigned hash(unsigned *, int);
161 : static struct sset *initialize(struct vars *, struct dfa *, chr *);
162 : static struct sset *miss(struct vars *, struct dfa *, struct sset *, color, chr *, chr *);
163 : static int lacon(struct vars *, struct cnfa *, chr *, color);
164 : static struct sset *getvacant(struct vars *, struct dfa *, chr *, chr *);
165 : static struct sset *pickss(struct vars *, struct dfa *, chr *, chr *);
166 :
167 :
168 : /*
169 : * pg_regexec - match regular expression
170 : */
171 : int
172 32302 : pg_regexec(regex_t *re,
173 : const chr *string,
174 : size_t len,
175 : size_t search_start,
176 : rm_detail_t *details,
177 : size_t nmatch,
178 : regmatch_t pmatch[],
179 : int flags)
180 : {
181 : struct vars var;
182 32302 : register struct vars *v = &var;
183 : int st;
184 : size_t n;
185 : size_t i;
186 : int backref;
187 :
188 : #define LOCALMAT 20
189 : regmatch_t mat[LOCALMAT];
190 :
191 : #define LOCALDFAS 40
192 : struct dfa *subdfas[LOCALDFAS];
193 :
194 : /* sanity checks */
195 32302 : if (re == NULL || string == NULL || re->re_magic != REMAGIC)
196 0 : return REG_INVARG;
197 32302 : if (re->re_csize != sizeof(chr))
198 0 : return REG_MIXED;
199 :
200 : /* Initialize locale-dependent support */
201 32302 : pg_set_regex_collation(re->re_collation);
202 :
203 : /* setup */
204 32302 : v->re = re;
205 32302 : v->g = (struct guts *) re->re_guts;
206 32302 : if ((v->g->cflags & REG_EXPECT) && details == NULL)
207 0 : return REG_INVARG;
208 32302 : if (v->g->info & REG_UIMPOSSIBLE)
209 3 : return REG_NOMATCH;
210 32299 : backref = (v->g->info & REG_UBACKREF) ? 1 : 0;
211 32299 : v->eflags = flags;
212 32299 : if (v->g->cflags & REG_NOSUB)
213 0 : nmatch = 0; /* override client */
214 32299 : v->nmatch = nmatch;
215 32299 : if (backref)
216 : {
217 : /* need work area */
218 19 : if (v->g->nsub + 1 <= LOCALMAT)
219 19 : v->pmatch = mat;
220 : else
221 0 : v->pmatch = (regmatch_t *) MALLOC((v->g->nsub + 1) *
222 : sizeof(regmatch_t));
223 19 : if (v->pmatch == NULL)
224 0 : return REG_ESPACE;
225 19 : v->nmatch = v->g->nsub + 1;
226 : }
227 : else
228 32280 : v->pmatch = pmatch;
229 32299 : v->details = details;
230 32299 : v->start = (chr *) string;
231 32299 : v->search_start = (chr *) string + search_start;
232 32299 : v->stop = (chr *) string + len;
233 32299 : v->err = 0;
234 32299 : v->subdfas = NULL;
235 32299 : v->ladfas = NULL;
236 32299 : v->lblastcss = NULL;
237 32299 : v->lblastcp = NULL;
238 : /* below this point, "goto cleanup" will behave sanely */
239 :
240 32299 : assert(v->g->ntree >= 0);
241 32299 : n = (size_t) v->g->ntree;
242 32299 : if (n <= LOCALDFAS)
243 32297 : v->subdfas = subdfas;
244 : else
245 : {
246 2 : v->subdfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
247 2 : if (v->subdfas == NULL)
248 : {
249 0 : st = REG_ESPACE;
250 0 : goto cleanup;
251 : }
252 : }
253 105140 : for (i = 0; i < n; i++)
254 72841 : v->subdfas[i] = NULL;
255 :
256 32299 : assert(v->g->nlacons >= 0);
257 32299 : n = (size_t) v->g->nlacons;
258 32299 : if (n > 0)
259 : {
260 206 : v->ladfas = (struct dfa **) MALLOC(n * sizeof(struct dfa *));
261 206 : if (v->ladfas == NULL)
262 : {
263 0 : st = REG_ESPACE;
264 0 : goto cleanup;
265 : }
266 626 : for (i = 0; i < n; i++)
267 420 : v->ladfas[i] = NULL;
268 206 : v->lblastcss = (struct sset **) MALLOC(n * sizeof(struct sset *));
269 206 : v->lblastcp = (chr **) MALLOC(n * sizeof(chr *));
270 206 : if (v->lblastcss == NULL || v->lblastcp == NULL)
271 : {
272 0 : st = REG_ESPACE;
273 0 : goto cleanup;
274 : }
275 626 : for (i = 0; i < n; i++)
276 : {
277 420 : v->lblastcss[i] = NULL;
278 420 : v->lblastcp[i] = NULL;
279 : }
280 : }
281 :
282 : /* do it */
283 32299 : assert(v->g->tree != NULL);
284 32299 : if (backref)
285 19 : st = cfind(v, &v->g->tree->cnfa, &v->g->cmap);
286 : else
287 32280 : st = find(v, &v->g->tree->cnfa, &v->g->cmap);
288 :
289 : /* copy (portion of) match vector over if necessary */
290 32299 : if (st == REG_OKAY && v->pmatch != pmatch && nmatch > 0)
291 : {
292 2 : zapallsubs(pmatch, nmatch);
293 2 : n = (nmatch < v->nmatch) ? nmatch : v->nmatch;
294 2 : memcpy(VS(pmatch), VS(v->pmatch), n * sizeof(regmatch_t));
295 : }
296 :
297 : /* clean up */
298 : cleanup:
299 32299 : if (v->pmatch != pmatch && v->pmatch != mat)
300 0 : FREE(v->pmatch);
301 32299 : if (v->subdfas != NULL)
302 : {
303 32299 : n = (size_t) v->g->ntree;
304 105140 : for (i = 0; i < n; i++)
305 : {
306 72841 : if (v->subdfas[i] != NULL)
307 386 : freedfa(v->subdfas[i]);
308 : }
309 32299 : if (v->subdfas != subdfas)
310 2 : FREE(v->subdfas);
311 : }
312 32299 : if (v->ladfas != NULL)
313 : {
314 206 : n = (size_t) v->g->nlacons;
315 626 : for (i = 0; i < n; i++)
316 : {
317 420 : if (v->ladfas[i] != NULL)
318 12 : freedfa(v->ladfas[i]);
319 : }
320 206 : FREE(v->ladfas);
321 : }
322 32299 : if (v->lblastcss != NULL)
323 206 : FREE(v->lblastcss);
324 32299 : if (v->lblastcp != NULL)
325 206 : FREE(v->lblastcp);
326 :
327 32299 : return st;
328 : }
329 :
330 : /*
331 : * getsubdfa - create or re-fetch the DFA for a tree subre node
332 : *
333 : * We only need to create the DFA once per overall regex execution.
334 : * The DFA will be freed by the cleanup step in pg_regexec().
335 : */
336 : static struct dfa *
337 709 : getsubdfa(struct vars *v,
338 : struct subre *t)
339 : {
340 709 : if (v->subdfas[t->id] == NULL)
341 : {
342 386 : v->subdfas[t->id] = newdfa(v, &t->cnfa, &v->g->cmap, DOMALLOC);
343 386 : if (ISERR())
344 0 : return NULL;
345 : }
346 709 : return v->subdfas[t->id];
347 : }
348 :
349 : /*
350 : * getladfa - create or re-fetch the DFA for a LACON subre node
351 : *
352 : * Same as above, but for LACONs.
353 : */
354 : static struct dfa *
355 26 : getladfa(struct vars *v,
356 : int n)
357 : {
358 26 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
359 :
360 26 : if (v->ladfas[n] == NULL)
361 : {
362 12 : struct subre *sub = &v->g->lacons[n];
363 :
364 12 : v->ladfas[n] = newdfa(v, &sub->cnfa, &v->g->cmap, DOMALLOC);
365 12 : if (ISERR())
366 0 : return NULL;
367 : }
368 26 : return v->ladfas[n];
369 : }
370 :
371 : /*
372 : * find - find a match for the main NFA (no-complications case)
373 : */
374 : static int
375 32280 : find(struct vars *v,
376 : struct cnfa *cnfa,
377 : struct colormap *cm)
378 : {
379 : struct dfa *s;
380 : struct dfa *d;
381 : chr *begin;
382 32280 : chr *end = NULL;
383 : chr *cold;
384 : chr *open; /* open and close of range of possible starts */
385 : chr *close;
386 : int hitend;
387 32280 : int shorter = (v->g->tree->flags & SHORTER) ? 1 : 0;
388 :
389 : /* first, a shot with the search RE */
390 32280 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
391 32280 : assert(!(ISERR() && s != NULL));
392 32280 : NOERR();
393 : MDEBUG(("\nsearch at %ld\n", LOFF(v->start)));
394 32280 : cold = NULL;
395 32280 : close = shortest(v, s, v->search_start, v->search_start, v->stop,
396 : &cold, (int *) NULL);
397 32280 : freedfa(s);
398 32280 : NOERR();
399 32280 : if (v->g->cflags & REG_EXPECT)
400 : {
401 0 : assert(v->details != NULL);
402 0 : if (cold != NULL)
403 0 : v->details->rm_extend.rm_so = OFF(cold);
404 : else
405 0 : v->details->rm_extend.rm_so = OFF(v->stop);
406 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
407 : }
408 32280 : if (close == NULL) /* not found */
409 29910 : return REG_NOMATCH;
410 2370 : if (v->nmatch == 0) /* found, don't need exact location */
411 2090 : return REG_OKAY;
412 :
413 : /* find starting point and match */
414 280 : assert(cold != NULL);
415 280 : open = cold;
416 280 : cold = NULL;
417 : MDEBUG(("between %ld and %ld\n", LOFF(open), LOFF(close)));
418 280 : d = newdfa(v, cnfa, cm, &v->dfa1);
419 280 : assert(!(ISERR() && d != NULL));
420 280 : NOERR();
421 282 : for (begin = open; begin <= close; begin++)
422 : {
423 : MDEBUG(("\nfind trying at %ld\n", LOFF(begin)));
424 282 : if (shorter)
425 1 : end = shortest(v, d, begin, begin, v->stop,
426 : (chr **) NULL, &hitend);
427 : else
428 281 : end = longest(v, d, begin, v->stop, &hitend);
429 282 : if (ISERR())
430 : {
431 0 : freedfa(d);
432 0 : return v->err;
433 : }
434 282 : if (hitend && cold == NULL)
435 45 : cold = begin;
436 282 : if (end != NULL)
437 280 : break; /* NOTE BREAK OUT */
438 : }
439 280 : assert(end != NULL); /* search RE succeeded so loop should */
440 280 : freedfa(d);
441 :
442 : /* and pin down details */
443 280 : assert(v->nmatch > 0);
444 280 : v->pmatch[0].rm_so = OFF(begin);
445 280 : v->pmatch[0].rm_eo = OFF(end);
446 280 : if (v->g->cflags & REG_EXPECT)
447 : {
448 0 : if (cold != NULL)
449 0 : v->details->rm_extend.rm_so = OFF(cold);
450 : else
451 0 : v->details->rm_extend.rm_so = OFF(v->stop);
452 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
453 : }
454 280 : if (v->nmatch == 1) /* no need for submatches */
455 253 : return REG_OKAY;
456 :
457 : /* find submatches */
458 27 : zapallsubs(v->pmatch, v->nmatch);
459 27 : return cdissect(v, v->g->tree, begin, end);
460 : }
461 :
462 : /*
463 : * cfind - find a match for the main NFA (with complications)
464 : */
465 : static int
466 19 : cfind(struct vars *v,
467 : struct cnfa *cnfa,
468 : struct colormap *cm)
469 : {
470 : struct dfa *s;
471 : struct dfa *d;
472 : chr *cold;
473 : int ret;
474 :
475 19 : s = newdfa(v, &v->g->search, cm, &v->dfa1);
476 19 : NOERR();
477 19 : d = newdfa(v, cnfa, cm, &v->dfa2);
478 19 : if (ISERR())
479 : {
480 0 : assert(d == NULL);
481 0 : freedfa(s);
482 0 : return v->err;
483 : }
484 :
485 19 : ret = cfindloop(v, cnfa, cm, d, s, &cold);
486 :
487 19 : freedfa(d);
488 19 : freedfa(s);
489 19 : NOERR();
490 19 : if (v->g->cflags & REG_EXPECT)
491 : {
492 0 : assert(v->details != NULL);
493 0 : if (cold != NULL)
494 0 : v->details->rm_extend.rm_so = OFF(cold);
495 : else
496 0 : v->details->rm_extend.rm_so = OFF(v->stop);
497 0 : v->details->rm_extend.rm_eo = OFF(v->stop); /* unknown */
498 : }
499 19 : return ret;
500 : }
501 :
502 : /*
503 : * cfindloop - the heart of cfind
504 : */
505 : static int
506 19 : cfindloop(struct vars *v,
507 : struct cnfa *cnfa,
508 : struct colormap *cm,
509 : struct dfa *d,
510 : struct dfa *s,
511 : chr **coldp) /* where to put coldstart pointer */
512 : {
513 : chr *begin;
514 : chr *end;
515 : chr *cold;
516 : chr *open; /* open and close of range of possible starts */
517 : chr *close;
518 : chr *estart;
519 : chr *estop;
520 : int er;
521 19 : int shorter = v->g->tree->flags & SHORTER;
522 : int hitend;
523 :
524 19 : assert(d != NULL && s != NULL);
525 19 : cold = NULL;
526 19 : close = v->search_start;
527 : do
528 : {
529 : /* Search with the search RE for match range at/beyond "close" */
530 : MDEBUG(("\ncsearch at %ld\n", LOFF(close)));
531 19 : close = shortest(v, s, close, close, v->stop, &cold, (int *) NULL);
532 19 : if (ISERR())
533 : {
534 0 : *coldp = cold;
535 0 : return v->err;
536 : }
537 19 : if (close == NULL)
538 1 : break; /* no more possible match anywhere */
539 18 : assert(cold != NULL);
540 18 : open = cold;
541 18 : cold = NULL;
542 : /* Search for matches starting between "open" and "close" inclusive */
543 : MDEBUG(("cbetween %ld and %ld\n", LOFF(open), LOFF(close)));
544 78 : for (begin = open; begin <= close; begin++)
545 : {
546 : MDEBUG(("\ncfind trying at %ld\n", LOFF(begin)));
547 70 : estart = begin;
548 70 : estop = v->stop;
549 : for (;;)
550 : {
551 : /* Here we use the top node's detailed RE */
552 98 : if (shorter)
553 32 : end = shortest(v, d, begin, estart,
554 : estop, (chr **) NULL, &hitend);
555 : else
556 66 : end = longest(v, d, begin, estop,
557 : &hitend);
558 98 : if (ISERR())
559 : {
560 0 : *coldp = cold;
561 0 : return v->err;
562 : }
563 98 : if (hitend && cold == NULL)
564 15 : cold = begin;
565 98 : if (end == NULL)
566 54 : break; /* no match with this begin point, try next */
567 : MDEBUG(("tentative end %ld\n", LOFF(end)));
568 : /* Dissect the potential match to see if it really matches */
569 44 : zapallsubs(v->pmatch, v->nmatch);
570 44 : er = cdissect(v, v->g->tree, begin, end);
571 44 : if (er == REG_OKAY)
572 : {
573 10 : if (v->nmatch > 0)
574 : {
575 10 : v->pmatch[0].rm_so = OFF(begin);
576 10 : v->pmatch[0].rm_eo = OFF(end);
577 : }
578 10 : *coldp = cold;
579 10 : return REG_OKAY;
580 : }
581 34 : if (er != REG_NOMATCH)
582 : {
583 0 : ERR(er);
584 0 : *coldp = cold;
585 0 : return er;
586 : }
587 : /* Try next longer/shorter match with same begin point */
588 34 : if (shorter)
589 : {
590 27 : if (end == estop)
591 4 : break; /* no more, so try next begin point */
592 23 : estart = end + 1;
593 : }
594 : else
595 : {
596 7 : if (end == begin)
597 2 : break; /* no more, so try next begin point */
598 5 : estop = end - 1;
599 : }
600 28 : } /* end loop over endpoint positions */
601 : } /* end loop over beginning positions */
602 :
603 : /*
604 : * If we get here, there is no possible match starting at or before
605 : * "close", so consider matches beyond that. We'll do a fresh search
606 : * with the search RE to find a new promising match range.
607 : */
608 8 : close++;
609 8 : } while (close < v->stop);
610 :
611 9 : *coldp = cold;
612 9 : return REG_NOMATCH;
613 : }
614 :
615 : /*
616 : * zapallsubs - initialize all subexpression matches to "no match"
617 : */
618 : static void
619 73 : zapallsubs(regmatch_t *p,
620 : size_t n)
621 : {
622 : size_t i;
623 :
624 279 : for (i = n - 1; i > 0; i--)
625 : {
626 206 : p[i].rm_so = -1;
627 206 : p[i].rm_eo = -1;
628 : }
629 73 : }
630 :
631 : /*
632 : * zaptreesubs - initialize subexpressions within subtree to "no match"
633 : */
634 : static void
635 1773 : zaptreesubs(struct vars *v,
636 : struct subre *t)
637 : {
638 1773 : if (t->op == '(')
639 : {
640 195 : int n = t->subno;
641 :
642 195 : assert(n > 0);
643 195 : if ((size_t) n < v->nmatch)
644 : {
645 191 : v->pmatch[n].rm_so = -1;
646 191 : v->pmatch[n].rm_eo = -1;
647 : }
648 : }
649 :
650 1773 : if (t->left != NULL)
651 738 : zaptreesubs(v, t->left);
652 1773 : if (t->right != NULL)
653 473 : zaptreesubs(v, t->right);
654 1773 : }
655 :
656 : /*
657 : * subset - set subexpression match data for a successful subre
658 : */
659 : static void
660 97 : subset(struct vars *v,
661 : struct subre *sub,
662 : chr *begin,
663 : chr *end)
664 : {
665 97 : int n = sub->subno;
666 :
667 97 : assert(n > 0);
668 97 : if ((size_t) n >= v->nmatch)
669 100 : return;
670 :
671 : MDEBUG(("setting %d\n", n));
672 94 : v->pmatch[n].rm_so = OFF(begin);
673 94 : v->pmatch[n].rm_eo = OFF(end);
674 : }
675 :
676 : /*
677 : * cdissect - check backrefs and determine subexpression matches
678 : *
679 : * cdissect recursively processes a subre tree to check matching of backrefs
680 : * and/or identify submatch boundaries for capture nodes. The proposed match
681 : * runs from "begin" to "end" (not including "end"), and we are basically
682 : * "dissecting" it to see where the submatches are.
683 : *
684 : * Before calling any level of cdissect, the caller must have run the node's
685 : * DFA and found that the proposed substring satisfies the DFA. (We make
686 : * the caller do that because in concatenation and iteration nodes, it's
687 : * much faster to check all the substrings against the child DFAs before we
688 : * recurse.) Also, caller must have cleared subexpression match data via
689 : * zaptreesubs (or zapallsubs at the top level).
690 : */
691 : static int /* regexec return code */
692 850 : cdissect(struct vars *v,
693 : struct subre *t,
694 : chr *begin, /* beginning of relevant substring */
695 : chr *end) /* end of same */
696 : {
697 : int er;
698 :
699 850 : assert(t != NULL);
700 : MDEBUG(("cdissect %ld-%ld %c\n", LOFF(begin), LOFF(end), t->op));
701 :
702 : /* handy place to check for operation cancel */
703 850 : if (CANCEL_REQUESTED(v->re))
704 0 : return REG_CANCEL;
705 : /* ... and stack overrun */
706 850 : if (STACK_TOO_DEEP(v->re))
707 0 : return REG_ETOOBIG;
708 :
709 850 : switch (t->op)
710 : {
711 : case '=': /* terminal node */
712 312 : assert(t->left == NULL && t->right == NULL);
713 312 : er = REG_OKAY; /* no action, parent did the work */
714 312 : break;
715 : case 'b': /* back reference */
716 54 : assert(t->left == NULL && t->right == NULL);
717 54 : er = cbrdissect(v, t, begin, end);
718 54 : break;
719 : case '.': /* concatenation */
720 346 : assert(t->left != NULL && t->right != NULL);
721 346 : if (t->left->flags & SHORTER) /* reverse scan */
722 52 : er = crevcondissect(v, t, begin, end);
723 : else
724 294 : er = ccondissect(v, t, begin, end);
725 346 : break;
726 : case '|': /* alternation */
727 3 : assert(t->left != NULL);
728 3 : er = caltdissect(v, t, begin, end);
729 3 : break;
730 : case '*': /* iteration */
731 12 : assert(t->left != NULL);
732 12 : if (t->left->flags & SHORTER) /* reverse scan */
733 1 : er = creviterdissect(v, t, begin, end);
734 : else
735 11 : er = citerdissect(v, t, begin, end);
736 12 : break;
737 : case '(': /* capturing */
738 123 : assert(t->left != NULL && t->right == NULL);
739 123 : assert(t->subno > 0);
740 123 : er = cdissect(v, t->left, begin, end);
741 123 : if (er == REG_OKAY)
742 97 : subset(v, t, begin, end);
743 123 : break;
744 : default:
745 0 : er = REG_ASSERT;
746 0 : break;
747 : }
748 :
749 : /*
750 : * We should never have a match failure unless backrefs lurk below;
751 : * otherwise, either caller failed to check the DFA, or there's some
752 : * inconsistency between the DFA and the node's innards.
753 : */
754 850 : assert(er != REG_NOMATCH || (t->flags & BACKR));
755 :
756 850 : return er;
757 : }
758 :
759 : /*
760 : * ccondissect - dissect match for concatenation node
761 : */
762 : static int /* regexec return code */
763 294 : ccondissect(struct vars *v,
764 : struct subre *t,
765 : chr *begin, /* beginning of relevant substring */
766 : chr *end) /* end of same */
767 : {
768 : struct dfa *d;
769 : struct dfa *d2;
770 : chr *mid;
771 : int er;
772 :
773 294 : assert(t->op == '.');
774 294 : assert(t->left != NULL && t->left->cnfa.nstates > 0);
775 294 : assert(t->right != NULL && t->right->cnfa.nstates > 0);
776 294 : assert(!(t->left->flags & SHORTER));
777 :
778 294 : d = getsubdfa(v, t->left);
779 294 : NOERR();
780 294 : d2 = getsubdfa(v, t->right);
781 294 : NOERR();
782 : MDEBUG(("cconcat %d\n", t->id));
783 :
784 : /* pick a tentative midpoint */
785 294 : mid = longest(v, d, begin, end, (int *) NULL);
786 294 : NOERR();
787 294 : if (mid == NULL)
788 0 : return REG_NOMATCH;
789 : MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
790 :
791 : /* iterate until satisfaction or failure */
792 : for (;;)
793 : {
794 : /* try this midpoint on for size */
795 402 : if (longest(v, d2, mid, end, (int *) NULL) == end)
796 : {
797 297 : er = cdissect(v, t->left, begin, mid);
798 297 : if (er == REG_OKAY)
799 : {
800 250 : er = cdissect(v, t->right, mid, end);
801 250 : if (er == REG_OKAY)
802 : {
803 : /* satisfaction */
804 : MDEBUG(("successful\n"));
805 147 : return REG_OKAY;
806 : }
807 : }
808 150 : if (er != REG_NOMATCH)
809 0 : return er;
810 : }
811 255 : NOERR();
812 :
813 : /* that midpoint didn't work, find a new one */
814 255 : if (mid == begin)
815 : {
816 : /* all possibilities exhausted */
817 : MDEBUG(("%d no midpoint\n", t->id));
818 61 : return REG_NOMATCH;
819 : }
820 194 : mid = longest(v, d, begin, mid - 1, (int *) NULL);
821 194 : NOERR();
822 194 : if (mid == NULL)
823 : {
824 : /* failed to find a new one */
825 : MDEBUG(("%d failed midpoint\n", t->id));
826 86 : return REG_NOMATCH;
827 : }
828 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
829 108 : zaptreesubs(v, t->left);
830 108 : zaptreesubs(v, t->right);
831 108 : }
832 :
833 : /* can't get here */
834 : return REG_ASSERT;
835 : }
836 :
837 : /*
838 : * crevcondissect - dissect match for concatenation node, shortest-first
839 : */
840 : static int /* regexec return code */
841 52 : crevcondissect(struct vars *v,
842 : struct subre *t,
843 : chr *begin, /* beginning of relevant substring */
844 : chr *end) /* end of same */
845 : {
846 : struct dfa *d;
847 : struct dfa *d2;
848 : chr *mid;
849 : int er;
850 :
851 52 : assert(t->op == '.');
852 52 : assert(t->left != NULL && t->left->cnfa.nstates > 0);
853 52 : assert(t->right != NULL && t->right->cnfa.nstates > 0);
854 52 : assert(t->left->flags & SHORTER);
855 :
856 52 : d = getsubdfa(v, t->left);
857 52 : NOERR();
858 52 : d2 = getsubdfa(v, t->right);
859 52 : NOERR();
860 : MDEBUG(("crevcon %d\n", t->id));
861 :
862 : /* pick a tentative midpoint */
863 52 : mid = shortest(v, d, begin, begin, end, (chr **) NULL, (int *) NULL);
864 52 : NOERR();
865 52 : if (mid == NULL)
866 0 : return REG_NOMATCH;
867 : MDEBUG(("tentative midpoint %ld\n", LOFF(mid)));
868 :
869 : /* iterate until satisfaction or failure */
870 : for (;;)
871 : {
872 : /* try this midpoint on for size */
873 216 : if (longest(v, d2, mid, end, (int *) NULL) == end)
874 : {
875 52 : er = cdissect(v, t->left, begin, mid);
876 52 : if (er == REG_OKAY)
877 : {
878 36 : er = cdissect(v, t->right, mid, end);
879 36 : if (er == REG_OKAY)
880 : {
881 : /* satisfaction */
882 : MDEBUG(("successful\n"));
883 9 : return REG_OKAY;
884 : }
885 : }
886 43 : if (er != REG_NOMATCH)
887 0 : return er;
888 : }
889 207 : NOERR();
890 :
891 : /* that midpoint didn't work, find a new one */
892 207 : if (mid == end)
893 : {
894 : /* all possibilities exhausted */
895 : MDEBUG(("%d no midpoint\n", t->id));
896 43 : return REG_NOMATCH;
897 : }
898 164 : mid = shortest(v, d, begin, mid + 1, end, (chr **) NULL, (int *) NULL);
899 164 : NOERR();
900 164 : if (mid == NULL)
901 : {
902 : /* failed to find a new one */
903 : MDEBUG(("%d failed midpoint\n", t->id));
904 0 : return REG_NOMATCH;
905 : }
906 : MDEBUG(("%d: new midpoint %ld\n", t->id, LOFF(mid)));
907 164 : zaptreesubs(v, t->left);
908 164 : zaptreesubs(v, t->right);
909 164 : }
910 :
911 : /* can't get here */
912 : return REG_ASSERT;
913 : }
914 :
915 : /*
916 : * cbrdissect - dissect match for backref node
917 : */
918 : static int /* regexec return code */
919 54 : cbrdissect(struct vars *v,
920 : struct subre *t,
921 : chr *begin, /* beginning of relevant substring */
922 : chr *end) /* end of same */
923 : {
924 54 : int n = t->subno;
925 : size_t numreps;
926 : size_t tlen;
927 : size_t brlen;
928 : chr *brstring;
929 : chr *p;
930 54 : int min = t->min;
931 54 : int max = t->max;
932 :
933 54 : assert(t != NULL);
934 54 : assert(t->op == 'b');
935 54 : assert(n >= 0);
936 54 : assert((size_t) n < v->nmatch);
937 :
938 : MDEBUG(("cbackref n%d %d{%d-%d}\n", t->id, n, min, max));
939 :
940 : /* get the backreferenced string */
941 54 : if (v->pmatch[n].rm_so == -1)
942 2 : return REG_NOMATCH;
943 52 : brstring = v->start + v->pmatch[n].rm_so;
944 52 : brlen = v->pmatch[n].rm_eo - v->pmatch[n].rm_so;
945 :
946 : /* special cases for zero-length strings */
947 52 : if (brlen == 0)
948 : {
949 : /*
950 : * matches only if target is zero length, but any number of
951 : * repetitions can be considered to be present
952 : */
953 2 : if (begin == end && min <= max)
954 : {
955 : MDEBUG(("cbackref matched trivially\n"));
956 2 : return REG_OKAY;
957 : }
958 0 : return REG_NOMATCH;
959 : }
960 50 : if (begin == end)
961 : {
962 : /* matches only if zero repetitions are okay */
963 1 : if (min == 0)
964 : {
965 : MDEBUG(("cbackref matched trivially\n"));
966 1 : return REG_OKAY;
967 : }
968 0 : return REG_NOMATCH;
969 : }
970 :
971 : /*
972 : * check target length to see if it could possibly be an allowed number of
973 : * repetitions of brstring
974 : */
975 49 : assert(end > begin);
976 49 : tlen = end - begin;
977 49 : if (tlen % brlen != 0)
978 6 : return REG_NOMATCH;
979 43 : numreps = tlen / brlen;
980 43 : if (numreps < min || (numreps > max && max != DUPINF))
981 0 : return REG_NOMATCH;
982 :
983 : /* okay, compare the actual string contents */
984 43 : p = begin;
985 102 : while (numreps-- > 0)
986 : {
987 48 : if ((*v->g->compare) (brstring, p, brlen) != 0)
988 32 : return REG_NOMATCH;
989 16 : p += brlen;
990 : }
991 :
992 : MDEBUG(("cbackref matched\n"));
993 11 : return REG_OKAY;
994 : }
995 :
996 : /*
997 : * caltdissect - dissect match for alternation node
998 : */
999 : static int /* regexec return code */
1000 3 : caltdissect(struct vars *v,
1001 : struct subre *t,
1002 : chr *begin, /* beginning of relevant substring */
1003 : chr *end) /* end of same */
1004 : {
1005 : struct dfa *d;
1006 : int er;
1007 :
1008 : /* We loop, rather than tail-recurse, to handle a chain of alternatives */
1009 10 : while (t != NULL)
1010 : {
1011 5 : assert(t->op == '|');
1012 5 : assert(t->left != NULL && t->left->cnfa.nstates > 0);
1013 :
1014 : MDEBUG(("calt n%d\n", t->id));
1015 :
1016 5 : d = getsubdfa(v, t->left);
1017 5 : NOERR();
1018 5 : if (longest(v, d, begin, end, (int *) NULL) == end)
1019 : {
1020 : MDEBUG(("calt matched\n"));
1021 3 : er = cdissect(v, t->left, begin, end);
1022 3 : if (er != REG_NOMATCH)
1023 1 : return er;
1024 : }
1025 4 : NOERR();
1026 :
1027 4 : t = t->right;
1028 : }
1029 :
1030 2 : return REG_NOMATCH;
1031 : }
1032 :
1033 : /*
1034 : * citerdissect - dissect match for iteration node
1035 : */
1036 : static int /* regexec return code */
1037 11 : citerdissect(struct vars *v,
1038 : struct subre *t,
1039 : chr *begin, /* beginning of relevant substring */
1040 : chr *end) /* end of same */
1041 : {
1042 : struct dfa *d;
1043 : chr **endpts;
1044 : chr *limit;
1045 : int min_matches;
1046 : size_t max_matches;
1047 : int nverified;
1048 : int k;
1049 : int i;
1050 : int er;
1051 :
1052 11 : assert(t->op == '*');
1053 11 : assert(t->left != NULL && t->left->cnfa.nstates > 0);
1054 11 : assert(!(t->left->flags & SHORTER));
1055 11 : assert(begin <= end);
1056 :
1057 : /*
1058 : * For the moment, assume the minimum number of matches is 1. If zero
1059 : * matches are allowed, and the target string is empty, we are allowed to
1060 : * match regardless of the contents of the iter node --- but we would
1061 : * prefer to match once, so that capturing parens get set. (An example of
1062 : * the concern here is a pattern like "()*\1", which historically this
1063 : * code has allowed to succeed.) Therefore, we deal with the zero-matches
1064 : * case at the bottom, after failing to find any other way to match.
1065 : */
1066 11 : min_matches = t->min;
1067 11 : if (min_matches <= 0)
1068 2 : min_matches = 1;
1069 :
1070 : /*
1071 : * We need workspace to track the endpoints of each sub-match. Normally
1072 : * we consider only nonzero-length sub-matches, so there can be at most
1073 : * end-begin of them. However, if min is larger than that, we will also
1074 : * consider zero-length sub-matches in order to find enough matches.
1075 : *
1076 : * For convenience, endpts[0] contains the "begin" pointer and we store
1077 : * sub-match endpoints in endpts[1..max_matches].
1078 : */
1079 11 : max_matches = end - begin;
1080 11 : if (max_matches > t->max && t->max != DUPINF)
1081 0 : max_matches = t->max;
1082 11 : if (max_matches < min_matches)
1083 2 : max_matches = min_matches;
1084 11 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1085 11 : if (endpts == NULL)
1086 0 : return REG_ESPACE;
1087 11 : endpts[0] = begin;
1088 :
1089 11 : d = getsubdfa(v, t->left);
1090 11 : if (ISERR())
1091 : {
1092 0 : FREE(endpts);
1093 0 : return v->err;
1094 : }
1095 : MDEBUG(("citer %d\n", t->id));
1096 :
1097 : /*
1098 : * Our strategy is to first find a set of sub-match endpoints that are
1099 : * valid according to the child node's DFA, and then recursively dissect
1100 : * each sub-match to confirm validity. If any validity check fails,
1101 : * backtrack the last sub-match and try again. And, when we next try for
1102 : * a validity check, we need not recheck any successfully verified
1103 : * sub-matches that we didn't move the endpoints of. nverified remembers
1104 : * how many sub-matches are currently known okay.
1105 : */
1106 :
1107 : /* initialize to consider first sub-match */
1108 11 : nverified = 0;
1109 11 : k = 1;
1110 11 : limit = end;
1111 :
1112 : /* iterate until satisfaction or failure */
1113 112 : while (k > 0)
1114 : {
1115 : /* try to find an endpoint for the k'th sub-match */
1116 93 : endpts[k] = longest(v, d, endpts[k - 1], limit, (int *) NULL);
1117 93 : if (ISERR())
1118 : {
1119 0 : FREE(endpts);
1120 0 : return v->err;
1121 : }
1122 93 : if (endpts[k] == NULL)
1123 : {
1124 : /* no match possible, so see if we can shorten previous one */
1125 43 : k--;
1126 43 : goto backtrack;
1127 : }
1128 : MDEBUG(("%d: working endpoint %d: %ld\n",
1129 : t->id, k, LOFF(endpts[k])));
1130 :
1131 : /* k'th sub-match can no longer be considered verified */
1132 50 : if (nverified >= k)
1133 2 : nverified = k - 1;
1134 :
1135 50 : if (endpts[k] != end)
1136 : {
1137 : /* haven't reached end yet, try another iteration if allowed */
1138 37 : if (k >= max_matches)
1139 : {
1140 : /* must try to shorten some previous match */
1141 0 : k--;
1142 0 : goto backtrack;
1143 : }
1144 :
1145 : /* reject zero-length match unless necessary to achieve min */
1146 37 : if (endpts[k] == endpts[k - 1] &&
1147 0 : (k >= min_matches || min_matches - k < end - endpts[k]))
1148 : goto backtrack;
1149 :
1150 37 : k++;
1151 37 : limit = end;
1152 37 : continue;
1153 : }
1154 :
1155 : /*
1156 : * We've identified a way to divide the string into k sub-matches that
1157 : * works so far as the child DFA can tell. If k is an allowed number
1158 : * of matches, start the slow part: recurse to verify each sub-match.
1159 : * We always have k <= max_matches, needn't check that.
1160 : */
1161 13 : if (k < min_matches)
1162 0 : goto backtrack;
1163 :
1164 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1165 :
1166 40 : for (i = nverified + 1; i <= k; i++)
1167 : {
1168 17 : zaptreesubs(v, t->left);
1169 17 : er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1170 17 : if (er == REG_OKAY)
1171 : {
1172 7 : nverified = i;
1173 7 : continue;
1174 : }
1175 10 : if (er == REG_NOMATCH)
1176 10 : break;
1177 : /* oops, something failed */
1178 0 : FREE(endpts);
1179 0 : return er;
1180 : }
1181 :
1182 13 : if (i > k)
1183 : {
1184 : /* satisfaction */
1185 : MDEBUG(("%d successful\n", t->id));
1186 3 : FREE(endpts);
1187 3 : return REG_OKAY;
1188 : }
1189 :
1190 : /* match failed to verify, so backtrack */
1191 :
1192 : backtrack:
1193 :
1194 : /*
1195 : * Must consider shorter versions of the current sub-match. However,
1196 : * we'll only ask for a zero-length match if necessary.
1197 : */
1198 106 : while (k > 0)
1199 : {
1200 45 : chr *prev_end = endpts[k - 1];
1201 :
1202 45 : if (endpts[k] > prev_end)
1203 : {
1204 45 : limit = endpts[k] - 1;
1205 45 : if (limit > prev_end ||
1206 0 : (k < min_matches && min_matches - k >= end - prev_end))
1207 : {
1208 : /* break out of backtrack loop, continue the outer one */
1209 : break;
1210 : }
1211 : }
1212 : /* can't shorten k'th sub-match any more, consider previous one */
1213 0 : k--;
1214 : }
1215 : }
1216 :
1217 : /* all possibilities exhausted */
1218 8 : FREE(endpts);
1219 :
1220 : /*
1221 : * Now consider the possibility that we can match to a zero-length string
1222 : * by using zero repetitions.
1223 : */
1224 8 : if (t->min == 0 && begin == end)
1225 : {
1226 : MDEBUG(("%d allowing zero matches\n", t->id));
1227 1 : return REG_OKAY;
1228 : }
1229 :
1230 : MDEBUG(("%d failed\n", t->id));
1231 7 : return REG_NOMATCH;
1232 : }
1233 :
1234 : /*
1235 : * creviterdissect - dissect match for iteration node, shortest-first
1236 : */
1237 : static int /* regexec return code */
1238 1 : creviterdissect(struct vars *v,
1239 : struct subre *t,
1240 : chr *begin, /* beginning of relevant substring */
1241 : chr *end) /* end of same */
1242 : {
1243 : struct dfa *d;
1244 : chr **endpts;
1245 : chr *limit;
1246 : int min_matches;
1247 : size_t max_matches;
1248 : int nverified;
1249 : int k;
1250 : int i;
1251 : int er;
1252 :
1253 1 : assert(t->op == '*');
1254 1 : assert(t->left != NULL && t->left->cnfa.nstates > 0);
1255 1 : assert(t->left->flags & SHORTER);
1256 1 : assert(begin <= end);
1257 :
1258 : /*
1259 : * If zero matches are allowed, and target string is empty, just declare
1260 : * victory. OTOH, if target string isn't empty, zero matches can't work
1261 : * so we pretend the min is 1.
1262 : */
1263 1 : min_matches = t->min;
1264 1 : if (min_matches <= 0)
1265 : {
1266 1 : if (begin == end)
1267 0 : return REG_OKAY;
1268 1 : min_matches = 1;
1269 : }
1270 :
1271 : /*
1272 : * We need workspace to track the endpoints of each sub-match. Normally
1273 : * we consider only nonzero-length sub-matches, so there can be at most
1274 : * end-begin of them. However, if min is larger than that, we will also
1275 : * consider zero-length sub-matches in order to find enough matches.
1276 : *
1277 : * For convenience, endpts[0] contains the "begin" pointer and we store
1278 : * sub-match endpoints in endpts[1..max_matches].
1279 : */
1280 1 : max_matches = end - begin;
1281 1 : if (max_matches > t->max && t->max != DUPINF)
1282 1 : max_matches = t->max;
1283 1 : if (max_matches < min_matches)
1284 0 : max_matches = min_matches;
1285 1 : endpts = (chr **) MALLOC((max_matches + 1) * sizeof(chr *));
1286 1 : if (endpts == NULL)
1287 0 : return REG_ESPACE;
1288 1 : endpts[0] = begin;
1289 :
1290 1 : d = getsubdfa(v, t->left);
1291 1 : if (ISERR())
1292 : {
1293 0 : FREE(endpts);
1294 0 : return v->err;
1295 : }
1296 : MDEBUG(("creviter %d\n", t->id));
1297 :
1298 : /*
1299 : * Our strategy is to first find a set of sub-match endpoints that are
1300 : * valid according to the child node's DFA, and then recursively dissect
1301 : * each sub-match to confirm validity. If any validity check fails,
1302 : * backtrack the last sub-match and try again. And, when we next try for
1303 : * a validity check, we need not recheck any successfully verified
1304 : * sub-matches that we didn't move the endpoints of. nverified remembers
1305 : * how many sub-matches are currently known okay.
1306 : */
1307 :
1308 : /* initialize to consider first sub-match */
1309 1 : nverified = 0;
1310 1 : k = 1;
1311 1 : limit = begin;
1312 :
1313 : /* iterate until satisfaction or failure */
1314 2 : while (k > 0)
1315 : {
1316 : /* disallow zero-length match unless necessary to achieve min */
1317 1 : if (limit == endpts[k - 1] &&
1318 1 : limit != end &&
1319 0 : (k >= min_matches || min_matches - k < end - limit))
1320 1 : limit++;
1321 :
1322 : /* if this is the last allowed sub-match, it must reach to the end */
1323 1 : if (k >= max_matches)
1324 1 : limit = end;
1325 :
1326 : /* try to find an endpoint for the k'th sub-match */
1327 1 : endpts[k] = shortest(v, d, endpts[k - 1], limit, end,
1328 : (chr **) NULL, (int *) NULL);
1329 1 : if (ISERR())
1330 : {
1331 0 : FREE(endpts);
1332 0 : return v->err;
1333 : }
1334 1 : if (endpts[k] == NULL)
1335 : {
1336 : /* no match possible, so see if we can lengthen previous one */
1337 0 : k--;
1338 0 : goto backtrack;
1339 : }
1340 : MDEBUG(("%d: working endpoint %d: %ld\n",
1341 : t->id, k, LOFF(endpts[k])));
1342 :
1343 : /* k'th sub-match can no longer be considered verified */
1344 1 : if (nverified >= k)
1345 0 : nverified = k - 1;
1346 :
1347 1 : if (endpts[k] != end)
1348 : {
1349 : /* haven't reached end yet, try another iteration if allowed */
1350 0 : if (k >= max_matches)
1351 : {
1352 : /* must try to lengthen some previous match */
1353 0 : k--;
1354 0 : goto backtrack;
1355 : }
1356 :
1357 0 : k++;
1358 0 : limit = endpts[k - 1];
1359 0 : continue;
1360 : }
1361 :
1362 : /*
1363 : * We've identified a way to divide the string into k sub-matches that
1364 : * works so far as the child DFA can tell. If k is an allowed number
1365 : * of matches, start the slow part: recurse to verify each sub-match.
1366 : * We always have k <= max_matches, needn't check that.
1367 : */
1368 1 : if (k < min_matches)
1369 0 : goto backtrack;
1370 :
1371 : MDEBUG(("%d: verifying %d..%d\n", t->id, nverified + 1, k));
1372 :
1373 4 : for (i = nverified + 1; i <= k; i++)
1374 : {
1375 1 : zaptreesubs(v, t->left);
1376 1 : er = cdissect(v, t->left, endpts[i - 1], endpts[i]);
1377 1 : if (er == REG_OKAY)
1378 : {
1379 1 : nverified = i;
1380 1 : continue;
1381 : }
1382 0 : if (er == REG_NOMATCH)
1383 0 : break;
1384 : /* oops, something failed */
1385 0 : FREE(endpts);
1386 0 : return er;
1387 : }
1388 :
1389 1 : if (i > k)
1390 : {
1391 : /* satisfaction */
1392 : MDEBUG(("%d successful\n", t->id));
1393 1 : FREE(endpts);
1394 1 : return REG_OKAY;
1395 : }
1396 :
1397 : /* match failed to verify, so backtrack */
1398 :
1399 : backtrack:
1400 :
1401 : /*
1402 : * Must consider longer versions of the current sub-match.
1403 : */
1404 0 : while (k > 0)
1405 : {
1406 0 : if (endpts[k] < end)
1407 : {
1408 0 : limit = endpts[k] + 1;
1409 : /* break out of backtrack loop, continue the outer one */
1410 0 : break;
1411 : }
1412 : /* can't lengthen k'th sub-match any more, consider previous one */
1413 0 : k--;
1414 : }
1415 : }
1416 :
1417 : /* all possibilities exhausted */
1418 : MDEBUG(("%d failed\n", t->id));
1419 0 : FREE(endpts);
1420 0 : return REG_NOMATCH;
1421 : }
1422 :
1423 :
1424 :
1425 : #include "rege_dfa.c"
|