Line data Source code
1 : /*
2 : * DFA routines
3 : * This file is #included by regexec.c.
4 : *
5 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 : *
7 : * Development of this software was funded, in part, by Cray Research Inc.,
8 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 : * Corporation, none of whom are responsible for the results. The author
10 : * thanks all of them.
11 : *
12 : * Redistribution and use in source and binary forms -- with or without
13 : * modification -- are permitted for any purpose, provided that
14 : * redistributions in source form retain this entire copyright notice and
15 : * indicate the origin and nature of any modifications.
16 : *
17 : * I'd appreciate being given credit for this package in the documentation
18 : * of software which uses it, but that is not a requirement.
19 : *
20 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * src/backend/regex/rege_dfa.c
32 : *
33 : */
34 :
35 : /*
36 : * longest - longest-preferred matching engine
37 : *
38 : * On success, returns match endpoint address. Returns NULL on no match.
39 : * Internal errors also return NULL, with v->err set.
40 : */
41 : static chr *
42 1551 : longest(struct vars *v,
43 : struct dfa *d,
44 : chr *start, /* where the match should start */
45 : chr *stop, /* match must end at or before here */
46 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
47 : {
48 : chr *cp;
49 1551 : chr *realstop = (stop == v->stop) ? stop : stop + 1;
50 : color co;
51 : struct sset *css;
52 : struct sset *ss;
53 : chr *post;
54 : int i;
55 1551 : struct colormap *cm = d->cm;
56 :
57 : /* prevent "uninitialized variable" warnings */
58 1551 : if (hitstopp != NULL)
59 347 : *hitstopp = 0;
60 :
61 : /* initialize */
62 1551 : css = initialize(v, d, start);
63 1551 : if (css == NULL)
64 0 : return NULL;
65 1551 : cp = start;
66 :
67 : /* startup */
68 : FDEBUG(("+++ startup +++\n"));
69 1551 : if (cp == v->start)
70 : {
71 255 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
72 : FDEBUG(("color %ld\n", (long) co));
73 : }
74 : else
75 : {
76 1296 : co = GETCOLOR(cm, *(cp - 1));
77 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
78 : }
79 1551 : css = miss(v, d, css, co, cp, start);
80 1551 : if (css == NULL)
81 47 : return NULL;
82 1504 : css->lastseen = cp;
83 :
84 : /*
85 : * This is the main text-scanning loop. It seems worth having two copies
86 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
87 : * builds, when you're not actively tracing.
88 : */
89 : #ifdef REG_DEBUG
90 : if (v->eflags & REG_FTRACE)
91 : {
92 : while (cp < realstop)
93 : {
94 : FDEBUG(("+++ at c%d +++\n", (int) (css - d->ssets)));
95 : co = GETCOLOR(cm, *cp);
96 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
97 : ss = css->outs[co];
98 : if (ss == NULL)
99 : {
100 : ss = miss(v, d, css, co, cp + 1, start);
101 : if (ss == NULL)
102 : break; /* NOTE BREAK OUT */
103 : }
104 : cp++;
105 : ss->lastseen = cp;
106 : css = ss;
107 : }
108 : }
109 : else
110 : #endif
111 : {
112 9732 : while (cp < realstop)
113 : {
114 7398 : co = GETCOLOR(cm, *cp);
115 7398 : ss = css->outs[co];
116 7398 : if (ss == NULL)
117 : {
118 2825 : ss = miss(v, d, css, co, cp + 1, start);
119 2825 : if (ss == NULL)
120 674 : break; /* NOTE BREAK OUT */
121 : }
122 6724 : cp++;
123 6724 : ss->lastseen = cp;
124 6724 : css = ss;
125 : }
126 : }
127 :
128 1504 : if (ISERR())
129 0 : return NULL;
130 :
131 : /* shutdown */
132 : FDEBUG(("+++ shutdown at c%d +++\n", (int) (css - d->ssets)));
133 1504 : if (cp == v->stop && stop == v->stop)
134 : {
135 311 : if (hitstopp != NULL)
136 59 : *hitstopp = 1;
137 311 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
138 : FDEBUG(("color %ld\n", (long) co));
139 311 : ss = miss(v, d, css, co, cp, start);
140 311 : if (ISERR())
141 0 : return NULL;
142 : /* special case: match ended at eol? */
143 311 : if (ss != NULL && (ss->flags & POSTSTATE))
144 263 : return cp;
145 48 : else if (ss != NULL)
146 0 : ss->lastseen = cp; /* to be tidy */
147 : }
148 :
149 : /* find last match, if any */
150 1241 : post = d->lastpost;
151 6291 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
152 5050 : if ((ss->flags & POSTSTATE) && post != ss->lastseen &&
153 223 : (post == NULL || post < ss->lastseen))
154 1059 : post = ss->lastseen;
155 1241 : if (post != NULL) /* found one */
156 999 : return post - 1;
157 :
158 242 : return NULL;
159 : }
160 :
161 : /*
162 : * shortest - shortest-preferred matching engine
163 : *
164 : * On success, returns match endpoint address. Returns NULL on no match.
165 : * Internal errors also return NULL, with v->err set.
166 : */
167 : static chr *
168 32561 : shortest(struct vars *v,
169 : struct dfa *d,
170 : chr *start, /* where the match should start */
171 : chr *min, /* match must end at or after here */
172 : chr *max, /* match must end at or before here */
173 : chr **coldp, /* store coldstart pointer here, if non-NULL */
174 : int *hitstopp) /* record whether hit v->stop, if non-NULL */
175 : {
176 : chr *cp;
177 32561 : chr *realmin = (min == v->stop) ? min : min + 1;
178 32561 : chr *realmax = (max == v->stop) ? max : max + 1;
179 : color co;
180 : struct sset *css;
181 : struct sset *ss;
182 32561 : struct colormap *cm = d->cm;
183 :
184 : /* prevent "uninitialized variable" warnings */
185 32561 : if (coldp != NULL)
186 32299 : *coldp = NULL;
187 32561 : if (hitstopp != NULL)
188 33 : *hitstopp = 0;
189 :
190 : /* initialize */
191 32561 : css = initialize(v, d, start);
192 32561 : if (css == NULL)
193 0 : return NULL;
194 32561 : cp = start;
195 :
196 : /* startup */
197 : FDEBUG(("--- startup ---\n"));
198 32561 : if (cp == v->start)
199 : {
200 32093 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
201 : FDEBUG(("color %ld\n", (long) co));
202 : }
203 : else
204 : {
205 468 : co = GETCOLOR(cm, *(cp - 1));
206 : FDEBUG(("char %c, color %ld\n", (char) *(cp - 1), (long) co));
207 : }
208 32561 : css = miss(v, d, css, co, cp, start);
209 32561 : if (css == NULL)
210 0 : return NULL;
211 32561 : css->lastseen = cp;
212 32561 : ss = css;
213 :
214 : /*
215 : * This is the main text-scanning loop. It seems worth having two copies
216 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
217 : * builds, when you're not actively tracing.
218 : */
219 : #ifdef REG_DEBUG
220 : if (v->eflags & REG_FTRACE)
221 : {
222 : while (cp < realmax)
223 : {
224 : FDEBUG(("--- at c%d ---\n", (int) (css - d->ssets)));
225 : co = GETCOLOR(cm, *cp);
226 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
227 : ss = css->outs[co];
228 : if (ss == NULL)
229 : {
230 : ss = miss(v, d, css, co, cp + 1, start);
231 : if (ss == NULL)
232 : break; /* NOTE BREAK OUT */
233 : }
234 : cp++;
235 : ss->lastseen = cp;
236 : css = ss;
237 : if ((ss->flags & POSTSTATE) && cp >= realmin)
238 : break; /* NOTE BREAK OUT */
239 : }
240 : }
241 : else
242 : #endif
243 : {
244 631837 : while (cp < realmax)
245 : {
246 580189 : co = GETCOLOR(cm, *cp);
247 580189 : ss = css->outs[co];
248 580189 : if (ss == NULL)
249 : {
250 82580 : ss = miss(v, d, css, co, cp + 1, start);
251 82580 : if (ss == NULL)
252 11652 : break; /* NOTE BREAK OUT */
253 : }
254 568537 : cp++;
255 568537 : ss->lastseen = cp;
256 568537 : css = ss;
257 568537 : if ((ss->flags & POSTSTATE) && cp >= realmin)
258 1822 : break; /* NOTE BREAK OUT */
259 : }
260 : }
261 :
262 32561 : if (ss == NULL)
263 11652 : return NULL;
264 :
265 20909 : if (coldp != NULL) /* report last no-progress state set, if any */
266 20653 : *coldp = lastcold(v, d);
267 :
268 20909 : if ((ss->flags & POSTSTATE) && cp > min)
269 : {
270 1811 : assert(cp >= realmin);
271 1811 : cp--;
272 : }
273 19098 : else if (cp == v->stop && max == v->stop)
274 : {
275 19098 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
276 : FDEBUG(("color %ld\n", (long) co));
277 19098 : ss = miss(v, d, css, co, cp, start);
278 : /* match might have ended at eol */
279 19098 : if ((ss == NULL || !(ss->flags & POSTSTATE)) && hitstopp != NULL)
280 2 : *hitstopp = 1;
281 : }
282 :
283 20909 : if (ss == NULL || !(ss->flags & POSTSTATE))
284 18267 : return NULL;
285 :
286 2642 : return cp;
287 : }
288 :
289 : /*
290 : * matchuntil - incremental matching engine
291 : *
292 : * This is meant for use with a search-style NFA (that is, the pattern is
293 : * known to act as though it had a leading .*). We determine whether a
294 : * match exists starting at v->start and ending at probe. Multiple calls
295 : * require only O(N) time not O(N^2) so long as the probe values are
296 : * nondecreasing. *lastcss and *lastcp must be initialized to NULL before
297 : * starting a series of calls.
298 : *
299 : * Returns 1 if a match exists, 0 if not.
300 : * Internal errors also return 0, with v->err set.
301 : */
302 : static int
303 14 : matchuntil(struct vars *v,
304 : struct dfa *d,
305 : chr *probe, /* we want to know if a match ends here */
306 : struct sset **lastcss, /* state storage across calls */
307 : chr **lastcp) /* state storage across calls */
308 : {
309 14 : chr *cp = *lastcp;
310 : color co;
311 14 : struct sset *css = *lastcss;
312 : struct sset *ss;
313 14 : struct colormap *cm = d->cm;
314 :
315 : /* initialize and startup, or restart, if necessary */
316 14 : if (cp == NULL || cp > probe)
317 : {
318 4 : cp = v->start;
319 4 : css = initialize(v, d, cp);
320 4 : if (css == NULL)
321 0 : return 0;
322 :
323 : FDEBUG((">>> startup >>>\n"));
324 4 : co = d->cnfa->bos[(v->eflags & REG_NOTBOL) ? 0 : 1];
325 : FDEBUG(("color %ld\n", (long) co));
326 :
327 4 : css = miss(v, d, css, co, cp, v->start);
328 4 : if (css == NULL)
329 0 : return 0;
330 4 : css->lastseen = cp;
331 : }
332 10 : else if (css == NULL)
333 : {
334 : /* we previously found that no match is possible beyond *lastcp */
335 0 : return 0;
336 : }
337 14 : ss = css;
338 :
339 : /*
340 : * This is the main text-scanning loop. It seems worth having two copies
341 : * to avoid the overhead of REG_FTRACE tests here, even in REG_DEBUG
342 : * builds, when you're not actively tracing.
343 : */
344 : #ifdef REG_DEBUG
345 : if (v->eflags & REG_FTRACE)
346 : {
347 : while (cp < probe)
348 : {
349 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
350 : co = GETCOLOR(cm, *cp);
351 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
352 : ss = css->outs[co];
353 : if (ss == NULL)
354 : {
355 : ss = miss(v, d, css, co, cp + 1, v->start);
356 : if (ss == NULL)
357 : break; /* NOTE BREAK OUT */
358 : }
359 : cp++;
360 : ss->lastseen = cp;
361 : css = ss;
362 : }
363 : }
364 : else
365 : #endif
366 : {
367 44 : while (cp < probe)
368 : {
369 16 : co = GETCOLOR(cm, *cp);
370 16 : ss = css->outs[co];
371 16 : if (ss == NULL)
372 : {
373 2 : ss = miss(v, d, css, co, cp + 1, v->start);
374 2 : if (ss == NULL)
375 0 : break; /* NOTE BREAK OUT */
376 : }
377 16 : cp++;
378 16 : ss->lastseen = cp;
379 16 : css = ss;
380 : }
381 : }
382 :
383 14 : *lastcss = ss;
384 14 : *lastcp = cp;
385 :
386 14 : if (ss == NULL)
387 0 : return 0; /* impossible match, or internal error */
388 :
389 : /* We need to process one more chr, or the EOS symbol, to check match */
390 14 : if (cp < v->stop)
391 : {
392 : FDEBUG((">>> at c%d >>>\n", (int) (css - d->ssets)));
393 14 : co = GETCOLOR(cm, *cp);
394 : FDEBUG(("char %c, color %ld\n", (char) *cp, (long) co));
395 14 : ss = css->outs[co];
396 14 : if (ss == NULL)
397 9 : ss = miss(v, d, css, co, cp + 1, v->start);
398 : }
399 : else
400 : {
401 0 : assert(cp == v->stop);
402 0 : co = d->cnfa->eos[(v->eflags & REG_NOTEOL) ? 0 : 1];
403 : FDEBUG(("color %ld\n", (long) co));
404 0 : ss = miss(v, d, css, co, cp, v->start);
405 : }
406 :
407 14 : if (ss == NULL || !(ss->flags & POSTSTATE))
408 10 : return 0;
409 :
410 4 : return 1;
411 : }
412 :
413 : /*
414 : * lastcold - determine last point at which no progress had been made
415 : */
416 : static chr * /* endpoint, or NULL */
417 20653 : lastcold(struct vars *v,
418 : struct dfa *d)
419 : {
420 : struct sset *ss;
421 : chr *nopr;
422 : int i;
423 :
424 20653 : nopr = d->lastnopr;
425 20653 : if (nopr == NULL)
426 20653 : nopr = v->start;
427 85026 : for (ss = d->ssets, i = d->nssused; i > 0; ss++, i--)
428 64373 : if ((ss->flags & NOPROGRESS) && nopr < ss->lastseen)
429 13175 : nopr = ss->lastseen;
430 20653 : return nopr;
431 : }
432 :
433 : /*
434 : * newdfa - set up a fresh DFA
435 : */
436 : static struct dfa *
437 32996 : newdfa(struct vars *v,
438 : struct cnfa *cnfa,
439 : struct colormap *cm,
440 : struct smalldfa *sml) /* preallocated space, may be NULL */
441 : {
442 : struct dfa *d;
443 32996 : size_t nss = cnfa->nstates * 2;
444 32996 : int wordsper = (cnfa->nstates + UBITS - 1) / UBITS;
445 32996 : struct smalldfa *smallwas = sml;
446 :
447 32996 : assert(cnfa != NULL && cnfa->nstates != 0);
448 :
449 32996 : if (nss <= FEWSTATES && cnfa->ncolors <= FEWCOLORS)
450 : {
451 21989 : assert(wordsper == 1);
452 21989 : if (sml == NULL)
453 : {
454 328 : sml = (struct smalldfa *) MALLOC(sizeof(struct smalldfa));
455 328 : if (sml == NULL)
456 : {
457 0 : ERR(REG_ESPACE);
458 0 : return NULL;
459 : }
460 : }
461 21989 : d = &sml->dfa;
462 21989 : d->ssets = sml->ssets;
463 21989 : d->statesarea = sml->statesarea;
464 21989 : d->work = &d->statesarea[nss];
465 21989 : d->outsarea = sml->outsarea;
466 21989 : d->incarea = sml->incarea;
467 21989 : d->cptsmalloced = 0;
468 21989 : d->mallocarea = (smallwas == NULL) ? (char *) sml : NULL;
469 : }
470 : else
471 : {
472 11007 : d = (struct dfa *) MALLOC(sizeof(struct dfa));
473 11007 : if (d == NULL)
474 : {
475 0 : ERR(REG_ESPACE);
476 0 : return NULL;
477 : }
478 11007 : d->ssets = (struct sset *) MALLOC(nss * sizeof(struct sset));
479 11007 : d->statesarea = (unsigned *) MALLOC((nss + WORK) * wordsper *
480 : sizeof(unsigned));
481 11007 : d->work = &d->statesarea[nss * wordsper];
482 11007 : d->outsarea = (struct sset **) MALLOC(nss * cnfa->ncolors *
483 : sizeof(struct sset *));
484 11007 : d->incarea = (struct arcp *) MALLOC(nss * cnfa->ncolors *
485 : sizeof(struct arcp));
486 11007 : d->cptsmalloced = 1;
487 11007 : d->mallocarea = (char *) d;
488 22014 : if (d->ssets == NULL || d->statesarea == NULL ||
489 22014 : d->outsarea == NULL || d->incarea == NULL)
490 : {
491 0 : freedfa(d);
492 0 : ERR(REG_ESPACE);
493 0 : return NULL;
494 : }
495 : }
496 :
497 32996 : d->nssets = (v->eflags & REG_SMALL) ? 7 : nss;
498 32996 : d->nssused = 0;
499 32996 : d->nstates = cnfa->nstates;
500 32996 : d->ncolors = cnfa->ncolors;
501 32996 : d->wordsper = wordsper;
502 32996 : d->cnfa = cnfa;
503 32996 : d->cm = cm;
504 32996 : d->lastpost = NULL;
505 32996 : d->lastnopr = NULL;
506 32996 : d->search = d->ssets;
507 :
508 : /* initialization of sset fields is done as needed */
509 :
510 32996 : return d;
511 : }
512 :
513 : /*
514 : * freedfa - free a DFA
515 : */
516 : static void
517 32996 : freedfa(struct dfa *d)
518 : {
519 32996 : if (d->cptsmalloced)
520 : {
521 11007 : if (d->ssets != NULL)
522 11007 : FREE(d->ssets);
523 11007 : if (d->statesarea != NULL)
524 11007 : FREE(d->statesarea);
525 11007 : if (d->outsarea != NULL)
526 11007 : FREE(d->outsarea);
527 11007 : if (d->incarea != NULL)
528 11007 : FREE(d->incarea);
529 : }
530 :
531 32996 : if (d->mallocarea != NULL)
532 11335 : FREE(d->mallocarea);
533 32996 : }
534 :
535 : /*
536 : * hash - construct a hash code for a bitvector
537 : *
538 : * There are probably better ways, but they're more expensive.
539 : */
540 : static unsigned
541 3 : hash(unsigned *uv,
542 : int n)
543 : {
544 : int i;
545 : unsigned h;
546 :
547 3 : h = 0;
548 12 : for (i = 0; i < n; i++)
549 9 : h ^= uv[i];
550 3 : return h;
551 : }
552 :
553 : /*
554 : * initialize - hand-craft a cache entry for startup, otherwise get ready
555 : */
556 : static struct sset *
557 34116 : initialize(struct vars *v,
558 : struct dfa *d,
559 : chr *start)
560 : {
561 : struct sset *ss;
562 : int i;
563 :
564 : /* is previous one still there? */
565 34116 : if (d->nssused > 0 && (d->ssets[0].flags & STARTER))
566 1121 : ss = &d->ssets[0];
567 : else
568 : { /* no, must (re)build it */
569 32995 : ss = getvacant(v, d, start, start);
570 32995 : if (ss == NULL)
571 0 : return NULL;
572 65992 : for (i = 0; i < d->wordsper; i++)
573 32997 : ss->states[i] = 0;
574 32995 : BSET(ss->states, d->cnfa->pre);
575 32995 : ss->hash = HASH(ss->states, d->wordsper);
576 32995 : assert(d->cnfa->pre != d->cnfa->post);
577 32995 : ss->flags = STARTER | LOCKED | NOPROGRESS;
578 : /* lastseen dealt with below */
579 : }
580 :
581 71769 : for (i = 0; i < d->nssused; i++)
582 37653 : d->ssets[i].lastseen = NULL;
583 34116 : ss->lastseen = start; /* maybe untrue, but harmless */
584 34116 : d->lastpost = NULL;
585 34116 : d->lastnopr = NULL;
586 34116 : return ss;
587 : }
588 :
589 : /*
590 : * miss - handle a stateset cache miss
591 : *
592 : * css is the current stateset, co is the color of the current input character,
593 : * cp points to the character after that (which is where we may need to test
594 : * LACONs). start does not affect matching behavior but is needed for pickss'
595 : * heuristics about which stateset cache entry to replace.
596 : *
597 : * Ordinarily, returns the address of the next stateset (the one that is
598 : * valid after consuming the input character). Returns NULL if no valid
599 : * NFA states remain, ie we have a certain match failure.
600 : * Internal errors also return NULL, with v->err set.
601 : */
602 : static struct sset *
603 138941 : miss(struct vars *v,
604 : struct dfa *d,
605 : struct sset *css,
606 : color co,
607 : chr *cp, /* next chr */
608 : chr *start) /* where the attempt got started */
609 : {
610 138941 : struct cnfa *cnfa = d->cnfa;
611 : int i;
612 : unsigned h;
613 : struct carc *ca;
614 : struct sset *p;
615 : int ispost;
616 : int noprogress;
617 : int gotstate;
618 : int dolacons;
619 : int sawlacons;
620 :
621 : /* for convenience, we can be called even if it might not be a miss */
622 138941 : if (css->outs[co] != NULL)
623 : {
624 : FDEBUG(("hit\n"));
625 1075 : return css->outs[co];
626 : }
627 : FDEBUG(("miss\n"));
628 :
629 : /*
630 : * Checking for operation cancel in the inner text search loop seems
631 : * unduly expensive. As a compromise, check during cache misses.
632 : */
633 137866 : if (CANCEL_REQUESTED(v->re))
634 : {
635 0 : ERR(REG_CANCEL);
636 0 : return NULL;
637 : }
638 :
639 : /*
640 : * What set of states would we end up in after consuming the co character?
641 : * We first consider PLAIN arcs that consume the character, and then look
642 : * to see what LACON arcs could be traversed after consuming it.
643 : */
644 275736 : for (i = 0; i < d->wordsper; i++)
645 137870 : d->work[i] = 0; /* build new stateset bitmap in d->work */
646 137866 : ispost = 0;
647 137866 : noprogress = 1;
648 137866 : gotstate = 0;
649 1402487 : for (i = 0; i < d->nstates; i++)
650 1264621 : if (ISBSET(css->states, i))
651 2376738 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
652 2123893 : if (ca->co == co)
653 : {
654 238544 : BSET(d->work, ca->to);
655 238544 : gotstate = 1;
656 238544 : if (ca->to == cnfa->post)
657 3630 : ispost = 1;
658 238544 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
659 55710 : noprogress = 0;
660 : FDEBUG(("%d -> %d\n", i, ca->to));
661 : }
662 137866 : if (!gotstate)
663 30688 : return NULL; /* character cannot reach any new state */
664 107178 : dolacons = (cnfa->flags & HASLACONS);
665 107178 : sawlacons = 0;
666 : /* outer loop handles transitive closure of reachable-by-LACON states */
667 214607 : while (dolacons)
668 : {
669 251 : dolacons = 0;
670 2385 : for (i = 0; i < d->nstates; i++)
671 2134 : if (ISBSET(d->work, i))
672 754 : for (ca = cnfa->states[i]; ca->co != COLORLESS; ca++)
673 : {
674 462 : if (ca->co < cnfa->ncolors)
675 424 : continue; /* not a LACON arc */
676 38 : if (ISBSET(d->work, ca->to))
677 12 : continue; /* arc would be a no-op anyway */
678 26 : sawlacons = 1; /* this LACON affects our result */
679 26 : if (!lacon(v, cnfa, cp, ca->co))
680 : {
681 16 : if (ISERR())
682 0 : return NULL;
683 16 : continue; /* LACON arc cannot be traversed */
684 : }
685 10 : if (ISERR())
686 0 : return NULL;
687 10 : BSET(d->work, ca->to);
688 10 : dolacons = 1;
689 10 : if (ca->to == cnfa->post)
690 0 : ispost = 1;
691 10 : if (!(cnfa->stflags[ca->to] & CNFA_NOPROGRESS))
692 10 : noprogress = 0;
693 : FDEBUG(("%d :> %d\n", i, ca->to));
694 : }
695 : }
696 107178 : h = HASH(d->work, d->wordsper);
697 :
698 : /* Is this stateset already in the cache? */
699 314651 : for (p = d->ssets, i = d->nssused; i > 0; p++, i--)
700 255126 : if (HIT(h, d->work, p, d->wordsper))
701 : {
702 : FDEBUG(("cached c%d\n", (int) (p - d->ssets)));
703 : break; /* NOTE BREAK OUT */
704 : }
705 107178 : if (i == 0)
706 : { /* nope, need a new cache entry */
707 59525 : p = getvacant(v, d, cp, start);
708 59525 : if (p == NULL)
709 0 : return NULL;
710 59525 : assert(p != css);
711 119054 : for (i = 0; i < d->wordsper; i++)
712 59529 : p->states[i] = d->work[i];
713 59525 : p->hash = h;
714 59525 : p->flags = (ispost) ? POSTSTATE : 0;
715 59525 : if (noprogress)
716 32999 : p->flags |= NOPROGRESS;
717 : /* lastseen to be dealt with by caller */
718 : }
719 :
720 : /*
721 : * Link new stateset to old, unless a LACON affected the result, in which
722 : * case we don't create the link. That forces future transitions across
723 : * this same arc (same prior stateset and character color) to come through
724 : * miss() again, so that we can recheck the LACON(s), which might or might
725 : * not pass since context will be different.
726 : */
727 107178 : if (!sawlacons)
728 : {
729 : FDEBUG(("c%d[%d]->c%d\n",
730 : (int) (css - d->ssets), co, (int) (p - d->ssets)));
731 107159 : css->outs[co] = p;
732 107159 : css->inchain[co] = p->ins;
733 107159 : p->ins.ss = css;
734 107159 : p->ins.co = co;
735 : }
736 107178 : return p;
737 : }
738 :
739 : /*
740 : * lacon - lookaround-constraint checker for miss()
741 : */
742 : static int /* predicate: constraint satisfied? */
743 26 : lacon(struct vars *v,
744 : struct cnfa *pcnfa, /* parent cnfa */
745 : chr *cp,
746 : color co) /* "color" of the lookaround constraint */
747 : {
748 : int n;
749 : struct subre *sub;
750 : struct dfa *d;
751 : chr *end;
752 : int satisfied;
753 :
754 : /* Since this is recursive, it could be driven to stack overflow */
755 26 : if (STACK_TOO_DEEP(v->re))
756 : {
757 0 : ERR(REG_ETOOBIG);
758 0 : return 0;
759 : }
760 :
761 26 : n = co - pcnfa->ncolors;
762 26 : assert(n > 0 && n < v->g->nlacons && v->g->lacons != NULL);
763 : FDEBUG(("=== testing lacon %d\n", n));
764 26 : sub = &v->g->lacons[n];
765 26 : d = getladfa(v, n);
766 26 : if (d == NULL)
767 0 : return 0;
768 26 : if (LATYPE_IS_AHEAD(sub->subno))
769 : {
770 : /* used to use longest() here, but shortest() could be much cheaper */
771 12 : end = shortest(v, d, cp, cp, v->stop,
772 : (chr **) NULL, (int *) NULL);
773 12 : satisfied = LATYPE_IS_POS(sub->subno) ? (end != NULL) : (end == NULL);
774 : }
775 : else
776 : {
777 : /*
778 : * To avoid doing O(N^2) work when repeatedly testing a lookbehind
779 : * constraint in an N-character string, we use matchuntil() which can
780 : * cache the DFA state across calls. We only need to restart if the
781 : * probe point decreases, which is not common. The NFA we're using is
782 : * a search NFA, so it doesn't mind scanning over stuff before the
783 : * nominal match.
784 : */
785 14 : satisfied = matchuntil(v, d, cp, &v->lblastcss[n], &v->lblastcp[n]);
786 14 : if (!LATYPE_IS_POS(sub->subno))
787 0 : satisfied = !satisfied;
788 : }
789 : FDEBUG(("=== lacon %d satisfied %d\n", n, satisfied));
790 26 : return satisfied;
791 : }
792 :
793 : /*
794 : * getvacant - get a vacant state set
795 : *
796 : * This routine clears out the inarcs and outarcs, but does not otherwise
797 : * clear the innards of the state set -- that's up to the caller.
798 : */
799 : static struct sset *
800 92520 : getvacant(struct vars *v,
801 : struct dfa *d,
802 : chr *cp,
803 : chr *start)
804 : {
805 : int i;
806 : struct sset *ss;
807 : struct sset *p;
808 : struct arcp ap;
809 : color co;
810 :
811 92520 : ss = pickss(v, d, cp, start);
812 92520 : if (ss == NULL)
813 0 : return NULL;
814 92520 : assert(!(ss->flags & LOCKED));
815 :
816 : /* clear out its inarcs, including self-referential ones */
817 92520 : ap = ss->ins;
818 185040 : while ((p = ap.ss) != NULL)
819 : {
820 0 : co = ap.co;
821 : FDEBUG(("zapping c%d's %ld outarc\n", (int) (p - d->ssets), (long) co));
822 0 : p->outs[co] = NULL;
823 0 : ap = p->inchain[co];
824 0 : p->inchain[co].ss = NULL; /* paranoia */
825 : }
826 92520 : ss->ins.ss = NULL;
827 :
828 : /* take it off the inarc chains of the ssets reached by its outarcs */
829 1007608 : for (i = 0; i < d->ncolors; i++)
830 : {
831 915088 : p = ss->outs[i];
832 915088 : assert(p != ss); /* not self-referential */
833 915088 : if (p == NULL)
834 915088 : continue; /* NOTE CONTINUE */
835 : FDEBUG(("del outarc %d from c%d's in chn\n", i, (int) (p - d->ssets)));
836 0 : if (p->ins.ss == ss && p->ins.co == i)
837 0 : p->ins = ss->inchain[i];
838 : else
839 : {
840 0 : struct arcp lastap = {NULL, 0};
841 :
842 0 : assert(p->ins.ss != NULL);
843 0 : for (ap = p->ins; ap.ss != NULL &&
844 0 : !(ap.ss == ss && ap.co == i);
845 0 : ap = ap.ss->inchain[ap.co])
846 0 : lastap = ap;
847 0 : assert(ap.ss != NULL);
848 0 : lastap.ss->inchain[lastap.co] = ss->inchain[i];
849 : }
850 0 : ss->outs[i] = NULL;
851 0 : ss->inchain[i].ss = NULL;
852 : }
853 :
854 : /* if ss was a success state, may need to remember location */
855 92520 : if ((ss->flags & POSTSTATE) && ss->lastseen != d->lastpost &&
856 0 : (d->lastpost == NULL || d->lastpost < ss->lastseen))
857 0 : d->lastpost = ss->lastseen;
858 :
859 : /* likewise for a no-progress state */
860 92520 : if ((ss->flags & NOPROGRESS) && ss->lastseen != d->lastnopr &&
861 0 : (d->lastnopr == NULL || d->lastnopr < ss->lastseen))
862 0 : d->lastnopr = ss->lastseen;
863 :
864 92520 : return ss;
865 : }
866 :
867 : /*
868 : * pickss - pick the next stateset to be used
869 : */
870 : static struct sset *
871 92520 : pickss(struct vars *v,
872 : struct dfa *d,
873 : chr *cp,
874 : chr *start)
875 : {
876 : int i;
877 : struct sset *ss;
878 : struct sset *end;
879 : chr *ancient;
880 :
881 : /* shortcut for cases where cache isn't full */
882 92520 : if (d->nssused < d->nssets)
883 : {
884 92520 : i = d->nssused;
885 92520 : d->nssused++;
886 92520 : ss = &d->ssets[i];
887 : FDEBUG(("new c%d\n", i));
888 : /* set up innards */
889 92520 : ss->states = &d->statesarea[i * d->wordsper];
890 92520 : ss->flags = 0;
891 92520 : ss->ins.ss = NULL;
892 92520 : ss->ins.co = WHITE; /* give it some value */
893 92520 : ss->outs = &d->outsarea[i * d->ncolors];
894 92520 : ss->inchain = &d->incarea[i * d->ncolors];
895 1007608 : for (i = 0; i < d->ncolors; i++)
896 : {
897 915088 : ss->outs[i] = NULL;
898 915088 : ss->inchain[i].ss = NULL;
899 : }
900 92520 : return ss;
901 : }
902 :
903 : /* look for oldest, or old enough anyway */
904 0 : if (cp - start > d->nssets * 2 / 3) /* oldest 33% are expendable */
905 0 : ancient = cp - d->nssets * 2 / 3;
906 : else
907 0 : ancient = start;
908 0 : for (ss = d->search, end = &d->ssets[d->nssets]; ss < end; ss++)
909 0 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
910 0 : !(ss->flags & LOCKED))
911 : {
912 0 : d->search = ss + 1;
913 : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
914 0 : return ss;
915 : }
916 0 : for (ss = d->ssets, end = d->search; ss < end; ss++)
917 0 : if ((ss->lastseen == NULL || ss->lastseen < ancient) &&
918 0 : !(ss->flags & LOCKED))
919 : {
920 0 : d->search = ss + 1;
921 : FDEBUG(("replacing c%d\n", (int) (ss - d->ssets)));
922 0 : return ss;
923 : }
924 :
925 : /* nobody's old enough?!? -- something's really wrong */
926 : FDEBUG(("cannot find victim to replace!\n"));
927 0 : ERR(REG_ASSERT);
928 0 : return NULL;
929 : }
|