Line data Source code
1 : /*
2 : * NFA utilities.
3 : * This file is #included by regcomp.c.
4 : *
5 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 : *
7 : * Development of this software was funded, in part, by Cray Research Inc.,
8 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 : * Corporation, none of whom are responsible for the results. The author
10 : * thanks all of them.
11 : *
12 : * Redistribution and use in source and binary forms -- with or without
13 : * modification -- are permitted for any purpose, provided that
14 : * redistributions in source form retain this entire copyright notice and
15 : * indicate the origin and nature of any modifications.
16 : *
17 : * I'd appreciate being given credit for this package in the documentation
18 : * of software which uses it, but that is not a requirement.
19 : *
20 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * src/backend/regex/regc_nfa.c
32 : *
33 : *
34 : * One or two things that technically ought to be in here
35 : * are actually in color.c, thanks to some incestuous relationships in
36 : * the color chains.
37 : */
38 :
39 : #define NISERR() VISERR(nfa->v)
40 : #define NERR(e) VERR(nfa->v, (e))
41 :
42 :
43 : /*
44 : * newnfa - set up an NFA
45 : */
46 : static struct nfa * /* the NFA, or NULL */
47 1785 : newnfa(struct vars *v,
48 : struct colormap *cm,
49 : struct nfa *parent) /* NULL if primary NFA */
50 : {
51 : struct nfa *nfa;
52 :
53 1785 : nfa = (struct nfa *) MALLOC(sizeof(struct nfa));
54 1785 : if (nfa == NULL)
55 : {
56 0 : ERR(REG_ESPACE);
57 0 : return NULL;
58 : }
59 :
60 1785 : nfa->states = NULL;
61 1785 : nfa->slast = NULL;
62 1785 : nfa->free = NULL;
63 1785 : nfa->nstates = 0;
64 1785 : nfa->cm = cm;
65 1785 : nfa->v = v;
66 1785 : nfa->bos[0] = nfa->bos[1] = COLORLESS;
67 1785 : nfa->eos[0] = nfa->eos[1] = COLORLESS;
68 1785 : nfa->parent = parent; /* Precedes newfstate so parent is valid. */
69 1785 : nfa->post = newfstate(nfa, '@'); /* number 0 */
70 1785 : nfa->pre = newfstate(nfa, '>'); /* number 1 */
71 :
72 1785 : nfa->init = newstate(nfa); /* may become invalid later */
73 1785 : nfa->final = newstate(nfa);
74 1785 : if (ISERR())
75 : {
76 0 : freenfa(nfa);
77 0 : return NULL;
78 : }
79 1785 : rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->pre, nfa->init);
80 1785 : newarc(nfa, '^', 1, nfa->pre, nfa->init);
81 1785 : newarc(nfa, '^', 0, nfa->pre, nfa->init);
82 1785 : rainbow(nfa, nfa->cm, PLAIN, COLORLESS, nfa->final, nfa->post);
83 1785 : newarc(nfa, '$', 1, nfa->final, nfa->post);
84 1785 : newarc(nfa, '$', 0, nfa->final, nfa->post);
85 :
86 1785 : if (ISERR())
87 : {
88 0 : freenfa(nfa);
89 0 : return NULL;
90 : }
91 1785 : return nfa;
92 : }
93 :
94 : /*
95 : * freenfa - free an entire NFA
96 : */
97 : static void
98 1785 : freenfa(struct nfa *nfa)
99 : {
100 : struct state *s;
101 :
102 30390 : while ((s = nfa->states) != NULL)
103 : {
104 26820 : s->nins = s->nouts = 0; /* don't worry about arcs */
105 26820 : freestate(nfa, s);
106 : }
107 73698 : while ((s = nfa->free) != NULL)
108 : {
109 70128 : nfa->free = s->next;
110 70128 : destroystate(nfa, s);
111 : }
112 :
113 1785 : nfa->slast = NULL;
114 1785 : nfa->nstates = -1;
115 1785 : nfa->pre = NULL;
116 1785 : nfa->post = NULL;
117 1785 : FREE(nfa);
118 1785 : }
119 :
120 : /*
121 : * newstate - allocate an NFA state, with zero flag value
122 : */
123 : static struct state * /* NULL on error */
124 75658 : newstate(struct nfa *nfa)
125 : {
126 : struct state *s;
127 :
128 : /*
129 : * This is a handy place to check for operation cancel during regex
130 : * compilation, since no code path will go very long without making a new
131 : * state or arc.
132 : */
133 75658 : if (CANCEL_REQUESTED(nfa->v->re))
134 : {
135 0 : NERR(REG_CANCEL);
136 0 : return NULL;
137 : }
138 :
139 75658 : if (nfa->free != NULL)
140 : {
141 5530 : s = nfa->free;
142 5530 : nfa->free = s->next;
143 : }
144 : else
145 : {
146 70128 : if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
147 : {
148 0 : NERR(REG_ETOOBIG);
149 0 : return NULL;
150 : }
151 70128 : s = (struct state *) MALLOC(sizeof(struct state));
152 70128 : if (s == NULL)
153 : {
154 0 : NERR(REG_ESPACE);
155 0 : return NULL;
156 : }
157 70128 : nfa->v->spaceused += sizeof(struct state);
158 70128 : s->oas.next = NULL;
159 70128 : s->free = NULL;
160 70128 : s->noas = 0;
161 : }
162 :
163 75658 : assert(nfa->nstates >= 0);
164 75658 : s->no = nfa->nstates++;
165 75658 : s->flag = 0;
166 75658 : if (nfa->states == NULL)
167 1785 : nfa->states = s;
168 75658 : s->nins = 0;
169 75658 : s->ins = NULL;
170 75658 : s->nouts = 0;
171 75658 : s->outs = NULL;
172 75658 : s->tmp = NULL;
173 75658 : s->next = NULL;
174 75658 : if (nfa->slast != NULL)
175 : {
176 73873 : assert(nfa->slast->next == NULL);
177 73873 : nfa->slast->next = s;
178 : }
179 75658 : s->prev = nfa->slast;
180 75658 : nfa->slast = s;
181 75658 : return s;
182 : }
183 :
184 : /*
185 : * newfstate - allocate an NFA state with a specified flag value
186 : */
187 : static struct state * /* NULL on error */
188 3570 : newfstate(struct nfa *nfa, int flag)
189 : {
190 : struct state *s;
191 :
192 3570 : s = newstate(nfa);
193 3570 : if (s != NULL)
194 3570 : s->flag = (char) flag;
195 3570 : return s;
196 : }
197 :
198 : /*
199 : * dropstate - delete a state's inarcs and outarcs and free it
200 : */
201 : static void
202 48807 : dropstate(struct nfa *nfa,
203 : struct state *s)
204 : {
205 : struct arc *a;
206 :
207 109572 : while ((a = s->ins) != NULL)
208 11958 : freearc(nfa, a);
209 132252 : while ((a = s->outs) != NULL)
210 34638 : freearc(nfa, a);
211 48807 : freestate(nfa, s);
212 48807 : }
213 :
214 : /*
215 : * freestate - free a state, which has no in-arcs or out-arcs
216 : */
217 : static void
218 75658 : freestate(struct nfa *nfa,
219 : struct state *s)
220 : {
221 75658 : assert(s != NULL);
222 75658 : assert(s->nins == 0 && s->nouts == 0);
223 :
224 75658 : s->no = FREESTATE;
225 75658 : s->flag = 0;
226 75658 : if (s->next != NULL)
227 72514 : s->next->prev = s->prev;
228 : else
229 : {
230 3144 : assert(s == nfa->slast);
231 3144 : nfa->slast = s->prev;
232 : }
233 75658 : if (s->prev != NULL)
234 48838 : s->prev->next = s->next;
235 : else
236 : {
237 26820 : assert(s == nfa->states);
238 26820 : nfa->states = s->next;
239 : }
240 75658 : s->prev = NULL;
241 75658 : s->next = nfa->free; /* don't delete it, put it on the free list */
242 75658 : nfa->free = s;
243 75658 : }
244 :
245 : /*
246 : * destroystate - really get rid of an already-freed state
247 : */
248 : static void
249 70128 : destroystate(struct nfa *nfa,
250 : struct state *s)
251 : {
252 : struct arcbatch *ab;
253 : struct arcbatch *abnext;
254 :
255 70128 : assert(s->no == FREESTATE);
256 271170 : for (ab = s->oas.next; ab != NULL; ab = abnext)
257 : {
258 201042 : abnext = ab->next;
259 201042 : FREE(ab);
260 201042 : nfa->v->spaceused -= sizeof(struct arcbatch);
261 : }
262 70128 : s->ins = NULL;
263 70128 : s->outs = NULL;
264 70128 : s->next = NULL;
265 70128 : FREE(s);
266 70128 : nfa->v->spaceused -= sizeof(struct state);
267 70128 : }
268 :
269 : /*
270 : * newarc - set up a new arc within an NFA
271 : *
272 : * This function checks to make sure that no duplicate arcs are created.
273 : * In general we never want duplicates.
274 : */
275 : static void
276 256137 : newarc(struct nfa *nfa,
277 : int t,
278 : color co,
279 : struct state *from,
280 : struct state *to)
281 : {
282 : struct arc *a;
283 :
284 256137 : assert(from != NULL && to != NULL);
285 :
286 : /*
287 : * This is a handy place to check for operation cancel during regex
288 : * compilation, since no code path will go very long without making a new
289 : * state or arc.
290 : */
291 256137 : if (CANCEL_REQUESTED(nfa->v->re))
292 : {
293 0 : NERR(REG_CANCEL);
294 0 : return;
295 : }
296 :
297 : /* check for duplicate arc, using whichever chain is shorter */
298 256137 : if (from->nouts <= to->nins)
299 : {
300 572339 : for (a = from->outs; a != NULL; a = a->outchain)
301 399681 : if (a->to == to && a->co == co && a->type == t)
302 4069 : return;
303 : }
304 : else
305 : {
306 379424 : for (a = to->ins; a != NULL; a = a->inchain)
307 305358 : if (a->from == from && a->co == co && a->type == t)
308 5344 : return;
309 : }
310 :
311 : /* no dup, so create the arc */
312 246724 : createarc(nfa, t, co, from, to);
313 : }
314 :
315 : /*
316 : * createarc - create a new arc within an NFA
317 : *
318 : * This function must *only* be used after verifying that there is no existing
319 : * identical arc (same type/color/from/to).
320 : */
321 : static void
322 2260459 : createarc(struct nfa *nfa,
323 : int t,
324 : color co,
325 : struct state *from,
326 : struct state *to)
327 : {
328 : struct arc *a;
329 :
330 : /* the arc is physically allocated within its from-state */
331 2260459 : a = allocarc(nfa, from);
332 2260459 : if (NISERR())
333 2260768 : return;
334 2260150 : assert(a != NULL);
335 :
336 2260150 : a->type = t;
337 2260150 : a->co = co;
338 2260150 : a->to = to;
339 2260150 : a->from = from;
340 :
341 : /*
342 : * Put the new arc on the beginning, not the end, of the chains; it's
343 : * simpler here, and freearc() is the same cost either way. See also the
344 : * logic in moveins() and its cohorts, as well as fixempties().
345 : */
346 2260150 : a->inchain = to->ins;
347 2260150 : a->inchainRev = NULL;
348 2260150 : if (to->ins)
349 2177998 : to->ins->inchainRev = a;
350 2260150 : to->ins = a;
351 2260150 : a->outchain = from->outs;
352 2260150 : a->outchainRev = NULL;
353 2260150 : if (from->outs)
354 2183031 : from->outs->outchainRev = a;
355 2260150 : from->outs = a;
356 :
357 2260150 : from->nouts++;
358 2260150 : to->nins++;
359 :
360 2260150 : if (COLORED(a) && nfa->parent == NULL)
361 31329 : colorchain(nfa->cm, a);
362 : }
363 :
364 : /*
365 : * allocarc - allocate a new out-arc within a state
366 : */
367 : static struct arc * /* NULL for failure */
368 2260459 : allocarc(struct nfa *nfa,
369 : struct state *s)
370 : {
371 : struct arc *a;
372 :
373 : /* shortcut */
374 2260459 : if (s->free == NULL && s->noas < ABSIZE)
375 : {
376 167029 : a = &s->oas.a[s->noas];
377 167029 : s->noas++;
378 167029 : return a;
379 : }
380 :
381 : /* if none at hand, get more */
382 2093430 : if (s->free == NULL)
383 : {
384 : struct arcbatch *newAb;
385 : int i;
386 :
387 201073 : if (nfa->v->spaceused >= REG_MAX_COMPILE_SPACE)
388 : {
389 31 : NERR(REG_ETOOBIG);
390 31 : return NULL;
391 : }
392 201042 : newAb = (struct arcbatch *) MALLOC(sizeof(struct arcbatch));
393 201042 : if (newAb == NULL)
394 : {
395 0 : NERR(REG_ESPACE);
396 0 : return NULL;
397 : }
398 201042 : nfa->v->spaceused += sizeof(struct arcbatch);
399 201042 : newAb->next = s->oas.next;
400 201042 : s->oas.next = newAb;
401 :
402 2211462 : for (i = 0; i < ABSIZE; i++)
403 : {
404 2010420 : newAb->a[i].type = 0;
405 2010420 : newAb->a[i].freechain = &newAb->a[i + 1];
406 : }
407 201042 : newAb->a[ABSIZE - 1].freechain = NULL;
408 201042 : s->free = &newAb->a[0];
409 : }
410 2093399 : assert(s->free != NULL);
411 :
412 2093399 : a = s->free;
413 2093399 : s->free = a->freechain;
414 2093399 : return a;
415 : }
416 :
417 : /*
418 : * freearc - free an arc
419 : */
420 : static void
421 212234 : freearc(struct nfa *nfa,
422 : struct arc *victim)
423 : {
424 212234 : struct state *from = victim->from;
425 212234 : struct state *to = victim->to;
426 : struct arc *predecessor;
427 :
428 212234 : assert(victim->type != 0);
429 :
430 : /* take it off color chain if necessary */
431 212234 : if (COLORED(victim) && nfa->parent == NULL)
432 22517 : uncolorchain(nfa->cm, victim);
433 :
434 : /* take it off source's out-chain */
435 212234 : assert(from != NULL);
436 212234 : predecessor = victim->outchainRev;
437 212234 : if (predecessor == NULL)
438 : {
439 99911 : assert(from->outs == victim);
440 99911 : from->outs = victim->outchain;
441 : }
442 : else
443 : {
444 112323 : assert(predecessor->outchain == victim);
445 112323 : predecessor->outchain = victim->outchain;
446 : }
447 212234 : if (victim->outchain != NULL)
448 : {
449 115596 : assert(victim->outchain->outchainRev == victim);
450 115596 : victim->outchain->outchainRev = predecessor;
451 : }
452 212234 : from->nouts--;
453 :
454 : /* take it off target's in-chain */
455 212234 : assert(to != NULL);
456 212234 : predecessor = victim->inchainRev;
457 212234 : if (predecessor == NULL)
458 : {
459 111438 : assert(to->ins == victim);
460 111438 : to->ins = victim->inchain;
461 : }
462 : else
463 : {
464 100796 : assert(predecessor->inchain == victim);
465 100796 : predecessor->inchain = victim->inchain;
466 : }
467 212234 : if (victim->inchain != NULL)
468 : {
469 113589 : assert(victim->inchain->inchainRev == victim);
470 113589 : victim->inchain->inchainRev = predecessor;
471 : }
472 212234 : to->nins--;
473 :
474 : /* clean up and place on from-state's free list */
475 212234 : victim->type = 0;
476 212234 : victim->from = NULL; /* precautions... */
477 212234 : victim->to = NULL;
478 212234 : victim->inchain = NULL;
479 212234 : victim->inchainRev = NULL;
480 212234 : victim->outchain = NULL;
481 212234 : victim->outchainRev = NULL;
482 212234 : victim->freechain = from->free;
483 212234 : from->free = victim;
484 212234 : }
485 :
486 : /*
487 : * changearctarget - flip an arc to have a different to state
488 : *
489 : * Caller must have verified that there is no pre-existing duplicate arc.
490 : *
491 : * Note that because we store arcs in their from state, we can't easily have
492 : * a similar changearcsource function.
493 : */
494 : static void
495 0 : changearctarget(struct arc *a, struct state *newto)
496 : {
497 0 : struct state *oldto = a->to;
498 : struct arc *predecessor;
499 :
500 0 : assert(oldto != newto);
501 :
502 : /* take it off old target's in-chain */
503 0 : assert(oldto != NULL);
504 0 : predecessor = a->inchainRev;
505 0 : if (predecessor == NULL)
506 : {
507 0 : assert(oldto->ins == a);
508 0 : oldto->ins = a->inchain;
509 : }
510 : else
511 : {
512 0 : assert(predecessor->inchain == a);
513 0 : predecessor->inchain = a->inchain;
514 : }
515 0 : if (a->inchain != NULL)
516 : {
517 0 : assert(a->inchain->inchainRev == a);
518 0 : a->inchain->inchainRev = predecessor;
519 : }
520 0 : oldto->nins--;
521 :
522 0 : a->to = newto;
523 :
524 : /* prepend it to new target's in-chain */
525 0 : a->inchain = newto->ins;
526 0 : a->inchainRev = NULL;
527 0 : if (newto->ins)
528 0 : newto->ins->inchainRev = a;
529 0 : newto->ins = a;
530 0 : newto->nins++;
531 0 : }
532 :
533 : /*
534 : * hasnonemptyout - Does state have a non-EMPTY out arc?
535 : */
536 : static int
537 21197 : hasnonemptyout(struct state *s)
538 : {
539 : struct arc *a;
540 :
541 30104 : for (a = s->outs; a != NULL; a = a->outchain)
542 : {
543 27570 : if (a->type != EMPTY)
544 18663 : return 1;
545 : }
546 2534 : return 0;
547 : }
548 :
549 : /*
550 : * findarc - find arc, if any, from given source with given type and color
551 : * If there is more than one such arc, the result is random.
552 : */
553 : static struct arc *
554 71 : findarc(struct state *s,
555 : int type,
556 : color co)
557 : {
558 : struct arc *a;
559 :
560 166 : for (a = s->outs; a != NULL; a = a->outchain)
561 134 : if (a->type == type && a->co == co)
562 39 : return a;
563 32 : return NULL;
564 : }
565 :
566 : /*
567 : * cparc - allocate a new arc within an NFA, copying details from old one
568 : */
569 : static void
570 201586 : cparc(struct nfa *nfa,
571 : struct arc *oa,
572 : struct state *from,
573 : struct state *to)
574 : {
575 201586 : newarc(nfa, oa->type, oa->co, from, to);
576 201586 : }
577 :
578 : /*
579 : * sortins - sort the in arcs of a state by from/color/type
580 : */
581 : static void
582 5645 : sortins(struct nfa *nfa,
583 : struct state *s)
584 : {
585 : struct arc **sortarray;
586 : struct arc *a;
587 5645 : int n = s->nins;
588 : int i;
589 :
590 5645 : if (n <= 1)
591 152 : return; /* nothing to do */
592 : /* make an array of arc pointers ... */
593 5493 : sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
594 5493 : if (sortarray == NULL)
595 : {
596 0 : NERR(REG_ESPACE);
597 0 : return;
598 : }
599 5493 : i = 0;
600 26185 : for (a = s->ins; a != NULL; a = a->inchain)
601 20692 : sortarray[i++] = a;
602 5493 : assert(i == n);
603 : /* ... sort the array */
604 5493 : qsort(sortarray, n, sizeof(struct arc *), sortins_cmp);
605 : /* ... and rebuild arc list in order */
606 : /* it seems worth special-casing first and last items to simplify loop */
607 5493 : a = sortarray[0];
608 5493 : s->ins = a;
609 5493 : a->inchain = sortarray[1];
610 5493 : a->inchainRev = NULL;
611 15199 : for (i = 1; i < n - 1; i++)
612 : {
613 9706 : a = sortarray[i];
614 9706 : a->inchain = sortarray[i + 1];
615 9706 : a->inchainRev = sortarray[i - 1];
616 : }
617 5493 : a = sortarray[i];
618 5493 : a->inchain = NULL;
619 5493 : a->inchainRev = sortarray[i - 1];
620 5493 : FREE(sortarray);
621 : }
622 :
623 : static int
624 10191167 : sortins_cmp(const void *a, const void *b)
625 : {
626 10191167 : const struct arc *aa = *((const struct arc *const *) a);
627 10191167 : const struct arc *bb = *((const struct arc *const *) b);
628 :
629 : /* we check the fields in the order they are most likely to be different */
630 10191167 : if (aa->from->no < bb->from->no)
631 8032822 : return -1;
632 2158345 : if (aa->from->no > bb->from->no)
633 2033526 : return 1;
634 124819 : if (aa->co < bb->co)
635 70818 : return -1;
636 54001 : if (aa->co > bb->co)
637 38646 : return 1;
638 15355 : if (aa->type < bb->type)
639 9144 : return -1;
640 6211 : if (aa->type > bb->type)
641 4624 : return 1;
642 1587 : return 0;
643 : }
644 :
645 : /*
646 : * sortouts - sort the out arcs of a state by to/color/type
647 : */
648 : static void
649 32 : sortouts(struct nfa *nfa,
650 : struct state *s)
651 : {
652 : struct arc **sortarray;
653 : struct arc *a;
654 32 : int n = s->nouts;
655 : int i;
656 :
657 32 : if (n <= 1)
658 16 : return; /* nothing to do */
659 : /* make an array of arc pointers ... */
660 16 : sortarray = (struct arc **) MALLOC(n * sizeof(struct arc *));
661 16 : if (sortarray == NULL)
662 : {
663 0 : NERR(REG_ESPACE);
664 0 : return;
665 : }
666 16 : i = 0;
667 544 : for (a = s->outs; a != NULL; a = a->outchain)
668 528 : sortarray[i++] = a;
669 16 : assert(i == n);
670 : /* ... sort the array */
671 16 : qsort(sortarray, n, sizeof(struct arc *), sortouts_cmp);
672 : /* ... and rebuild arc list in order */
673 : /* it seems worth special-casing first and last items to simplify loop */
674 16 : a = sortarray[0];
675 16 : s->outs = a;
676 16 : a->outchain = sortarray[1];
677 16 : a->outchainRev = NULL;
678 512 : for (i = 1; i < n - 1; i++)
679 : {
680 496 : a = sortarray[i];
681 496 : a->outchain = sortarray[i + 1];
682 496 : a->outchainRev = sortarray[i - 1];
683 : }
684 16 : a = sortarray[i];
685 16 : a->outchain = NULL;
686 16 : a->outchainRev = sortarray[i - 1];
687 16 : FREE(sortarray);
688 : }
689 :
690 : static int
691 3408 : sortouts_cmp(const void *a, const void *b)
692 : {
693 3408 : const struct arc *aa = *((const struct arc *const *) a);
694 3408 : const struct arc *bb = *((const struct arc *const *) b);
695 :
696 : /* we check the fields in the order they are most likely to be different */
697 3408 : if (aa->to->no < bb->to->no)
698 2144 : return -1;
699 1264 : if (aa->to->no > bb->to->no)
700 1264 : return 1;
701 0 : if (aa->co < bb->co)
702 0 : return -1;
703 0 : if (aa->co > bb->co)
704 0 : return 1;
705 0 : if (aa->type < bb->type)
706 0 : return -1;
707 0 : if (aa->type > bb->type)
708 0 : return 1;
709 0 : return 0;
710 : }
711 :
712 : /*
713 : * Common decision logic about whether to use arc-by-arc operations or
714 : * sort/merge. If there's just a few source arcs we cannot recoup the
715 : * cost of sorting the destination arc list, no matter how large it is.
716 : * Otherwise, limit the number of arc-by-arc comparisons to about 1000
717 : * (a somewhat arbitrary choice, but the breakeven point would probably
718 : * be machine dependent anyway).
719 : */
720 : #define BULK_ARC_OP_USE_SORT(nsrcarcs, ndestarcs) \
721 : ((nsrcarcs) < 4 ? 0 : ((nsrcarcs) > 32 || (ndestarcs) > 32))
722 :
723 : /*
724 : * moveins - move all in arcs of a state to another state
725 : *
726 : * You might think this could be done better by just updating the
727 : * existing arcs, and you would be right if it weren't for the need
728 : * for duplicate suppression, which makes it easier to just make new
729 : * ones to exploit the suppression built into newarc.
730 : *
731 : * However, if we have a whole lot of arcs to deal with, retail duplicate
732 : * checks become too slow. In that case we proceed by sorting and merging
733 : * the arc lists, and then we can indeed just update the arcs in-place.
734 : */
735 : static void
736 44169 : moveins(struct nfa *nfa,
737 : struct state *oldState,
738 : struct state *newState)
739 : {
740 44169 : assert(oldState != newState);
741 :
742 44169 : if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
743 44126 : {
744 : /* With not too many arcs, just do them one at a time */
745 : struct arc *a;
746 :
747 156107 : while ((a = oldState->ins) != NULL)
748 : {
749 67855 : cparc(nfa, a, a->from, newState);
750 67855 : freearc(nfa, a);
751 : }
752 : }
753 : else
754 : {
755 : /*
756 : * With many arcs, use a sort-merge approach. Note changearctarget()
757 : * will put the arc onto the front of newState's chain, so it does not
758 : * break our walk through the sorted part of the chain.
759 : */
760 : struct arc *oa;
761 : struct arc *na;
762 :
763 : /*
764 : * Because we bypass newarc() in this code path, we'd better include a
765 : * cancel check.
766 : */
767 43 : if (CANCEL_REQUESTED(nfa->v->re))
768 : {
769 0 : NERR(REG_CANCEL);
770 0 : return;
771 : }
772 :
773 43 : sortins(nfa, oldState);
774 43 : sortins(nfa, newState);
775 43 : if (NISERR())
776 0 : return; /* might have failed to sort */
777 43 : oa = oldState->ins;
778 43 : na = newState->ins;
779 1830 : while (oa != NULL && na != NULL)
780 : {
781 1744 : struct arc *a = oa;
782 :
783 1744 : switch (sortins_cmp(&oa, &na))
784 : {
785 : case -1:
786 : /* newState does not have anything matching oa */
787 0 : oa = oa->inchain;
788 :
789 : /*
790 : * Rather than doing createarc+freearc, we can just unlink
791 : * and relink the existing arc struct.
792 : */
793 0 : changearctarget(a, newState);
794 0 : break;
795 : case 0:
796 : /* match, advance in both lists */
797 254 : oa = oa->inchain;
798 254 : na = na->inchain;
799 : /* ... and drop duplicate arc from oldState */
800 254 : freearc(nfa, a);
801 254 : break;
802 : case +1:
803 : /* advance only na; oa might have a match later */
804 1490 : na = na->inchain;
805 1490 : break;
806 : default:
807 0 : assert(NOTREACHED);
808 : }
809 : }
810 86 : while (oa != NULL)
811 : {
812 : /* newState does not have anything matching oa */
813 0 : struct arc *a = oa;
814 :
815 0 : oa = oa->inchain;
816 0 : changearctarget(a, newState);
817 : }
818 : }
819 :
820 44169 : assert(oldState->nins == 0);
821 44169 : assert(oldState->ins == NULL);
822 : }
823 :
824 : /*
825 : * copyins - copy in arcs of a state to another state
826 : */
827 : static void
828 2618 : copyins(struct nfa *nfa,
829 : struct state *oldState,
830 : struct state *newState)
831 : {
832 2618 : assert(oldState != newState);
833 :
834 2618 : if (!BULK_ARC_OP_USE_SORT(oldState->nins, newState->nins))
835 2466 : {
836 : /* With not too many arcs, just do them one at a time */
837 : struct arc *a;
838 :
839 26208 : for (a = oldState->ins; a != NULL; a = a->inchain)
840 23742 : cparc(nfa, a, a->from, newState);
841 : }
842 : else
843 : {
844 : /*
845 : * With many arcs, use a sort-merge approach. Note that createarc()
846 : * will put new arcs onto the front of newState's chain, so it does
847 : * not break our walk through the sorted part of the chain.
848 : */
849 : struct arc *oa;
850 : struct arc *na;
851 :
852 : /*
853 : * Because we bypass newarc() in this code path, we'd better include a
854 : * cancel check.
855 : */
856 152 : if (CANCEL_REQUESTED(nfa->v->re))
857 : {
858 0 : NERR(REG_CANCEL);
859 0 : return;
860 : }
861 :
862 152 : sortins(nfa, oldState);
863 152 : sortins(nfa, newState);
864 152 : if (NISERR())
865 0 : return; /* might have failed to sort */
866 152 : oa = oldState->ins;
867 152 : na = newState->ins;
868 304 : while (oa != NULL && na != NULL)
869 : {
870 0 : struct arc *a = oa;
871 :
872 0 : switch (sortins_cmp(&oa, &na))
873 : {
874 : case -1:
875 : /* newState does not have anything matching oa */
876 0 : oa = oa->inchain;
877 0 : createarc(nfa, a->type, a->co, a->from, newState);
878 0 : break;
879 : case 0:
880 : /* match, advance in both lists */
881 0 : oa = oa->inchain;
882 0 : na = na->inchain;
883 0 : break;
884 : case +1:
885 : /* advance only na; oa might have a match later */
886 0 : na = na->inchain;
887 0 : break;
888 : default:
889 0 : assert(NOTREACHED);
890 : }
891 : }
892 6374 : while (oa != NULL)
893 : {
894 : /* newState does not have anything matching oa */
895 6070 : struct arc *a = oa;
896 :
897 6070 : oa = oa->inchain;
898 6070 : createarc(nfa, a->type, a->co, a->from, newState);
899 : }
900 : }
901 : }
902 :
903 : /*
904 : * mergeins - merge a list of inarcs into a state
905 : *
906 : * This is much like copyins, but the source arcs are listed in an array,
907 : * and are not guaranteed unique. It's okay to clobber the array contents.
908 : */
909 : static void
910 22221 : mergeins(struct nfa *nfa,
911 : struct state *s,
912 : struct arc **arcarray,
913 : int arccount)
914 : {
915 : struct arc *na;
916 : int i;
917 : int j;
918 :
919 22221 : if (arccount <= 0)
920 33932 : return;
921 :
922 : /*
923 : * Because we bypass newarc() in this code path, we'd better include a
924 : * cancel check.
925 : */
926 5255 : if (CANCEL_REQUESTED(nfa->v->re))
927 : {
928 0 : NERR(REG_CANCEL);
929 0 : return;
930 : }
931 :
932 : /* Sort existing inarcs as well as proposed new ones */
933 5255 : sortins(nfa, s);
934 5255 : if (NISERR())
935 0 : return; /* might have failed to sort */
936 :
937 5255 : qsort(arcarray, arccount, sizeof(struct arc *), sortins_cmp);
938 :
939 : /*
940 : * arcarray very likely includes dups, so we must eliminate them. (This
941 : * could be folded into the next loop, but it's not worth the trouble.)
942 : */
943 5255 : j = 0;
944 2007774 : for (i = 1; i < arccount; i++)
945 : {
946 2002519 : switch (sortins_cmp(&arcarray[j], &arcarray[i]))
947 : {
948 : case -1:
949 : /* non-dup */
950 2001891 : arcarray[++j] = arcarray[i];
951 2001891 : break;
952 : case 0:
953 : /* dup */
954 628 : break;
955 : default:
956 : /* trouble */
957 0 : assert(NOTREACHED);
958 : }
959 : }
960 5255 : arccount = j + 1;
961 :
962 : /*
963 : * Now merge into s' inchain. Note that createarc() will put new arcs
964 : * onto the front of s's chain, so it does not break our walk through the
965 : * sorted part of the chain.
966 : */
967 5255 : i = 0;
968 5255 : na = s->ins;
969 2015615 : while (i < arccount && na != NULL)
970 : {
971 2005105 : struct arc *a = arcarray[i];
972 :
973 2005105 : switch (sortins_cmp(&a, &na))
974 : {
975 : case -1:
976 : /* s does not have anything matching a */
977 1997462 : createarc(nfa, a->type, a->co, a->from, s);
978 1997462 : i++;
979 1997462 : break;
980 : case 0:
981 : /* match, advance in both lists */
982 9 : i++;
983 9 : na = na->inchain;
984 9 : break;
985 : case +1:
986 : /* advance only na; array might have a match later */
987 7634 : na = na->inchain;
988 7634 : break;
989 : default:
990 0 : assert(NOTREACHED);
991 : }
992 : }
993 20185 : while (i < arccount)
994 : {
995 : /* s does not have anything matching a */
996 9675 : struct arc *a = arcarray[i];
997 :
998 9675 : createarc(nfa, a->type, a->co, a->from, s);
999 9675 : i++;
1000 : }
1001 : }
1002 :
1003 : /*
1004 : * moveouts - move all out arcs of a state to another state
1005 : */
1006 : static void
1007 11707 : moveouts(struct nfa *nfa,
1008 : struct state *oldState,
1009 : struct state *newState)
1010 : {
1011 11707 : assert(oldState != newState);
1012 :
1013 11707 : if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1014 11707 : {
1015 : /* With not too many arcs, just do them one at a time */
1016 : struct arc *a;
1017 :
1018 37850 : while ((a = oldState->outs) != NULL)
1019 : {
1020 14436 : cparc(nfa, a, newState, a->to);
1021 14436 : freearc(nfa, a);
1022 : }
1023 : }
1024 : else
1025 : {
1026 : /*
1027 : * With many arcs, use a sort-merge approach. Note that createarc()
1028 : * will put new arcs onto the front of newState's chain, so it does
1029 : * not break our walk through the sorted part of the chain.
1030 : */
1031 : struct arc *oa;
1032 : struct arc *na;
1033 :
1034 : /*
1035 : * Because we bypass newarc() in this code path, we'd better include a
1036 : * cancel check.
1037 : */
1038 0 : if (CANCEL_REQUESTED(nfa->v->re))
1039 : {
1040 0 : NERR(REG_CANCEL);
1041 0 : return;
1042 : }
1043 :
1044 0 : sortouts(nfa, oldState);
1045 0 : sortouts(nfa, newState);
1046 0 : if (NISERR())
1047 0 : return; /* might have failed to sort */
1048 0 : oa = oldState->outs;
1049 0 : na = newState->outs;
1050 0 : while (oa != NULL && na != NULL)
1051 : {
1052 0 : struct arc *a = oa;
1053 :
1054 0 : switch (sortouts_cmp(&oa, &na))
1055 : {
1056 : case -1:
1057 : /* newState does not have anything matching oa */
1058 0 : oa = oa->outchain;
1059 0 : createarc(nfa, a->type, a->co, newState, a->to);
1060 0 : freearc(nfa, a);
1061 0 : break;
1062 : case 0:
1063 : /* match, advance in both lists */
1064 0 : oa = oa->outchain;
1065 0 : na = na->outchain;
1066 : /* ... and drop duplicate arc from oldState */
1067 0 : freearc(nfa, a);
1068 0 : break;
1069 : case +1:
1070 : /* advance only na; oa might have a match later */
1071 0 : na = na->outchain;
1072 0 : break;
1073 : default:
1074 0 : assert(NOTREACHED);
1075 : }
1076 : }
1077 0 : while (oa != NULL)
1078 : {
1079 : /* newState does not have anything matching oa */
1080 0 : struct arc *a = oa;
1081 :
1082 0 : oa = oa->outchain;
1083 0 : createarc(nfa, a->type, a->co, newState, a->to);
1084 0 : freearc(nfa, a);
1085 : }
1086 : }
1087 :
1088 11707 : assert(oldState->nouts == 0);
1089 11707 : assert(oldState->outs == NULL);
1090 : }
1091 :
1092 : /*
1093 : * copyouts - copy out arcs of a state to another state
1094 : */
1095 : static void
1096 2875 : copyouts(struct nfa *nfa,
1097 : struct state *oldState,
1098 : struct state *newState)
1099 : {
1100 2875 : assert(oldState != newState);
1101 :
1102 2875 : if (!BULK_ARC_OP_USE_SORT(oldState->nouts, newState->nouts))
1103 2859 : {
1104 : /* With not too many arcs, just do them one at a time */
1105 : struct arc *a;
1106 :
1107 14716 : for (a = oldState->outs; a != NULL; a = a->outchain)
1108 11857 : cparc(nfa, a, newState, a->to);
1109 : }
1110 : else
1111 : {
1112 : /*
1113 : * With many arcs, use a sort-merge approach. Note that createarc()
1114 : * will put new arcs onto the front of newState's chain, so it does
1115 : * not break our walk through the sorted part of the chain.
1116 : */
1117 : struct arc *oa;
1118 : struct arc *na;
1119 :
1120 : /*
1121 : * Because we bypass newarc() in this code path, we'd better include a
1122 : * cancel check.
1123 : */
1124 16 : if (CANCEL_REQUESTED(nfa->v->re))
1125 : {
1126 0 : NERR(REG_CANCEL);
1127 0 : return;
1128 : }
1129 :
1130 16 : sortouts(nfa, oldState);
1131 16 : sortouts(nfa, newState);
1132 16 : if (NISERR())
1133 0 : return; /* might have failed to sort */
1134 16 : oa = oldState->outs;
1135 16 : na = newState->outs;
1136 32 : while (oa != NULL && na != NULL)
1137 : {
1138 0 : struct arc *a = oa;
1139 :
1140 0 : switch (sortouts_cmp(&oa, &na))
1141 : {
1142 : case -1:
1143 : /* newState does not have anything matching oa */
1144 0 : oa = oa->outchain;
1145 0 : createarc(nfa, a->type, a->co, newState, a->to);
1146 0 : break;
1147 : case 0:
1148 : /* match, advance in both lists */
1149 0 : oa = oa->outchain;
1150 0 : na = na->outchain;
1151 0 : break;
1152 : case +1:
1153 : /* advance only na; oa might have a match later */
1154 0 : na = na->outchain;
1155 0 : break;
1156 : default:
1157 0 : assert(NOTREACHED);
1158 : }
1159 : }
1160 560 : while (oa != NULL)
1161 : {
1162 : /* newState does not have anything matching oa */
1163 528 : struct arc *a = oa;
1164 :
1165 528 : oa = oa->outchain;
1166 528 : createarc(nfa, a->type, a->co, newState, a->to);
1167 : }
1168 : }
1169 : }
1170 :
1171 : /*
1172 : * cloneouts - copy out arcs of a state to another state pair, modifying type
1173 : */
1174 : static void
1175 18 : cloneouts(struct nfa *nfa,
1176 : struct state *old,
1177 : struct state *from,
1178 : struct state *to,
1179 : int type)
1180 : {
1181 : struct arc *a;
1182 :
1183 18 : assert(old != from);
1184 :
1185 54 : for (a = old->outs; a != NULL; a = a->outchain)
1186 36 : newarc(nfa, type, a->co, from, to);
1187 18 : }
1188 :
1189 : /*
1190 : * delsub - delete a sub-NFA, updating subre pointers if necessary
1191 : *
1192 : * This uses a recursive traversal of the sub-NFA, marking already-seen
1193 : * states using their tmp pointer.
1194 : */
1195 : static void
1196 9 : delsub(struct nfa *nfa,
1197 : struct state *lp, /* the sub-NFA goes from here... */
1198 : struct state *rp) /* ...to here, *not* inclusive */
1199 : {
1200 9 : assert(lp != rp);
1201 :
1202 9 : rp->tmp = rp; /* mark end */
1203 :
1204 9 : deltraverse(nfa, lp, lp);
1205 9 : if (NISERR())
1206 9 : return; /* asserts might not hold after failure */
1207 9 : assert(lp->nouts == 0 && rp->nins == 0); /* did the job */
1208 9 : assert(lp->no != FREESTATE && rp->no != FREESTATE); /* no more */
1209 :
1210 9 : rp->tmp = NULL; /* unmark end */
1211 9 : lp->tmp = NULL; /* and begin, marked by deltraverse */
1212 : }
1213 :
1214 : /*
1215 : * deltraverse - the recursive heart of delsub
1216 : * This routine's basic job is to destroy all out-arcs of the state.
1217 : */
1218 : static void
1219 18 : deltraverse(struct nfa *nfa,
1220 : struct state *leftend,
1221 : struct state *s)
1222 : {
1223 : struct arc *a;
1224 : struct state *to;
1225 :
1226 : /* Since this is recursive, it could be driven to stack overflow */
1227 18 : if (STACK_TOO_DEEP(nfa->v->re))
1228 : {
1229 0 : NERR(REG_ETOOBIG);
1230 0 : return;
1231 : }
1232 :
1233 18 : if (s->nouts == 0)
1234 9 : return; /* nothing to do */
1235 9 : if (s->tmp != NULL)
1236 0 : return; /* already in progress */
1237 :
1238 9 : s->tmp = s; /* mark as in progress */
1239 :
1240 27 : while ((a = s->outs) != NULL)
1241 : {
1242 9 : to = a->to;
1243 9 : deltraverse(nfa, leftend, to);
1244 9 : if (NISERR())
1245 0 : return; /* asserts might not hold after failure */
1246 9 : assert(to->nouts == 0 || to->tmp != NULL);
1247 9 : freearc(nfa, a);
1248 9 : if (to->nins == 0 && to->tmp == NULL)
1249 : {
1250 0 : assert(to->nouts == 0);
1251 0 : freestate(nfa, to);
1252 : }
1253 : }
1254 :
1255 9 : assert(s->no != FREESTATE); /* we're still here */
1256 9 : assert(s == leftend || s->nins != 0); /* and still reachable */
1257 9 : assert(s->nouts == 0); /* but have no outarcs */
1258 :
1259 9 : s->tmp = NULL; /* we're done here */
1260 : }
1261 :
1262 : /*
1263 : * dupnfa - duplicate sub-NFA
1264 : *
1265 : * Another recursive traversal, this time using tmp to point to duplicates
1266 : * as well as mark already-seen states. (You knew there was a reason why
1267 : * it's a state pointer, didn't you? :-))
1268 : */
1269 : static void
1270 1589 : dupnfa(struct nfa *nfa,
1271 : struct state *start, /* duplicate of subNFA starting here */
1272 : struct state *stop, /* and stopping here */
1273 : struct state *from, /* stringing duplicate from here */
1274 : struct state *to) /* to here */
1275 : {
1276 1589 : if (start == stop)
1277 : {
1278 68 : newarc(nfa, EMPTY, 0, from, to);
1279 1657 : return;
1280 : }
1281 :
1282 1521 : stop->tmp = to;
1283 1521 : duptraverse(nfa, start, from);
1284 : /* done, except for clearing out the tmp pointers */
1285 :
1286 1521 : stop->tmp = NULL;
1287 1521 : cleartraverse(nfa, start);
1288 : }
1289 :
1290 : /*
1291 : * duptraverse - recursive heart of dupnfa
1292 : */
1293 : static void
1294 66157 : duptraverse(struct nfa *nfa,
1295 : struct state *s,
1296 : struct state *stmp) /* s's duplicate, or NULL */
1297 : {
1298 : struct arc *a;
1299 :
1300 : /* Since this is recursive, it could be driven to stack overflow */
1301 66157 : if (STACK_TOO_DEEP(nfa->v->re))
1302 : {
1303 0 : NERR(REG_ETOOBIG);
1304 0 : return;
1305 : }
1306 :
1307 66157 : if (s->tmp != NULL)
1308 12646 : return; /* already done */
1309 :
1310 53511 : s->tmp = (stmp == NULL) ? newstate(nfa) : stmp;
1311 53511 : if (s->tmp == NULL)
1312 : {
1313 0 : assert(NISERR());
1314 0 : return;
1315 : }
1316 :
1317 118147 : for (a = s->outs; a != NULL && !NISERR(); a = a->outchain)
1318 : {
1319 64636 : duptraverse(nfa, a->to, (struct state *) NULL);
1320 64636 : if (NISERR())
1321 0 : break;
1322 64636 : assert(a->to->tmp != NULL);
1323 64636 : cparc(nfa, a, s->tmp, a->to->tmp);
1324 : }
1325 : }
1326 :
1327 : /*
1328 : * cleartraverse - recursive cleanup for algorithms that leave tmp ptrs set
1329 : */
1330 : static void
1331 219497 : cleartraverse(struct nfa *nfa,
1332 : struct state *s)
1333 : {
1334 : struct arc *a;
1335 :
1336 : /* Since this is recursive, it could be driven to stack overflow */
1337 219497 : if (STACK_TOO_DEEP(nfa->v->re))
1338 : {
1339 0 : NERR(REG_ETOOBIG);
1340 0 : return;
1341 : }
1342 :
1343 219497 : if (s->tmp == NULL)
1344 85214 : return;
1345 134283 : s->tmp = NULL;
1346 :
1347 348702 : for (a = s->outs; a != NULL; a = a->outchain)
1348 214419 : cleartraverse(nfa, a->to);
1349 : }
1350 :
1351 : /*
1352 : * single_color_transition - does getting from s1 to s2 cross one PLAIN arc?
1353 : *
1354 : * If traversing from s1 to s2 requires a single PLAIN match (possibly of any
1355 : * of a set of colors), return a state whose outarc list contains only PLAIN
1356 : * arcs of those color(s). Otherwise return NULL.
1357 : *
1358 : * This is used before optimizing the NFA, so there may be EMPTY arcs, which
1359 : * we should ignore; the possibility of an EMPTY is why the result state could
1360 : * be different from s1.
1361 : *
1362 : * It's worth troubling to handle multiple parallel PLAIN arcs here because a
1363 : * bracket construct such as [abc] might yield either one or several parallel
1364 : * PLAIN arcs depending on earlier atoms in the expression. We'd rather that
1365 : * that implementation detail not create user-visible performance differences.
1366 : */
1367 : static struct state *
1368 25 : single_color_transition(struct state *s1, struct state *s2)
1369 : {
1370 : struct arc *a;
1371 :
1372 : /* Ignore leading EMPTY arc, if any */
1373 25 : if (s1->nouts == 1 && s1->outs->type == EMPTY)
1374 25 : s1 = s1->outs->to;
1375 : /* Likewise for any trailing EMPTY arc */
1376 25 : if (s2->nins == 1 && s2->ins->type == EMPTY)
1377 25 : s2 = s2->ins->from;
1378 : /* Perhaps we could have a single-state loop in between, if so reject */
1379 25 : if (s1 == s2)
1380 0 : return NULL;
1381 : /* s1 must have at least one outarc... */
1382 25 : if (s1->outs == NULL)
1383 0 : return NULL;
1384 : /* ... and they must all be PLAIN arcs to s2 */
1385 45 : for (a = s1->outs; a != NULL; a = a->outchain)
1386 : {
1387 27 : if (a->type != PLAIN || a->to != s2)
1388 7 : return NULL;
1389 : }
1390 : /* OK, return s1 as the possessor of the relevant outarcs */
1391 18 : return s1;
1392 : }
1393 :
1394 : /*
1395 : * specialcolors - fill in special colors for an NFA
1396 : */
1397 : static void
1398 1780 : specialcolors(struct nfa *nfa)
1399 : {
1400 : /* false colors for BOS, BOL, EOS, EOL */
1401 1780 : if (nfa->parent == NULL)
1402 : {
1403 248 : nfa->bos[0] = pseudocolor(nfa->cm);
1404 248 : nfa->bos[1] = pseudocolor(nfa->cm);
1405 248 : nfa->eos[0] = pseudocolor(nfa->cm);
1406 248 : nfa->eos[1] = pseudocolor(nfa->cm);
1407 : }
1408 : else
1409 : {
1410 1532 : assert(nfa->parent->bos[0] != COLORLESS);
1411 1532 : nfa->bos[0] = nfa->parent->bos[0];
1412 1532 : assert(nfa->parent->bos[1] != COLORLESS);
1413 1532 : nfa->bos[1] = nfa->parent->bos[1];
1414 1532 : assert(nfa->parent->eos[0] != COLORLESS);
1415 1532 : nfa->eos[0] = nfa->parent->eos[0];
1416 1532 : assert(nfa->parent->eos[1] != COLORLESS);
1417 1532 : nfa->eos[1] = nfa->parent->eos[1];
1418 : }
1419 1780 : }
1420 :
1421 : /*
1422 : * optimize - optimize an NFA
1423 : *
1424 : * The main goal of this function is not so much "optimization" (though it
1425 : * does try to get rid of useless NFA states) as reducing the NFA to a form
1426 : * the regex executor can handle. The executor, and indeed the cNFA format
1427 : * that is its input, can only handle PLAIN and LACON arcs. The output of
1428 : * the regex parser also includes EMPTY (do-nothing) arcs, as well as
1429 : * ^, $, AHEAD, and BEHIND constraint arcs, which we must get rid of here.
1430 : * We first get rid of EMPTY arcs and then deal with the constraint arcs.
1431 : * The hardest part of either job is to get rid of circular loops of the
1432 : * target arc type. We would have to do that in any case, though, as such a
1433 : * loop would otherwise allow the executor to cycle through the loop endlessly
1434 : * without making any progress in the input string.
1435 : */
1436 : static long /* re_info bits */
1437 1779 : optimize(struct nfa *nfa,
1438 : FILE *f) /* for debug output; NULL none */
1439 : {
1440 : #ifdef REG_DEBUG
1441 : int verbose = (f != NULL) ? 1 : 0;
1442 :
1443 : if (verbose)
1444 : fprintf(f, "\ninitial cleanup:\n");
1445 : #endif
1446 1779 : cleanup(nfa); /* may simplify situation */
1447 : #ifdef REG_DEBUG
1448 : if (verbose)
1449 : dumpnfa(nfa, f);
1450 : if (verbose)
1451 : fprintf(f, "\nempties:\n");
1452 : #endif
1453 1779 : fixempties(nfa, f); /* get rid of EMPTY arcs */
1454 : #ifdef REG_DEBUG
1455 : if (verbose)
1456 : fprintf(f, "\nconstraints:\n");
1457 : #endif
1458 1779 : fixconstraintloops(nfa, f); /* get rid of constraint loops */
1459 1779 : pullback(nfa, f); /* pull back constraints backward */
1460 1779 : pushfwd(nfa, f); /* push fwd constraints forward */
1461 : #ifdef REG_DEBUG
1462 : if (verbose)
1463 : fprintf(f, "\nfinal cleanup:\n");
1464 : #endif
1465 1779 : cleanup(nfa); /* final tidying */
1466 : #ifdef REG_DEBUG
1467 : if (verbose)
1468 : dumpnfa(nfa, f);
1469 : #endif
1470 1779 : return analyze(nfa); /* and analysis */
1471 : }
1472 :
1473 : /*
1474 : * pullback - pull back constraints backward to eliminate them
1475 : */
1476 : static void
1477 2377 : pullback(struct nfa *nfa,
1478 : FILE *f) /* for debug output; NULL none */
1479 : {
1480 : struct state *s;
1481 : struct state *nexts;
1482 : struct arc *a;
1483 : struct arc *nexta;
1484 : struct state *intermediates;
1485 : int progress;
1486 :
1487 : /* find and pull until there are no more */
1488 : do
1489 : {
1490 2377 : progress = 0;
1491 35177 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1492 : {
1493 32800 : nexts = s->next;
1494 32800 : intermediates = NULL;
1495 127265 : for (a = s->outs; a != NULL && !NISERR(); a = nexta)
1496 : {
1497 94465 : nexta = a->outchain;
1498 94465 : if (a->type == '^' || a->type == BEHIND)
1499 10582 : if (pull(nfa, a, &intermediates))
1500 3973 : progress = 1;
1501 : }
1502 : /* clear tmp fields of intermediate states created here */
1503 65931 : while (intermediates != NULL)
1504 : {
1505 331 : struct state *ns = intermediates->tmp;
1506 :
1507 331 : intermediates->tmp = NULL;
1508 331 : intermediates = ns;
1509 : }
1510 : /* if s is now useless, get rid of it */
1511 32800 : if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1512 3684 : dropstate(nfa, s);
1513 : }
1514 2377 : if (progress && f != NULL)
1515 0 : dumpnfa(nfa, f);
1516 2377 : } while (progress && !NISERR());
1517 1779 : if (NISERR())
1518 1780 : return;
1519 :
1520 : /*
1521 : * Any ^ constraints we were able to pull to the start state can now be
1522 : * replaced by PLAIN arcs referencing the BOS or BOL colors. There should
1523 : * be no other ^ or BEHIND arcs left in the NFA, though we do not check
1524 : * that here (compact() will fail if so).
1525 : */
1526 18140 : for (a = nfa->pre->outs; a != NULL; a = nexta)
1527 : {
1528 16362 : nexta = a->outchain;
1529 16362 : if (a->type == '^')
1530 : {
1531 5016 : assert(a->co == 0 || a->co == 1);
1532 5016 : newarc(nfa, PLAIN, nfa->bos[a->co], a->from, a->to);
1533 5016 : freearc(nfa, a);
1534 : }
1535 : }
1536 : }
1537 :
1538 : /*
1539 : * pull - pull a back constraint backward past its source state
1540 : *
1541 : * Returns 1 if successful (which it always is unless the source is the
1542 : * start state or we have an internal error), 0 if nothing happened.
1543 : *
1544 : * A significant property of this function is that it deletes no pre-existing
1545 : * states, and no outarcs of the constraint's from state other than the given
1546 : * constraint arc. This makes the loops in pullback() safe, at the cost that
1547 : * we may leave useless states behind. Therefore, we leave it to pullback()
1548 : * to delete such states.
1549 : *
1550 : * If the from state has multiple back-constraint outarcs, and/or multiple
1551 : * compatible constraint inarcs, we only need to create one new intermediate
1552 : * state per combination of predecessor and successor states. *intermediates
1553 : * points to a list of such intermediate states for this from state (chained
1554 : * through their tmp fields).
1555 : */
1556 : static int
1557 10582 : pull(struct nfa *nfa,
1558 : struct arc *con,
1559 : struct state **intermediates)
1560 : {
1561 10582 : struct state *from = con->from;
1562 10582 : struct state *to = con->to;
1563 : struct arc *a;
1564 : struct arc *nexta;
1565 : struct state *s;
1566 :
1567 10582 : assert(from != to); /* should have gotten rid of this earlier */
1568 10582 : if (from->flag) /* can't pull back beyond start */
1569 6609 : return 0;
1570 3973 : if (from->nins == 0)
1571 : { /* unreachable */
1572 573 : freearc(nfa, con);
1573 573 : return 1;
1574 : }
1575 :
1576 : /*
1577 : * First, clone from state if necessary to avoid other outarcs. This may
1578 : * seem wasteful, but it simplifies the logic, and we'll get rid of the
1579 : * clone state again at the bottom.
1580 : */
1581 3400 : if (from->nouts > 1)
1582 : {
1583 2618 : s = newstate(nfa);
1584 2618 : if (NISERR())
1585 0 : return 0;
1586 2618 : copyins(nfa, from, s); /* duplicate inarcs */
1587 2618 : cparc(nfa, con, s, to); /* move constraint arc */
1588 2618 : freearc(nfa, con);
1589 2618 : if (NISERR())
1590 0 : return 0;
1591 2618 : from = s;
1592 2618 : con = from->outs;
1593 : }
1594 3400 : assert(from->nouts == 1);
1595 :
1596 : /* propagate the constraint into the from state's inarcs */
1597 42288 : for (a = from->ins; a != NULL && !NISERR(); a = nexta)
1598 : {
1599 38888 : nexta = a->inchain;
1600 38888 : switch (combine(con, a))
1601 : {
1602 : case INCOMPATIBLE: /* destroy the arc */
1603 32198 : freearc(nfa, a);
1604 32198 : break;
1605 : case SATISFIED: /* no action needed */
1606 3686 : break;
1607 : case COMPATIBLE: /* swap the two arcs, more or less */
1608 : /* need an intermediate state, but might have one already */
1609 3639 : for (s = *intermediates; s != NULL; s = s->tmp)
1610 : {
1611 3308 : assert(s->nins > 0 && s->nouts > 0);
1612 3308 : if (s->ins->from == a->from && s->outs->to == to)
1613 2673 : break;
1614 : }
1615 3004 : if (s == NULL)
1616 : {
1617 331 : s = newstate(nfa);
1618 331 : if (NISERR())
1619 0 : return 0;
1620 331 : s->tmp = *intermediates;
1621 331 : *intermediates = s;
1622 : }
1623 3004 : cparc(nfa, con, a->from, s);
1624 3004 : cparc(nfa, a, s, to);
1625 3004 : freearc(nfa, a);
1626 3004 : break;
1627 : default:
1628 0 : assert(NOTREACHED);
1629 : break;
1630 : }
1631 : }
1632 :
1633 : /* remaining inarcs, if any, incorporate the constraint */
1634 3400 : moveins(nfa, from, to);
1635 3400 : freearc(nfa, con);
1636 : /* from state is now useless, but we leave it to pullback() to clean up */
1637 3400 : return 1;
1638 : }
1639 :
1640 : /*
1641 : * pushfwd - push forward constraints forward to eliminate them
1642 : */
1643 : static void
1644 2437 : pushfwd(struct nfa *nfa,
1645 : FILE *f) /* for debug output; NULL none */
1646 : {
1647 : struct state *s;
1648 : struct state *nexts;
1649 : struct arc *a;
1650 : struct arc *nexta;
1651 : struct state *intermediates;
1652 : int progress;
1653 :
1654 : /* find and push until there are no more */
1655 : do
1656 : {
1657 2437 : progress = 0;
1658 32579 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1659 : {
1660 30142 : nexts = s->next;
1661 30142 : intermediates = NULL;
1662 108580 : for (a = s->ins; a != NULL && !NISERR(); a = nexta)
1663 : {
1664 78438 : nexta = a->inchain;
1665 78438 : if (a->type == '$' || a->type == AHEAD)
1666 8130 : if (push(nfa, a, &intermediates))
1667 3582 : progress = 1;
1668 : }
1669 : /* clear tmp fields of intermediate states created here */
1670 60284 : while (intermediates != NULL)
1671 : {
1672 0 : struct state *ns = intermediates->tmp;
1673 :
1674 0 : intermediates->tmp = NULL;
1675 0 : intermediates = ns;
1676 : }
1677 : /* if s is now useless, get rid of it */
1678 30142 : if ((s->nins == 0 || s->nouts == 0) && !s->flag)
1679 3895 : dropstate(nfa, s);
1680 : }
1681 2437 : if (progress && f != NULL)
1682 0 : dumpnfa(nfa, f);
1683 2437 : } while (progress && !NISERR());
1684 1779 : if (NISERR())
1685 1780 : return;
1686 :
1687 : /*
1688 : * Any $ constraints we were able to push to the post state can now be
1689 : * replaced by PLAIN arcs referencing the EOS or EOL colors. There should
1690 : * be no other $ or AHEAD arcs left in the NFA, though we do not check
1691 : * that here (compact() will fail if so).
1692 : */
1693 12602 : for (a = nfa->post->ins; a != NULL; a = nexta)
1694 : {
1695 10824 : nexta = a->inchain;
1696 10824 : if (a->type == '$')
1697 : {
1698 3140 : assert(a->co == 0 || a->co == 1);
1699 3140 : newarc(nfa, PLAIN, nfa->eos[a->co], a->from, a->to);
1700 3140 : freearc(nfa, a);
1701 : }
1702 : }
1703 : }
1704 :
1705 : /*
1706 : * push - push a forward constraint forward past its destination state
1707 : *
1708 : * Returns 1 if successful (which it always is unless the destination is the
1709 : * post state or we have an internal error), 0 if nothing happened.
1710 : *
1711 : * A significant property of this function is that it deletes no pre-existing
1712 : * states, and no inarcs of the constraint's to state other than the given
1713 : * constraint arc. This makes the loops in pushfwd() safe, at the cost that
1714 : * we may leave useless states behind. Therefore, we leave it to pushfwd()
1715 : * to delete such states.
1716 : *
1717 : * If the to state has multiple forward-constraint inarcs, and/or multiple
1718 : * compatible constraint outarcs, we only need to create one new intermediate
1719 : * state per combination of predecessor and successor states. *intermediates
1720 : * points to a list of such intermediate states for this to state (chained
1721 : * through their tmp fields).
1722 : */
1723 : static int
1724 8130 : push(struct nfa *nfa,
1725 : struct arc *con,
1726 : struct state **intermediates)
1727 : {
1728 8130 : struct state *from = con->from;
1729 8130 : struct state *to = con->to;
1730 : struct arc *a;
1731 : struct arc *nexta;
1732 : struct state *s;
1733 :
1734 8130 : assert(to != from); /* should have gotten rid of this earlier */
1735 8130 : if (to->flag) /* can't push forward beyond end */
1736 4548 : return 0;
1737 3582 : if (to->nouts == 0)
1738 : { /* dead end */
1739 67 : freearc(nfa, con);
1740 67 : return 1;
1741 : }
1742 :
1743 : /*
1744 : * First, clone to state if necessary to avoid other inarcs. This may
1745 : * seem wasteful, but it simplifies the logic, and we'll get rid of the
1746 : * clone state again at the bottom.
1747 : */
1748 3515 : if (to->nins > 1)
1749 : {
1750 2820 : s = newstate(nfa);
1751 2820 : if (NISERR())
1752 0 : return 0;
1753 2820 : copyouts(nfa, to, s); /* duplicate outarcs */
1754 2820 : cparc(nfa, con, from, s); /* move constraint arc */
1755 2820 : freearc(nfa, con);
1756 2820 : if (NISERR())
1757 0 : return 0;
1758 2820 : to = s;
1759 2820 : con = to->ins;
1760 : }
1761 3515 : assert(to->nins == 1);
1762 :
1763 : /* propagate the constraint into the to state's outarcs */
1764 21842 : for (a = to->outs; a != NULL && !NISERR(); a = nexta)
1765 : {
1766 18327 : nexta = a->outchain;
1767 18327 : switch (combine(con, a))
1768 : {
1769 : case INCOMPATIBLE: /* destroy the arc */
1770 16274 : freearc(nfa, a);
1771 16274 : break;
1772 : case SATISFIED: /* no action needed */
1773 2053 : break;
1774 : case COMPATIBLE: /* swap the two arcs, more or less */
1775 : /* need an intermediate state, but might have one already */
1776 0 : for (s = *intermediates; s != NULL; s = s->tmp)
1777 : {
1778 0 : assert(s->nins > 0 && s->nouts > 0);
1779 0 : if (s->ins->from == from && s->outs->to == a->to)
1780 0 : break;
1781 : }
1782 0 : if (s == NULL)
1783 : {
1784 0 : s = newstate(nfa);
1785 0 : if (NISERR())
1786 0 : return 0;
1787 0 : s->tmp = *intermediates;
1788 0 : *intermediates = s;
1789 : }
1790 0 : cparc(nfa, con, s, a->to);
1791 0 : cparc(nfa, a, from, s);
1792 0 : freearc(nfa, a);
1793 0 : break;
1794 : default:
1795 0 : assert(NOTREACHED);
1796 : break;
1797 : }
1798 : }
1799 :
1800 : /* remaining outarcs, if any, incorporate the constraint */
1801 3515 : moveouts(nfa, to, from);
1802 3515 : freearc(nfa, con);
1803 : /* to state is now useless, but we leave it to pushfwd() to clean up */
1804 3515 : return 1;
1805 : }
1806 :
1807 : /*
1808 : * combine - constraint lands on an arc, what happens?
1809 : *
1810 : * #def INCOMPATIBLE 1 // destroys arc
1811 : * #def SATISFIED 2 // constraint satisfied
1812 : * #def COMPATIBLE 3 // compatible but not satisfied yet
1813 : */
1814 : static int
1815 57215 : combine(struct arc *con,
1816 : struct arc *a)
1817 : {
1818 : #define CA(ct,at) (((ct)<<CHAR_BIT) | (at))
1819 :
1820 57215 : switch (CA(con->type, a->type))
1821 : {
1822 : case CA('^', PLAIN): /* newlines are handled separately */
1823 : case CA('$', PLAIN):
1824 44562 : return INCOMPATIBLE;
1825 : break;
1826 : case CA(AHEAD, PLAIN): /* color constraints meet colors */
1827 : case CA(BEHIND, PLAIN):
1828 1281 : if (con->co == a->co)
1829 270 : return SATISFIED;
1830 1011 : return INCOMPATIBLE;
1831 : break;
1832 : case CA('^', '^'): /* collision, similar constraints */
1833 : case CA('$', '$'):
1834 : case CA(AHEAD, AHEAD):
1835 : case CA(BEHIND, BEHIND):
1836 7790 : if (con->co == a->co) /* true duplication */
1837 5469 : return SATISFIED;
1838 2321 : return INCOMPATIBLE;
1839 : break;
1840 : case CA('^', BEHIND): /* collision, dissimilar constraints */
1841 : case CA(BEHIND, '^'):
1842 : case CA('$', AHEAD):
1843 : case CA(AHEAD, '$'):
1844 578 : return INCOMPATIBLE;
1845 : break;
1846 : case CA('^', '$'): /* constraints passing each other */
1847 : case CA('^', AHEAD):
1848 : case CA(BEHIND, '$'):
1849 : case CA(BEHIND, AHEAD):
1850 : case CA('$', '^'):
1851 : case CA('$', BEHIND):
1852 : case CA(AHEAD, '^'):
1853 : case CA(AHEAD, BEHIND):
1854 : case CA('^', LACON):
1855 : case CA(BEHIND, LACON):
1856 : case CA('$', LACON):
1857 : case CA(AHEAD, LACON):
1858 3004 : return COMPATIBLE;
1859 : break;
1860 : }
1861 0 : assert(NOTREACHED);
1862 : return INCOMPATIBLE; /* for benefit of blind compilers */
1863 : }
1864 :
1865 : /*
1866 : * fixempties - get rid of EMPTY arcs
1867 : */
1868 : static void
1869 1779 : fixempties(struct nfa *nfa,
1870 : FILE *f) /* for debug output; NULL none */
1871 : {
1872 : struct state *s;
1873 : struct state *s2;
1874 : struct state *nexts;
1875 : struct arc *a;
1876 : struct arc *nexta;
1877 : int totalinarcs;
1878 : struct arc **inarcsorig;
1879 : struct arc **arcarray;
1880 : int arccount;
1881 : int prevnins;
1882 : int nskip;
1883 :
1884 : /*
1885 : * First, get rid of any states whose sole out-arc is an EMPTY, since
1886 : * they're basically just aliases for their successor. The parsing
1887 : * algorithm creates enough of these that it's worth special-casing this.
1888 : */
1889 64886 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1890 : {
1891 63107 : nexts = s->next;
1892 63107 : if (s->flag || s->nouts != 1)
1893 15715 : continue;
1894 47392 : a = s->outs;
1895 47392 : assert(a != NULL && a->outchain == NULL);
1896 47392 : if (a->type != EMPTY)
1897 14890 : continue;
1898 32502 : if (s != a->to)
1899 32502 : moveins(nfa, s, a->to);
1900 32502 : dropstate(nfa, s);
1901 : }
1902 :
1903 : /*
1904 : * Similarly, get rid of any state with a single EMPTY in-arc, by folding
1905 : * it into its predecessor.
1906 : */
1907 32384 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
1908 : {
1909 30605 : nexts = s->next;
1910 : /* while we're at it, ensure tmp fields are clear for next step */
1911 30605 : assert(s->tmp == NULL);
1912 30605 : if (s->flag || s->nins != 1)
1913 12633 : continue;
1914 17972 : a = s->ins;
1915 17972 : assert(a != NULL && a->inchain == NULL);
1916 17972 : if (a->type != EMPTY)
1917 13136 : continue;
1918 4836 : if (s != a->from)
1919 4836 : moveouts(nfa, s, a->from);
1920 4836 : dropstate(nfa, s);
1921 : }
1922 :
1923 1779 : if (NISERR())
1924 0 : return;
1925 :
1926 : /*
1927 : * For each remaining NFA state, find all other states from which it is
1928 : * reachable by a chain of one or more EMPTY arcs. Then generate new arcs
1929 : * that eliminate the need for each such chain.
1930 : *
1931 : * We could replace a chain of EMPTY arcs that leads from a "from" state
1932 : * to a "to" state either by pushing non-EMPTY arcs forward (linking
1933 : * directly from "from"'s predecessors to "to") or by pulling them back
1934 : * (linking directly from "from" to "to"'s successors). We choose to
1935 : * always do the former; this choice is somewhat arbitrary, but the
1936 : * approach below requires that we uniformly do one or the other.
1937 : *
1938 : * Suppose we have a chain of N successive EMPTY arcs (where N can easily
1939 : * approach the size of the NFA). All of the intermediate states must
1940 : * have additional inarcs and outarcs, else they'd have been removed by
1941 : * the steps above. Assuming their inarcs are mostly not empties, we will
1942 : * add O(N^2) arcs to the NFA, since a non-EMPTY inarc leading to any one
1943 : * state in the chain must be duplicated to lead to all its successor
1944 : * states as well. So there is no hope of doing less than O(N^2) work;
1945 : * however, we should endeavor to keep the big-O cost from being even
1946 : * worse than that, which it can easily become without care. In
1947 : * particular, suppose we were to copy all S1's inarcs forward to S2, and
1948 : * then also to S3, and then later we consider pushing S2's inarcs forward
1949 : * to S3. If we include the arcs already copied from S1 in that, we'd be
1950 : * doing O(N^3) work. (The duplicate-arc elimination built into newarc()
1951 : * and its cohorts would get rid of the extra arcs, but not without cost.)
1952 : *
1953 : * We can avoid this cost by treating only arcs that existed at the start
1954 : * of this phase as candidates to be pushed forward. To identify those,
1955 : * we remember the first inarc each state had to start with. We rely on
1956 : * the fact that newarc() and friends put new arcs on the front of their
1957 : * to-states' inchains, and that this phase never deletes arcs, so that
1958 : * the original arcs must be the last arcs in their to-states' inchains.
1959 : *
1960 : * So the process here is that, for each state in the NFA, we gather up
1961 : * all non-EMPTY inarcs of states that can reach the target state via
1962 : * EMPTY arcs. We then sort, de-duplicate, and merge these arcs into the
1963 : * target state's inchain. (We can safely use sort-merge for this as long
1964 : * as we update each state's original-arcs pointer after we add arcs to
1965 : * it; the sort step of mergeins probably changed the order of the old
1966 : * arcs.)
1967 : *
1968 : * Another refinement worth making is that, because we only add non-EMPTY
1969 : * arcs during this phase, and all added arcs have the same from-state as
1970 : * the non-EMPTY arc they were cloned from, we know ahead of time that any
1971 : * states having only EMPTY outarcs will be useless for lack of outarcs
1972 : * after we drop the EMPTY arcs. (They cannot gain non-EMPTY outarcs if
1973 : * they had none to start with.) So we need not bother to update the
1974 : * inchains of such states at all.
1975 : */
1976 :
1977 : /* Remember the states' first original inarcs */
1978 : /* ... and while at it, count how many old inarcs there are altogether */
1979 1779 : inarcsorig = (struct arc **) MALLOC(nfa->nstates * sizeof(struct arc *));
1980 1779 : if (inarcsorig == NULL)
1981 : {
1982 0 : NERR(REG_ESPACE);
1983 0 : return;
1984 : }
1985 1779 : totalinarcs = 0;
1986 27548 : for (s = nfa->states; s != NULL; s = s->next)
1987 : {
1988 25769 : inarcsorig[s->no] = s->ins;
1989 25769 : totalinarcs += s->nins;
1990 : }
1991 :
1992 : /*
1993 : * Create a workspace for accumulating the inarcs to be added to the
1994 : * current target state. totalinarcs is probably a considerable
1995 : * overestimate of the space needed, but the NFA is unlikely to be large
1996 : * enough at this point to make it worth being smarter.
1997 : */
1998 1779 : arcarray = (struct arc **) MALLOC(totalinarcs * sizeof(struct arc *));
1999 1779 : if (arcarray == NULL)
2000 : {
2001 0 : NERR(REG_ESPACE);
2002 0 : FREE(inarcsorig);
2003 0 : return;
2004 : }
2005 :
2006 : /* And iterate over the target states */
2007 26534 : for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2008 : {
2009 : /* Ignore target states without non-EMPTY outarcs, per note above */
2010 24755 : if (!s->flag && !hasnonemptyout(s))
2011 2534 : continue;
2012 :
2013 : /* Find predecessor states and accumulate their original inarcs */
2014 22221 : arccount = 0;
2015 2022239 : for (s2 = emptyreachable(nfa, s, s, inarcsorig); s2 != s; s2 = nexts)
2016 : {
2017 : /* Add s2's original inarcs to arcarray[], but ignore empties */
2018 6024400 : for (a = inarcsorig[s2->no]; a != NULL; a = a->inchain)
2019 : {
2020 4024382 : if (a->type != EMPTY)
2021 2007774 : arcarray[arccount++] = a;
2022 : }
2023 :
2024 : /* Reset the tmp fields as we walk back */
2025 2000018 : nexts = s2->tmp;
2026 2000018 : s2->tmp = NULL;
2027 : }
2028 22221 : s->tmp = NULL;
2029 22221 : assert(arccount <= totalinarcs);
2030 :
2031 : /* Remember how many original inarcs this state has */
2032 22221 : prevnins = s->nins;
2033 :
2034 : /* Add non-duplicate inarcs to target state */
2035 22221 : mergeins(nfa, s, arcarray, arccount);
2036 :
2037 : /* Now we must update the state's inarcsorig pointer */
2038 22221 : nskip = s->nins - prevnins;
2039 22221 : a = s->ins;
2040 2051270 : while (nskip-- > 0)
2041 2006828 : a = a->inchain;
2042 22221 : inarcsorig[s->no] = a;
2043 : }
2044 :
2045 1779 : FREE(arcarray);
2046 1779 : FREE(inarcsorig);
2047 :
2048 1779 : if (NISERR())
2049 1 : return;
2050 :
2051 : /*
2052 : * Now remove all the EMPTY arcs, since we don't need them anymore.
2053 : */
2054 24545 : for (s = nfa->states; s != NULL; s = s->next)
2055 : {
2056 104535 : for (a = s->outs; a != NULL; a = nexta)
2057 : {
2058 81768 : nexta = a->outchain;
2059 81768 : if (a->type == EMPTY)
2060 8775 : freearc(nfa, a);
2061 : }
2062 : }
2063 :
2064 : /*
2065 : * And remove any states that have become useless. (This cleanup is not
2066 : * very thorough, and would be even less so if we tried to combine it with
2067 : * the previous step; but cleanup() will take care of anything we miss.)
2068 : */
2069 24545 : for (s = nfa->states; s != NULL; s = nexts)
2070 : {
2071 22767 : nexts = s->next;
2072 22767 : if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2073 2534 : dropstate(nfa, s);
2074 : }
2075 :
2076 1778 : if (f != NULL)
2077 0 : dumpnfa(nfa, f);
2078 : }
2079 :
2080 : /*
2081 : * emptyreachable - recursively find all states that can reach s by EMPTY arcs
2082 : *
2083 : * The return value is the last such state found. Its tmp field links back
2084 : * to the next-to-last such state, and so on back to s, so that all these
2085 : * states can be located without searching the whole NFA.
2086 : *
2087 : * Since this is only used in fixempties(), we pass in the inarcsorig[] array
2088 : * maintained by that function. This lets us skip over all new inarcs, which
2089 : * are certainly not EMPTY arcs.
2090 : *
2091 : * The maximum recursion depth here is equal to the length of the longest
2092 : * loop-free chain of EMPTY arcs, which is surely no more than the size of
2093 : * the NFA ... but that could still be enough to cause trouble.
2094 : */
2095 : static struct state *
2096 2022239 : emptyreachable(struct nfa *nfa,
2097 : struct state *s,
2098 : struct state *lastfound,
2099 : struct arc **inarcsorig)
2100 : {
2101 : struct arc *a;
2102 :
2103 : /* Since this is recursive, it could be driven to stack overflow */
2104 2022239 : if (STACK_TOO_DEEP(nfa->v->re))
2105 : {
2106 0 : NERR(REG_ETOOBIG);
2107 0 : return lastfound;
2108 : }
2109 :
2110 2022239 : s->tmp = lastfound;
2111 2022239 : lastfound = s;
2112 6102589 : for (a = inarcsorig[s->no]; a != NULL; a = a->inchain)
2113 : {
2114 4080350 : if (a->type == EMPTY && a->from->tmp == NULL)
2115 2000018 : lastfound = emptyreachable(nfa, a->from, lastfound, inarcsorig);
2116 : }
2117 2022239 : return lastfound;
2118 : }
2119 :
2120 : /*
2121 : * isconstraintarc - detect whether an arc is of a constraint type
2122 : */
2123 : static inline int
2124 181085 : isconstraintarc(struct arc *a)
2125 : {
2126 181085 : switch (a->type)
2127 : {
2128 : case '^':
2129 : case '$':
2130 : case BEHIND:
2131 : case AHEAD:
2132 : case LACON:
2133 44107 : return 1;
2134 : }
2135 136978 : return 0;
2136 : }
2137 :
2138 : /*
2139 : * hasconstraintout - does state have a constraint out arc?
2140 : */
2141 : static int
2142 6806 : hasconstraintout(struct state *s)
2143 : {
2144 : struct arc *a;
2145 :
2146 17006 : for (a = s->outs; a != NULL; a = a->outchain)
2147 : {
2148 13257 : if (isconstraintarc(a))
2149 3057 : return 1;
2150 : }
2151 3749 : return 0;
2152 : }
2153 :
2154 : /*
2155 : * fixconstraintloops - get rid of loops containing only constraint arcs
2156 : *
2157 : * A loop of states that contains only constraint arcs is useless, since
2158 : * passing around the loop represents no forward progress. Moreover, it
2159 : * would cause infinite looping in pullback/pushfwd, so we need to get rid
2160 : * of such loops before doing that.
2161 : */
2162 : static void
2163 1779 : fixconstraintloops(struct nfa *nfa,
2164 : FILE *f) /* for debug output; NULL none */
2165 : {
2166 : struct state *s;
2167 : struct state *nexts;
2168 : struct arc *a;
2169 : struct arc *nexta;
2170 : int hasconstraints;
2171 :
2172 : /*
2173 : * In the trivial case of a state that loops to itself, we can just drop
2174 : * the constraint arc altogether. This is worth special-casing because
2175 : * such loops are far more common than loops containing multiple states.
2176 : * While we're at it, note whether any constraint arcs survive.
2177 : */
2178 1779 : hasconstraints = 0;
2179 22012 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2180 : {
2181 20233 : nexts = s->next;
2182 : /* while we're at it, ensure tmp fields are clear for next step */
2183 20233 : assert(s->tmp == NULL);
2184 90740 : for (a = s->outs; a != NULL && !NISERR(); a = nexta)
2185 : {
2186 70507 : nexta = a->outchain;
2187 70507 : if (isconstraintarc(a))
2188 : {
2189 12942 : if (a->to == s)
2190 177 : freearc(nfa, a);
2191 : else
2192 12765 : hasconstraints = 1;
2193 : }
2194 : }
2195 : /* If we removed all the outarcs, the state is useless. */
2196 20233 : if (s->nouts == 0 && !s->flag)
2197 0 : dropstate(nfa, s);
2198 : }
2199 :
2200 : /* Nothing to do if no remaining constraint arcs */
2201 1779 : if (NISERR() || !hasconstraints)
2202 1 : return;
2203 :
2204 : /*
2205 : * Starting from each remaining NFA state, search outwards for a
2206 : * constraint loop. If we find a loop, break the loop, then start the
2207 : * search over. (We could possibly retain some state from the first scan,
2208 : * but it would complicate things greatly, and multi-state constraint
2209 : * loops are rare enough that it's not worth optimizing the case.)
2210 : */
2211 : restart:
2212 24082 : for (s = nfa->states; s != NULL && !NISERR(); s = s->next)
2213 : {
2214 22304 : if (findconstraintloop(nfa, s))
2215 130 : goto restart;
2216 : }
2217 :
2218 1778 : if (NISERR())
2219 0 : return;
2220 :
2221 : /*
2222 : * Now remove any states that have become useless. (This cleanup is not
2223 : * very thorough, and would be even less so if we tried to combine it with
2224 : * the previous step; but cleanup() will take care of anything we miss.)
2225 : *
2226 : * Because findconstraintloop intentionally doesn't reset all tmp fields,
2227 : * we have to clear them after it's done. This is a convenient place to
2228 : * do that, too.
2229 : */
2230 22278 : for (s = nfa->states; s != NULL; s = nexts)
2231 : {
2232 20500 : nexts = s->next;
2233 20500 : s->tmp = NULL;
2234 20500 : if ((s->nins == 0 || s->nouts == 0) && !s->flag)
2235 75 : dropstate(nfa, s);
2236 : }
2237 :
2238 1778 : if (f != NULL)
2239 0 : dumpnfa(nfa, f);
2240 : }
2241 :
2242 : /*
2243 : * findconstraintloop - recursively find a loop of constraint arcs
2244 : *
2245 : * If we find a loop, break it by calling breakconstraintloop(), then
2246 : * return 1; otherwise return 0.
2247 : *
2248 : * State tmp fields are guaranteed all NULL on a success return, because
2249 : * breakconstraintloop does that. After a failure return, any state that
2250 : * is known not to be part of a loop is marked with s->tmp == s; this allows
2251 : * us not to have to re-prove that fact on later calls. (This convention is
2252 : * workable because we already eliminated single-state loops.)
2253 : *
2254 : * Note that the found loop doesn't necessarily include the first state we
2255 : * are called on. Any loop reachable from that state will do.
2256 : *
2257 : * The maximum recursion depth here is one more than the length of the longest
2258 : * loop-free chain of constraint arcs, which is surely no more than the size
2259 : * of the NFA ... but that could still be enough to cause trouble.
2260 : */
2261 : static int
2262 43107 : findconstraintloop(struct nfa *nfa, struct state *s)
2263 : {
2264 : struct arc *a;
2265 :
2266 : /* Since this is recursive, it could be driven to stack overflow */
2267 43107 : if (STACK_TOO_DEEP(nfa->v->re))
2268 : {
2269 0 : NERR(REG_ETOOBIG);
2270 0 : return 1; /* to exit as quickly as possible */
2271 : }
2272 :
2273 43107 : if (s->tmp != NULL)
2274 : {
2275 : /* Already proven uninteresting? */
2276 18978 : if (s->tmp == s)
2277 18848 : return 0;
2278 : /* Found a loop involving s */
2279 130 : breakconstraintloop(nfa, s);
2280 : /* The tmp fields have been cleaned up by breakconstraintloop */
2281 130 : return 1;
2282 : }
2283 111863 : for (a = s->outs; a != NULL; a = a->outchain)
2284 : {
2285 88148 : if (isconstraintarc(a))
2286 : {
2287 20803 : struct state *sto = a->to;
2288 :
2289 20803 : assert(sto != s);
2290 20803 : s->tmp = sto;
2291 20803 : if (findconstraintloop(nfa, sto))
2292 414 : return 1;
2293 : }
2294 : }
2295 :
2296 : /*
2297 : * If we get here, no constraint loop exists leading out from s. Mark it
2298 : * with s->tmp == s so we need not rediscover that fact again later.
2299 : */
2300 23715 : s->tmp = s;
2301 23715 : return 0;
2302 : }
2303 :
2304 : /*
2305 : * breakconstraintloop - break a loop of constraint arcs
2306 : *
2307 : * sinitial is any one member state of the loop. Each loop member's tmp
2308 : * field links to its successor within the loop. (Note that this function
2309 : * will reset all the tmp fields to NULL.)
2310 : *
2311 : * We can break the loop by, for any one state S1 in the loop, cloning its
2312 : * loop successor state S2 (and possibly following states), and then moving
2313 : * all S1->S2 constraint arcs to point to the cloned S2. The cloned S2 should
2314 : * copy any non-constraint outarcs of S2. Constraint outarcs should be
2315 : * dropped if they point back to S1, else they need to be copied as arcs to
2316 : * similarly cloned states S3, S4, etc. In general, each cloned state copies
2317 : * non-constraint outarcs, drops constraint outarcs that would lead to itself
2318 : * or any earlier cloned state, and sends other constraint outarcs to newly
2319 : * cloned states. No cloned state will have any inarcs that aren't constraint
2320 : * arcs or do not lead from S1 or earlier-cloned states. It's okay to drop
2321 : * constraint back-arcs since they would not take us to any state we've not
2322 : * already been in; therefore, no new constraint loop is created. In this way
2323 : * we generate a modified NFA that can still represent every useful state
2324 : * sequence, but not sequences that represent state loops with no consumption
2325 : * of input data. Note that the set of cloned states will certainly include
2326 : * all of the loop member states other than S1, and it may also include
2327 : * non-loop states that are reachable from S2 via constraint arcs. This is
2328 : * important because there is no guarantee that findconstraintloop found a
2329 : * maximal loop (and searching for one would be NP-hard, so don't try).
2330 : * Frequently the "non-loop states" are actually part of a larger loop that
2331 : * we didn't notice, and indeed there may be several overlapping loops.
2332 : * This technique ensures convergence in such cases, while considering only
2333 : * the originally-found loop does not.
2334 : *
2335 : * If there is only one S1->S2 constraint arc, then that constraint is
2336 : * certainly satisfied when we enter any of the clone states. This means that
2337 : * in the common case where many of the constraint arcs are identically
2338 : * labeled, we can merge together clone states linked by a similarly-labeled
2339 : * constraint: if we can get to the first one we can certainly get to the
2340 : * second, so there's no need to distinguish. This greatly reduces the number
2341 : * of new states needed, so we preferentially break the given loop at a state
2342 : * pair where this is true.
2343 : *
2344 : * Furthermore, it's fairly common to find that a cloned successor state has
2345 : * no outarcs, especially if we're a bit aggressive about removing unnecessary
2346 : * outarcs. If that happens, then there is simply not any interesting state
2347 : * that can be reached through the predecessor's loop arcs, which means we can
2348 : * break the loop just by removing those loop arcs, with no new states added.
2349 : */
2350 : static void
2351 130 : breakconstraintloop(struct nfa *nfa, struct state *sinitial)
2352 : {
2353 : struct state *s;
2354 : struct state *shead;
2355 : struct state *stail;
2356 : struct state *sclone;
2357 : struct state *nexts;
2358 : struct arc *refarc;
2359 : struct arc *a;
2360 : struct arc *nexta;
2361 :
2362 : /*
2363 : * Start by identifying which loop step we want to break at.
2364 : * Preferentially this is one with only one constraint arc. (XXX are
2365 : * there any other secondary heuristics we want to use here?) Set refarc
2366 : * to point to the selected lone constraint arc, if there is one.
2367 : */
2368 130 : refarc = NULL;
2369 130 : s = sinitial;
2370 : do
2371 : {
2372 296 : nexts = s->tmp;
2373 296 : assert(nexts != s); /* should not see any one-element loops */
2374 296 : if (refarc == NULL)
2375 : {
2376 170 : int narcs = 0;
2377 :
2378 1591 : for (a = s->outs; a != NULL; a = a->outchain)
2379 : {
2380 1421 : if (a->to == nexts && isconstraintarc(a))
2381 : {
2382 325 : refarc = a;
2383 325 : narcs++;
2384 : }
2385 : }
2386 170 : assert(narcs > 0);
2387 170 : if (narcs > 1)
2388 58 : refarc = NULL; /* multiple constraint arcs here, no good */
2389 : }
2390 296 : s = nexts;
2391 296 : } while (s != sinitial);
2392 :
2393 130 : if (refarc)
2394 : {
2395 : /* break at the refarc */
2396 112 : shead = refarc->from;
2397 112 : stail = refarc->to;
2398 112 : assert(stail == shead->tmp);
2399 : }
2400 : else
2401 : {
2402 : /* for lack of a better idea, break after sinitial */
2403 18 : shead = sinitial;
2404 18 : stail = sinitial->tmp;
2405 : }
2406 :
2407 : /*
2408 : * Reset the tmp fields so that we can use them for local storage in
2409 : * clonesuccessorstates. (findconstraintloop won't mind, since it's just
2410 : * going to abandon its search anyway.)
2411 : */
2412 14592 : for (s = nfa->states; s != NULL; s = s->next)
2413 14462 : s->tmp = NULL;
2414 :
2415 : /*
2416 : * Recursively build clone state(s) as needed.
2417 : */
2418 130 : sclone = newstate(nfa);
2419 130 : if (sclone == NULL)
2420 : {
2421 0 : assert(NISERR());
2422 0 : return;
2423 : }
2424 :
2425 130 : clonesuccessorstates(nfa, stail, sclone, shead, refarc,
2426 : NULL, NULL, nfa->nstates);
2427 :
2428 130 : if (NISERR())
2429 0 : return;
2430 :
2431 : /*
2432 : * It's possible that sclone has no outarcs at all, in which case it's
2433 : * useless. (We don't try extremely hard to get rid of useless states
2434 : * here, but this is an easy and fairly common case.)
2435 : */
2436 130 : if (sclone->nouts == 0)
2437 : {
2438 23 : freestate(nfa, sclone);
2439 23 : sclone = NULL;
2440 : }
2441 :
2442 : /*
2443 : * Move shead's constraint-loop arcs to point to sclone, or just drop them
2444 : * if we discovered we don't need sclone.
2445 : */
2446 1325 : for (a = shead->outs; a != NULL; a = nexta)
2447 : {
2448 1195 : nexta = a->outchain;
2449 1195 : if (a->to == stail && isconstraintarc(a))
2450 : {
2451 174 : if (sclone)
2452 143 : cparc(nfa, a, shead, sclone);
2453 174 : freearc(nfa, a);
2454 174 : if (NISERR())
2455 0 : break;
2456 : }
2457 : }
2458 : }
2459 :
2460 : /*
2461 : * clonesuccessorstates - create a tree of constraint-arc successor states
2462 : *
2463 : * ssource is the state to be cloned, and sclone is the state to copy its
2464 : * outarcs into. sclone's inarcs, if any, should already be set up.
2465 : *
2466 : * spredecessor is the original predecessor state that we are trying to build
2467 : * successors for (it may not be the immediate predecessor of ssource).
2468 : * refarc, if not NULL, is the original constraint arc that is known to have
2469 : * been traversed out of spredecessor to reach the successor(s).
2470 : *
2471 : * For each cloned successor state, we transiently create a "donemap" that is
2472 : * a boolean array showing which source states we've already visited for this
2473 : * clone state. This prevents infinite recursion as well as useless repeat
2474 : * visits to the same state subtree (which can add up fast, since typical NFAs
2475 : * have multiple redundant arc pathways). Each donemap is a char array
2476 : * indexed by state number. The donemaps are all of the same size "nstates",
2477 : * which is nfa->nstates as of the start of the recursion. This is enough to
2478 : * have entries for all pre-existing states, but *not* entries for clone
2479 : * states created during the recursion. That's okay since we have no need to
2480 : * mark those.
2481 : *
2482 : * curdonemap is NULL when recursing to a new sclone state, or sclone's
2483 : * donemap when we are recursing without having created a new state (which we
2484 : * do when we decide we can merge a successor state into the current clone
2485 : * state). outerdonemap is NULL at the top level and otherwise the parent
2486 : * clone state's donemap.
2487 : *
2488 : * The successor states we create and fill here form a strict tree structure,
2489 : * with each state having exactly one predecessor, except that the toplevel
2490 : * state has no inarcs as yet (breakconstraintloop will add its inarcs from
2491 : * spredecessor after we're done). Thus, we can examine sclone's inarcs back
2492 : * to the root, plus refarc if any, to identify the set of constraints already
2493 : * known valid at the current point. This allows us to avoid generating extra
2494 : * successor states.
2495 : */
2496 : static void
2497 1094 : clonesuccessorstates(struct nfa *nfa,
2498 : struct state *ssource,
2499 : struct state *sclone,
2500 : struct state *spredecessor,
2501 : struct arc *refarc,
2502 : char *curdonemap,
2503 : char *outerdonemap,
2504 : int nstates)
2505 : {
2506 : char *donemap;
2507 : struct arc *a;
2508 :
2509 : /* Since this is recursive, it could be driven to stack overflow */
2510 1094 : if (STACK_TOO_DEEP(nfa->v->re))
2511 : {
2512 0 : NERR(REG_ETOOBIG);
2513 0 : return;
2514 : }
2515 :
2516 : /* If this state hasn't already got a donemap, create one */
2517 1094 : donemap = curdonemap;
2518 1094 : if (donemap == NULL)
2519 : {
2520 290 : donemap = (char *) MALLOC(nstates * sizeof(char));
2521 290 : if (donemap == NULL)
2522 : {
2523 0 : NERR(REG_ESPACE);
2524 0 : return;
2525 : }
2526 :
2527 290 : if (outerdonemap != NULL)
2528 : {
2529 : /*
2530 : * Not at outermost recursion level, so copy the outer level's
2531 : * donemap; this ensures that we see states in process of being
2532 : * visited at outer levels, or already merged into predecessor
2533 : * states, as ones we shouldn't traverse back to.
2534 : */
2535 160 : memcpy(donemap, outerdonemap, nstates * sizeof(char));
2536 : }
2537 : else
2538 : {
2539 : /* At outermost level, only spredecessor is off-limits */
2540 130 : memset(donemap, 0, nstates * sizeof(char));
2541 130 : assert(spredecessor->no < nstates);
2542 130 : donemap[spredecessor->no] = 1;
2543 : }
2544 : }
2545 :
2546 : /* Mark ssource as visited in the donemap */
2547 1094 : assert(ssource->no < nstates);
2548 1094 : assert(donemap[ssource->no] == 0);
2549 1094 : donemap[ssource->no] = 1;
2550 :
2551 : /*
2552 : * We proceed by first cloning all of ssource's outarcs, creating new
2553 : * clone states as needed but not doing more with them than that. Then in
2554 : * a second pass, recurse to process the child clone states. This allows
2555 : * us to have only one child clone state per reachable source state, even
2556 : * when there are multiple outarcs leading to the same state. Also, when
2557 : * we do visit a child state, its set of inarcs is known exactly, which
2558 : * makes it safe to apply the constraint-is-already-checked optimization.
2559 : * Also, this ensures that we've merged all the states we can into the
2560 : * current clone before we recurse to any children, thus possibly saving
2561 : * them from making extra images of those states.
2562 : *
2563 : * While this function runs, child clone states of the current state are
2564 : * marked by setting their tmp fields to point to the original state they
2565 : * were cloned from. This makes it possible to detect multiple outarcs
2566 : * leading to the same state, and also makes it easy to distinguish clone
2567 : * states from original states (which will have tmp == NULL).
2568 : */
2569 9768 : for (a = ssource->outs; a != NULL && !NISERR(); a = a->outchain)
2570 : {
2571 8674 : struct state *sto = a->to;
2572 :
2573 : /*
2574 : * We do not consider cloning successor states that have no constraint
2575 : * outarcs; just link to them as-is. They cannot be part of a
2576 : * constraint loop so there is no need to make copies. In particular,
2577 : * this rule keeps us from trying to clone the post state, which would
2578 : * be a bad idea.
2579 : */
2580 8674 : if (isconstraintarc(a) && hasconstraintout(sto))
2581 1325 : {
2582 : struct state *prevclone;
2583 : int canmerge;
2584 : struct arc *a2;
2585 :
2586 : /*
2587 : * Back-link constraint arcs must not be followed. Nor is there a
2588 : * need to revisit states previously merged into this clone.
2589 : */
2590 3057 : assert(sto->no < nstates);
2591 3057 : if (donemap[sto->no] != 0)
2592 1732 : continue;
2593 :
2594 : /*
2595 : * Check whether we already have a child clone state for this
2596 : * source state.
2597 : */
2598 1325 : prevclone = NULL;
2599 12309 : for (a2 = sclone->outs; a2 != NULL; a2 = a2->outchain)
2600 : {
2601 11345 : if (a2->to->tmp == sto)
2602 : {
2603 361 : prevclone = a2->to;
2604 361 : break;
2605 : }
2606 : }
2607 :
2608 : /*
2609 : * If this arc is labeled the same as refarc, or the same as any
2610 : * arc we must have traversed to get to sclone, then no additional
2611 : * constraints need to be met to get to sto, so we should just
2612 : * merge its outarcs into sclone.
2613 : */
2614 1325 : if (refarc && a->type == refarc->type && a->co == refarc->co)
2615 804 : canmerge = 1;
2616 : else
2617 : {
2618 : struct state *s;
2619 :
2620 521 : canmerge = 0;
2621 2437 : for (s = sclone; s->ins; s = s->ins->from)
2622 : {
2623 1921 : if (s->nins == 1 &&
2624 10 : a->type == s->ins->type && a->co == s->ins->co)
2625 : {
2626 0 : canmerge = 1;
2627 0 : break;
2628 : }
2629 : }
2630 : }
2631 :
2632 1325 : if (canmerge)
2633 : {
2634 : /*
2635 : * We can merge into sclone. If we previously made a child
2636 : * clone state, drop it; there's no need to visit it. (This
2637 : * can happen if ssource has multiple pathways to sto, and we
2638 : * only just now found one that is provably a no-op.)
2639 : */
2640 804 : if (prevclone)
2641 0 : dropstate(nfa, prevclone); /* kills our outarc, too */
2642 :
2643 : /* Recurse to merge sto's outarcs into sclone */
2644 804 : clonesuccessorstates(nfa,
2645 : sto,
2646 : sclone,
2647 : spredecessor,
2648 : refarc,
2649 : donemap,
2650 : outerdonemap,
2651 : nstates);
2652 : /* sto should now be marked as previously visited */
2653 804 : assert(NISERR() || donemap[sto->no] == 1);
2654 : }
2655 521 : else if (prevclone)
2656 : {
2657 : /*
2658 : * We already have a clone state for this successor, so just
2659 : * make another arc to it.
2660 : */
2661 361 : cparc(nfa, a, sclone, prevclone);
2662 : }
2663 : else
2664 : {
2665 : /*
2666 : * We need to create a new successor clone state.
2667 : */
2668 : struct state *stoclone;
2669 :
2670 160 : stoclone = newstate(nfa);
2671 160 : if (stoclone == NULL)
2672 : {
2673 0 : assert(NISERR());
2674 0 : break;
2675 : }
2676 : /* Mark it as to what it's a clone of */
2677 160 : stoclone->tmp = sto;
2678 : /* ... and add the outarc leading to it */
2679 160 : cparc(nfa, a, sclone, stoclone);
2680 : }
2681 : }
2682 : else
2683 : {
2684 : /*
2685 : * Non-constraint outarcs just get copied to sclone, as do outarcs
2686 : * leading to states with no constraint outarc.
2687 : */
2688 5617 : cparc(nfa, a, sclone, sto);
2689 : }
2690 : }
2691 :
2692 : /*
2693 : * If we are at outer level for this clone state, recurse to all its child
2694 : * clone states, clearing their tmp fields as we go. (If we're not
2695 : * outermost for sclone, leave this to be done by the outer call level.)
2696 : * Note that if we have multiple outarcs leading to the same clone state,
2697 : * it will only be recursed-to once.
2698 : */
2699 1094 : if (curdonemap == NULL)
2700 : {
2701 3261 : for (a = sclone->outs; a != NULL && !NISERR(); a = a->outchain)
2702 : {
2703 2971 : struct state *stoclone = a->to;
2704 2971 : struct state *sto = stoclone->tmp;
2705 :
2706 2971 : if (sto != NULL)
2707 : {
2708 160 : stoclone->tmp = NULL;
2709 160 : clonesuccessorstates(nfa,
2710 : sto,
2711 : stoclone,
2712 : spredecessor,
2713 : refarc,
2714 : NULL,
2715 : donemap,
2716 : nstates);
2717 : }
2718 : }
2719 :
2720 : /* Don't forget to free sclone's donemap when done with it */
2721 290 : FREE(donemap);
2722 : }
2723 : }
2724 :
2725 : /*
2726 : * cleanup - clean up NFA after optimizations
2727 : */
2728 : static void
2729 3558 : cleanup(struct nfa *nfa)
2730 : {
2731 : struct state *s;
2732 : struct state *nexts;
2733 : int n;
2734 :
2735 3558 : if (NISERR())
2736 3559 : return;
2737 :
2738 : /* clear out unreachable or dead-end states */
2739 : /* use pre to mark reachable, then post to mark can-reach-post */
2740 3557 : markreachable(nfa, nfa->pre, (struct state *) NULL, nfa->pre);
2741 3557 : markcanreach(nfa, nfa->post, nfa->pre, nfa->post);
2742 85611 : for (s = nfa->states; s != NULL && !NISERR(); s = nexts)
2743 : {
2744 82054 : nexts = s->next;
2745 82054 : if (s->tmp != nfa->post && !s->flag)
2746 1273 : dropstate(nfa, s);
2747 : }
2748 3557 : assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == nfa->post);
2749 3557 : cleartraverse(nfa, nfa->pre);
2750 3557 : assert(NISERR() || nfa->post->nins == 0 || nfa->post->tmp == NULL);
2751 : /* the nins==0 (final unreachable) case will be caught later */
2752 :
2753 : /* renumber surviving states */
2754 3557 : n = 0;
2755 84338 : for (s = nfa->states; s != NULL; s = s->next)
2756 80781 : s->no = n++;
2757 3557 : nfa->nstates = n;
2758 : }
2759 :
2760 : /*
2761 : * markreachable - recursive marking of reachable states
2762 : */
2763 : static void
2764 153405 : markreachable(struct nfa *nfa,
2765 : struct state *s,
2766 : struct state *okay, /* consider only states with this mark */
2767 : struct state *mark) /* the value to mark with */
2768 : {
2769 : struct arc *a;
2770 :
2771 : /* Since this is recursive, it could be driven to stack overflow */
2772 153405 : if (STACK_TOO_DEEP(nfa->v->re))
2773 : {
2774 0 : NERR(REG_ETOOBIG);
2775 0 : return;
2776 : }
2777 :
2778 153405 : if (s->tmp != okay)
2779 72618 : return;
2780 80787 : s->tmp = mark;
2781 :
2782 230635 : for (a = s->outs; a != NULL; a = a->outchain)
2783 149848 : markreachable(nfa, a->to, okay, mark);
2784 : }
2785 :
2786 : /*
2787 : * markcanreach - recursive marking of states which can reach here
2788 : */
2789 : static void
2790 153851 : markcanreach(struct nfa *nfa,
2791 : struct state *s,
2792 : struct state *okay, /* consider only states with this mark */
2793 : struct state *mark) /* the value to mark with */
2794 : {
2795 : struct arc *a;
2796 :
2797 : /* Since this is recursive, it could be driven to stack overflow */
2798 153851 : if (STACK_TOO_DEEP(nfa->v->re))
2799 : {
2800 0 : NERR(REG_ETOOBIG);
2801 0 : return;
2802 : }
2803 :
2804 153851 : if (s->tmp != okay)
2805 73088 : return;
2806 80763 : s->tmp = mark;
2807 :
2808 231057 : for (a = s->ins; a != NULL; a = a->inchain)
2809 150294 : markcanreach(nfa, a->from, okay, mark);
2810 : }
2811 :
2812 : /*
2813 : * analyze - ascertain potentially-useful facts about an optimized NFA
2814 : */
2815 : static long /* re_info bits to be ORed in */
2816 1779 : analyze(struct nfa *nfa)
2817 : {
2818 : struct arc *a;
2819 : struct arc *aa;
2820 :
2821 1779 : if (NISERR())
2822 1 : return 0;
2823 :
2824 1778 : if (nfa->pre->outs == NULL)
2825 9 : return REG_UIMPOSSIBLE;
2826 9248 : for (a = nfa->pre->outs; a != NULL; a = a->outchain)
2827 22067 : for (aa = a->to->outs; aa != NULL; aa = aa->outchain)
2828 14588 : if (aa->to == nfa->post)
2829 696 : return REG_UEMPTYMATCH;
2830 1073 : return 0;
2831 : }
2832 :
2833 : /*
2834 : * compact - construct the compact representation of an NFA
2835 : */
2836 : static void
2837 1778 : compact(struct nfa *nfa,
2838 : struct cnfa *cnfa)
2839 : {
2840 : struct state *s;
2841 : struct arc *a;
2842 : size_t nstates;
2843 : size_t narcs;
2844 : struct carc *ca;
2845 : struct carc *first;
2846 :
2847 1778 : assert(!NISERR());
2848 :
2849 1778 : nstates = 0;
2850 1778 : narcs = 0;
2851 19507 : for (s = nfa->states; s != NULL; s = s->next)
2852 : {
2853 17729 : nstates++;
2854 17729 : narcs += s->nouts + 1; /* need one extra for endmarker */
2855 : }
2856 :
2857 1778 : cnfa->stflags = (char *) MALLOC(nstates * sizeof(char));
2858 1778 : cnfa->states = (struct carc **) MALLOC(nstates * sizeof(struct carc *));
2859 1778 : cnfa->arcs = (struct carc *) MALLOC(narcs * sizeof(struct carc));
2860 1778 : if (cnfa->stflags == NULL || cnfa->states == NULL || cnfa->arcs == NULL)
2861 : {
2862 0 : if (cnfa->stflags != NULL)
2863 0 : FREE(cnfa->stflags);
2864 0 : if (cnfa->states != NULL)
2865 0 : FREE(cnfa->states);
2866 0 : if (cnfa->arcs != NULL)
2867 0 : FREE(cnfa->arcs);
2868 0 : NERR(REG_ESPACE);
2869 1778 : return;
2870 : }
2871 1778 : cnfa->nstates = nstates;
2872 1778 : cnfa->pre = nfa->pre->no;
2873 1778 : cnfa->post = nfa->post->no;
2874 1778 : cnfa->bos[0] = nfa->bos[0];
2875 1778 : cnfa->bos[1] = nfa->bos[1];
2876 1778 : cnfa->eos[0] = nfa->eos[0];
2877 1778 : cnfa->eos[1] = nfa->eos[1];
2878 1778 : cnfa->ncolors = maxcolor(nfa->cm) + 1;
2879 1778 : cnfa->flags = 0;
2880 :
2881 1778 : ca = cnfa->arcs;
2882 19507 : for (s = nfa->states; s != NULL; s = s->next)
2883 : {
2884 17729 : assert((size_t) s->no < nstates);
2885 17729 : cnfa->stflags[s->no] = 0;
2886 17729 : cnfa->states[s->no] = ca;
2887 17729 : first = ca;
2888 67788 : for (a = s->outs; a != NULL; a = a->outchain)
2889 50059 : switch (a->type)
2890 : {
2891 : case PLAIN:
2892 50026 : ca->co = a->co;
2893 50026 : ca->to = a->to->no;
2894 50026 : ca++;
2895 50026 : break;
2896 : case LACON:
2897 33 : assert(s->no != cnfa->pre);
2898 33 : ca->co = (color) (cnfa->ncolors + a->co);
2899 33 : ca->to = a->to->no;
2900 33 : ca++;
2901 33 : cnfa->flags |= HASLACONS;
2902 33 : break;
2903 : default:
2904 0 : NERR(REG_ASSERT);
2905 0 : break;
2906 : }
2907 17729 : carcsort(first, ca - first);
2908 17729 : ca->co = COLORLESS;
2909 17729 : ca->to = 0;
2910 17729 : ca++;
2911 : }
2912 1778 : assert(ca == &cnfa->arcs[narcs]);
2913 1778 : assert(cnfa->nstates != 0);
2914 :
2915 : /* mark no-progress states */
2916 16567 : for (a = nfa->pre->outs; a != NULL; a = a->outchain)
2917 14789 : cnfa->stflags[a->to->no] = CNFA_NOPROGRESS;
2918 1778 : cnfa->stflags[nfa->pre->no] = CNFA_NOPROGRESS;
2919 : }
2920 :
2921 : /*
2922 : * carcsort - sort compacted-NFA arcs by color
2923 : */
2924 : static void
2925 17729 : carcsort(struct carc *first, size_t n)
2926 : {
2927 17729 : if (n > 1)
2928 4584 : qsort(first, n, sizeof(struct carc), carc_cmp);
2929 17729 : }
2930 :
2931 : static int
2932 128631 : carc_cmp(const void *a, const void *b)
2933 : {
2934 128631 : const struct carc *aa = (const struct carc *) a;
2935 128631 : const struct carc *bb = (const struct carc *) b;
2936 :
2937 128631 : if (aa->co < bb->co)
2938 26960 : return -1;
2939 101671 : if (aa->co > bb->co)
2940 40563 : return +1;
2941 61108 : if (aa->to < bb->to)
2942 32762 : return -1;
2943 28346 : if (aa->to > bb->to)
2944 28346 : return +1;
2945 0 : return 0;
2946 : }
2947 :
2948 : /*
2949 : * freecnfa - free a compacted NFA
2950 : */
2951 : static void
2952 149 : freecnfa(struct cnfa *cnfa)
2953 : {
2954 149 : assert(cnfa->nstates != 0); /* not empty already */
2955 149 : cnfa->nstates = 0;
2956 149 : FREE(cnfa->stflags);
2957 149 : FREE(cnfa->states);
2958 149 : FREE(cnfa->arcs);
2959 149 : }
2960 :
2961 : /*
2962 : * dumpnfa - dump an NFA in human-readable form
2963 : */
2964 : static void
2965 0 : dumpnfa(struct nfa *nfa,
2966 : FILE *f)
2967 : {
2968 : #ifdef REG_DEBUG
2969 : struct state *s;
2970 : int nstates = 0;
2971 : int narcs = 0;
2972 :
2973 : fprintf(f, "pre %d, post %d", nfa->pre->no, nfa->post->no);
2974 : if (nfa->bos[0] != COLORLESS)
2975 : fprintf(f, ", bos [%ld]", (long) nfa->bos[0]);
2976 : if (nfa->bos[1] != COLORLESS)
2977 : fprintf(f, ", bol [%ld]", (long) nfa->bos[1]);
2978 : if (nfa->eos[0] != COLORLESS)
2979 : fprintf(f, ", eos [%ld]", (long) nfa->eos[0]);
2980 : if (nfa->eos[1] != COLORLESS)
2981 : fprintf(f, ", eol [%ld]", (long) nfa->eos[1]);
2982 : fprintf(f, "\n");
2983 : for (s = nfa->states; s != NULL; s = s->next)
2984 : {
2985 : dumpstate(s, f);
2986 : nstates++;
2987 : narcs += s->nouts;
2988 : }
2989 : fprintf(f, "total of %d states, %d arcs\n", nstates, narcs);
2990 : if (nfa->parent == NULL)
2991 : dumpcolors(nfa->cm, f);
2992 : fflush(f);
2993 : #endif
2994 0 : }
2995 :
2996 : #ifdef REG_DEBUG /* subordinates of dumpnfa */
2997 :
2998 : /*
2999 : * dumpstate - dump an NFA state in human-readable form
3000 : */
3001 : static void
3002 : dumpstate(struct state *s,
3003 : FILE *f)
3004 : {
3005 : struct arc *a;
3006 :
3007 : fprintf(f, "%d%s%c", s->no, (s->tmp != NULL) ? "T" : "",
3008 : (s->flag) ? s->flag : '.');
3009 : if (s->prev != NULL && s->prev->next != s)
3010 : fprintf(f, "\tstate chain bad\n");
3011 : if (s->nouts == 0)
3012 : fprintf(f, "\tno out arcs\n");
3013 : else
3014 : dumparcs(s, f);
3015 : fflush(f);
3016 : for (a = s->ins; a != NULL; a = a->inchain)
3017 : {
3018 : if (a->to != s)
3019 : fprintf(f, "\tlink from %d to %d on %d's in-chain\n",
3020 : a->from->no, a->to->no, s->no);
3021 : }
3022 : }
3023 :
3024 : /*
3025 : * dumparcs - dump out-arcs in human-readable form
3026 : */
3027 : static void
3028 : dumparcs(struct state *s,
3029 : FILE *f)
3030 : {
3031 : int pos;
3032 : struct arc *a;
3033 :
3034 : /* printing oldest arcs first is usually clearer */
3035 : a = s->outs;
3036 : assert(a != NULL);
3037 : while (a->outchain != NULL)
3038 : a = a->outchain;
3039 : pos = 1;
3040 : do
3041 : {
3042 : dumparc(a, s, f);
3043 : if (pos == 5)
3044 : {
3045 : fprintf(f, "\n");
3046 : pos = 1;
3047 : }
3048 : else
3049 : pos++;
3050 : a = a->outchainRev;
3051 : } while (a != NULL);
3052 : if (pos != 1)
3053 : fprintf(f, "\n");
3054 : }
3055 :
3056 : /*
3057 : * dumparc - dump one outarc in readable form, including prefixing tab
3058 : */
3059 : static void
3060 : dumparc(struct arc *a,
3061 : struct state *s,
3062 : FILE *f)
3063 : {
3064 : struct arc *aa;
3065 : struct arcbatch *ab;
3066 :
3067 : fprintf(f, "\t");
3068 : switch (a->type)
3069 : {
3070 : case PLAIN:
3071 : fprintf(f, "[%ld]", (long) a->co);
3072 : break;
3073 : case AHEAD:
3074 : fprintf(f, ">%ld>", (long) a->co);
3075 : break;
3076 : case BEHIND:
3077 : fprintf(f, "<%ld<", (long) a->co);
3078 : break;
3079 : case LACON:
3080 : fprintf(f, ":%ld:", (long) a->co);
3081 : break;
3082 : case '^':
3083 : case '$':
3084 : fprintf(f, "%c%d", a->type, (int) a->co);
3085 : break;
3086 : case EMPTY:
3087 : break;
3088 : default:
3089 : fprintf(f, "0x%x/0%lo", a->type, (long) a->co);
3090 : break;
3091 : }
3092 : if (a->from != s)
3093 : fprintf(f, "?%d?", a->from->no);
3094 : for (ab = &a->from->oas; ab != NULL; ab = ab->next)
3095 : {
3096 : for (aa = &ab->a[0]; aa < &ab->a[ABSIZE]; aa++)
3097 : if (aa == a)
3098 : break; /* NOTE BREAK OUT */
3099 : if (aa < &ab->a[ABSIZE]) /* propagate break */
3100 : break; /* NOTE BREAK OUT */
3101 : }
3102 : if (ab == NULL)
3103 : fprintf(f, "?!?"); /* not in allocated space */
3104 : fprintf(f, "->");
3105 : if (a->to == NULL)
3106 : {
3107 : fprintf(f, "NULL");
3108 : return;
3109 : }
3110 : fprintf(f, "%d", a->to->no);
3111 : for (aa = a->to->ins; aa != NULL; aa = aa->inchain)
3112 : if (aa == a)
3113 : break; /* NOTE BREAK OUT */
3114 : if (aa == NULL)
3115 : fprintf(f, "?!?"); /* missing from in-chain */
3116 : }
3117 : #endif /* REG_DEBUG */
3118 :
3119 : /*
3120 : * dumpcnfa - dump a compacted NFA in human-readable form
3121 : */
3122 : #ifdef REG_DEBUG
3123 : static void
3124 : dumpcnfa(struct cnfa *cnfa,
3125 : FILE *f)
3126 : {
3127 : int st;
3128 :
3129 : fprintf(f, "pre %d, post %d", cnfa->pre, cnfa->post);
3130 : if (cnfa->bos[0] != COLORLESS)
3131 : fprintf(f, ", bos [%ld]", (long) cnfa->bos[0]);
3132 : if (cnfa->bos[1] != COLORLESS)
3133 : fprintf(f, ", bol [%ld]", (long) cnfa->bos[1]);
3134 : if (cnfa->eos[0] != COLORLESS)
3135 : fprintf(f, ", eos [%ld]", (long) cnfa->eos[0]);
3136 : if (cnfa->eos[1] != COLORLESS)
3137 : fprintf(f, ", eol [%ld]", (long) cnfa->eos[1]);
3138 : if (cnfa->flags & HASLACONS)
3139 : fprintf(f, ", haslacons");
3140 : fprintf(f, "\n");
3141 : for (st = 0; st < cnfa->nstates; st++)
3142 : dumpcstate(st, cnfa, f);
3143 : fflush(f);
3144 : }
3145 : #endif
3146 :
3147 : #ifdef REG_DEBUG /* subordinates of dumpcnfa */
3148 :
3149 : /*
3150 : * dumpcstate - dump a compacted-NFA state in human-readable form
3151 : */
3152 : static void
3153 : dumpcstate(int st,
3154 : struct cnfa *cnfa,
3155 : FILE *f)
3156 : {
3157 : struct carc *ca;
3158 : int pos;
3159 :
3160 : fprintf(f, "%d%s", st, (cnfa->stflags[st] & CNFA_NOPROGRESS) ? ":" : ".");
3161 : pos = 1;
3162 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
3163 : {
3164 : if (ca->co < cnfa->ncolors)
3165 : fprintf(f, "\t[%ld]->%d", (long) ca->co, ca->to);
3166 : else
3167 : fprintf(f, "\t:%ld:->%d", (long) (ca->co - cnfa->ncolors), ca->to);
3168 : if (pos == 5)
3169 : {
3170 : fprintf(f, "\n");
3171 : pos = 1;
3172 : }
3173 : else
3174 : pos++;
3175 : }
3176 : if (ca == cnfa->states[st] || pos != 1)
3177 : fprintf(f, "\n");
3178 : fflush(f);
3179 : }
3180 :
3181 : #endif /* REG_DEBUG */
|