Line data Source code
1 : /*
2 : * re_*comp and friends - compile REs
3 : * This file #includes several others (see the bottom).
4 : *
5 : * Copyright (c) 1998, 1999 Henry Spencer. All rights reserved.
6 : *
7 : * Development of this software was funded, in part, by Cray Research Inc.,
8 : * UUNET Communications Services Inc., Sun Microsystems Inc., and Scriptics
9 : * Corporation, none of whom are responsible for the results. The author
10 : * thanks all of them.
11 : *
12 : * Redistribution and use in source and binary forms -- with or without
13 : * modification -- are permitted for any purpose, provided that
14 : * redistributions in source form retain this entire copyright notice and
15 : * indicate the origin and nature of any modifications.
16 : *
17 : * I'd appreciate being given credit for this package in the documentation
18 : * of software which uses it, but that is not a requirement.
19 : *
20 : * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES,
21 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY
22 : * AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL
23 : * HENRY SPENCER BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
24 : * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
25 : * PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS;
26 : * OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
27 : * WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
28 : * OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
29 : * ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * src/backend/regex/regcomp.c
32 : *
33 : */
34 :
35 : #include "regex/regguts.h"
36 :
37 : /*
38 : * forward declarations, up here so forward datatypes etc. are defined early
39 : */
40 : /* === regcomp.c === */
41 : static void moresubs(struct vars *, int);
42 : static int freev(struct vars *, int);
43 : static void makesearch(struct vars *, struct nfa *);
44 : static struct subre *parse(struct vars *, int, int, struct state *, struct state *);
45 : static struct subre *parsebranch(struct vars *, int, int, struct state *, struct state *, int);
46 : static void parseqatom(struct vars *, int, int, struct state *, struct state *, struct subre *);
47 : static void nonword(struct vars *, int, struct state *, struct state *);
48 : static void word(struct vars *, int, struct state *, struct state *);
49 : static int scannum(struct vars *);
50 : static void repeat(struct vars *, struct state *, struct state *, int, int);
51 : static void bracket(struct vars *, struct state *, struct state *);
52 : static void cbracket(struct vars *, struct state *, struct state *);
53 : static void brackpart(struct vars *, struct state *, struct state *);
54 : static const chr *scanplain(struct vars *);
55 : static void onechr(struct vars *, chr, struct state *, struct state *);
56 : static void wordchrs(struct vars *);
57 : static void processlacon(struct vars *, struct state *, struct state *, int,
58 : struct state *, struct state *);
59 : static struct subre *subre(struct vars *, int, int, struct state *, struct state *);
60 : static void freesubre(struct vars *, struct subre *);
61 : static void freesrnode(struct vars *, struct subre *);
62 : static void optst(struct vars *, struct subre *);
63 : static int numst(struct subre *, int);
64 : static void markst(struct subre *);
65 : static void cleanst(struct vars *);
66 : static long nfatree(struct vars *, struct subre *, FILE *);
67 : static long nfanode(struct vars *, struct subre *, int, FILE *);
68 : static int newlacon(struct vars *, struct state *, struct state *, int);
69 : static void freelacons(struct subre *, int);
70 : static void rfree(regex_t *);
71 : static int rcancelrequested(void);
72 : static int rstacktoodeep(void);
73 :
74 : #ifdef REG_DEBUG
75 : static void dump(regex_t *, FILE *);
76 : static void dumpst(struct subre *, FILE *, int);
77 : static void stdump(struct subre *, FILE *, int);
78 : static const char *stid(struct subre *, char *, size_t);
79 : #endif
80 : /* === regc_lex.c === */
81 : static void lexstart(struct vars *);
82 : static void prefixes(struct vars *);
83 : static void lexnest(struct vars *, const chr *, const chr *);
84 : static void lexword(struct vars *);
85 : static int next(struct vars *);
86 : static int lexescape(struct vars *);
87 : static chr lexdigits(struct vars *, int, int, int);
88 : static int brenext(struct vars *, chr);
89 : static void skip(struct vars *);
90 : static chr newline(void);
91 : static chr chrnamed(struct vars *, const chr *, const chr *, chr);
92 :
93 : /* === regc_color.c === */
94 : static void initcm(struct vars *, struct colormap *);
95 : static void freecm(struct colormap *);
96 : static color maxcolor(struct colormap *);
97 : static color newcolor(struct colormap *);
98 : static void freecolor(struct colormap *, color);
99 : static color pseudocolor(struct colormap *);
100 : static color subcolor(struct colormap *, chr);
101 : static color subcolorhi(struct colormap *, color *);
102 : static color newsub(struct colormap *, color);
103 : static int newhicolorrow(struct colormap *, int);
104 : static void newhicolorcols(struct colormap *);
105 : static void subcolorcvec(struct vars *, struct cvec *, struct state *, struct state *);
106 : static void subcoloronechr(struct vars *, chr, struct state *, struct state *, color *);
107 : static void subcoloronerange(struct vars *, chr, chr, struct state *, struct state *, color *);
108 : static void subcoloronerow(struct vars *, int, struct state *, struct state *, color *);
109 : static void okcolors(struct nfa *, struct colormap *);
110 : static void colorchain(struct colormap *, struct arc *);
111 : static void uncolorchain(struct colormap *, struct arc *);
112 : static void rainbow(struct nfa *, struct colormap *, int, color, struct state *, struct state *);
113 : static void colorcomplement(struct nfa *, struct colormap *, int, struct state *, struct state *, struct state *);
114 :
115 : #ifdef REG_DEBUG
116 : static void dumpcolors(struct colormap *, FILE *);
117 : static void dumpchr(chr, FILE *);
118 : #endif
119 : /* === regc_nfa.c === */
120 : static struct nfa *newnfa(struct vars *, struct colormap *, struct nfa *);
121 : static void freenfa(struct nfa *);
122 : static struct state *newstate(struct nfa *);
123 : static struct state *newfstate(struct nfa *, int flag);
124 : static void dropstate(struct nfa *, struct state *);
125 : static void freestate(struct nfa *, struct state *);
126 : static void destroystate(struct nfa *, struct state *);
127 : static void newarc(struct nfa *, int, color, struct state *, struct state *);
128 : static void createarc(struct nfa *, int, color, struct state *, struct state *);
129 : static struct arc *allocarc(struct nfa *, struct state *);
130 : static void freearc(struct nfa *, struct arc *);
131 : static void changearctarget(struct arc *, struct state *);
132 : static int hasnonemptyout(struct state *);
133 : static struct arc *findarc(struct state *, int, color);
134 : static void cparc(struct nfa *, struct arc *, struct state *, struct state *);
135 : static void sortins(struct nfa *, struct state *);
136 : static int sortins_cmp(const void *, const void *);
137 : static void sortouts(struct nfa *, struct state *);
138 : static int sortouts_cmp(const void *, const void *);
139 : static void moveins(struct nfa *, struct state *, struct state *);
140 : static void copyins(struct nfa *, struct state *, struct state *);
141 : static void mergeins(struct nfa *, struct state *, struct arc **, int);
142 : static void moveouts(struct nfa *, struct state *, struct state *);
143 : static void copyouts(struct nfa *, struct state *, struct state *);
144 : static void cloneouts(struct nfa *, struct state *, struct state *, struct state *, int);
145 : static void delsub(struct nfa *, struct state *, struct state *);
146 : static void deltraverse(struct nfa *, struct state *, struct state *);
147 : static void dupnfa(struct nfa *, struct state *, struct state *, struct state *, struct state *);
148 : static void duptraverse(struct nfa *, struct state *, struct state *);
149 : static void cleartraverse(struct nfa *, struct state *);
150 : static struct state *single_color_transition(struct state *, struct state *);
151 : static void specialcolors(struct nfa *);
152 : static long optimize(struct nfa *, FILE *);
153 : static void pullback(struct nfa *, FILE *);
154 : static int pull(struct nfa *, struct arc *, struct state **);
155 : static void pushfwd(struct nfa *, FILE *);
156 : static int push(struct nfa *, struct arc *, struct state **);
157 :
158 : #define INCOMPATIBLE 1 /* destroys arc */
159 : #define SATISFIED 2 /* constraint satisfied */
160 : #define COMPATIBLE 3 /* compatible but not satisfied yet */
161 : static int combine(struct arc *, struct arc *);
162 : static void fixempties(struct nfa *, FILE *);
163 : static struct state *emptyreachable(struct nfa *, struct state *,
164 : struct state *, struct arc **);
165 : static int isconstraintarc(struct arc *);
166 : static int hasconstraintout(struct state *);
167 : static void fixconstraintloops(struct nfa *, FILE *);
168 : static int findconstraintloop(struct nfa *, struct state *);
169 : static void breakconstraintloop(struct nfa *, struct state *);
170 : static void clonesuccessorstates(struct nfa *, struct state *, struct state *,
171 : struct state *, struct arc *,
172 : char *, char *, int);
173 : static void cleanup(struct nfa *);
174 : static void markreachable(struct nfa *, struct state *, struct state *, struct state *);
175 : static void markcanreach(struct nfa *, struct state *, struct state *, struct state *);
176 : static long analyze(struct nfa *);
177 : static void compact(struct nfa *, struct cnfa *);
178 : static void carcsort(struct carc *, size_t);
179 : static int carc_cmp(const void *, const void *);
180 : static void freecnfa(struct cnfa *);
181 : static void dumpnfa(struct nfa *, FILE *);
182 :
183 : #ifdef REG_DEBUG
184 : static void dumpstate(struct state *, FILE *);
185 : static void dumparcs(struct state *, FILE *);
186 : static void dumparc(struct arc *, struct state *, FILE *);
187 : static void dumpcnfa(struct cnfa *, FILE *);
188 : static void dumpcstate(int, struct cnfa *, FILE *);
189 : #endif
190 : /* === regc_cvec.c === */
191 : static struct cvec *newcvec(int, int);
192 : static struct cvec *clearcvec(struct cvec *);
193 : static void addchr(struct cvec *, chr);
194 : static void addrange(struct cvec *, chr, chr);
195 : static struct cvec *getcvec(struct vars *, int, int);
196 : static void freecvec(struct cvec *);
197 :
198 : /* === regc_pg_locale.c === */
199 : static int pg_wc_isdigit(pg_wchar c);
200 : static int pg_wc_isalpha(pg_wchar c);
201 : static int pg_wc_isalnum(pg_wchar c);
202 : static int pg_wc_isupper(pg_wchar c);
203 : static int pg_wc_islower(pg_wchar c);
204 : static int pg_wc_isgraph(pg_wchar c);
205 : static int pg_wc_isprint(pg_wchar c);
206 : static int pg_wc_ispunct(pg_wchar c);
207 : static int pg_wc_isspace(pg_wchar c);
208 : static pg_wchar pg_wc_toupper(pg_wchar c);
209 : static pg_wchar pg_wc_tolower(pg_wchar c);
210 :
211 : /* === regc_locale.c === */
212 : static chr element(struct vars *, const chr *, const chr *);
213 : static struct cvec *range(struct vars *, chr, chr, int);
214 : static int before(chr, chr);
215 : static struct cvec *eclass(struct vars *, chr, int);
216 : static struct cvec *cclass(struct vars *, const chr *, const chr *, int);
217 : static int cclass_column_index(struct colormap *, chr);
218 : static struct cvec *allcases(struct vars *, chr);
219 : static int cmp(const chr *, const chr *, size_t);
220 : static int casecmp(const chr *, const chr *, size_t);
221 :
222 :
223 : /* internal variables, bundled for easy passing around */
224 : struct vars
225 : {
226 : regex_t *re;
227 : const chr *now; /* scan pointer into string */
228 : const chr *stop; /* end of string */
229 : const chr *savenow; /* saved now and stop for "subroutine call" */
230 : const chr *savestop;
231 : int err; /* error code (0 if none) */
232 : int cflags; /* copy of compile flags */
233 : int lasttype; /* type of previous token */
234 : int nexttype; /* type of next token */
235 : chr nextvalue; /* value (if any) of next token */
236 : int lexcon; /* lexical context type (see lex.c) */
237 : int nsubexp; /* subexpression count */
238 : struct subre **subs; /* subRE pointer vector */
239 : size_t nsubs; /* length of vector */
240 : struct subre *sub10[10]; /* initial vector, enough for most */
241 : struct nfa *nfa; /* the NFA */
242 : struct colormap *cm; /* character color map */
243 : color nlcolor; /* color of newline */
244 : struct state *wordchrs; /* state in nfa holding word-char outarcs */
245 : struct subre *tree; /* subexpression tree */
246 : struct subre *treechain; /* all tree nodes allocated */
247 : struct subre *treefree; /* any free tree nodes */
248 : int ntree; /* number of tree nodes, plus one */
249 : struct cvec *cv; /* interface cvec */
250 : struct cvec *cv2; /* utility cvec */
251 : struct subre *lacons; /* lookaround-constraint vector */
252 : int nlacons; /* size of lacons[]; note that only slots
253 : * numbered 1 .. nlacons-1 are used */
254 : size_t spaceused; /* approx. space used for compilation */
255 : };
256 :
257 : /* parsing macros; most know that `v' is the struct vars pointer */
258 : #define NEXT() (next(v)) /* advance by one token */
259 : #define SEE(t) (v->nexttype == (t)) /* is next token this? */
260 : #define EAT(t) (SEE(t) && next(v)) /* if next is this, swallow it */
261 : #define VISERR(vv) ((vv)->err != 0) /* have we seen an error yet? */
262 : #define ISERR() VISERR(v)
263 : #define VERR(vv,e) ((vv)->nexttype = EOS, \
264 : (vv)->err = ((vv)->err ? (vv)->err : (e)))
265 : #define ERR(e) VERR(v, e) /* record an error */
266 : #define NOERR() {if (ISERR()) return;} /* if error seen, return */
267 : #define NOERRN() {if (ISERR()) return NULL;} /* NOERR with retval */
268 : #define NOERRZ() {if (ISERR()) return 0;} /* NOERR with retval */
269 : #define INSIST(c, e) do { if (!(c)) ERR(e); } while (0) /* error if c false */
270 : #define NOTE(b) (v->re->re_info |= (b)) /* note visible condition */
271 : #define EMPTYARC(x, y) newarc(v->nfa, EMPTY, 0, x, y)
272 :
273 : /* token type codes, some also used as NFA arc types */
274 : #define EMPTY 'n' /* no token present */
275 : #define EOS 'e' /* end of string */
276 : #define PLAIN 'p' /* ordinary character */
277 : #define DIGIT 'd' /* digit (in bound) */
278 : #define BACKREF 'b' /* back reference */
279 : #define COLLEL 'I' /* start of [. */
280 : #define ECLASS 'E' /* start of [= */
281 : #define CCLASS 'C' /* start of [: */
282 : #define END 'X' /* end of [. [= [: */
283 : #define RANGE 'R' /* - within [] which might be range delim. */
284 : #define LACON 'L' /* lookaround constraint subRE */
285 : #define AHEAD 'a' /* color-lookahead arc */
286 : #define BEHIND 'r' /* color-lookbehind arc */
287 : #define WBDRY 'w' /* word boundary constraint */
288 : #define NWBDRY 'W' /* non-word-boundary constraint */
289 : #define SBEGIN 'A' /* beginning of string (even if not BOL) */
290 : #define SEND 'Z' /* end of string (even if not EOL) */
291 : #define PREFER 'P' /* length preference */
292 :
293 : /* is an arc colored, and hence on a color chain? */
294 : #define COLORED(a) \
295 : ((a)->type == PLAIN || (a)->type == AHEAD || (a)->type == BEHIND)
296 :
297 :
298 : /* static function list */
299 : static const struct fns functions = {
300 : rfree, /* regfree insides */
301 : rcancelrequested, /* check for cancel request */
302 : rstacktoodeep /* check for stack getting dangerously deep */
303 : };
304 :
305 :
306 :
307 : /*
308 : * pg_regcomp - compile regular expression
309 : *
310 : * Note: on failure, no resources remain allocated, so pg_regfree()
311 : * need not be applied to re.
312 : */
313 : int
314 253 : pg_regcomp(regex_t *re,
315 : const chr *string,
316 : size_t len,
317 : int flags,
318 : Oid collation)
319 : {
320 : struct vars var;
321 253 : struct vars *v = &var;
322 : struct guts *g;
323 : int i;
324 : size_t j;
325 :
326 : #ifdef REG_DEBUG
327 : FILE *debug = (flags & REG_PROGRESS) ? stdout : (FILE *) NULL;
328 : #else
329 253 : FILE *debug = (FILE *) NULL;
330 : #endif
331 :
332 : #define CNOERR() { if (ISERR()) return freev(v, v->err); }
333 :
334 : /* sanity checks */
335 :
336 253 : if (re == NULL || string == NULL)
337 0 : return REG_INVARG;
338 253 : if ((flags & REG_QUOTE) &&
339 0 : (flags & (REG_ADVANCED | REG_EXPANDED | REG_NEWLINE)))
340 0 : return REG_INVARG;
341 253 : if (!(flags & REG_EXTENDED) && (flags & REG_ADVF))
342 0 : return REG_INVARG;
343 :
344 : /* Initialize locale-dependent support */
345 253 : pg_set_regex_collation(collation);
346 :
347 : /* initial setup (after which freev() is callable) */
348 253 : v->re = re;
349 253 : v->now = string;
350 253 : v->stop = v->now + len;
351 253 : v->savenow = v->savestop = NULL;
352 253 : v->err = 0;
353 253 : v->cflags = flags;
354 253 : v->nsubexp = 0;
355 253 : v->subs = v->sub10;
356 253 : v->nsubs = 10;
357 2783 : for (j = 0; j < v->nsubs; j++)
358 2530 : v->subs[j] = NULL;
359 253 : v->nfa = NULL;
360 253 : v->cm = NULL;
361 253 : v->nlcolor = COLORLESS;
362 253 : v->wordchrs = NULL;
363 253 : v->tree = NULL;
364 253 : v->treechain = NULL;
365 253 : v->treefree = NULL;
366 253 : v->cv = NULL;
367 253 : v->cv2 = NULL;
368 253 : v->lacons = NULL;
369 253 : v->nlacons = 0;
370 253 : v->spaceused = 0;
371 253 : re->re_magic = REMAGIC;
372 253 : re->re_info = 0; /* bits get set during parse */
373 253 : re->re_csize = sizeof(chr);
374 253 : re->re_collation = collation;
375 253 : re->re_guts = NULL;
376 253 : re->re_fns = VS(&functions);
377 :
378 : /* more complex setup, malloced things */
379 253 : re->re_guts = VS(MALLOC(sizeof(struct guts)));
380 253 : if (re->re_guts == NULL)
381 0 : return freev(v, REG_ESPACE);
382 253 : g = (struct guts *) re->re_guts;
383 253 : g->tree = NULL;
384 253 : initcm(v, &g->cmap);
385 253 : v->cm = &g->cmap;
386 253 : g->lacons = NULL;
387 253 : g->nlacons = 0;
388 253 : ZAPCNFA(g->search);
389 253 : v->nfa = newnfa(v, v->cm, (struct nfa *) NULL);
390 253 : CNOERR();
391 : /* set up a reasonably-sized transient cvec for getcvec usage */
392 253 : v->cv = newcvec(100, 20);
393 253 : if (v->cv == NULL)
394 0 : return freev(v, REG_ESPACE);
395 :
396 : /* parsing */
397 253 : lexstart(v); /* also handles prefixes */
398 253 : if ((v->cflags & REG_NLSTOP) || (v->cflags & REG_NLANCH))
399 : {
400 : /* assign newline a unique color */
401 5 : v->nlcolor = subcolor(v->cm, newline());
402 5 : okcolors(v->nfa, v->cm);
403 : }
404 253 : CNOERR();
405 252 : v->tree = parse(v, EOS, PLAIN, v->nfa->init, v->nfa->final);
406 252 : assert(SEE(EOS)); /* even if error; ISERR() => SEE(EOS) */
407 252 : CNOERR();
408 248 : assert(v->tree != NULL);
409 :
410 : /* finish setup of nfa and its subre tree */
411 248 : specialcolors(v->nfa);
412 248 : CNOERR();
413 : #ifdef REG_DEBUG
414 : if (debug != NULL)
415 : {
416 : fprintf(debug, "\n\n\n========= RAW ==========\n");
417 : dumpnfa(v->nfa, debug);
418 : dumpst(v->tree, debug, 1);
419 : }
420 : #endif
421 248 : optst(v, v->tree);
422 248 : v->ntree = numst(v->tree, 1);
423 248 : markst(v->tree);
424 248 : cleanst(v);
425 : #ifdef REG_DEBUG
426 : if (debug != NULL)
427 : {
428 : fprintf(debug, "\n\n\n========= TREE FIXED ==========\n");
429 : dumpst(v->tree, debug, 1);
430 : }
431 : #endif
432 :
433 : /* build compacted NFAs for tree and lacons */
434 248 : re->re_info |= nfatree(v, v->tree, debug);
435 248 : CNOERR();
436 247 : assert(v->nlacons == 0 || v->lacons != NULL);
437 254 : for (i = 1; i < v->nlacons; i++)
438 : {
439 7 : struct subre *lasub = &v->lacons[i];
440 :
441 : #ifdef REG_DEBUG
442 : if (debug != NULL)
443 : fprintf(debug, "\n\n\n========= LA%d ==========\n", i);
444 : #endif
445 :
446 : /* Prepend .* to pattern if it's a lookbehind LACON */
447 7 : nfanode(v, lasub, !LATYPE_IS_AHEAD(lasub->subno), debug);
448 : }
449 247 : CNOERR();
450 247 : if (v->tree->flags & SHORTER)
451 3 : NOTE(REG_USHORTEST);
452 :
453 : /* build compacted NFAs for tree, lacons, fast search */
454 : #ifdef REG_DEBUG
455 : if (debug != NULL)
456 : fprintf(debug, "\n\n\n========= SEARCH ==========\n");
457 : #endif
458 : /* can sacrifice main NFA now, so use it as work area */
459 247 : (DISCARD) optimize(v->nfa, debug);
460 247 : CNOERR();
461 247 : makesearch(v, v->nfa);
462 247 : CNOERR();
463 247 : compact(v->nfa, &g->search);
464 247 : CNOERR();
465 :
466 : /* looks okay, package it up */
467 247 : re->re_nsub = v->nsubexp;
468 247 : v->re = NULL; /* freev no longer frees re */
469 247 : g->magic = GUTSMAGIC;
470 247 : g->cflags = v->cflags;
471 247 : g->info = re->re_info;
472 247 : g->nsub = re->re_nsub;
473 247 : g->tree = v->tree;
474 247 : v->tree = NULL;
475 247 : g->ntree = v->ntree;
476 247 : g->compare = (v->cflags & REG_ICASE) ? casecmp : cmp;
477 247 : g->lacons = v->lacons;
478 247 : v->lacons = NULL;
479 247 : g->nlacons = v->nlacons;
480 :
481 : #ifdef REG_DEBUG
482 : if (flags & REG_DUMP)
483 : dump(re, stdout);
484 : #endif
485 :
486 247 : assert(v->err == 0);
487 247 : return freev(v, 0);
488 : }
489 :
490 : /*
491 : * moresubs - enlarge subRE vector
492 : */
493 : static void
494 0 : moresubs(struct vars *v,
495 : int wanted) /* want enough room for this one */
496 : {
497 : struct subre **p;
498 : size_t n;
499 :
500 0 : assert(wanted > 0 && (size_t) wanted >= v->nsubs);
501 0 : n = (size_t) wanted * 3 / 2 + 1;
502 :
503 0 : if (v->subs == v->sub10)
504 : {
505 0 : p = (struct subre **) MALLOC(n * sizeof(struct subre *));
506 0 : if (p != NULL)
507 0 : memcpy(VS(p), VS(v->subs),
508 0 : v->nsubs * sizeof(struct subre *));
509 : }
510 : else
511 0 : p = (struct subre **) REALLOC(v->subs, n * sizeof(struct subre *));
512 0 : if (p == NULL)
513 : {
514 0 : ERR(REG_ESPACE);
515 0 : return;
516 : }
517 0 : v->subs = p;
518 0 : for (p = &v->subs[v->nsubs]; v->nsubs < n; p++, v->nsubs++)
519 0 : *p = NULL;
520 0 : assert(v->nsubs == n);
521 0 : assert((size_t) wanted < v->nsubs);
522 : }
523 :
524 : /*
525 : * freev - free vars struct's substructures where necessary
526 : *
527 : * Optionally does error-number setting, and always returns error code
528 : * (if any), to make error-handling code terser.
529 : */
530 : static int
531 253 : freev(struct vars *v,
532 : int err)
533 : {
534 253 : if (v->re != NULL)
535 6 : rfree(v->re);
536 253 : if (v->subs != v->sub10)
537 0 : FREE(v->subs);
538 253 : if (v->nfa != NULL)
539 253 : freenfa(v->nfa);
540 253 : if (v->tree != NULL)
541 1 : freesubre(v, v->tree);
542 253 : if (v->treechain != NULL)
543 4 : cleanst(v);
544 253 : if (v->cv != NULL)
545 253 : freecvec(v->cv);
546 253 : if (v->cv2 != NULL)
547 0 : freecvec(v->cv2);
548 253 : if (v->lacons != NULL)
549 0 : freelacons(v->lacons, v->nlacons);
550 253 : ERR(err); /* nop if err==0 */
551 :
552 253 : return v->err;
553 : }
554 :
555 : /*
556 : * makesearch - turn an NFA into a search NFA (implicit prepend of .*?)
557 : * NFA must have been optimize()d already.
558 : */
559 : static void
560 249 : makesearch(struct vars *v,
561 : struct nfa *nfa)
562 : {
563 : struct arc *a;
564 : struct arc *b;
565 249 : struct state *pre = nfa->pre;
566 : struct state *s;
567 : struct state *s2;
568 : struct state *slist;
569 :
570 : /* no loops are needed if it's anchored */
571 641 : for (a = pre->outs; a != NULL; a = a->outchain)
572 : {
573 475 : assert(a->type == PLAIN);
574 475 : if (a->co != nfa->bos[0] && a->co != nfa->bos[1])
575 83 : break;
576 : }
577 249 : if (a != NULL)
578 : {
579 : /* add implicit .* in front */
580 83 : rainbow(nfa, v->cm, PLAIN, COLORLESS, pre, pre);
581 :
582 : /* and ^* and \A* too -- not always necessary, but harmless */
583 83 : newarc(nfa, PLAIN, nfa->bos[0], pre, pre);
584 83 : newarc(nfa, PLAIN, nfa->bos[1], pre, pre);
585 : }
586 :
587 : /*
588 : * Now here's the subtle part. Because many REs have no lookback
589 : * constraints, often knowing when you were in the pre state tells you
590 : * little; it's the next state(s) that are informative. But some of them
591 : * may have other inarcs, i.e. it may be possible to make actual progress
592 : * and then return to one of them. We must de-optimize such cases,
593 : * splitting each such state into progress and no-progress states.
594 : */
595 :
596 : /* first, make a list of the states reachable from pre and elsewhere */
597 249 : slist = NULL;
598 1484 : for (a = pre->outs; a != NULL; a = a->outchain)
599 : {
600 1235 : s = a->to;
601 8248 : for (b = s->ins; b != NULL; b = b->inchain)
602 : {
603 7215 : if (b->from != pre)
604 202 : break;
605 : }
606 :
607 : /*
608 : * We want to mark states as being in the list already by having non
609 : * NULL tmp fields, but we can't just store the old slist value in tmp
610 : * because that doesn't work for the first such state. Instead, the
611 : * first list entry gets its own address in tmp.
612 : */
613 1235 : if (b != NULL && s->tmp == NULL)
614 : {
615 55 : s->tmp = (slist != NULL) ? slist : s;
616 55 : slist = s;
617 : }
618 : }
619 :
620 : /* do the splits */
621 304 : for (s = slist; s != NULL; s = s2)
622 : {
623 55 : s2 = newstate(nfa);
624 55 : NOERR();
625 55 : copyouts(nfa, s, s2);
626 55 : NOERR();
627 1590 : for (a = s->ins; a != NULL; a = b)
628 : {
629 1535 : b = a->inchain;
630 1535 : if (a->from != pre)
631 : {
632 1333 : cparc(nfa, a, a->from, s2);
633 1333 : freearc(nfa, a);
634 : }
635 : }
636 55 : s2 = (s->tmp != s) ? s->tmp : NULL;
637 55 : s->tmp = NULL; /* clean up while we're at it */
638 : }
639 : }
640 :
641 : /*
642 : * parse - parse an RE
643 : *
644 : * This is actually just the top level, which parses a bunch of branches
645 : * tied together with '|'. They appear in the tree as the left children
646 : * of a chain of '|' subres.
647 : */
648 : static struct subre *
649 512 : parse(struct vars *v,
650 : int stopper, /* EOS or ')' */
651 : int type, /* LACON (lookaround subRE) or PLAIN */
652 : struct state *init, /* initial state */
653 : struct state *final) /* final state */
654 : {
655 : struct state *left; /* scaffolding for branch */
656 : struct state *right;
657 : struct subre *branches; /* top level */
658 : struct subre *branch; /* current branch */
659 : struct subre *t; /* temporary */
660 : int firstbranch; /* is this the first branch? */
661 :
662 512 : assert(stopper == ')' || stopper == EOS);
663 :
664 512 : branches = subre(v, '|', LONGER, init, final);
665 512 : NOERRN();
666 512 : branch = branches;
667 512 : firstbranch = 1;
668 : do
669 : { /* a branch */
670 538 : if (!firstbranch)
671 : {
672 : /* need a place to hang it */
673 26 : branch->right = subre(v, '|', LONGER, init, final);
674 26 : NOERRN();
675 26 : branch = branch->right;
676 : }
677 538 : firstbranch = 0;
678 538 : left = newstate(v->nfa);
679 538 : right = newstate(v->nfa);
680 538 : NOERRN();
681 538 : EMPTYARC(init, left);
682 538 : EMPTYARC(right, final);
683 538 : NOERRN();
684 538 : branch->left = parsebranch(v, stopper, type, left, right, 0);
685 538 : NOERRN();
686 531 : branch->flags |= UP(branch->flags | branch->left->flags);
687 531 : if ((branch->flags & ~branches->flags) != 0) /* new flags */
688 6 : for (t = branches; t != branch; t = t->right)
689 3 : t->flags |= branch->flags;
690 531 : } while (EAT('|'));
691 505 : assert(SEE(stopper) || SEE(EOS));
692 :
693 505 : if (!SEE(stopper))
694 : {
695 1 : assert(stopper == ')' && SEE(EOS));
696 1 : ERR(REG_EPAREN);
697 : }
698 :
699 : /* optimize out simple cases */
700 505 : if (branch == branches)
701 : { /* only one branch */
702 479 : assert(branch->right == NULL);
703 479 : t = branch->left;
704 479 : branch->left = NULL;
705 479 : freesubre(v, branches);
706 479 : branches = t;
707 : }
708 26 : else if (!MESSY(branches->flags))
709 : { /* no interesting innards */
710 7 : freesubre(v, branches->left);
711 7 : branches->left = NULL;
712 7 : freesubre(v, branches->right);
713 7 : branches->right = NULL;
714 7 : branches->op = '=';
715 : }
716 :
717 505 : return branches;
718 : }
719 :
720 : /*
721 : * parsebranch - parse one branch of an RE
722 : *
723 : * This mostly manages concatenation, working closely with parseqatom().
724 : * Concatenated things are bundled up as much as possible, with separate
725 : * ',' nodes introduced only when necessary due to substructure.
726 : */
727 : static struct subre *
728 715 : parsebranch(struct vars *v,
729 : int stopper, /* EOS or ')' */
730 : int type, /* LACON (lookaround subRE) or PLAIN */
731 : struct state *left, /* leftmost state */
732 : struct state *right, /* rightmost state */
733 : int partial) /* is this only part of a branch? */
734 : {
735 : struct state *lp; /* left end of current construct */
736 : int seencontent; /* is there anything in this branch yet? */
737 : struct subre *t;
738 :
739 715 : lp = left;
740 715 : seencontent = 0;
741 715 : t = subre(v, '=', 0, left, right); /* op '=' is tentative */
742 715 : NOERRN();
743 7044 : while (!SEE('|') && !SEE(stopper) && !SEE(EOS))
744 : {
745 5624 : if (seencontent)
746 : { /* implicit concat operator */
747 4932 : lp = newstate(v->nfa);
748 4932 : NOERRN();
749 4932 : moveins(v->nfa, right, lp);
750 : }
751 5624 : seencontent = 1;
752 :
753 : /* NB, recursion in parseqatom() may swallow rest of branch */
754 5624 : parseqatom(v, stopper, type, lp, right, t);
755 5624 : NOERRN();
756 : }
757 :
758 705 : if (!seencontent)
759 : { /* empty branch */
760 23 : if (!partial)
761 23 : NOTE(REG_UUNSPEC);
762 23 : assert(lp == left);
763 23 : EMPTYARC(left, right);
764 : }
765 :
766 705 : return t;
767 : }
768 :
769 : /*
770 : * parseqatom - parse one quantified atom or constraint of an RE
771 : *
772 : * The bookkeeping near the end cooperates very closely with parsebranch();
773 : * in particular, it contains a recursion that can involve parsing the rest
774 : * of the branch, making this function's name somewhat inaccurate.
775 : */
776 : static void
777 5624 : parseqatom(struct vars *v,
778 : int stopper, /* EOS or ')' */
779 : int type, /* LACON (lookaround subRE) or PLAIN */
780 : struct state *lp, /* left state to hang it on */
781 : struct state *rp, /* right state to hang it on */
782 : struct subre *top) /* subtree top */
783 : {
784 : struct state *s; /* temporaries for new states */
785 : struct state *s2;
786 :
787 : #define ARCV(t, val) newarc(v->nfa, t, val, lp, rp)
788 : int m,
789 : n;
790 : struct subre *atom; /* atom's subtree */
791 : struct subre *t;
792 : int cap; /* capturing parens? */
793 : int latype; /* lookaround constraint type */
794 : int subno; /* capturing-parens or backref number */
795 : int atomtype;
796 : int qprefer; /* quantifier short/long preference */
797 : int f;
798 : struct subre **atomp; /* where the pointer to atom is */
799 :
800 : /* initial bookkeeping */
801 5624 : atom = NULL;
802 5624 : assert(lp->nouts == 0); /* must string new code */
803 5624 : assert(rp->nins == 0); /* between lp and rp */
804 5624 : subno = 0; /* just to shut lint up */
805 :
806 : /* an atom or constraint... */
807 5624 : atomtype = v->nexttype;
808 5624 : switch (atomtype)
809 : {
810 : /* first, constraints, which end by returning */
811 : case '^':
812 175 : ARCV('^', 1);
813 175 : if (v->cflags & REG_NLANCH)
814 3 : ARCV(BEHIND, v->nlcolor);
815 175 : NEXT();
816 175 : return;
817 : break;
818 : case '$':
819 152 : ARCV('$', 1);
820 152 : if (v->cflags & REG_NLANCH)
821 3 : ARCV(AHEAD, v->nlcolor);
822 152 : NEXT();
823 152 : return;
824 : break;
825 : case SBEGIN:
826 0 : ARCV('^', 1); /* BOL */
827 0 : ARCV('^', 0); /* or BOS */
828 0 : NEXT();
829 0 : return;
830 : break;
831 : case SEND:
832 0 : ARCV('$', 1); /* EOL */
833 0 : ARCV('$', 0); /* or EOS */
834 0 : NEXT();
835 0 : return;
836 : break;
837 : case '<':
838 1 : wordchrs(v); /* does NEXT() */
839 1 : s = newstate(v->nfa);
840 1 : NOERR();
841 1 : nonword(v, BEHIND, lp, s);
842 1 : word(v, AHEAD, s, rp);
843 1 : return;
844 : break;
845 : case '>':
846 1 : wordchrs(v); /* does NEXT() */
847 1 : s = newstate(v->nfa);
848 1 : NOERR();
849 1 : word(v, BEHIND, lp, s);
850 1 : nonword(v, AHEAD, s, rp);
851 1 : return;
852 : break;
853 : case WBDRY:
854 0 : wordchrs(v); /* does NEXT() */
855 0 : s = newstate(v->nfa);
856 0 : NOERR();
857 0 : nonword(v, BEHIND, lp, s);
858 0 : word(v, AHEAD, s, rp);
859 0 : s = newstate(v->nfa);
860 0 : NOERR();
861 0 : word(v, BEHIND, lp, s);
862 0 : nonword(v, AHEAD, s, rp);
863 0 : return;
864 : break;
865 : case NWBDRY:
866 2 : wordchrs(v); /* does NEXT() */
867 2 : s = newstate(v->nfa);
868 2 : NOERR();
869 2 : word(v, BEHIND, lp, s);
870 2 : word(v, AHEAD, s, rp);
871 2 : s = newstate(v->nfa);
872 2 : NOERR();
873 2 : nonword(v, BEHIND, lp, s);
874 2 : nonword(v, AHEAD, s, rp);
875 2 : return;
876 : break;
877 : case LACON: /* lookaround constraint */
878 27 : latype = v->nextvalue;
879 27 : NEXT();
880 27 : s = newstate(v->nfa);
881 27 : s2 = newstate(v->nfa);
882 27 : NOERR();
883 27 : t = parse(v, ')', LACON, s, s2);
884 27 : freesubre(v, t); /* internal structure irrelevant */
885 27 : NOERR();
886 25 : assert(SEE(')'));
887 25 : NEXT();
888 25 : processlacon(v, s, s2, latype, lp, rp);
889 25 : return;
890 : break;
891 : /* then errors, to get them out of the way */
892 : case '*':
893 : case '+':
894 : case '?':
895 : case '{':
896 0 : ERR(REG_BADRPT);
897 0 : return;
898 : break;
899 : default:
900 0 : ERR(REG_ASSERT);
901 0 : return;
902 : break;
903 : /* then plain characters, and minor variants on that theme */
904 : case ')': /* unbalanced paren */
905 0 : if ((v->cflags & REG_ADVANCED) != REG_EXTENDED)
906 : {
907 0 : ERR(REG_EPAREN);
908 0 : return;
909 : }
910 : /* legal in EREs due to specification botch */
911 0 : NOTE(REG_UPBOTCH);
912 : /* fallthrough into case PLAIN */
913 : case PLAIN:
914 4962 : onechr(v, v->nextvalue, lp, rp);
915 4962 : okcolors(v->nfa, v->cm);
916 4962 : NOERR();
917 4962 : NEXT();
918 4962 : break;
919 : case '[':
920 33 : if (v->nextvalue == 1)
921 25 : bracket(v, lp, rp);
922 : else
923 8 : cbracket(v, lp, rp);
924 33 : assert(SEE(']') || ISERR());
925 33 : NEXT();
926 33 : break;
927 : case '.':
928 30 : rainbow(v->nfa, v->cm, PLAIN,
929 30 : (v->cflags & REG_NLSTOP) ? v->nlcolor : COLORLESS,
930 : lp, rp);
931 27 : NEXT();
932 27 : break;
933 : /* and finally the ugly stuff */
934 : case '(': /* value flags as capturing or non */
935 233 : cap = (type == LACON) ? 0 : v->nextvalue;
936 233 : if (cap)
937 : {
938 225 : v->nsubexp++;
939 225 : subno = v->nsubexp;
940 225 : if ((size_t) subno >= v->nsubs)
941 0 : moresubs(v, subno);
942 225 : assert((size_t) subno < v->nsubs);
943 : }
944 : else
945 8 : atomtype = PLAIN; /* something that's not '(' */
946 233 : NEXT();
947 : /* need new endpoints because tree will contain pointers */
948 233 : s = newstate(v->nfa);
949 233 : s2 = newstate(v->nfa);
950 233 : NOERR();
951 233 : EMPTYARC(lp, s);
952 233 : EMPTYARC(s2, rp);
953 233 : NOERR();
954 233 : atom = parse(v, ')', type, s, s2);
955 233 : assert(SEE(')') || ISERR());
956 233 : NEXT();
957 233 : NOERR();
958 231 : if (cap)
959 : {
960 224 : v->subs[subno] = atom;
961 224 : t = subre(v, '(', atom->flags | CAP, lp, rp);
962 224 : NOERR();
963 224 : t->subno = subno;
964 224 : t->left = atom;
965 224 : atom = t;
966 : }
967 : /* postpone everything else pending possible {0} */
968 231 : break;
969 : case BACKREF: /* the Feature From The Black Lagoon */
970 11 : INSIST(type != LACON, REG_ESUBREG);
971 11 : INSIST(v->nextvalue < v->nsubs, REG_ESUBREG);
972 11 : INSIST(v->subs[v->nextvalue] != NULL, REG_ESUBREG);
973 11 : NOERR();
974 9 : assert(v->nextvalue > 0);
975 9 : atom = subre(v, 'b', BACKR, lp, rp);
976 9 : NOERR();
977 9 : subno = v->nextvalue;
978 9 : atom->subno = subno;
979 9 : EMPTYARC(lp, rp); /* temporarily, so there's something */
980 9 : NEXT();
981 9 : break;
982 : }
983 :
984 : /* ...and an atom may be followed by a quantifier */
985 5262 : switch (v->nexttype)
986 : {
987 : case '*':
988 3042 : m = 0;
989 3042 : n = DUPINF;
990 3042 : qprefer = (v->nextvalue) ? LONGER : SHORTER;
991 3042 : NEXT();
992 3042 : break;
993 : case '+':
994 57 : m = 1;
995 57 : n = DUPINF;
996 57 : qprefer = (v->nextvalue) ? LONGER : SHORTER;
997 57 : NEXT();
998 57 : break;
999 : case '?':
1000 6 : m = 0;
1001 6 : n = 1;
1002 6 : qprefer = (v->nextvalue) ? LONGER : SHORTER;
1003 6 : NEXT();
1004 6 : break;
1005 : case '{':
1006 4 : NEXT();
1007 4 : m = scannum(v);
1008 4 : if (EAT(','))
1009 : {
1010 1 : if (SEE(DIGIT))
1011 1 : n = scannum(v);
1012 : else
1013 0 : n = DUPINF;
1014 1 : if (m > n)
1015 : {
1016 1 : ERR(REG_BADBR);
1017 1 : return;
1018 : }
1019 : /* {m,n} exercises preference, even if it's {m,m} */
1020 0 : qprefer = (v->nextvalue) ? LONGER : SHORTER;
1021 : }
1022 : else
1023 : {
1024 3 : n = m;
1025 : /* {m} passes operand's preference through */
1026 3 : qprefer = 0;
1027 : }
1028 3 : if (!SEE('}'))
1029 : { /* catches errors too */
1030 0 : ERR(REG_BADBR);
1031 0 : return;
1032 : }
1033 3 : NEXT();
1034 3 : break;
1035 : default: /* no quantifier */
1036 2153 : m = n = 1;
1037 2153 : qprefer = 0;
1038 2153 : break;
1039 : }
1040 :
1041 : /* annoying special case: {0} or {0,0} cancels everything */
1042 5261 : if (m == 0 && n == 0)
1043 : {
1044 0 : if (atom != NULL)
1045 0 : freesubre(v, atom);
1046 0 : if (atomtype == '(')
1047 0 : v->subs[subno] = NULL;
1048 0 : delsub(v->nfa, lp, rp);
1049 0 : EMPTYARC(lp, rp);
1050 0 : return;
1051 : }
1052 :
1053 : /* if not a messy case, avoid hard part */
1054 5261 : assert(!MESSY(top->flags));
1055 5261 : f = top->flags | qprefer | ((atom != NULL) ? atom->flags : 0);
1056 5261 : if (atomtype != '(' && atomtype != BACKREF && !MESSY(UP(f)))
1057 : {
1058 5025 : if (!(m == 1 && n == 1))
1059 3066 : repeat(v, lp, rp, m, n);
1060 5025 : if (atom != NULL)
1061 3 : freesubre(v, atom);
1062 5025 : top->flags = f;
1063 5025 : return;
1064 : }
1065 :
1066 : /*
1067 : * hard part: something messy
1068 : *
1069 : * That is, capturing parens, back reference, short/long clash, or an atom
1070 : * with substructure containing one of those.
1071 : */
1072 :
1073 : /* now we'll need a subre for the contents even if they're boring */
1074 236 : if (atom == NULL)
1075 : {
1076 0 : atom = subre(v, '=', 0, lp, rp);
1077 0 : NOERR();
1078 : }
1079 :
1080 : /*----------
1081 : * Prepare a general-purpose state skeleton.
1082 : *
1083 : * In the no-backrefs case, we want this:
1084 : *
1085 : * [lp] ---> [s] ---prefix---> [begin] ---atom---> [end] ---rest---> [rp]
1086 : *
1087 : * where prefix is some repetitions of atom. In the general case we need
1088 : *
1089 : * [lp] ---> [s] ---iterator---> [s2] ---rest---> [rp]
1090 : *
1091 : * where the iterator wraps around [begin] ---atom---> [end]
1092 : *
1093 : * We make the s state here for both cases; s2 is made below if needed
1094 : *----------
1095 : */
1096 236 : s = newstate(v->nfa); /* first, new endpoints for the atom */
1097 236 : s2 = newstate(v->nfa);
1098 236 : NOERR();
1099 236 : moveouts(v->nfa, lp, s);
1100 236 : moveins(v->nfa, rp, s2);
1101 236 : NOERR();
1102 236 : atom->begin = s;
1103 236 : atom->end = s2;
1104 236 : s = newstate(v->nfa); /* set up starting state */
1105 236 : NOERR();
1106 236 : EMPTYARC(lp, s);
1107 236 : NOERR();
1108 :
1109 : /* break remaining subRE into x{...} and what follows */
1110 236 : t = subre(v, '.', COMBINE(qprefer, atom->flags), lp, rp);
1111 236 : NOERR();
1112 236 : t->left = atom;
1113 236 : atomp = &t->left;
1114 :
1115 : /* here we should recurse... but we must postpone that to the end */
1116 :
1117 : /* split top into prefix and remaining */
1118 236 : assert(top->op == '=' && top->left == NULL && top->right == NULL);
1119 236 : top->left = subre(v, '=', top->flags, top->begin, lp);
1120 236 : NOERR();
1121 236 : top->op = '.';
1122 236 : top->right = t;
1123 :
1124 : /* if it's a backref, now is the time to replicate the subNFA */
1125 236 : if (atomtype == BACKREF)
1126 : {
1127 9 : assert(atom->begin->nouts == 1); /* just the EMPTY */
1128 9 : delsub(v->nfa, atom->begin, atom->end);
1129 9 : assert(v->subs[subno] != NULL);
1130 :
1131 : /*
1132 : * And here's why the recursion got postponed: it must wait until the
1133 : * skeleton is filled in, because it may hit a backref that wants to
1134 : * copy the filled-in skeleton.
1135 : */
1136 9 : dupnfa(v->nfa, v->subs[subno]->begin, v->subs[subno]->end,
1137 : atom->begin, atom->end);
1138 9 : NOERR();
1139 : }
1140 :
1141 : /*
1142 : * It's quantifier time. If the atom is just a backref, we'll let it deal
1143 : * with quantifiers internally.
1144 : */
1145 236 : if (atomtype == BACKREF)
1146 : {
1147 : /* special case: backrefs have internal quantifiers */
1148 9 : EMPTYARC(s, atom->begin); /* empty prefix */
1149 : /* just stuff everything into atom */
1150 9 : repeat(v, atom->begin, atom->end, m, n);
1151 9 : atom->min = (short) m;
1152 9 : atom->max = (short) n;
1153 9 : atom->flags |= COMBINE(qprefer, atom->flags);
1154 : /* rest of branch can be strung starting from atom->end */
1155 9 : s2 = atom->end;
1156 : }
1157 227 : else if (m == 1 && n == 1)
1158 : {
1159 : /* no/vacuous quantifier: done */
1160 186 : EMPTYARC(s, atom->begin); /* empty prefix */
1161 : /* rest of branch can be strung starting from atom->end */
1162 186 : s2 = atom->end;
1163 : }
1164 41 : else if (m > 0 && !(atom->flags & BACKR))
1165 : {
1166 : /*
1167 : * If there's no backrefs involved, we can turn x{m,n} into
1168 : * x{m-1,n-1}x, with capturing parens in only the second x. This is
1169 : * valid because we only care about capturing matches from the final
1170 : * iteration of the quantifier. It's a win because we can implement
1171 : * the backref-free left side as a plain DFA node, since we don't
1172 : * really care where its submatches are.
1173 : */
1174 27 : dupnfa(v->nfa, atom->begin, atom->end, s, atom->begin);
1175 27 : assert(m >= 1 && m != DUPINF && n >= 1);
1176 27 : repeat(v, s, atom->begin, m - 1, (n == DUPINF) ? n : n - 1);
1177 27 : f = COMBINE(qprefer, atom->flags);
1178 27 : t = subre(v, '.', f, s, atom->end); /* prefix and atom */
1179 27 : NOERR();
1180 27 : t->left = subre(v, '=', PREF(f), s, atom->begin);
1181 27 : NOERR();
1182 27 : t->right = atom;
1183 27 : *atomp = t;
1184 : /* rest of branch can be strung starting from atom->end */
1185 27 : s2 = atom->end;
1186 : }
1187 : else
1188 : {
1189 : /* general case: need an iteration node */
1190 14 : s2 = newstate(v->nfa);
1191 14 : NOERR();
1192 14 : moveouts(v->nfa, atom->end, s2);
1193 14 : NOERR();
1194 14 : dupnfa(v->nfa, atom->begin, atom->end, s, s2);
1195 14 : repeat(v, s, s2, m, n);
1196 14 : f = COMBINE(qprefer, atom->flags);
1197 14 : t = subre(v, '*', f, s, s2);
1198 14 : NOERR();
1199 14 : t->min = (short) m;
1200 14 : t->max = (short) n;
1201 14 : t->left = atom;
1202 14 : *atomp = t;
1203 : /* rest of branch is to be strung from iteration's end state */
1204 : }
1205 :
1206 : /* and finally, look after that postponed recursion */
1207 236 : t = top->right;
1208 236 : if (!(SEE('|') || SEE(stopper) || SEE(EOS)))
1209 177 : t->right = parsebranch(v, stopper, type, s2, rp, 1);
1210 : else
1211 : {
1212 59 : EMPTYARC(s2, rp);
1213 59 : t->right = subre(v, '=', 0, s2, rp);
1214 : }
1215 236 : NOERR();
1216 233 : assert(SEE('|') || SEE(stopper) || SEE(EOS));
1217 233 : t->flags |= COMBINE(t->flags, t->right->flags);
1218 233 : top->flags |= COMBINE(top->flags, t->flags);
1219 : }
1220 :
1221 : /*
1222 : * nonword - generate arcs for non-word-character ahead or behind
1223 : */
1224 : static void
1225 6 : nonword(struct vars *v,
1226 : int dir, /* AHEAD or BEHIND */
1227 : struct state *lp,
1228 : struct state *rp)
1229 : {
1230 6 : int anchor = (dir == AHEAD) ? '$' : '^';
1231 :
1232 6 : assert(dir == AHEAD || dir == BEHIND);
1233 6 : newarc(v->nfa, anchor, 1, lp, rp);
1234 6 : newarc(v->nfa, anchor, 0, lp, rp);
1235 6 : colorcomplement(v->nfa, v->cm, dir, v->wordchrs, lp, rp);
1236 : /* (no need for special attention to \n) */
1237 6 : }
1238 :
1239 : /*
1240 : * word - generate arcs for word character ahead or behind
1241 : */
1242 : static void
1243 6 : word(struct vars *v,
1244 : int dir, /* AHEAD or BEHIND */
1245 : struct state *lp,
1246 : struct state *rp)
1247 : {
1248 6 : assert(dir == AHEAD || dir == BEHIND);
1249 6 : cloneouts(v->nfa, v->wordchrs, lp, rp, dir);
1250 : /* (no need for special attention to \n) */
1251 6 : }
1252 :
1253 : /*
1254 : * scannum - scan a number
1255 : */
1256 : static int /* value, <= DUPMAX */
1257 5 : scannum(struct vars *v)
1258 : {
1259 5 : int n = 0;
1260 :
1261 15 : while (SEE(DIGIT) && n < DUPMAX)
1262 : {
1263 5 : n = n * 10 + v->nextvalue;
1264 5 : NEXT();
1265 : }
1266 5 : if (SEE(DIGIT) || n > DUPMAX)
1267 : {
1268 0 : ERR(REG_BADBR);
1269 0 : return 0;
1270 : }
1271 5 : return n;
1272 : }
1273 :
1274 : /*
1275 : * repeat - replicate subNFA for quantifiers
1276 : *
1277 : * The sub-NFA strung from lp to rp is modified to represent m to n
1278 : * repetitions of its initial contents.
1279 : *
1280 : * The duplication sequences used here are chosen carefully so that any
1281 : * pointers starting out pointing into the subexpression end up pointing into
1282 : * the last occurrence. (Note that it may not be strung between the same
1283 : * left and right end states, however!) This used to be important for the
1284 : * subRE tree, although the important bits are now handled by the in-line
1285 : * code in parse(), and when this is called, it doesn't matter any more.
1286 : */
1287 : static void
1288 3123 : repeat(struct vars *v,
1289 : struct state *lp,
1290 : struct state *rp,
1291 : int m,
1292 : int n)
1293 : {
1294 : #define SOME 2
1295 : #define INF 3
1296 : #define PAIR(x, y) ((x)*4 + (y))
1297 : #define REDUCE(x) ( ((x) == DUPINF) ? INF : (((x) > 1) ? SOME : (x)) )
1298 3123 : const int rm = REDUCE(m);
1299 3123 : const int rn = REDUCE(n);
1300 : struct state *s;
1301 : struct state *s2;
1302 :
1303 3123 : switch (PAIR(rm, rn))
1304 : {
1305 : case PAIR(0, 0): /* empty string */
1306 0 : delsub(v->nfa, lp, rp);
1307 0 : EMPTYARC(lp, rp);
1308 0 : break;
1309 : case PAIR(0, 1): /* do as x| */
1310 6 : EMPTYARC(lp, rp);
1311 6 : break;
1312 : case PAIR(0, SOME): /* do as x{1,n}| */
1313 0 : repeat(v, lp, rp, 1, n);
1314 0 : NOERR();
1315 0 : EMPTYARC(lp, rp);
1316 0 : break;
1317 : case PAIR(0, INF): /* loop x around */
1318 3069 : s = newstate(v->nfa);
1319 3069 : NOERR();
1320 3069 : moveouts(v->nfa, lp, s);
1321 3069 : moveins(v->nfa, rp, s);
1322 3069 : EMPTYARC(lp, s);
1323 3069 : EMPTYARC(s, rp);
1324 3069 : break;
1325 : case PAIR(1, 1): /* no action required */
1326 11 : break;
1327 : case PAIR(1, SOME): /* do as x{0,n-1}x = (x{1,n-1}|)x */
1328 0 : s = newstate(v->nfa);
1329 0 : NOERR();
1330 0 : moveouts(v->nfa, lp, s);
1331 0 : dupnfa(v->nfa, s, rp, lp, s);
1332 0 : NOERR();
1333 0 : repeat(v, lp, s, 1, n - 1);
1334 0 : NOERR();
1335 0 : EMPTYARC(lp, s);
1336 0 : break;
1337 : case PAIR(1, INF): /* add loopback arc */
1338 30 : s = newstate(v->nfa);
1339 30 : s2 = newstate(v->nfa);
1340 30 : NOERR();
1341 30 : moveouts(v->nfa, lp, s);
1342 30 : moveins(v->nfa, rp, s2);
1343 30 : EMPTYARC(lp, s);
1344 30 : EMPTYARC(s2, rp);
1345 30 : EMPTYARC(s2, s);
1346 30 : break;
1347 : case PAIR(SOME, SOME): /* do as x{m-1,n-1}x */
1348 7 : s = newstate(v->nfa);
1349 7 : NOERR();
1350 7 : moveouts(v->nfa, lp, s);
1351 7 : dupnfa(v->nfa, s, rp, lp, s);
1352 7 : NOERR();
1353 7 : repeat(v, lp, s, m - 1, n - 1);
1354 7 : break;
1355 : case PAIR(SOME, INF): /* do as x{m-1,}x */
1356 0 : s = newstate(v->nfa);
1357 0 : NOERR();
1358 0 : moveouts(v->nfa, lp, s);
1359 0 : dupnfa(v->nfa, s, rp, lp, s);
1360 0 : NOERR();
1361 0 : repeat(v, lp, s, m - 1, n);
1362 0 : break;
1363 : default:
1364 0 : ERR(REG_ASSERT);
1365 0 : break;
1366 : }
1367 : }
1368 :
1369 : /*
1370 : * bracket - handle non-complemented bracket expression
1371 : * Also called from cbracket for complemented bracket expressions.
1372 : */
1373 : static void
1374 36 : bracket(struct vars *v,
1375 : struct state *lp,
1376 : struct state *rp)
1377 : {
1378 36 : assert(SEE('['));
1379 36 : NEXT();
1380 141 : while (!SEE(']') && !SEE(EOS))
1381 69 : brackpart(v, lp, rp);
1382 36 : assert(SEE(']') || ISERR());
1383 36 : okcolors(v->nfa, v->cm);
1384 36 : }
1385 :
1386 : /*
1387 : * cbracket - handle complemented bracket expression
1388 : * We do it by calling bracket() with dummy endpoints, and then complementing
1389 : * the result. The alternative would be to invoke rainbow(), and then delete
1390 : * arcs as the b.e. is seen... but that gets messy.
1391 : */
1392 : static void
1393 8 : cbracket(struct vars *v,
1394 : struct state *lp,
1395 : struct state *rp)
1396 : {
1397 8 : struct state *left = newstate(v->nfa);
1398 8 : struct state *right = newstate(v->nfa);
1399 :
1400 8 : NOERR();
1401 8 : bracket(v, left, right);
1402 8 : if (v->cflags & REG_NLSTOP)
1403 0 : newarc(v->nfa, PLAIN, v->nlcolor, left, right);
1404 8 : NOERR();
1405 :
1406 8 : assert(lp->nouts == 0); /* all outarcs will be ours */
1407 :
1408 : /*
1409 : * Easy part of complementing, and all there is to do since the MCCE code
1410 : * was removed.
1411 : */
1412 8 : colorcomplement(v->nfa, v->cm, PLAIN, left, lp, rp);
1413 8 : NOERR();
1414 8 : dropstate(v->nfa, left);
1415 8 : assert(right->nins == 0);
1416 8 : freestate(v->nfa, right);
1417 : }
1418 :
1419 : /*
1420 : * brackpart - handle one item (or range) within a bracket expression
1421 : */
1422 : static void
1423 69 : brackpart(struct vars *v,
1424 : struct state *lp,
1425 : struct state *rp)
1426 : {
1427 : chr startc;
1428 : chr endc;
1429 : struct cvec *cv;
1430 : const chr *startp;
1431 : const chr *endp;
1432 : chr c[1];
1433 :
1434 : /* parse something, get rid of special cases, take shortcuts */
1435 69 : switch (v->nexttype)
1436 : {
1437 : case RANGE: /* a-b-c or other botch */
1438 0 : ERR(REG_ERANGE);
1439 55 : return;
1440 : break;
1441 : case PLAIN:
1442 51 : c[0] = v->nextvalue;
1443 51 : NEXT();
1444 : /* shortcut for ordinary chr (not range) */
1445 51 : if (!SEE(RANGE))
1446 : {
1447 37 : onechr(v, c[0], lp, rp);
1448 37 : return;
1449 : }
1450 14 : startc = element(v, c, c + 1);
1451 14 : NOERR();
1452 14 : break;
1453 : case COLLEL:
1454 0 : startp = v->now;
1455 0 : endp = scanplain(v);
1456 0 : INSIST(startp < endp, REG_ECOLLATE);
1457 0 : NOERR();
1458 0 : startc = element(v, startp, endp);
1459 0 : NOERR();
1460 0 : break;
1461 : case ECLASS:
1462 0 : startp = v->now;
1463 0 : endp = scanplain(v);
1464 0 : INSIST(startp < endp, REG_ECOLLATE);
1465 0 : NOERR();
1466 0 : startc = element(v, startp, endp);
1467 0 : NOERR();
1468 0 : cv = eclass(v, startc, (v->cflags & REG_ICASE));
1469 0 : NOERR();
1470 0 : subcolorcvec(v, cv, lp, rp);
1471 0 : return;
1472 : break;
1473 : case CCLASS:
1474 18 : startp = v->now;
1475 18 : endp = scanplain(v);
1476 18 : INSIST(startp < endp, REG_ECTYPE);
1477 18 : NOERR();
1478 18 : cv = cclass(v, startp, endp, (v->cflags & REG_ICASE));
1479 18 : NOERR();
1480 18 : subcolorcvec(v, cv, lp, rp);
1481 18 : return;
1482 : break;
1483 : default:
1484 0 : ERR(REG_ASSERT);
1485 0 : return;
1486 : break;
1487 : }
1488 :
1489 14 : if (SEE(RANGE))
1490 : {
1491 14 : NEXT();
1492 14 : switch (v->nexttype)
1493 : {
1494 : case PLAIN:
1495 : case RANGE:
1496 14 : c[0] = v->nextvalue;
1497 14 : NEXT();
1498 14 : endc = element(v, c, c + 1);
1499 14 : NOERR();
1500 14 : break;
1501 : case COLLEL:
1502 0 : startp = v->now;
1503 0 : endp = scanplain(v);
1504 0 : INSIST(startp < endp, REG_ECOLLATE);
1505 0 : NOERR();
1506 0 : endc = element(v, startp, endp);
1507 0 : NOERR();
1508 0 : break;
1509 : default:
1510 0 : ERR(REG_ERANGE);
1511 0 : return;
1512 : break;
1513 : }
1514 : }
1515 : else
1516 0 : endc = startc;
1517 :
1518 : /*
1519 : * Ranges are unportable. Actually, standard C does guarantee that digits
1520 : * are contiguous, but making that an exception is just too complicated.
1521 : */
1522 14 : if (startc != endc)
1523 14 : NOTE(REG_UUNPORT);
1524 14 : cv = range(v, startc, endc, (v->cflags & REG_ICASE));
1525 14 : NOERR();
1526 14 : subcolorcvec(v, cv, lp, rp);
1527 : }
1528 :
1529 : /*
1530 : * scanplain - scan PLAIN contents of [. etc.
1531 : *
1532 : * Certain bits of trickery in lex.c know that this code does not try
1533 : * to look past the final bracket of the [. etc.
1534 : */
1535 : static const chr * /* just after end of sequence */
1536 18 : scanplain(struct vars *v)
1537 : {
1538 : const chr *endp;
1539 :
1540 18 : assert(SEE(COLLEL) || SEE(ECLASS) || SEE(CCLASS));
1541 18 : NEXT();
1542 :
1543 18 : endp = v->now;
1544 126 : while (SEE(PLAIN))
1545 : {
1546 90 : endp = v->now;
1547 90 : NEXT();
1548 : }
1549 :
1550 18 : assert(SEE(END) || ISERR());
1551 18 : NEXT();
1552 :
1553 18 : return endp;
1554 : }
1555 :
1556 : /*
1557 : * onechr - fill in arcs for a plain character, and possible case complements
1558 : * This is mostly a shortcut for efficient handling of the common case.
1559 : */
1560 : static void
1561 4999 : onechr(struct vars *v,
1562 : chr c,
1563 : struct state *lp,
1564 : struct state *rp)
1565 : {
1566 4999 : if (!(v->cflags & REG_ICASE))
1567 : {
1568 4983 : color lastsubcolor = COLORLESS;
1569 :
1570 4983 : subcoloronechr(v, c, lp, rp, &lastsubcolor);
1571 9982 : return;
1572 : }
1573 :
1574 : /* rats, need general case anyway... */
1575 16 : subcolorcvec(v, allcases(v, c), lp, rp);
1576 : }
1577 :
1578 : /*
1579 : * wordchrs - set up word-chr list for word-boundary stuff, if needed
1580 : *
1581 : * The list is kept as a bunch of arcs between two dummy states; it's
1582 : * disposed of by the unreachable-states sweep in NFA optimization.
1583 : * Does NEXT(). Must not be called from any unusual lexical context.
1584 : * This should be reconciled with the \w etc. handling in lex.c, and
1585 : * should be cleaned up to reduce dependencies on input scanning.
1586 : */
1587 : static void
1588 4 : wordchrs(struct vars *v)
1589 : {
1590 : struct state *left;
1591 : struct state *right;
1592 :
1593 4 : if (v->wordchrs != NULL)
1594 : {
1595 1 : NEXT(); /* for consistency */
1596 1 : return;
1597 : }
1598 :
1599 3 : left = newstate(v->nfa);
1600 3 : right = newstate(v->nfa);
1601 3 : NOERR();
1602 : /* fine point: implemented with [::], and lexer will set REG_ULOCALE */
1603 3 : lexword(v);
1604 3 : NEXT();
1605 3 : assert(v->savenow != NULL && SEE('['));
1606 3 : bracket(v, left, right);
1607 3 : assert((v->savenow != NULL && SEE(']')) || ISERR());
1608 3 : NEXT();
1609 3 : NOERR();
1610 3 : v->wordchrs = left;
1611 : }
1612 :
1613 : /*
1614 : * processlacon - generate the NFA representation of a LACON
1615 : *
1616 : * In the general case this is just newlacon() + newarc(), but some cases
1617 : * can be optimized.
1618 : */
1619 : static void
1620 25 : processlacon(struct vars *v,
1621 : struct state *begin, /* start of parsed LACON sub-re */
1622 : struct state *end, /* end of parsed LACON sub-re */
1623 : int latype,
1624 : struct state *lp, /* left state to hang it on */
1625 : struct state *rp) /* right state to hang it on */
1626 : {
1627 : struct state *s1;
1628 : int n;
1629 :
1630 : /*
1631 : * Check for lookaround RE consisting of a single plain color arc (or set
1632 : * of arcs); this would typically be a simple chr or a bracket expression.
1633 : */
1634 25 : s1 = single_color_transition(begin, end);
1635 25 : switch (latype)
1636 : {
1637 : case LATYPE_AHEAD_POS:
1638 : /* If lookahead RE is just colorset C, convert to AHEAD(C) */
1639 6 : if (s1 != NULL)
1640 : {
1641 5 : cloneouts(v->nfa, s1, lp, rp, AHEAD);
1642 5 : return;
1643 : }
1644 1 : break;
1645 : case LATYPE_AHEAD_NEG:
1646 : /* If lookahead RE is just colorset C, convert to AHEAD(^C)|$ */
1647 6 : if (s1 != NULL)
1648 : {
1649 2 : colorcomplement(v->nfa, v->cm, AHEAD, s1, lp, rp);
1650 2 : newarc(v->nfa, '$', 1, lp, rp);
1651 2 : newarc(v->nfa, '$', 0, lp, rp);
1652 2 : return;
1653 : }
1654 4 : break;
1655 : case LATYPE_BEHIND_POS:
1656 : /* If lookbehind RE is just colorset C, convert to BEHIND(C) */
1657 9 : if (s1 != NULL)
1658 : {
1659 7 : cloneouts(v->nfa, s1, lp, rp, BEHIND);
1660 7 : return;
1661 : }
1662 2 : break;
1663 : case LATYPE_BEHIND_NEG:
1664 : /* If lookbehind RE is just colorset C, convert to BEHIND(^C)|^ */
1665 4 : if (s1 != NULL)
1666 : {
1667 4 : colorcomplement(v->nfa, v->cm, BEHIND, s1, lp, rp);
1668 4 : newarc(v->nfa, '^', 1, lp, rp);
1669 4 : newarc(v->nfa, '^', 0, lp, rp);
1670 4 : return;
1671 : }
1672 0 : break;
1673 : default:
1674 0 : assert(NOTREACHED);
1675 : }
1676 :
1677 : /* General case: we need a LACON subre and arc */
1678 7 : n = newlacon(v, begin, end, latype);
1679 7 : newarc(v->nfa, LACON, n, lp, rp);
1680 : }
1681 :
1682 : /*
1683 : * subre - allocate a subre
1684 : */
1685 : static struct subre *
1686 2085 : subre(struct vars *v,
1687 : int op,
1688 : int flags,
1689 : struct state *begin,
1690 : struct state *end)
1691 : {
1692 2085 : struct subre *ret = v->treefree;
1693 :
1694 : /*
1695 : * Checking for stack overflow here is sufficient to protect parse() and
1696 : * its recursive subroutines.
1697 : */
1698 2085 : if (STACK_TOO_DEEP(v->re))
1699 : {
1700 0 : ERR(REG_ETOOBIG);
1701 0 : return NULL;
1702 : }
1703 :
1704 2085 : if (ret != NULL)
1705 238 : v->treefree = ret->left;
1706 : else
1707 : {
1708 1847 : ret = (struct subre *) MALLOC(sizeof(struct subre));
1709 1847 : if (ret == NULL)
1710 : {
1711 0 : ERR(REG_ESPACE);
1712 0 : return NULL;
1713 : }
1714 1847 : ret->chain = v->treechain;
1715 1847 : v->treechain = ret;
1716 : }
1717 :
1718 2085 : assert(strchr("=b|.*(", op) != NULL);
1719 :
1720 2085 : ret->op = op;
1721 2085 : ret->flags = flags;
1722 2085 : ret->id = 0; /* will be assigned later */
1723 2085 : ret->subno = 0;
1724 2085 : ret->min = ret->max = 1;
1725 2085 : ret->left = NULL;
1726 2085 : ret->right = NULL;
1727 2085 : ret->begin = begin;
1728 2085 : ret->end = end;
1729 2085 : ZAPCNFA(ret->cnfa);
1730 :
1731 2085 : return ret;
1732 : }
1733 :
1734 : /*
1735 : * freesubre - free a subRE subtree
1736 : */
1737 : static void
1738 652 : freesubre(struct vars *v, /* might be NULL */
1739 : struct subre *sr)
1740 : {
1741 652 : if (sr == NULL)
1742 654 : return;
1743 :
1744 650 : if (sr->left != NULL)
1745 63 : freesubre(v, sr->left);
1746 650 : if (sr->right != NULL)
1747 39 : freesubre(v, sr->right);
1748 :
1749 650 : freesrnode(v, sr);
1750 : }
1751 :
1752 : /*
1753 : * freesrnode - free one node in a subRE subtree
1754 : */
1755 : static void
1756 650 : freesrnode(struct vars *v, /* might be NULL */
1757 : struct subre *sr)
1758 : {
1759 650 : if (sr == NULL)
1760 650 : return;
1761 :
1762 650 : if (!NULLCNFA(sr->cnfa))
1763 121 : freecnfa(&sr->cnfa);
1764 650 : sr->flags = 0;
1765 :
1766 650 : if (v != NULL && v->treechain != NULL)
1767 : {
1768 : /* we're still parsing, maybe we can reuse the subre */
1769 528 : sr->left = v->treefree;
1770 528 : v->treefree = sr;
1771 : }
1772 : else
1773 122 : FREE(sr);
1774 : }
1775 :
1776 : /*
1777 : * optst - optimize a subRE subtree
1778 : */
1779 : static void
1780 248 : optst(struct vars *v,
1781 : struct subre *t)
1782 : {
1783 : /*
1784 : * DGP (2007-11-13): I assume it was the programmer's intent to eventually
1785 : * come back and add code to optimize subRE trees, but the routine coded
1786 : * just spends effort traversing the tree and doing nothing. We can do
1787 : * nothing with less effort.
1788 : */
1789 248 : return;
1790 : }
1791 :
1792 : /*
1793 : * numst - number tree nodes (assigning "id" indexes)
1794 : */
1795 : static int /* next number */
1796 1525 : numst(struct subre *t,
1797 : int start) /* starting point for subtree numbers */
1798 : {
1799 : int i;
1800 :
1801 1525 : assert(t != NULL);
1802 :
1803 1525 : i = start;
1804 1525 : t->id = (short) i++;
1805 1525 : if (t->left != NULL)
1806 765 : i = numst(t->left, i);
1807 1525 : if (t->right != NULL)
1808 512 : i = numst(t->right, i);
1809 1525 : return i;
1810 : }
1811 :
1812 : /*
1813 : * markst - mark tree nodes as INUSE
1814 : *
1815 : * Note: this is a great deal more subtle than it looks. During initial
1816 : * parsing of a regex, all subres are linked into the treechain list;
1817 : * discarded ones are also linked into the treefree list for possible reuse.
1818 : * After we are done creating all subres required for a regex, we run markst()
1819 : * then cleanst(), which results in discarding all subres not reachable from
1820 : * v->tree. We then clear v->treechain, indicating that subres must be found
1821 : * by descending from v->tree. This changes the behavior of freesubre(): it
1822 : * will henceforth FREE() unwanted subres rather than sticking them into the
1823 : * treefree list. (Doing that any earlier would result in dangling links in
1824 : * the treechain list.) This all means that freev() will clean up correctly
1825 : * if invoked before or after markst()+cleanst(); but it would not work if
1826 : * called partway through this state conversion, so we mustn't error out
1827 : * in or between these two functions.
1828 : */
1829 : static void
1830 1525 : markst(struct subre *t)
1831 : {
1832 1525 : assert(t != NULL);
1833 :
1834 1525 : t->flags |= INUSE;
1835 1525 : if (t->left != NULL)
1836 765 : markst(t->left);
1837 1525 : if (t->right != NULL)
1838 512 : markst(t->right);
1839 1525 : }
1840 :
1841 : /*
1842 : * cleanst - free any tree nodes not marked INUSE
1843 : */
1844 : static void
1845 252 : cleanst(struct vars *v)
1846 : {
1847 : struct subre *t;
1848 : struct subre *next;
1849 :
1850 2099 : for (t = v->treechain; t != NULL; t = next)
1851 : {
1852 1847 : next = t->chain;
1853 1847 : if (!(t->flags & INUSE))
1854 322 : FREE(t);
1855 : }
1856 252 : v->treechain = NULL;
1857 252 : v->treefree = NULL; /* just on general principles */
1858 252 : }
1859 :
1860 : /*
1861 : * nfatree - turn a subRE subtree into a tree of compacted NFAs
1862 : */
1863 : static long /* optimize results from top node */
1864 1525 : nfatree(struct vars *v,
1865 : struct subre *t,
1866 : FILE *f) /* for debug output */
1867 : {
1868 1525 : assert(t != NULL && t->begin != NULL);
1869 :
1870 1525 : if (t->left != NULL)
1871 765 : (DISCARD) nfatree(v, t->left, f);
1872 1525 : if (t->right != NULL)
1873 512 : (DISCARD) nfatree(v, t->right, f);
1874 :
1875 1525 : return nfanode(v, t, 0, f);
1876 : }
1877 :
1878 : /*
1879 : * nfanode - do one NFA for nfatree or lacons
1880 : *
1881 : * If converttosearch is true, apply makesearch() to the NFA.
1882 : */
1883 : static long /* optimize results */
1884 1532 : nfanode(struct vars *v,
1885 : struct subre *t,
1886 : int converttosearch,
1887 : FILE *f) /* for debug output */
1888 : {
1889 : struct nfa *nfa;
1890 1532 : long ret = 0;
1891 :
1892 1532 : assert(t->begin != NULL);
1893 :
1894 : #ifdef REG_DEBUG
1895 : if (f != NULL)
1896 : {
1897 : char idbuf[50];
1898 :
1899 : fprintf(f, "\n\n\n========= TREE NODE %s ==========\n",
1900 : stid(t, idbuf, sizeof(idbuf)));
1901 : }
1902 : #endif
1903 1532 : nfa = newnfa(v, v->cm, v->nfa);
1904 1532 : NOERRZ();
1905 1532 : dupnfa(nfa, t->begin, t->end, nfa->init, nfa->final);
1906 1532 : if (!ISERR())
1907 1532 : specialcolors(nfa);
1908 1532 : if (!ISERR())
1909 1532 : ret = optimize(nfa, f);
1910 1532 : if (converttosearch && !ISERR())
1911 2 : makesearch(v, nfa);
1912 1532 : if (!ISERR())
1913 1531 : compact(nfa, &t->cnfa);
1914 :
1915 1532 : freenfa(nfa);
1916 1532 : return ret;
1917 : }
1918 :
1919 : /*
1920 : * newlacon - allocate a lookaround-constraint subRE
1921 : */
1922 : static int /* lacon number */
1923 7 : newlacon(struct vars *v,
1924 : struct state *begin,
1925 : struct state *end,
1926 : int latype)
1927 : {
1928 : int n;
1929 : struct subre *newlacons;
1930 : struct subre *sub;
1931 :
1932 7 : if (v->nlacons == 0)
1933 : {
1934 5 : n = 1; /* skip 0th */
1935 5 : newlacons = (struct subre *) MALLOC(2 * sizeof(struct subre));
1936 : }
1937 : else
1938 : {
1939 2 : n = v->nlacons;
1940 2 : newlacons = (struct subre *) REALLOC(v->lacons,
1941 : (n + 1) * sizeof(struct subre));
1942 : }
1943 7 : if (newlacons == NULL)
1944 : {
1945 0 : ERR(REG_ESPACE);
1946 0 : return 0;
1947 : }
1948 7 : v->lacons = newlacons;
1949 7 : v->nlacons = n + 1;
1950 7 : sub = &v->lacons[n];
1951 7 : sub->begin = begin;
1952 7 : sub->end = end;
1953 7 : sub->subno = latype;
1954 7 : ZAPCNFA(sub->cnfa);
1955 7 : return n;
1956 : }
1957 :
1958 : /*
1959 : * freelacons - free lookaround-constraint subRE vector
1960 : */
1961 : static void
1962 2 : freelacons(struct subre *subs,
1963 : int n)
1964 : {
1965 : struct subre *sub;
1966 : int i;
1967 :
1968 2 : assert(n > 0);
1969 4 : for (sub = subs + 1, i = n - 1; i > 0; sub++, i--) /* no 0th */
1970 2 : if (!NULLCNFA(sub->cnfa))
1971 2 : freecnfa(&sub->cnfa);
1972 2 : FREE(subs);
1973 2 : }
1974 :
1975 : /*
1976 : * rfree - free a whole RE (insides of regfree)
1977 : */
1978 : static void
1979 32 : rfree(regex_t *re)
1980 : {
1981 : struct guts *g;
1982 :
1983 32 : if (re == NULL || re->re_magic != REMAGIC)
1984 32 : return;
1985 :
1986 32 : re->re_magic = 0; /* invalidate RE */
1987 32 : g = (struct guts *) re->re_guts;
1988 32 : re->re_guts = NULL;
1989 32 : re->re_fns = NULL;
1990 32 : if (g != NULL)
1991 : {
1992 32 : g->magic = 0;
1993 32 : freecm(&g->cmap);
1994 32 : if (g->tree != NULL)
1995 26 : freesubre((struct vars *) NULL, g->tree);
1996 32 : if (g->lacons != NULL)
1997 2 : freelacons(g->lacons, g->nlacons);
1998 32 : if (!NULLCNFA(g->search))
1999 26 : freecnfa(&g->search);
2000 32 : FREE(g);
2001 : }
2002 : }
2003 :
2004 : /*
2005 : * rcancelrequested - check for external request to cancel regex operation
2006 : *
2007 : * Return nonzero to fail the operation with error code REG_CANCEL,
2008 : * zero to keep going
2009 : *
2010 : * The current implementation is Postgres-specific. If we ever get around
2011 : * to splitting the regex code out as a standalone library, there will need
2012 : * to be some API to let applications define a callback function for this.
2013 : */
2014 : static int
2015 475977 : rcancelrequested(void)
2016 : {
2017 475977 : return InterruptPending && (QueryCancelPending || ProcDiePending);
2018 : }
2019 :
2020 : /*
2021 : * rstacktoodeep - check for stack getting dangerously deep
2022 : *
2023 : * Return nonzero to fail the operation with error code REG_ETOOBIG,
2024 : * zero to keep going
2025 : *
2026 : * The current implementation is Postgres-specific. If we ever get around
2027 : * to splitting the regex code out as a standalone library, there will need
2028 : * to be some API to let applications define a callback function for this.
2029 : */
2030 : static int
2031 2662329 : rstacktoodeep(void)
2032 : {
2033 2662329 : return stack_is_too_deep();
2034 : }
2035 :
2036 : #ifdef REG_DEBUG
2037 :
2038 : /*
2039 : * dump - dump an RE in human-readable form
2040 : */
2041 : static void
2042 : dump(regex_t *re,
2043 : FILE *f)
2044 : {
2045 : struct guts *g;
2046 : int i;
2047 :
2048 : if (re->re_magic != REMAGIC)
2049 : fprintf(f, "bad magic number (0x%x not 0x%x)\n", re->re_magic,
2050 : REMAGIC);
2051 : if (re->re_guts == NULL)
2052 : {
2053 : fprintf(f, "NULL guts!!!\n");
2054 : return;
2055 : }
2056 : g = (struct guts *) re->re_guts;
2057 : if (g->magic != GUTSMAGIC)
2058 : fprintf(f, "bad guts magic number (0x%x not 0x%x)\n", g->magic,
2059 : GUTSMAGIC);
2060 :
2061 : fprintf(f, "\n\n\n========= DUMP ==========\n");
2062 : fprintf(f, "nsub %d, info 0%lo, csize %d, ntree %d\n",
2063 : (int) re->re_nsub, re->re_info, re->re_csize, g->ntree);
2064 :
2065 : dumpcolors(&g->cmap, f);
2066 : if (!NULLCNFA(g->search))
2067 : {
2068 : fprintf(f, "\nsearch:\n");
2069 : dumpcnfa(&g->search, f);
2070 : }
2071 : for (i = 1; i < g->nlacons; i++)
2072 : {
2073 : struct subre *lasub = &g->lacons[i];
2074 : const char *latype;
2075 :
2076 : switch (lasub->subno)
2077 : {
2078 : case LATYPE_AHEAD_POS:
2079 : latype = "positive lookahead";
2080 : break;
2081 : case LATYPE_AHEAD_NEG:
2082 : latype = "negative lookahead";
2083 : break;
2084 : case LATYPE_BEHIND_POS:
2085 : latype = "positive lookbehind";
2086 : break;
2087 : case LATYPE_BEHIND_NEG:
2088 : latype = "negative lookbehind";
2089 : break;
2090 : default:
2091 : latype = "???";
2092 : break;
2093 : }
2094 : fprintf(f, "\nla%d (%s):\n", i, latype);
2095 : dumpcnfa(&lasub->cnfa, f);
2096 : }
2097 : fprintf(f, "\n");
2098 : dumpst(g->tree, f, 0);
2099 : }
2100 :
2101 : /*
2102 : * dumpst - dump a subRE tree
2103 : */
2104 : static void
2105 : dumpst(struct subre *t,
2106 : FILE *f,
2107 : int nfapresent) /* is the original NFA still around? */
2108 : {
2109 : if (t == NULL)
2110 : fprintf(f, "null tree\n");
2111 : else
2112 : stdump(t, f, nfapresent);
2113 : fflush(f);
2114 : }
2115 :
2116 : /*
2117 : * stdump - recursive guts of dumpst
2118 : */
2119 : static void
2120 : stdump(struct subre *t,
2121 : FILE *f,
2122 : int nfapresent) /* is the original NFA still around? */
2123 : {
2124 : char idbuf[50];
2125 :
2126 : fprintf(f, "%s. `%c'", stid(t, idbuf, sizeof(idbuf)), t->op);
2127 : if (t->flags & LONGER)
2128 : fprintf(f, " longest");
2129 : if (t->flags & SHORTER)
2130 : fprintf(f, " shortest");
2131 : if (t->flags & MIXED)
2132 : fprintf(f, " hasmixed");
2133 : if (t->flags & CAP)
2134 : fprintf(f, " hascapture");
2135 : if (t->flags & BACKR)
2136 : fprintf(f, " hasbackref");
2137 : if (!(t->flags & INUSE))
2138 : fprintf(f, " UNUSED");
2139 : if (t->subno != 0)
2140 : fprintf(f, " (#%d)", t->subno);
2141 : if (t->min != 1 || t->max != 1)
2142 : {
2143 : fprintf(f, " {%d,", t->min);
2144 : if (t->max != DUPINF)
2145 : fprintf(f, "%d", t->max);
2146 : fprintf(f, "}");
2147 : }
2148 : if (nfapresent)
2149 : fprintf(f, " %ld-%ld", (long) t->begin->no, (long) t->end->no);
2150 : if (t->left != NULL)
2151 : fprintf(f, " L:%s", stid(t->left, idbuf, sizeof(idbuf)));
2152 : if (t->right != NULL)
2153 : fprintf(f, " R:%s", stid(t->right, idbuf, sizeof(idbuf)));
2154 : if (!NULLCNFA(t->cnfa))
2155 : {
2156 : fprintf(f, "\n");
2157 : dumpcnfa(&t->cnfa, f);
2158 : }
2159 : fprintf(f, "\n");
2160 : if (t->left != NULL)
2161 : stdump(t->left, f, nfapresent);
2162 : if (t->right != NULL)
2163 : stdump(t->right, f, nfapresent);
2164 : }
2165 :
2166 : /*
2167 : * stid - identify a subtree node for dumping
2168 : */
2169 : static const char * /* points to buf or constant string */
2170 : stid(struct subre *t,
2171 : char *buf,
2172 : size_t bufsize)
2173 : {
2174 : /* big enough for hex int or decimal t->id? */
2175 : if (bufsize < sizeof(void *) * 2 + 3 || bufsize < sizeof(t->id) * 3 + 1)
2176 : return "unable";
2177 : if (t->id != 0)
2178 : sprintf(buf, "%d", t->id);
2179 : else
2180 : sprintf(buf, "%p", t);
2181 : return buf;
2182 : }
2183 : #endif /* REG_DEBUG */
2184 :
2185 :
2186 : #include "regc_lex.c"
2187 : #include "regc_color.c"
2188 : #include "regc_nfa.c"
2189 : #include "regc_cvec.c"
2190 : #include "regc_pg_locale.c"
2191 : #include "regc_locale.c"
|