Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regprefix.c
4 : * Extract a common prefix, if any, from a compiled regex.
5 : *
6 : *
7 : * Portions Copyright (c) 2012-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1998, 1999 Henry Spencer
9 : *
10 : * IDENTIFICATION
11 : * src/backend/regex/regprefix.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 :
16 : #include "regex/regguts.h"
17 :
18 :
19 : /*
20 : * forward declarations
21 : */
22 : static int findprefix(struct cnfa *cnfa, struct colormap *cm,
23 : chr *string, size_t *slength);
24 :
25 :
26 : /*
27 : * pg_regprefix - get common prefix for regular expression
28 : *
29 : * Returns one of:
30 : * REG_NOMATCH: there is no common prefix of strings matching the regex
31 : * REG_PREFIX: there is a common prefix of strings matching the regex
32 : * REG_EXACT: all strings satisfying the regex must match the same string
33 : * or a REG_XXX error code
34 : *
35 : * In the non-failure cases, *string is set to a malloc'd string containing
36 : * the common prefix or exact value, of length *slength (measured in chrs
37 : * not bytes!).
38 : *
39 : * This function does not analyze all complex cases (such as lookaround
40 : * constraints) exactly. Therefore it is possible that some strings matching
41 : * the reported prefix or exact-match string do not satisfy the regex. But
42 : * it should never be the case that a string satisfying the regex does not
43 : * match the reported prefix or exact-match string.
44 : */
45 : int
46 1256 : pg_regprefix(regex_t *re,
47 : chr **string,
48 : size_t *slength)
49 : {
50 : struct guts *g;
51 : struct cnfa *cnfa;
52 : int st;
53 :
54 : /* sanity checks */
55 1256 : if (string == NULL || slength == NULL)
56 0 : return REG_INVARG;
57 1256 : *string = NULL; /* initialize for failure cases */
58 1256 : *slength = 0;
59 1256 : if (re == NULL || re->re_magic != REMAGIC)
60 0 : return REG_INVARG;
61 1256 : if (re->re_csize != sizeof(chr))
62 0 : return REG_MIXED;
63 :
64 : /* Initialize locale-dependent support */
65 1256 : pg_set_regex_collation(re->re_collation);
66 :
67 : /* setup */
68 1256 : g = (struct guts *) re->re_guts;
69 1256 : if (g->info & REG_UIMPOSSIBLE)
70 0 : return REG_NOMATCH;
71 :
72 : /*
73 : * This implementation considers only the search NFA for the topmost regex
74 : * tree node. Therefore, constraints such as backrefs are not fully
75 : * applied, which is allowed per the function's API spec.
76 : */
77 1256 : assert(g->tree != NULL);
78 1256 : cnfa = &g->tree->cnfa;
79 :
80 : /*
81 : * Since a correct NFA should never contain any exit-free loops, it should
82 : * not be possible for our traversal to return to a previously visited NFA
83 : * state. Hence we need at most nstates chrs in the output string.
84 : */
85 1256 : *string = (chr *) MALLOC(cnfa->nstates * sizeof(chr));
86 1256 : if (*string == NULL)
87 0 : return REG_ESPACE;
88 :
89 : /* do it */
90 1256 : st = findprefix(cnfa, &g->cmap, *string, slength);
91 :
92 1256 : assert(*slength <= cnfa->nstates);
93 :
94 : /* clean up */
95 1256 : if (st != REG_PREFIX && st != REG_EXACT)
96 : {
97 46 : FREE(*string);
98 46 : *string = NULL;
99 46 : *slength = 0;
100 : }
101 :
102 1256 : return st;
103 : }
104 :
105 : /*
106 : * findprefix - extract common prefix from cNFA
107 : *
108 : * Results are returned into the preallocated chr array string[], with
109 : * *slength (which must be preset to zero) incremented for each chr.
110 : */
111 : static int /* regprefix return code */
112 1256 : findprefix(struct cnfa *cnfa,
113 : struct colormap *cm,
114 : chr *string,
115 : size_t *slength)
116 : {
117 : int st;
118 : int nextst;
119 : color thiscolor;
120 : chr c;
121 : struct carc *ca;
122 :
123 : /*
124 : * The "pre" state must have only BOS/BOL outarcs, else pattern isn't
125 : * anchored left. If we have both BOS and BOL, they must go to the same
126 : * next state.
127 : */
128 1256 : st = cnfa->pre;
129 1256 : nextst = -1;
130 2471 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
131 : {
132 1258 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
133 : {
134 2432 : if (nextst == -1)
135 1215 : nextst = ca->to;
136 2 : else if (nextst != ca->to)
137 2 : return REG_NOMATCH;
138 : }
139 : else
140 41 : return REG_NOMATCH;
141 : }
142 1213 : if (nextst == -1)
143 0 : return REG_NOMATCH;
144 :
145 : /*
146 : * Scan through successive states, stopping as soon as we find one with
147 : * more than one acceptable transition character (either multiple colors
148 : * on out-arcs, or a color with more than one member chr).
149 : *
150 : * We could find a state with multiple out-arcs that are all labeled with
151 : * the same singleton color; this comes from patterns like "^ab(cde|cxy)".
152 : * In that case we add the chr "c" to the output string but then exit the
153 : * loop with nextst == -1. This leaves a little bit on the table: if the
154 : * pattern is like "^ab(cde|cdy)", we won't notice that "d" could be added
155 : * to the prefix. But chasing multiple parallel state chains doesn't seem
156 : * worth the trouble.
157 : */
158 : do
159 : {
160 13865 : st = nextst;
161 13865 : nextst = -1;
162 13865 : thiscolor = COLORLESS;
163 26622 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
164 : {
165 : /* We can ignore BOS/BOL arcs */
166 13955 : if (ca->co == cnfa->bos[0] || ca->co == cnfa->bos[1])
167 0 : continue;
168 : /* ... but EOS/EOL arcs terminate the search, as do LACONs */
169 26798 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1] ||
170 12843 : ca->co >= cnfa->ncolors)
171 : {
172 1116 : thiscolor = COLORLESS;
173 1116 : break;
174 : }
175 12839 : if (thiscolor == COLORLESS)
176 : {
177 : /* First plain outarc */
178 12753 : thiscolor = ca->co;
179 12753 : nextst = ca->to;
180 : }
181 86 : else if (thiscolor == ca->co)
182 : {
183 : /* Another plain outarc for same color */
184 4 : nextst = -1;
185 : }
186 : else
187 : {
188 : /* More than one plain outarc color terminates the search */
189 82 : thiscolor = COLORLESS;
190 82 : break;
191 : }
192 : }
193 : /* Done if we didn't find exactly one color on plain outarcs */
194 13865 : if (thiscolor == COLORLESS)
195 1198 : break;
196 : /* The color must be a singleton */
197 12667 : if (cm->cd[thiscolor].nschrs != 1)
198 11 : break;
199 : /* Must not have any high-color-map entries */
200 12656 : if (cm->cd[thiscolor].nuchrs != 0)
201 0 : break;
202 :
203 : /*
204 : * Identify the color's sole member chr and add it to the prefix
205 : * string. In general the colormap data structure doesn't provide a
206 : * way to find color member chrs, except by trying GETCOLOR() on each
207 : * possible chr value, which won't do at all. However, for the cases
208 : * we care about it should be sufficient to test the "firstchr" value,
209 : * that is the first chr ever added to the color. There are cases
210 : * where this might no longer be a member of the color (so we do need
211 : * to test), but none of them are likely to arise for a character that
212 : * is a member of a common prefix. If we do hit such a corner case,
213 : * we just fall out without adding anything to the prefix string.
214 : */
215 12656 : c = cm->cd[thiscolor].firstchr;
216 12656 : if (GETCOLOR(cm, c) != thiscolor)
217 0 : break;
218 :
219 12656 : string[(*slength)++] = c;
220 :
221 : /* Advance to next state, but only if we have a unique next state */
222 12656 : } while (nextst != -1);
223 :
224 : /*
225 : * If we ended at a state that only has EOS/EOL outarcs leading to the
226 : * "post" state, then we have an exact-match string. Note this is true
227 : * even if the string is of zero length.
228 : */
229 1213 : nextst = -1;
230 2325 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
231 : {
232 1213 : if (ca->co == cnfa->eos[0] || ca->co == cnfa->eos[1])
233 : {
234 2224 : if (nextst == -1)
235 1112 : nextst = ca->to;
236 0 : else if (nextst != ca->to)
237 : {
238 0 : nextst = -1;
239 0 : break;
240 : }
241 : }
242 : else
243 : {
244 101 : nextst = -1;
245 101 : break;
246 : }
247 : }
248 1213 : if (nextst == cnfa->post)
249 1112 : return REG_EXACT;
250 :
251 : /*
252 : * Otherwise, if we were unable to identify any prefix characters, say
253 : * NOMATCH --- the pattern is anchored left, but doesn't specify any
254 : * particular first character.
255 : */
256 101 : if (*slength > 0)
257 98 : return REG_PREFIX;
258 :
259 3 : return REG_NOMATCH;
260 : }
|