Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regexport.c
4 : * Functions for exporting info about a regex's NFA
5 : *
6 : * In this implementation, the NFA defines a necessary but not sufficient
7 : * condition for a string to match the regex: that is, there can be strings
8 : * that match the NFA but don't match the full regex, but not vice versa.
9 : * Thus, for example, it is okay for the functions below to treat lookaround
10 : * constraints as no-ops, since they merely constrain the string some more.
11 : *
12 : * Notice that these functions return info into caller-provided arrays
13 : * rather than doing their own malloc's. This simplifies the APIs by
14 : * eliminating a class of error conditions, and in the case of colors
15 : * allows the caller to decide how big is too big to bother with.
16 : *
17 : *
18 : * Portions Copyright (c) 2013-2017, PostgreSQL Global Development Group
19 : * Portions Copyright (c) 1998, 1999 Henry Spencer
20 : *
21 : * IDENTIFICATION
22 : * src/backend/regex/regexport.c
23 : *
24 : *-------------------------------------------------------------------------
25 : */
26 :
27 : #include "regex/regguts.h"
28 :
29 : #include "regex/regexport.h"
30 :
31 :
32 : /*
33 : * Get total number of NFA states.
34 : */
35 : int
36 0 : pg_reg_getnumstates(const regex_t *regex)
37 : {
38 : struct cnfa *cnfa;
39 :
40 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
41 0 : cnfa = &((struct guts *) regex->re_guts)->search;
42 :
43 0 : return cnfa->nstates;
44 : }
45 :
46 : /*
47 : * Get initial state of NFA.
48 : */
49 : int
50 0 : pg_reg_getinitialstate(const regex_t *regex)
51 : {
52 : struct cnfa *cnfa;
53 :
54 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
55 0 : cnfa = &((struct guts *) regex->re_guts)->search;
56 :
57 0 : return cnfa->pre;
58 : }
59 :
60 : /*
61 : * Get final state of NFA.
62 : */
63 : int
64 0 : pg_reg_getfinalstate(const regex_t *regex)
65 : {
66 : struct cnfa *cnfa;
67 :
68 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
69 0 : cnfa = &((struct guts *) regex->re_guts)->search;
70 :
71 0 : return cnfa->post;
72 : }
73 :
74 : /*
75 : * pg_reg_getnumoutarcs() and pg_reg_getoutarcs() mask the existence of LACON
76 : * arcs from the caller, treating any LACON as being automatically satisfied.
77 : * Since the output representation does not support arcs that consume no
78 : * character when traversed, we have to recursively traverse LACON arcs here,
79 : * and report whatever normal arcs are reachable by traversing LACON arcs.
80 : * Note that this wouldn't work if it were possible to reach the final state
81 : * via LACON traversal, but the regex library never builds NFAs that have
82 : * LACON arcs leading directly to the final state. (This is because the
83 : * regex executor is designed to consume one character beyond the nominal
84 : * match end --- possibly an EOS indicator --- so there is always a set of
85 : * ordinary arcs leading to the final state.)
86 : *
87 : * traverse_lacons is a recursive subroutine used by both exported functions
88 : * to count and then emit the reachable regular arcs. *arcs_count is
89 : * incremented by the number of reachable arcs, and as many as will fit in
90 : * arcs_len (possibly 0) are emitted into arcs[].
91 : */
92 : static void
93 0 : traverse_lacons(struct cnfa *cnfa, int st,
94 : int *arcs_count,
95 : regex_arc_t *arcs, int arcs_len)
96 : {
97 : struct carc *ca;
98 :
99 : /*
100 : * Since this function recurses, it could theoretically be driven to stack
101 : * overflow. In practice, this is mostly useful to backstop against a
102 : * failure of the regex compiler to remove a loop of LACON arcs.
103 : */
104 0 : check_stack_depth();
105 :
106 0 : for (ca = cnfa->states[st]; ca->co != COLORLESS; ca++)
107 : {
108 0 : if (ca->co < cnfa->ncolors)
109 : {
110 : /* Ordinary arc, so count and possibly emit it */
111 0 : int ndx = (*arcs_count)++;
112 :
113 0 : if (ndx < arcs_len)
114 : {
115 0 : arcs[ndx].co = ca->co;
116 0 : arcs[ndx].to = ca->to;
117 : }
118 : }
119 : else
120 : {
121 : /* LACON arc --- assume it's satisfied and recurse... */
122 : /* ... but first, assert it doesn't lead directly to post state */
123 0 : Assert(ca->to != cnfa->post);
124 :
125 0 : traverse_lacons(cnfa, ca->to, arcs_count, arcs, arcs_len);
126 : }
127 : }
128 0 : }
129 :
130 : /*
131 : * Get number of outgoing NFA arcs of state number "st".
132 : */
133 : int
134 0 : pg_reg_getnumoutarcs(const regex_t *regex, int st)
135 : {
136 : struct cnfa *cnfa;
137 : int arcs_count;
138 :
139 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
140 0 : cnfa = &((struct guts *) regex->re_guts)->search;
141 :
142 0 : if (st < 0 || st >= cnfa->nstates)
143 0 : return 0;
144 0 : arcs_count = 0;
145 0 : traverse_lacons(cnfa, st, &arcs_count, NULL, 0);
146 0 : return arcs_count;
147 : }
148 :
149 : /*
150 : * Write array of outgoing NFA arcs of state number "st" into arcs[],
151 : * whose length arcs_len must be at least as long as indicated by
152 : * pg_reg_getnumoutarcs(), else not all arcs will be returned.
153 : */
154 : void
155 0 : pg_reg_getoutarcs(const regex_t *regex, int st,
156 : regex_arc_t *arcs, int arcs_len)
157 : {
158 : struct cnfa *cnfa;
159 : int arcs_count;
160 :
161 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
162 0 : cnfa = &((struct guts *) regex->re_guts)->search;
163 :
164 0 : if (st < 0 || st >= cnfa->nstates || arcs_len <= 0)
165 0 : return;
166 0 : arcs_count = 0;
167 0 : traverse_lacons(cnfa, st, &arcs_count, arcs, arcs_len);
168 : }
169 :
170 : /*
171 : * Get total number of colors.
172 : */
173 : int
174 0 : pg_reg_getnumcolors(const regex_t *regex)
175 : {
176 : struct colormap *cm;
177 :
178 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
179 0 : cm = &((struct guts *) regex->re_guts)->cmap;
180 :
181 0 : return cm->max + 1;
182 : }
183 :
184 : /*
185 : * Check if color is beginning of line/string.
186 : *
187 : * (We might at some point need to offer more refined handling of pseudocolors,
188 : * but this will do for now.)
189 : */
190 : int
191 0 : pg_reg_colorisbegin(const regex_t *regex, int co)
192 : {
193 : struct cnfa *cnfa;
194 :
195 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
196 0 : cnfa = &((struct guts *) regex->re_guts)->search;
197 :
198 0 : if (co == cnfa->bos[0] || co == cnfa->bos[1])
199 0 : return true;
200 : else
201 0 : return false;
202 : }
203 :
204 : /*
205 : * Check if color is end of line/string.
206 : */
207 : int
208 0 : pg_reg_colorisend(const regex_t *regex, int co)
209 : {
210 : struct cnfa *cnfa;
211 :
212 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
213 0 : cnfa = &((struct guts *) regex->re_guts)->search;
214 :
215 0 : if (co == cnfa->eos[0] || co == cnfa->eos[1])
216 0 : return true;
217 : else
218 0 : return false;
219 : }
220 :
221 : /*
222 : * Get number of member chrs of color number "co".
223 : *
224 : * Note: we return -1 if the color number is invalid, or if it is a special
225 : * color (WHITE or a pseudocolor), or if the number of members is uncertain.
226 : * Callers should not try to extract the members if -1 is returned.
227 : */
228 : int
229 0 : pg_reg_getnumcharacters(const regex_t *regex, int co)
230 : {
231 : struct colormap *cm;
232 :
233 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
234 0 : cm = &((struct guts *) regex->re_guts)->cmap;
235 :
236 0 : if (co <= 0 || co > cm->max) /* we reject 0 which is WHITE */
237 0 : return -1;
238 0 : if (cm->cd[co].flags & PSEUDO) /* also pseudocolors (BOS etc) */
239 0 : return -1;
240 :
241 : /*
242 : * If the color appears anywhere in the high colormap, treat its number of
243 : * members as uncertain. In principle we could determine all the specific
244 : * chrs corresponding to each such entry, but it would be expensive
245 : * (particularly if character class tests are required) and it doesn't
246 : * seem worth it.
247 : */
248 0 : if (cm->cd[co].nuchrs != 0)
249 0 : return -1;
250 :
251 : /* OK, return the known number of member chrs */
252 0 : return cm->cd[co].nschrs;
253 : }
254 :
255 : /*
256 : * Write array of member chrs of color number "co" into chars[],
257 : * whose length chars_len must be at least as long as indicated by
258 : * pg_reg_getnumcharacters(), else not all chars will be returned.
259 : *
260 : * Fetching the members of WHITE or a pseudocolor is not supported.
261 : *
262 : * Caution: this is a relatively expensive operation.
263 : */
264 : void
265 0 : pg_reg_getcharacters(const regex_t *regex, int co,
266 : pg_wchar *chars, int chars_len)
267 : {
268 : struct colormap *cm;
269 : chr c;
270 :
271 0 : assert(regex != NULL && regex->re_magic == REMAGIC);
272 0 : cm = &((struct guts *) regex->re_guts)->cmap;
273 :
274 0 : if (co <= 0 || co > cm->max || chars_len <= 0)
275 0 : return;
276 0 : if (cm->cd[co].flags & PSEUDO)
277 0 : return;
278 :
279 : /*
280 : * We need only examine the low character map; there should not be any
281 : * matching entries in the high map.
282 : */
283 0 : for (c = CHR_MIN; c <= MAX_SIMPLE_CHR; c++)
284 : {
285 0 : if (cm->locolormap[c - CHR_MIN] == co)
286 : {
287 0 : *chars++ = c;
288 0 : if (--chars_len == 0)
289 0 : break;
290 : }
291 : }
292 : }
|