Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regexp.c
4 : * Postgres' interface to the regular expression package.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/regexp.c
12 : *
13 : * Alistair Crooks added the code for the regex caching
14 : * agc - cached the regular expressions used - there's a good chance
15 : * that we'll get a hit, so this saves a compile step for every
16 : * attempted match. I haven't actually measured the speed improvement,
17 : * but it `looks' a lot quicker visually when watching regression
18 : * test output.
19 : *
20 : * agc - incorporated Keith Bostic's Berkeley regex code into
21 : * the tree for all ports. To distinguish this regex code from any that
22 : * is existent on a platform, I've prepended the string "pg_" to
23 : * the functions regcomp, regerror, regexec and regfree.
24 : * Fixed a bug that was originally a typo by me, where `i' was used
25 : * instead of `oldest' when compiling regular expressions - benign
26 : * results mostly, although occasionally it bit you...
27 : *
28 : *-------------------------------------------------------------------------
29 : */
30 : #include "postgres.h"
31 :
32 : #include "catalog/pg_type.h"
33 : #include "funcapi.h"
34 : #include "miscadmin.h"
35 : #include "regex/regex.h"
36 : #include "utils/array.h"
37 : #include "utils/builtins.h"
38 : #include "utils/varlena.h"
39 :
40 : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
41 : (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)
42 :
43 :
/*
 * Options decoded from the flags argument of the regex functions.
 */
typedef struct pg_re_flags
{
	int			cflags;			/* flag bits handed to the regex compiler */
	bool		glob;			/* true: act on every occurrence, not just
								 * the first */
} pg_re_flags;
50 :
51 : /* cross-call state for regexp_match and regexp_split functions */
52 : typedef struct regexp_matches_ctx
53 : {
54 : text *orig_str; /* data string in original TEXT form */
55 : int nmatches; /* number of places where pattern matched */
56 : int npatterns; /* number of capturing subpatterns */
57 : /* We store start char index and end+1 char index for each match */
58 : /* so the number of entries in match_locs is nmatches * npatterns * 2 */
59 : int *match_locs; /* 0-based character indexes */
60 : int next_match; /* 0-based index of next match to process */
61 : /* workspace for build_regexp_match_result() */
62 : Datum *elems; /* has npatterns elements */
63 : bool *nulls; /* has npatterns elements */
64 : } regexp_matches_ctx;
65 :
66 : /*
67 : * We cache precompiled regular expressions using a "self organizing list"
68 : * structure, in which recently-used items tend to be near the front.
69 : * Whenever we use an entry, it's moved up to the front of the list.
70 : * Over time, an item's average position corresponds to its frequency of use.
71 : *
72 : * When we first create an entry, it's inserted at the front of
73 : * the array, dropping the entry at the end of the array if necessary to
74 : * make room. (This might seem to be weighting the new entry too heavily,
75 : * but if we insert new entries further back, we'll be unable to adjust to
76 : * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
77 : * never-before-seen items used circularly. We ought to be able to handle
78 : * that case, so we have to insert at the front.)
79 : *
80 : * Knuth mentions a variant strategy in which a used item is moved up just
81 : * one place in the list. Although he says this uses fewer comparisons on
82 : * average, it seems not to adapt very well to the situation where you have
83 : * both some reusable patterns and a steady stream of non-reusable patterns.
84 : * A reusable pattern that isn't used at least as often as non-reusable
85 : * patterns are seen will "fail to keep up" and will drop off the end of the
86 : * cache. With move-to-front, a reusable pattern is guaranteed to stay in
87 : * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
88 : */
89 :
90 : /* this is the maximum number of cached regular expressions */
91 : #ifndef MAX_CACHED_RES
92 : #define MAX_CACHED_RES 32
93 : #endif
94 :
95 : /* this structure describes one cached regular expression */
96 : typedef struct cached_re_str
97 : {
98 : char *cre_pat; /* original RE (not null terminated!) */
99 : int cre_pat_len; /* length of original RE, in bytes */
100 : int cre_flags; /* compile flags: extended,icase etc */
101 : Oid cre_collation; /* collation to use */
102 : regex_t cre_re; /* the compiled regular expression */
103 : } cached_re_str;
104 :
105 : static int num_res = 0; /* # of cached re's */
106 : static cached_re_str re_array[MAX_CACHED_RES]; /* cached re's */
107 :
108 :
109 : /* Local functions */
110 : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
111 : pg_re_flags *flags,
112 : Oid collation,
113 : bool use_subpatterns,
114 : bool ignore_degenerate);
115 : static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
116 : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
117 : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
118 :
119 :
120 : /*
121 : * RE_compile_and_cache - compile a RE, caching if possible
122 : *
123 : * Returns regex_t *
124 : *
125 : * text_re --- the pattern, expressed as a TEXT object
126 : * cflags --- compile options for the pattern
127 : * collation --- collation to use for LC_CTYPE-dependent behavior
128 : *
129 : * Pattern is given in the database encoding. We internally convert to
130 : * an array of pg_wchar, which is what Spencer's regex package wants.
131 : */
132 : static regex_t *
133 33696 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
134 : {
135 33696 : int text_re_len = VARSIZE_ANY_EXHDR(text_re);
136 33696 : char *text_re_val = VARDATA_ANY(text_re);
137 : pg_wchar *pattern;
138 : int pattern_len;
139 : int i;
140 : int regcomp_result;
141 : cached_re_str re_temp;
142 : char errMsg[100];
143 :
144 : /*
145 : * Look for a match among previously compiled REs. Since the data
146 : * structure is self-organizing with most-used entries at the front, our
147 : * search strategy can just be to scan from the front.
148 : */
149 36420 : for (i = 0; i < num_res; i++)
150 : {
151 69910 : if (re_array[i].cre_pat_len == text_re_len &&
152 67458 : re_array[i].cre_flags == cflags &&
153 67430 : re_array[i].cre_collation == collation &&
154 33715 : memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)
155 : {
156 : /*
157 : * Found a match; move it to front if not there already.
158 : */
159 33443 : if (i > 0)
160 : {
161 257 : re_temp = re_array[i];
162 257 : memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));
163 257 : re_array[0] = re_temp;
164 : }
165 :
166 33443 : return &re_array[0].cre_re;
167 : }
168 : }
169 :
170 : /*
171 : * Couldn't find it, so try to compile the new RE. To avoid leaking
172 : * resources on failure, we build into the re_temp local.
173 : */
174 :
175 : /* Convert pattern string to wide characters */
176 253 : pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
177 253 : pattern_len = pg_mb2wchar_with_len(text_re_val,
178 : pattern,
179 : text_re_len);
180 :
181 253 : regcomp_result = pg_regcomp(&re_temp.cre_re,
182 : pattern,
183 : pattern_len,
184 : cflags,
185 : collation);
186 :
187 253 : pfree(pattern);
188 :
189 253 : if (regcomp_result != REG_OKAY)
190 : {
191 : /* re didn't compile (no need for pg_regfree, if so) */
192 :
193 : /*
194 : * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
195 : * before reporting a regex error. This is so that if the regex
196 : * library aborts and returns REG_CANCEL, we don't print an error
197 : * message that implies the regex was invalid.
198 : */
199 6 : CHECK_FOR_INTERRUPTS();
200 :
201 6 : pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
202 6 : ereport(ERROR,
203 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
204 : errmsg("invalid regular expression: %s", errMsg)));
205 : }
206 :
207 : /*
208 : * We use malloc/free for the cre_pat field because the storage has to
209 : * persist across transactions, and because we want to get control back on
210 : * out-of-memory. The Max() is because some malloc implementations return
211 : * NULL for malloc(0).
212 : */
213 247 : re_temp.cre_pat = malloc(Max(text_re_len, 1));
214 247 : if (re_temp.cre_pat == NULL)
215 : {
216 0 : pg_regfree(&re_temp.cre_re);
217 0 : ereport(ERROR,
218 : (errcode(ERRCODE_OUT_OF_MEMORY),
219 : errmsg("out of memory")));
220 : }
221 247 : memcpy(re_temp.cre_pat, text_re_val, text_re_len);
222 247 : re_temp.cre_pat_len = text_re_len;
223 247 : re_temp.cre_flags = cflags;
224 247 : re_temp.cre_collation = collation;
225 :
226 : /*
227 : * Okay, we have a valid new item in re_temp; insert it into the storage
228 : * array. Discard last entry if needed.
229 : */
230 247 : if (num_res >= MAX_CACHED_RES)
231 : {
232 26 : --num_res;
233 26 : Assert(num_res < MAX_CACHED_RES);
234 26 : pg_regfree(&re_array[num_res].cre_re);
235 26 : free(re_array[num_res].cre_pat);
236 : }
237 :
238 247 : if (num_res > 0)
239 201 : memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));
240 :
241 247 : re_array[0] = re_temp;
242 247 : num_res++;
243 :
244 247 : return &re_array[0].cre_re;
245 : }
246 :
247 : /*
248 : * RE_wchar_execute - execute a RE on pg_wchar data
249 : *
250 : * Returns TRUE on match, FALSE on no match
251 : *
252 : * re --- the compiled pattern as returned by RE_compile_and_cache
253 : * data --- the data to match against (need not be null-terminated)
254 : * data_len --- the length of the data string
255 : * start_search -- the offset in the data to start searching
256 : * nmatch, pmatch --- optional return area for match details
257 : *
258 : * Data is given as array of pg_wchar which is what Spencer's regex package
259 : * wants.
260 : */
261 : static bool
262 32643 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
263 : int start_search, int nmatch, regmatch_t *pmatch)
264 : {
265 : int regexec_result;
266 : char errMsg[100];
267 :
268 : /* Perform RE match and return result */
269 32643 : regexec_result = pg_regexec(re,
270 : data,
271 : data_len,
272 : start_search,
273 : NULL, /* no details */
274 : nmatch,
275 : pmatch,
276 : 0);
277 :
278 32643 : if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
279 : {
280 : /* re failed??? */
281 0 : CHECK_FOR_INTERRUPTS();
282 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
283 0 : ereport(ERROR,
284 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
285 : errmsg("regular expression failed: %s", errMsg)));
286 : }
287 :
288 32643 : return (regexec_result == REG_OKAY);
289 : }
290 :
291 : /*
292 : * RE_execute - execute a RE
293 : *
294 : * Returns TRUE on match, FALSE on no match
295 : *
296 : * re --- the compiled pattern as returned by RE_compile_and_cache
297 : * dat --- the data to match against (need not be null-terminated)
298 : * dat_len --- the length of the data string
299 : * nmatch, pmatch --- optional return area for match details
300 : *
301 : * Data is given in the database encoding. We internally
302 : * convert to array of pg_wchar which is what Spencer's regex package wants.
303 : */
304 : static bool
305 32358 : RE_execute(regex_t *re, char *dat, int dat_len,
306 : int nmatch, regmatch_t *pmatch)
307 : {
308 : pg_wchar *data;
309 : int data_len;
310 : bool match;
311 :
312 : /* Convert data string to wide characters */
313 32358 : data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
314 32358 : data_len = pg_mb2wchar_with_len(dat, data, dat_len);
315 :
316 : /* Perform RE match and return result */
317 32358 : match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);
318 :
319 32358 : pfree(data);
320 32358 : return match;
321 : }
322 :
323 : /*
324 : * RE_compile_and_execute - compile and execute a RE
325 : *
326 : * Returns TRUE on match, FALSE on no match
327 : *
328 : * text_re --- the pattern, expressed as a TEXT object
329 : * dat --- the data to match against (need not be null-terminated)
330 : * dat_len --- the length of the data string
331 : * cflags --- compile options for the pattern
332 : * collation --- collation to use for LC_CTYPE-dependent behavior
333 : * nmatch, pmatch --- optional return area for match details
334 : *
335 : * Both pattern and data are given in the database encoding. We internally
336 : * convert to array of pg_wchar which is what Spencer's regex package wants.
337 : */
338 : static bool
339 32354 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
340 : int cflags, Oid collation,
341 : int nmatch, regmatch_t *pmatch)
342 : {
343 : regex_t *re;
344 :
345 : /* Compile RE */
346 32354 : re = RE_compile_and_cache(text_re, cflags, collation);
347 :
348 32350 : return RE_execute(re, dat, dat_len, nmatch, pmatch);
349 : }
350 :
351 :
352 : /*
353 : * parse_re_flags - parse the options argument of regexp_match and friends
354 : *
355 : * flags --- output argument, filled with desired options
356 : * opts --- TEXT object, or NULL for defaults
357 : *
358 : * This accepts all the options allowed by any of the callers; callers that
359 : * don't want some have to reject them after the fact.
360 : */
361 : static void
362 64 : parse_re_flags(pg_re_flags *flags, text *opts)
363 : {
364 : /* regex flavor is always folded into the compile flags */
365 64 : flags->cflags = REG_ADVANCED;
366 64 : flags->glob = false;
367 :
368 64 : if (opts)
369 : {
370 23 : char *opt_p = VARDATA_ANY(opts);
371 23 : int opt_len = VARSIZE_ANY_EXHDR(opts);
372 : int i;
373 :
374 51 : for (i = 0; i < opt_len; i++)
375 : {
376 32 : switch (opt_p[i])
377 : {
378 : case 'g':
379 15 : flags->glob = true;
380 15 : break;
381 : case 'b': /* BREs (but why???) */
382 0 : flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
383 0 : break;
384 : case 'c': /* case sensitive */
385 0 : flags->cflags &= ~REG_ICASE;
386 0 : break;
387 : case 'e': /* plain EREs */
388 0 : flags->cflags |= REG_EXTENDED;
389 0 : flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
390 0 : break;
391 : case 'i': /* case insensitive */
392 7 : flags->cflags |= REG_ICASE;
393 7 : break;
394 : case 'm': /* Perloid synonym for n */
395 : case 'n': /* \n affects ^ $ . [^ */
396 6 : flags->cflags |= REG_NEWLINE;
397 6 : break;
398 : case 'p': /* ~Perl, \n affects . [^ */
399 0 : flags->cflags |= REG_NLSTOP;
400 0 : flags->cflags &= ~REG_NLANCH;
401 0 : break;
402 : case 'q': /* literal string */
403 0 : flags->cflags |= REG_QUOTE;
404 0 : flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
405 0 : break;
406 : case 's': /* single line, \n ordinary */
407 0 : flags->cflags &= ~REG_NEWLINE;
408 0 : break;
409 : case 't': /* tight syntax */
410 0 : flags->cflags &= ~REG_EXPANDED;
411 0 : break;
412 : case 'w': /* weird, \n affects ^ $ only */
413 0 : flags->cflags &= ~REG_NLSTOP;
414 0 : flags->cflags |= REG_NLANCH;
415 0 : break;
416 : case 'x': /* expanded syntax */
417 0 : flags->cflags |= REG_EXPANDED;
418 0 : break;
419 : default:
420 4 : ereport(ERROR,
421 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
422 : errmsg("invalid regexp option: \"%c\"",
423 : opt_p[i])));
424 : break;
425 : }
426 : }
427 : }
428 60 : }
429 :
430 :
431 : /*
432 : * interface routines called by the function manager
433 : */
434 :
435 : Datum
436 4257 : nameregexeq(PG_FUNCTION_ARGS)
437 : {
438 4257 : Name n = PG_GETARG_NAME(0);
439 4257 : text *p = PG_GETARG_TEXT_PP(1);
440 :
441 4257 : PG_RETURN_BOOL(RE_compile_and_execute(p,
442 : NameStr(*n),
443 : strlen(NameStr(*n)),
444 : REG_ADVANCED,
445 : PG_GET_COLLATION(),
446 : 0, NULL));
447 : }
448 :
449 : Datum
450 288 : nameregexne(PG_FUNCTION_ARGS)
451 : {
452 288 : Name n = PG_GETARG_NAME(0);
453 288 : text *p = PG_GETARG_TEXT_PP(1);
454 :
455 288 : PG_RETURN_BOOL(!RE_compile_and_execute(p,
456 : NameStr(*n),
457 : strlen(NameStr(*n)),
458 : REG_ADVANCED,
459 : PG_GET_COLLATION(),
460 : 0, NULL));
461 : }
462 :
463 : Datum
464 22123 : textregexeq(PG_FUNCTION_ARGS)
465 : {
466 22123 : text *s = PG_GETARG_TEXT_PP(0);
467 22123 : text *p = PG_GETARG_TEXT_PP(1);
468 :
469 22123 : PG_RETURN_BOOL(RE_compile_and_execute(p,
470 : VARDATA_ANY(s),
471 : VARSIZE_ANY_EXHDR(s),
472 : REG_ADVANCED,
473 : PG_GET_COLLATION(),
474 : 0, NULL));
475 : }
476 :
477 : Datum
478 5685 : textregexne(PG_FUNCTION_ARGS)
479 : {
480 5685 : text *s = PG_GETARG_TEXT_PP(0);
481 5685 : text *p = PG_GETARG_TEXT_PP(1);
482 :
483 5685 : PG_RETURN_BOOL(!RE_compile_and_execute(p,
484 : VARDATA_ANY(s),
485 : VARSIZE_ANY_EXHDR(s),
486 : REG_ADVANCED,
487 : PG_GET_COLLATION(),
488 : 0, NULL));
489 : }
490 :
491 :
492 : /*
493 : * routines that use the regexp stuff, but ignore the case.
494 : * for this, we use the REG_ICASE flag to pg_regcomp
495 : */
496 :
497 :
498 : Datum
499 0 : nameicregexeq(PG_FUNCTION_ARGS)
500 : {
501 0 : Name n = PG_GETARG_NAME(0);
502 0 : text *p = PG_GETARG_TEXT_PP(1);
503 :
504 0 : PG_RETURN_BOOL(RE_compile_and_execute(p,
505 : NameStr(*n),
506 : strlen(NameStr(*n)),
507 : REG_ADVANCED | REG_ICASE,
508 : PG_GET_COLLATION(),
509 : 0, NULL));
510 : }
511 :
512 : Datum
513 1 : nameicregexne(PG_FUNCTION_ARGS)
514 : {
515 1 : Name n = PG_GETARG_NAME(0);
516 1 : text *p = PG_GETARG_TEXT_PP(1);
517 :
518 1 : PG_RETURN_BOOL(!RE_compile_and_execute(p,
519 : NameStr(*n),
520 : strlen(NameStr(*n)),
521 : REG_ADVANCED | REG_ICASE,
522 : PG_GET_COLLATION(),
523 : 0, NULL));
524 : }
525 :
526 : Datum
527 0 : texticregexeq(PG_FUNCTION_ARGS)
528 : {
529 0 : text *s = PG_GETARG_TEXT_PP(0);
530 0 : text *p = PG_GETARG_TEXT_PP(1);
531 :
532 0 : PG_RETURN_BOOL(RE_compile_and_execute(p,
533 : VARDATA_ANY(s),
534 : VARSIZE_ANY_EXHDR(s),
535 : REG_ADVANCED | REG_ICASE,
536 : PG_GET_COLLATION(),
537 : 0, NULL));
538 : }
539 :
540 : Datum
541 0 : texticregexne(PG_FUNCTION_ARGS)
542 : {
543 0 : text *s = PG_GETARG_TEXT_PP(0);
544 0 : text *p = PG_GETARG_TEXT_PP(1);
545 :
546 0 : PG_RETURN_BOOL(!RE_compile_and_execute(p,
547 : VARDATA_ANY(s),
548 : VARSIZE_ANY_EXHDR(s),
549 : REG_ADVANCED | REG_ICASE,
550 : PG_GET_COLLATION(),
551 : 0, NULL));
552 : }
553 :
554 :
555 : /*
556 : * textregexsubstr()
557 : * Return a substring matched by a regular expression.
558 : */
559 : Datum
560 8 : textregexsubstr(PG_FUNCTION_ARGS)
561 : {
562 8 : text *s = PG_GETARG_TEXT_PP(0);
563 8 : text *p = PG_GETARG_TEXT_PP(1);
564 : regex_t *re;
565 : regmatch_t pmatch[2];
566 : int so,
567 : eo;
568 :
569 : /* Compile RE */
570 8 : re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
571 :
572 : /*
573 : * We pass two regmatch_t structs to get info about the overall match and
574 : * the match for the first parenthesized subexpression (if any). If there
575 : * is a parenthesized subexpression, we return what it matched; else
576 : * return what the whole regexp matched.
577 : */
578 32 : if (!RE_execute(re,
579 32 : VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
580 : 2, pmatch))
581 1 : PG_RETURN_NULL(); /* definitely no match */
582 :
583 7 : if (re->re_nsub > 0)
584 : {
585 : /* has parenthesized subexpressions, use the first one */
586 5 : so = pmatch[1].rm_so;
587 5 : eo = pmatch[1].rm_eo;
588 : }
589 : else
590 : {
591 : /* no parenthesized subexpression, use whole match */
592 2 : so = pmatch[0].rm_so;
593 2 : eo = pmatch[0].rm_eo;
594 : }
595 :
596 : /*
597 : * It is possible to have a match to the whole pattern but no match for a
598 : * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
599 : * there is no subexpression match. So this extra test for match failure
600 : * is not redundant.
601 : */
602 7 : if (so < 0 || eo < 0)
603 0 : PG_RETURN_NULL();
604 :
605 7 : return DirectFunctionCall3(text_substr,
606 : PointerGetDatum(s),
607 : Int32GetDatum(so + 1),
608 : Int32GetDatum(eo - so));
609 : }
610 :
611 : /*
612 : * textregexreplace_noopt()
613 : * Return a string matched by a regular expression, with replacement.
614 : *
615 : * This version doesn't have an option argument: we default to case
616 : * sensitive match, replace the first instance only.
617 : */
618 : Datum
619 21 : textregexreplace_noopt(PG_FUNCTION_ARGS)
620 : {
621 21 : text *s = PG_GETARG_TEXT_PP(0);
622 21 : text *p = PG_GETARG_TEXT_PP(1);
623 21 : text *r = PG_GETARG_TEXT_PP(2);
624 : regex_t *re;
625 :
626 21 : re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
627 :
628 21 : PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
629 : }
630 :
631 : /*
632 : * textregexreplace()
633 : * Return a string matched by a regular expression, with replacement.
634 : */
635 : Datum
636 5 : textregexreplace(PG_FUNCTION_ARGS)
637 : {
638 5 : text *s = PG_GETARG_TEXT_PP(0);
639 5 : text *p = PG_GETARG_TEXT_PP(1);
640 5 : text *r = PG_GETARG_TEXT_PP(2);
641 5 : text *opt = PG_GETARG_TEXT_PP(3);
642 : regex_t *re;
643 : pg_re_flags flags;
644 :
645 5 : parse_re_flags(&flags, opt);
646 :
647 4 : re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
648 :
649 4 : PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
650 : }
651 :
652 : /*
653 : * similar_escape()
654 : * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
655 : * our regexp engine.
656 : */
657 : Datum
658 2 : similar_escape(PG_FUNCTION_ARGS)
659 : {
660 : text *pat_text;
661 : text *esc_text;
662 : text *result;
663 : char *p,
664 : *e,
665 : *r;
666 : int plen,
667 : elen;
668 2 : bool afterescape = false;
669 2 : bool incharclass = false;
670 2 : int nquotes = 0;
671 :
672 : /* This function is not strict, so must test explicitly */
673 2 : if (PG_ARGISNULL(0))
674 0 : PG_RETURN_NULL();
675 2 : pat_text = PG_GETARG_TEXT_PP(0);
676 2 : p = VARDATA_ANY(pat_text);
677 2 : plen = VARSIZE_ANY_EXHDR(pat_text);
678 2 : if (PG_ARGISNULL(1))
679 : {
680 : /* No ESCAPE clause provided; default to backslash as escape */
681 0 : e = "\\";
682 0 : elen = 1;
683 : }
684 : else
685 : {
686 2 : esc_text = PG_GETARG_TEXT_PP(1);
687 2 : e = VARDATA_ANY(esc_text);
688 2 : elen = VARSIZE_ANY_EXHDR(esc_text);
689 2 : if (elen == 0)
690 0 : e = NULL; /* no escape character */
691 : else
692 : {
693 2 : int escape_mblen = pg_mbstrlen_with_len(e, elen);
694 :
695 2 : if (escape_mblen > 1)
696 0 : ereport(ERROR,
697 : (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
698 : errmsg("invalid escape string"),
699 : errhint("Escape string must be empty or one character.")));
700 : }
701 : }
702 :
703 : /*----------
704 : * We surround the transformed input string with
705 : * ^(?: ... )$
706 : * which requires some explanation. We need "^" and "$" to force
707 : * the pattern to match the entire input string as per SQL99 spec.
708 : * The "(?:" and ")" are a non-capturing set of parens; we have to have
709 : * parens in case the string contains "|", else the "^" and "$" will
710 : * be bound into the first and last alternatives which is not what we
711 : * want, and the parens must be non capturing because we don't want them
712 : * to count when selecting output for SUBSTRING.
713 : *----------
714 : */
715 :
716 : /*
717 : * We need room for the prefix/postfix plus as many as 3 output bytes per
718 : * input byte; since the input is at most 1GB this can't overflow
719 : */
720 2 : result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
721 2 : r = VARDATA(result);
722 :
723 2 : *r++ = '^';
724 2 : *r++ = '(';
725 2 : *r++ = '?';
726 2 : *r++ = ':';
727 :
728 25 : while (plen > 0)
729 : {
730 21 : char pchar = *p;
731 :
732 : /*
733 : * If both the escape character and the current character from the
734 : * pattern are multi-byte, we need to take the slow path.
735 : *
736 : * But if one of them is single-byte, we can process the pattern one
737 : * byte at a time, ignoring multi-byte characters. (This works
738 : * because all server-encodings have the property that a valid
739 : * multi-byte character representation cannot contain the
740 : * representation of a valid single-byte character.)
741 : */
742 :
743 21 : if (elen > 1)
744 : {
745 0 : int mblen = pg_mblen(p);
746 :
747 0 : if (mblen > 1)
748 : {
749 : /* slow, multi-byte path */
750 0 : if (afterescape)
751 : {
752 0 : *r++ = '\\';
753 0 : memcpy(r, p, mblen);
754 0 : r += mblen;
755 0 : afterescape = false;
756 : }
757 0 : else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
758 : {
759 : /* SQL99 escape character; do not send to output */
760 0 : afterescape = true;
761 : }
762 : else
763 : {
764 : /*
765 : * We know it's a multi-byte character, so we don't need
766 : * to do all the comparisons to single-byte characters
767 : * that we do below.
768 : */
769 0 : memcpy(r, p, mblen);
770 0 : r += mblen;
771 : }
772 :
773 0 : p += mblen;
774 0 : plen -= mblen;
775 :
776 0 : continue;
777 : }
778 : }
779 :
780 : /* fast path */
781 21 : if (afterescape)
782 : {
783 4 : if (pchar == '"' && !incharclass) /* for SUBSTRING patterns */
784 4 : *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
785 : else
786 : {
787 0 : *r++ = '\\';
788 0 : *r++ = pchar;
789 : }
790 4 : afterescape = false;
791 : }
792 17 : else if (e && pchar == *e)
793 : {
794 : /* SQL99 escape character; do not send to output */
795 4 : afterescape = true;
796 : }
797 13 : else if (incharclass)
798 : {
799 0 : if (pchar == '\\')
800 0 : *r++ = '\\';
801 0 : *r++ = pchar;
802 0 : if (pchar == ']')
803 0 : incharclass = false;
804 : }
805 13 : else if (pchar == '[')
806 : {
807 0 : *r++ = pchar;
808 0 : incharclass = true;
809 : }
810 13 : else if (pchar == '%')
811 : {
812 2 : *r++ = '.';
813 2 : *r++ = '*';
814 : }
815 11 : else if (pchar == '_')
816 2 : *r++ = '.';
817 9 : else if (pchar == '(')
818 : {
819 : /* convert to non-capturing parenthesis */
820 2 : *r++ = '(';
821 2 : *r++ = '?';
822 2 : *r++ = ':';
823 : }
824 7 : else if (pchar == '\\' || pchar == '.' ||
825 7 : pchar == '^' || pchar == '$')
826 : {
827 0 : *r++ = '\\';
828 0 : *r++ = pchar;
829 : }
830 : else
831 7 : *r++ = pchar;
832 21 : p++, plen--;
833 : }
834 :
835 2 : *r++ = ')';
836 2 : *r++ = '$';
837 :
838 2 : SET_VARSIZE(result, r - ((char *) result));
839 :
840 2 : PG_RETURN_TEXT_P(result);
841 : }
842 :
843 : /*
844 : * regexp_match()
845 : * Return the first substring(s) matching a pattern within a string.
846 : */
847 : Datum
848 5 : regexp_match(PG_FUNCTION_ARGS)
849 : {
850 5 : text *orig_str = PG_GETARG_TEXT_PP(0);
851 5 : text *pattern = PG_GETARG_TEXT_PP(1);
852 5 : text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
853 : pg_re_flags re_flags;
854 : regexp_matches_ctx *matchctx;
855 :
856 : /* Determine options */
857 5 : parse_re_flags(&re_flags, flags);
858 : /* User mustn't specify 'g' */
859 5 : if (re_flags.glob)
860 1 : ereport(ERROR,
861 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
862 : errmsg("regexp_match does not support the global option"),
863 : errhint("Use the regexp_matches function instead.")));
864 :
865 4 : matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
866 : PG_GET_COLLATION(), true, false);
867 :
868 4 : if (matchctx->nmatches == 0)
869 1 : PG_RETURN_NULL();
870 :
871 3 : Assert(matchctx->nmatches == 1);
872 :
873 : /* Create workspace that build_regexp_match_result needs */
874 3 : matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
875 3 : matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
876 :
877 3 : PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
878 : }
879 :
880 : /* This is separate to keep the opr_sanity regression test from complaining */
881 : Datum
882 3 : regexp_match_no_flags(PG_FUNCTION_ARGS)
883 : {
884 3 : return regexp_match(fcinfo);
885 : }
886 :
887 : /*
888 : * regexp_matches()
889 : * Return a table of all matches of a pattern within a string.
890 : */
891 : Datum
892 91 : regexp_matches(PG_FUNCTION_ARGS)
893 : {
894 : FuncCallContext *funcctx;
895 : regexp_matches_ctx *matchctx;
896 :
897 91 : if (SRF_IS_FIRSTCALL())
898 : {
899 37 : text *pattern = PG_GETARG_TEXT_PP(1);
900 37 : text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
901 : pg_re_flags re_flags;
902 : MemoryContext oldcontext;
903 :
904 37 : funcctx = SRF_FIRSTCALL_INIT();
905 37 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
906 :
907 : /* Determine options */
908 37 : parse_re_flags(&re_flags, flags);
909 :
910 : /* be sure to copy the input string into the multi-call ctx */
911 36 : matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
912 : &re_flags,
913 : PG_GET_COLLATION(),
914 : true, false);
915 :
916 : /* Pre-create workspace that build_regexp_match_result needs */
917 34 : matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
918 34 : matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
919 :
920 34 : MemoryContextSwitchTo(oldcontext);
921 34 : funcctx->user_fctx = (void *) matchctx;
922 : }
923 :
924 88 : funcctx = SRF_PERCALL_SETUP();
925 88 : matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
926 :
927 88 : if (matchctx->next_match < matchctx->nmatches)
928 : {
929 : ArrayType *result_ary;
930 :
931 54 : result_ary = build_regexp_match_result(matchctx);
932 54 : matchctx->next_match++;
933 54 : SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
934 : }
935 :
936 : /* release space in multi-call ctx to avoid intraquery memory leak */
937 34 : cleanup_regexp_matches(matchctx);
938 :
939 34 : SRF_RETURN_DONE(funcctx);
940 : }
941 :
942 : /* This is separate to keep the opr_sanity regression test from complaining */
943 : Datum
944 43 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
945 : {
946 43 : return regexp_matches(fcinfo);
947 : }
948 :
949 : /*
950 : * setup_regexp_matches --- do the initial matching for regexp_match
951 : * and regexp_split functions
952 : *
953 : * To avoid having to re-find the compiled pattern on each call, we do
954 : * all the matching in one swoop. The returned regexp_matches_ctx contains
955 : * the locations of all the substrings matching the pattern.
956 : *
957 : * The two bool parameters have only two patterns (one for matching, one for
958 : * splitting) but it seems clearer to distinguish the functionality this way
959 : * than to key it all off one "is_split" flag.
960 : */
961 : static regexp_matches_ctx *
962 53 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
963 : Oid collation,
964 : bool use_subpatterns,
965 : bool ignore_degenerate)
966 : {
967 53 : regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
968 : int orig_len;
969 : pg_wchar *wide_str;
970 : int wide_len;
971 : regex_t *cpattern;
972 : regmatch_t *pmatch;
973 : int pmatch_len;
974 : int array_len;
975 : int array_idx;
976 : int prev_match_end;
977 : int start_search;
978 :
979 : /* save original string --- we'll extract result substrings from it */
980 53 : matchctx->orig_str = orig_str;
981 :
982 : /* convert string to pg_wchar form for matching */
983 53 : orig_len = VARSIZE_ANY_EXHDR(orig_str);
984 53 : wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
985 53 : wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
986 :
987 : /* set up the compiled pattern */
988 53 : cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
989 :
990 : /* do we want to remember subpatterns? */
991 51 : if (use_subpatterns && cpattern->re_nsub > 0)
992 : {
993 9 : matchctx->npatterns = cpattern->re_nsub;
994 9 : pmatch_len = cpattern->re_nsub + 1;
995 : }
996 : else
997 : {
998 42 : use_subpatterns = false;
999 42 : matchctx->npatterns = 1;
1000 42 : pmatch_len = 1;
1001 : }
1002 :
1003 : /* temporary output space for RE package */
1004 51 : pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
1005 :
1006 : /* the real output space (grown dynamically if needed) */
1007 51 : array_len = re_flags->glob ? 256 : 32;
1008 51 : matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
1009 51 : array_idx = 0;
1010 :
1011 : /* search for the pattern, perhaps repeatedly */
1012 51 : prev_match_end = 0;
1013 51 : start_search = 0;
1014 336 : while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
1015 : pmatch_len, pmatch))
1016 : {
1017 : /*
1018 : * If requested, ignore degenerate matches, which are zero-length
1019 : * matches occurring at the start or end of a string or just after a
1020 : * previous match.
1021 : */
1022 469 : if (!ignore_degenerate ||
1023 408 : (pmatch[0].rm_so < wide_len &&
1024 202 : pmatch[0].rm_eo > prev_match_end))
1025 : {
1026 : /* enlarge output space if needed */
1027 478 : while (array_idx + matchctx->npatterns * 2 > array_len)
1028 : {
1029 0 : array_len *= 2;
1030 0 : matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
1031 : sizeof(int) * array_len);
1032 : }
1033 :
1034 : /* save this match's locations */
1035 239 : if (use_subpatterns)
1036 : {
1037 : int i;
1038 :
1039 33 : for (i = 1; i <= matchctx->npatterns; i++)
1040 : {
1041 23 : matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
1042 23 : matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
1043 : }
1044 : }
1045 : else
1046 : {
1047 229 : matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
1048 229 : matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
1049 : }
1050 239 : matchctx->nmatches++;
1051 : }
1052 263 : prev_match_end = pmatch[0].rm_eo;
1053 :
1054 : /* if not glob, stop after one match */
1055 263 : if (!re_flags->glob)
1056 21 : break;
1057 :
1058 : /*
1059 : * Advance search position. Normally we start the next search at the
1060 : * end of the previous match; but if the match was of zero length, we
1061 : * have to advance by one character, or we'd just find the same match
1062 : * again.
1063 : */
1064 242 : start_search = prev_match_end;
1065 242 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
1066 180 : start_search++;
1067 242 : if (start_search > wide_len)
1068 8 : break;
1069 : }
1070 :
1071 : /* Clean up temp storage */
1072 51 : pfree(wide_str);
1073 51 : pfree(pmatch);
1074 :
1075 51 : return matchctx;
1076 : }
1077 :
1078 : /*
1079 : * cleanup_regexp_matches - release memory of a regexp_matches_ctx
1080 : */
1081 : static void
1082 39 : cleanup_regexp_matches(regexp_matches_ctx *matchctx)
1083 : {
1084 39 : pfree(matchctx->orig_str);
1085 39 : pfree(matchctx->match_locs);
1086 39 : if (matchctx->elems)
1087 34 : pfree(matchctx->elems);
1088 39 : if (matchctx->nulls)
1089 34 : pfree(matchctx->nulls);
1090 39 : pfree(matchctx);
1091 39 : }
1092 :
1093 : /*
1094 : * build_regexp_match_result - build output array for current match
1095 : */
1096 : static ArrayType *
1097 57 : build_regexp_match_result(regexp_matches_ctx *matchctx)
1098 : {
1099 57 : Datum *elems = matchctx->elems;
1100 57 : bool *nulls = matchctx->nulls;
1101 : int dims[1];
1102 : int lbs[1];
1103 : int loc;
1104 : int i;
1105 :
1106 : /* Extract matching substrings from the original string */
1107 57 : loc = matchctx->next_match * matchctx->npatterns * 2;
1108 127 : for (i = 0; i < matchctx->npatterns; i++)
1109 : {
1110 70 : int so = matchctx->match_locs[loc++];
1111 70 : int eo = matchctx->match_locs[loc++];
1112 :
1113 70 : if (so < 0 || eo < 0)
1114 : {
1115 1 : elems[i] = (Datum) 0;
1116 1 : nulls[i] = true;
1117 : }
1118 : else
1119 : {
1120 69 : elems[i] = DirectFunctionCall3(text_substr,
1121 : PointerGetDatum(matchctx->orig_str),
1122 : Int32GetDatum(so + 1),
1123 : Int32GetDatum(eo - so));
1124 69 : nulls[i] = false;
1125 : }
1126 : }
1127 :
1128 : /* And form an array */
1129 57 : dims[0] = matchctx->npatterns;
1130 57 : lbs[0] = 1;
1131 : /* XXX: this hardcodes assumptions about the text type */
1132 57 : return construct_md_array(elems, nulls, 1, dims, lbs,
1133 : TEXTOID, -1, false, 'i');
1134 : }
1135 :
1136 : /*
1137 : * regexp_split_to_table()
1138 : * Split the string at matches of the pattern, returning the
1139 : * split-out substrings as a table.
1140 : */
1141 : Datum
1142 99 : regexp_split_to_table(PG_FUNCTION_ARGS)
1143 : {
1144 : FuncCallContext *funcctx;
1145 : regexp_matches_ctx *splitctx;
1146 :
1147 99 : if (SRF_IS_FIRSTCALL())
1148 : {
1149 7 : text *pattern = PG_GETARG_TEXT_PP(1);
1150 7 : text *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
1151 : pg_re_flags re_flags;
1152 : MemoryContext oldcontext;
1153 :
1154 7 : funcctx = SRF_FIRSTCALL_INIT();
1155 7 : oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
1156 :
1157 : /* Determine options */
1158 7 : parse_re_flags(&re_flags, flags);
1159 : /* User mustn't specify 'g' */
1160 6 : if (re_flags.glob)
1161 1 : ereport(ERROR,
1162 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1163 : errmsg("regexp_split_to_table does not support the global option")));
1164 : /* But we find all the matches anyway */
1165 5 : re_flags.glob = true;
1166 :
1167 : /* be sure to copy the input string into the multi-call ctx */
1168 5 : splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
1169 : &re_flags,
1170 : PG_GET_COLLATION(),
1171 : false, true);
1172 :
1173 5 : MemoryContextSwitchTo(oldcontext);
1174 5 : funcctx->user_fctx = (void *) splitctx;
1175 : }
1176 :
1177 97 : funcctx = SRF_PERCALL_SETUP();
1178 97 : splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
1179 :
1180 97 : if (splitctx->next_match <= splitctx->nmatches)
1181 : {
1182 92 : Datum result = build_regexp_split_result(splitctx);
1183 :
1184 92 : splitctx->next_match++;
1185 92 : SRF_RETURN_NEXT(funcctx, result);
1186 : }
1187 :
1188 : /* release space in multi-call ctx to avoid intraquery memory leak */
1189 5 : cleanup_regexp_matches(splitctx);
1190 :
1191 5 : SRF_RETURN_DONE(funcctx);
1192 : }
1193 :
1194 : /* This is separate to keep the opr_sanity regression test from complaining */
1195 : Datum
1196 92 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
1197 : {
1198 92 : return regexp_split_to_table(fcinfo);
1199 : }
1200 :
1201 : /*
1202 : * regexp_split_to_array()
1203 : * Split the string at matches of the pattern, returning the
1204 : * split-out substrings as an array.
1205 : */
1206 : Datum
1207 10 : regexp_split_to_array(PG_FUNCTION_ARGS)
1208 : {
1209 10 : ArrayBuildState *astate = NULL;
1210 : pg_re_flags re_flags;
1211 : regexp_matches_ctx *splitctx;
1212 :
1213 : /* Determine options */
1214 10 : parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
1215 : /* User mustn't specify 'g' */
1216 9 : if (re_flags.glob)
1217 1 : ereport(ERROR,
1218 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
1219 : errmsg("regexp_split_to_array does not support the global option")));
1220 : /* But we find all the matches anyway */
1221 8 : re_flags.glob = true;
1222 :
1223 16 : splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
1224 8 : PG_GETARG_TEXT_PP(1),
1225 : &re_flags,
1226 : PG_GET_COLLATION(),
1227 : false, true);
1228 :
1229 119 : while (splitctx->next_match <= splitctx->nmatches)
1230 : {
1231 103 : astate = accumArrayResult(astate,
1232 : build_regexp_split_result(splitctx),
1233 : false,
1234 : TEXTOID,
1235 : CurrentMemoryContext);
1236 103 : splitctx->next_match++;
1237 : }
1238 :
1239 : /*
1240 : * We don't call cleanup_regexp_matches here; it would try to pfree the
1241 : * input string, which we didn't copy. The space is not in a long-lived
1242 : * memory context anyway.
1243 : */
1244 :
1245 8 : PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
1246 : }
1247 :
1248 : /* This is separate to keep the opr_sanity regression test from complaining */
1249 : Datum
1250 7 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
1251 : {
1252 7 : return regexp_split_to_array(fcinfo);
1253 : }
1254 :
1255 : /*
1256 : * build_regexp_split_result - build output string for current match
1257 : *
1258 : * We return the string between the current match and the previous one,
1259 : * or the string after the last match when next_match == nmatches.
1260 : */
1261 : static Datum
1262 195 : build_regexp_split_result(regexp_matches_ctx *splitctx)
1263 : {
1264 : int startpos;
1265 : int endpos;
1266 :
1267 195 : if (splitctx->next_match > 0)
1268 182 : startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
1269 : else
1270 13 : startpos = 0;
1271 195 : if (startpos < 0)
1272 0 : elog(ERROR, "invalid match ending position");
1273 :
1274 195 : if (splitctx->next_match < splitctx->nmatches)
1275 : {
1276 182 : endpos = splitctx->match_locs[splitctx->next_match * 2];
1277 182 : if (endpos < startpos)
1278 0 : elog(ERROR, "invalid match starting position");
1279 182 : return DirectFunctionCall3(text_substr,
1280 : PointerGetDatum(splitctx->orig_str),
1281 : Int32GetDatum(startpos + 1),
1282 : Int32GetDatum(endpos - startpos));
1283 : }
1284 : else
1285 : {
1286 : /* no more matches, return rest of string */
1287 13 : return DirectFunctionCall2(text_substr_no_len,
1288 : PointerGetDatum(splitctx->orig_str),
1289 : Int32GetDatum(startpos + 1));
1290 : }
1291 : }
1292 :
1293 : /*
1294 : * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
1295 : *
1296 : * The result is NULL if there is no fixed prefix, else a palloc'd string.
1297 : * If it is an exact match, not just a prefix, *exact is returned as TRUE.
1298 : */
1299 : char *
1300 1256 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
1301 : bool *exact)
1302 : {
1303 : char *result;
1304 : regex_t *re;
1305 : int cflags;
1306 : int re_result;
1307 : pg_wchar *str;
1308 : size_t slen;
1309 : size_t maxlen;
1310 : char errMsg[100];
1311 :
1312 1256 : *exact = false; /* default result */
1313 :
1314 : /* Compile RE */
1315 1256 : cflags = REG_ADVANCED;
1316 1256 : if (case_insensitive)
1317 0 : cflags |= REG_ICASE;
1318 :
1319 1256 : re = RE_compile_and_cache(text_re, cflags, collation);
1320 :
1321 : /* Examine it to see if there's a fixed prefix */
1322 1256 : re_result = pg_regprefix(re, &str, &slen);
1323 :
1324 1256 : switch (re_result)
1325 : {
1326 : case REG_NOMATCH:
1327 46 : return NULL;
1328 :
1329 : case REG_PREFIX:
1330 : /* continue with wchar conversion */
1331 98 : break;
1332 :
1333 : case REG_EXACT:
1334 1112 : *exact = true;
1335 : /* continue with wchar conversion */
1336 1112 : break;
1337 :
1338 : default:
1339 : /* re failed??? */
1340 0 : CHECK_FOR_INTERRUPTS();
1341 0 : pg_regerror(re_result, re, errMsg, sizeof(errMsg));
1342 0 : ereport(ERROR,
1343 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
1344 : errmsg("regular expression failed: %s", errMsg)));
1345 : break;
1346 : }
1347 :
1348 : /* Convert pg_wchar result back to database encoding */
1349 1210 : maxlen = pg_database_encoding_max_length() * slen + 1;
1350 1210 : result = (char *) palloc(maxlen);
1351 1210 : slen = pg_wchar2mb_with_len(str, result, slen);
1352 1210 : Assert(slen < maxlen);
1353 :
1354 1210 : free(str);
1355 :
1356 1210 : return result;
1357 : }
|