LCOV - PostgreSQL - src/backend/utils/adt/regexp.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/adt - regexp.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL	Lines:	331	409	80.9 %
Date:	2017-09-29 15:12:54	Functions:	27	30	90.0 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * regexp.c
       4             :  *    Postgres' interface to the regular expression package.
       5             :  *
       6             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
       7             :  * Portions Copyright (c) 1994, Regents of the University of California
       8             :  *
       9             :  *
      10             :  * IDENTIFICATION
      11             :  *    src/backend/utils/adt/regexp.c
      12             :  *
      13             :  *      Alistair Crooks added the code for the regex caching
      14             :  *      agc - cached the regular expressions used - there's a good chance
      15             :  *      that we'll get a hit, so this saves a compile step for every
      16             :  *      attempted match. I haven't actually measured the speed improvement,
      17             :  *      but it `looks' a lot quicker visually when watching regression
      18             :  *      test output.
      19             :  *
      20             :  *      agc - incorporated Keith Bostic's Berkeley regex code into
      21             :  *      the tree for all ports. To distinguish this regex code from any that
      22             :  *      is existent on a platform, I've prepended the string "pg_" to
      23             :  *      the functions regcomp, regerror, regexec and regfree.
      24             :  *      Fixed a bug that was originally a typo by me, where `i' was used
      25             :  *      instead of `oldest' when compiling regular expressions - benign
      26             :  *      results mostly, although occasionally it bit you...
      27             :  *
      28             :  *-------------------------------------------------------------------------
      29             :  */
      30             : #include "postgres.h"
      31             : 
      32             : #include "catalog/pg_type.h"
      33             : #include "funcapi.h"
      34             : #include "miscadmin.h"
      35             : #include "regex/regex.h"
      36             : #include "utils/array.h"
      37             : #include "utils/builtins.h"
      38             : #include "utils/varlena.h"
      39             : 
      40             : #define PG_GETARG_TEXT_PP_IF_EXISTS(_n) \
      41             :     (PG_NARGS() > (_n) ? PG_GETARG_TEXT_PP(_n) : NULL)  /* NULL when the call has no argument at index _n */
      42             : 
      43             : 
      44             : /* all the options of interest for regex functions; filled in by parse_re_flags() */
      45             : typedef struct pg_re_flags
      46             : {
      47             :     int         cflags;         /* compile flags for Spencer's regex code */
      48             :     bool        glob;           /* set by the g option: do it globally (for each occurrence) */
      49             : } pg_re_flags;
      50             : 
      51             : /* cross-call state for regexp_match and regexp_split functions */
      52             : typedef struct regexp_matches_ctx
      53             : {
      54             :     text       *orig_str;       /* data string in original TEXT form */
      55             :     int         nmatches;       /* number of places where pattern matched */
      56             :     int         npatterns;      /* number of capturing subpatterns */
      57             :     /* match_locs holds a start char index and an end+1 char index for each */
      58             :     /* subpattern of each match, i.e. nmatches * npatterns * 2 entries in all */
      59             :     int        *match_locs;     /* 0-based character indexes */
      60             :     int         next_match;     /* 0-based index of next match to process */
      61             :     /* workspace for build_regexp_match_result() */
      62             :     Datum      *elems;          /* Datum array with npatterns elements */
      63             :     bool       *nulls;          /* bool array with npatterns elements */
      64             : } regexp_matches_ctx;
      65             : 
      66             : /*
      67             :  * We cache precompiled regular expressions using a "self organizing list"
      68             :  * structure, in which recently-used items tend to be near the front.
      69             :  * Whenever we use an entry, it's moved up to the front of the list.
      70             :  * Over time, an item's average position corresponds to its frequency of use.
      71             :  *
      72             :  * When we first create an entry, it's inserted at the front of
      73             :  * the array, dropping the entry at the end of the array if necessary to
      74             :  * make room.  (This might seem to be weighting the new entry too heavily,
      75             :  * but if we insert new entries further back, we'll be unable to adjust to
      76             :  * a sudden shift in the query mix where we are presented with MAX_CACHED_RES
      77             :  * never-before-seen items used circularly.  We ought to be able to handle
      78             :  * that case, so we have to insert at the front.)
      79             :  *
      80             :  * Knuth mentions a variant strategy in which a used item is moved up just
      81             :  * one place in the list.  Although he says this uses fewer comparisons on
      82             :  * average, it seems not to adapt very well to the situation where you have
      83             :  * both some reusable patterns and a steady stream of non-reusable patterns.
      84             :  * A reusable pattern that isn't used at least as often as non-reusable
      85             :  * patterns are seen will "fail to keep up" and will drop off the end of the
      86             :  * cache.  With move-to-front, a reusable pattern is guaranteed to stay in
      87             :  * the cache as long as it's used at least once in every MAX_CACHED_RES uses.
      88             :  */
      89             : 
      90             : /* maximum number of cached regular expressions; sizes re_array, and a value defined earlier wins */
      91             : #ifndef MAX_CACHED_RES
      92             : #define MAX_CACHED_RES  32
      93             : #endif
      94             : 
      95             : /* one cached regular expression: lookup key (pattern bytes, flags, collation) plus the compiled RE */
      96             : typedef struct cached_re_str
      97             : {
      98             :     char       *cre_pat;        /* malloc-allocated copy of original RE (not null terminated!) */
      99             :     int         cre_pat_len;    /* length of original RE, in bytes */
     100             :     int         cre_flags;      /* compile flags: extended,icase etc */
     101             :     Oid         cre_collation;  /* collation to use */
     102             :     regex_t     cre_re;         /* the compiled regular expression */
     103             : } cached_re_str;
     104             : 
     105             : static int  num_res = 0;        /* number of re_array entries currently in use */
     106             : static cached_re_str re_array[MAX_CACHED_RES];  /* the cache; slot 0 holds the most recently used RE */
     107             : 
     108             : 
     109             : /* Forward declarations of file-local (static) functions */
     110             : static regexp_matches_ctx *setup_regexp_matches(text *orig_str, text *pattern,
     111             :                      pg_re_flags *flags,
     112             :                      Oid collation,
     113             :                      bool use_subpatterns,
     114             :                      bool ignore_degenerate);
     115             : static void cleanup_regexp_matches(regexp_matches_ctx *matchctx);
     116             : static ArrayType *build_regexp_match_result(regexp_matches_ctx *matchctx);
     117             : static Datum build_regexp_split_result(regexp_matches_ctx *splitctx);
     118             : 
     119             : 
     120             : /*
     121             :  * RE_compile_and_cache - compile a RE, or fetch it from the cache
     122             :  *
     123             :  * Returns a pointer to the regex_t in re_array[0]; a later call that promotes
     124             :  * or inserts another entry overwrites that slot.  Reports ERROR if the
     125             :  * pattern does not compile or its cache copy cannot be allocated.
     126             :  *
     127             :  *  text_re --- the pattern, expressed as a TEXT object (database encoding)
     128             :  *  cflags --- compile options for the pattern (part of the cache key)
     129             :  *  collation --- collation to use for LC_CTYPE-dependent behavior (also keyed)
     130             :  *  The pattern is converted to pg_wchar, which Spencer's regex package wants.
     131             :  */
     132             : static regex_t *
     133       33696 : RE_compile_and_cache(text *text_re, int cflags, Oid collation)
     134             : {
     135       33696 :     int         text_re_len = VARSIZE_ANY_EXHDR(text_re);  /* pattern length in bytes */
     136       33696 :     char       *text_re_val = VARDATA_ANY(text_re);    /* pattern bytes */
     137             :     pg_wchar   *pattern;
     138             :     int         pattern_len;
     139             :     int         i;
     140             :     int         regcomp_result;
     141             :     cached_re_str re_temp;      /* swap space on a hit, build area on a miss */
     142             :     char        errMsg[100];    /* receives the pg_regerror text */
     143             : 
     144             :     /*
     145             :      * Look for a match among previously compiled REs, scanning from the front
     146             :      * since the self-organizing list keeps most-used entries there.  An entry
     147             :      * matches only if its pattern bytes, cflags and collation are all equal.
     148             :      */
     149       36420 :     for (i = 0; i < num_res; i++)
     150             :     {
     151       69910 :         if (re_array[i].cre_pat_len == text_re_len &&
     152       67458 :             re_array[i].cre_flags == cflags &&
     153       67430 :             re_array[i].cre_collation == collation &&
     154       33715 :             memcmp(re_array[i].cre_pat, text_re_val, text_re_len) == 0)  /* lengths are equal here */
     155             :         {
     156             :             /*
     157             :              * Found a match; move it to front if not there already.
     158             :              */
     159       33443 :             if (i > 0)
     160             :             {
     161         257 :                 re_temp = re_array[i];  /* save the hit (whole struct, by value) */
     162         257 :                 memmove(&re_array[1], &re_array[0], i * sizeof(cached_re_str));  /* slide entries 0..i-1 down one slot */
     163         257 :                 re_array[0] = re_temp;
     164             :             }
     165             : 
     166       33443 :             return &re_array[0].cre_re;
     167             :         }
     168             :     }
     169             : 
     170             :     /*
     171             :      * Couldn't find it, so try to compile the new RE.  To avoid leaking
     172             :      * resources on failure, we build into the re_temp local.
     173             :      */
     174             : 
     175             :     /* Convert pattern string to wide characters; room for one pg_wchar per input byte, plus one */
     176         253 :     pattern = (pg_wchar *) palloc((text_re_len + 1) * sizeof(pg_wchar));
     177         253 :     pattern_len = pg_mb2wchar_with_len(text_re_val,
     178             :                                        pattern,
     179             :                                        text_re_len);
     180             : 
     181         253 :     regcomp_result = pg_regcomp(&re_temp.cre_re,
     182             :                                 pattern,
     183             :                                 pattern_len,
     184             :                                 cflags,
     185             :                                 collation);
     186             : 
     187         253 :     pfree(pattern);             /* wide copy is not used again after compilation */
     188             : 
     189         253 :     if (regcomp_result != REG_OKAY)
     190             :     {
     191             :         /* RE did not compile (no need for pg_regfree, if so) */
     192             : 
     193             :         /*
     194             :          * Here and in other places in this file, do CHECK_FOR_INTERRUPTS
     195             :          * before reporting a regex error.  This is so that if the regex
     196             :          * library aborts and returns REG_CANCEL, we don't print an error
     197             :          * message that implies the regex was invalid.
     198             :          */
     199           6 :         CHECK_FOR_INTERRUPTS();
     200             : 
     201           6 :         pg_regerror(regcomp_result, &re_temp.cre_re, errMsg, sizeof(errMsg));
     202           6 :         ereport(ERROR,
     203             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     204             :                  errmsg("invalid regular expression: %s", errMsg)));
     205             :     }
     206             : 
     207             :     /*
     208             :      * We use malloc/free for the cre_pat field because the storage has to
     209             :      * persist across transactions, and because we want to get control back on
     210             :      * out-of-memory.  The Max() is because some malloc implementations return
     211             :      * NULL for malloc(0).
     212             :      */
     213         247 :     re_temp.cre_pat = malloc(Max(text_re_len, 1));
     214         247 :     if (re_temp.cre_pat == NULL)
     215             :     {
     216           0 :         pg_regfree(&re_temp.cre_re);    /* release the compiled RE before erroring out */
     217           0 :         ereport(ERROR,
     218             :                 (errcode(ERRCODE_OUT_OF_MEMORY),
     219             :                  errmsg("out of memory")));
     220             :     }
     221         247 :     memcpy(re_temp.cre_pat, text_re_val, text_re_len);  /* raw bytes only; no terminator is stored */
     222         247 :     re_temp.cre_pat_len = text_re_len;
     223         247 :     re_temp.cre_flags = cflags;
     224         247 :     re_temp.cre_collation = collation;
     225             : 
     226             :     /*
     227             :      * Okay, we have a valid new item in re_temp; insert it into the storage
     228             :      * array.  Discard last entry if needed.
     229             :      */
     230         247 :     if (num_res >= MAX_CACHED_RES)
     231             :     {
     232          26 :         --num_res;              /* num_res now indexes the last entry, which is evicted */
     233          26 :         Assert(num_res < MAX_CACHED_RES);
     234          26 :         pg_regfree(&re_array[num_res].cre_re);
     235          26 :         free(re_array[num_res].cre_pat);
     236             :     }
     237             : 
     238         247 :     if (num_res > 0)
     239         201 :         memmove(&re_array[1], &re_array[0], num_res * sizeof(cached_re_str));  /* open up slot 0 */
     240             : 
     241         247 :     re_array[0] = re_temp;
     242         247 :     num_res++;
     243             : 
     244         247 :     return &re_array[0].cre_re;
     245             : }
     246             : 
     247             : /*
     248             :  * RE_wchar_execute - execute a RE on pg_wchar data
     249             :  *
     250             :  * Returns true on match, false on no match; any other result raises ERROR
     251             :  *
     252             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     253             :  *  data --- the data to match against (need not be null-terminated)
     254             :  *  data_len --- the length of the data array, as passed on to pg_regexec
     255             :  *  start_search -- the offset in the data to start searching
     256             :  *  nmatch, pmatch  --- optional return area for match details
     257             :  *
     258             :  * Data is given as array of pg_wchar which is what Spencer's regex package
     259             :  * wants.
     260             :  */
     261             : static bool
     262       32643 : RE_wchar_execute(regex_t *re, pg_wchar *data, int data_len,
     263             :                  int start_search, int nmatch, regmatch_t *pmatch)
     264             : {
     265             :     int         regexec_result;
     266             :     char        errMsg[100];    /* receives the pg_regerror text */
     267             : 
     268             :     /* Perform RE match and return result */
     269       32643 :     regexec_result = pg_regexec(re,
     270             :                                 data,
     271             :                                 data_len,
     272             :                                 start_search,
     273             :                                 NULL,   /* no details */
     274             :                                 nmatch,
     275             :                                 pmatch,
     276             :                                 0);
     277             : 
     278       32643 :     if (regexec_result != REG_OKAY && regexec_result != REG_NOMATCH)
     279             :     {
     280             :         /* neither a match nor a clean no-match: the RE engine reported a failure */
     281           0 :         CHECK_FOR_INTERRUPTS();
     282           0 :         pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
     283           0 :         ereport(ERROR,
     284             :                 (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
     285             :                  errmsg("regular expression failed: %s", errMsg)));
     286             :     }
     287             : 
     288       32643 :     return (regexec_result == REG_OKAY);
     289             : }
     290             : 
     291             : /*
     292             :  * RE_execute - execute a RE against data in the database encoding
     293             :  *
     294             :  * Returns true on match, false on no match (errors come from RE_wchar_execute)
     295             :  *
     296             :  *  re --- the compiled pattern as returned by RE_compile_and_cache
     297             :  *  dat --- the data to match against (need not be null-terminated)
     298             :  *  dat_len --- the length of the data string, in bytes
     299             :  *  nmatch, pmatch  --- optional return area for match details
     300             :  *
     301             :  * Data is given in the database encoding.  We internally convert to a
     302             :  * temporary array of pg_wchar which is what Spencer's regex package wants.
     303             :  */
     304             : static bool
     305       32358 : RE_execute(regex_t *re, char *dat, int dat_len,
     306             :            int nmatch, regmatch_t *pmatch)
     307             : {
     308             :     pg_wchar   *data;
     309             :     int         data_len;
     310             :     bool        match;
     311             : 
     312             :     /* Convert data string to wide characters; room for one pg_wchar per input byte, plus one */
     313       32358 :     data = (pg_wchar *) palloc((dat_len + 1) * sizeof(pg_wchar));
     314       32358 :     data_len = pg_mb2wchar_with_len(dat, data, dat_len);
     315             : 
     316             :     /* Perform RE match and return result */
     317       32358 :     match = RE_wchar_execute(re, data, data_len, 0, nmatch, pmatch);  /* start_search = 0 */
     318             : 
     319       32358 :     pfree(data);
     320       32358 :     return match;
     321             : }
     322             : 
     323             : /*
     324             :  * RE_compile_and_execute - compile (or fetch from cache) and execute a RE
     325             :  *
     326             :  * Returns true on match, false on no match
     327             :  *
     328             :  *  text_re --- the pattern, expressed as a TEXT object
     329             :  *  dat --- the data to match against (need not be null-terminated)
     330             :  *  dat_len --- the length of the data string, in bytes
     331             :  *  cflags --- compile options for the pattern
     332             :  *  collation --- collation to use for LC_CTYPE-dependent behavior
     333             :  *  nmatch, pmatch  --- optional return area for match details
     334             :  *
     335             :  * Both pattern and data are given in the database encoding.  We internally
     336             :  * convert to array of pg_wchar which is what Spencer's regex package wants.
     337             :  */
     338             : static bool
     339       32354 : RE_compile_and_execute(text *text_re, char *dat, int dat_len,
     340             :                        int cflags, Oid collation,
     341             :                        int nmatch, regmatch_t *pmatch)
     342             : {
     343             :     regex_t    *re;
     344             : 
     345             :     /* Compile RE, or reuse the cached compilation */
     346       32354 :     re = RE_compile_and_cache(text_re, cflags, collation);
     347             : 
     348       32350 :     return RE_execute(re, dat, dat_len, nmatch, pmatch);  /* converts dat to pg_wchar and runs the match */
     349             : }
     350             : 
     351             : 
     352             : /*
     353             :  * parse_re_flags - parse the options argument of regexp_match and friends
     354             :  *
     355             :  *  flags --- output argument; reset to REG_ADVANCED and glob = false, then updated
     356             :  *  opts --- TEXT object of option letters, or NULL for defaults
     357             :  * Letters are applied left to right; an unrecognized letter raises ERROR.
     358             :  * This accepts all the options allowed by any of the callers; callers that
     359             :  * don't want some have to reject them after the fact.
     360             :  */
     361             : static void
     362          64 : parse_re_flags(pg_re_flags *flags, text *opts)
     363             : {
     364             :     /* regex flavor is always folded into the compile flags */
     365          64 :     flags->cflags = REG_ADVANCED;
     366          64 :     flags->glob = false;
     367             : 
     368          64 :     if (opts)
     369             :     {
     370          23 :         char       *opt_p = VARDATA_ANY(opts);
     371          23 :         int         opt_len = VARSIZE_ANY_EXHDR(opts);
     372             :         int         i;
     373             : 
     374          51 :         for (i = 0; i < opt_len; i++)   /* one option letter per byte */
     375             :         {
     376          32 :             switch (opt_p[i])
     377             :             {
     378             :                 case 'g':       /* global: the only letter that does not touch cflags */
     379          15 :                     flags->glob = true;
     380          15 :                     break;
     381             :                 case 'b':       /* BREs (but why???) */
     382           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED | REG_QUOTE);
     383           0 :                     break;
     384             :                 case 'c':       /* case sensitive */
     385           0 :                     flags->cflags &= ~REG_ICASE;
     386           0 :                     break;
     387             :                 case 'e':       /* plain EREs */
     388           0 :                     flags->cflags |= REG_EXTENDED;
     389           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_QUOTE);
     390           0 :                     break;
     391             :                 case 'i':       /* case insensitive */
     392           7 :                     flags->cflags |= REG_ICASE;
     393           7 :                     break;
     394             :                 case 'm':       /* Perloid synonym for n */
     395             :                 case 'n':       /* \n affects ^ $ . [^ */
     396           6 :                     flags->cflags |= REG_NEWLINE;
     397           6 :                     break;
     398             :                 case 'p':       /* ~Perl, \n affects . [^ */
     399           0 :                     flags->cflags |= REG_NLSTOP;
     400           0 :                     flags->cflags &= ~REG_NLANCH;
     401           0 :                     break;
     402             :                 case 'q':       /* literal string */
     403           0 :                     flags->cflags |= REG_QUOTE;
     404           0 :                     flags->cflags &= ~(REG_ADVANCED | REG_EXTENDED);
     405           0 :                     break;
     406             :                 case 's':       /* single line, \n ordinary */
     407           0 :                     flags->cflags &= ~REG_NEWLINE;
     408           0 :                     break;
     409             :                 case 't':       /* tight syntax */
     410           0 :                     flags->cflags &= ~REG_EXPANDED;
     411           0 :                     break;
     412             :                 case 'w':       /* weird, \n affects ^ $ only */
     413           0 :                     flags->cflags &= ~REG_NLSTOP;
     414           0 :                     flags->cflags |= REG_NLANCH;
     415           0 :                     break;
     416             :                 case 'x':       /* expanded syntax */
     417           0 :                     flags->cflags |= REG_EXPANDED;
     418           0 :                     break;
     419             :                 default:        /* any other byte is rejected, echoing it in the message */
     420           4 :                     ereport(ERROR,
     421             :                             (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     422             :                              errmsg("invalid regexp option: \"%c\"",
     423             :                                     opt_p[i])));
     424             :                     break;
     425             :             }
     426             :         }
     427             :     }
     428          60 : }
     429             : 
     430             : 
     431             : /*
     432             :  *  interface routines called by the function manager
     433             :  */
     434             : 
     435             : Datum
     436        4257 : nameregexeq(PG_FUNCTION_ARGS)   /* true if the Name in arg 0 matches the regex in arg 1 */
     437             : {
     438        4257 :     Name        n = PG_GETARG_NAME(0);  /* value to test */
     439        4257 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     440             : 
     441        4257 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     442             :                                           NameStr(*n),
     443             :                                           strlen(NameStr(*n)),  /* length up to the first NUL */
     444             :                                           REG_ADVANCED,         /* no REG_ICASE here */
     445             :                                           PG_GET_COLLATION(),
     446             :                                           0, NULL));            /* no match details wanted */
     447             : }
     448             : 
     449             : Datum
     450         288 : nameregexne(PG_FUNCTION_ARGS)   /* true if the Name in arg 0 does NOT match the regex in arg 1 */
     451             : {
     452         288 :     Name        n = PG_GETARG_NAME(0);  /* value to test */
     453         288 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     454             : 
     455         288 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,   /* result is negated */
     456             :                                            NameStr(*n),
     457             :                                            strlen(NameStr(*n)), /* length up to the first NUL */
     458             :                                            REG_ADVANCED,        /* no REG_ICASE here */
     459             :                                            PG_GET_COLLATION(),
     460             :                                            0, NULL));           /* no match details wanted */
     461             : }
     462             : 
     463             : Datum
     464       22123 : textregexeq(PG_FUNCTION_ARGS)   /* true if the text in arg 0 matches the regex in arg 1 */
     465             : {
     466       22123 :     text       *s = PG_GETARG_TEXT_PP(0);   /* value to test */
     467       22123 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     468             : 
     469       22123 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     470             :                                           VARDATA_ANY(s),
     471             :                                           VARSIZE_ANY_EXHDR(s),
     472             :                                           REG_ADVANCED,         /* no REG_ICASE here */
     473             :                                           PG_GET_COLLATION(),
     474             :                                           0, NULL));            /* no match details wanted */
     475             : }
     476             : 
     477             : Datum
     478        5685 : textregexne(PG_FUNCTION_ARGS)   /* true if the text in arg 0 does NOT match the regex in arg 1 */
     479             : {
     480        5685 :     text       *s = PG_GETARG_TEXT_PP(0);   /* value to test */
     481        5685 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     482             : 
     483        5685 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,   /* result is negated */
     484             :                                            VARDATA_ANY(s),
     485             :                                            VARSIZE_ANY_EXHDR(s),
     486             :                                            REG_ADVANCED,        /* no REG_ICASE here */
     487             :                                            PG_GET_COLLATION(),
     488             :                                            0, NULL));           /* no match details wanted */
     489             : }
     490             : 
     491             : 
     492             : /*
     493             :  *  Case-insensitive variants of the regex match routines above.
     494             :  *  These add REG_ICASE to the compile flags that reach pg_regcomp.
     495             :  */
     496             : 
     497             : 
     498             : Datum
     499           0 : nameicregexeq(PG_FUNCTION_ARGS) /* true if the Name in arg 0 matches the regex in arg 1, with REG_ICASE */
     500             : {
     501           0 :     Name        n = PG_GETARG_NAME(0);  /* value to test */
     502           0 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     503             : 
     504           0 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     505             :                                           NameStr(*n),
     506             :                                           strlen(NameStr(*n)),  /* length up to the first NUL */
     507             :                                           REG_ADVANCED | REG_ICASE,
     508             :                                           PG_GET_COLLATION(),
     509             :                                           0, NULL));            /* no match details wanted */
     510             : }
     511             : 
     512             : Datum
     513           1 : nameicregexne(PG_FUNCTION_ARGS) /* true if the Name in arg 0 does NOT match the regex in arg 1, with REG_ICASE */
     514             : {
     515           1 :     Name        n = PG_GETARG_NAME(0);  /* value to test */
     516           1 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     517             : 
     518           1 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,   /* result is negated */
     519             :                                            NameStr(*n),
     520             :                                            strlen(NameStr(*n)), /* length up to the first NUL */
     521             :                                            REG_ADVANCED | REG_ICASE,
     522             :                                            PG_GET_COLLATION(),
     523             :                                            0, NULL));           /* no match details wanted */
     524             : }
     525             : 
     526             : Datum
     527           0 : texticregexeq(PG_FUNCTION_ARGS) /* true if the text in arg 0 matches the regex in arg 1, with REG_ICASE */
     528             : {
     529           0 :     text       *s = PG_GETARG_TEXT_PP(0);   /* value to test */
     530           0 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     531             : 
     532           0 :     PG_RETURN_BOOL(RE_compile_and_execute(p,
     533             :                                           VARDATA_ANY(s),
     534             :                                           VARSIZE_ANY_EXHDR(s),
     535             :                                           REG_ADVANCED | REG_ICASE,
     536             :                                           PG_GET_COLLATION(),
     537             :                                           0, NULL));            /* no match details wanted */
     538             : }
     539             : 
     540             : Datum
     541           0 : texticregexne(PG_FUNCTION_ARGS) /* true if the text in arg 0 does NOT match the regex in arg 1, with REG_ICASE */
     542             : {
     543           0 :     text       *s = PG_GETARG_TEXT_PP(0);   /* value to test */
     544           0 :     text       *p = PG_GETARG_TEXT_PP(1);   /* regex pattern */
     545             : 
     546           0 :     PG_RETURN_BOOL(!RE_compile_and_execute(p,   /* result is negated */
     547             :                                            VARDATA_ANY(s),
     548             :                                            VARSIZE_ANY_EXHDR(s),
     549             :                                            REG_ADVANCED | REG_ICASE,
     550             :                                            PG_GET_COLLATION(),
     551             :                                            0, NULL));           /* no match details wanted */
     552             : }
     553             : 
     554             : 
     555             : /*
     556             :  * textregexsubstr()
     557             :  *      Return a substring matched by a regular expression.
     558             :  */
     559             : Datum
     560           8 : textregexsubstr(PG_FUNCTION_ARGS)
     561             : {
     562           8 :     text       *s = PG_GETARG_TEXT_PP(0);
     563           8 :     text       *p = PG_GETARG_TEXT_PP(1);
     564             :     regex_t    *re;
     565             :     regmatch_t  pmatch[2];
     566             :     int         so,
     567             :                 eo;
     568             : 
     569             :     /* Compile RE */
     570           8 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     571             : 
     572             :     /*
     573             :      * We pass two regmatch_t structs to get info about the overall match and
     574             :      * the match for the first parenthesized subexpression (if any). If there
     575             :      * is a parenthesized subexpression, we return what it matched; else
     576             :      * return what the whole regexp matched.
     577             :      */
     578          32 :     if (!RE_execute(re,
     579          32 :                     VARDATA_ANY(s), VARSIZE_ANY_EXHDR(s),
     580             :                     2, pmatch))
     581           1 :         PG_RETURN_NULL();       /* definitely no match */
     582             : 
     583           7 :     if (re->re_nsub > 0)
     584             :     {
     585             :         /* has parenthesized subexpressions, use the first one */
     586           5 :         so = pmatch[1].rm_so;
     587           5 :         eo = pmatch[1].rm_eo;
     588             :     }
     589             :     else
     590             :     {
     591             :         /* no parenthesized subexpression, use whole match */
     592           2 :         so = pmatch[0].rm_so;
     593           2 :         eo = pmatch[0].rm_eo;
     594             :     }
     595             : 
     596             :     /*
     597             :      * It is possible to have a match to the whole pattern but no match for a
     598             :      * subexpression; for example 'foo(bar)?' is considered to match 'foo' but
     599             :      * there is no subexpression match.  So this extra test for match failure
     600             :      * is not redundant.
     601             :      */
     602           7 :     if (so < 0 || eo < 0)
     603           0 :         PG_RETURN_NULL();
     604             : 
     605           7 :     return DirectFunctionCall3(text_substr,
     606             :                                PointerGetDatum(s),
     607             :                                Int32GetDatum(so + 1),
     608             :                                Int32GetDatum(eo - so));
     609             : }
     610             : 
     611             : /*
     612             :  * textregexreplace_noopt()
     613             :  *      Return a string matched by a regular expression, with replacement.
     614             :  *
     615             :  * This version doesn't have an option argument: we default to case
     616             :  * sensitive match, replace the first instance only.
     617             :  */
     618             : Datum
     619          21 : textregexreplace_noopt(PG_FUNCTION_ARGS)
     620             : {
     621          21 :     text       *s = PG_GETARG_TEXT_PP(0);
     622          21 :     text       *p = PG_GETARG_TEXT_PP(1);
     623          21 :     text       *r = PG_GETARG_TEXT_PP(2);
     624             :     regex_t    *re;
     625             : 
     626          21 :     re = RE_compile_and_cache(p, REG_ADVANCED, PG_GET_COLLATION());
     627             : 
     628          21 :     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, false));
     629             : }
     630             : 
     631             : /*
     632             :  * textregexreplace()
     633             :  *      Return a string matched by a regular expression, with replacement.
     634             :  */
     635             : Datum
     636           5 : textregexreplace(PG_FUNCTION_ARGS)
     637             : {
     638           5 :     text       *s = PG_GETARG_TEXT_PP(0);
     639           5 :     text       *p = PG_GETARG_TEXT_PP(1);
     640           5 :     text       *r = PG_GETARG_TEXT_PP(2);
     641           5 :     text       *opt = PG_GETARG_TEXT_PP(3);
     642             :     regex_t    *re;
     643             :     pg_re_flags flags;
     644             : 
     645           5 :     parse_re_flags(&flags, opt);
     646             : 
     647           4 :     re = RE_compile_and_cache(p, flags.cflags, PG_GET_COLLATION());
     648             : 
     649           4 :     PG_RETURN_TEXT_P(replace_text_regexp(s, (void *) re, r, flags.glob));
     650             : }
     651             : 
     652             : /*
     653             :  * similar_escape()
     654             :  * Convert a SQL:2008 regexp pattern to POSIX style, so it can be used by
     655             :  * our regexp engine.
     656             :  */
     657             : Datum
     658           2 : similar_escape(PG_FUNCTION_ARGS)
     659             : {
     660             :     text       *pat_text;
     661             :     text       *esc_text;
     662             :     text       *result;
     663             :     char       *p,
     664             :                *e,
     665             :                *r;
     666             :     int         plen,
     667             :                 elen;
     668           2 :     bool        afterescape = false;
     669           2 :     bool        incharclass = false;
     670           2 :     int         nquotes = 0;
     671             : 
     672             :     /* This function is not strict, so must test explicitly */
     673           2 :     if (PG_ARGISNULL(0))
     674           0 :         PG_RETURN_NULL();
     675           2 :     pat_text = PG_GETARG_TEXT_PP(0);
     676           2 :     p = VARDATA_ANY(pat_text);
     677           2 :     plen = VARSIZE_ANY_EXHDR(pat_text);
     678           2 :     if (PG_ARGISNULL(1))
     679             :     {
     680             :         /* No ESCAPE clause provided; default to backslash as escape */
     681           0 :         e = "\\";
     682           0 :         elen = 1;
     683             :     }
     684             :     else
     685             :     {
     686           2 :         esc_text = PG_GETARG_TEXT_PP(1);
     687           2 :         e = VARDATA_ANY(esc_text);
     688           2 :         elen = VARSIZE_ANY_EXHDR(esc_text);
     689           2 :         if (elen == 0)
     690           0 :             e = NULL;           /* no escape character */
     691             :         else
     692             :         {
     693           2 :             int         escape_mblen = pg_mbstrlen_with_len(e, elen);
     694             : 
     695           2 :             if (escape_mblen > 1)
     696           0 :                 ereport(ERROR,
     697             :                         (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
     698             :                          errmsg("invalid escape string"),
     699             :                          errhint("Escape string must be empty or one character.")));
     700             :         }
     701             :     }
     702             : 
     703             :     /*----------
     704             :      * We surround the transformed input string with
     705             :      *          ^(?: ... )$
     706             :      * which requires some explanation.  We need "^" and "$" to force
     707             :      * the pattern to match the entire input string as per SQL99 spec.
     708             :      * The "(?:" and ")" are a non-capturing set of parens; we have to have
     709             :      * parens in case the string contains "|", else the "^" and "$" will
     710             :      * be bound into the first and last alternatives which is not what we
     711             :      * want, and the parens must be non capturing because we don't want them
     712             :      * to count when selecting output for SUBSTRING.
     713             :      *----------
     714             :      */
     715             : 
     716             :     /*
     717             :      * We need room for the prefix/postfix plus as many as 3 output bytes per
     718             :      * input byte; since the input is at most 1GB this can't overflow
     719             :      */
     720           2 :     result = (text *) palloc(VARHDRSZ + 6 + 3 * plen);
     721           2 :     r = VARDATA(result);
     722             : 
     723           2 :     *r++ = '^';
     724           2 :     *r++ = '(';
     725           2 :     *r++ = '?';
     726           2 :     *r++ = ':';
     727             : 
     728          25 :     while (plen > 0)
     729             :     {
     730          21 :         char        pchar = *p;
     731             : 
     732             :         /*
     733             :          * If both the escape character and the current character from the
     734             :          * pattern are multi-byte, we need to take the slow path.
     735             :          *
     736             :          * But if one of them is single-byte, we can process the pattern one
     737             :          * byte at a time, ignoring multi-byte characters.  (This works
     738             :          * because all server-encodings have the property that a valid
     739             :          * multi-byte character representation cannot contain the
     740             :          * representation of a valid single-byte character.)
     741             :          */
     742             : 
     743          21 :         if (elen > 1)
     744             :         {
     745           0 :             int         mblen = pg_mblen(p);
     746             : 
     747           0 :             if (mblen > 1)
     748             :             {
     749             :                 /* slow, multi-byte path */
     750           0 :                 if (afterescape)
     751             :                 {
     752           0 :                     *r++ = '\\';
     753           0 :                     memcpy(r, p, mblen);
     754           0 :                     r += mblen;
     755           0 :                     afterescape = false;
     756             :                 }
     757           0 :                 else if (e && elen == mblen && memcmp(e, p, mblen) == 0)
     758             :                 {
     759             :                     /* SQL99 escape character; do not send to output */
     760           0 :                     afterescape = true;
     761             :                 }
     762             :                 else
     763             :                 {
     764             :                     /*
     765             :                      * We know it's a multi-byte character, so we don't need
     766             :                      * to do all the comparisons to single-byte characters
     767             :                      * that we do below.
     768             :                      */
     769           0 :                     memcpy(r, p, mblen);
     770           0 :                     r += mblen;
     771             :                 }
     772             : 
     773           0 :                 p += mblen;
     774           0 :                 plen -= mblen;
     775             : 
     776           0 :                 continue;
     777             :             }
     778             :         }
     779             : 
     780             :         /* fast path */
     781          21 :         if (afterescape)
     782             :         {
     783           4 :             if (pchar == '"' && !incharclass)  /* for SUBSTRING patterns */
     784           4 :                 *r++ = ((nquotes++ % 2) == 0) ? '(' : ')';
     785             :             else
     786             :             {
     787           0 :                 *r++ = '\\';
     788           0 :                 *r++ = pchar;
     789             :             }
     790           4 :             afterescape = false;
     791             :         }
     792          17 :         else if (e && pchar == *e)
     793             :         {
     794             :             /* SQL99 escape character; do not send to output */
     795           4 :             afterescape = true;
     796             :         }
     797          13 :         else if (incharclass)
     798             :         {
     799           0 :             if (pchar == '\\')
     800           0 :                 *r++ = '\\';
     801           0 :             *r++ = pchar;
     802           0 :             if (pchar == ']')
     803           0 :                 incharclass = false;
     804             :         }
     805          13 :         else if (pchar == '[')
     806             :         {
     807           0 :             *r++ = pchar;
     808           0 :             incharclass = true;
     809             :         }
     810          13 :         else if (pchar == '%')
     811             :         {
     812           2 :             *r++ = '.';
     813           2 :             *r++ = '*';
     814             :         }
     815          11 :         else if (pchar == '_')
     816           2 :             *r++ = '.';
     817           9 :         else if (pchar == '(')
     818             :         {
     819             :             /* convert to non-capturing parenthesis */
     820           2 :             *r++ = '(';
     821           2 :             *r++ = '?';
     822           2 :             *r++ = ':';
     823             :         }
     824           7 :         else if (pchar == '\\' || pchar == '.' ||
     825           7 :                  pchar == '^' || pchar == '$')
     826             :         {
     827           0 :             *r++ = '\\';
     828           0 :             *r++ = pchar;
     829             :         }
     830             :         else
     831           7 :             *r++ = pchar;
     832          21 :         p++, plen--;
     833             :     }
     834             : 
     835           2 :     *r++ = ')';
     836           2 :     *r++ = '$';
     837             : 
     838           2 :     SET_VARSIZE(result, r - ((char *) result));
     839             : 
     840           2 :     PG_RETURN_TEXT_P(result);
     841             : }
     842             : 
     843             : /*
     844             :  * regexp_match()
     845             :  *      Return the first substring(s) matching a pattern within a string.
     846             :  */
     847             : Datum
     848           5 : regexp_match(PG_FUNCTION_ARGS)
     849             : {
     850           5 :     text       *orig_str = PG_GETARG_TEXT_PP(0);
     851           5 :     text       *pattern = PG_GETARG_TEXT_PP(1);
     852           5 :     text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
     853             :     pg_re_flags re_flags;
     854             :     regexp_matches_ctx *matchctx;
     855             : 
     856             :     /* Determine options */
     857           5 :     parse_re_flags(&re_flags, flags);
     858             :     /* User mustn't specify 'g' */
     859           5 :     if (re_flags.glob)
     860           1 :         ereport(ERROR,
     861             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     862             :                  errmsg("regexp_match does not support the global option"),
     863             :                  errhint("Use the regexp_matches function instead.")));
     864             : 
     865           4 :     matchctx = setup_regexp_matches(orig_str, pattern, &re_flags,
     866             :                                     PG_GET_COLLATION(), true, false);
     867             : 
     868           4 :     if (matchctx->nmatches == 0)
     869           1 :         PG_RETURN_NULL();
     870             : 
     871           3 :     Assert(matchctx->nmatches == 1);
     872             : 
     873             :     /* Create workspace that build_regexp_match_result needs */
     874           3 :     matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
     875           3 :     matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
     876             : 
     877           3 :     PG_RETURN_DATUM(PointerGetDatum(build_regexp_match_result(matchctx)));
     878             : }
     879             : 
     880             : /* This is separate to keep the opr_sanity regression test from complaining */
     881             : Datum
     882           3 : regexp_match_no_flags(PG_FUNCTION_ARGS)
     883             : {
     884           3 :     return regexp_match(fcinfo);
     885             : }
     886             : 
     887             : /*
     888             :  * regexp_matches()
     889             :  *      Return a table of all matches of a pattern within a string.
     890             :  */
     891             : Datum
     892          91 : regexp_matches(PG_FUNCTION_ARGS)
     893             : {
     894             :     FuncCallContext *funcctx;
     895             :     regexp_matches_ctx *matchctx;
     896             : 
     897          91 :     if (SRF_IS_FIRSTCALL())
     898             :     {
     899          37 :         text       *pattern = PG_GETARG_TEXT_PP(1);
     900          37 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
     901             :         pg_re_flags re_flags;
     902             :         MemoryContext oldcontext;
     903             : 
     904          37 :         funcctx = SRF_FIRSTCALL_INIT();
     905          37 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
     906             : 
     907             :         /* Determine options */
     908          37 :         parse_re_flags(&re_flags, flags);
     909             : 
     910             :         /* be sure to copy the input string into the multi-call ctx */
     911          36 :         matchctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
     912             :                                         &re_flags,
     913             :                                         PG_GET_COLLATION(),
     914             :                                         true, false);
     915             : 
     916             :         /* Pre-create workspace that build_regexp_match_result needs */
     917          34 :         matchctx->elems = (Datum *) palloc(sizeof(Datum) * matchctx->npatterns);
     918          34 :         matchctx->nulls = (bool *) palloc(sizeof(bool) * matchctx->npatterns);
     919             : 
     920          34 :         MemoryContextSwitchTo(oldcontext);
     921          34 :         funcctx->user_fctx = (void *) matchctx;
     922             :     }
     923             : 
     924          88 :     funcctx = SRF_PERCALL_SETUP();
     925          88 :     matchctx = (regexp_matches_ctx *) funcctx->user_fctx;
     926             : 
     927          88 :     if (matchctx->next_match < matchctx->nmatches)
     928             :     {
     929             :         ArrayType  *result_ary;
     930             : 
     931          54 :         result_ary = build_regexp_match_result(matchctx);
     932          54 :         matchctx->next_match++;
     933          54 :         SRF_RETURN_NEXT(funcctx, PointerGetDatum(result_ary));
     934             :     }
     935             : 
     936             :     /* release space in multi-call ctx to avoid intraquery memory leak */
     937          34 :     cleanup_regexp_matches(matchctx);
     938             : 
     939          34 :     SRF_RETURN_DONE(funcctx);
     940             : }
     941             : 
     942             : /* This is separate to keep the opr_sanity regression test from complaining */
     943             : Datum
     944          43 : regexp_matches_no_flags(PG_FUNCTION_ARGS)
     945             : {
     946          43 :     return regexp_matches(fcinfo);
     947             : }
     948             : 
     949             : /*
     950             :  * setup_regexp_matches --- do the initial matching for regexp_match
     951             :  *      and regexp_split functions
     952             :  *
     953             :  * To avoid having to re-find the compiled pattern on each call, we do
     954             :  * all the matching in one swoop.  The returned regexp_matches_ctx contains
     955             :  * the locations of all the substrings matching the pattern.
     956             :  *
     957             :  * The two bool parameters have only two patterns (one for matching, one for
     958             :  * splitting) but it seems clearer to distinguish the functionality this way
     959             :  * than to key it all off one "is_split" flag.
     960             :  */
     961             : static regexp_matches_ctx *
     962          53 : setup_regexp_matches(text *orig_str, text *pattern, pg_re_flags *re_flags,
     963             :                      Oid collation,
     964             :                      bool use_subpatterns,
     965             :                      bool ignore_degenerate)
     966             : {
     967          53 :     regexp_matches_ctx *matchctx = palloc0(sizeof(regexp_matches_ctx));
     968             :     int         orig_len;
     969             :     pg_wchar   *wide_str;
     970             :     int         wide_len;
     971             :     regex_t    *cpattern;
     972             :     regmatch_t *pmatch;
     973             :     int         pmatch_len;
     974             :     int         array_len;
     975             :     int         array_idx;
     976             :     int         prev_match_end;
     977             :     int         start_search;
     978             : 
     979             :     /* save original string --- we'll extract result substrings from it */
     980          53 :     matchctx->orig_str = orig_str;
     981             : 
     982             :     /* convert string to pg_wchar form for matching */
     983          53 :     orig_len = VARSIZE_ANY_EXHDR(orig_str);
     984          53 :     wide_str = (pg_wchar *) palloc(sizeof(pg_wchar) * (orig_len + 1));
     985          53 :     wide_len = pg_mb2wchar_with_len(VARDATA_ANY(orig_str), wide_str, orig_len);
     986             : 
     987             :     /* set up the compiled pattern */
     988          53 :     cpattern = RE_compile_and_cache(pattern, re_flags->cflags, collation);
     989             : 
     990             :     /* do we want to remember subpatterns? */
     991          51 :     if (use_subpatterns && cpattern->re_nsub > 0)
     992             :     {
     993           9 :         matchctx->npatterns = cpattern->re_nsub;
     994           9 :         pmatch_len = cpattern->re_nsub + 1;
     995             :     }
     996             :     else
     997             :     {
     998          42 :         use_subpatterns = false;
     999          42 :         matchctx->npatterns = 1;
    1000          42 :         pmatch_len = 1;
    1001             :     }
    1002             : 
    1003             :     /* temporary output space for RE package */
    1004          51 :     pmatch = palloc(sizeof(regmatch_t) * pmatch_len);
    1005             : 
    1006             :     /* the real output space (grown dynamically if needed) */
    1007          51 :     array_len = re_flags->glob ? 256 : 32;
    1008          51 :     matchctx->match_locs = (int *) palloc(sizeof(int) * array_len);
    1009          51 :     array_idx = 0;
    1010             : 
    1011             :     /* search for the pattern, perhaps repeatedly */
    1012          51 :     prev_match_end = 0;
    1013          51 :     start_search = 0;
    1014         336 :     while (RE_wchar_execute(cpattern, wide_str, wide_len, start_search,
    1015             :                             pmatch_len, pmatch))
    1016             :     {
    1017             :         /*
    1018             :          * If requested, ignore degenerate matches, which are zero-length
    1019             :          * matches occurring at the start or end of a string or just after a
    1020             :          * previous match.
    1021             :          */
    1022         469 :         if (!ignore_degenerate ||
    1023         408 :             (pmatch[0].rm_so < wide_len &&
    1024         202 :              pmatch[0].rm_eo > prev_match_end))
    1025             :         {
    1026             :             /* enlarge output space if needed */
    1027         478 :             while (array_idx + matchctx->npatterns * 2 > array_len)
    1028             :             {
    1029           0 :                 array_len *= 2;
    1030           0 :                 matchctx->match_locs = (int *) repalloc(matchctx->match_locs,
    1031             :                                                         sizeof(int) * array_len);
    1032             :             }
    1033             : 
    1034             :             /* save this match's locations */
    1035         239 :             if (use_subpatterns)
    1036             :             {
    1037             :                 int         i;
    1038             : 
    1039          33 :                 for (i = 1; i <= matchctx->npatterns; i++)
    1040             :                 {
    1041          23 :                     matchctx->match_locs[array_idx++] = pmatch[i].rm_so;
    1042          23 :                     matchctx->match_locs[array_idx++] = pmatch[i].rm_eo;
    1043             :                 }
    1044             :             }
    1045             :             else
    1046             :             {
    1047         229 :                 matchctx->match_locs[array_idx++] = pmatch[0].rm_so;
    1048         229 :                 matchctx->match_locs[array_idx++] = pmatch[0].rm_eo;
    1049             :             }
    1050         239 :             matchctx->nmatches++;
    1051             :         }
    1052         263 :         prev_match_end = pmatch[0].rm_eo;
    1053             : 
    1054             :         /* if not glob, stop after one match */
    1055         263 :         if (!re_flags->glob)
    1056          21 :             break;
    1057             : 
    1058             :         /*
    1059             :          * Advance search position.  Normally we start the next search at the
    1060             :          * end of the previous match; but if the match was of zero length, we
    1061             :          * have to advance by one character, or we'd just find the same match
    1062             :          * again.
    1063             :          */
    1064         242 :         start_search = prev_match_end;
    1065         242 :         if (pmatch[0].rm_so == pmatch[0].rm_eo)
    1066         180 :             start_search++;
    1067         242 :         if (start_search > wide_len)
    1068           8 :             break;
    1069             :     }
    1070             : 
    1071             :     /* Clean up temp storage */
    1072          51 :     pfree(wide_str);
    1073          51 :     pfree(pmatch);
    1074             : 
    1075          51 :     return matchctx;
    1076             : }
    1077             : 
    1078             : /*
    1079             :  * cleanup_regexp_matches - release memory of a regexp_matches_ctx
    1080             :  */
    1081             : static void
    1082          39 : cleanup_regexp_matches(regexp_matches_ctx *matchctx)
    1083             : {
    1084          39 :     pfree(matchctx->orig_str);
    1085          39 :     pfree(matchctx->match_locs);
    1086          39 :     if (matchctx->elems)
    1087          34 :         pfree(matchctx->elems);
    1088          39 :     if (matchctx->nulls)
    1089          34 :         pfree(matchctx->nulls);
    1090          39 :     pfree(matchctx);
    1091          39 : }
    1092             : 
    1093             : /*
    1094             :  * build_regexp_match_result - build output array for current match
    1095             :  */
    1096             : static ArrayType *
    1097          57 : build_regexp_match_result(regexp_matches_ctx *matchctx)
    1098             : {
    1099          57 :     Datum      *elems = matchctx->elems;
    1100          57 :     bool       *nulls = matchctx->nulls;
    1101             :     int         dims[1];
    1102             :     int         lbs[1];
    1103             :     int         loc;
    1104             :     int         i;
    1105             : 
    1106             :     /* Extract matching substrings from the original string */
    1107          57 :     loc = matchctx->next_match * matchctx->npatterns * 2;
    1108         127 :     for (i = 0; i < matchctx->npatterns; i++)
    1109             :     {
    1110          70 :         int         so = matchctx->match_locs[loc++];
    1111          70 :         int         eo = matchctx->match_locs[loc++];
    1112             : 
    1113          70 :         if (so < 0 || eo < 0)
    1114             :         {
    1115           1 :             elems[i] = (Datum) 0;
    1116           1 :             nulls[i] = true;
    1117             :         }
    1118             :         else
    1119             :         {
    1120          69 :             elems[i] = DirectFunctionCall3(text_substr,
    1121             :                                            PointerGetDatum(matchctx->orig_str),
    1122             :                                            Int32GetDatum(so + 1),
    1123             :                                            Int32GetDatum(eo - so));
    1124          69 :             nulls[i] = false;
    1125             :         }
    1126             :     }
    1127             : 
    1128             :     /* And form an array */
    1129          57 :     dims[0] = matchctx->npatterns;
    1130          57 :     lbs[0] = 1;
    1131             :     /* XXX: this hardcodes assumptions about the text type */
    1132          57 :     return construct_md_array(elems, nulls, 1, dims, lbs,
    1133             :                               TEXTOID, -1, false, 'i');
    1134             : }
    1135             : 
    1136             : /*
    1137             :  * regexp_split_to_table()
    1138             :  *      Split the string (arg 0) at matches of the pattern (arg 1), returning
    1139             :  *      the split-out substrings as a table, one per call; flags (arg 2) optional.
    1140             :  */
    1141             : Datum
    1142          99 : regexp_split_to_table(PG_FUNCTION_ARGS)
    1143             : {
    1144             :     FuncCallContext *funcctx;
    1145             :     regexp_matches_ctx *splitctx;
    1146             : 
    1147          99 :     if (SRF_IS_FIRSTCALL())
    1148             :     {
    1149           7 :         text       *pattern = PG_GETARG_TEXT_PP(1);
    1150           7 :         text       *flags = PG_GETARG_TEXT_PP_IF_EXISTS(2);
    1151             :         pg_re_flags re_flags;
    1152             :         MemoryContext oldcontext;
    1153             :         /* state built here is stashed in user_fctx and read back on later calls */
    1154           7 :         funcctx = SRF_FIRSTCALL_INIT();
    1155           7 :         oldcontext = MemoryContextSwitchTo(funcctx->multi_call_memory_ctx);
    1156             : 
    1157             :         /* Determine options: parse_re_flags fills re_flags from the flags text */
    1158           7 :         parse_re_flags(&re_flags, flags);
    1159             :         /* User mustn't specify 'g' */
    1160           6 :         if (re_flags.glob)
    1161           1 :             ereport(ERROR,
    1162             :                     (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1163             :                      errmsg("regexp_split_to_table does not support the global option")));
    1164             :         /* But we find all the matches anyway, so force the flag on ourselves */
    1165           5 :         re_flags.glob = true;
    1166             : 
    1167             :         /* be sure to copy the input string into the multi-call ctx */
    1168           5 :         splitctx = setup_regexp_matches(PG_GETARG_TEXT_P_COPY(0), pattern,
    1169             :                                         &re_flags,
    1170             :                                         PG_GET_COLLATION(),
    1171             :                                         false, true);
    1172             : 
    1173           5 :         MemoryContextSwitchTo(oldcontext);
    1174           5 :         funcctx->user_fctx = (void *) splitctx;
    1175             :     }
    1176             : 
    1177          97 :     funcctx = SRF_PERCALL_SETUP();
    1178          97 :     splitctx = (regexp_matches_ctx *) funcctx->user_fctx;
    1179             :     /* "<=" not "<": piece number nmatches is the text after the last match */
    1180          97 :     if (splitctx->next_match <= splitctx->nmatches)
    1181             :     {
    1182          92 :         Datum       result = build_regexp_split_result(splitctx);
    1183             : 
    1184          92 :         splitctx->next_match++;
    1185          92 :         SRF_RETURN_NEXT(funcctx, result);
    1186             :     }
    1187             : 
    1188             :     /* all pieces emitted: release space in multi-call ctx to avoid intraquery memory leak */
    1189           5 :     cleanup_regexp_matches(splitctx);
    1190             : 
    1191           5 :     SRF_RETURN_DONE(funcctx);
    1192             : }
    1193             : 
    1194             : /* Forwards fcinfo unchanged to regexp_split_to_table (which reads arg 2 only if it exists); separate to keep the opr_sanity regression test from complaining */
    1195             : Datum
    1196          92 : regexp_split_to_table_no_flags(PG_FUNCTION_ARGS)
    1197             : {
    1198          92 :     return regexp_split_to_table(fcinfo);
    1199             : }
    1200             : 
    1201             : /*
    1202             :  * regexp_split_to_array()
    1203             :  *      Split the string (arg 0) at matches of the pattern (arg 1), returning
    1204             :  *      the split-out substrings as one text array; flags (arg 2) optional.
    1205             :  */
    1206             : Datum
    1207          10 : regexp_split_to_array(PG_FUNCTION_ARGS)
    1208             : {
    1209          10 :     ArrayBuildState *astate = NULL;
    1210             :     pg_re_flags re_flags;
    1211             :     regexp_matches_ctx *splitctx;
    1212             : 
    1213             :     /* Determine options: parse_re_flags fills re_flags from the flags text */
    1214          10 :     parse_re_flags(&re_flags, PG_GETARG_TEXT_PP_IF_EXISTS(2));
    1215             :     /* User mustn't specify 'g' */
    1216           9 :     if (re_flags.glob)
    1217           1 :         ereport(ERROR,
    1218             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
    1219             :                  errmsg("regexp_split_to_array does not support the global option")));
    1220             :     /* But we find all the matches anyway, so force the flag on ourselves */
    1221           8 :     re_flags.glob = true;
    1222             :     /* note the input string is passed as-is here, not copied */
    1223          16 :     splitctx = setup_regexp_matches(PG_GETARG_TEXT_PP(0),
    1224           8 :                                     PG_GETARG_TEXT_PP(1),
    1225             :                                     &re_flags,
    1226             :                                     PG_GET_COLLATION(),
    1227             :                                     false, true);
    1228             :     /* "<=" not "<": the final iteration yields the text after the last match */
    1229         119 :     while (splitctx->next_match <= splitctx->nmatches)
    1230             :     {
    1231         103 :         astate = accumArrayResult(astate,
    1232             :                                   build_regexp_split_result(splitctx),
    1233             :                                   false,
    1234             :                                   TEXTOID,
    1235             :                                   CurrentMemoryContext);
    1236         103 :         splitctx->next_match++;
    1237             :     }
    1238             : 
    1239             :     /*
    1240             :      * We don't call cleanup_regexp_matches here; it would try to pfree the
    1241             :      * input string, which we didn't copy.  The space is not in a long-lived
    1242             :      * memory context anyway.
    1243             :      */
    1244             : 
    1245           8 :     PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate, CurrentMemoryContext));
    1246             : }
    1247             : 
    1248             : /* Forwards fcinfo unchanged to regexp_split_to_array (which reads arg 2 only if it exists); separate to keep the opr_sanity regression test from complaining */
    1249             : Datum
    1250           7 : regexp_split_to_array_no_flags(PG_FUNCTION_ARGS)
    1251             : {
    1252           7 :     return regexp_split_to_array(fcinfo);
    1253             : }
    1254             : 
    1255             : /*
    1256             :  * build_regexp_split_result - build output string for current match
    1257             :  * (match_locs is read here as one start/end offset pair per match.)
    1258             :  * We return the string between the end of the previous match (string start
    1259             :  * if none) and the start of the current one, or the rest after the last match.
    1260             :  */
    1261             : static Datum
    1262         195 : build_regexp_split_result(regexp_matches_ctx *splitctx)
    1263             : {
    1264             :     int         startpos;
    1265             :     int         endpos;
    1266             :     /* the piece begins where the previous match ended, or at offset 0 if none */
    1267         195 :     if (splitctx->next_match > 0)
    1268         182 :         startpos = splitctx->match_locs[splitctx->next_match * 2 - 1];
    1269             :     else
    1270          13 :         startpos = 0;
    1271         195 :     if (startpos < 0)
    1272           0 :         elog(ERROR, "invalid match ending position");
    1273             :     /* the piece ends where the current match starts, if there is one */
    1274         195 :     if (splitctx->next_match < splitctx->nmatches)
    1275             :     {
    1276         182 :         endpos = splitctx->match_locs[splitctx->next_match * 2];
    1277         182 :         if (endpos < startpos)
    1278           0 :             elog(ERROR, "invalid match starting position");
    1279         182 :         return DirectFunctionCall3(text_substr,
    1280             :                                    PointerGetDatum(splitctx->orig_str),
    1281             :                                    Int32GetDatum(startpos + 1),     /* +1: presumably text_substr counts from 1 */
    1282             :                                    Int32GetDatum(endpos - startpos));   /* length of the piece */
    1283             :     }
    1284             :     else
    1285             :     {
    1286             :         /* no more matches, return rest of string (no length argument) */
    1287          13 :         return DirectFunctionCall2(text_substr_no_len,
    1288             :                                    PointerGetDatum(splitctx->orig_str),
    1289             :                                    Int32GetDatum(startpos + 1));
    1290             :     }
    1291             : }
    1292             : 
    1293             : /*
    1294             :  * regexp_fixed_prefix - extract fixed prefix, if any, for a regexp
    1295             :  * text_re is compiled as REG_ADVANCED (plus REG_ICASE if case_insensitive).
    1296             :  * The result is NULL if there is no fixed prefix, else a palloc'd string.
    1297             :  * *exact is set to true only if it is an exact match, not just a prefix.
    1298             :  */
    1299             : char *
    1300        1256 : regexp_fixed_prefix(text *text_re, bool case_insensitive, Oid collation,
    1301             :                     bool *exact)
    1302             : {
    1303             :     char       *result;
    1304             :     regex_t    *re;
    1305             :     int         cflags;
    1306             :     int         re_result;
    1307             :     pg_wchar   *str;
    1308             :     size_t      slen;
    1309             :     size_t      maxlen;
    1310             :     char        errMsg[100];
    1311             : 
    1312        1256 :     *exact = false;             /* default result */
    1313             : 
    1314             :     /* Compile RE */
    1315        1256 :     cflags = REG_ADVANCED;
    1316        1256 :     if (case_insensitive)
    1317           0 :         cflags |= REG_ICASE;
    1318             : 
    1319        1256 :     re = RE_compile_and_cache(text_re, cflags, collation);
    1320             : 
    1321             :     /* Examine it to see if there's a fixed prefix; on success str/slen hold it */
    1322        1256 :     re_result = pg_regprefix(re, &str, &slen);
    1323             : 
    1324        1256 :     switch (re_result)
    1325             :     {
    1326             :         case REG_NOMATCH:       /* no fixed prefix */
    1327          46 :             return NULL;
    1328             : 
    1329             :         case REG_PREFIX:
    1330             :             /* continue with wchar conversion; *exact stays false */
    1331          98 :             break;
    1332             : 
    1333             :         case REG_EXACT:
    1334        1112 :             *exact = true;
    1335             :             /* continue with wchar conversion */
    1336        1112 :             break;
    1337             : 
    1338             :         default:
    1339             :             /* re failed???  report any other code as an invalid-regexp ERROR */
    1340           0 :             CHECK_FOR_INTERRUPTS();
    1341           0 :             pg_regerror(re_result, re, errMsg, sizeof(errMsg));
    1342           0 :             ereport(ERROR,
    1343             :                     (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
    1344             :                      errmsg("regular expression failed: %s", errMsg)));
    1345             :             break;
    1346             :     }
    1347             : 
    1348             :     /* Convert pg_wchar result back to database encoding; size for slen chars at max length, plus one byte */
    1349        1210 :     maxlen = pg_database_encoding_max_length() * slen + 1;
    1350        1210 :     result = (char *) palloc(maxlen);
    1351        1210 :     slen = pg_wchar2mb_with_len(str, result, slen);
    1352        1210 :     Assert(slen < maxlen);
    1353             :     /* str is released with free(), not pfree(); presumably pg_regprefix malloc'd it */
    1354        1210 :     free(str);
    1355             : 
    1356        1210 :     return result;
    1357             : }

Generated by: LCOV version 1.11