Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * varlena.c
4 : * Functions for the variable-length built-in types.
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : * Portions Copyright (c) 1994, Regents of the University of California
8 : *
9 : *
10 : * IDENTIFICATION
11 : * src/backend/utils/adt/varlena.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #include "postgres.h"
16 :
17 : #include <ctype.h>
18 : #include <limits.h>
19 :
20 : #include "access/hash.h"
21 : #include "access/tuptoaster.h"
22 : #include "catalog/pg_collation.h"
23 : #include "catalog/pg_type.h"
24 : #include "common/md5.h"
25 : #include "lib/hyperloglog.h"
26 : #include "libpq/pqformat.h"
27 : #include "miscadmin.h"
28 : #include "parser/scansup.h"
29 : #include "port/pg_bswap.h"
30 : #include "regex/regex.h"
31 : #include "utils/builtins.h"
32 : #include "utils/bytea.h"
33 : #include "utils/lsyscache.h"
34 : #include "utils/memutils.h"
35 : #include "utils/pg_locale.h"
36 : #include "utils/sortsupport.h"
37 : #include "utils/varlena.h"
38 :
39 :
40 : /* GUC variable: output format consulted by byteaout() (hex or escape) */
41 : int bytea_output = BYTEA_OUTPUT_HEX;
42 :
/* Local aliases for struct varlena, used by the accessor macros below */
43 : typedef struct varlena unknown;
44 : typedef struct varlena VarString;
45 :
/*
 * Search state handed to text_position_setup(), text_position_next() and
 * text_position_cleanup().
 */
46 : typedef struct
47 : {
48 : bool use_wchar; /* true if multibyte encoding (use wstr1/wstr2) */
49 : char *str1; /* use these if not use_wchar */
50 : char *str2; /* note: these point to original texts */
51 : pg_wchar *wstr1; /* use these if use_wchar */
52 : pg_wchar *wstr2; /* note: these are palloc'd */
53 : int len1; /* string lengths in logical characters */
54 : int len2;
55 : /* Skip table for Boyer-Moore-Horspool search algorithm: */
56 : int skiptablemask; /* mask for ANDing with skiptable subscripts */
57 : int skiptable[256]; /* skip distance for given mismatched char */
58 : } TextPositionState;
59 :
/*
 * Scratch buffers, cached comparison result and abbreviated-key statistics
 * for sorting strings.
 * NOTE(review): presumably attached to a SortSupport by the varstr* sort
 * routines declared below -- confirm against their definitions.
 */
60 : typedef struct
61 : {
62 : char *buf1; /* 1st string, or abbreviation original string
63 : * buf */
64 : char *buf2; /* 2nd string, or abbreviation strxfrm() buf */
65 : int buflen1; /* buffer size for buf1 (NOTE(review): presumably allocated bytes -- confirm) */
66 : int buflen2; /* likewise for buf2 */
67 : int last_len1; /* Length of last buf1 string/strxfrm() input */
68 : int last_len2; /* Length of last buf2 string/strxfrm() blob */
69 : int last_returned; /* Last comparison result (cache) */
70 : bool cache_blob; /* Does buf2 contain strxfrm() blob, etc? */
71 : bool collate_c; /* NOTE(review): presumably true when the C collation applies -- confirm */
72 : bool bpchar; /* Sorting bpchar, not varchar/text/bytea? */
73 : hyperLogLogState abbr_card; /* Abbreviated key cardinality state */
74 : hyperLogLogState full_card; /* Full key cardinality state */
75 : double prop_card; /* Required cardinality proportion */
76 : pg_locale_t locale; /* NOTE(review): presumably the locale used for comparisons -- confirm */
77 : } VarStringSortSupport;
78 :
79 : /*
80 : * Scratch-buffer size: large enough that most strings will fit, but small
81 : * enough that we feel comfortable putting the buffer on the stack.
82 : */
83 : #define TEXTBUFLEN 1024
84 :
/*
 * Datum and function-argument accessors for the "unknown" alias.  The plain
 * forms go through PG_DETOAST_DATUM, the Copy forms through
 * PG_DETOAST_DATUM_COPY.
 */
85 : #define DatumGetUnknownP(X) ((unknown *) PG_DETOAST_DATUM(X))
86 : #define DatumGetUnknownPCopy(X) ((unknown *) PG_DETOAST_DATUM_COPY(X))
87 : #define PG_GETARG_UNKNOWN_P(n) DatumGetUnknownP(PG_GETARG_DATUM(n))
88 : #define PG_GETARG_UNKNOWN_P_COPY(n) DatumGetUnknownPCopy(PG_GETARG_DATUM(n))
89 : #define PG_RETURN_UNKNOWN_P(x) PG_RETURN_POINTER(x)
90 :
/* Same for VarString; the PP form uses PG_DETOAST_DATUM_PACKED instead */
91 : #define DatumGetVarStringP(X) ((VarString *) PG_DETOAST_DATUM(X))
92 : #define DatumGetVarStringPP(X) ((VarString *) PG_DETOAST_DATUM_PACKED(X))
93 :
/* Forward declarations of the file-local (static) helper routines */
94 : static int varstrfastcmp_c(Datum x, Datum y, SortSupport ssup);
95 : static int bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup);
96 : static int varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup);
97 : static int varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup);
98 : static Datum varstr_abbrev_convert(Datum original, SortSupport ssup);
99 : static bool varstr_abbrev_abort(int memtupcount, SortSupport ssup);
100 : static int32 text_length(Datum str);
101 : static text *text_catenate(text *t1, text *t2);
102 : static text *text_substring(Datum str,
103 : int32 start,
104 : int32 length,
105 : bool length_not_specified);
106 : static text *text_overlay(text *t1, text *t2, int sp, int sl);
107 : static int text_position(text *t1, text *t2);
108 : static void text_position_setup(text *t1, text *t2, TextPositionState *state);
109 : static int text_position_next(int start_pos, TextPositionState *state);
110 : static void text_position_cleanup(TextPositionState *state);
111 : static int text_cmp(text *arg1, text *arg2, Oid collid);
112 : static bytea *bytea_catenate(bytea *t1, bytea *t2);
113 : static bytea *bytea_substring(Datum str,
114 : int S,
115 : int L,
116 : bool length_not_specified);
117 : static bytea *bytea_overlay(bytea *t1, bytea *t2, int sp, int sl);
118 : static void appendStringInfoText(StringInfo str, const text *t);
119 : static Datum text_to_array_internal(PG_FUNCTION_ARGS);
120 : static text *array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
121 : const char *fldsep, const char *null_string);
122 : static StringInfo makeStringAggState(FunctionCallInfo fcinfo);
123 : static bool text_format_parse_digits(const char **ptr, const char *end_ptr,
124 : int *value);
125 : static const char *text_format_parse_format(const char *start_ptr,
126 : const char *end_ptr,
127 : int *argpos, int *widthpos,
128 : int *flags, int *width);
129 : static void text_format_string_conversion(StringInfo buf, char conversion,
130 : FmgrInfo *typOutputInfo,
131 : Datum value, bool isNull,
132 : int flags, int width);
133 : static void text_format_append_string(StringInfo buf, const char *str,
134 : int flags, int width);
135 :
136 :
137 : /*****************************************************************************
138 : * CONVERSION ROUTINES EXPORTED FOR USE BY C CODE *
139 : *****************************************************************************/
140 :
141 : /*
142 : * cstring_to_text
143 : *
144 : * Create a text value from a null-terminated C string.
145 : *
146 : * The new text value is freshly palloc'd with a full-size VARHDR.
147 : */
148 : text *
149 263161 : cstring_to_text(const char *s)
150 : {
151 263161 : return cstring_to_text_with_len(s, strlen(s));
152 : }
153 :
154 : /*
155 : * cstring_to_text_with_len
156 : *
157 : * Same as cstring_to_text except the caller specifies the string length;
158 : * the string need not be null_terminated.
159 : */
160 : text *
161 316872 : cstring_to_text_with_len(const char *s, int len)
162 : {
163 316872 : text *result = (text *) palloc(len + VARHDRSZ);
164 :
165 316872 : SET_VARSIZE(result, len + VARHDRSZ);
166 316872 : memcpy(VARDATA(result), s, len);
167 :
168 316872 : return result;
169 : }
170 :
171 : /*
172 : * text_to_cstring
173 : *
174 : * Create a palloc'd, null-terminated C string from a text value.
175 : *
176 : * We support being passed a compressed or toasted text value.
177 : * This is a bit bogus since such values shouldn't really be referred to as
178 : * "text *", but it seems useful for robustness. If we didn't handle that
179 : * case here, we'd need another routine that did, anyway.
180 : */
181 : char *
182 89391 : text_to_cstring(const text *t)
183 : {
184 : /* must cast away the const, unfortunately */
185 89391 : text *tunpacked = pg_detoast_datum_packed((struct varlena *) t);
186 89391 : int len = VARSIZE_ANY_EXHDR(tunpacked);
187 : char *result;
188 :
189 89391 : result = (char *) palloc(len + 1);
190 89391 : memcpy(result, VARDATA_ANY(tunpacked), len);
191 89391 : result[len] = '\0';
192 :
193 89391 : if (tunpacked != t)
194 1209 : pfree(tunpacked);
195 :
196 89391 : return result;
197 : }
198 :
199 : /*
200 : * text_to_cstring_buffer
201 : *
202 : * Copy a text value into a caller-supplied buffer of size dst_len.
203 : *
204 : * The text string is truncated if necessary to fit. The result is
205 : * guaranteed null-terminated (unless dst_len == 0).
206 : *
207 : * We support being passed a compressed or toasted text value.
208 : * This is a bit bogus since such values shouldn't really be referred to as
209 : * "text *", but it seems useful for robustness. If we didn't handle that
210 : * case here, we'd need another routine that did, anyway.
211 : */
212 : void
213 92 : text_to_cstring_buffer(const text *src, char *dst, size_t dst_len)
214 : {
215 : /* must cast away the const, unfortunately */
216 92 : text *srcunpacked = pg_detoast_datum_packed((struct varlena *) src);
217 92 : size_t src_len = VARSIZE_ANY_EXHDR(srcunpacked);
218 :
219 92 : if (dst_len > 0)
220 : {
221 92 : dst_len--;
222 92 : if (dst_len >= src_len)
223 92 : dst_len = src_len;
224 : else /* ensure truncation is encoding-safe */
225 0 : dst_len = pg_mbcliplen(VARDATA_ANY(srcunpacked), src_len, dst_len);
226 92 : memcpy(dst, VARDATA_ANY(srcunpacked), dst_len);
227 92 : dst[dst_len] = '\0';
228 : }
229 :
230 92 : if (srcunpacked != src)
231 0 : pfree(srcunpacked);
232 92 : }
233 :
234 :
235 : /*****************************************************************************
236 : * USER I/O ROUTINES *
237 : *****************************************************************************/
238 :
239 :
/*
 * Convert a digit character to its numeric value (VAL) and back (DIG).
 * Neither macro range-checks its argument.
 */
240 : #define VAL(CH) ((CH) - '0')
241 : #define DIG(VAL) ((VAL) + '0')
242 :
243 : /*
244 : * byteain - converts from printable representation of byte array
245 : *
246 : * Non-printable characters must be passed as '\nnn' (octal) and are
247 : * converted to internal form. '\' must be passed as '\\'.
248 : * ereport(ERROR, ...) if bad form.
249 : *
250 : * BUGS:
251 : * The input is scanned twice.
252 : * The error checking of input is minimal.
253 : */
254 : Datum
255 746 : byteain(PG_FUNCTION_ARGS)
256 : {
257 746 : char *inputText = PG_GETARG_CSTRING(0);
258 : char *tp;
259 : char *rp;
260 : int bc;
261 : bytea *result;
262 :
263 : /* Recognize hex input */
264 746 : if (inputText[0] == '\\' && inputText[1] == 'x')
265 : {
266 9 : size_t len = strlen(inputText);
267 :
268 9 : bc = (len - 2) / 2 + VARHDRSZ; /* maximum possible length */
269 9 : result = palloc(bc);
270 9 : bc = hex_decode(inputText + 2, len - 2, VARDATA(result));
271 7 : SET_VARSIZE(result, bc + VARHDRSZ); /* actual length */
272 :
273 7 : PG_RETURN_BYTEA_P(result);
274 : }
275 :
276 : /* Else, it's the traditional escaped style */
277 10311 : for (bc = 0, tp = inputText; *tp != '\0'; bc++)
278 : {
279 9575 : if (tp[0] != '\\')
280 9471 : tp++;
281 208 : else if ((tp[0] == '\\') &&
282 311 : (tp[1] >= '0' && tp[1] <= '3') &&
283 309 : (tp[2] >= '0' && tp[2] <= '7') &&
284 206 : (tp[3] >= '0' && tp[3] <= '7'))
285 103 : tp += 4;
286 2 : else if ((tp[0] == '\\') &&
287 1 : (tp[1] == '\\'))
288 0 : tp += 2;
289 : else
290 : {
291 : /*
292 : * one backslash, not followed by another or ### valid octal
293 : */
294 1 : ereport(ERROR,
295 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
296 : errmsg("invalid input syntax for type %s", "bytea")));
297 : }
298 : }
299 :
300 736 : bc += VARHDRSZ;
301 :
302 736 : result = (bytea *) palloc(bc);
303 736 : SET_VARSIZE(result, bc);
304 :
305 736 : tp = inputText;
306 736 : rp = VARDATA(result);
307 11044 : while (*tp != '\0')
308 : {
309 9572 : if (tp[0] != '\\')
310 9469 : *rp++ = *tp++;
311 206 : else if ((tp[0] == '\\') &&
312 309 : (tp[1] >= '0' && tp[1] <= '3') &&
313 309 : (tp[2] >= '0' && tp[2] <= '7') &&
314 206 : (tp[3] >= '0' && tp[3] <= '7'))
315 : {
316 103 : bc = VAL(tp[1]);
317 103 : bc <<= 3;
318 103 : bc += VAL(tp[2]);
319 103 : bc <<= 3;
320 103 : *rp++ = bc + VAL(tp[3]);
321 :
322 103 : tp += 4;
323 : }
324 0 : else if ((tp[0] == '\\') &&
325 0 : (tp[1] == '\\'))
326 : {
327 0 : *rp++ = '\\';
328 0 : tp += 2;
329 : }
330 : else
331 : {
332 : /*
333 : * We should never get here. The first pass should not allow it.
334 : */
335 0 : ereport(ERROR,
336 : (errcode(ERRCODE_INVALID_TEXT_REPRESENTATION),
337 : errmsg("invalid input syntax for type %s", "bytea")));
338 : }
339 : }
340 :
341 736 : PG_RETURN_BYTEA_P(result);
342 : }
343 :
344 : /*
345 : * byteaout - converts to printable representation of byte array
346 : *
347 : * In the traditional escaped format, non-printable characters are
348 : * printed as '\nnn' (octal) and '\' as '\\'.
349 : */
350 : Datum
351 67 : byteaout(PG_FUNCTION_ARGS)
352 : {
353 67 : bytea *vlena = PG_GETARG_BYTEA_PP(0);
354 : char *result;
355 : char *rp;
356 :
357 67 : if (bytea_output == BYTEA_OUTPUT_HEX)
358 : {
359 : /* Print hex format */
360 34 : rp = result = palloc(VARSIZE_ANY_EXHDR(vlena) * 2 + 2 + 1);
361 34 : *rp++ = '\\';
362 34 : *rp++ = 'x';
363 34 : rp += hex_encode(VARDATA_ANY(vlena), VARSIZE_ANY_EXHDR(vlena), rp);
364 : }
365 33 : else if (bytea_output == BYTEA_OUTPUT_ESCAPE)
366 : {
367 : /* Print traditional escaped format */
368 : char *vp;
369 : int len;
370 : int i;
371 :
372 33 : len = 1; /* empty string has 1 char */
373 33 : vp = VARDATA_ANY(vlena);
374 380 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
375 : {
376 347 : if (*vp == '\\')
377 0 : len += 2;
378 347 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
379 79 : len += 4;
380 : else
381 268 : len++;
382 : }
383 33 : rp = result = (char *) palloc(len);
384 33 : vp = VARDATA_ANY(vlena);
385 380 : for (i = VARSIZE_ANY_EXHDR(vlena); i != 0; i--, vp++)
386 : {
387 347 : if (*vp == '\\')
388 : {
389 0 : *rp++ = '\\';
390 0 : *rp++ = '\\';
391 : }
392 347 : else if ((unsigned char) *vp < 0x20 || (unsigned char) *vp > 0x7e)
393 79 : {
394 : int val; /* holds unprintable chars */
395 :
396 79 : val = *vp;
397 79 : rp[0] = '\\';
398 79 : rp[3] = DIG(val & 07);
399 79 : val >>= 3;
400 79 : rp[2] = DIG(val & 07);
401 79 : val >>= 3;
402 79 : rp[1] = DIG(val & 03);
403 79 : rp += 4;
404 : }
405 : else
406 268 : *rp++ = *vp;
407 : }
408 : }
409 : else
410 : {
411 0 : elog(ERROR, "unrecognized bytea_output setting: %d",
412 : bytea_output);
413 : rp = result = NULL; /* keep compiler quiet */
414 : }
415 67 : *rp = '\0';
416 67 : PG_RETURN_CSTRING(result);
417 : }
418 :
419 : /*
420 : * bytearecv - converts external binary format to bytea
421 : */
422 : Datum
423 164 : bytearecv(PG_FUNCTION_ARGS)
424 : {
425 164 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
426 : bytea *result;
427 : int nbytes;
428 :
429 164 : nbytes = buf->len - buf->cursor;
430 164 : result = (bytea *) palloc(nbytes + VARHDRSZ);
431 164 : SET_VARSIZE(result, nbytes + VARHDRSZ);
432 164 : pq_copymsgbytes(buf, VARDATA(result), nbytes);
433 164 : PG_RETURN_BYTEA_P(result);
434 : }
435 :
436 : /*
437 : * byteasend - converts bytea to binary format
438 : *
439 : * This is a special case: just copy the input...
440 : */
441 : Datum
442 83 : byteasend(PG_FUNCTION_ARGS)
443 : {
444 83 : bytea *vlena = PG_GETARG_BYTEA_P_COPY(0);
445 :
446 83 : PG_RETURN_BYTEA_P(vlena);
447 : }
448 :
449 : Datum
450 7 : bytea_string_agg_transfn(PG_FUNCTION_ARGS)
451 : {
452 : StringInfo state;
453 :
454 7 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
455 :
456 : /* Append the value unless null. */
457 7 : if (!PG_ARGISNULL(1))
458 : {
459 7 : bytea *value = PG_GETARG_BYTEA_PP(1);
460 :
461 : /* On the first time through, we ignore the delimiter. */
462 7 : if (state == NULL)
463 4 : state = makeStringAggState(fcinfo);
464 3 : else if (!PG_ARGISNULL(2))
465 : {
466 2 : bytea *delim = PG_GETARG_BYTEA_PP(2);
467 :
468 2 : appendBinaryStringInfo(state, VARDATA_ANY(delim), VARSIZE_ANY_EXHDR(delim));
469 : }
470 :
471 7 : appendBinaryStringInfo(state, VARDATA_ANY(value), VARSIZE_ANY_EXHDR(value));
472 : }
473 :
474 : /*
475 : * The transition type for string_agg() is declared to be "internal",
476 : * which is a pass-by-value type the same size as a pointer.
477 : */
478 7 : PG_RETURN_POINTER(state);
479 : }
480 :
481 : Datum
482 5 : bytea_string_agg_finalfn(PG_FUNCTION_ARGS)
483 : {
484 : StringInfo state;
485 :
486 : /* cannot be called directly because of internal-type argument */
487 5 : Assert(AggCheckCallContext(fcinfo, NULL));
488 :
489 5 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
490 :
491 5 : if (state != NULL)
492 : {
493 : bytea *result;
494 :
495 4 : result = (bytea *) palloc(state->len + VARHDRSZ);
496 4 : SET_VARSIZE(result, state->len + VARHDRSZ);
497 4 : memcpy(VARDATA(result), state->data, state->len);
498 4 : PG_RETURN_BYTEA_P(result);
499 : }
500 : else
501 1 : PG_RETURN_NULL();
502 : }
503 :
504 : /*
505 : * textin - converts "..." to internal representation
506 : */
507 : Datum
508 137657 : textin(PG_FUNCTION_ARGS)
509 : {
510 137657 : char *inputText = PG_GETARG_CSTRING(0);
511 :
512 137657 : PG_RETURN_TEXT_P(cstring_to_text(inputText));
513 : }
514 :
515 : /*
516 : * textout - converts internal representation to "..."
517 : */
518 : Datum
519 39899 : textout(PG_FUNCTION_ARGS)
520 : {
521 39899 : Datum txt = PG_GETARG_DATUM(0);
522 :
523 39899 : PG_RETURN_CSTRING(TextDatumGetCString(txt));
524 : }
525 :
526 : /*
527 : * textrecv - converts external binary format to text
528 : */
529 : Datum
530 3 : textrecv(PG_FUNCTION_ARGS)
531 : {
532 3 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
533 : text *result;
534 : char *str;
535 : int nbytes;
536 :
537 3 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
538 :
539 3 : result = cstring_to_text_with_len(str, nbytes);
540 3 : pfree(str);
541 3 : PG_RETURN_TEXT_P(result);
542 : }
543 :
544 : /*
545 : * textsend - converts text to binary format
546 : */
547 : Datum
548 3 : textsend(PG_FUNCTION_ARGS)
549 : {
550 3 : text *t = PG_GETARG_TEXT_PP(0);
551 : StringInfoData buf;
552 :
553 3 : pq_begintypsend(&buf);
554 3 : pq_sendtext(&buf, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
555 3 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
556 : }
557 :
558 :
559 : /*
560 : * unknownin - converts "..." to internal representation
561 : */
562 : Datum
563 0 : unknownin(PG_FUNCTION_ARGS)
564 : {
565 0 : char *str = PG_GETARG_CSTRING(0);
566 :
567 : /* representation is same as cstring */
568 0 : PG_RETURN_CSTRING(pstrdup(str));
569 : }
570 :
571 : /*
572 : * unknownout - converts internal representation to "..."
573 : */
574 : Datum
575 65 : unknownout(PG_FUNCTION_ARGS)
576 : {
577 : /* representation is same as cstring */
578 65 : char *str = PG_GETARG_CSTRING(0);
579 :
580 65 : PG_RETURN_CSTRING(pstrdup(str));
581 : }
582 :
583 : /*
584 : * unknownrecv - converts external binary format to unknown
585 : */
586 : Datum
587 0 : unknownrecv(PG_FUNCTION_ARGS)
588 : {
589 0 : StringInfo buf = (StringInfo) PG_GETARG_POINTER(0);
590 : char *str;
591 : int nbytes;
592 :
593 0 : str = pq_getmsgtext(buf, buf->len - buf->cursor, &nbytes);
594 : /* representation is same as cstring */
595 0 : PG_RETURN_CSTRING(str);
596 : }
597 :
598 : /*
599 : * unknownsend - converts unknown to binary format
600 : */
601 : Datum
602 0 : unknownsend(PG_FUNCTION_ARGS)
603 : {
604 : /* representation is same as cstring */
605 0 : char *str = PG_GETARG_CSTRING(0);
606 : StringInfoData buf;
607 :
608 0 : pq_begintypsend(&buf);
609 0 : pq_sendtext(&buf, str, strlen(str));
610 0 : PG_RETURN_BYTEA_P(pq_endtypsend(&buf));
611 : }
612 :
613 :
614 : /* ========== PUBLIC ROUTINES ========== */
615 :
616 : /*
617 : * textlen -
618 : * returns the logical length of a text*
619 : * (which is less than the VARSIZE of the text*)
620 : */
621 : Datum
622 10247 : textlen(PG_FUNCTION_ARGS)
623 : {
624 10247 : Datum str = PG_GETARG_DATUM(0);
625 :
626 : /* try to avoid decompressing argument */
627 10247 : PG_RETURN_INT32(text_length(str));
628 : }
629 :
630 : /*
631 : * text_length -
632 : * Does the real work for textlen()
633 : *
634 : * This is broken out so it can be called directly by other string processing
635 : * functions. Note that the argument is passed as a Datum, to indicate that
636 : * it may still be in compressed form. We can avoid decompressing it at all
637 : * in some cases.
638 : */
639 : static int32
640 10249 : text_length(Datum str)
641 : {
642 : /* fastpath when max encoding length is one */
643 10249 : if (pg_database_encoding_max_length() == 1)
644 0 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
645 : else
646 : {
647 10249 : text *t = DatumGetTextPP(str);
648 :
649 10249 : PG_RETURN_INT32(pg_mbstrlen_with_len(VARDATA_ANY(t),
650 : VARSIZE_ANY_EXHDR(t)));
651 : }
652 : }
653 :
654 : /*
655 : * textoctetlen -
656 : * returns the physical length of a text*
657 : * (which is less than the VARSIZE of the text*)
658 : */
659 : Datum
660 3 : textoctetlen(PG_FUNCTION_ARGS)
661 : {
662 3 : Datum str = PG_GETARG_DATUM(0);
663 :
664 : /* We need not detoast the input at all */
665 3 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
666 : }
667 :
668 : /*
669 : * textcat -
670 : * takes two text* and returns a text* that is the concatenation of
671 : * the two.
672 : *
673 : * Rewritten by Sapa, sapa@hq.icb.chel.su. 8-Jul-96.
674 : * Updated by Thomas, Thomas.Lockhart@jpl.nasa.gov 1997-07-10.
675 : * Allocate space for output in all cases.
676 : * XXX - thomas 1997-07-10
677 : */
678 : Datum
679 59669 : textcat(PG_FUNCTION_ARGS)
680 : {
681 59669 : text *t1 = PG_GETARG_TEXT_PP(0);
682 59669 : text *t2 = PG_GETARG_TEXT_PP(1);
683 :
684 59669 : PG_RETURN_TEXT_P(text_catenate(t1, t2));
685 : }
686 :
687 : /*
688 : * text_catenate
689 : * Guts of textcat(), broken out so it can be used by other functions
690 : *
691 : * Arguments can be in short-header form, but not compressed or out-of-line
692 : */
693 : static text *
694 59677 : text_catenate(text *t1, text *t2)
695 : {
696 : text *result;
697 : int len1,
698 : len2,
699 : len;
700 : char *ptr;
701 :
702 59677 : len1 = VARSIZE_ANY_EXHDR(t1);
703 59677 : len2 = VARSIZE_ANY_EXHDR(t2);
704 :
705 : /* paranoia ... probably should throw error instead? */
706 59677 : if (len1 < 0)
707 0 : len1 = 0;
708 59677 : if (len2 < 0)
709 0 : len2 = 0;
710 :
711 59677 : len = len1 + len2 + VARHDRSZ;
712 59677 : result = (text *) palloc(len);
713 :
714 : /* Set size of result string... */
715 59677 : SET_VARSIZE(result, len);
716 :
717 : /* Fill data field of result string... */
718 59677 : ptr = VARDATA(result);
719 59677 : if (len1 > 0)
720 59637 : memcpy(ptr, VARDATA_ANY(t1), len1);
721 59677 : if (len2 > 0)
722 59660 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
723 :
724 59677 : return result;
725 : }
726 :
/*
 * charlen_to_bytelen()
 *	Compute the number of bytes occupied by n characters starting at *p
 *
 * The caller must guarantee that n characters really are present; the
 * string does not have to be null-terminated.
 */
static int
charlen_to_bytelen(const char *p, int n)
{
	const char *cur = p;

	/* In a single-byte encoding, characters and bytes are the same thing */
	if (pg_database_encoding_max_length() == 1)
		return n;

	/* Otherwise step over the characters one at a time */
	while (n > 0)
	{
		cur += pg_mblen(cur);
		n--;
	}

	return cur - p;
}
752 :
753 : /*
754 : * text_substr()
755 : * Return a substring starting at the specified position.
756 : * - thomas 1997-12-31
757 : *
758 : * Input:
759 : * - string
760 : * - starting position (is one-based)
761 : * - string length
762 : *
763 : * If the starting position is zero or less, then return from the start of the string,
764 : * adjusting the length to be consistent with the "negative start" per SQL.
765 : * If the length is less than zero, an error is raised (use text_substr_no_len() to get the remaining string).
766 : *
767 : * Added multibyte support.
768 : * - Tatsuo Ishii 1998-4-21
769 : * Changed behavior if starting position is less than one to conform to SQL behavior.
770 : * Formerly returned the entire string; now returns a portion.
771 : * - Thomas Lockhart 1998-12-10
772 : * Now uses faster TOAST-slicing interface
773 : * - John Gray 2002-02-22
774 : * Remove "#ifdef MULTIBYTE" and test for encoding_max_length instead. Change
775 : * behaviors conflicting with SQL to meet SQL (if E = S + L < S throw
776 : * error; if E < 1, return '', not entire string). Fixed MB related bug when
777 : * S > LC and < LC + 4 sometimes garbage characters are returned.
778 : * - Joe Conway 2002-08-10
779 : */
780 : Datum
781 1094 : text_substr(PG_FUNCTION_ARGS)
782 : {
783 1094 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
784 : PG_GETARG_INT32(1),
785 : PG_GETARG_INT32(2),
786 : false));
787 : }
788 :
789 : /*
790 : * text_substr_no_len -
791 : * Wrapper to avoid opr_sanity failure due to
792 : * one function accepting a different number of args.
793 : */
794 : Datum
795 18 : text_substr_no_len(PG_FUNCTION_ARGS)
796 : {
797 18 : PG_RETURN_TEXT_P(text_substring(PG_GETARG_DATUM(0),
798 : PG_GETARG_INT32(1),
799 : -1, true));
800 : }
801 :
802 : /*
803 : * text_substring -
804 : * Does the real work for text_substr() and text_substr_no_len()
805 : *
806 : * This is broken out so it can be called directly by other string processing
807 : * functions. Note that the argument is passed as a Datum, to indicate that
808 : * it may still be in compressed/toasted form. We can avoid detoasting all
809 : * of it in some cases.
810 : *
811 : * The result is always a freshly palloc'd datum.
812 : */
813 : static text *
814 1123 : text_substring(Datum str, int32 start, int32 length, bool length_not_specified)
815 : {
816 1123 : int32 eml = pg_database_encoding_max_length();
817 1123 : int32 S = start; /* start position */
818 : int32 S1; /* adjusted start position */
819 : int32 L1; /* adjusted substring length */
820 :
821 : /* life is easy if the encoding max length is 1 */
822 1123 : if (eml == 1)
823 : {
824 0 : S1 = Max(S, 1);
825 :
826 0 : if (length_not_specified) /* special case - get length to end of
827 : * string */
828 0 : L1 = -1;
829 : else
830 : {
831 : /* end position */
832 0 : int E = S + length;
833 :
834 : /*
835 : * A negative value for L is the only way for the end position to
836 : * be before the start. SQL99 says to throw an error.
837 : */
838 0 : if (E < S)
839 0 : ereport(ERROR,
840 : (errcode(ERRCODE_SUBSTRING_ERROR),
841 : errmsg("negative substring length not allowed")));
842 :
843 : /*
844 : * A zero or negative value for the end position can happen if the
845 : * start was negative or one. SQL99 says to return a zero-length
846 : * string.
847 : */
848 0 : if (E < 1)
849 0 : return cstring_to_text("");
850 :
851 0 : L1 = E - S1;
852 : }
853 :
854 : /*
855 : * If the start position is past the end of the string, SQL99 says to
856 : * return a zero-length string -- PG_GETARG_TEXT_P_SLICE() will do
857 : * that for us. Convert to zero-based starting position
858 : */
859 0 : return DatumGetTextPSlice(str, S1 - 1, L1);
860 : }
861 1123 : else if (eml > 1)
862 : {
863 : /*
864 : * When encoding max length is > 1, we can't get LC without
865 : * detoasting, so we'll grab a conservatively large slice now and go
866 : * back later to do the right thing
867 : */
868 : int32 slice_start;
869 : int32 slice_size;
870 : int32 slice_strlen;
871 : text *slice;
872 : int32 E1;
873 : int32 i;
874 : char *p;
875 : char *s;
876 : text *ret;
877 :
878 : /*
879 : * if S is past the end of the string, the tuple toaster will return a
880 : * zero-length string to us
881 : */
882 1123 : S1 = Max(S, 1);
883 :
884 : /*
885 : * We need to start at position zero because there is no way to know
886 : * in advance which byte offset corresponds to the supplied start
887 : * position.
888 : */
889 1123 : slice_start = 0;
890 :
891 1123 : if (length_not_specified) /* special case - get length to end of
892 : * string */
893 23 : slice_size = L1 = -1;
894 : else
895 : {
896 1100 : int E = S + length;
897 :
898 : /*
899 : * A negative value for L is the only way for the end position to
900 : * be before the start. SQL99 says to throw an error.
901 : */
902 1100 : if (E < S)
903 1 : ereport(ERROR,
904 : (errcode(ERRCODE_SUBSTRING_ERROR),
905 : errmsg("negative substring length not allowed")));
906 :
907 : /*
908 : * A zero or negative value for the end position can happen if the
909 : * start was negative or one. SQL99 says to return a zero-length
910 : * string.
911 : */
912 1099 : if (E < 1)
913 0 : return cstring_to_text("");
914 :
915 : /*
916 : * if E is past the end of the string, the tuple toaster will
917 : * truncate the length for us
918 : */
919 1099 : L1 = E - S1;
920 :
921 : /*
922 : * Total slice size in bytes can't be any longer than the start
923 : * position plus substring length times the encoding max length.
924 : */
925 1099 : slice_size = (S1 + L1) * eml;
926 : }
927 :
928 : /*
929 : * If we're working with an untoasted source, no need to do an extra
930 : * copying step.
931 : */
932 2238 : if (VARATT_IS_COMPRESSED(DatumGetPointer(str)) ||
933 1116 : VARATT_IS_EXTERNAL(DatumGetPointer(str)))
934 17 : slice = DatumGetTextPSlice(str, slice_start, slice_size);
935 : else
936 1105 : slice = (text *) DatumGetPointer(str);
937 :
938 : /* see if we got back an empty string */
939 1122 : if (VARSIZE_ANY_EXHDR(slice) == 0)
940 : {
941 0 : if (slice != (text *) DatumGetPointer(str))
942 0 : pfree(slice);
943 0 : return cstring_to_text("");
944 : }
945 :
946 : /* Now we can get the actual length of the slice in MB characters */
947 3366 : slice_strlen = pg_mbstrlen_with_len(VARDATA_ANY(slice),
948 3366 : VARSIZE_ANY_EXHDR(slice));
949 :
950 : /*
951 : * Check that the start position wasn't > slice_strlen. If so, SQL99
952 : * says to return a zero-length string.
953 : */
954 1122 : if (S1 > slice_strlen)
955 : {
956 7 : if (slice != (text *) DatumGetPointer(str))
957 0 : pfree(slice);
958 7 : return cstring_to_text("");
959 : }
960 :
961 : /*
962 : * Adjust L1 and E1 now that we know the slice string length. Again
963 : * remember that S1 is one based, and slice_start is zero based.
964 : */
965 1115 : if (L1 > -1)
966 1095 : E1 = Min(S1 + L1, slice_start + 1 + slice_strlen);
967 : else
968 20 : E1 = slice_start + 1 + slice_strlen;
969 :
970 : /*
971 : * Find the start position in the slice; remember S1 is not zero based
972 : */
973 1115 : p = VARDATA_ANY(slice);
974 805133 : for (i = 0; i < S1 - 1; i++)
975 804018 : p += pg_mblen(p);
976 :
977 : /* hang onto a pointer to our start position */
978 1115 : s = p;
979 :
980 : /*
981 : * Count the actual bytes used by the substring of the requested
982 : * length.
983 : */
984 14283 : for (i = S1; i < E1; i++)
985 13168 : p += pg_mblen(p);
986 :
987 1115 : ret = (text *) palloc(VARHDRSZ + (p - s));
988 1115 : SET_VARSIZE(ret, VARHDRSZ + (p - s));
989 1115 : memcpy(VARDATA(ret), s, (p - s));
990 :
991 1115 : if (slice != (text *) DatumGetPointer(str))
992 17 : pfree(slice);
993 :
994 1115 : return ret;
995 : }
996 : else
997 0 : elog(ERROR, "invalid backend encoding: encoding max length < 1");
998 :
999 : /* not reached: suppress compiler warning */
1000 : return NULL;
1001 : }
1002 :
1003 : /*
1004 : * textoverlay
1005 : * Replace specified substring of first string with second
1006 : *
1007 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
1008 : * This code is a direct implementation of what the standard says.
1009 : */
1010 : Datum
1011 2 : textoverlay(PG_FUNCTION_ARGS)
1012 : {
1013 2 : text *t1 = PG_GETARG_TEXT_PP(0);
1014 2 : text *t2 = PG_GETARG_TEXT_PP(1);
1015 2 : int sp = PG_GETARG_INT32(2); /* substring start position */
1016 2 : int sl = PG_GETARG_INT32(3); /* substring length */
1017 :
1018 2 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1019 : }
1020 :
1021 : Datum
1022 2 : textoverlay_no_len(PG_FUNCTION_ARGS)
1023 : {
1024 2 : text *t1 = PG_GETARG_TEXT_PP(0);
1025 2 : text *t2 = PG_GETARG_TEXT_PP(1);
1026 2 : int sp = PG_GETARG_INT32(2); /* substring start position */
1027 : int sl;
1028 :
1029 2 : sl = text_length(PointerGetDatum(t2)); /* defaults to length(t2) */
1030 2 : PG_RETURN_TEXT_P(text_overlay(t1, t2, sp, sl));
1031 : }
1032 :
1033 : static text *
1034 4 : text_overlay(text *t1, text *t2, int sp, int sl)
1035 : {
1036 : text *result;
1037 : text *s1;
1038 : text *s2;
1039 : int sp_pl_sl;
1040 :
1041 : /*
1042 : * Check for possible integer-overflow cases. For negative sp, throw a
1043 : * "substring length" error because that's what should be expected
1044 : * according to the spec's definition of OVERLAY().
1045 : */
1046 4 : if (sp <= 0)
1047 0 : ereport(ERROR,
1048 : (errcode(ERRCODE_SUBSTRING_ERROR),
1049 : errmsg("negative substring length not allowed")));
1050 4 : sp_pl_sl = sp + sl;
1051 4 : if (sp_pl_sl <= sl)
1052 0 : ereport(ERROR,
1053 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
1054 : errmsg("integer out of range")));
1055 :
1056 4 : s1 = text_substring(PointerGetDatum(t1), 1, sp - 1, false);
1057 4 : s2 = text_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
1058 4 : result = text_catenate(s1, t2);
1059 4 : result = text_catenate(result, s2);
1060 :
1061 4 : return result;
1062 : }
1063 :
1064 : /*
1065 : * textpos -
1066 : * Return the position of the specified substring.
1067 : * Implements the SQL POSITION() function.
1068 : * Ref: A Guide To The SQL Standard, Date & Darwen, 1997
1069 : * - thomas 1997-07-27
1070 : */
1071 : Datum
1072 4 : textpos(PG_FUNCTION_ARGS)
1073 : {
1074 4 : text *str = PG_GETARG_TEXT_PP(0);
1075 4 : text *search_str = PG_GETARG_TEXT_PP(1);
1076 :
1077 4 : PG_RETURN_INT32((int32) text_position(str, search_str));
1078 : }
1079 :
1080 : /*
1081 : * text_position -
1082 : * Does the real work for textpos()
1083 : *
1084 : * Inputs:
1085 : * t1 - string to be searched
1086 : * t2 - pattern to match within t1
1087 : * Result:
1088 : * Character index of the first matched char, starting from 1,
1089 : * or 0 if no match.
1090 : *
1091 : * This is broken out so it can be called directly by other string processing
1092 : * functions.
1093 : */
1094 : static int
1095 4 : text_position(text *t1, text *t2)
1096 : {
1097 : TextPositionState state;
1098 : int result;
1099 :
1100 4 : text_position_setup(t1, t2, &state);
1101 4 : result = text_position_next(1, &state);
1102 4 : text_position_cleanup(&state);
1103 4 : return result;
1104 : }
1105 :
1106 :
1107 : /*
1108 : * text_position_setup, text_position_next, text_position_cleanup -
1109 : * Component steps of text_position()
1110 : *
1111 : * These are broken out so that a string can be efficiently searched for
1112 : * multiple occurrences of the same pattern. text_position_next may be
1113 : * called multiple times with increasing values of start_pos, which is
1114 : * the 1-based character position to start the search from. The "state"
1115 : * variable is normally just a local variable in the caller.
1116 : */
1117 :
1118 : static void
1119 30 : text_position_setup(text *t1, text *t2, TextPositionState *state)
1120 : {
1121 30 : int len1 = VARSIZE_ANY_EXHDR(t1);
1122 30 : int len2 = VARSIZE_ANY_EXHDR(t2);
1123 :
1124 30 : if (pg_database_encoding_max_length() == 1)
1125 : {
1126 : /* simple case - single byte encoding */
1127 0 : state->use_wchar = false;
1128 0 : state->str1 = VARDATA_ANY(t1);
1129 0 : state->str2 = VARDATA_ANY(t2);
1130 0 : state->len1 = len1;
1131 0 : state->len2 = len2;
1132 : }
1133 : else
1134 : {
1135 : /* not as simple - multibyte encoding */
1136 : pg_wchar *p1,
1137 : *p2;
1138 :
1139 30 : p1 = (pg_wchar *) palloc((len1 + 1) * sizeof(pg_wchar));
1140 30 : len1 = pg_mb2wchar_with_len(VARDATA_ANY(t1), p1, len1);
1141 30 : p2 = (pg_wchar *) palloc((len2 + 1) * sizeof(pg_wchar));
1142 30 : len2 = pg_mb2wchar_with_len(VARDATA_ANY(t2), p2, len2);
1143 :
1144 30 : state->use_wchar = true;
1145 30 : state->wstr1 = p1;
1146 30 : state->wstr2 = p2;
1147 30 : state->len1 = len1;
1148 30 : state->len2 = len2;
1149 : }
1150 :
1151 : /*
1152 : * Prepare the skip table for Boyer-Moore-Horspool searching. In these
1153 : * notes we use the terminology that the "haystack" is the string to be
1154 : * searched (t1) and the "needle" is the pattern being sought (t2).
1155 : *
1156 : * If the needle is empty or bigger than the haystack then there is no
1157 : * point in wasting cycles initializing the table. We also choose not to
1158 : * use B-M-H for needles of length 1, since the skip table can't possibly
1159 : * save anything in that case.
1160 : */
1161 30 : if (len1 >= len2 && len2 > 1)
1162 : {
1163 8 : int searchlength = len1 - len2;
1164 : int skiptablemask;
1165 : int last;
1166 : int i;
1167 :
1168 : /*
1169 : * First we must determine how much of the skip table to use. The
1170 : * declaration of TextPositionState allows up to 256 elements, but for
1171 : * short search problems we don't really want to have to initialize so
1172 : * many elements --- it would take too long in comparison to the
1173 : * actual search time. So we choose a useful skip table size based on
1174 : * the haystack length minus the needle length. The closer the needle
1175 : * length is to the haystack length the less useful skipping becomes.
1176 : *
1177 : * Note: since we use bit-masking to select table elements, the skip
1178 : * table size MUST be a power of 2, and so the mask must be 2^N-1.
1179 : */
1180 8 : if (searchlength < 16)
1181 6 : skiptablemask = 3;
1182 2 : else if (searchlength < 64)
1183 0 : skiptablemask = 7;
1184 2 : else if (searchlength < 128)
1185 0 : skiptablemask = 15;
1186 2 : else if (searchlength < 512)
1187 2 : skiptablemask = 31;
1188 0 : else if (searchlength < 2048)
1189 0 : skiptablemask = 63;
1190 0 : else if (searchlength < 4096)
1191 0 : skiptablemask = 127;
1192 : else
1193 0 : skiptablemask = 255;
1194 8 : state->skiptablemask = skiptablemask;
1195 :
1196 : /*
1197 : * Initialize the skip table. We set all elements to the needle
1198 : * length, since this is the correct skip distance for any character
1199 : * not found in the needle.
1200 : */
1201 96 : for (i = 0; i <= skiptablemask; i++)
1202 88 : state->skiptable[i] = len2;
1203 :
1204 : /*
1205 : * Now examine the needle. For each character except the last one,
1206 : * set the corresponding table element to the appropriate skip
1207 : * distance. Note that when two characters share the same skip table
1208 : * entry, the one later in the needle must determine the skip
1209 : * distance.
1210 : */
1211 8 : last = len2 - 1;
1212 :
1213 8 : if (!state->use_wchar)
1214 : {
1215 0 : const char *str2 = state->str2;
1216 :
1217 0 : for (i = 0; i < last; i++)
1218 0 : state->skiptable[(unsigned char) str2[i] & skiptablemask] = last - i;
1219 : }
1220 : else
1221 : {
1222 8 : const pg_wchar *wstr2 = state->wstr2;
1223 :
1224 39 : for (i = 0; i < last; i++)
1225 31 : state->skiptable[wstr2[i] & skiptablemask] = last - i;
1226 : }
1227 : }
1228 30 : }
1229 :
1230 : static int
1231 85 : text_position_next(int start_pos, TextPositionState *state)
1232 : {
1233 85 : int haystack_len = state->len1;
1234 85 : int needle_len = state->len2;
1235 85 : int skiptablemask = state->skiptablemask;
1236 :
1237 85 : Assert(start_pos > 0); /* else caller error */
1238 :
1239 85 : if (needle_len <= 0)
1240 0 : return start_pos; /* result for empty pattern */
1241 :
1242 85 : start_pos--; /* adjust for zero based arrays */
1243 :
1244 : /* Done if the needle can't possibly fit */
1245 85 : if (haystack_len < start_pos + needle_len)
1246 4 : return 0;
1247 :
1248 81 : if (!state->use_wchar)
1249 : {
1250 : /* simple case - single byte encoding */
1251 0 : const char *haystack = state->str1;
1252 0 : const char *needle = state->str2;
1253 0 : const char *haystack_end = &haystack[haystack_len];
1254 : const char *hptr;
1255 :
1256 0 : if (needle_len == 1)
1257 : {
1258 : /* No point in using B-M-H for a one-character needle */
1259 0 : char nchar = *needle;
1260 :
1261 0 : hptr = &haystack[start_pos];
1262 0 : while (hptr < haystack_end)
1263 : {
1264 0 : if (*hptr == nchar)
1265 0 : return hptr - haystack + 1;
1266 0 : hptr++;
1267 : }
1268 : }
1269 : else
1270 : {
1271 0 : const char *needle_last = &needle[needle_len - 1];
1272 :
1273 : /* Start at startpos plus the length of the needle */
1274 0 : hptr = &haystack[start_pos + needle_len - 1];
1275 0 : while (hptr < haystack_end)
1276 : {
1277 : /* Match the needle scanning *backward* */
1278 : const char *nptr;
1279 : const char *p;
1280 :
1281 0 : nptr = needle_last;
1282 0 : p = hptr;
1283 0 : while (*nptr == *p)
1284 : {
1285 : /* Matched it all? If so, return 1-based position */
1286 0 : if (nptr == needle)
1287 0 : return p - haystack + 1;
1288 0 : nptr--, p--;
1289 : }
1290 :
1291 : /*
1292 : * No match, so use the haystack char at hptr to decide how
1293 : * far to advance. If the needle had any occurrence of that
1294 : * character (or more precisely, one sharing the same
1295 : * skiptable entry) before its last character, then we advance
1296 : * far enough to align the last such needle character with
1297 : * that haystack position. Otherwise we can advance by the
1298 : * whole needle length.
1299 : */
1300 0 : hptr += state->skiptable[(unsigned char) *hptr & skiptablemask];
1301 : }
1302 : }
1303 : }
1304 : else
1305 : {
1306 : /* The multibyte char version. This works exactly the same way. */
1307 81 : const pg_wchar *haystack = state->wstr1;
1308 81 : const pg_wchar *needle = state->wstr2;
1309 81 : const pg_wchar *haystack_end = &haystack[haystack_len];
1310 : const pg_wchar *hptr;
1311 :
1312 81 : if (needle_len == 1)
1313 : {
1314 : /* No point in using B-M-H for a one-character needle */
1315 70 : pg_wchar nchar = *needle;
1316 :
1317 70 : hptr = &haystack[start_pos];
1318 778 : while (hptr < haystack_end)
1319 : {
1320 697 : if (*hptr == nchar)
1321 59 : return hptr - haystack + 1;
1322 638 : hptr++;
1323 : }
1324 : }
1325 : else
1326 : {
1327 11 : const pg_wchar *needle_last = &needle[needle_len - 1];
1328 :
1329 : /* Start at startpos plus the length of the needle */
1330 11 : hptr = &haystack[start_pos + needle_len - 1];
1331 105 : while (hptr < haystack_end)
1332 : {
1333 : /* Match the needle scanning *backward* */
1334 : const pg_wchar *nptr;
1335 : const pg_wchar *p;
1336 :
1337 90 : nptr = needle_last;
1338 90 : p = hptr;
1339 196 : while (*nptr == *p)
1340 : {
1341 : /* Matched it all? If so, return 1-based position */
1342 23 : if (nptr == needle)
1343 7 : return p - haystack + 1;
1344 16 : nptr--, p--;
1345 : }
1346 :
1347 : /*
1348 : * No match, so use the haystack char at hptr to decide how
1349 : * far to advance. If the needle had any occurrence of that
1350 : * character (or more precisely, one sharing the same
1351 : * skiptable entry) before its last character, then we advance
1352 : * far enough to align the last such needle character with
1353 : * that haystack position. Otherwise we can advance by the
1354 : * whole needle length.
1355 : */
1356 83 : hptr += state->skiptable[*hptr & skiptablemask];
1357 : }
1358 : }
1359 : }
1360 :
1361 15 : return 0; /* not found */
1362 : }
1363 :
1364 : static void
1365 30 : text_position_cleanup(TextPositionState *state)
1366 : {
1367 30 : if (state->use_wchar)
1368 : {
1369 30 : pfree(state->wstr1);
1370 30 : pfree(state->wstr2);
1371 : }
1372 30 : }
1373 :
/* varstr_cmp()
 * Comparison function for text strings with given lengths.
 * Includes locale support, but must copy strings to temporary memory
 *	to allow null-termination for inputs to strcoll().
 * Returns an integer less than, equal to, or greater than zero, indicating
 * whether arg1 is less than, equal to, or greater than arg2.
 *
 * Parameters:
 *	arg1, len1 - first string and its length in bytes (need not be
 *		null-terminated)
 *	arg2, len2 - second string and its length in bytes (likewise)
 *	collid - OID of the collation to compare under; an invalid OID raises
 *		ERRCODE_INDETERMINATE_COLLATION unless the collation is treated as C
 */
int
varstr_cmp(char *arg1, int len1, char *arg2, int len2, Oid collid)
{
	int			result;

	/*
	 * Unfortunately, there is no strncoll(), so in the non-C locale case we
	 * have to do some memory copying.  This turns out to be significantly
	 * slower, so we optimize the case where LC_COLLATE is C.  We also try to
	 * optimize relatively-short strings by avoiding palloc/pfree overhead.
	 */
	if (lc_collate_is_c(collid))
	{
		/* Bytewise comparison; on a common prefix the shorter string wins */
		result = memcmp(arg1, arg2, Min(len1, len2));
		if ((result == 0) && (len1 != len2))
			result = (len1 < len2) ? -1 : 1;
	}
	else
	{
		/* Stack buffers used for the null-terminated copies when they fit */
		char		a1buf[TEXTBUFLEN];
		char		a2buf[TEXTBUFLEN];
		char	   *a1p,
				   *a2p;

		/* Stays 0 for the default collation, selecting plain strcoll() */
		pg_locale_t mylocale = 0;

		if (collid != DEFAULT_COLLATION_OID)
		{
			if (!OidIsValid(collid))
			{
				/*
				 * This typically means that the parser could not resolve a
				 * conflict of implicit collations, so report it that way.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INDETERMINATE_COLLATION),
						 errmsg("could not determine which collation to use for string comparison"),
						 errhint("Use the COLLATE clause to set the collation explicitly.")));
			}
			mylocale = pg_newlocale_from_collation(collid);
		}

		/*
		 * memcmp() can't tell us which of two unequal strings sorts first,
		 * but it's a cheap way to tell if they're equal.  Testing shows that
		 * memcmp() followed by strcoll() is only trivially slower than
		 * strcoll() by itself, so we don't lose much if this doesn't work out
		 * very often, and if it does - for example, because there are many
		 * equal strings in the input - then we win big by avoiding expensive
		 * collation-aware comparisons.
		 */
		if (len1 == len2 && memcmp(arg1, arg2, len1) == 0)
			return 0;

#ifdef WIN32
		/* Win32 does not have UTF-8, so we need to map to UTF-16 */
		if (GetDatabaseEncoding() == PG_UTF8
			&& (!mylocale || mylocale->provider == COLLPROVIDER_LIBC))
		{
			int			a1len;
			int			a2len;
			int			r;

			/*
			 * Size each conversion buffer at two bytes per input byte plus a
			 * two-byte terminator; use the stack buffer when that fits.
			 */
			if (len1 >= TEXTBUFLEN / 2)
			{
				a1len = len1 * 2 + 2;
				a1p = palloc(a1len);
			}
			else
			{
				a1len = TEXTBUFLEN;
				a1p = a1buf;
			}
			if (len2 >= TEXTBUFLEN / 2)
			{
				a2len = len2 * 2 + 2;
				a2p = palloc(a2len);
			}
			else
			{
				a2len = TEXTBUFLEN;
				a2p = a2buf;
			}

			/* MultiByteToWideChar() does not work for zero-length input */
			if (len1 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg1, len1,
										(LPWSTR) a1p, a1len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			/* r is the number of wide characters written; terminate there */
			((LPWSTR) a1p)[r] = 0;

			if (len2 == 0)
				r = 0;
			else
			{
				r = MultiByteToWideChar(CP_UTF8, 0, arg2, len2,
										(LPWSTR) a2p, a2len / 2);
				if (!r)
					ereport(ERROR,
							(errmsg("could not convert string to UTF-16: error code %lu",
									GetLastError())));
			}
			((LPWSTR) a2p)[r] = 0;

			/* Clear errno so the %m in the error report below is meaningful */
			errno = 0;
#ifdef HAVE_LOCALE_T
			if (mylocale)
				result = wcscoll_l((LPWSTR) a1p, (LPWSTR) a2p, mylocale->info.lt);
			else
#endif
				result = wcscoll((LPWSTR) a1p, (LPWSTR) a2p);
			if (result == 2147483647)	/* _NLSCMPERROR; missing from mingw
										 * headers */
				ereport(ERROR,
						(errmsg("could not compare Unicode strings: %m")));

			/*
			 * In some locales wcscoll() can claim that nonidentical strings
			 * are equal.  Believing that would be bad news for a number of
			 * reasons, so we follow Perl's lead and sort "equal" strings
			 * according to strcmp (on the UTF-8 representation).
			 */
			if (result == 0)
			{
				result = memcmp(arg1, arg2, Min(len1, len2));
				if ((result == 0) && (len1 != len2))
					result = (len1 < len2) ? -1 : 1;
			}

			/* Free only the buffers that were palloc'd */
			if (a1p != a1buf)
				pfree(a1p);
			if (a2p != a2buf)
				pfree(a2p);

			return result;
		}
#endif							/* WIN32 */

		/*
		 * Make null-terminated copies.  The tests use >= because one extra
		 * byte is needed for the terminator.
		 */
		if (len1 >= TEXTBUFLEN)
			a1p = (char *) palloc(len1 + 1);
		else
			a1p = a1buf;
		if (len2 >= TEXTBUFLEN)
			a2p = (char *) palloc(len2 + 1);
		else
			a2p = a2buf;

		memcpy(a1p, arg1, len1);
		a1p[len1] = '\0';
		memcpy(a2p, arg2, len2);
		a2p[len2] = '\0';

		if (mylocale)
		{
			if (mylocale->provider == COLLPROVIDER_ICU)
			{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
				if (GetDatabaseEncoding() == PG_UTF8)
				{
					UErrorCode	status;

					/* Compare the original strings, passing explicit lengths */
					status = U_ZERO_ERROR;
					result = ucol_strcollUTF8(mylocale->info.icu.ucol,
											  arg1, len1,
											  arg2, len2,
											  &status);
					if (U_FAILURE(status))
						ereport(ERROR,
								(errmsg("collation failed: %s", u_errorName(status))));
				}
				else
#endif
				{
					int32_t		ulen1,
								ulen2;
					UChar	   *uchar1,
							   *uchar2;

					/* Convert both inputs to UChar strings, then compare */
					ulen1 = icu_to_uchar(&uchar1, arg1, len1);
					ulen2 = icu_to_uchar(&uchar2, arg2, len2);

					result = ucol_strcoll(mylocale->info.icu.ucol,
										  uchar1, ulen1,
										  uchar2, ulen2);

					pfree(uchar1);
					pfree(uchar2);
				}
#else							/* not USE_ICU */
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif							/* not USE_ICU */
			}
			else
			{
#ifdef HAVE_LOCALE_T
				result = strcoll_l(a1p, a2p, mylocale->info.lt);
#else
				/* shouldn't happen */
				elog(ERROR, "unsupported collprovider: %c", mylocale->provider);
#endif
			}
		}
		else
			result = strcoll(a1p, a2p);

		/*
		 * In some locales strcoll() can claim that nonidentical strings are
		 * equal.  Believing that would be bad news for a number of reasons,
		 * so we follow Perl's lead and sort "equal" strings according to
		 * strcmp().
		 */
		if (result == 0)
			result = strcmp(a1p, a2p);

		/* Free only the copies that were palloc'd */
		if (a1p != a1buf)
			pfree(a1p);
		if (a2p != a2buf)
			pfree(a2p);
	}

	return result;
}
1611 :
/* text_cmp()
 * Internal comparison function for text strings.
 * Returns an integer less than, equal to, or greater than zero (whatever
 * varstr_cmp() returns); the value is not necessarily exactly -1, 0 or 1.
 */
1616 : static int
1617 169795 : text_cmp(text *arg1, text *arg2, Oid collid)
1618 : {
1619 : char *a1p,
1620 : *a2p;
1621 : int len1,
1622 : len2;
1623 :
1624 169795 : a1p = VARDATA_ANY(arg1);
1625 169795 : a2p = VARDATA_ANY(arg2);
1626 :
1627 169795 : len1 = VARSIZE_ANY_EXHDR(arg1);
1628 169795 : len2 = VARSIZE_ANY_EXHDR(arg2);
1629 :
1630 169795 : return varstr_cmp(a1p, len1, a2p, len2, collid);
1631 : }
1632 :
1633 : /*
1634 : * Comparison functions for text strings.
1635 : *
1636 : * Note: btree indexes need these routines not to leak memory; therefore,
1637 : * be careful to free working copies of toasted datums. Most places don't
1638 : * need to be so careful.
1639 : */
1640 :
1641 : Datum
1642 85250 : texteq(PG_FUNCTION_ARGS)
1643 : {
1644 85250 : Datum arg1 = PG_GETARG_DATUM(0);
1645 85250 : Datum arg2 = PG_GETARG_DATUM(1);
1646 : bool result;
1647 : Size len1,
1648 : len2;
1649 :
1650 : /*
1651 : * Since we only care about equality or not-equality, we can avoid all the
1652 : * expense of strcoll() here, and just do bitwise comparison. In fact, we
1653 : * don't even have to do a bitwise comparison if we can show the lengths
1654 : * of the strings are unequal; which might save us from having to detoast
1655 : * one or both values.
1656 : */
1657 85250 : len1 = toast_raw_datum_size(arg1);
1658 85250 : len2 = toast_raw_datum_size(arg2);
1659 85250 : if (len1 != len2)
1660 38592 : result = false;
1661 : else
1662 : {
1663 46658 : text *targ1 = DatumGetTextPP(arg1);
1664 46658 : text *targ2 = DatumGetTextPP(arg2);
1665 :
1666 93316 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1667 46658 : len1 - VARHDRSZ) == 0);
1668 :
1669 46658 : PG_FREE_IF_COPY(targ1, 0);
1670 46658 : PG_FREE_IF_COPY(targ2, 1);
1671 : }
1672 :
1673 85250 : PG_RETURN_BOOL(result);
1674 : }
1675 :
1676 : Datum
1677 842 : textne(PG_FUNCTION_ARGS)
1678 : {
1679 842 : Datum arg1 = PG_GETARG_DATUM(0);
1680 842 : Datum arg2 = PG_GETARG_DATUM(1);
1681 : bool result;
1682 : Size len1,
1683 : len2;
1684 :
1685 : /* See comment in texteq() */
1686 842 : len1 = toast_raw_datum_size(arg1);
1687 842 : len2 = toast_raw_datum_size(arg2);
1688 842 : if (len1 != len2)
1689 66 : result = true;
1690 : else
1691 : {
1692 776 : text *targ1 = DatumGetTextPP(arg1);
1693 776 : text *targ2 = DatumGetTextPP(arg2);
1694 :
1695 1552 : result = (memcmp(VARDATA_ANY(targ1), VARDATA_ANY(targ2),
1696 776 : len1 - VARHDRSZ) != 0);
1697 :
1698 776 : PG_FREE_IF_COPY(targ1, 0);
1699 776 : PG_FREE_IF_COPY(targ2, 1);
1700 : }
1701 :
1702 842 : PG_RETURN_BOOL(result);
1703 : }
1704 :
1705 : Datum
1706 12923 : text_lt(PG_FUNCTION_ARGS)
1707 : {
1708 12923 : text *arg1 = PG_GETARG_TEXT_PP(0);
1709 12923 : text *arg2 = PG_GETARG_TEXT_PP(1);
1710 : bool result;
1711 :
1712 12923 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0);
1713 :
1714 12922 : PG_FREE_IF_COPY(arg1, 0);
1715 12922 : PG_FREE_IF_COPY(arg2, 1);
1716 :
1717 12922 : PG_RETURN_BOOL(result);
1718 : }
1719 :
1720 : Datum
1721 10326 : text_le(PG_FUNCTION_ARGS)
1722 : {
1723 10326 : text *arg1 = PG_GETARG_TEXT_PP(0);
1724 10326 : text *arg2 = PG_GETARG_TEXT_PP(1);
1725 : bool result;
1726 :
1727 10326 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) <= 0);
1728 :
1729 10326 : PG_FREE_IF_COPY(arg1, 0);
1730 10326 : PG_FREE_IF_COPY(arg2, 1);
1731 :
1732 10326 : PG_RETURN_BOOL(result);
1733 : }
1734 :
1735 : Datum
1736 6770 : text_gt(PG_FUNCTION_ARGS)
1737 : {
1738 6770 : text *arg1 = PG_GETARG_TEXT_PP(0);
1739 6770 : text *arg2 = PG_GETARG_TEXT_PP(1);
1740 : bool result;
1741 :
1742 6770 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0);
1743 :
1744 6770 : PG_FREE_IF_COPY(arg1, 0);
1745 6770 : PG_FREE_IF_COPY(arg2, 1);
1746 :
1747 6770 : PG_RETURN_BOOL(result);
1748 : }
1749 :
1750 : Datum
1751 8135 : text_ge(PG_FUNCTION_ARGS)
1752 : {
1753 8135 : text *arg1 = PG_GETARG_TEXT_PP(0);
1754 8135 : text *arg2 = PG_GETARG_TEXT_PP(1);
1755 : bool result;
1756 :
1757 8135 : result = (text_cmp(arg1, arg2, PG_GET_COLLATION()) >= 0);
1758 :
1759 8135 : PG_FREE_IF_COPY(arg1, 0);
1760 8135 : PG_FREE_IF_COPY(arg2, 1);
1761 :
1762 8135 : PG_RETURN_BOOL(result);
1763 : }
1764 :
1765 : Datum
1766 131628 : bttextcmp(PG_FUNCTION_ARGS)
1767 : {
1768 131628 : text *arg1 = PG_GETARG_TEXT_PP(0);
1769 131628 : text *arg2 = PG_GETARG_TEXT_PP(1);
1770 : int32 result;
1771 :
1772 131628 : result = text_cmp(arg1, arg2, PG_GET_COLLATION());
1773 :
1774 131628 : PG_FREE_IF_COPY(arg1, 0);
1775 131628 : PG_FREE_IF_COPY(arg2, 1);
1776 :
1777 131628 : PG_RETURN_INT32(result);
1778 : }
1779 :
1780 : Datum
1781 971 : bttextsortsupport(PG_FUNCTION_ARGS)
1782 : {
1783 971 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
1784 971 : Oid collid = ssup->ssup_collation;
1785 : MemoryContext oldcontext;
1786 :
1787 971 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
1788 :
1789 : /* Use generic string SortSupport */
1790 971 : varstr_sortsupport(ssup, collid, false);
1791 :
1792 970 : MemoryContextSwitchTo(oldcontext);
1793 :
1794 970 : PG_RETURN_VOID();
1795 : }
1796 :
/*
 * Generic sortsupport interface for character type's operator classes.
 * Includes locale support, and support for BpChar semantics (i.e. removing
 * trailing spaces before comparison).
 *
 * Relies on the assumption that text, VarChar, BpChar, and bytea all have the
 * same representation.  Callers that always use the C collation (e.g.
 * non-collatable type callers like bytea) may have NUL bytes in their strings;
 * this will not work with any other collation, though.
 *
 * Parameters:
 *	ssup - sort support state to fill in.  ssup->abbreviate is read; on
 *		return ssup->comparator is set (except in the Windows UTF-8 non-C
 *		case, where we return without touching it), and ssup->ssup_extra
 *		plus the abbreviation callbacks may be set as well.
 *	collid - OID of the collation to sort under
 *	bpchar - true to request BpChar comparison semantics
 *
 * The working state is palloc'd in the current memory context.
 */
void
varstr_sortsupport(SortSupport ssup, Oid collid, bool bpchar)
{
	bool		abbreviate = ssup->abbreviate;
	bool		collate_c = false;
	VarStringSortSupport *sss;
	pg_locale_t locale = 0;

	/*
	 * If possible, set ssup->comparator to a function which can be used to
	 * directly compare two datums.  If we can do this, we'll avoid the
	 * overhead of a trip through the fmgr layer for every comparison, which
	 * can be substantial.
	 *
	 * Most typically, we'll set the comparator to varstrfastcmp_locale, which
	 * uses strcoll() to perform comparisons and knows about the special
	 * requirements of BpChar callers.  However, if LC_COLLATE = C, we can
	 * make things quite a bit faster with varstrfastcmp_c or bpcharfastcmp_c,
	 * both of which use memcmp() rather than strcoll().
	 *
	 * There is a further exception on Windows.  When the database encoding is
	 * UTF-8 and we are not using the C collation, complex hacks are required.
	 * We don't currently have a comparator that handles that case, so we fall
	 * back on the slow method of having the sort code invoke bttextcmp() (in
	 * the case of text) via the fmgr trampoline.
	 */
	if (lc_collate_is_c(collid))
	{
		if (!bpchar)
			ssup->comparator = varstrfastcmp_c;
		else
			ssup->comparator = bpcharfastcmp_c;

		collate_c = true;
	}
#ifdef WIN32
	else if (GetDatabaseEncoding() == PG_UTF8)
		return;					/* leave ssup->comparator unset */
#endif
	else
	{
		ssup->comparator = varstrfastcmp_locale;

		/*
		 * We need a collation-sensitive comparison.  To make things faster,
		 * we'll figure out the collation based on the locale id and cache the
		 * result.
		 */
		if (collid != DEFAULT_COLLATION_OID)
		{
			if (!OidIsValid(collid))
			{
				/*
				 * This typically means that the parser could not resolve a
				 * conflict of implicit collations, so report it that way.
				 */
				ereport(ERROR,
						(errcode(ERRCODE_INDETERMINATE_COLLATION),
						 errmsg("could not determine which collation to use for string comparison"),
						 errhint("Use the COLLATE clause to set the collation explicitly.")));
			}
			locale = pg_newlocale_from_collation(collid);
		}
	}

	/*
	 * Unfortunately, it seems that abbreviation for non-C collations is
	 * broken on many common platforms; testing of multiple versions of glibc
	 * reveals that, for many locales, strcoll() and strxfrm() do not return
	 * consistent results, which is fatal to this optimization.  While no
	 * other libc other than Cygwin has so far been shown to have a problem,
	 * we take the conservative course of action for right now and disable
	 * this categorically.  (Users who are certain this isn't a problem on
	 * their system can define TRUST_STRXFRM.)
	 *
	 * Even apart from the risk of broken locales, it's possible that there
	 * are platforms where the use of abbreviated keys should be disabled at
	 * compile time.  Having only 4 byte datums could make worst-case
	 * performance drastically more likely, for example.  Moreover, macOS's
	 * strxfrm() implementation is known to not effectively concentrate a
	 * significant amount of entropy from the original string in earlier
	 * transformed blobs.  It's possible that other supported platforms are
	 * similarly encumbered.  So, if we ever get past disabling this
	 * categorically, we may still want or need to disable it for particular
	 * platforms.
	 */
#ifndef TRUST_STRXFRM
	/* Abbreviation survives only for the C collation and for ICU locales */
	if (!collate_c && !(locale && locale->provider == COLLPROVIDER_ICU))
		abbreviate = false;
#endif

	/*
	 * If we're using abbreviated keys, or if we're using a locale-aware
	 * comparison, we need to initialize a StringSortSupport object.  Both
	 * cases will make use of the temporary buffers we initialize here for
	 * scratch space (and to detect requirement for BpChar semantics from
	 * caller), and the abbreviation case requires additional state.
	 */
	if (abbreviate || !collate_c)
	{
		sss = palloc(sizeof(VarStringSortSupport));
		sss->buf1 = palloc(TEXTBUFLEN);
		sss->buflen1 = TEXTBUFLEN;
		sss->buf2 = palloc(TEXTBUFLEN);
		sss->buflen2 = TEXTBUFLEN;
		/* Start with invalid values */
		sss->last_len1 = -1;
		sss->last_len2 = -1;
		/* Initialize */
		sss->last_returned = 0;
		sss->locale = locale;

		/*
		 * To avoid somehow confusing a strxfrm() blob and an original string,
		 * constantly keep track of the variety of data that buf1 and buf2
		 * currently contain.
		 *
		 * Comparisons may be interleaved with conversion calls.  Frequently,
		 * conversions and comparisons are batched into two distinct phases,
		 * but the correctness of caching cannot hinge upon this.  For
		 * comparison caching, buffer state is only trusted if cache_blob is
		 * found set to false, whereas strxfrm() caching only trusts the state
		 * when cache_blob is found set to true.
		 *
		 * Arbitrarily initialize cache_blob to true.
		 */
		sss->cache_blob = true;
		sss->collate_c = collate_c;
		sss->bpchar = bpchar;
		ssup->ssup_extra = sss;

		/*
		 * If possible, plan to use the abbreviated keys optimization.  The
		 * core code may switch back to authoritative comparator should
		 * abbreviation be aborted.
		 */
		if (abbreviate)
		{
			sss->prop_card = 0.20;
			initHyperLogLog(&sss->abbr_card, 10);
			initHyperLogLog(&sss->full_card, 10);
			/* Keep the comparator chosen above as the authoritative one */
			ssup->abbrev_full_comparator = ssup->comparator;
			ssup->comparator = varstrcmp_abbrev;
			ssup->abbrev_converter = varstr_abbrev_convert;
			ssup->abbrev_abort = varstr_abbrev_abort;
		}
	}
}
1955 :
1956 : /*
1957 : * sortsupport comparison func (for C locale case)
1958 : */
1959 : static int
1960 3367 : varstrfastcmp_c(Datum x, Datum y, SortSupport ssup)
1961 : {
1962 3367 : VarString *arg1 = DatumGetVarStringPP(x);
1963 3367 : VarString *arg2 = DatumGetVarStringPP(y);
1964 : char *a1p,
1965 : *a2p;
1966 : int len1,
1967 : len2,
1968 : result;
1969 :
1970 3367 : a1p = VARDATA_ANY(arg1);
1971 3367 : a2p = VARDATA_ANY(arg2);
1972 :
1973 3367 : len1 = VARSIZE_ANY_EXHDR(arg1);
1974 3367 : len2 = VARSIZE_ANY_EXHDR(arg2);
1975 :
1976 3367 : result = memcmp(a1p, a2p, Min(len1, len2));
1977 3367 : if ((result == 0) && (len1 != len2))
1978 0 : result = (len1 < len2) ? -1 : 1;
1979 :
1980 : /* We can't afford to leak memory here. */
1981 3367 : if (PointerGetDatum(arg1) != x)
1982 0 : pfree(arg1);
1983 3367 : if (PointerGetDatum(arg2) != y)
1984 0 : pfree(arg2);
1985 :
1986 3367 : return result;
1987 : }
1988 :
1989 : /*
1990 : * sortsupport comparison func (for BpChar C locale case)
1991 : *
1992 : * BpChar outsources its sortsupport to this module. Specialization for the
1993 : * varstr_sortsupport BpChar case, modeled on
1994 : * internal_bpchar_pattern_compare().
1995 : */
1996 : static int
1997 0 : bpcharfastcmp_c(Datum x, Datum y, SortSupport ssup)
1998 : {
1999 0 : BpChar *arg1 = DatumGetBpCharPP(x);
2000 0 : BpChar *arg2 = DatumGetBpCharPP(y);
2001 : char *a1p,
2002 : *a2p;
2003 : int len1,
2004 : len2,
2005 : result;
2006 :
2007 0 : a1p = VARDATA_ANY(arg1);
2008 0 : a2p = VARDATA_ANY(arg2);
2009 :
2010 0 : len1 = bpchartruelen(a1p, VARSIZE_ANY_EXHDR(arg1));
2011 0 : len2 = bpchartruelen(a2p, VARSIZE_ANY_EXHDR(arg2));
2012 :
2013 0 : result = memcmp(a1p, a2p, Min(len1, len2));
2014 0 : if ((result == 0) && (len1 != len2))
2015 0 : result = (len1 < len2) ? -1 : 1;
2016 :
2017 : /* We can't afford to leak memory here. */
2018 0 : if (PointerGetDatum(arg1) != x)
2019 0 : pfree(arg1);
2020 0 : if (PointerGetDatum(arg2) != y)
2021 0 : pfree(arg2);
2022 :
2023 0 : return result;
2024 : }
2025 :
/*
 * sortsupport comparison func (for locale case)
 *
 * Compares the varlena values x and y using the collation state kept in
 * ssup->ssup_extra (a VarStringSortSupport): sss->locale selects an ICU or
 * libc locale when set, otherwise plain strcoll() is used.  The result is
 * whatever the collation routine returns (negative, zero or positive), with
 * collation-level ties broken by strcmp().
 *
 * Two layers of caching avoid collation calls:
 *	- bytewise-identical inputs are reported equal immediately;
 *	- NUL-terminated copies of the last inputs are kept in sss->buf1 and
 *	  sss->buf2 (with their lengths in last_len1/last_len2), and if both
 *	  inputs match those copies the previous result, sss->last_returned, is
 *	  reused.
 *
 * Any detoasted copies of the inputs are freed before returning.
 */
static int
varstrfastcmp_locale(Datum x, Datum y, SortSupport ssup)
{
	VarString  *arg1 = DatumGetVarStringPP(x);
	VarString  *arg2 = DatumGetVarStringPP(y);
	bool		arg1_match;
	VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;

	/* working state */
	char	   *a1p,
			   *a2p;
	int			len1,
				len2,
				result;

	a1p = VARDATA_ANY(arg1);
	a2p = VARDATA_ANY(arg2);

	len1 = VARSIZE_ANY_EXHDR(arg1);
	len2 = VARSIZE_ANY_EXHDR(arg2);

	/* Fast pre-check for equality, as discussed in varstr_cmp() */
	if (len1 == len2 && memcmp(a1p, a2p, len1) == 0)
	{
		/*
		 * No change in buf1 or buf2 contents, so avoid changing last_len1 or
		 * last_len2.  Existing contents of buffers might still be used by
		 * next call.
		 *
		 * It's fine to allow the comparison of BpChar padding bytes here,
		 * even though that implies that the memcmp() will usually be
		 * performed for BpChar callers (though multibyte characters could
		 * still prevent that from occurring).  The memcmp() is still very
		 * cheap, and BpChar's funny semantics have us remove trailing spaces
		 * (not limited to padding), so we need make no distinction between
		 * padding space characters and "real" space characters.
		 */
		result = 0;
		goto done;
	}

	if (sss->bpchar)
	{
		/* Get true number of bytes, ignoring trailing spaces */
		len1 = bpchartruelen(a1p, len1);
		len2 = bpchartruelen(a2p, len2);
	}

	/*
	 * Make sure each buffer can hold its string plus a terminating NUL.  The
	 * old contents are discarded; replacements are allocated in
	 * ssup->ssup_cxt rather than the current memory context.
	 */
	if (len1 >= sss->buflen1)
	{
		pfree(sss->buf1);
		sss->buflen1 = Max(len1 + 1, Min(sss->buflen1 * 2, MaxAllocSize));
		sss->buf1 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen1);
	}
	if (len2 >= sss->buflen2)
	{
		pfree(sss->buf2);
		sss->buflen2 = Max(len2 + 1, Min(sss->buflen2 * 2, MaxAllocSize));
		sss->buf2 = MemoryContextAlloc(ssup->ssup_cxt, sss->buflen2);
	}

	/*
	 * We're likely to be asked to compare the same strings repeatedly, and
	 * memcmp() is so much cheaper than strcoll() that it pays to try to cache
	 * comparisons, even though in general there is no reason to think that
	 * that will work out (every string datum may be unique).  Caching does
	 * not slow things down measurably when it doesn't work out, and can speed
	 * things up by rather a lot when it does.  In part, this is because the
	 * memcmp() compares data from cachelines that are needed in L1 cache even
	 * when the last comparison's result cannot be reused.
	 */
	arg1_match = true;
	if (len1 != sss->last_len1 || memcmp(sss->buf1, a1p, len1) != 0)
	{
		/* buf1 is stale: refresh it with a NUL-terminated copy of arg1 */
		arg1_match = false;
		memcpy(sss->buf1, a1p, len1);
		sss->buf1[len1] = '\0';
		sss->last_len1 = len1;
	}

	/*
	 * If we're comparing the same two strings as last time, we can return the
	 * same answer without calling strcoll() again.  This is more likely than
	 * it seems (at least with moderate to low cardinality sets), because
	 * quicksort compares the same pivot against many values.
	 */
	if (len2 != sss->last_len2 || memcmp(sss->buf2, a2p, len2) != 0)
	{
		/* buf2 is stale: refresh it with a NUL-terminated copy of arg2 */
		memcpy(sss->buf2, a2p, len2);
		sss->buf2[len2] = '\0';
		sss->last_len2 = len2;
	}
	else if (arg1_match && !sss->cache_blob)
	{
		/*
		 * Use result cached following last actual strcoll() call.  The
		 * cache_blob test excludes the case where buf2 was last filled by
		 * varstr_abbrev_convert() with a transformed blob, in which case
		 * last_returned does not describe these buffers.
		 */
		result = sss->last_returned;
		goto done;
	}

	if (sss->locale)
	{
		if (sss->locale->provider == COLLPROVIDER_ICU)
		{
#ifdef USE_ICU
#ifdef HAVE_UCOL_STRCOLLUTF8
			if (GetDatabaseEncoding() == PG_UTF8)
			{
				UErrorCode	status;

				status = U_ZERO_ERROR;
				result = ucol_strcollUTF8(sss->locale->info.icu.ucol,
										  a1p, len1,
										  a2p, len2,
										  &status);
				if (U_FAILURE(status))
					ereport(ERROR,
							(errmsg("collation failed: %s", u_errorName(status))));
			}
			else
#endif
			{
				/* Convert both inputs to UChar strings and compare those */
				int32_t		ulen1,
							ulen2;
				UChar	   *uchar1,
						   *uchar2;

				ulen1 = icu_to_uchar(&uchar1, a1p, len1);
				ulen2 = icu_to_uchar(&uchar2, a2p, len2);

				result = ucol_strcoll(sss->locale->info.icu.ucol,
									  uchar1, ulen1,
									  uchar2, ulen2);

				pfree(uchar1);
				pfree(uchar2);
			}
#else							/* not USE_ICU */
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif							/* not USE_ICU */
		}
		else
		{
#ifdef HAVE_LOCALE_T
			result = strcoll_l(sss->buf1, sss->buf2, sss->locale->info.lt);
#else
			/* shouldn't happen */
			elog(ERROR, "unsupported collprovider: %c", sss->locale->provider);
#endif
		}
	}
	else
		result = strcoll(sss->buf1, sss->buf2);

	/*
	 * In some locales strcoll() can claim that nonidentical strings are
	 * equal. Believing that would be bad news for a number of reasons, so we
	 * follow Perl's lead and sort "equal" strings according to strcmp().
	 */
	if (result == 0)
		result = strcmp(sss->buf1, sss->buf2);

	/* Cache result, perhaps saving an expensive strcoll() call next time */
	sss->cache_blob = false;
	sss->last_returned = result;
done:
	/* We can't afford to leak memory here. */
	if (PointerGetDatum(arg1) != x)
		pfree(arg1);
	if (PointerGetDatum(arg2) != y)
		pfree(arg2);

	return result;
}
2203 :
2204 : /*
2205 : * Abbreviated key comparison func
2206 : */
2207 : static int
2208 4793 : varstrcmp_abbrev(Datum x, Datum y, SortSupport ssup)
2209 : {
2210 : /*
2211 : * When 0 is returned, the core system will call varstrfastcmp_c()
2212 : * (bpcharfastcmp_c() in BpChar case) or varstrfastcmp_locale(). Even a
2213 : * strcmp() on two non-truncated strxfrm() blobs cannot indicate *equality*
2214 : * authoritatively, for the same reason that there is a strcoll()
2215 : * tie-breaker call to strcmp() in varstr_cmp().
2216 : */
2217 4793 : if (x > y)
2218 832 : return 1;
2219 3961 : else if (x == y)
2220 3335 : return 0;
2221 : else
2222 626 : return -1;
2223 : }
2224 :
2225 : /*
2226 : * Conversion routine for sortsupport. Converts original to abbreviated key
2227 : * representation. Our encoding strategy is simple -- pack the first 8 bytes
2228 : * of a strxfrm() blob into a Datum (on little-endian machines, the 8 bytes are
2229 : * stored in reverse order), and treat it as an unsigned integer. When the "C"
2230 : * locale is used, or in case of bytea, just memcpy() from original instead.
2231 : */
2232 : static Datum
2233 1071 : varstr_abbrev_convert(Datum original, SortSupport ssup)
2234 : {
2235 1071 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2236 1071 : VarString *authoritative = DatumGetVarStringPP(original);
2237 1071 : char *authoritative_data = VARDATA_ANY(authoritative);
2238 :
2239 : /* working state */
2240 : Datum res;
2241 : char *pres;
2242 : int len;
2243 : uint32 hash;
2244 :
2245 1071 : pres = (char *) &res;
2246 : /* memset(), so any non-overwritten bytes are NUL */
2247 1071 : memset(pres, 0, sizeof(Datum));
2248 1071 : len = VARSIZE_ANY_EXHDR(authoritative);
2249 :
2250 : /* Get number of bytes, ignoring trailing spaces */
2251 1071 : if (sss->bpchar)
2252 0 : len = bpchartruelen(authoritative_data, len);
2253 :
2254 : /*
2255 : * If we're using the C collation, use memcpy(), rather than strxfrm(), to
2256 : * abbreviate keys. The full comparator for the C locale is always
2257 : * memcmp(). It would be incorrect to allow bytea callers (callers that
2258 : * always force the C collation -- bytea isn't a collatable type, but this
2259 : * approach is convenient) to use strxfrm(). This is because bytea
2260 : * strings may contain NUL bytes. Besides, this should be faster, too.
2261 : *
2262 : * More generally, it's okay that bytea callers can have NUL bytes in
2263 : * strings because varstrcmp_abbrev() need not make a distinction between
2264 : * terminating NUL bytes, and NUL bytes representing actual NULs in the
2265 : * authoritative representation. Hopefully a comparison at or past one
2266 : * abbreviated key's terminating NUL byte will resolve the comparison
2267 : * without consulting the authoritative representation; specifically, some
2268 : * later non-NUL byte in the longer string can resolve the comparison
2269 : * against a subsequent terminating NUL in the shorter string. There will
2270 : * usually be what is effectively a "length-wise" resolution there and
2271 : * then.
2272 : *
2273 : * If that doesn't work out -- if all bytes in the longer string
2274 : * positioned at or past the offset of the smaller string's (first)
2275 : * terminating NUL are actually representative of NUL bytes in the
2276 : * authoritative binary string (perhaps with some *terminating* NUL bytes
2277 : * towards the end of the longer string iff it happens to still be small)
2278 : * -- then an authoritative tie-breaker will happen, and do the right
2279 : * thing: explicitly consider string length.
2280 : */
2281 1071 : if (sss->collate_c)
2282 1071 : memcpy(pres, authoritative_data, Min(len, sizeof(Datum)));
2283 : else
2284 : {
2285 : Size bsize;
2286 : #ifdef USE_ICU
2287 : int32_t ulen = -1;
2288 : UChar *uchar = NULL;
2289 : #endif
2290 :
2291 : /*
2292 : * We're not using the C collation, so fall back on strxfrm or ICU
2293 : * analogs.
2294 : */
2295 :
2296 : /* By convention, we use buffer 1 to store and NUL-terminate */
2297 0 : if (len >= sss->buflen1)
2298 : {
2299 0 : pfree(sss->buf1);
2300 0 : sss->buflen1 = Max(len + 1, Min(sss->buflen1 * 2, MaxAllocSize));
2301 0 : sss->buf1 = palloc(sss->buflen1);
2302 : }
2303 :
2304 : /* Might be able to reuse strxfrm() blob from last call */
2305 0 : if (sss->last_len1 == len && sss->cache_blob &&
2306 0 : memcmp(sss->buf1, authoritative_data, len) == 0)
2307 : {
2308 0 : memcpy(pres, sss->buf2, Min(sizeof(Datum), sss->last_len2));
2309 : /* No change affecting cardinality, so no hashing required */
2310 0 : goto done;
2311 : }
2312 :
2313 0 : memcpy(sss->buf1, authoritative_data, len);
2314 :
2315 : /*
2316 : * Just like strcoll(), strxfrm() expects a NUL-terminated string. Not
2317 : * necessary for ICU, but doesn't hurt.
2318 : */
2319 0 : sss->buf1[len] = '\0';
2320 0 : sss->last_len1 = len;
2321 :
2322 : #ifdef USE_ICU
2323 : /* When using ICU and not UTF8, convert string to UChar. */
2324 : if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU &&
2325 : GetDatabaseEncoding() != PG_UTF8)
2326 : ulen = icu_to_uchar(&uchar, sss->buf1, len);
2327 : #endif
2328 :
2329 : /*
2330 : * Loop: Call strxfrm() or ucol_getSortKey(), possibly enlarge buffer,
2331 : * and try again. Both of these functions have the result buffer
2332 : * content undefined if the result did not fit, so we need to retry
2333 : * until everything fits, even though we only need the first few bytes
2334 : * in the end. When using ucol_nextSortKeyPart(), however, we only
2335 : * ask for as many bytes as we actually need.
2336 : */
2337 : for (;;)
2338 : {
2339 : #ifdef USE_ICU
2340 : if (sss->locale && sss->locale->provider == COLLPROVIDER_ICU)
2341 : {
2342 : /*
2343 : * When using UTF8, use the iteration interface so we only
2344 : * need to produce as many bytes as we actually need.
2345 : */
2346 : if (GetDatabaseEncoding() == PG_UTF8)
2347 : {
2348 : UCharIterator iter;
2349 : uint32_t state[2];
2350 : UErrorCode status;
2351 :
2352 : uiter_setUTF8(&iter, sss->buf1, len);
2353 : state[0] = state[1] = 0; /* won't need that again */
2354 : status = U_ZERO_ERROR;
2355 : bsize = ucol_nextSortKeyPart(sss->locale->info.icu.ucol,
2356 : &iter,
2357 : state,
2358 : (uint8_t *) sss->buf2,
2359 : Min(sizeof(Datum), sss->buflen2),
2360 : &status);
2361 : if (U_FAILURE(status))
2362 : ereport(ERROR,
2363 : (errmsg("sort key generation failed: %s",
2364 : u_errorName(status))));
2365 : }
2366 : else
2367 : bsize = ucol_getSortKey(sss->locale->info.icu.ucol,
2368 : uchar, ulen,
2369 : (uint8_t *) sss->buf2, sss->buflen2);
2370 : }
2371 : else
2372 : #endif
2373 : #ifdef HAVE_LOCALE_T
2374 0 : if (sss->locale && sss->locale->provider == COLLPROVIDER_LIBC)
2375 0 : bsize = strxfrm_l(sss->buf2, sss->buf1,
2376 0 : sss->buflen2, sss->locale->info.lt);
2377 : else
2378 : #endif
2379 0 : bsize = strxfrm(sss->buf2, sss->buf1, sss->buflen2);
2380 :
2381 0 : sss->last_len2 = bsize;
2382 0 : if (bsize < sss->buflen2)
2383 0 : break;
2384 :
2385 : /*
2386 : * Grow buffer and retry.
2387 : */
2388 0 : pfree(sss->buf2);
2389 0 : sss->buflen2 = Max(bsize + 1,
2390 : Min(sss->buflen2 * 2, MaxAllocSize));
2391 0 : sss->buf2 = palloc(sss->buflen2);
2392 0 : }
2393 :
2394 : /*
2395 : * Every Datum byte is always compared. This is safe because the
2396 : * strxfrm() blob is itself NUL terminated, leaving no danger of
2397 : * misinterpreting any NUL bytes not intended to be interpreted as
2398 : * logically representing termination.
2399 : *
2400 : * (Actually, even if there were NUL bytes in the blob it would be
2401 : * okay. See remarks on bytea case above.)
2402 : */
2403 0 : memcpy(pres, sss->buf2, Min(sizeof(Datum), bsize));
2404 :
2405 : #ifdef USE_ICU
2406 : if (uchar)
2407 : pfree(uchar);
2408 : #endif
2409 : }
2410 :
2411 : /*
2412 : * Maintain approximate cardinality of both abbreviated keys and original,
2413 : * authoritative keys using HyperLogLog. Used as cheap insurance against
2414 : * the worst case, where we do many string transformations for no saving
2415 : * in full strcoll()-based comparisons. These statistics are used by
2416 : * varstr_abbrev_abort().
2417 : *
2418 : * First, Hash key proper, or a significant fraction of it. Mix in length
2419 : * in order to compensate for cases where differences are past
2420 : * PG_CACHE_LINE_SIZE bytes, so as to limit the overhead of hashing.
2421 : */
2422 1071 : hash = DatumGetUInt32(hash_any((unsigned char *) authoritative_data,
2423 : Min(len, PG_CACHE_LINE_SIZE)));
2424 :
2425 1071 : if (len > PG_CACHE_LINE_SIZE)
2426 0 : hash ^= DatumGetUInt32(hash_uint32((uint32) len));
2427 :
2428 1071 : addHyperLogLog(&sss->full_card, hash);
2429 :
2430 : /* Hash abbreviated key */
2431 : #if SIZEOF_DATUM == 8
2432 : {
2433 : uint32 lohalf,
2434 : hihalf;
2435 :
2436 : lohalf = (uint32) res;
2437 : hihalf = (uint32) (res >> 32);
2438 : hash = DatumGetUInt32(hash_uint32(lohalf ^ hihalf));
2439 : }
2440 : #else /* SIZEOF_DATUM != 8 */
2441 1071 : hash = DatumGetUInt32(hash_uint32((uint32) res));
2442 : #endif
2443 :
2444 1071 : addHyperLogLog(&sss->abbr_card, hash);
2445 :
2446 : /* Cache result, perhaps saving an expensive strxfrm() call next time */
2447 1071 : sss->cache_blob = true;
2448 : done:
2449 :
2450 : /*
2451 : * Byteswap on little-endian machines.
2452 : *
2453 : * This is needed so that varstrcmp_abbrev() (an unsigned integer 3-way
2454 : * comparator) works correctly on all platforms. If we didn't do this,
2455 : * the comparator would have to call memcmp() with a pair of pointers to
2456 : * the first byte of each abbreviated key, which is slower.
2457 : */
2458 1071 : res = DatumBigEndianToNative(res);
2459 :
2460 : /* Don't leak memory here */
2461 1071 : if (PointerGetDatum(authoritative) != original)
2462 0 : pfree(authoritative);
2463 :
2464 1071 : return res;
2465 : }
2466 :
2467 : /*
2468 : * Callback for estimating effectiveness of abbreviated key optimization, using
2469 : * heuristic rules. Returns value indicating if the abbreviation optimization
2470 : * should be aborted, based on its projected effectiveness.
2471 : */
2472 : static bool
2473 7 : varstr_abbrev_abort(int memtupcount, SortSupport ssup)
2474 : {
2475 7 : VarStringSortSupport *sss = (VarStringSortSupport *) ssup->ssup_extra;
2476 : double abbrev_distinct,
2477 : key_distinct;
2478 :
2479 7 : Assert(ssup->abbreviate);
2480 :
2481 : /* Have a little patience */
2482 7 : if (memtupcount < 100)
2483 4 : return false;
2484 :
2485 3 : abbrev_distinct = estimateHyperLogLog(&sss->abbr_card);
2486 3 : key_distinct = estimateHyperLogLog(&sss->full_card);
2487 :
2488 : /*
2489 : * Clamp cardinality estimates to at least one distinct value. While
2490 : * NULLs are generally disregarded, if only NULL values were seen so far,
2491 : * that might misrepresent costs if we failed to clamp.
2492 : */
2493 3 : if (abbrev_distinct <= 1.0)
2494 0 : abbrev_distinct = 1.0;
2495 :
2496 3 : if (key_distinct <= 1.0)
2497 0 : key_distinct = 1.0;
2498 :
2499 : /*
2500 : * In the worst case all abbreviated keys are identical, while at the same
2501 : * time there are differences within full key strings not captured in
2502 : * abbreviations.
2503 : */
2504 : #ifdef TRACE_SORT
2505 3 : if (trace_sort)
2506 : {
2507 0 : double norm_abbrev_card = abbrev_distinct / (double) memtupcount;
2508 :
2509 0 : elog(LOG, "varstr_abbrev: abbrev_distinct after %d: %f "
2510 : "(key_distinct: %f, norm_abbrev_card: %f, prop_card: %f)",
2511 : memtupcount, abbrev_distinct, key_distinct, norm_abbrev_card,
2512 : sss->prop_card);
2513 : }
2514 : #endif
2515 :
2516 : /*
2517 : * If the number of distinct abbreviated keys approximately matches the
2518 : * number of distinct authoritative original keys, that's reason enough to
2519 : * proceed. We can win even with a very low cardinality set if most
2520 : * tie-breakers only memcmp(). This is by far the most important
2521 : * consideration.
2522 : *
2523 : * While comparisons that are resolved at the abbreviated key level are
2524 : * considerably cheaper than tie-breakers resolved with memcmp(), both of
2525 : * those two outcomes are so much cheaper than a full strcoll() once
2526 : * sorting is underway that it doesn't seem worth it to weigh abbreviated
2527 : * cardinality against the overall size of the set in order to more
2528 : * accurately model costs. Assume that an abbreviated comparison, and an
2529 : * abbreviated comparison with a cheap memcmp()-based authoritative
2530 : * resolution are equivalent.
2531 : */
2532 3 : if (abbrev_distinct > key_distinct * sss->prop_card)
2533 : {
2534 : /*
2535 : * When we have exceeded 10,000 tuples, decay required cardinality
2536 : * aggressively for next call.
2537 : *
2538 : * This is useful because the number of comparisons required on
2539 : * average increases at a linearithmic rate, and at roughly 10,000
2540 : * tuples that factor will start to dominate over the linear costs of
2541 : * string transformation (this is a conservative estimate). The decay
2542 : * rate is chosen to be a little less aggressive than halving -- which
2543 : * (since we're called at points at which memtupcount has doubled)
2544 : * would never see the cost model actually abort past the first call
2545 : * following a decay. This decay rate is mostly a precaution against
2546 : * a sudden, violent swing in how well abbreviated cardinality tracks
2547 : * full key cardinality. The decay also serves to prevent a marginal
2548 : * case from being aborted too late, when too much has already been
2549 : * invested in string transformation.
2550 : *
2551 : * It's possible for sets of several million distinct strings with
2552 : * mere tens of thousands of distinct abbreviated keys to still
2553 : * benefit very significantly. This will generally occur provided
2554 : * each abbreviated key is a proxy for a roughly uniform number of the
2555 : * set's full keys. If it isn't so, we hope to catch that early and
2556 : * abort. If it isn't caught early, by the time the problem is
2557 : * apparent it's probably not worth aborting.
2558 : */
2559 3 : if (memtupcount > 10000)
2560 0 : sss->prop_card *= 0.65;
2561 :
2562 3 : return false;
2563 : }
2564 :
2565 : /*
2566 : * Abort abbreviation strategy.
2567 : *
2568 : * The worst case, where all abbreviated keys are identical while all
2569 : * original strings differ will typically only see a regression of about
2570 : * 10% in execution time for small to medium sized lists of strings.
2571 : * Whereas on modern CPUs where cache stalls are the dominant cost, we can
2572 : * often expect very large improvements, particularly with sets of strings
2573 : * of moderately high to high abbreviated cardinality. There is little to
2574 : * lose but much to gain, which our strategy reflects.
2575 : */
2576 : #ifdef TRACE_SORT
2577 0 : if (trace_sort)
2578 0 : elog(LOG, "varstr_abbrev: aborted abbreviation at %d "
2579 : "(abbrev_distinct: %f, key_distinct: %f, prop_card: %f)",
2580 : memtupcount, abbrev_distinct, key_distinct, sss->prop_card);
2581 : #endif
2582 :
2583 0 : return true;
2584 : }
2585 :
2586 : Datum
2587 7 : text_larger(PG_FUNCTION_ARGS)
2588 : {
2589 7 : text *arg1 = PG_GETARG_TEXT_PP(0);
2590 7 : text *arg2 = PG_GETARG_TEXT_PP(1);
2591 : text *result;
2592 :
2593 7 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) > 0) ? arg1 : arg2);
2594 :
2595 7 : PG_RETURN_TEXT_P(result);
2596 : }
2597 :
2598 : Datum
2599 6 : text_smaller(PG_FUNCTION_ARGS)
2600 : {
2601 6 : text *arg1 = PG_GETARG_TEXT_PP(0);
2602 6 : text *arg2 = PG_GETARG_TEXT_PP(1);
2603 : text *result;
2604 :
2605 6 : result = ((text_cmp(arg1, arg2, PG_GET_COLLATION()) < 0) ? arg1 : arg2);
2606 :
2607 6 : PG_RETURN_TEXT_P(result);
2608 : }
2609 :
2610 :
2611 : /*
2612 : * The following operators support character-by-character comparison
2613 : * of text datums, to allow building indexes suitable for LIKE clauses.
2614 : * Note that the regular texteq/textne comparison operators, and regular
2615 : * support functions 1 and 2 with "C" collation are assumed to be
2616 : * compatible with these!
2617 : */
2618 :
2619 : static int
2620 25163 : internal_text_pattern_compare(text *arg1, text *arg2)
2621 : {
2622 : int result;
2623 : int len1,
2624 : len2;
2625 :
2626 25163 : len1 = VARSIZE_ANY_EXHDR(arg1);
2627 25163 : len2 = VARSIZE_ANY_EXHDR(arg2);
2628 :
2629 25163 : result = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
2630 25163 : if (result != 0)
2631 25151 : return result;
2632 12 : else if (len1 < len2)
2633 0 : return -1;
2634 12 : else if (len1 > len2)
2635 3 : return 1;
2636 : else
2637 9 : return 0;
2638 : }
2639 :
2640 :
2641 : Datum
2642 6401 : text_pattern_lt(PG_FUNCTION_ARGS)
2643 : {
2644 6401 : text *arg1 = PG_GETARG_TEXT_PP(0);
2645 6401 : text *arg2 = PG_GETARG_TEXT_PP(1);
2646 : int result;
2647 :
2648 6401 : result = internal_text_pattern_compare(arg1, arg2);
2649 :
2650 6401 : PG_FREE_IF_COPY(arg1, 0);
2651 6401 : PG_FREE_IF_COPY(arg2, 1);
2652 :
2653 6401 : PG_RETURN_BOOL(result < 0);
2654 : }
2655 :
2656 :
2657 : Datum
2658 6251 : text_pattern_le(PG_FUNCTION_ARGS)
2659 : {
2660 6251 : text *arg1 = PG_GETARG_TEXT_PP(0);
2661 6251 : text *arg2 = PG_GETARG_TEXT_PP(1);
2662 : int result;
2663 :
2664 6251 : result = internal_text_pattern_compare(arg1, arg2);
2665 :
2666 6251 : PG_FREE_IF_COPY(arg1, 0);
2667 6251 : PG_FREE_IF_COPY(arg2, 1);
2668 :
2669 6251 : PG_RETURN_BOOL(result <= 0);
2670 : }
2671 :
2672 :
2673 : Datum
2674 6251 : text_pattern_ge(PG_FUNCTION_ARGS)
2675 : {
2676 6251 : text *arg1 = PG_GETARG_TEXT_PP(0);
2677 6251 : text *arg2 = PG_GETARG_TEXT_PP(1);
2678 : int result;
2679 :
2680 6251 : result = internal_text_pattern_compare(arg1, arg2);
2681 :
2682 6251 : PG_FREE_IF_COPY(arg1, 0);
2683 6251 : PG_FREE_IF_COPY(arg2, 1);
2684 :
2685 6251 : PG_RETURN_BOOL(result >= 0);
2686 : }
2687 :
2688 :
2689 : Datum
2690 6251 : text_pattern_gt(PG_FUNCTION_ARGS)
2691 : {
2692 6251 : text *arg1 = PG_GETARG_TEXT_PP(0);
2693 6251 : text *arg2 = PG_GETARG_TEXT_PP(1);
2694 : int result;
2695 :
2696 6251 : result = internal_text_pattern_compare(arg1, arg2);
2697 :
2698 6251 : PG_FREE_IF_COPY(arg1, 0);
2699 6251 : PG_FREE_IF_COPY(arg2, 1);
2700 :
2701 6251 : PG_RETURN_BOOL(result > 0);
2702 : }
2703 :
2704 :
2705 : Datum
2706 9 : bttext_pattern_cmp(PG_FUNCTION_ARGS)
2707 : {
2708 9 : text *arg1 = PG_GETARG_TEXT_PP(0);
2709 9 : text *arg2 = PG_GETARG_TEXT_PP(1);
2710 : int result;
2711 :
2712 9 : result = internal_text_pattern_compare(arg1, arg2);
2713 :
2714 9 : PG_FREE_IF_COPY(arg1, 0);
2715 9 : PG_FREE_IF_COPY(arg2, 1);
2716 :
2717 9 : PG_RETURN_INT32(result);
2718 : }
2719 :
2720 :
2721 : Datum
2722 21 : bttext_pattern_sortsupport(PG_FUNCTION_ARGS)
2723 : {
2724 21 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
2725 : MemoryContext oldcontext;
2726 :
2727 21 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
2728 :
2729 : /* Use generic string SortSupport, forcing "C" collation */
2730 21 : varstr_sortsupport(ssup, C_COLLATION_OID, false);
2731 :
2732 21 : MemoryContextSwitchTo(oldcontext);
2733 :
2734 21 : PG_RETURN_VOID();
2735 : }
2736 :
2737 :
2738 : /*-------------------------------------------------------------
2739 : * byteaoctetlen
2740 : *
2741 : * get the number of bytes contained in an instance of type 'bytea'
2742 : *-------------------------------------------------------------
2743 : */
2744 : Datum
2745 0 : byteaoctetlen(PG_FUNCTION_ARGS)
2746 : {
2747 0 : Datum str = PG_GETARG_DATUM(0);
2748 :
2749 : /* We need not detoast the input at all */
2750 0 : PG_RETURN_INT32(toast_raw_datum_size(str) - VARHDRSZ);
2751 : }
2752 :
2753 : /*
2754 : * byteacat -
2755 : * takes two bytea* and returns a bytea* that is the concatenation of
2756 : * the two.
2757 : *
2758 : * Cloned from textcat and modified as required.
2759 : */
2760 : Datum
2761 0 : byteacat(PG_FUNCTION_ARGS)
2762 : {
2763 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2764 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2765 :
2766 0 : PG_RETURN_BYTEA_P(bytea_catenate(t1, t2));
2767 : }
2768 :
2769 : /*
2770 : * bytea_catenate
2771 : * Guts of byteacat(), broken out so it can be used by other functions
2772 : *
2773 : * Arguments can be in short-header form, but not compressed or out-of-line
2774 : */
2775 : static bytea *
2776 6 : bytea_catenate(bytea *t1, bytea *t2)
2777 : {
2778 : bytea *result;
2779 : int len1,
2780 : len2,
2781 : len;
2782 : char *ptr;
2783 :
2784 6 : len1 = VARSIZE_ANY_EXHDR(t1);
2785 6 : len2 = VARSIZE_ANY_EXHDR(t2);
2786 :
2787 : /* paranoia ... probably should throw error instead? */
2788 6 : if (len1 < 0)
2789 0 : len1 = 0;
2790 6 : if (len2 < 0)
2791 0 : len2 = 0;
2792 :
2793 6 : len = len1 + len2 + VARHDRSZ;
2794 6 : result = (bytea *) palloc(len);
2795 :
2796 : /* Set size of result string... */
2797 6 : SET_VARSIZE(result, len);
2798 :
2799 : /* Fill data field of result string... */
2800 6 : ptr = VARDATA(result);
2801 6 : if (len1 > 0)
2802 6 : memcpy(ptr, VARDATA_ANY(t1), len1);
2803 6 : if (len2 > 0)
2804 3 : memcpy(ptr + len1, VARDATA_ANY(t2), len2);
2805 :
2806 6 : return result;
2807 : }
2808 :
/*
 * PG_STR_GET_BYTEA
 *		Build a bytea from the C string str_ by passing it to byteain() via
 *		DirectFunctionCall1() and converting the returned Datum with
 *		DatumGetByteaPP().
 */
#define PG_STR_GET_BYTEA(str_) \
	DatumGetByteaPP(DirectFunctionCall1(byteain, CStringGetDatum(str_)))
2811 :
2812 : /*
2813 : * bytea_substr()
2814 : * Return a substring starting at the specified position.
2815 : * Cloned from text_substr and modified as required.
2816 : *
2817 : * Input:
2818 : * - string
2819 : * - starting position (is one-based)
2820 : * - string length (optional)
2821 : *
2822 : * If the starting position is zero or less, then return from the start of the string
2823 : * adjusting the length to be consistent with the "negative start" per SQL.
2824 : * If the length is less than zero, an ERROR is thrown. If no third argument
2825 : * (length) is provided, the length to the end of the string is assumed.
2826 : */
2827 : Datum
2828 9 : bytea_substr(PG_FUNCTION_ARGS)
2829 : {
2830 9 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2831 : PG_GETARG_INT32(1),
2832 : PG_GETARG_INT32(2),
2833 : false));
2834 : }
2835 :
2836 : /*
2837 : * bytea_substr_no_len -
2838 : * Wrapper to avoid opr_sanity failure due to
2839 : * one function accepting a different number of args.
2840 : */
2841 : Datum
2842 4 : bytea_substr_no_len(PG_FUNCTION_ARGS)
2843 : {
2844 4 : PG_RETURN_BYTEA_P(bytea_substring(PG_GETARG_DATUM(0),
2845 : PG_GETARG_INT32(1),
2846 : -1,
2847 : true));
2848 : }
2849 :
2850 : static bytea *
2851 19 : bytea_substring(Datum str,
2852 : int S,
2853 : int L,
2854 : bool length_not_specified)
2855 : {
2856 : int S1; /* adjusted start position */
2857 : int L1; /* adjusted substring length */
2858 :
2859 19 : S1 = Max(S, 1);
2860 :
2861 19 : if (length_not_specified)
2862 : {
2863 : /*
2864 : * Not passed a length - DatumGetByteaPSlice() grabs everything to the
2865 : * end of the string if we pass it a negative value for length.
2866 : */
2867 7 : L1 = -1;
2868 : }
2869 : else
2870 : {
2871 : /* end position */
2872 12 : int E = S + L;
2873 :
2874 : /*
2875 : * A negative value for L is the only way for the end position to be
2876 : * before the start. SQL99 says to throw an error.
2877 : */
2878 12 : if (E < S)
2879 1 : ereport(ERROR,
2880 : (errcode(ERRCODE_SUBSTRING_ERROR),
2881 : errmsg("negative substring length not allowed")));
2882 :
2883 : /*
2884 : * A zero or negative value for the end position can happen if the
2885 : * start was negative or one. SQL99 says to return a zero-length
2886 : * string.
2887 : */
2888 11 : if (E < 1)
2889 0 : return PG_STR_GET_BYTEA("");
2890 :
2891 11 : L1 = E - S1;
2892 : }
2893 :
2894 : /*
2895 : * If the start position is past the end of the string, SQL99 says to
2896 : * return a zero-length string -- DatumGetByteaPSlice() will do that for
2897 : * us. Convert to zero-based starting position
2898 : */
2899 18 : return DatumGetByteaPSlice(str, S1 - 1, L1);
2900 : }
2901 :
2902 : /*
2903 : * byteaoverlay
2904 : * Replace specified substring of first string with second
2905 : *
2906 : * The SQL standard defines OVERLAY() in terms of substring and concatenation.
2907 : * This code is a direct implementation of what the standard says.
2908 : */
2909 : Datum
2910 1 : byteaoverlay(PG_FUNCTION_ARGS)
2911 : {
2912 1 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2913 1 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2914 1 : int sp = PG_GETARG_INT32(2); /* substring start position */
2915 1 : int sl = PG_GETARG_INT32(3); /* substring length */
2916 :
2917 1 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2918 : }
2919 :
2920 : Datum
2921 2 : byteaoverlay_no_len(PG_FUNCTION_ARGS)
2922 : {
2923 2 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2924 2 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2925 2 : int sp = PG_GETARG_INT32(2); /* substring start position */
2926 : int sl;
2927 :
2928 2 : sl = VARSIZE_ANY_EXHDR(t2); /* defaults to length(t2) */
2929 2 : PG_RETURN_BYTEA_P(bytea_overlay(t1, t2, sp, sl));
2930 : }
2931 :
2932 : static bytea *
2933 3 : bytea_overlay(bytea *t1, bytea *t2, int sp, int sl)
2934 : {
2935 : bytea *result;
2936 : bytea *s1;
2937 : bytea *s2;
2938 : int sp_pl_sl;
2939 :
2940 : /*
2941 : * Check for possible integer-overflow cases. For negative sp, throw a
2942 : * "substring length" error because that's what should be expected
2943 : * according to the spec's definition of OVERLAY().
2944 : */
2945 3 : if (sp <= 0)
2946 0 : ereport(ERROR,
2947 : (errcode(ERRCODE_SUBSTRING_ERROR),
2948 : errmsg("negative substring length not allowed")));
2949 3 : sp_pl_sl = sp + sl;
2950 3 : if (sp_pl_sl <= sl)
2951 0 : ereport(ERROR,
2952 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
2953 : errmsg("integer out of range")));
2954 :
2955 3 : s1 = bytea_substring(PointerGetDatum(t1), 1, sp - 1, false);
2956 3 : s2 = bytea_substring(PointerGetDatum(t1), sp_pl_sl, -1, true);
2957 3 : result = bytea_catenate(s1, t2);
2958 3 : result = bytea_catenate(result, s2);
2959 :
2960 3 : return result;
2961 : }
2962 :
2963 : /*
2964 : * byteapos -
2965 : * Return the position of the specified substring.
2966 : * Implements the SQL POSITION() function.
2967 : * Cloned from textpos and modified as required.
2968 : */
2969 : Datum
2970 0 : byteapos(PG_FUNCTION_ARGS)
2971 : {
2972 0 : bytea *t1 = PG_GETARG_BYTEA_PP(0);
2973 0 : bytea *t2 = PG_GETARG_BYTEA_PP(1);
2974 : int pos;
2975 : int px,
2976 : p;
2977 : int len1,
2978 : len2;
2979 : char *p1,
2980 : *p2;
2981 :
2982 0 : len1 = VARSIZE_ANY_EXHDR(t1);
2983 0 : len2 = VARSIZE_ANY_EXHDR(t2);
2984 :
2985 0 : if (len2 <= 0)
2986 0 : PG_RETURN_INT32(1); /* result for empty pattern */
2987 :
2988 0 : p1 = VARDATA_ANY(t1);
2989 0 : p2 = VARDATA_ANY(t2);
2990 :
2991 0 : pos = 0;
2992 0 : px = (len1 - len2);
2993 0 : for (p = 0; p <= px; p++)
2994 : {
2995 0 : if ((*p2 == *p1) && (memcmp(p1, p2, len2) == 0))
2996 : {
2997 0 : pos = p + 1;
2998 0 : break;
2999 : };
3000 0 : p1++;
3001 : };
3002 :
3003 0 : PG_RETURN_INT32(pos);
3004 : }
3005 :
3006 : /*-------------------------------------------------------------
3007 : * byteaGetByte
3008 : *
3009 : * this routine treats "bytea" as an array of bytes.
3010 : * It returns the Nth byte (a number between 0 and 255).
3011 : *-------------------------------------------------------------
3012 : */
3013 : Datum
3014 0 : byteaGetByte(PG_FUNCTION_ARGS)
3015 : {
3016 0 : bytea *v = PG_GETARG_BYTEA_PP(0);
3017 0 : int32 n = PG_GETARG_INT32(1);
3018 : int len;
3019 : int byte;
3020 :
3021 0 : len = VARSIZE_ANY_EXHDR(v);
3022 :
3023 0 : if (n < 0 || n >= len)
3024 0 : ereport(ERROR,
3025 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3026 : errmsg("index %d out of valid range, 0..%d",
3027 : n, len - 1)));
3028 :
3029 0 : byte = ((unsigned char *) VARDATA_ANY(v))[n];
3030 :
3031 0 : PG_RETURN_INT32(byte);
3032 : }
3033 :
3034 : /*-------------------------------------------------------------
3035 : * byteaGetBit
3036 : *
3037 : * This routine treats a "bytea" type like an array of bits.
3038 : * It returns the value of the Nth bit (0 or 1).
3039 : *
3040 : *-------------------------------------------------------------
3041 : */
3042 : Datum
3043 0 : byteaGetBit(PG_FUNCTION_ARGS)
3044 : {
3045 0 : bytea *v = PG_GETARG_BYTEA_PP(0);
3046 0 : int32 n = PG_GETARG_INT32(1);
3047 : int byteNo,
3048 : bitNo;
3049 : int len;
3050 : int byte;
3051 :
3052 0 : len = VARSIZE_ANY_EXHDR(v);
3053 :
3054 0 : if (n < 0 || n >= len * 8)
3055 0 : ereport(ERROR,
3056 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3057 : errmsg("index %d out of valid range, 0..%d",
3058 : n, len * 8 - 1)));
3059 :
3060 0 : byteNo = n / 8;
3061 0 : bitNo = n % 8;
3062 :
3063 0 : byte = ((unsigned char *) VARDATA_ANY(v))[byteNo];
3064 :
3065 0 : if (byte & (1 << bitNo))
3066 0 : PG_RETURN_INT32(1);
3067 : else
3068 0 : PG_RETURN_INT32(0);
3069 : }
3070 :
3071 : /*-------------------------------------------------------------
3072 : * byteaSetByte
3073 : *
3074 : * Given an instance of type 'bytea' creates a new one with
3075 : * the Nth byte set to the given value.
3076 : *
3077 : *-------------------------------------------------------------
3078 : */
3079 : Datum
3080 0 : byteaSetByte(PG_FUNCTION_ARGS)
3081 : {
3082 0 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3083 0 : int32 n = PG_GETARG_INT32(1);
3084 0 : int32 newByte = PG_GETARG_INT32(2);
3085 : int len;
3086 :
3087 0 : len = VARSIZE(res) - VARHDRSZ;
3088 :
3089 0 : if (n < 0 || n >= len)
3090 0 : ereport(ERROR,
3091 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3092 : errmsg("index %d out of valid range, 0..%d",
3093 : n, len - 1)));
3094 :
3095 : /*
3096 : * Now set the byte.
3097 : */
3098 0 : ((unsigned char *) VARDATA(res))[n] = newByte;
3099 :
3100 0 : PG_RETURN_BYTEA_P(res);
3101 : }
3102 :
3103 : /*-------------------------------------------------------------
3104 : * byteaSetBit
3105 : *
3106 : * Given an instance of type 'bytea' creates a new one with
3107 : * the Nth bit set to the given value.
3108 : *
3109 : *-------------------------------------------------------------
3110 : */
3111 : Datum
3112 0 : byteaSetBit(PG_FUNCTION_ARGS)
3113 : {
3114 0 : bytea *res = PG_GETARG_BYTEA_P_COPY(0);
3115 0 : int32 n = PG_GETARG_INT32(1);
3116 0 : int32 newBit = PG_GETARG_INT32(2);
3117 : int len;
3118 : int oldByte,
3119 : newByte;
3120 : int byteNo,
3121 : bitNo;
3122 :
3123 0 : len = VARSIZE(res) - VARHDRSZ;
3124 :
3125 0 : if (n < 0 || n >= len * 8)
3126 0 : ereport(ERROR,
3127 : (errcode(ERRCODE_ARRAY_SUBSCRIPT_ERROR),
3128 : errmsg("index %d out of valid range, 0..%d",
3129 : n, len * 8 - 1)));
3130 :
3131 0 : byteNo = n / 8;
3132 0 : bitNo = n % 8;
3133 :
3134 : /*
3135 : * sanity check!
3136 : */
3137 0 : if (newBit != 0 && newBit != 1)
3138 0 : ereport(ERROR,
3139 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
3140 : errmsg("new bit must be 0 or 1")));
3141 :
3142 : /*
3143 : * Update the byte.
3144 : */
3145 0 : oldByte = ((unsigned char *) VARDATA(res))[byteNo];
3146 :
3147 0 : if (newBit == 0)
3148 0 : newByte = oldByte & (~(1 << bitNo));
3149 : else
3150 0 : newByte = oldByte | (1 << bitNo);
3151 :
3152 0 : ((unsigned char *) VARDATA(res))[byteNo] = newByte;
3153 :
3154 0 : PG_RETURN_BYTEA_P(res);
3155 : }
3156 :
3157 :
3158 : /* text_name()
3159 : * Converts a text type to a Name type.
3160 : */
3161 : Datum
3162 191 : text_name(PG_FUNCTION_ARGS)
3163 : {
3164 191 : text *s = PG_GETARG_TEXT_PP(0);
3165 : Name result;
3166 : int len;
3167 :
3168 191 : len = VARSIZE_ANY_EXHDR(s);
3169 :
3170 : /* Truncate oversize input */
3171 191 : if (len >= NAMEDATALEN)
3172 1 : len = pg_mbcliplen(VARDATA_ANY(s), len, NAMEDATALEN - 1);
3173 :
3174 : /* We use palloc0 here to ensure result is zero-padded */
3175 191 : result = (Name) palloc0(NAMEDATALEN);
3176 191 : memcpy(NameStr(*result), VARDATA_ANY(s), len);
3177 :
3178 191 : PG_RETURN_NAME(result);
3179 : }
3180 :
3181 : /* name_text()
3182 : * Converts a Name type to a text type.
3183 : */
3184 : Datum
3185 60410 : name_text(PG_FUNCTION_ARGS)
3186 : {
3187 60410 : Name s = PG_GETARG_NAME(0);
3188 :
3189 60410 : PG_RETURN_TEXT_P(cstring_to_text(NameStr(*s)));
3190 : }
3191 :
3192 :
3193 : /*
3194 : * textToQualifiedNameList - convert a text object to list of names
3195 : *
3196 : * This implements the input parsing needed by nextval() and other
3197 : * functions that take a text parameter representing a qualified name.
3198 : * We split the name at dots, downcase if not double-quoted, and
3199 : * truncate names if they're too long.
3200 : */
3201 : List *
3202 120 : textToQualifiedNameList(text *textval)
3203 : {
3204 : char *rawname;
3205 120 : List *result = NIL;
3206 : List *namelist;
3207 : ListCell *l;
3208 :
3209 : /* Convert to C string (handles possible detoasting). */
3210 : /* Note we rely on being able to modify rawname below. */
3211 120 : rawname = text_to_cstring(textval);
3212 :
3213 120 : if (!SplitIdentifierString(rawname, '.', &namelist))
3214 0 : ereport(ERROR,
3215 : (errcode(ERRCODE_INVALID_NAME),
3216 : errmsg("invalid name syntax")));
3217 :
3218 120 : if (namelist == NIL)
3219 0 : ereport(ERROR,
3220 : (errcode(ERRCODE_INVALID_NAME),
3221 : errmsg("invalid name syntax")));
3222 :
3223 257 : foreach(l, namelist)
3224 : {
3225 137 : char *curname = (char *) lfirst(l);
3226 :
3227 137 : result = lappend(result, makeString(pstrdup(curname)));
3228 : }
3229 :
3230 120 : pfree(rawname);
3231 120 : list_free(namelist);
3232 :
3233 120 : return result;
3234 : }
3235 :
3236 : /*
3237 : * SplitIdentifierString --- parse a string containing identifiers
3238 : *
3239 : * This is the guts of textToQualifiedNameList, and is exported for use in
3240 : * other situations such as parsing GUC variables. In the GUC case, it's
3241 : * important to avoid memory leaks, so the API is designed to minimize the
3242 : * amount of stuff that needs to be allocated and freed.
3243 : *
3244 : * Inputs:
3245 : * rawstring: the input string; must be overwritable! On return, it's
3246 : * been modified to contain the separated identifiers.
3247 : * separator: the separator punctuation expected between identifiers
3248 : * (typically '.' or ','). Whitespace may also appear around
3249 : * identifiers.
3250 : * Outputs:
3251 : * namelist: filled with a palloc'd list of pointers to identifiers within
3252 : * rawstring. Caller should list_free() this even on error return.
3253 : *
3254 : * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3255 : *
3256 : * Note that an empty string is considered okay here, though not in
3257 : * textToQualifiedNameList.
3258 : */
3259 : bool
3260 3189 : SplitIdentifierString(char *rawstring, char separator,
3261 : List **namelist)
3262 : {
3263 3189 : char *nextp = rawstring;
3264 3189 : bool done = false;
3265 :
3266 3189 : *namelist = NIL;
3267 :
3268 6378 : while (scanner_isspace(*nextp))
3269 0 : nextp++; /* skip leading whitespace */
3270 :
3271 3189 : if (*nextp == '\0')
3272 344 : return true; /* allow empty string */
3273 :
3274 : /* At the top of the loop, we are at start of a new identifier. */
3275 : do
3276 : {
3277 : char *curname;
3278 : char *endp;
3279 :
3280 4753 : if (*nextp == '"')
3281 : {
3282 : /* Quoted name --- collapse quote-quote pairs, no downcasing */
3283 710 : curname = nextp + 1;
3284 : for (;;)
3285 : {
3286 710 : endp = strchr(nextp + 1, '"');
3287 710 : if (endp == NULL)
3288 0 : return false; /* mismatched quotes */
3289 710 : if (endp[1] != '"')
3290 710 : break; /* found end of quoted name */
3291 : /* Collapse adjacent quotes into one quote, and look again */
3292 0 : memmove(endp, endp + 1, strlen(endp));
3293 0 : nextp = endp;
3294 0 : }
3295 : /* endp now points at the terminating quote */
3296 710 : nextp = endp + 1;
3297 : }
3298 : else
3299 : {
3300 : /* Unquoted name --- extends to separator or whitespace */
3301 : char *downname;
3302 : int len;
3303 :
3304 4043 : curname = nextp;
3305 70854 : while (*nextp && *nextp != separator &&
3306 31384 : !scanner_isspace(*nextp))
3307 31384 : nextp++;
3308 4043 : endp = nextp;
3309 4043 : if (curname == nextp)
3310 0 : return false; /* empty unquoted name not allowed */
3311 :
3312 : /*
3313 : * Downcase the identifier, using same code as main lexer does.
3314 : *
3315 : * XXX because we want to overwrite the input in-place, we cannot
3316 : * support a downcasing transformation that increases the string
3317 : * length. This is not a problem given the current implementation
3318 : * of downcase_truncate_identifier, but we'll probably have to do
3319 : * something about this someday.
3320 : */
3321 4043 : len = endp - curname;
3322 4043 : downname = downcase_truncate_identifier(curname, len, false);
3323 4043 : Assert(strlen(downname) <= len);
3324 4043 : strncpy(curname, downname, len); /* strncpy is required here */
3325 4043 : pfree(downname);
3326 : }
3327 :
3328 9506 : while (scanner_isspace(*nextp))
3329 0 : nextp++; /* skip trailing whitespace */
3330 :
3331 4753 : if (*nextp == separator)
3332 : {
3333 1908 : nextp++;
3334 5048 : while (scanner_isspace(*nextp))
3335 1232 : nextp++; /* skip leading whitespace for next */
3336 : /* we expect another name, so done remains false */
3337 : }
3338 2845 : else if (*nextp == '\0')
3339 2845 : done = true;
3340 : else
3341 0 : return false; /* invalid syntax */
3342 :
3343 : /* Now safe to overwrite separator with a null */
3344 4753 : *endp = '\0';
3345 :
3346 : /* Truncate name if it's overlength */
3347 4753 : truncate_identifier(curname, strlen(curname), false);
3348 :
3349 : /*
3350 : * Finished isolating current name --- add it to list
3351 : */
3352 4753 : *namelist = lappend(*namelist, curname);
3353 :
3354 : /* Loop back if we didn't reach end of string */
3355 4753 : } while (!done);
3356 :
3357 2845 : return true;
3358 : }
3359 :
3360 :
3361 : /*
3362 : * SplitDirectoriesString --- parse a string containing file/directory names
3363 : *
3364 : * This works fine on file names too; the function name is historical.
3365 : *
3366 : * This is similar to SplitIdentifierString, except that the parsing
3367 : * rules are meant to handle pathnames instead of identifiers: there is
3368 : * no downcasing, embedded spaces are allowed, the max length is MAXPGPATH-1,
3369 : * and we apply canonicalize_path() to each extracted string. Because of the
3370 : * last, the returned strings are separately palloc'd rather than being
3371 : * pointers into rawstring --- but we still scribble on rawstring.
3372 : *
3373 : * Inputs:
3374 : * rawstring: the input string; must be modifiable!
3375 : * separator: the separator punctuation expected between directories
3376 : * (typically ',' or ';'). Whitespace may also appear around
3377 : * directories.
3378 : * Outputs:
3379 : * namelist: filled with a palloc'd list of directory names.
3380 : * Caller should list_free_deep() this even on error return.
3381 : *
3382 : * Returns TRUE if okay, FALSE if there is a syntax error in the string.
3383 : *
3384 : * Note that an empty string is considered okay here.
3385 : */
3386 : bool
3387 1 : SplitDirectoriesString(char *rawstring, char separator,
3388 : List **namelist)
3389 : {
3390 1 : char *nextp = rawstring;
3391 1 : bool done = false;
3392 :
3393 1 : *namelist = NIL;
3394 :
3395 2 : while (scanner_isspace(*nextp))
3396 0 : nextp++; /* skip leading whitespace */
3397 :
3398 1 : if (*nextp == '\0')
3399 0 : return true; /* allow empty string */
3400 :
3401 : /* At the top of the loop, we are at start of a new directory. */
3402 : do
3403 : {
3404 : char *curname;
3405 : char *endp;
3406 :
3407 1 : if (*nextp == '"')
3408 : {
3409 : /* Quoted name --- collapse quote-quote pairs */
3410 0 : curname = nextp + 1;
3411 : for (;;)
3412 : {
3413 0 : endp = strchr(nextp + 1, '"');
3414 0 : if (endp == NULL)
3415 0 : return false; /* mismatched quotes */
3416 0 : if (endp[1] != '"')
3417 0 : break; /* found end of quoted name */
3418 : /* Collapse adjacent quotes into one quote, and look again */
3419 0 : memmove(endp, endp + 1, strlen(endp));
3420 0 : nextp = endp;
3421 0 : }
3422 : /* endp now points at the terminating quote */
3423 0 : nextp = endp + 1;
3424 : }
3425 : else
3426 : {
3427 : /* Unquoted name --- extends to separator or end of string */
3428 1 : curname = endp = nextp;
3429 24 : while (*nextp && *nextp != separator)
3430 : {
3431 : /* trailing whitespace should not be included in name */
3432 22 : if (!scanner_isspace(*nextp))
3433 22 : endp = nextp + 1;
3434 22 : nextp++;
3435 : }
3436 1 : if (curname == endp)
3437 0 : return false; /* empty unquoted name not allowed */
3438 : }
3439 :
3440 2 : while (scanner_isspace(*nextp))
3441 0 : nextp++; /* skip trailing whitespace */
3442 :
3443 1 : if (*nextp == separator)
3444 : {
3445 0 : nextp++;
3446 0 : while (scanner_isspace(*nextp))
3447 0 : nextp++; /* skip leading whitespace for next */
3448 : /* we expect another name, so done remains false */
3449 : }
3450 1 : else if (*nextp == '\0')
3451 1 : done = true;
3452 : else
3453 0 : return false; /* invalid syntax */
3454 :
3455 : /* Now safe to overwrite separator with a null */
3456 1 : *endp = '\0';
3457 :
3458 : /* Truncate path if it's overlength */
3459 1 : if (strlen(curname) >= MAXPGPATH)
3460 0 : curname[MAXPGPATH - 1] = '\0';
3461 :
3462 : /*
3463 : * Finished isolating current name --- add it to list
3464 : */
3465 1 : curname = pstrdup(curname);
3466 1 : canonicalize_path(curname);
3467 1 : *namelist = lappend(*namelist, curname);
3468 :
3469 : /* Loop back if we didn't reach end of string */
3470 1 : } while (!done);
3471 :
3472 1 : return true;
3473 : }
3474 :
3475 :
3476 : /*****************************************************************************
3477 : * Comparison Functions used for bytea
3478 : *
3479 : * Note: btree indexes need these routines not to leak memory; therefore,
3480 : * be careful to free working copies of toasted datums. Most places don't
3481 : * need to be so careful.
3482 : *****************************************************************************/
3483 :
3484 : Datum
3485 429 : byteaeq(PG_FUNCTION_ARGS)
3486 : {
3487 429 : Datum arg1 = PG_GETARG_DATUM(0);
3488 429 : Datum arg2 = PG_GETARG_DATUM(1);
3489 : bool result;
3490 : Size len1,
3491 : len2;
3492 :
3493 : /*
3494 : * We can use a fast path for unequal lengths, which might save us from
3495 : * having to detoast one or both values.
3496 : */
3497 429 : len1 = toast_raw_datum_size(arg1);
3498 429 : len2 = toast_raw_datum_size(arg2);
3499 429 : if (len1 != len2)
3500 0 : result = false;
3501 : else
3502 : {
3503 429 : bytea *barg1 = DatumGetByteaPP(arg1);
3504 429 : bytea *barg2 = DatumGetByteaPP(arg2);
3505 :
3506 858 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3507 429 : len1 - VARHDRSZ) == 0);
3508 :
3509 429 : PG_FREE_IF_COPY(barg1, 0);
3510 429 : PG_FREE_IF_COPY(barg2, 1);
3511 : }
3512 :
3513 429 : PG_RETURN_BOOL(result);
3514 : }
3515 :
3516 : Datum
3517 132 : byteane(PG_FUNCTION_ARGS)
3518 : {
3519 132 : Datum arg1 = PG_GETARG_DATUM(0);
3520 132 : Datum arg2 = PG_GETARG_DATUM(1);
3521 : bool result;
3522 : Size len1,
3523 : len2;
3524 :
3525 : /*
3526 : * We can use a fast path for unequal lengths, which might save us from
3527 : * having to detoast one or both values.
3528 : */
3529 132 : len1 = toast_raw_datum_size(arg1);
3530 132 : len2 = toast_raw_datum_size(arg2);
3531 132 : if (len1 != len2)
3532 0 : result = true;
3533 : else
3534 : {
3535 132 : bytea *barg1 = DatumGetByteaPP(arg1);
3536 132 : bytea *barg2 = DatumGetByteaPP(arg2);
3537 :
3538 264 : result = (memcmp(VARDATA_ANY(barg1), VARDATA_ANY(barg2),
3539 132 : len1 - VARHDRSZ) != 0);
3540 :
3541 132 : PG_FREE_IF_COPY(barg1, 0);
3542 132 : PG_FREE_IF_COPY(barg2, 1);
3543 : }
3544 :
3545 132 : PG_RETURN_BOOL(result);
3546 : }
3547 :
3548 : Datum
3549 510 : bytealt(PG_FUNCTION_ARGS)
3550 : {
3551 510 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3552 510 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3553 : int len1,
3554 : len2;
3555 : int cmp;
3556 :
3557 510 : len1 = VARSIZE_ANY_EXHDR(arg1);
3558 510 : len2 = VARSIZE_ANY_EXHDR(arg2);
3559 :
3560 510 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3561 :
3562 510 : PG_FREE_IF_COPY(arg1, 0);
3563 510 : PG_FREE_IF_COPY(arg2, 1);
3564 :
3565 510 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 < len2)));
3566 : }
3567 :
3568 : Datum
3569 400 : byteale(PG_FUNCTION_ARGS)
3570 : {
3571 400 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3572 400 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3573 : int len1,
3574 : len2;
3575 : int cmp;
3576 :
3577 400 : len1 = VARSIZE_ANY_EXHDR(arg1);
3578 400 : len2 = VARSIZE_ANY_EXHDR(arg2);
3579 :
3580 400 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3581 :
3582 400 : PG_FREE_IF_COPY(arg1, 0);
3583 400 : PG_FREE_IF_COPY(arg2, 1);
3584 :
3585 400 : PG_RETURN_BOOL((cmp < 0) || ((cmp == 0) && (len1 <= len2)));
3586 : }
3587 :
3588 : Datum
3589 510 : byteagt(PG_FUNCTION_ARGS)
3590 : {
3591 510 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3592 510 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3593 : int len1,
3594 : len2;
3595 : int cmp;
3596 :
3597 510 : len1 = VARSIZE_ANY_EXHDR(arg1);
3598 510 : len2 = VARSIZE_ANY_EXHDR(arg2);
3599 :
3600 510 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3601 :
3602 510 : PG_FREE_IF_COPY(arg1, 0);
3603 510 : PG_FREE_IF_COPY(arg2, 1);
3604 :
3605 510 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 > len2)));
3606 : }
3607 :
3608 : Datum
3609 307 : byteage(PG_FUNCTION_ARGS)
3610 : {
3611 307 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3612 307 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3613 : int len1,
3614 : len2;
3615 : int cmp;
3616 :
3617 307 : len1 = VARSIZE_ANY_EXHDR(arg1);
3618 307 : len2 = VARSIZE_ANY_EXHDR(arg2);
3619 :
3620 307 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3621 :
3622 307 : PG_FREE_IF_COPY(arg1, 0);
3623 307 : PG_FREE_IF_COPY(arg2, 1);
3624 :
3625 307 : PG_RETURN_BOOL((cmp > 0) || ((cmp == 0) && (len1 >= len2)));
3626 : }
3627 :
3628 : Datum
3629 0 : byteacmp(PG_FUNCTION_ARGS)
3630 : {
3631 0 : bytea *arg1 = PG_GETARG_BYTEA_PP(0);
3632 0 : bytea *arg2 = PG_GETARG_BYTEA_PP(1);
3633 : int len1,
3634 : len2;
3635 : int cmp;
3636 :
3637 0 : len1 = VARSIZE_ANY_EXHDR(arg1);
3638 0 : len2 = VARSIZE_ANY_EXHDR(arg2);
3639 :
3640 0 : cmp = memcmp(VARDATA_ANY(arg1), VARDATA_ANY(arg2), Min(len1, len2));
3641 0 : if ((cmp == 0) && (len1 != len2))
3642 0 : cmp = (len1 < len2) ? -1 : 1;
3643 :
3644 0 : PG_FREE_IF_COPY(arg1, 0);
3645 0 : PG_FREE_IF_COPY(arg2, 1);
3646 :
3647 0 : PG_RETURN_INT32(cmp);
3648 : }
3649 :
3650 : Datum
3651 1 : bytea_sortsupport(PG_FUNCTION_ARGS)
3652 : {
3653 1 : SortSupport ssup = (SortSupport) PG_GETARG_POINTER(0);
3654 : MemoryContext oldcontext;
3655 :
3656 1 : oldcontext = MemoryContextSwitchTo(ssup->ssup_cxt);
3657 :
3658 : /* Use generic string SortSupport, forcing "C" collation */
3659 1 : varstr_sortsupport(ssup, C_COLLATION_OID, false);
3660 :
3661 1 : MemoryContextSwitchTo(oldcontext);
3662 :
3663 1 : PG_RETURN_VOID();
3664 : }
3665 :
3666 : /*
3667 : * appendStringInfoText
3668 : *
3669 : * Append a text to str.
3670 : * Like appendStringInfoString(str, text_to_cstring(t)) but faster.
3671 : */
3672 : static void
3673 20789 : appendStringInfoText(StringInfo str, const text *t)
3674 : {
3675 20789 : appendBinaryStringInfo(str, VARDATA_ANY(t), VARSIZE_ANY_EXHDR(t));
3676 20789 : }
3677 :
3678 : /*
3679 : * replace_text
3680 : * replace all occurrences of 'old_sub_str' in 'orig_str'
3681 : * with 'new_sub_str' to form 'new_str'
3682 : *
3683 : * returns 'orig_str' if 'old_sub_str' == '' or 'orig_str' == ''
3684 : * otherwise returns 'new_str'
3685 : */
3686 : Datum
3687 6 : replace_text(PG_FUNCTION_ARGS)
3688 : {
3689 6 : text *src_text = PG_GETARG_TEXT_PP(0);
3690 6 : text *from_sub_text = PG_GETARG_TEXT_PP(1);
3691 6 : text *to_sub_text = PG_GETARG_TEXT_PP(2);
3692 : int src_text_len;
3693 : int from_sub_text_len;
3694 : TextPositionState state;
3695 : text *ret_text;
3696 : int start_posn;
3697 : int curr_posn;
3698 : int chunk_len;
3699 : char *start_ptr;
3700 : StringInfoData str;
3701 :
3702 6 : text_position_setup(src_text, from_sub_text, &state);
3703 :
3704 : /*
3705 : * Note: we check the converted string length, not the original, because
3706 : * they could be different if the input contained invalid encoding.
3707 : */
3708 6 : src_text_len = state.len1;
3709 6 : from_sub_text_len = state.len2;
3710 :
3711 : /* Return unmodified source string if empty source or pattern */
3712 6 : if (src_text_len < 1 || from_sub_text_len < 1)
3713 : {
3714 0 : text_position_cleanup(&state);
3715 0 : PG_RETURN_TEXT_P(src_text);
3716 : }
3717 :
3718 6 : start_posn = 1;
3719 6 : curr_posn = text_position_next(1, &state);
3720 :
3721 : /* When the from_sub_text is not found, there is nothing to do. */
3722 6 : if (curr_posn == 0)
3723 : {
3724 2 : text_position_cleanup(&state);
3725 2 : PG_RETURN_TEXT_P(src_text);
3726 : }
3727 :
3728 : /* start_ptr points to the start_posn'th character of src_text */
3729 4 : start_ptr = VARDATA_ANY(src_text);
3730 :
3731 4 : initStringInfo(&str);
3732 :
3733 : do
3734 : {
3735 6 : CHECK_FOR_INTERRUPTS();
3736 :
3737 : /* copy the data skipped over by last text_position_next() */
3738 6 : chunk_len = charlen_to_bytelen(start_ptr, curr_posn - start_posn);
3739 6 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
3740 :
3741 6 : appendStringInfoText(&str, to_sub_text);
3742 :
3743 6 : start_posn = curr_posn;
3744 6 : start_ptr += chunk_len;
3745 6 : start_posn += from_sub_text_len;
3746 6 : start_ptr += charlen_to_bytelen(start_ptr, from_sub_text_len);
3747 :
3748 6 : curr_posn = text_position_next(start_posn, &state);
3749 : }
3750 6 : while (curr_posn > 0);
3751 :
3752 : /* copy trailing data */
3753 4 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
3754 4 : appendBinaryStringInfo(&str, start_ptr, chunk_len);
3755 :
3756 4 : text_position_cleanup(&state);
3757 :
3758 4 : ret_text = cstring_to_text_with_len(str.data, str.len);
3759 4 : pfree(str.data);
3760 :
3761 4 : PG_RETURN_TEXT_P(ret_text);
3762 : }
3763 :
3764 : /*
3765 : * check_replace_text_has_escape_char
3766 : *
3767 : * check whether replace_text contains escape char.
3768 : */
3769 : static bool
3770 25 : check_replace_text_has_escape_char(const text *replace_text)
3771 : {
3772 25 : const char *p = VARDATA_ANY(replace_text);
3773 25 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3774 :
3775 25 : if (pg_database_encoding_max_length() == 1)
3776 : {
3777 0 : for (; p < p_end; p++)
3778 : {
3779 0 : if (*p == '\\')
3780 0 : return true;
3781 : }
3782 : }
3783 : else
3784 : {
3785 161 : for (; p < p_end; p += pg_mblen(p))
3786 : {
3787 145 : if (*p == '\\')
3788 9 : return true;
3789 : }
3790 : }
3791 :
3792 16 : return false;
3793 : }
3794 :
3795 : /*
3796 : * appendStringInfoRegexpSubstr
3797 : *
3798 : * Append replace_text to str, substituting regexp back references for
3799 : * \n escapes. start_ptr is the start of the match in the source string,
3800 : * at logical character position data_pos.
3801 : */
3802 : static void
3803 4 : appendStringInfoRegexpSubstr(StringInfo str, text *replace_text,
3804 : regmatch_t *pmatch,
3805 : char *start_ptr, int data_pos)
3806 : {
3807 4 : const char *p = VARDATA_ANY(replace_text);
3808 4 : const char *p_end = p + VARSIZE_ANY_EXHDR(replace_text);
3809 4 : int eml = pg_database_encoding_max_length();
3810 :
3811 : for (;;)
3812 : {
3813 13 : const char *chunk_start = p;
3814 : int so;
3815 : int eo;
3816 :
3817 : /* Find next escape char. */
3818 13 : if (eml == 1)
3819 : {
3820 0 : for (; p < p_end && *p != '\\'; p++)
3821 : /* nothing */ ;
3822 : }
3823 : else
3824 : {
3825 13 : for (; p < p_end && *p != '\\'; p += pg_mblen(p))
3826 : /* nothing */ ;
3827 : }
3828 :
3829 : /* Copy the text we just scanned over, if any. */
3830 13 : if (p > chunk_start)
3831 9 : appendBinaryStringInfo(str, chunk_start, p - chunk_start);
3832 :
3833 : /* Done if at end of string, else advance over escape char. */
3834 13 : if (p >= p_end)
3835 4 : break;
3836 9 : p++;
3837 :
3838 9 : if (p >= p_end)
3839 : {
3840 : /* Escape at very end of input. Treat same as unexpected char */
3841 0 : appendStringInfoChar(str, '\\');
3842 0 : break;
3843 : }
3844 :
3845 9 : if (*p >= '1' && *p <= '9')
3846 9 : {
3847 : /* Use the back reference of regexp. */
3848 9 : int idx = *p - '0';
3849 :
3850 9 : so = pmatch[idx].rm_so;
3851 9 : eo = pmatch[idx].rm_eo;
3852 9 : p++;
3853 : }
3854 0 : else if (*p == '&')
3855 : {
3856 : /* Use the entire matched string. */
3857 0 : so = pmatch[0].rm_so;
3858 0 : eo = pmatch[0].rm_eo;
3859 0 : p++;
3860 : }
3861 0 : else if (*p == '\\')
3862 : {
3863 : /* \\ means transfer one \ to output. */
3864 0 : appendStringInfoChar(str, '\\');
3865 0 : p++;
3866 0 : continue;
3867 : }
3868 : else
3869 : {
3870 : /*
3871 : * If escape char is not followed by any expected char, just treat
3872 : * it as ordinary data to copy. (XXX would it be better to throw
3873 : * an error?)
3874 : */
3875 0 : appendStringInfoChar(str, '\\');
3876 0 : continue;
3877 : }
3878 :
3879 9 : if (so != -1 && eo != -1)
3880 : {
3881 : /*
3882 : * Copy the text that is back reference of regexp. Note so and eo
3883 : * are counted in characters not bytes.
3884 : */
3885 : char *chunk_start;
3886 : int chunk_len;
3887 :
3888 9 : Assert(so >= data_pos);
3889 9 : chunk_start = start_ptr;
3890 9 : chunk_start += charlen_to_bytelen(chunk_start, so - data_pos);
3891 9 : chunk_len = charlen_to_bytelen(chunk_start, eo - so);
3892 9 : appendBinaryStringInfo(str, chunk_start, chunk_len);
3893 : }
3894 9 : }
3895 4 : }
3896 :
3897 : #define REGEXP_REPLACE_BACKREF_CNT 10
3898 :
3899 : /*
3900 : * replace_text_regexp
3901 : *
3902 : * replace text that matches to regexp in src_text to replace_text.
3903 : *
3904 : * Note: to avoid having to include regex.h in builtins.h, we declare
3905 : * the regexp argument as void *, but really it's regex_t *.
3906 : */
3907 : text *
3908 25 : replace_text_regexp(text *src_text, void *regexp,
3909 : text *replace_text, bool glob)
3910 : {
3911 : text *ret_text;
3912 25 : regex_t *re = (regex_t *) regexp;
3913 25 : int src_text_len = VARSIZE_ANY_EXHDR(src_text);
3914 : StringInfoData buf;
3915 : regmatch_t pmatch[REGEXP_REPLACE_BACKREF_CNT];
3916 : pg_wchar *data;
3917 : size_t data_len;
3918 : int search_start;
3919 : int data_pos;
3920 : char *start_ptr;
3921 : bool have_escape;
3922 :
3923 25 : initStringInfo(&buf);
3924 :
3925 : /* Convert data string to wide characters. */
3926 25 : data = (pg_wchar *) palloc((src_text_len + 1) * sizeof(pg_wchar));
3927 25 : data_len = pg_mb2wchar_with_len(VARDATA_ANY(src_text), data, src_text_len);
3928 :
3929 : /* Check whether replace_text has escape char. */
3930 25 : have_escape = check_replace_text_has_escape_char(replace_text);
3931 :
3932 : /* start_ptr points to the data_pos'th character of src_text */
3933 25 : start_ptr = (char *) VARDATA_ANY(src_text);
3934 25 : data_pos = 0;
3935 :
3936 25 : search_start = 0;
3937 57 : while (search_start <= data_len)
3938 : {
3939 : int regexec_result;
3940 :
3941 31 : CHECK_FOR_INTERRUPTS();
3942 :
3943 31 : regexec_result = pg_regexec(re,
3944 : data,
3945 : data_len,
3946 : search_start,
3947 : NULL, /* no details */
3948 : REGEXP_REPLACE_BACKREF_CNT,
3949 : pmatch,
3950 : 0);
3951 :
3952 31 : if (regexec_result == REG_NOMATCH)
3953 19 : break;
3954 :
3955 12 : if (regexec_result != REG_OKAY)
3956 : {
3957 : char errMsg[100];
3958 :
3959 0 : CHECK_FOR_INTERRUPTS();
3960 0 : pg_regerror(regexec_result, re, errMsg, sizeof(errMsg));
3961 0 : ereport(ERROR,
3962 : (errcode(ERRCODE_INVALID_REGULAR_EXPRESSION),
3963 : errmsg("regular expression failed: %s", errMsg)));
3964 : }
3965 :
3966 : /*
3967 : * Copy the text to the left of the match position. Note we are given
3968 : * character not byte indexes.
3969 : */
3970 12 : if (pmatch[0].rm_so - data_pos > 0)
3971 : {
3972 : int chunk_len;
3973 :
3974 6 : chunk_len = charlen_to_bytelen(start_ptr,
3975 6 : pmatch[0].rm_so - data_pos);
3976 6 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
3977 :
3978 : /*
3979 : * Advance start_ptr over that text, to avoid multiple rescans of
3980 : * it if the replace_text contains multiple back-references.
3981 : */
3982 6 : start_ptr += chunk_len;
3983 6 : data_pos = pmatch[0].rm_so;
3984 : }
3985 :
3986 : /*
3987 : * Copy the replace_text. Process back references when the
3988 : * replace_text has escape characters.
3989 : */
3990 12 : if (have_escape)
3991 4 : appendStringInfoRegexpSubstr(&buf, replace_text, pmatch,
3992 : start_ptr, data_pos);
3993 : else
3994 8 : appendStringInfoText(&buf, replace_text);
3995 :
3996 : /* Advance start_ptr and data_pos over the matched text. */
3997 12 : start_ptr += charlen_to_bytelen(start_ptr,
3998 12 : pmatch[0].rm_eo - data_pos);
3999 12 : data_pos = pmatch[0].rm_eo;
4000 :
4001 : /*
4002 : * When global option is off, replace the first instance only.
4003 : */
4004 12 : if (!glob)
4005 5 : break;
4006 :
4007 : /*
4008 : * Advance search position. Normally we start the next search at the
4009 : * end of the previous match; but if the match was of zero length, we
4010 : * have to advance by one character, or we'd just find the same match
4011 : * again.
4012 : */
4013 7 : search_start = data_pos;
4014 7 : if (pmatch[0].rm_so == pmatch[0].rm_eo)
4015 2 : search_start++;
4016 : }
4017 :
4018 : /*
4019 : * Copy the text to the right of the last match.
4020 : */
4021 25 : if (data_pos < data_len)
4022 : {
4023 : int chunk_len;
4024 :
4025 17 : chunk_len = ((char *) src_text + VARSIZE_ANY(src_text)) - start_ptr;
4026 17 : appendBinaryStringInfo(&buf, start_ptr, chunk_len);
4027 : }
4028 :
4029 25 : ret_text = cstring_to_text_with_len(buf.data, buf.len);
4030 25 : pfree(buf.data);
4031 25 : pfree(data);
4032 :
4033 25 : return ret_text;
4034 : }
4035 :
4036 : /*
4037 : * split_text
4038 : * parse input string
4039 : * return ord item (1 based)
4040 : * based on provided field separator
4041 : */
4042 : Datum
4043 5 : split_text(PG_FUNCTION_ARGS)
4044 : {
4045 5 : text *inputstring = PG_GETARG_TEXT_PP(0);
4046 5 : text *fldsep = PG_GETARG_TEXT_PP(1);
4047 5 : int fldnum = PG_GETARG_INT32(2);
4048 : int inputstring_len;
4049 : int fldsep_len;
4050 : TextPositionState state;
4051 : int start_posn;
4052 : int end_posn;
4053 : text *result_text;
4054 :
4055 : /* field number is 1 based */
4056 5 : if (fldnum < 1)
4057 1 : ereport(ERROR,
4058 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4059 : errmsg("field position must be greater than zero")));
4060 :
4061 4 : text_position_setup(inputstring, fldsep, &state);
4062 :
4063 : /*
4064 : * Note: we check the converted string length, not the original, because
4065 : * they could be different if the input contained invalid encoding.
4066 : */
4067 4 : inputstring_len = state.len1;
4068 4 : fldsep_len = state.len2;
4069 :
4070 : /* return empty string for empty input string */
4071 4 : if (inputstring_len < 1)
4072 : {
4073 0 : text_position_cleanup(&state);
4074 0 : PG_RETURN_TEXT_P(cstring_to_text(""));
4075 : }
4076 :
4077 : /* empty field separator */
4078 4 : if (fldsep_len < 1)
4079 : {
4080 0 : text_position_cleanup(&state);
4081 : /* if first field, return input string, else empty string */
4082 0 : if (fldnum == 1)
4083 0 : PG_RETURN_TEXT_P(inputstring);
4084 : else
4085 0 : PG_RETURN_TEXT_P(cstring_to_text(""));
4086 : }
4087 :
4088 : /* identify bounds of first field */
4089 4 : start_posn = 1;
4090 4 : end_posn = text_position_next(1, &state);
4091 :
4092 : /* special case if fldsep not found at all */
4093 4 : if (end_posn == 0)
4094 : {
4095 0 : text_position_cleanup(&state);
4096 : /* if field 1 requested, return input string, else empty string */
4097 0 : if (fldnum == 1)
4098 0 : PG_RETURN_TEXT_P(inputstring);
4099 : else
4100 0 : PG_RETURN_TEXT_P(cstring_to_text(""));
4101 : }
4102 :
4103 11 : while (end_posn > 0 && --fldnum > 0)
4104 : {
4105 : /* identify bounds of next field */
4106 3 : start_posn = end_posn + fldsep_len;
4107 3 : end_posn = text_position_next(start_posn, &state);
4108 : }
4109 :
4110 4 : text_position_cleanup(&state);
4111 :
4112 4 : if (fldnum > 0)
4113 : {
4114 : /* N'th field separator not found */
4115 : /* if last field requested, return it, else empty string */
4116 2 : if (fldnum == 1)
4117 1 : result_text = text_substring(PointerGetDatum(inputstring),
4118 : start_posn,
4119 : -1,
4120 : true);
4121 : else
4122 1 : result_text = cstring_to_text("");
4123 : }
4124 : else
4125 : {
4126 : /* non-last field requested */
4127 2 : result_text = text_substring(PointerGetDatum(inputstring),
4128 : start_posn,
4129 : end_posn - start_posn,
4130 : false);
4131 : }
4132 :
4133 4 : PG_RETURN_TEXT_P(result_text);
4134 : }
4135 :
4136 : /*
4137 : * Convenience function to return true when two text params are equal.
4138 : */
4139 : static bool
4140 14 : text_isequal(text *txt1, text *txt2)
4141 : {
4142 14 : return DatumGetBool(DirectFunctionCall2(texteq,
4143 : PointerGetDatum(txt1),
4144 : PointerGetDatum(txt2)));
4145 : }
4146 :
4147 : /*
4148 : * text_to_array
4149 : * parse input string and return text array of elements,
4150 : * based on provided field separator
4151 : */
4152 : Datum
4153 14 : text_to_array(PG_FUNCTION_ARGS)
4154 : {
4155 14 : return text_to_array_internal(fcinfo);
4156 : }
4157 :
4158 : /*
4159 : * text_to_array_null
4160 : * parse input string and return text array of elements,
4161 : * based on provided field separator and null string
4162 : *
4163 : * This is a separate entry point only to prevent the regression tests from
4164 : * complaining about different argument sets for the same internal function.
4165 : */
4166 : Datum
4167 4 : text_to_array_null(PG_FUNCTION_ARGS)
4168 : {
4169 4 : return text_to_array_internal(fcinfo);
4170 : }
4171 :
4172 : /*
4173 : * common code for text_to_array and text_to_array_null functions
4174 : *
4175 : * These are not strict so we have to test for null inputs explicitly.
4176 : */
4177 : static Datum
4178 18 : text_to_array_internal(PG_FUNCTION_ARGS)
4179 : {
4180 : text *inputstring;
4181 : text *fldsep;
4182 : text *null_string;
4183 : int inputstring_len;
4184 : int fldsep_len;
4185 : char *start_ptr;
4186 : text *result_text;
4187 : bool is_null;
4188 18 : ArrayBuildState *astate = NULL;
4189 :
4190 : /* when input string is NULL, then result is NULL too */
4191 18 : if (PG_ARGISNULL(0))
4192 1 : PG_RETURN_NULL();
4193 :
4194 17 : inputstring = PG_GETARG_TEXT_PP(0);
4195 :
4196 : /* fldsep can be NULL */
4197 17 : if (!PG_ARGISNULL(1))
4198 16 : fldsep = PG_GETARG_TEXT_PP(1);
4199 : else
4200 1 : fldsep = NULL;
4201 :
4202 : /* null_string can be NULL or omitted */
4203 17 : if (PG_NARGS() > 2 && !PG_ARGISNULL(2))
4204 4 : null_string = PG_GETARG_TEXT_PP(2);
4205 : else
4206 13 : null_string = NULL;
4207 :
4208 17 : if (fldsep != NULL)
4209 : {
4210 : /*
4211 : * Normal case with non-null fldsep. Use the text_position machinery
4212 : * to search for occurrences of fldsep.
4213 : */
4214 : TextPositionState state;
4215 : int fldnum;
4216 : int start_posn;
4217 : int end_posn;
4218 : int chunk_len;
4219 :
4220 16 : text_position_setup(inputstring, fldsep, &state);
4221 :
4222 : /*
4223 : * Note: we check the converted string length, not the original,
4224 : * because they could be different if the input contained invalid
4225 : * encoding.
4226 : */
4227 16 : inputstring_len = state.len1;
4228 16 : fldsep_len = state.len2;
4229 :
4230 : /* return empty array for empty input string */
4231 16 : if (inputstring_len < 1)
4232 : {
4233 1 : text_position_cleanup(&state);
4234 7 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4235 : }
4236 :
4237 : /*
4238 : * empty field separator: return the input string as a one-element
4239 : * array
4240 : */
4241 15 : if (fldsep_len < 1)
4242 : {
4243 : Datum elems[1];
4244 : bool nulls[1];
4245 : int dims[1];
4246 : int lbs[1];
4247 :
4248 5 : text_position_cleanup(&state);
4249 : /* single element can be a NULL too */
4250 5 : is_null = null_string ? text_isequal(inputstring, null_string) : false;
4251 :
4252 5 : elems[0] = PointerGetDatum(inputstring);
4253 5 : nulls[0] = is_null;
4254 5 : dims[0] = 1;
4255 5 : lbs[0] = 1;
4256 : /* XXX: this hardcodes assumptions about the text type */
4257 5 : PG_RETURN_ARRAYTYPE_P(construct_md_array(elems, nulls,
4258 : 1, dims, lbs,
4259 : TEXTOID, -1, false, 'i'));
4260 : }
4261 :
4262 10 : start_posn = 1;
4263 : /* start_ptr points to the start_posn'th character of inputstring */
4264 10 : start_ptr = VARDATA_ANY(inputstring);
4265 :
4266 62 : for (fldnum = 1;; fldnum++) /* field number is 1 based */
4267 : {
4268 62 : CHECK_FOR_INTERRUPTS();
4269 :
4270 62 : end_posn = text_position_next(start_posn, &state);
4271 :
4272 62 : if (end_posn == 0)
4273 : {
4274 : /* fetch last field */
4275 10 : chunk_len = ((char *) inputstring + VARSIZE_ANY(inputstring)) - start_ptr;
4276 : }
4277 : else
4278 : {
4279 : /* fetch non-last field */
4280 52 : chunk_len = charlen_to_bytelen(start_ptr, end_posn - start_posn);
4281 : }
4282 :
4283 : /* must build a temp text datum to pass to accumArrayResult */
4284 62 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4285 62 : is_null = null_string ? text_isequal(result_text, null_string) : false;
4286 :
4287 : /* stash away this field */
4288 62 : astate = accumArrayResult(astate,
4289 : PointerGetDatum(result_text),
4290 : is_null,
4291 : TEXTOID,
4292 : CurrentMemoryContext);
4293 :
4294 62 : pfree(result_text);
4295 :
4296 62 : if (end_posn == 0)
4297 10 : break;
4298 :
4299 52 : start_posn = end_posn;
4300 52 : start_ptr += chunk_len;
4301 52 : start_posn += fldsep_len;
4302 52 : start_ptr += charlen_to_bytelen(start_ptr, fldsep_len);
4303 52 : }
4304 :
4305 10 : text_position_cleanup(&state);
4306 : }
4307 : else
4308 : {
4309 : /*
4310 : * When fldsep is NULL, each character in the inputstring becomes an
4311 : * element in the result array. The separator is effectively the
4312 : * space between characters.
4313 : */
4314 1 : inputstring_len = VARSIZE_ANY_EXHDR(inputstring);
4315 :
4316 : /* return empty array for empty input string */
4317 1 : if (inputstring_len < 1)
4318 0 : PG_RETURN_ARRAYTYPE_P(construct_empty_array(TEXTOID));
4319 :
4320 1 : start_ptr = VARDATA_ANY(inputstring);
4321 :
4322 7 : while (inputstring_len > 0)
4323 : {
4324 5 : int chunk_len = pg_mblen(start_ptr);
4325 :
4326 5 : CHECK_FOR_INTERRUPTS();
4327 :
4328 : /* must build a temp text datum to pass to accumArrayResult */
4329 5 : result_text = cstring_to_text_with_len(start_ptr, chunk_len);
4330 5 : is_null = null_string ? text_isequal(result_text, null_string) : false;
4331 :
4332 : /* stash away this field */
4333 5 : astate = accumArrayResult(astate,
4334 : PointerGetDatum(result_text),
4335 : is_null,
4336 : TEXTOID,
4337 : CurrentMemoryContext);
4338 :
4339 5 : pfree(result_text);
4340 :
4341 5 : start_ptr += chunk_len;
4342 5 : inputstring_len -= chunk_len;
4343 : }
4344 : }
4345 :
4346 11 : PG_RETURN_ARRAYTYPE_P(makeArrayResult(astate,
4347 : CurrentMemoryContext));
4348 : }
4349 :
4350 : /*
4351 : * array_to_text
4352 : * concatenate Cstring representation of input array elements
4353 : * using provided field separator
4354 : */
4355 : Datum
4356 428 : array_to_text(PG_FUNCTION_ARGS)
4357 : {
4358 428 : ArrayType *v = PG_GETARG_ARRAYTYPE_P(0);
4359 428 : char *fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4360 :
4361 428 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, NULL));
4362 : }
4363 :
4364 : /*
4365 : * array_to_text_null
4366 : * concatenate Cstring representation of input array elements
4367 : * using provided field separator and null string
4368 : *
4369 : * This version is not strict so we have to test for null inputs explicitly.
4370 : */
4371 : Datum
4372 2 : array_to_text_null(PG_FUNCTION_ARGS)
4373 : {
4374 : ArrayType *v;
4375 : char *fldsep;
4376 : char *null_string;
4377 :
4378 : /* returns NULL when first or second parameter is NULL */
4379 2 : if (PG_ARGISNULL(0) || PG_ARGISNULL(1))
4380 0 : PG_RETURN_NULL();
4381 :
4382 2 : v = PG_GETARG_ARRAYTYPE_P(0);
4383 2 : fldsep = text_to_cstring(PG_GETARG_TEXT_PP(1));
4384 :
4385 : /* NULL null string is passed through as a null pointer */
4386 2 : if (!PG_ARGISNULL(2))
4387 1 : null_string = text_to_cstring(PG_GETARG_TEXT_PP(2));
4388 : else
4389 1 : null_string = NULL;
4390 :
4391 2 : PG_RETURN_TEXT_P(array_to_text_internal(fcinfo, v, fldsep, null_string));
4392 : }
4393 :
4394 : /*
4395 : * common code for array_to_text and array_to_text_null functions
4396 : */
4397 : static text *
4398 433 : array_to_text_internal(FunctionCallInfo fcinfo, ArrayType *v,
4399 : const char *fldsep, const char *null_string)
4400 : {
4401 : text *result;
4402 : int nitems,
4403 : *dims,
4404 : ndims;
4405 : Oid element_type;
4406 : int typlen;
4407 : bool typbyval;
4408 : char typalign;
4409 : StringInfoData buf;
4410 433 : bool printed = false;
4411 : char *p;
4412 : bits8 *bitmap;
4413 : int bitmask;
4414 : int i;
4415 : ArrayMetaState *my_extra;
4416 :
4417 433 : ndims = ARR_NDIM(v);
4418 433 : dims = ARR_DIMS(v);
4419 433 : nitems = ArrayGetNItems(ndims, dims);
4420 :
4421 : /* if there are no elements, return an empty string */
4422 433 : if (nitems == 0)
4423 160 : return cstring_to_text_with_len("", 0);
4424 :
4425 273 : element_type = ARR_ELEMTYPE(v);
4426 273 : initStringInfo(&buf);
4427 :
4428 : /*
4429 : * We arrange to look up info about element type, including its output
4430 : * conversion proc, only once per series of calls, assuming the element
4431 : * type doesn't change underneath us.
4432 : */
4433 273 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4434 273 : if (my_extra == NULL)
4435 : {
4436 144 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4437 : sizeof(ArrayMetaState));
4438 144 : my_extra = (ArrayMetaState *) fcinfo->flinfo->fn_extra;
4439 144 : my_extra->element_type = ~element_type;
4440 : }
4441 :
4442 273 : if (my_extra->element_type != element_type)
4443 : {
4444 : /*
4445 : * Get info about element type, including its output conversion proc
4446 : */
4447 144 : get_type_io_data(element_type, IOFunc_output,
4448 : &my_extra->typlen, &my_extra->typbyval,
4449 : &my_extra->typalign, &my_extra->typdelim,
4450 : &my_extra->typioparam, &my_extra->typiofunc);
4451 144 : fmgr_info_cxt(my_extra->typiofunc, &my_extra->proc,
4452 144 : fcinfo->flinfo->fn_mcxt);
4453 144 : my_extra->element_type = element_type;
4454 : }
4455 273 : typlen = my_extra->typlen;
4456 273 : typbyval = my_extra->typbyval;
4457 273 : typalign = my_extra->typalign;
4458 :
4459 273 : p = ARR_DATA_PTR(v);
4460 273 : bitmap = ARR_NULLBITMAP(v);
4461 273 : bitmask = 1;
4462 :
4463 1280 : for (i = 0; i < nitems; i++)
4464 : {
4465 : Datum itemvalue;
4466 : char *value;
4467 :
4468 : /* Get source element, checking for NULL */
4469 1007 : if (bitmap && (*bitmap & bitmask) == 0)
4470 : {
4471 : /* if null_string is NULL, we just ignore null elements */
4472 6 : if (null_string != NULL)
4473 : {
4474 1 : if (printed)
4475 1 : appendStringInfo(&buf, "%s%s", fldsep, null_string);
4476 : else
4477 0 : appendStringInfoString(&buf, null_string);
4478 1 : printed = true;
4479 : }
4480 : }
4481 : else
4482 : {
4483 1004 : itemvalue = fetch_att(p, typbyval, typlen);
4484 :
4485 1004 : value = OutputFunctionCall(&my_extra->proc, itemvalue);
4486 :
4487 1004 : if (printed)
4488 731 : appendStringInfo(&buf, "%s%s", fldsep, value);
4489 : else
4490 273 : appendStringInfoString(&buf, value);
4491 1004 : printed = true;
4492 :
4493 1004 : p = att_addlength_pointer(p, typlen, p);
4494 1004 : p = (char *) att_align_nominal(p, typalign);
4495 : }
4496 :
4497 : /* advance bitmap pointer if any */
4498 1007 : if (bitmap)
4499 : {
4500 18 : bitmask <<= 1;
4501 18 : if (bitmask == 0x100)
4502 : {
4503 0 : bitmap++;
4504 0 : bitmask = 1;
4505 : }
4506 : }
4507 : }
4508 :
4509 273 : result = cstring_to_text_with_len(buf.data, buf.len);
4510 273 : pfree(buf.data);
4511 :
4512 273 : return result;
4513 : }
4514 :
4515 : #define HEXBASE 16
4516 : /*
4517 : * Convert an int32 to a string containing a base 16 (hex) representation of
4518 : * the number.
4519 : */
4520 : Datum
4521 331 : to_hex32(PG_FUNCTION_ARGS)
4522 : {
4523 331 : uint32 value = (uint32) PG_GETARG_INT32(0);
4524 : char *ptr;
4525 331 : const char *digits = "0123456789abcdef";
4526 : char buf[32]; /* bigger than needed, but reasonable */
4527 :
4528 331 : ptr = buf + sizeof(buf) - 1;
4529 331 : *ptr = '\0';
4530 :
4531 : do
4532 : {
4533 618 : *--ptr = digits[value % HEXBASE];
4534 618 : value /= HEXBASE;
4535 618 : } while (ptr > buf && value);
4536 :
4537 331 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
4538 : }
4539 :
4540 : /*
4541 : * Convert an int64 to a string containing a base 16 (hex) representation of
4542 : * the number.
4543 : */
4544 : Datum
4545 1 : to_hex64(PG_FUNCTION_ARGS)
4546 : {
4547 1 : uint64 value = (uint64) PG_GETARG_INT64(0);
4548 : char *ptr;
4549 1 : const char *digits = "0123456789abcdef";
4550 : char buf[32]; /* bigger than needed, but reasonable */
4551 :
4552 1 : ptr = buf + sizeof(buf) - 1;
4553 1 : *ptr = '\0';
4554 :
4555 : do
4556 : {
4557 8 : *--ptr = digits[value % HEXBASE];
4558 8 : value /= HEXBASE;
4559 8 : } while (ptr > buf && value);
4560 :
4561 1 : PG_RETURN_TEXT_P(cstring_to_text(ptr));
4562 : }
4563 :
4564 : /*
4565 : * Create an md5 hash of a text string and return it as hex
4566 : *
4567 : * md5 produces a 16 byte (128 bit) hash; double it for hex
4568 : */
4569 : #define MD5_HASH_LEN 32
4570 :
4571 : Datum
4572 188 : md5_text(PG_FUNCTION_ARGS)
4573 : {
4574 188 : text *in_text = PG_GETARG_TEXT_PP(0);
4575 : size_t len;
4576 : char hexsum[MD5_HASH_LEN + 1];
4577 :
4578 : /* Calculate the length of the buffer using varlena metadata */
4579 188 : len = VARSIZE_ANY_EXHDR(in_text);
4580 :
4581 : /* get the hash result */
4582 188 : if (pg_md5_hash(VARDATA_ANY(in_text), len, hexsum) == false)
4583 0 : ereport(ERROR,
4584 : (errcode(ERRCODE_OUT_OF_MEMORY),
4585 : errmsg("out of memory")));
4586 :
4587 : /* convert to text and return it */
4588 188 : PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4589 : }
4590 :
4591 : /*
4592 : * Create an md5 hash of a bytea field and return it as a hex string:
4593 : * 16-byte md5 digest is represented in 32 hex characters.
4594 : */
4595 : Datum
4596 9 : md5_bytea(PG_FUNCTION_ARGS)
4597 : {
4598 9 : bytea *in = PG_GETARG_BYTEA_PP(0);
4599 : size_t len;
4600 : char hexsum[MD5_HASH_LEN + 1];
4601 :
4602 9 : len = VARSIZE_ANY_EXHDR(in);
4603 9 : if (pg_md5_hash(VARDATA_ANY(in), len, hexsum) == false)
4604 0 : ereport(ERROR,
4605 : (errcode(ERRCODE_OUT_OF_MEMORY),
4606 : errmsg("out of memory")));
4607 :
4608 9 : PG_RETURN_TEXT_P(cstring_to_text(hexsum));
4609 : }
4610 :
4611 : /*
4612 : * Return the size of a datum, possibly compressed
4613 : *
4614 : * Works on any data type
4615 : */
4616 : Datum
4617 10 : pg_column_size(PG_FUNCTION_ARGS)
4618 : {
4619 10 : Datum value = PG_GETARG_DATUM(0);
4620 : int32 result;
4621 : int typlen;
4622 :
4623 : /* On first call, get the input type's typlen, and save at *fn_extra */
4624 10 : if (fcinfo->flinfo->fn_extra == NULL)
4625 : {
4626 : /* Lookup the datatype of the supplied argument */
4627 10 : Oid argtypeid = get_fn_expr_argtype(fcinfo->flinfo, 0);
4628 :
4629 10 : typlen = get_typlen(argtypeid);
4630 10 : if (typlen == 0) /* should not happen */
4631 0 : elog(ERROR, "cache lookup failed for type %u", argtypeid);
4632 :
4633 10 : fcinfo->flinfo->fn_extra = MemoryContextAlloc(fcinfo->flinfo->fn_mcxt,
4634 : sizeof(int));
4635 10 : *((int *) fcinfo->flinfo->fn_extra) = typlen;
4636 : }
4637 : else
4638 0 : typlen = *((int *) fcinfo->flinfo->fn_extra);
4639 :
4640 10 : if (typlen == -1)
4641 : {
4642 : /* varlena type, possibly toasted */
4643 10 : result = toast_datum_size(value);
4644 : }
4645 0 : else if (typlen == -2)
4646 : {
4647 : /* cstring */
4648 0 : result = strlen(DatumGetCString(value)) + 1;
4649 : }
4650 : else
4651 : {
4652 : /* ordinary fixed-width type */
4653 0 : result = typlen;
4654 : }
4655 :
4656 10 : PG_RETURN_INT32(result);
4657 : }
4658 :
4659 : /*
4660 : * string_agg - Concatenates values and returns string.
4661 : *
4662 : * Syntax: string_agg(value text, delimiter text) RETURNS text
4663 : *
4664 : * Note: Any NULL values are ignored. The first-call delimiter isn't
4665 : * actually used at all, and on subsequent calls the delimiter precedes
4666 : * the associated value.
4667 : */
4668 :
4669 : /* subroutine to initialize state */
4670 : static StringInfo
4671 111 : makeStringAggState(FunctionCallInfo fcinfo)
4672 : {
4673 : StringInfo state;
4674 : MemoryContext aggcontext;
4675 : MemoryContext oldcontext;
4676 :
4677 111 : if (!AggCheckCallContext(fcinfo, &aggcontext))
4678 : {
4679 : /* cannot be called directly because of internal-type argument */
4680 0 : elog(ERROR, "string_agg_transfn called in non-aggregate context");
4681 : }
4682 :
4683 : /*
4684 : * Create state in aggregate context. It'll stay there across subsequent
4685 : * calls.
4686 : */
4687 111 : oldcontext = MemoryContextSwitchTo(aggcontext);
4688 111 : state = makeStringInfo();
4689 111 : MemoryContextSwitchTo(oldcontext);
4690 :
4691 111 : return state;
4692 : }
4693 :
4694 : Datum
4695 10449 : string_agg_transfn(PG_FUNCTION_ARGS)
4696 : {
4697 : StringInfo state;
4698 :
4699 10449 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4700 :
4701 : /* Append the value unless null. */
4702 10449 : if (!PG_ARGISNULL(1))
4703 : {
4704 : /* On the first time through, we ignore the delimiter. */
4705 10441 : if (state == NULL)
4706 107 : state = makeStringAggState(fcinfo);
4707 10334 : else if (!PG_ARGISNULL(2))
4708 10334 : appendStringInfoText(state, PG_GETARG_TEXT_PP(2)); /* delimiter */
4709 :
4710 10441 : appendStringInfoText(state, PG_GETARG_TEXT_PP(1)); /* value */
4711 : }
4712 :
4713 : /*
4714 : * The transition type for string_agg() is declared to be "internal",
4715 : * which is a pass-by-value type the same size as a pointer.
4716 : */
4717 10449 : PG_RETURN_POINTER(state);
4718 : }
4719 :
4720 : Datum
4721 114 : string_agg_finalfn(PG_FUNCTION_ARGS)
4722 : {
4723 : StringInfo state;
4724 :
4725 : /* cannot be called directly because of internal-type argument */
4726 114 : Assert(AggCheckCallContext(fcinfo, NULL));
4727 :
4728 114 : state = PG_ARGISNULL(0) ? NULL : (StringInfo) PG_GETARG_POINTER(0);
4729 :
4730 114 : if (state != NULL)
4731 107 : PG_RETURN_TEXT_P(cstring_to_text_with_len(state->data, state->len));
4732 : else
4733 7 : PG_RETURN_NULL();
4734 : }
4735 :
4736 : /*
4737 : * Implementation of both concat() and concat_ws().
4738 : *
4739 : * sepstr is the separator string to place between values.
4740 : * argidx identifies the first argument to concatenate (counting from zero).
4741 : * Returns NULL if result should be NULL, else text value.
4742 : */
4743 : static text *
4744 11 : concat_internal(const char *sepstr, int argidx,
4745 : FunctionCallInfo fcinfo)
4746 : {
4747 : text *result;
4748 : StringInfoData str;
4749 11 : bool first_arg = true;
4750 : int i;
4751 :
4752 : /*
4753 : * concat(VARIADIC some-array) is essentially equivalent to
4754 : * array_to_text(), ie concat the array elements with the given separator.
4755 : * So we just pass the case off to that code.
4756 : */
4757 11 : if (get_fn_expr_variadic(fcinfo->flinfo))
4758 : {
4759 : ArrayType *arr;
4760 :
4761 : /* Should have just the one argument */
4762 5 : Assert(argidx == PG_NARGS() - 1);
4763 :
4764 : /* concat(VARIADIC NULL) is defined as NULL */
4765 5 : if (PG_ARGISNULL(argidx))
4766 2 : return NULL;
4767 :
4768 : /*
4769 : * Non-null argument had better be an array. We assume that any call
4770 : * context that could let get_fn_expr_variadic return true will have
4771 : * checked that a VARIADIC-labeled parameter actually is an array. So
4772 : * it should be okay to just Assert that it's an array rather than
4773 : * doing a full-fledged error check.
4774 : */
4775 3 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, argidx))));
4776 :
4777 : /* OK, safe to fetch the array value */
4778 3 : arr = PG_GETARG_ARRAYTYPE_P(argidx);
4779 :
4780 : /*
4781 : * And serialize the array. We tell array_to_text to ignore null
4782 : * elements, which matches the behavior of the loop below.
4783 : */
4784 3 : return array_to_text_internal(fcinfo, arr, sepstr, NULL);
4785 : }
4786 :
4787 : /* Normal case without explicit VARIADIC marker */
4788 6 : initStringInfo(&str);
4789 :
4790 30 : for (i = argidx; i < PG_NARGS(); i++)
4791 : {
4792 24 : if (!PG_ARGISNULL(i))
4793 : {
4794 22 : Datum value = PG_GETARG_DATUM(i);
4795 : Oid valtype;
4796 : Oid typOutput;
4797 : bool typIsVarlena;
4798 :
4799 : /* add separator if appropriate */
4800 22 : if (first_arg)
4801 6 : first_arg = false;
4802 : else
4803 16 : appendStringInfoString(&str, sepstr);
4804 :
4805 : /* call the appropriate type output function, append the result */
4806 22 : valtype = get_fn_expr_argtype(fcinfo->flinfo, i);
4807 22 : if (!OidIsValid(valtype))
4808 0 : elog(ERROR, "could not determine data type of concat() input");
4809 22 : getTypeOutputInfo(valtype, &typOutput, &typIsVarlena);
4810 44 : appendStringInfoString(&str,
4811 22 : OidOutputFunctionCall(typOutput, value));
4812 : }
4813 : }
4814 :
4815 6 : result = cstring_to_text_with_len(str.data, str.len);
4816 6 : pfree(str.data);
4817 :
4818 6 : return result;
4819 : }
4820 :
4821 : /*
4822 : * Concatenate all arguments. NULL arguments are ignored.
4823 : */
4824 : Datum
4825 5 : text_concat(PG_FUNCTION_ARGS)
4826 : {
4827 : text *result;
4828 :
4829 5 : result = concat_internal("", 0, fcinfo);
4830 5 : if (result == NULL)
4831 1 : PG_RETURN_NULL();
4832 4 : PG_RETURN_TEXT_P(result);
4833 : }
4834 :
4835 : /*
4836 : * Concatenate all but first argument value with separators. The first
4837 : * parameter is used as the separator. NULL arguments are ignored.
4838 : */
4839 : Datum
4840 7 : text_concat_ws(PG_FUNCTION_ARGS)
4841 : {
4842 : char *sep;
4843 : text *result;
4844 :
4845 : /* return NULL when separator is NULL */
4846 7 : if (PG_ARGISNULL(0))
4847 1 : PG_RETURN_NULL();
4848 6 : sep = text_to_cstring(PG_GETARG_TEXT_PP(0));
4849 :
4850 6 : result = concat_internal(sep, 1, fcinfo);
4851 6 : if (result == NULL)
4852 1 : PG_RETURN_NULL();
4853 5 : PG_RETURN_TEXT_P(result);
4854 : }
4855 :
4856 : /*
4857 : * Return first n characters in the string. When n is negative,
4858 : * return all but last |n| characters.
4859 : */
4860 : Datum
4861 11 : text_left(PG_FUNCTION_ARGS)
4862 : {
4863 11 : text *str = PG_GETARG_TEXT_PP(0);
4864 11 : const char *p = VARDATA_ANY(str);
4865 11 : int len = VARSIZE_ANY_EXHDR(str);
4866 11 : int n = PG_GETARG_INT32(1);
4867 : int rlen;
4868 :
4869 11 : if (n < 0)
4870 5 : n = pg_mbstrlen_with_len(p, len) + n;
4871 11 : rlen = pg_mbcharcliplen(p, len, n);
4872 :
4873 11 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p, rlen));
4874 : }
4875 :
4876 : /*
4877 : * Return last n characters in the string. When n is negative,
4878 : * return all but first |n| characters.
4879 : */
4880 : Datum
4881 11 : text_right(PG_FUNCTION_ARGS)
4882 : {
4883 11 : text *str = PG_GETARG_TEXT_PP(0);
4884 11 : const char *p = VARDATA_ANY(str);
4885 11 : int len = VARSIZE_ANY_EXHDR(str);
4886 11 : int n = PG_GETARG_INT32(1);
4887 : int off;
4888 :
4889 11 : if (n < 0)
4890 5 : n = -n;
4891 : else
4892 6 : n = pg_mbstrlen_with_len(p, len) - n;
4893 11 : off = pg_mbcharcliplen(p, len, n);
4894 :
4895 11 : PG_RETURN_TEXT_P(cstring_to_text_with_len(p + off, len - off));
4896 : }
4897 :
4898 : /*
4899 : * Return reversed string
4900 : */
4901 : Datum
4902 1 : text_reverse(PG_FUNCTION_ARGS)
4903 : {
4904 1 : text *str = PG_GETARG_TEXT_PP(0);
4905 1 : const char *p = VARDATA_ANY(str);
4906 1 : int len = VARSIZE_ANY_EXHDR(str);
4907 1 : const char *endp = p + len;
4908 : text *result;
4909 : char *dst;
4910 :
4911 1 : result = palloc(len + VARHDRSZ);
4912 1 : dst = (char *) VARDATA(result) + len;
4913 1 : SET_VARSIZE(result, len + VARHDRSZ);
4914 :
4915 1 : if (pg_database_encoding_max_length() > 1)
4916 : {
4917 : /* multibyte version */
4918 7 : while (p < endp)
4919 : {
4920 : int sz;
4921 :
4922 5 : sz = pg_mblen(p);
4923 5 : dst -= sz;
4924 5 : memcpy(dst, p, sz);
4925 5 : p += sz;
4926 : }
4927 : }
4928 : else
4929 : {
4930 : /* single byte version */
4931 0 : while (p < endp)
4932 0 : *(--dst) = *p++;
4933 : }
4934 :
4935 1 : PG_RETURN_TEXT_P(result);
4936 : }
4937 :
4938 :
/*
 * Support macros for text_format()
 */

/* Bit in the parsed "flags" word of a format specifier. */
#define TEXT_FORMAT_FLAG_MINUS	0x0001	/* is minus flag present? */

/*
 * ADVANCE_PARSE_POINTER
 *		Step "ptr" to the next byte of the format string, raising an
 *		"unterminated format() type specifier" error if that moves it to
 *		or past "end_ptr".
 *
 * "ptr" is modified in place (pre-incremented), so it must be an lvalue.
 * The do { ... } while (0) wrapper lets the macro be used as a single
 * statement.
 */
#define ADVANCE_PARSE_POINTER(ptr,end_ptr) \
	do { \
		if (++(ptr) >= (end_ptr)) \
			ereport(ERROR, \
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
					 errmsg("unterminated format() type specifier"), \
					 errhint("For a single \"%%\" use \"%%%%\"."))); \
	} while (0)
4952 :
4953 : /*
4954 : * Returns a formatted string
4955 : */
4956 : Datum
4957 1772 : text_format(PG_FUNCTION_ARGS)
4958 : {
4959 : text *fmt;
4960 : StringInfoData str;
4961 : const char *cp;
4962 : const char *start_ptr;
4963 : const char *end_ptr;
4964 : text *result;
4965 : int arg;
4966 : bool funcvariadic;
4967 : int nargs;
4968 1772 : Datum *elements = NULL;
4969 1772 : bool *nulls = NULL;
4970 1772 : Oid element_type = InvalidOid;
4971 1772 : Oid prev_type = InvalidOid;
4972 1772 : Oid prev_width_type = InvalidOid;
4973 : FmgrInfo typoutputfinfo;
4974 : FmgrInfo typoutputinfo_width;
4975 :
4976 : /* When format string is null, immediately return null */
4977 1772 : if (PG_ARGISNULL(0))
4978 1 : PG_RETURN_NULL();
4979 :
4980 : /* If argument is marked VARIADIC, expand array into elements */
4981 1771 : if (get_fn_expr_variadic(fcinfo->flinfo))
4982 : {
4983 : ArrayType *arr;
4984 : int16 elmlen;
4985 : bool elmbyval;
4986 : char elmalign;
4987 : int nitems;
4988 :
4989 : /* Should have just the one argument */
4990 8 : Assert(PG_NARGS() == 2);
4991 :
4992 : /* If argument is NULL, we treat it as zero-length array */
4993 8 : if (PG_ARGISNULL(1))
4994 1 : nitems = 0;
4995 : else
4996 : {
4997 : /*
4998 : * Non-null argument had better be an array. We assume that any
4999 : * call context that could let get_fn_expr_variadic return true
5000 : * will have checked that a VARIADIC-labeled parameter actually is
5001 : * an array. So it should be okay to just Assert that it's an
5002 : * array rather than doing a full-fledged error check.
5003 : */
5004 7 : Assert(OidIsValid(get_base_element_type(get_fn_expr_argtype(fcinfo->flinfo, 1))));
5005 :
5006 : /* OK, safe to fetch the array value */
5007 7 : arr = PG_GETARG_ARRAYTYPE_P(1);
5008 :
5009 : /* Get info about array element type */
5010 7 : element_type = ARR_ELEMTYPE(arr);
5011 7 : get_typlenbyvalalign(element_type,
5012 : &elmlen, &elmbyval, &elmalign);
5013 :
5014 : /* Extract all array elements */
5015 7 : deconstruct_array(arr, element_type, elmlen, elmbyval, elmalign,
5016 : &elements, &nulls, &nitems);
5017 : }
5018 :
5019 8 : nargs = nitems + 1;
5020 8 : funcvariadic = true;
5021 : }
5022 : else
5023 : {
5024 : /* Non-variadic case, we'll process the arguments individually */
5025 1763 : nargs = PG_NARGS();
5026 1763 : funcvariadic = false;
5027 : }
5028 :
5029 : /* Setup for main loop. */
5030 1771 : fmt = PG_GETARG_TEXT_PP(0);
5031 1771 : start_ptr = VARDATA_ANY(fmt);
5032 1771 : end_ptr = start_ptr + VARSIZE_ANY_EXHDR(fmt);
5033 1771 : initStringInfo(&str);
5034 1771 : arg = 1; /* next argument position to print */
5035 :
5036 : /* Scan format string, looking for conversion specifiers. */
5037 57672 : for (cp = start_ptr; cp < end_ptr; cp++)
5038 : {
5039 : int argpos;
5040 : int widthpos;
5041 : int flags;
5042 : int width;
5043 : Datum value;
5044 : bool isNull;
5045 : Oid typid;
5046 :
5047 : /*
5048 : * If it's not the start of a conversion specifier, just copy it to
5049 : * the output buffer.
5050 : */
5051 55911 : if (*cp != '%')
5052 : {
5053 51833 : appendStringInfoCharMacro(&str, *cp);
5054 103669 : continue;
5055 : }
5056 :
5057 4078 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5058 :
5059 : /* Easy case: %% outputs a single % */
5060 4078 : if (*cp == '%')
5061 : {
5062 3 : appendStringInfoCharMacro(&str, *cp);
5063 3 : continue;
5064 : }
5065 :
5066 : /* Parse the optional portions of the format specifier */
5067 4075 : cp = text_format_parse_format(cp, end_ptr,
5068 : &argpos, &widthpos,
5069 : &flags, &width);
5070 :
5071 : /*
5072 : * Next we should see the main conversion specifier. Whether or not
5073 : * an argument position was present, it's known that at least one
5074 : * character remains in the string at this point. Experience suggests
5075 : * that it's worth checking that that character is one of the expected
5076 : * ones before we try to fetch arguments, so as to produce the least
5077 : * confusing response to a mis-formatted specifier.
5078 : */
5079 4071 : if (strchr("sIL", *cp) == NULL)
5080 1 : ereport(ERROR,
5081 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5082 : errmsg("unrecognized format() type specifier \"%c\"",
5083 : *cp),
5084 : errhint("For a single \"%%\" use \"%%%%\".")));
5085 :
5086 : /* If indirect width was specified, get its value */
5087 4070 : if (widthpos >= 0)
5088 : {
5089 : /* Collect the specified or next argument position */
5090 7 : if (widthpos > 0)
5091 6 : arg = widthpos;
5092 7 : if (arg >= nargs)
5093 0 : ereport(ERROR,
5094 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5095 : errmsg("too few arguments for format()")));
5096 :
5097 : /* Get the value and type of the selected argument */
5098 7 : if (!funcvariadic)
5099 : {
5100 7 : value = PG_GETARG_DATUM(arg);
5101 7 : isNull = PG_ARGISNULL(arg);
5102 7 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5103 : }
5104 : else
5105 : {
5106 0 : value = elements[arg - 1];
5107 0 : isNull = nulls[arg - 1];
5108 0 : typid = element_type;
5109 : }
5110 7 : if (!OidIsValid(typid))
5111 0 : elog(ERROR, "could not determine data type of format() input");
5112 :
5113 7 : arg++;
5114 :
5115 : /* We can treat NULL width the same as zero */
5116 7 : if (isNull)
5117 1 : width = 0;
5118 6 : else if (typid == INT4OID)
5119 6 : width = DatumGetInt32(value);
5120 0 : else if (typid == INT2OID)
5121 0 : width = DatumGetInt16(value);
5122 : else
5123 : {
5124 : /* For less-usual datatypes, convert to text then to int */
5125 : char *str;
5126 :
5127 0 : if (typid != prev_width_type)
5128 : {
5129 : Oid typoutputfunc;
5130 : bool typIsVarlena;
5131 :
5132 0 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5133 0 : fmgr_info(typoutputfunc, &typoutputinfo_width);
5134 0 : prev_width_type = typid;
5135 : }
5136 :
5137 0 : str = OutputFunctionCall(&typoutputinfo_width, value);
5138 :
5139 : /* pg_atoi will complain about bad data or overflow */
5140 0 : width = pg_atoi(str, sizeof(int), '\0');
5141 :
5142 0 : pfree(str);
5143 : }
5144 : }
5145 :
5146 : /* Collect the specified or next argument position */
5147 4070 : if (argpos > 0)
5148 22 : arg = argpos;
5149 4070 : if (arg >= nargs)
5150 4 : ereport(ERROR,
5151 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5152 : errmsg("too few arguments for format()")));
5153 :
5154 : /* Get the value and type of the selected argument */
5155 4066 : if (!funcvariadic)
5156 : {
5157 3854 : value = PG_GETARG_DATUM(arg);
5158 3854 : isNull = PG_ARGISNULL(arg);
5159 3854 : typid = get_fn_expr_argtype(fcinfo->flinfo, arg);
5160 : }
5161 : else
5162 : {
5163 212 : value = elements[arg - 1];
5164 212 : isNull = nulls[arg - 1];
5165 212 : typid = element_type;
5166 : }
5167 4066 : if (!OidIsValid(typid))
5168 0 : elog(ERROR, "could not determine data type of format() input");
5169 :
5170 4066 : arg++;
5171 :
5172 : /*
5173 : * Get the appropriate typOutput function, reusing previous one if
5174 : * same type as previous argument. That's particularly useful in the
5175 : * variadic-array case, but often saves work even for ordinary calls.
5176 : */
5177 4066 : if (typid != prev_type)
5178 : {
5179 : Oid typoutputfunc;
5180 : bool typIsVarlena;
5181 :
5182 2018 : getTypeOutputInfo(typid, &typoutputfunc, &typIsVarlena);
5183 2018 : fmgr_info(typoutputfunc, &typoutputfinfo);
5184 2018 : prev_type = typid;
5185 : }
5186 :
5187 : /*
5188 : * And now we can format the value.
5189 : */
5190 4066 : switch (*cp)
5191 : {
5192 : case 's':
5193 : case 'I':
5194 : case 'L':
5195 4066 : text_format_string_conversion(&str, *cp, &typoutputfinfo,
5196 : value, isNull,
5197 : flags, width);
5198 4065 : break;
5199 : default:
5200 : /* should not get here, because of previous check */
5201 0 : ereport(ERROR,
5202 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5203 : errmsg("unrecognized format() type specifier \"%c\"",
5204 : *cp),
5205 : errhint("For a single \"%%\" use \"%%%%\".")));
5206 : break;
5207 : }
5208 : }
5209 :
5210 : /* Don't need deconstruct_array results anymore. */
5211 1761 : if (elements != NULL)
5212 7 : pfree(elements);
5213 1761 : if (nulls != NULL)
5214 7 : pfree(nulls);
5215 :
5216 : /* Generate results. */
5217 1761 : result = cstring_to_text_with_len(str.data, str.len);
5218 1761 : pfree(str.data);
5219 :
5220 1761 : PG_RETURN_TEXT_P(result);
5221 : }
5222 :
5223 : /*
5224 : * Parse contiguous digits as a decimal number.
5225 : *
5226 : * Returns true if some digits could be parsed.
5227 : * The value is returned into *value, and *ptr is advanced to the next
5228 : * character to be parsed.
5229 : *
5230 : * Note parsing invariant: at least one character is known available before
5231 : * string end (end_ptr) at entry, and this is still true at exit.
5232 : */
5233 : static bool
5234 8144 : text_format_parse_digits(const char **ptr, const char *end_ptr, int *value)
5235 : {
5236 8144 : bool found = false;
5237 8144 : const char *cp = *ptr;
5238 8144 : int val = 0;
5239 :
5240 16340 : while (*cp >= '0' && *cp <= '9')
5241 : {
5242 53 : int newval = val * 10 + (*cp - '0');
5243 :
5244 53 : if (newval / 10 != val) /* overflow? */
5245 0 : ereport(ERROR,
5246 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5247 : errmsg("number is out of range")));
5248 53 : val = newval;
5249 53 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5250 52 : found = true;
5251 : }
5252 :
5253 8143 : *ptr = cp;
5254 8143 : *value = val;
5255 :
5256 8143 : return found;
5257 : }
5258 :
5259 : /*
5260 : * Parse a format specifier (generally following the SUS printf spec).
5261 : *
5262 : * We have already advanced over the initial '%', and we are looking for
5263 : * [argpos][flags][width]type (but the type character is not consumed here).
5264 : *
5265 : * Inputs are start_ptr (the position after '%') and end_ptr (string end + 1).
5266 : * Output parameters:
5267 : * argpos: argument position for value to be printed. -1 means unspecified.
5268 : * widthpos: argument position for width. Zero means the argument position
5269 : * was unspecified (ie, take the next arg) and -1 means no width
5270 : * argument (width was omitted or specified as a constant).
5271 : * flags: bitmask of flags.
5272 : * width: directly-specified width value. Zero means the width was omitted
5273 : * (note it's not necessary to distinguish this case from an explicit
5274 : * zero width value).
5275 : *
5276 : * The function result is the next character position to be parsed, ie, the
5277 : * location where the type character is/should be.
5278 : *
5279 : * Note parsing invariant: at least one character is known available before
5280 : * string end (end_ptr) at entry, and this is still true at exit.
5281 : */
5282 : static const char *
5283 4075 : text_format_parse_format(const char *start_ptr, const char *end_ptr,
5284 : int *argpos, int *widthpos,
5285 : int *flags, int *width)
5286 : {
5287 4075 : const char *cp = start_ptr;
5288 : int n;
5289 :
5290 : /* set defaults for output parameters */
5291 4075 : *argpos = -1;
5292 4075 : *widthpos = -1;
5293 4075 : *flags = 0;
5294 4075 : *width = 0;
5295 :
5296 : /* try to identify first number */
5297 4075 : if (text_format_parse_digits(&cp, end_ptr, &n))
5298 : {
5299 29 : if (*cp != '$')
5300 : {
5301 : /* Must be just a width and a type, so we're done */
5302 4 : *width = n;
5303 4 : return cp;
5304 : }
5305 : /* The number was argument position */
5306 25 : *argpos = n;
5307 : /* Explicit 0 for argument index is immediately refused */
5308 25 : if (n == 0)
5309 1 : ereport(ERROR,
5310 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5311 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5312 24 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5313 : }
5314 :
5315 : /* Handle flags (only minus is supported now) */
5316 8143 : while (*cp == '-')
5317 : {
5318 5 : *flags |= TEXT_FORMAT_FLAG_MINUS;
5319 5 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5320 : }
5321 :
5322 4069 : if (*cp == '*')
5323 : {
5324 : /* Handle indirect width */
5325 8 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5326 8 : if (text_format_parse_digits(&cp, end_ptr, &n))
5327 : {
5328 : /* number in this position must be closed by $ */
5329 7 : if (*cp != '$')
5330 0 : ereport(ERROR,
5331 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5332 : errmsg("width argument position must be ended by \"$\"")));
5333 : /* The number was width argument position */
5334 7 : *widthpos = n;
5335 : /* Explicit 0 for argument index is immediately refused */
5336 7 : if (n == 0)
5337 1 : ereport(ERROR,
5338 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5339 : errmsg("format specifies argument 0, but arguments are numbered from 1")));
5340 6 : ADVANCE_PARSE_POINTER(cp, end_ptr);
5341 : }
5342 : else
5343 1 : *widthpos = 0; /* width's argument position is unspecified */
5344 : }
5345 : else
5346 : {
5347 : /* Check for direct width specification */
5348 4061 : if (text_format_parse_digits(&cp, end_ptr, &n))
5349 5 : *width = n;
5350 : }
5351 :
5352 : /* cp should now be pointing at type character */
5353 4067 : return cp;
5354 : }
5355 :
5356 : /*
5357 : * Format a %s, %I, or %L conversion
5358 : */
5359 : static void
5360 4066 : text_format_string_conversion(StringInfo buf, char conversion,
5361 : FmgrInfo *typOutputInfo,
5362 : Datum value, bool isNull,
5363 : int flags, int width)
5364 : {
5365 : char *str;
5366 :
5367 : /* Handle NULL arguments before trying to stringify the value. */
5368 4066 : if (isNull)
5369 : {
5370 11 : if (conversion == 's')
5371 3 : text_format_append_string(buf, "", flags, width);
5372 8 : else if (conversion == 'L')
5373 7 : text_format_append_string(buf, "NULL", flags, width);
5374 1 : else if (conversion == 'I')
5375 1 : ereport(ERROR,
5376 : (errcode(ERRCODE_NULL_VALUE_NOT_ALLOWED),
5377 : errmsg("null values cannot be formatted as an SQL identifier")));
5378 4075 : return;
5379 : }
5380 :
5381 : /* Stringify. */
5382 4055 : str = OutputFunctionCall(typOutputInfo, value);
5383 :
5384 : /* Escape. */
5385 4055 : if (conversion == 'I')
5386 : {
5387 : /* quote_identifier may or may not allocate a new string. */
5388 276 : text_format_append_string(buf, quote_identifier(str), flags, width);
5389 : }
5390 3779 : else if (conversion == 'L')
5391 : {
5392 253 : char *qstr = quote_literal_cstr(str);
5393 :
5394 253 : text_format_append_string(buf, qstr, flags, width);
5395 : /* quote_literal_cstr() always allocates a new string */
5396 253 : pfree(qstr);
5397 : }
5398 : else
5399 3526 : text_format_append_string(buf, str, flags, width);
5400 :
5401 : /* Cleanup. */
5402 4055 : pfree(str);
5403 : }
5404 :
5405 : /*
5406 : * Append str to buf, padding as directed by flags/width
5407 : */
5408 : static void
5409 4065 : text_format_append_string(StringInfo buf, const char *str,
5410 : int flags, int width)
5411 : {
5412 4065 : bool align_to_left = false;
5413 : int len;
5414 :
5415 : /* fast path for typical easy case */
5416 4065 : if (width == 0)
5417 : {
5418 4051 : appendStringInfoString(buf, str);
5419 8116 : return;
5420 : }
5421 :
5422 14 : if (width < 0)
5423 : {
5424 : /* Negative width: implicit '-' flag, then take absolute value */
5425 1 : align_to_left = true;
5426 : /* -INT_MIN is undefined */
5427 1 : if (width <= INT_MIN)
5428 0 : ereport(ERROR,
5429 : (errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
5430 : errmsg("number is out of range")));
5431 1 : width = -width;
5432 : }
5433 13 : else if (flags & TEXT_FORMAT_FLAG_MINUS)
5434 4 : align_to_left = true;
5435 :
5436 14 : len = pg_mbstrlen(str);
5437 14 : if (align_to_left)
5438 : {
5439 : /* left justify */
5440 5 : appendStringInfoString(buf, str);
5441 5 : if (len < width)
5442 5 : appendStringInfoSpaces(buf, width - len);
5443 : }
5444 : else
5445 : {
5446 : /* right justify */
5447 9 : if (len < width)
5448 9 : appendStringInfoSpaces(buf, width - len);
5449 9 : appendStringInfoString(buf, str);
5450 : }
5451 : }
5452 :
5453 : /*
5454 : * text_format_nv - nonvariadic wrapper for text_format function.
5455 : *
5456 : * note: this wrapper is necessary to pass the sanity check in opr_sanity,
5457 : * which checks that all built-in functions that share the implementing C
5458 : * function take the same number of arguments.
5459 : */
5460 : Datum
5461 5 : text_format_nv(PG_FUNCTION_ARGS)
5462 : {
5463 5 : return text_format(fcinfo);
5464 : }
5465 :
/*
 * Helper for the Levenshtein distance functions: report whether the first
 * len bytes of s1 and s2 are identical.  Stated to be faster than memcmp()
 * for this use case.
 *
 * Bytes are compared from the highest index down to index 0.  A len of zero
 * or less compares nothing and returns true.
 */
static inline bool
rest_of_char_same(const char *s1, const char *s2, int len)
{
	int			remaining;

	for (remaining = len; remaining > 0; remaining--)
	{
		if (s1[remaining - 1] != s2[remaining - 1])
			return false;
	}

	return true;
}
5481 :
5482 : /* Expand each Levenshtein distance variant */
5483 : #include "levenshtein.c"
5484 : #define LEVENSHTEIN_LESS_EQUAL
5485 : #include "levenshtein.c"
|