Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * regc_pg_locale.c
4 : * ctype functions adapted to work on pg_wchar (a/k/a chr),
5 : * and functions to cache the results of wholesale ctype probing.
6 : *
7 : * This file is #included by regcomp.c; it's not meant to compile standalone.
8 : *
9 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
10 : * Portions Copyright (c) 1994, Regents of the University of California
11 : *
12 : * IDENTIFICATION
13 : * src/backend/regex/regc_pg_locale.c
14 : *
15 : *-------------------------------------------------------------------------
16 : */
17 :
18 : #include "catalog/pg_collation.h"
19 : #include "utils/pg_locale.h"
20 :
21 : /*
22 : * To provide as much functionality as possible on a variety of platforms,
23 : * without going so far as to implement everything from scratch, we use
24 : * several implementation strategies depending on the situation:
25 : *
26 : * 1. In C/POSIX collations, we use hard-wired code. We can't depend on
27 : * the <ctype.h> functions since those will obey LC_CTYPE. Note that these
28 : * collations don't give a fig about multibyte characters.
29 : *
30 : * 2. In the "default" collation (which is supposed to obey LC_CTYPE):
31 : *
32 : * 2a. When working in UTF8 encoding, we use the <wctype.h> functions if
33 : * available. This assumes that every platform uses Unicode codepoints
34 : * directly as the wchar_t representation of Unicode. On some platforms
35 : * wchar_t is only 16 bits wide, so we have to punt for codepoints > 0xFFFF.
36 : *
37 : * 2b. In all other encodings, or on machines that lack <wctype.h>, we use
38 : * the <ctype.h> functions for pg_wchar values up to 255, and punt for values
39 : * above that. This is only 100% correct in single-byte encodings such as
40 : * LATINn. However, non-Unicode multibyte encodings are mostly Far Eastern
41 : * character sets for which the properties being tested here aren't very
42 : * relevant for higher code values anyway. The difficulty with using the
43 : * <wctype.h> functions with non-Unicode multibyte encodings is that we can
44 : * have no certainty that the platform's wchar_t representation matches
45 : * what we do in pg_wchar conversions.
46 : *
47 : * 3. Other collations are only supported on platforms that HAVE_LOCALE_T.
48 : * Here, we use the locale_t-extended forms of the <wctype.h> and <ctype.h>
49 : * functions, under exactly the same cases as #2.
50 : *
51 : * There is one notable difference between cases 2 and 3: in the "default"
52 : * collation we force ASCII letters to follow ASCII upcase/downcase rules,
53 : * while in a non-default collation we just let the library functions do what
54 : * they will. The case where this matters is treatment of I/i in Turkish,
55 : * and the behavior is meant to match the upper()/lower() SQL functions.
56 : *
57 : * We store the active collation setting in static variables. In principle
58 : * it could be passed down to here via the regex library's "struct vars" data
59 : * structure; but that would require somewhat invasive changes in the regex
60 : * library, and right now there's no real benefit to be gained from that.
61 : *
62 : * NB: the coding here assumes pg_wchar is an unsigned type.
63 : */
64 :
/*
 * Selects which character-classification implementation the pg_wc_xxx
 * functions in this file dispatch to.
 */
typedef enum
{
	/* C locale (encoding independent) */
	PG_REGEX_LOCALE_C,
	/* Use <wctype.h> functions */
	PG_REGEX_LOCALE_WIDE,
	/* Use <ctype.h> functions */
	PG_REGEX_LOCALE_1BYTE,
	/* Use locale_t <wctype.h> functions */
	PG_REGEX_LOCALE_WIDE_L,
	/* Use locale_t <ctype.h> functions */
	PG_REGEX_LOCALE_1BYTE_L,
	/* Use ICU uchar.h functions */
	PG_REGEX_LOCALE_ICU
} PG_Locale_Strategy;
74 :
75 : static PG_Locale_Strategy pg_regex_strategy;
76 : static pg_locale_t pg_regex_locale;
77 : static Oid pg_regex_collation;
78 :
/*
 * Hard-wired character properties for C locale
 *
 * Each PG_ISxxx value is a single bit (PG_ISALNUM is the union of the digit
 * and alpha bits), so one unsigned char per character can record every
 * property of a 7-bit ASCII code.
 */
#define PG_ISDIGIT	0x01
#define PG_ISALPHA	0x02
#define PG_ISALNUM	(PG_ISDIGIT | PG_ISALPHA)
#define PG_ISUPPER	0x04
#define PG_ISLOWER	0x08
#define PG_ISGRAPH	0x10
#define PG_ISPRINT	0x20
#define PG_ISPUNCT	0x40
#define PG_ISSPACE	0x80

/* Per-class shorthands, used only to build the table and removed after it */
#define PG_CP_CTRL	0
#define PG_CP_WS	PG_ISSPACE
#define PG_CP_BLANK	(PG_ISPRINT | PG_ISSPACE)
#define PG_CP_PUNCT	(PG_ISGRAPH | PG_ISPRINT | PG_ISPUNCT)
#define PG_CP_DIGIT	(PG_ISDIGIT | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_UPPER	(PG_ISALPHA | PG_ISUPPER | PG_ISGRAPH | PG_ISPRINT)
#define PG_CP_LOWER	(PG_ISALPHA | PG_ISLOWER | PG_ISGRAPH | PG_ISPRINT)

static const unsigned char pg_char_properties[128] = {
	/* 0x00 - 0x08: NUL and ^A through ^H */
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,

	/* 0x09 - 0x0D: ^I through ^M (tab, newline, vtab, formfeed, return) */
	PG_CP_WS, PG_CP_WS, PG_CP_WS, PG_CP_WS, PG_CP_WS,

	/* 0x0E - 0x1F: ^N through ^_ */
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,
	PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL, PG_CP_CTRL,

	/* 0x20: space */
	PG_CP_BLANK,

	/* 0x21 - 0x2F: exclamation mark through slash */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x30 - 0x39: digits 0 through 9 */
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,
	PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT, PG_CP_DIGIT,

	/* 0x3A - 0x40: colon through at-sign */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x41 - 0x5A: A through Z */
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER, PG_CP_UPPER,
	PG_CP_UPPER,

	/* 0x5B - 0x60: left bracket through backquote */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,
	PG_CP_PUNCT,

	/* 0x61 - 0x7A: a through z */
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER, PG_CP_LOWER,
	PG_CP_LOWER,

	/* 0x7B - 0x7E: left brace through tilde */
	PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT, PG_CP_PUNCT,

	/* 0x7F: DEL */
	PG_CP_CTRL
};

#undef PG_CP_CTRL
#undef PG_CP_WS
#undef PG_CP_BLANK
#undef PG_CP_PUNCT
#undef PG_CP_DIGIT
#undef PG_CP_UPPER
#undef PG_CP_LOWER
222 :
223 :
224 : /*
225 : * pg_set_regex_collation: set collation for these functions to obey
226 : *
227 : * This is called when beginning compilation or execution of a regexp.
228 : * Since there's no need for reentrancy of regexp operations, it's okay
229 : * to store the results in static variables.
230 : */
231 : void
232 33811 : pg_set_regex_collation(Oid collation)
233 : {
234 33811 : if (lc_ctype_is_c(collation))
235 : {
236 : /* C/POSIX collations use this path regardless of database encoding */
237 2 : pg_regex_strategy = PG_REGEX_LOCALE_C;
238 2 : pg_regex_locale = 0;
239 2 : pg_regex_collation = C_COLLATION_OID;
240 : }
241 : else
242 : {
243 33809 : if (collation == DEFAULT_COLLATION_OID)
244 33809 : pg_regex_locale = 0;
245 0 : else if (OidIsValid(collation))
246 : {
247 : /*
248 : * NB: pg_newlocale_from_collation will fail if not HAVE_LOCALE_T;
249 : * the case of pg_regex_locale != 0 but not HAVE_LOCALE_T does not
250 : * have to be considered below.
251 : */
252 0 : pg_regex_locale = pg_newlocale_from_collation(collation);
253 : }
254 : else
255 : {
256 : /*
257 : * This typically means that the parser could not resolve a
258 : * conflict of implicit collations, so report it that way.
259 : */
260 0 : ereport(ERROR,
261 : (errcode(ERRCODE_INDETERMINATE_COLLATION),
262 : errmsg("could not determine which collation to use for regular expression"),
263 : errhint("Use the COLLATE clause to set the collation explicitly.")));
264 : }
265 :
266 : #ifdef USE_ICU
267 : if (pg_regex_locale && pg_regex_locale->provider == COLLPROVIDER_ICU)
268 : pg_regex_strategy = PG_REGEX_LOCALE_ICU;
269 : else
270 : #endif
271 : #ifdef USE_WIDE_UPPER_LOWER
272 33809 : if (GetDatabaseEncoding() == PG_UTF8)
273 : {
274 33809 : if (pg_regex_locale)
275 0 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE_L;
276 : else
277 33809 : pg_regex_strategy = PG_REGEX_LOCALE_WIDE;
278 : }
279 : else
280 : #endif /* USE_WIDE_UPPER_LOWER */
281 : {
282 0 : if (pg_regex_locale)
283 0 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE_L;
284 : else
285 0 : pg_regex_strategy = PG_REGEX_LOCALE_1BYTE;
286 : }
287 :
288 33809 : pg_regex_collation = collation;
289 : }
290 33811 : }
291 :
292 : static int
293 6148 : pg_wc_isdigit(pg_wchar c)
294 : {
295 6148 : switch (pg_regex_strategy)
296 : {
297 : case PG_REGEX_LOCALE_C:
298 0 : return (c <= (pg_wchar) 127 &&
299 0 : (pg_char_properties[c] & PG_ISDIGIT));
300 : case PG_REGEX_LOCALE_WIDE:
301 : #ifdef USE_WIDE_UPPER_LOWER
302 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
303 6148 : return iswdigit((wint_t) c);
304 : #endif
305 : /* FALL THRU */
306 : case PG_REGEX_LOCALE_1BYTE:
307 0 : return (c <= (pg_wchar) UCHAR_MAX &&
308 0 : isdigit((unsigned char) c));
309 : case PG_REGEX_LOCALE_WIDE_L:
310 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
311 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
312 0 : return iswdigit_l((wint_t) c, pg_regex_locale->info.lt);
313 : #endif
314 : /* FALL THRU */
315 : case PG_REGEX_LOCALE_1BYTE_L:
316 : #ifdef HAVE_LOCALE_T
317 0 : return (c <= (pg_wchar) UCHAR_MAX &&
318 0 : isdigit_l((unsigned char) c, pg_regex_locale->info.lt));
319 : #endif
320 : break;
321 : case PG_REGEX_LOCALE_ICU:
322 : #ifdef USE_ICU
323 : return u_isdigit(c);
324 : #endif
325 0 : break;
326 : }
327 0 : return 0; /* can't get here, but keep compiler quiet */
328 : }
329 :
330 : static int
331 8 : pg_wc_isalpha(pg_wchar c)
332 : {
333 8 : switch (pg_regex_strategy)
334 : {
335 : case PG_REGEX_LOCALE_C:
336 0 : return (c <= (pg_wchar) 127 &&
337 0 : (pg_char_properties[c] & PG_ISALPHA));
338 : case PG_REGEX_LOCALE_WIDE:
339 : #ifdef USE_WIDE_UPPER_LOWER
340 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
341 8 : return iswalpha((wint_t) c);
342 : #endif
343 : /* FALL THRU */
344 : case PG_REGEX_LOCALE_1BYTE:
345 0 : return (c <= (pg_wchar) UCHAR_MAX &&
346 0 : isalpha((unsigned char) c));
347 : case PG_REGEX_LOCALE_WIDE_L:
348 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
349 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
350 0 : return iswalpha_l((wint_t) c, pg_regex_locale->info.lt);
351 : #endif
352 : /* FALL THRU */
353 : case PG_REGEX_LOCALE_1BYTE_L:
354 : #ifdef HAVE_LOCALE_T
355 0 : return (c <= (pg_wchar) UCHAR_MAX &&
356 0 : isalpha_l((unsigned char) c, pg_regex_locale->info.lt));
357 : #endif
358 : break;
359 : case PG_REGEX_LOCALE_ICU:
360 : #ifdef USE_ICU
361 : return u_isalpha(c);
362 : #endif
363 0 : break;
364 : }
365 0 : return 0; /* can't get here, but keep compiler quiet */
366 : }
367 :
368 : static int
369 2082 : pg_wc_isalnum(pg_wchar c)
370 : {
371 2082 : switch (pg_regex_strategy)
372 : {
373 : case PG_REGEX_LOCALE_C:
374 2 : return (c <= (pg_wchar) 127 &&
375 1 : (pg_char_properties[c] & PG_ISALNUM));
376 : case PG_REGEX_LOCALE_WIDE:
377 : #ifdef USE_WIDE_UPPER_LOWER
378 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
379 2081 : return iswalnum((wint_t) c);
380 : #endif
381 : /* FALL THRU */
382 : case PG_REGEX_LOCALE_1BYTE:
383 0 : return (c <= (pg_wchar) UCHAR_MAX &&
384 0 : isalnum((unsigned char) c));
385 : case PG_REGEX_LOCALE_WIDE_L:
386 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
387 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
388 0 : return iswalnum_l((wint_t) c, pg_regex_locale->info.lt);
389 : #endif
390 : /* FALL THRU */
391 : case PG_REGEX_LOCALE_1BYTE_L:
392 : #ifdef HAVE_LOCALE_T
393 0 : return (c <= (pg_wchar) UCHAR_MAX &&
394 0 : isalnum_l((unsigned char) c, pg_regex_locale->info.lt));
395 : #endif
396 : break;
397 : case PG_REGEX_LOCALE_ICU:
398 : #ifdef USE_ICU
399 : return u_isalnum(c);
400 : #endif
401 0 : break;
402 : }
403 0 : return 0; /* can't get here, but keep compiler quiet */
404 : }
405 :
406 : static int
407 0 : pg_wc_isupper(pg_wchar c)
408 : {
409 0 : switch (pg_regex_strategy)
410 : {
411 : case PG_REGEX_LOCALE_C:
412 0 : return (c <= (pg_wchar) 127 &&
413 0 : (pg_char_properties[c] & PG_ISUPPER));
414 : case PG_REGEX_LOCALE_WIDE:
415 : #ifdef USE_WIDE_UPPER_LOWER
416 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
417 0 : return iswupper((wint_t) c);
418 : #endif
419 : /* FALL THRU */
420 : case PG_REGEX_LOCALE_1BYTE:
421 0 : return (c <= (pg_wchar) UCHAR_MAX &&
422 0 : isupper((unsigned char) c));
423 : case PG_REGEX_LOCALE_WIDE_L:
424 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
425 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
426 0 : return iswupper_l((wint_t) c, pg_regex_locale->info.lt);
427 : #endif
428 : /* FALL THRU */
429 : case PG_REGEX_LOCALE_1BYTE_L:
430 : #ifdef HAVE_LOCALE_T
431 0 : return (c <= (pg_wchar) UCHAR_MAX &&
432 0 : isupper_l((unsigned char) c, pg_regex_locale->info.lt));
433 : #endif
434 : break;
435 : case PG_REGEX_LOCALE_ICU:
436 : #ifdef USE_ICU
437 : return u_isupper(c);
438 : #endif
439 0 : break;
440 : }
441 0 : return 0; /* can't get here, but keep compiler quiet */
442 : }
443 :
444 : static int
445 0 : pg_wc_islower(pg_wchar c)
446 : {
447 0 : switch (pg_regex_strategy)
448 : {
449 : case PG_REGEX_LOCALE_C:
450 0 : return (c <= (pg_wchar) 127 &&
451 0 : (pg_char_properties[c] & PG_ISLOWER));
452 : case PG_REGEX_LOCALE_WIDE:
453 : #ifdef USE_WIDE_UPPER_LOWER
454 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
455 0 : return iswlower((wint_t) c);
456 : #endif
457 : /* FALL THRU */
458 : case PG_REGEX_LOCALE_1BYTE:
459 0 : return (c <= (pg_wchar) UCHAR_MAX &&
460 0 : islower((unsigned char) c));
461 : case PG_REGEX_LOCALE_WIDE_L:
462 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
463 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
464 0 : return iswlower_l((wint_t) c, pg_regex_locale->info.lt);
465 : #endif
466 : /* FALL THRU */
467 : case PG_REGEX_LOCALE_1BYTE_L:
468 : #ifdef HAVE_LOCALE_T
469 0 : return (c <= (pg_wchar) UCHAR_MAX &&
470 0 : islower_l((unsigned char) c, pg_regex_locale->info.lt));
471 : #endif
472 : break;
473 : case PG_REGEX_LOCALE_ICU:
474 : #ifdef USE_ICU
475 : return u_islower(c);
476 : #endif
477 0 : break;
478 : }
479 0 : return 0; /* can't get here, but keep compiler quiet */
480 : }
481 :
482 : static int
483 0 : pg_wc_isgraph(pg_wchar c)
484 : {
485 0 : switch (pg_regex_strategy)
486 : {
487 : case PG_REGEX_LOCALE_C:
488 0 : return (c <= (pg_wchar) 127 &&
489 0 : (pg_char_properties[c] & PG_ISGRAPH));
490 : case PG_REGEX_LOCALE_WIDE:
491 : #ifdef USE_WIDE_UPPER_LOWER
492 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
493 0 : return iswgraph((wint_t) c);
494 : #endif
495 : /* FALL THRU */
496 : case PG_REGEX_LOCALE_1BYTE:
497 0 : return (c <= (pg_wchar) UCHAR_MAX &&
498 0 : isgraph((unsigned char) c));
499 : case PG_REGEX_LOCALE_WIDE_L:
500 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
501 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
502 0 : return iswgraph_l((wint_t) c, pg_regex_locale->info.lt);
503 : #endif
504 : /* FALL THRU */
505 : case PG_REGEX_LOCALE_1BYTE_L:
506 : #ifdef HAVE_LOCALE_T
507 0 : return (c <= (pg_wchar) UCHAR_MAX &&
508 0 : isgraph_l((unsigned char) c, pg_regex_locale->info.lt));
509 : #endif
510 : break;
511 : case PG_REGEX_LOCALE_ICU:
512 : #ifdef USE_ICU
513 : return u_isgraph(c);
514 : #endif
515 0 : break;
516 : }
517 0 : return 0; /* can't get here, but keep compiler quiet */
518 : }
519 :
520 : static int
521 0 : pg_wc_isprint(pg_wchar c)
522 : {
523 0 : switch (pg_regex_strategy)
524 : {
525 : case PG_REGEX_LOCALE_C:
526 0 : return (c <= (pg_wchar) 127 &&
527 0 : (pg_char_properties[c] & PG_ISPRINT));
528 : case PG_REGEX_LOCALE_WIDE:
529 : #ifdef USE_WIDE_UPPER_LOWER
530 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
531 0 : return iswprint((wint_t) c);
532 : #endif
533 : /* FALL THRU */
534 : case PG_REGEX_LOCALE_1BYTE:
535 0 : return (c <= (pg_wchar) UCHAR_MAX &&
536 0 : isprint((unsigned char) c));
537 : case PG_REGEX_LOCALE_WIDE_L:
538 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
539 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
540 0 : return iswprint_l((wint_t) c, pg_regex_locale->info.lt);
541 : #endif
542 : /* FALL THRU */
543 : case PG_REGEX_LOCALE_1BYTE_L:
544 : #ifdef HAVE_LOCALE_T
545 0 : return (c <= (pg_wchar) UCHAR_MAX &&
546 0 : isprint_l((unsigned char) c, pg_regex_locale->info.lt));
547 : #endif
548 : break;
549 : case PG_REGEX_LOCALE_ICU:
550 : #ifdef USE_ICU
551 : return u_isprint(c);
552 : #endif
553 0 : break;
554 : }
555 0 : return 0; /* can't get here, but keep compiler quiet */
556 : }
557 :
558 : static int
559 0 : pg_wc_ispunct(pg_wchar c)
560 : {
561 0 : switch (pg_regex_strategy)
562 : {
563 : case PG_REGEX_LOCALE_C:
564 0 : return (c <= (pg_wchar) 127 &&
565 0 : (pg_char_properties[c] & PG_ISPUNCT));
566 : case PG_REGEX_LOCALE_WIDE:
567 : #ifdef USE_WIDE_UPPER_LOWER
568 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
569 0 : return iswpunct((wint_t) c);
570 : #endif
571 : /* FALL THRU */
572 : case PG_REGEX_LOCALE_1BYTE:
573 0 : return (c <= (pg_wchar) UCHAR_MAX &&
574 0 : ispunct((unsigned char) c));
575 : case PG_REGEX_LOCALE_WIDE_L:
576 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
577 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
578 0 : return iswpunct_l((wint_t) c, pg_regex_locale->info.lt);
579 : #endif
580 : /* FALL THRU */
581 : case PG_REGEX_LOCALE_1BYTE_L:
582 : #ifdef HAVE_LOCALE_T
583 0 : return (c <= (pg_wchar) UCHAR_MAX &&
584 0 : ispunct_l((unsigned char) c, pg_regex_locale->info.lt));
585 : #endif
586 : break;
587 : case PG_REGEX_LOCALE_ICU:
588 : #ifdef USE_ICU
589 : return u_ispunct(c);
590 : #endif
591 0 : break;
592 : }
593 0 : return 0; /* can't get here, but keep compiler quiet */
594 : }
595 :
596 : static int
597 4096 : pg_wc_isspace(pg_wchar c)
598 : {
599 4096 : switch (pg_regex_strategy)
600 : {
601 : case PG_REGEX_LOCALE_C:
602 0 : return (c <= (pg_wchar) 127 &&
603 0 : (pg_char_properties[c] & PG_ISSPACE));
604 : case PG_REGEX_LOCALE_WIDE:
605 : #ifdef USE_WIDE_UPPER_LOWER
606 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
607 4096 : return iswspace((wint_t) c);
608 : #endif
609 : /* FALL THRU */
610 : case PG_REGEX_LOCALE_1BYTE:
611 0 : return (c <= (pg_wchar) UCHAR_MAX &&
612 0 : isspace((unsigned char) c));
613 : case PG_REGEX_LOCALE_WIDE_L:
614 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
615 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
616 0 : return iswspace_l((wint_t) c, pg_regex_locale->info.lt);
617 : #endif
618 : /* FALL THRU */
619 : case PG_REGEX_LOCALE_1BYTE_L:
620 : #ifdef HAVE_LOCALE_T
621 0 : return (c <= (pg_wchar) UCHAR_MAX &&
622 0 : isspace_l((unsigned char) c, pg_regex_locale->info.lt));
623 : #endif
624 : break;
625 : case PG_REGEX_LOCALE_ICU:
626 : #ifdef USE_ICU
627 : return u_isspace(c);
628 : #endif
629 0 : break;
630 : }
631 0 : return 0; /* can't get here, but keep compiler quiet */
632 : }
633 :
634 : static pg_wchar
635 16 : pg_wc_toupper(pg_wchar c)
636 : {
637 16 : switch (pg_regex_strategy)
638 : {
639 : case PG_REGEX_LOCALE_C:
640 0 : if (c <= (pg_wchar) 127)
641 0 : return pg_ascii_toupper((unsigned char) c);
642 0 : return c;
643 : case PG_REGEX_LOCALE_WIDE:
644 : /* force C behavior for ASCII characters, per comments above */
645 16 : if (c <= (pg_wchar) 127)
646 16 : return pg_ascii_toupper((unsigned char) c);
647 : #ifdef USE_WIDE_UPPER_LOWER
648 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
649 0 : return towupper((wint_t) c);
650 : #endif
651 : /* FALL THRU */
652 : case PG_REGEX_LOCALE_1BYTE:
653 : /* force C behavior for ASCII characters, per comments above */
654 0 : if (c <= (pg_wchar) 127)
655 0 : return pg_ascii_toupper((unsigned char) c);
656 0 : if (c <= (pg_wchar) UCHAR_MAX)
657 0 : return toupper((unsigned char) c);
658 0 : return c;
659 : case PG_REGEX_LOCALE_WIDE_L:
660 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
661 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
662 0 : return towupper_l((wint_t) c, pg_regex_locale->info.lt);
663 : #endif
664 : /* FALL THRU */
665 : case PG_REGEX_LOCALE_1BYTE_L:
666 : #ifdef HAVE_LOCALE_T
667 0 : if (c <= (pg_wchar) UCHAR_MAX)
668 0 : return toupper_l((unsigned char) c, pg_regex_locale->info.lt);
669 : #endif
670 0 : return c;
671 : case PG_REGEX_LOCALE_ICU:
672 : #ifdef USE_ICU
673 : return u_toupper(c);
674 : #endif
675 0 : break;
676 : }
677 0 : return 0; /* can't get here, but keep compiler quiet */
678 : }
679 :
680 : static pg_wchar
681 16 : pg_wc_tolower(pg_wchar c)
682 : {
683 16 : switch (pg_regex_strategy)
684 : {
685 : case PG_REGEX_LOCALE_C:
686 0 : if (c <= (pg_wchar) 127)
687 0 : return pg_ascii_tolower((unsigned char) c);
688 0 : return c;
689 : case PG_REGEX_LOCALE_WIDE:
690 : /* force C behavior for ASCII characters, per comments above */
691 16 : if (c <= (pg_wchar) 127)
692 16 : return pg_ascii_tolower((unsigned char) c);
693 : #ifdef USE_WIDE_UPPER_LOWER
694 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
695 0 : return towlower((wint_t) c);
696 : #endif
697 : /* FALL THRU */
698 : case PG_REGEX_LOCALE_1BYTE:
699 : /* force C behavior for ASCII characters, per comments above */
700 0 : if (c <= (pg_wchar) 127)
701 0 : return pg_ascii_tolower((unsigned char) c);
702 0 : if (c <= (pg_wchar) UCHAR_MAX)
703 0 : return tolower((unsigned char) c);
704 0 : return c;
705 : case PG_REGEX_LOCALE_WIDE_L:
706 : #if defined(HAVE_LOCALE_T) && defined(USE_WIDE_UPPER_LOWER)
707 : if (sizeof(wchar_t) >= 4 || c <= (pg_wchar) 0xFFFF)
708 0 : return towlower_l((wint_t) c, pg_regex_locale->info.lt);
709 : #endif
710 : /* FALL THRU */
711 : case PG_REGEX_LOCALE_1BYTE_L:
712 : #ifdef HAVE_LOCALE_T
713 0 : if (c <= (pg_wchar) UCHAR_MAX)
714 0 : return tolower_l((unsigned char) c, pg_regex_locale->info.lt);
715 : #endif
716 0 : return c;
717 : case PG_REGEX_LOCALE_ICU:
718 : #ifdef USE_ICU
719 : return u_tolower(c);
720 : #endif
721 0 : break;
722 : }
723 0 : return 0; /* can't get here, but keep compiler quiet */
724 : }
725 :
726 :
727 : /*
728 : * These functions cache the results of probing libc's ctype behavior for
729 : * all character codes of interest in a given encoding/collation. The
730 : * result is provided as a "struct cvec", but notice that the representation
731 : * is a touch different from a cvec created by regc_cvec.c: we allocate the
732 : * chrs[] and ranges[] arrays separately from the struct so that we can
733 : * realloc them larger at need. This is okay since the cvecs made here
734 : * should never be freed by freecvec().
735 : *
736 : * We use malloc not palloc since we mustn't lose control on out-of-memory;
737 : * the main regex code expects us to return a failure indication instead.
738 : */
739 :
740 : typedef int (*pg_wc_probefunc) (pg_wchar c);
741 :
742 : typedef struct pg_ctype_cache
743 : {
744 : pg_wc_probefunc probefunc; /* pg_wc_isalpha or a sibling */
745 : Oid collation; /* collation this entry is for */
746 : struct cvec cv; /* cache entry contents */
747 : struct pg_ctype_cache *next; /* chain link */
748 : } pg_ctype_cache;
749 :
750 : static pg_ctype_cache *pg_ctype_cache_list = NULL;
751 :
752 : /*
753 : * Add a chr or range to pcc->cv; return false if run out of memory
754 : */
755 : static bool
756 51 : store_match(pg_ctype_cache *pcc, pg_wchar chr1, int nchrs)
757 : {
758 : chr *newchrs;
759 :
760 51 : if (nchrs > 1)
761 : {
762 36 : if (pcc->cv.nranges >= pcc->cv.rangespace)
763 : {
764 0 : pcc->cv.rangespace *= 2;
765 0 : newchrs = (chr *) realloc(pcc->cv.ranges,
766 0 : pcc->cv.rangespace * sizeof(chr) * 2);
767 0 : if (newchrs == NULL)
768 0 : return false;
769 0 : pcc->cv.ranges = newchrs;
770 : }
771 36 : pcc->cv.ranges[pcc->cv.nranges * 2] = chr1;
772 36 : pcc->cv.ranges[pcc->cv.nranges * 2 + 1] = chr1 + nchrs - 1;
773 36 : pcc->cv.nranges++;
774 : }
775 : else
776 : {
777 15 : assert(nchrs == 1);
778 15 : if (pcc->cv.nchrs >= pcc->cv.chrspace)
779 : {
780 0 : pcc->cv.chrspace *= 2;
781 0 : newchrs = (chr *) realloc(pcc->cv.chrs,
782 0 : pcc->cv.chrspace * sizeof(chr));
783 0 : if (newchrs == NULL)
784 0 : return false;
785 0 : pcc->cv.chrs = newchrs;
786 : }
787 15 : pcc->cv.chrs[pcc->cv.nchrs++] = chr1;
788 : }
789 51 : return true;
790 : }
791 :
792 : /*
793 : * Given a probe function (e.g., pg_wc_isalpha) get a struct cvec for all
794 : * chrs satisfying the probe function. The active collation is the one
795 : * previously set by pg_set_regex_collation. Return NULL if out of memory.
796 : *
797 : * Note that the result must not be freed or modified by caller.
798 : */
799 : static struct cvec *
800 18 : pg_ctype_get_cache(pg_wc_probefunc probefunc, int cclasscode)
801 : {
802 : pg_ctype_cache *pcc;
803 : pg_wchar max_chr;
804 : pg_wchar cur_chr;
805 : int nmatches;
806 : chr *newchrs;
807 :
808 : /*
809 : * Do we already have the answer cached?
810 : */
811 19 : for (pcc = pg_ctype_cache_list; pcc != NULL; pcc = pcc->next)
812 : {
813 25 : if (pcc->probefunc == probefunc &&
814 12 : pcc->collation == pg_regex_collation)
815 12 : return &pcc->cv;
816 : }
817 :
818 : /*
819 : * Nope, so initialize some workspace ...
820 : */
821 6 : pcc = (pg_ctype_cache *) malloc(sizeof(pg_ctype_cache));
822 6 : if (pcc == NULL)
823 0 : return NULL;
824 6 : pcc->probefunc = probefunc;
825 6 : pcc->collation = pg_regex_collation;
826 6 : pcc->cv.nchrs = 0;
827 6 : pcc->cv.chrspace = 128;
828 6 : pcc->cv.chrs = (chr *) malloc(pcc->cv.chrspace * sizeof(chr));
829 6 : pcc->cv.nranges = 0;
830 6 : pcc->cv.rangespace = 64;
831 6 : pcc->cv.ranges = (chr *) malloc(pcc->cv.rangespace * sizeof(chr) * 2);
832 6 : if (pcc->cv.chrs == NULL || pcc->cv.ranges == NULL)
833 : goto out_of_memory;
834 6 : pcc->cv.cclasscode = cclasscode;
835 :
836 : /*
837 : * Decide how many character codes we ought to look through. In general
838 : * we don't go past MAX_SIMPLE_CHR; chr codes above that are handled at
839 : * runtime using the "high colormap" mechanism. However, in C locale
840 : * there's no need to go further than 127, and if we only have a 1-byte
841 : * <ctype.h> API there's no need to go further than that can handle.
842 : *
843 : * If it's not MAX_SIMPLE_CHR that's constraining the search, mark the
844 : * output cvec as not having any locale-dependent behavior, since there
845 : * will be no need to do any run-time locale checks. (The #if's here
846 : * would always be true for production values of MAX_SIMPLE_CHR, but it's
847 : * useful to allow it to be small for testing purposes.)
848 : */
849 6 : switch (pg_regex_strategy)
850 : {
851 : case PG_REGEX_LOCALE_C:
852 : #if MAX_SIMPLE_CHR >= 127
853 0 : max_chr = (pg_wchar) 127;
854 0 : pcc->cv.cclasscode = -1;
855 : #else
856 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
857 : #endif
858 0 : break;
859 : case PG_REGEX_LOCALE_WIDE:
860 : case PG_REGEX_LOCALE_WIDE_L:
861 6 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
862 6 : break;
863 : case PG_REGEX_LOCALE_1BYTE:
864 : case PG_REGEX_LOCALE_1BYTE_L:
865 : #if MAX_SIMPLE_CHR >= UCHAR_MAX
866 0 : max_chr = (pg_wchar) UCHAR_MAX;
867 0 : pcc->cv.cclasscode = -1;
868 : #else
869 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
870 : #endif
871 0 : break;
872 : case PG_REGEX_LOCALE_ICU:
873 0 : max_chr = (pg_wchar) MAX_SIMPLE_CHR;
874 0 : break;
875 : default:
876 0 : max_chr = 0; /* can't get here, but keep compiler quiet */
877 0 : break;
878 : }
879 :
880 : /*
881 : * And scan 'em ...
882 : */
883 6 : nmatches = 0; /* number of consecutive matches */
884 :
885 12294 : for (cur_chr = 0; cur_chr <= max_chr; cur_chr++)
886 : {
887 12288 : if ((*probefunc) (cur_chr))
888 1495 : nmatches++;
889 10793 : else if (nmatches > 0)
890 : {
891 51 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
892 0 : goto out_of_memory;
893 51 : nmatches = 0;
894 : }
895 : }
896 :
897 6 : if (nmatches > 0)
898 0 : if (!store_match(pcc, cur_chr - nmatches, nmatches))
899 0 : goto out_of_memory;
900 :
901 : /*
902 : * We might have allocated more memory than needed, if so free it
903 : */
904 6 : if (pcc->cv.nchrs == 0)
905 : {
906 3 : free(pcc->cv.chrs);
907 3 : pcc->cv.chrs = NULL;
908 3 : pcc->cv.chrspace = 0;
909 : }
910 3 : else if (pcc->cv.nchrs < pcc->cv.chrspace)
911 : {
912 3 : newchrs = (chr *) realloc(pcc->cv.chrs,
913 3 : pcc->cv.nchrs * sizeof(chr));
914 3 : if (newchrs == NULL)
915 0 : goto out_of_memory;
916 3 : pcc->cv.chrs = newchrs;
917 3 : pcc->cv.chrspace = pcc->cv.nchrs;
918 : }
919 6 : if (pcc->cv.nranges == 0)
920 : {
921 0 : free(pcc->cv.ranges);
922 0 : pcc->cv.ranges = NULL;
923 0 : pcc->cv.rangespace = 0;
924 : }
925 6 : else if (pcc->cv.nranges < pcc->cv.rangespace)
926 : {
927 6 : newchrs = (chr *) realloc(pcc->cv.ranges,
928 6 : pcc->cv.nranges * sizeof(chr) * 2);
929 6 : if (newchrs == NULL)
930 0 : goto out_of_memory;
931 6 : pcc->cv.ranges = newchrs;
932 6 : pcc->cv.rangespace = pcc->cv.nranges;
933 : }
934 :
935 : /*
936 : * Success, link it into cache chain
937 : */
938 6 : pcc->next = pg_ctype_cache_list;
939 6 : pg_ctype_cache_list = pcc;
940 :
941 6 : return &pcc->cv;
942 :
943 : /*
944 : * Failure, clean up
945 : */
946 : out_of_memory:
947 0 : if (pcc->cv.chrs)
948 0 : free(pcc->cv.chrs);
949 0 : if (pcc->cv.ranges)
950 0 : free(pcc->cv.ranges);
951 0 : free(pcc);
952 :
953 0 : return NULL;
954 : }
|