Line data Source code
1 : /*
2 : * regc_locale.c --
3 : *
4 : * This file contains locale-specific regexp routines.
5 : * This file is #included by regcomp.c.
6 : *
7 : * Copyright (c) 1998 by Scriptics Corporation.
8 : *
9 : * This software is copyrighted by the Regents of the University of
10 : * California, Sun Microsystems, Inc., Scriptics Corporation, ActiveState
11 : * Corporation and other parties. The following terms apply to all files
12 : * associated with the software unless explicitly disclaimed in
13 : * individual files.
14 : *
15 : * The authors hereby grant permission to use, copy, modify, distribute,
16 : * and license this software and its documentation for any purpose, provided
17 : * that existing copyright notices are retained in all copies and that this
18 : * notice is included verbatim in any distributions. No written agreement,
19 : * license, or royalty fee is required for any of the authorized uses.
20 : * Modifications to this software may be copyrighted by their authors
21 : * and need not follow the licensing terms described here, provided that
22 : * the new terms are clearly indicated on the first page of each file where
23 : * they apply.
24 : *
25 : * IN NO EVENT SHALL THE AUTHORS OR DISTRIBUTORS BE LIABLE TO ANY PARTY
26 : * FOR DIRECT, INDIRECT, SPECIAL, INCIDENTAL, OR CONSEQUENTIAL DAMAGES
27 : * ARISING OUT OF THE USE OF THIS SOFTWARE, ITS DOCUMENTATION, OR ANY
28 : * DERIVATIVES THEREOF, EVEN IF THE AUTHORS HAVE BEEN ADVISED OF THE
29 : * POSSIBILITY OF SUCH DAMAGE.
30 : *
31 : * THE AUTHORS AND DISTRIBUTORS SPECIFICALLY DISCLAIM ANY WARRANTIES,
32 : * INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY,
33 : * FITNESS FOR A PARTICULAR PURPOSE, AND NON-INFRINGEMENT. THIS SOFTWARE
34 : * IS PROVIDED ON AN "AS IS" BASIS, AND THE AUTHORS AND DISTRIBUTORS HAVE
35 : * NO OBLIGATION TO PROVIDE MAINTENANCE, SUPPORT, UPDATES, ENHANCEMENTS, OR
36 : * MODIFICATIONS.
37 : *
38 : * GOVERNMENT USE: If you are acquiring this software on behalf of the
39 : * U.S. government, the Government shall have only "Restricted Rights"
40 : * in the software and related documentation as defined in the Federal
41 : * Acquisition Regulations (FARs) in Clause 52.227.19 (c) (2). If you
42 : * are acquiring the software on behalf of the Department of Defense, the
43 : * software shall be classified as "Commercial Computer Software" and the
44 : * Government shall have only "Restricted Rights" as defined in Clause
45 : * 252.227-7013 (c) (1) of DFARs. Notwithstanding the foregoing, the
46 : * authors grant the U.S. Government and others acting in its behalf
47 : * permission to use and distribute the software in accordance with the
48 : * terms specified in this license.
49 : *
50 : * src/backend/regex/regc_locale.c
51 : */
52 :
/*
 * ASCII character-name table.
 *
 * Maps symbolic collating-element names to the single ASCII character each
 * one denotes.  Several characters have more than one name.  The table is
 * scanned linearly and ends at the entry whose name is NULL.
 */
static const struct cname
{
	const char *name;			/* symbolic name, NULL in the end marker */
	const char	code;			/* character the name stands for */
}			cnames[] =
{
	/* control characters */
	{"NUL", '\0'},
	{"SOH", '\001'},
	{"STX", '\002'},
	{"ETX", '\003'},
	{"EOT", '\004'},
	{"ENQ", '\005'},
	{"ACK", '\006'},
	{"BEL", '\007'},
	{"alert", '\007'},
	{"BS", '\010'},
	{"backspace", '\b'},
	{"HT", '\011'},
	{"tab", '\t'},
	{"LF", '\012'},
	{"newline", '\n'},
	{"VT", '\013'},
	{"vertical-tab", '\v'},
	{"FF", '\014'},
	{"form-feed", '\f'},
	{"CR", '\015'},
	{"carriage-return", '\r'},
	{"SO", '\016'},
	{"SI", '\017'},
	{"DLE", '\020'},
	{"DC1", '\021'},
	{"DC2", '\022'},
	{"DC3", '\023'},
	{"DC4", '\024'},
	{"NAK", '\025'},
	{"SYN", '\026'},
	{"ETB", '\027'},
	{"CAN", '\030'},
	{"EM", '\031'},
	{"SUB", '\032'},
	{"ESC", '\033'},
	{"IS4", '\034'},
	{"FS", '\034'},
	{"IS3", '\035'},
	{"GS", '\035'},
	{"IS2", '\036'},
	{"RS", '\036'},
	{"IS1", '\037'},
	{"US", '\037'},

	/* space and punctuation below the digits */
	{"space", ' '},
	{"exclamation-mark", '!'},
	{"quotation-mark", '"'},
	{"number-sign", '#'},
	{"dollar-sign", '$'},
	{"percent-sign", '%'},
	{"ampersand", '&'},
	{"apostrophe", '\''},
	{"left-parenthesis", '('},
	{"right-parenthesis", ')'},
	{"asterisk", '*'},
	{"plus-sign", '+'},
	{"comma", ','},
	{"hyphen", '-'},
	{"hyphen-minus", '-'},
	{"period", '.'},
	{"full-stop", '.'},
	{"slash", '/'},
	{"solidus", '/'},

	/* digits */
	{"zero", '0'},
	{"one", '1'},
	{"two", '2'},
	{"three", '3'},
	{"four", '4'},
	{"five", '5'},
	{"six", '6'},
	{"seven", '7'},
	{"eight", '8'},
	{"nine", '9'},

	/* remaining punctuation */
	{"colon", ':'},
	{"semicolon", ';'},
	{"less-than-sign", '<'},
	{"equals-sign", '='},
	{"greater-than-sign", '>'},
	{"question-mark", '?'},
	{"commercial-at", '@'},
	{"left-square-bracket", '['},
	{"backslash", '\\'},
	{"reverse-solidus", '\\'},
	{"right-square-bracket", ']'},
	{"circumflex", '^'},
	{"circumflex-accent", '^'},
	{"underscore", '_'},
	{"low-line", '_'},
	{"grave-accent", '`'},
	{"left-brace", '{'},
	{"left-curly-bracket", '{'},
	{"vertical-line", '|'},
	{"right-brace", '}'},
	{"right-curly-bracket", '}'},
	{"tilde", '~'},
	{"DEL", '\177'},

	/* end marker */
	{NULL, 0}
};
351 :
352 : /*
353 : * The following arrays define the valid character class names.
354 : */
355 : static const char *const classNames[NUM_CCLASSES + 1] = {
356 : "alnum", "alpha", "ascii", "blank", "cntrl", "digit", "graph",
357 : "lower", "print", "punct", "space", "upper", "xdigit", NULL
358 : };
359 :
/*
 * Character class identifiers.  Values are consecutive from zero, in the
 * same order as the class names are listed in classNames[].
 */
enum classes
{
	CC_ALNUM = 0,
	CC_ALPHA = 1,
	CC_ASCII = 2,
	CC_BLANK = 3,
	CC_CNTRL = 4,
	CC_DIGIT = 5,
	CC_GRAPH = 6,
	CC_LOWER = 7,
	CC_PRINT = 8,
	CC_PUNCT = 9,
	CC_SPACE = 10,
	CC_UPPER = 11,
	CC_XDIGIT = 12
};
365 :
366 : /*
367 : * We do not use the hard-wired Unicode classification tables that Tcl does.
368 : * This is because (a) we need to deal with other encodings besides Unicode,
369 : * and (b) we want to track the behavior of the libc locale routines as
370 : * closely as possible. For example, it wouldn't be unreasonable for a
371 : * locale to not consider every Unicode letter as a letter. So we build
372 : * character classification cvecs by asking libc, even for Unicode.
373 : */
374 :
375 :
376 : /*
377 : * element - map collating-element name to chr
378 : */
379 : static chr
380 28 : element(struct vars *v, /* context */
381 : const chr *startp, /* points to start of name */
382 : const chr *endp) /* points just past end of name */
383 : {
384 : const struct cname *cn;
385 : size_t len;
386 :
387 : /* generic: one-chr names stand for themselves */
388 28 : assert(startp < endp);
389 28 : len = endp - startp;
390 28 : if (len == 1)
391 28 : return *startp;
392 :
393 0 : NOTE(REG_ULOCALE);
394 :
395 : /* search table */
396 0 : for (cn = cnames; cn->name != NULL; cn++)
397 : {
398 0 : if (strlen(cn->name) == len &&
399 0 : pg_char_and_wchar_strncmp(cn->name, startp, len) == 0)
400 : {
401 0 : break; /* NOTE BREAK OUT */
402 : }
403 : }
404 0 : if (cn->name != NULL)
405 0 : return CHR(cn->code);
406 :
407 : /* couldn't find it */
408 0 : ERR(REG_ECOLLATE);
409 0 : return 0;
410 : }
411 :
412 : /*
413 : * range - supply cvec for a range, including legality check
414 : */
415 : static struct cvec *
416 14 : range(struct vars *v, /* context */
417 : chr a, /* range start */
418 : chr b, /* range end, might equal a */
419 : int cases) /* case-independent? */
420 : {
421 : int nchrs;
422 : struct cvec *cv;
423 : chr c,
424 : cc;
425 :
426 14 : if (a != b && !before(a, b))
427 : {
428 0 : ERR(REG_ERANGE);
429 0 : return NULL;
430 : }
431 :
432 14 : if (!cases)
433 : { /* easy version */
434 14 : cv = getcvec(v, 0, 1);
435 14 : NOERRN();
436 14 : addrange(cv, a, b);
437 14 : return cv;
438 : }
439 :
440 : /*
441 : * When case-independent, it's hard to decide when cvec ranges are usable,
442 : * so for now at least, we won't try. We use a range for the originally
443 : * specified chrs and then add on any case-equivalents that are outside
444 : * that range as individual chrs.
445 : *
446 : * To ensure sane behavior if someone specifies a very large range, limit
447 : * the allocation size to 100000 chrs (arbitrary) and check for overrun
448 : * inside the loop below.
449 : */
450 0 : nchrs = b - a + 1;
451 0 : if (nchrs <= 0 || nchrs > 100000)
452 0 : nchrs = 100000;
453 :
454 0 : cv = getcvec(v, nchrs, 1);
455 0 : NOERRN();
456 0 : addrange(cv, a, b);
457 :
458 0 : for (c = a; c <= b; c++)
459 : {
460 0 : cc = pg_wc_tolower(c);
461 0 : if (cc != c &&
462 0 : (before(cc, a) || before(b, cc)))
463 : {
464 0 : if (cv->nchrs >= cv->chrspace)
465 : {
466 0 : ERR(REG_ETOOBIG);
467 0 : return NULL;
468 : }
469 0 : addchr(cv, cc);
470 : }
471 0 : cc = pg_wc_toupper(c);
472 0 : if (cc != c &&
473 0 : (before(cc, a) || before(b, cc)))
474 : {
475 0 : if (cv->nchrs >= cv->chrspace)
476 : {
477 0 : ERR(REG_ETOOBIG);
478 0 : return NULL;
479 : }
480 0 : addchr(cv, cc);
481 : }
482 0 : if (CANCEL_REQUESTED(v->re))
483 : {
484 0 : ERR(REG_CANCEL);
485 0 : return NULL;
486 : }
487 : }
488 :
489 0 : return cv;
490 : }
491 :
492 : /*
493 : * before - is chr x before chr y, for purposes of range legality?
494 : */
495 : static int /* predicate */
496 14 : before(chr x, chr y)
497 : {
498 14 : if (x < y)
499 14 : return 1;
500 0 : return 0;
501 : }
502 :
503 : /*
504 : * eclass - supply cvec for an equivalence class
505 : * Must include case counterparts on request.
506 : */
507 : static struct cvec *
508 0 : eclass(struct vars *v, /* context */
509 : chr c, /* Collating element representing the
510 : * equivalence class. */
511 : int cases) /* all cases? */
512 : {
513 : struct cvec *cv;
514 :
515 : /* crude fake equivalence class for testing */
516 0 : if ((v->cflags & REG_FAKE) && c == 'x')
517 : {
518 0 : cv = getcvec(v, 4, 0);
519 0 : addchr(cv, CHR('x'));
520 0 : addchr(cv, CHR('y'));
521 0 : if (cases)
522 : {
523 0 : addchr(cv, CHR('X'));
524 0 : addchr(cv, CHR('Y'));
525 : }
526 0 : return cv;
527 : }
528 :
529 : /* otherwise, none */
530 0 : if (cases)
531 0 : return allcases(v, c);
532 0 : cv = getcvec(v, 1, 0);
533 0 : assert(cv != NULL);
534 0 : addchr(cv, c);
535 0 : return cv;
536 : }
537 :
538 : /*
539 : * cclass - supply cvec for a character class
540 : *
541 : * Must include case counterparts if "cases" is true.
542 : *
543 : * The returned cvec might be either a transient cvec gotten from getcvec(),
544 : * or a permanently cached one from pg_ctype_get_cache(). This is okay
545 : * because callers are not supposed to explicitly free the result either way.
546 : */
547 : static struct cvec *
548 18 : cclass(struct vars *v, /* context */
549 : const chr *startp, /* where the name starts */
550 : const chr *endp, /* just past the end of the name */
551 : int cases) /* case-independent? */
552 : {
553 : size_t len;
554 18 : struct cvec *cv = NULL;
555 : const char *const *namePtr;
556 : int i,
557 : index;
558 :
559 : /*
560 : * Map the name to the corresponding enumerated value.
561 : */
562 18 : len = endp - startp;
563 18 : index = -1;
564 73 : for (namePtr = classNames, i = 0; *namePtr != NULL; namePtr++, i++)
565 : {
566 146 : if (strlen(*namePtr) == len &&
567 73 : pg_char_and_wchar_strncmp(*namePtr, startp, len) == 0)
568 : {
569 18 : index = i;
570 18 : break;
571 : }
572 : }
573 18 : if (index == -1)
574 : {
575 0 : ERR(REG_ECTYPE);
576 0 : return NULL;
577 : }
578 :
579 : /*
580 : * Remap lower and upper to alpha if the match is case insensitive.
581 : */
582 :
583 18 : if (cases &&
584 0 : ((enum classes) index == CC_LOWER ||
585 : (enum classes) index == CC_UPPER))
586 0 : index = (int) CC_ALPHA;
587 :
588 : /*
589 : * Now compute the character class contents. For classes that are based
590 : * on the behavior of a <wctype.h> or <ctype.h> function, we use
591 : * pg_ctype_get_cache so that we can cache the results. Other classes
592 : * have definitions that are hard-wired here, and for those we just
593 : * construct a transient cvec on the fly.
594 : *
595 : * NB: keep this code in sync with cclass_column_index(), below.
596 : */
597 :
598 18 : switch ((enum classes) index)
599 : {
600 : case CC_PRINT:
601 0 : cv = pg_ctype_get_cache(pg_wc_isprint, index);
602 0 : break;
603 : case CC_ALNUM:
604 10 : cv = pg_ctype_get_cache(pg_wc_isalnum, index);
605 10 : break;
606 : case CC_ALPHA:
607 0 : cv = pg_ctype_get_cache(pg_wc_isalpha, index);
608 0 : break;
609 : case CC_ASCII:
610 : /* hard-wired meaning */
611 0 : cv = getcvec(v, 0, 1);
612 0 : if (cv)
613 0 : addrange(cv, 0, 0x7f);
614 0 : break;
615 : case CC_BLANK:
616 : /* hard-wired meaning */
617 0 : cv = getcvec(v, 2, 0);
618 0 : addchr(cv, '\t');
619 0 : addchr(cv, ' ');
620 0 : break;
621 : case CC_CNTRL:
622 : /* hard-wired meaning */
623 0 : cv = getcvec(v, 0, 2);
624 0 : addrange(cv, 0x0, 0x1f);
625 0 : addrange(cv, 0x7f, 0x9f);
626 0 : break;
627 : case CC_DIGIT:
628 5 : cv = pg_ctype_get_cache(pg_wc_isdigit, index);
629 5 : break;
630 : case CC_PUNCT:
631 0 : cv = pg_ctype_get_cache(pg_wc_ispunct, index);
632 0 : break;
633 : case CC_XDIGIT:
634 :
635 : /*
636 : * It's not clear how to define this in non-western locales, and
637 : * even less clear that there's any particular use in trying. So
638 : * just hard-wire the meaning.
639 : */
640 0 : cv = getcvec(v, 0, 3);
641 0 : if (cv)
642 : {
643 0 : addrange(cv, '0', '9');
644 0 : addrange(cv, 'a', 'f');
645 0 : addrange(cv, 'A', 'F');
646 : }
647 0 : break;
648 : case CC_SPACE:
649 3 : cv = pg_ctype_get_cache(pg_wc_isspace, index);
650 3 : break;
651 : case CC_LOWER:
652 0 : cv = pg_ctype_get_cache(pg_wc_islower, index);
653 0 : break;
654 : case CC_UPPER:
655 0 : cv = pg_ctype_get_cache(pg_wc_isupper, index);
656 0 : break;
657 : case CC_GRAPH:
658 0 : cv = pg_ctype_get_cache(pg_wc_isgraph, index);
659 0 : break;
660 : }
661 :
662 : /* If cv is NULL now, the reason must be "out of memory" */
663 18 : if (cv == NULL)
664 0 : ERR(REG_ESPACE);
665 18 : return cv;
666 : }
667 :
668 : /*
669 : * cclass_column_index - get appropriate high colormap column index for chr
670 : */
671 : static int
672 0 : cclass_column_index(struct colormap *cm, chr c)
673 : {
674 0 : int colnum = 0;
675 :
676 : /* Shouldn't go through all these pushups for simple chrs */
677 0 : assert(c > MAX_SIMPLE_CHR);
678 :
679 : /*
680 : * Note: we should not see requests to consider cclasses that are not
681 : * treated as locale-specific by cclass(), above.
682 : */
683 0 : if (cm->classbits[CC_PRINT] && pg_wc_isprint(c))
684 0 : colnum |= cm->classbits[CC_PRINT];
685 0 : if (cm->classbits[CC_ALNUM] && pg_wc_isalnum(c))
686 0 : colnum |= cm->classbits[CC_ALNUM];
687 0 : if (cm->classbits[CC_ALPHA] && pg_wc_isalpha(c))
688 0 : colnum |= cm->classbits[CC_ALPHA];
689 0 : assert(cm->classbits[CC_ASCII] == 0);
690 0 : assert(cm->classbits[CC_BLANK] == 0);
691 0 : assert(cm->classbits[CC_CNTRL] == 0);
692 0 : if (cm->classbits[CC_DIGIT] && pg_wc_isdigit(c))
693 0 : colnum |= cm->classbits[CC_DIGIT];
694 0 : if (cm->classbits[CC_PUNCT] && pg_wc_ispunct(c))
695 0 : colnum |= cm->classbits[CC_PUNCT];
696 0 : assert(cm->classbits[CC_XDIGIT] == 0);
697 0 : if (cm->classbits[CC_SPACE] && pg_wc_isspace(c))
698 0 : colnum |= cm->classbits[CC_SPACE];
699 0 : if (cm->classbits[CC_LOWER] && pg_wc_islower(c))
700 0 : colnum |= cm->classbits[CC_LOWER];
701 0 : if (cm->classbits[CC_UPPER] && pg_wc_isupper(c))
702 0 : colnum |= cm->classbits[CC_UPPER];
703 0 : if (cm->classbits[CC_GRAPH] && pg_wc_isgraph(c))
704 0 : colnum |= cm->classbits[CC_GRAPH];
705 :
706 0 : return colnum;
707 : }
708 :
709 : /*
710 : * allcases - supply cvec for all case counterparts of a chr (including itself)
711 : *
712 : * This is a shortcut, preferably an efficient one, for simple characters;
713 : * messy cases are done via range().
714 : */
715 : static struct cvec *
716 16 : allcases(struct vars *v, /* context */
717 : chr c) /* character to get case equivs of */
718 : {
719 : struct cvec *cv;
720 : chr lc,
721 : uc;
722 :
723 16 : lc = pg_wc_tolower(c);
724 16 : uc = pg_wc_toupper(c);
725 :
726 16 : cv = getcvec(v, 2, 0);
727 16 : addchr(cv, lc);
728 16 : if (lc != uc)
729 15 : addchr(cv, uc);
730 16 : return cv;
731 : }
732 :
733 : /*
734 : * cmp - chr-substring compare
735 : *
736 : * Backrefs need this. It should preferably be efficient.
737 : * Note that it does not need to report anything except equal/unequal.
738 : * Note also that the length is exact, and the comparison should not
739 : * stop at embedded NULs!
740 : */
741 : static int /* 0 for equal, nonzero for unequal */
742 48 : cmp(const chr *x, const chr *y, /* strings to compare */
743 : size_t len) /* exact length of comparison */
744 : {
745 48 : return memcmp(VS(x), VS(y), len * sizeof(chr));
746 : }
747 :
748 : /*
749 : * casecmp - case-independent chr-substring compare
750 : *
751 : * REG_ICASE backrefs need this. It should preferably be efficient.
752 : * Note that it does not need to report anything except equal/unequal.
753 : * Note also that the length is exact, and the comparison should not
754 : * stop at embedded NULs!
755 : */
756 : static int /* 0 for equal, nonzero for unequal */
757 0 : casecmp(const chr *x, const chr *y, /* strings to compare */
758 : size_t len) /* exact length of comparison */
759 : {
760 0 : for (; len > 0; len--, x++, y++)
761 : {
762 0 : if ((*x != *y) && (pg_wc_tolower(*x) != pg_wc_tolower(*y)))
763 0 : return 1;
764 : }
765 0 : return 0;
766 : }
|