Line data Source code
1 : %top{
2 : /*-------------------------------------------------------------------------
3 : *
4 : * scan.l
5 : * lexical scanner for PostgreSQL
6 : *
7 : * NOTE NOTE NOTE:
8 : *
9 : * The rules in this file must be kept in sync with src/fe_utils/psqlscan.l!
10 : *
11 : * The rules are designed so that the scanner never has to backtrack,
12 : * in the sense that there is always a rule that can match the input
13 : * consumed so far (the rule action may internally throw back some input
14 : * with yyless(), however). As explained in the flex manual, this makes
15 : * for a useful speed increase --- about a third faster than a plain -CF
16 : * lexer, in simple testing. The extra complexity is mostly in the rules
17 : * for handling float numbers and continued string literals. If you change
18 : * the lexical rules, verify that you haven't broken the no-backtrack
19 : * property by running flex with the "-b" option and checking that the
20 : * resulting "lex.backup" file says that no backing up is needed. (As of
21 : * Postgres 9.2, this check is made automatically by the Makefile.)
22 : *
23 : *
24 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
25 : * Portions Copyright (c) 1994, Regents of the University of California
26 : *
27 : * IDENTIFICATION
28 : * src/backend/parser/scan.l
29 : *
30 : *-------------------------------------------------------------------------
31 : */
32 : #include "postgres.h"
33 :
34 : #include <ctype.h>
35 : #include <unistd.h>
36 :
37 : #include "parser/gramparse.h"
38 : #include "parser/parser.h" /* only needed for GUC variables */
39 : #include "parser/scansup.h"
40 : #include "mb/pg_wchar.h"
41 : }
42 :
43 : %{
44 : /* Avoid exit() on fatal scanner errors (a bit ugly -- see yy_fatal_error) */
45 : #undef fprintf
46 : #define fprintf(file, fmt, msg) fprintf_to_ereport(fmt, msg)
47 :
48 : static void /* Target of the fprintf macro defined just above: turns a flex fatal-error message into an ereport(ERROR). */
49 0 : fprintf_to_ereport(const char *fmt, const char *msg)
50 : { /* fmt is accepted only to fit the macro signature and is never used */
51 0 : ereport(ERROR, (errmsg_internal("%s", msg))); /* msg is passed as data, never as a format string */
52 : }
53 :
54 : /*
55 : * GUC variables. This is a DIRECT violation of the warning given at the
56 : * head of gram.y, ie flex/bison code must not depend on any GUC variables;
57 : * as such, changing their values can induce very unintuitive behavior.
58 : * But we shall have to live with it until we can remove these variables.
59 : */
60 : int backslash_quote = BACKSLASH_QUOTE_SAFE_ENCODING;
61 : bool escape_string_warning = true;
62 : bool standard_conforming_strings = true;
63 :
64 : /*
65 : * Set the type of YYSTYPE.
66 : */
67 : #define YYSTYPE core_YYSTYPE
68 :
69 : /*
70 : * Set the type of yyextra. All state variables used by the scanner should
71 : * be in yyextra, *not* statically allocated.
72 : */
73 : #define YY_EXTRA_TYPE core_yy_extra_type *
74 :
75 : /*
76 : * Each call to yylex must set yylloc to the location of the found token
77 : * (expressed as a byte offset from the start of the input text).
78 : * When we parse a token that requires multiple lexer rules to process,
79 : * this should be done in the first such rule, else yylloc will point
80 : * into the middle of the token.
81 : */
82 : #define SET_YYLLOC() (*(yylloc) = yytext - yyextra->scanbuf)
83 :
84 : /*
85 : * Advance yylloc by the given number of bytes.
86 : */
87 : #define ADVANCE_YYLLOC(delta) ( *(yylloc) += (delta) )
88 :
89 : #define startlit() ( yyextra->literallen = 0 )
90 : static void addlit(char *ytext, int yleng, core_yyscan_t yyscanner);
91 : static void addlitchar(unsigned char ychar, core_yyscan_t yyscanner);
92 : static char *litbufdup(core_yyscan_t yyscanner);
93 : static char *litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner);
94 : static unsigned char unescape_single_char(unsigned char c, core_yyscan_t yyscanner);
95 : static int process_integer_literal(const char *token, YYSTYPE *lval);
96 : static bool is_utf16_surrogate_first(pg_wchar c);
97 : static bool is_utf16_surrogate_second(pg_wchar c);
98 : static pg_wchar surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second);
99 : static void addunicode(pg_wchar c, yyscan_t yyscanner);
100 : static bool check_uescapechar(unsigned char escape);
101 :
102 : #define yyerror(msg) scanner_yyerror(msg, yyscanner)
103 :
104 : #define lexer_errposition() scanner_errposition(*(yylloc), yyscanner)
105 :
106 : static void check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner);
107 : static void check_escape_warning(core_yyscan_t yyscanner);
108 :
109 : /*
110 : * Work around a bug in flex 2.5.35: it emits a couple of functions that
111 : * it forgets to emit declarations for. Since we use -Wmissing-prototypes,
112 : * this would cause warnings. Providing our own declarations should be
113 : * harmless even when the bug gets fixed.
114 : */
115 : extern int core_yyget_column(yyscan_t yyscanner);
116 : extern void core_yyset_column(int column_no, yyscan_t yyscanner);
117 :
118 : %}
119 :
120 : %option reentrant
121 : %option bison-bridge
122 : %option bison-locations
123 : %option 8bit
124 : %option never-interactive
125 : %option nodefault
126 : %option noinput
127 : %option nounput
128 : %option noyywrap
129 : %option noyyalloc
130 : %option noyyrealloc
131 : %option noyyfree
132 : %option warn
133 : %option prefix="core_yy"
134 :
135 : /*
136 : * OK, here is a short description of lex/flex rules behavior.
137 : * The longest pattern which matches an input string is always chosen.
138 : * For equal-length patterns, the first occurring in the rules list is chosen.
139 : * INITIAL is the starting state, to which all non-conditional rules apply.
140 : * Exclusive states change parsing rules while the state is active. When in
141 : * an exclusive state, only those rules defined for that state apply.
142 : *
143 : * We use exclusive states for quoted strings, extended comments,
144 : * and to eliminate parsing troubles for numeric strings.
145 : * Exclusive states:
146 : * <xb> bit string literal
147 : * <xc> extended C-style comments
148 : * <xd> delimited identifiers (double-quoted identifiers)
149 : * <xh> hexadecimal numeric string
150 : * <xq> standard quoted strings
151 : * <xe> extended quoted strings (support backslash escape sequences)
152 : * <xdolq> $foo$ quoted strings
153 : * <xui> quoted identifier with Unicode escapes
154 : * <xuiend> end of a quoted identifier with Unicode escapes, UESCAPE can follow
155 : * <xus> quoted string with Unicode escapes
156 : * <xusend> end of a quoted string with Unicode escapes, UESCAPE can follow
157 : * <xeu> Unicode surrogate pair in extended quoted string
158 : *
159 : * Remember to add an <<EOF>> case whenever you add a new exclusive state!
160 : * The default one is probably not the right thing.
161 : */
162 :
163 : %x xb
164 : %x xc
165 : %x xd
166 : %x xh
167 : %x xe
168 : %x xq
169 : %x xdolq
170 : %x xui
171 : %x xuiend
172 : %x xus
173 : %x xusend
174 : %x xeu
175 :
176 : /*
177 : * In order to make the world safe for Windows and Mac clients as well as
178 : * Unix ones, we accept either \n or \r as a newline. A DOS-style \r\n
179 : * sequence will be seen as two successive newlines, but that doesn't cause
180 : * any problems. Comments that start with -- and extend to the next
181 : * newline are treated as equivalent to a single whitespace character.
182 : *
183 : * NOTE a fine point: if there is no newline following --, we will absorb
184 : * everything to the end of the input as a comment. This is correct. Older
185 : * versions of Postgres failed to recognize -- as a comment if the input
186 : * did not end with a newline.
187 : *
188 : * XXX perhaps \f (formfeed) should be treated as a newline as well?
189 : *
190 : * XXX if you change the set of whitespace characters, fix scanner_isspace()
191 : * to agree, and see also the plpgsql lexer.
192 : */
193 :
194 : space [ \t\n\r\f]
195 : horiz_space [ \t\f]
196 : newline [\n\r]
197 : non_newline [^\n\r]
198 :
199 : comment ("--"{non_newline}*)
200 :
201 : whitespace ({space}+|{comment})
202 :
203 : /*
204 : * SQL requires at least one newline in the whitespace separating
205 : * string literals that are to be concatenated. Silly, but who are we
206 : * to argue? Note that {whitespace_with_newline} should not have * after
207 : * it, whereas {whitespace} should generally have a * after it...
208 : */
209 :
210 : special_whitespace ({space}+|{comment}{newline})
211 : horiz_whitespace ({horiz_space}|{comment})
212 : whitespace_with_newline ({horiz_whitespace}*{newline}{special_whitespace}*)
213 :
214 : /*
215 : * To ensure that {quotecontinue} can be scanned without having to back up
216 : * if the full pattern isn't matched, we include trailing whitespace in
217 : * {quotestop}. This matches all cases where {quotecontinue} fails to match,
218 : * except for {quote} followed by whitespace and just one "-" (not two,
219 : * which would start a {comment}). To cover that we have {quotefail}.
220 : * The actions for {quotestop} and {quotefail} must throw back characters
221 : * beyond the quote proper.
222 : */
223 : quote '
224 : quotestop {quote}{whitespace}*
225 : quotecontinue {quote}{whitespace_with_newline}{quote}
226 : quotefail {quote}{whitespace}*"-"
227 :
228 : /* Bit string
229 : * It is tempting to scan the string for only those characters
230 : * which are allowed. However, this leads to silently swallowed
231 : * characters if illegal characters are included in the string.
232 : * For example, if xbinside is [01] then B'ABCD' is interpreted
233 : * as a zero-length string, and the ABCD' is lost!
234 : * Better to pass the string forward and let the input routines
235 : * validate the contents.
236 : */
237 : xbstart [bB]{quote}
238 : xbinside [^']*
239 :
240 : /* Hexadecimal number */
241 : xhstart [xX]{quote}
242 : xhinside [^']*
243 :
244 : /* National character */
245 : xnstart [nN]{quote}
246 :
247 : /* Quoted string that allows backslash escapes */
248 : xestart [eE]{quote}
249 : xeinside [^\\']+
250 : xeescape [\\][^0-7]
251 : xeoctesc [\\][0-7]{1,3}
252 : xehexesc [\\]x[0-9A-Fa-f]{1,2}
253 : xeunicode [\\](u[0-9A-Fa-f]{4}|U[0-9A-Fa-f]{8})
254 : xeunicodefail [\\](u[0-9A-Fa-f]{0,3}|U[0-9A-Fa-f]{0,7})
255 :
256 : /* Extended quote
257 : * xqdouble implements embedded quote, ''''
258 : */
259 : xqstart {quote}
260 : xqdouble {quote}{quote}
261 : xqinside [^']+
262 :
263 : /* $foo$ style quotes ("dollar quoting")
264 : * The quoted string starts with $foo$ where "foo" is an optional string
265 : * in the form of an identifier, except that it may not contain "$",
266 : * and extends to the first occurrence of an identical string.
267 : * There is *no* processing of the quoted text.
268 : *
269 : * {dolqfailed} is an error rule to avoid scanner backup when {dolqdelim}
270 : * fails to match its trailing "$".
271 : */
272 : dolq_start [A-Za-z\200-\377_]
273 : dolq_cont [A-Za-z\200-\377_0-9]
274 : dolqdelim \$({dolq_start}{dolq_cont}*)?\$
275 : dolqfailed \${dolq_start}{dolq_cont}*
276 : dolqinside [^$]+
277 :
278 : /* Double quote
279 : * Allows embedded spaces and other special characters into identifiers.
280 : */
281 : dquote \"
282 : xdstart {dquote}
283 : xdstop {dquote}
284 : xddouble {dquote}{dquote}
285 : xdinside [^"]+
286 :
287 : /* Unicode escapes */
288 : uescape [uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']{quote}
289 : /* error rule to avoid backup */
290 : uescapefail [uU][eE][sS][cC][aA][pP][eE]{whitespace}*"-"|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}[^']|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*{quote}|[uU][eE][sS][cC][aA][pP][eE]{whitespace}*|[uU][eE][sS][cC][aA][pP]|[uU][eE][sS][cC][aA]|[uU][eE][sS][cC]|[uU][eE][sS]|[uU][eE]|[uU]
291 :
292 : /* Quoted identifier with Unicode escapes */
293 : xuistart [uU]&{dquote}
294 :
295 : /* Quoted string with Unicode escapes */
296 : xusstart [uU]&{quote}
297 :
298 : /* Optional UESCAPE after a quoted string or identifier with Unicode escapes. */
299 : xustop1 {uescapefail}?
300 : xustop2 {uescape}
301 :
302 : /* error rule to avoid backup */
303 : xufailed [uU]&
304 :
305 :
306 : /* C-style comments
307 : *
308 : * The "extended comment" syntax closely resembles allowable operator syntax.
309 : * The tricky part here is to get lex to recognize a string starting with
310 : * slash-star as a comment, when interpreting it as an operator would produce
311 : * a longer match --- remember lex will prefer a longer match! Also, if we
312 : * have something like plus-slash-star, lex will think this is a 3-character
313 : * operator whereas we want to see it as a + operator and a comment start.
314 : * The solution is two-fold:
315 : * 1. append {op_chars}* to xcstart so that it matches as much text as
316 : * {operator} would. Then the tie-breaker (first matching rule of same
317 : * length) ensures xcstart wins. We put back the extra stuff with yyless()
318 : * in case it contains a star-slash that should terminate the comment.
319 : * 2. In the operator rule, check for slash-star within the operator, and
320 : * if found throw it back with yyless(). This handles the plus-slash-star
321 : * problem.
322 : * Dash-dash comments have similar interactions with the operator rule.
323 : */
324 : xcstart \/\*{op_chars}*
325 : xcstop \*+\/
326 : xcinside [^*/]+
327 :
328 : digit [0-9]
329 : ident_start [A-Za-z\200-\377_]
330 : ident_cont [A-Za-z\200-\377_0-9\$]
331 :
332 : identifier {ident_start}{ident_cont}*
333 :
334 : /* Assorted special-case operators and operator-like tokens */
335 : typecast "::"
336 : dot_dot \.\.
337 : colon_equals ":="
338 : equals_greater "=>"
339 : less_equals "<="
340 : greater_equals ">="
341 : less_greater "<>"
342 : not_equals "!="
343 :
344 : /*
345 : * "self" is the set of chars that should be returned as single-character
346 : * tokens. "op_chars" is the set of chars that can make up "Op" tokens,
347 : * which can be one or more characters long (but if a single-char token
348 : * appears in the "self" set, it is not to be returned as an Op). Note
349 : * that the sets overlap, but each has some chars that are not in the other.
350 : *
351 : * If you change either set, adjust the character lists appearing in the
352 : * rule for "operator"!
353 : */
354 : self [,()\[\].;\:\+\-\*\/\%\^\<\>\=]
355 : op_chars [\~\!\@\#\^\&\|\`\?\+\-\*\/\%\<\>\=]
356 : operator {op_chars}+
357 :
358 : /* We no longer allow unary minus in numbers.
359 : * Instead we pass it separately to the parser, where it gets
360 : * coerced via doNegate() -- Leon, Aug 20 1999
361 : *
362 : * {decimalfail} is used because we would like "1..10" to lex as 1, dot_dot, 10.
363 : *
364 : * {realfail1} and {realfail2} are added to prevent the need for scanner
365 : * backup when the {real} rule fails to match completely.
366 : */
367 :
368 : integer {digit}+
369 : decimal (({digit}*\.{digit}+)|({digit}+\.{digit}*))
370 : decimalfail {digit}+\.\.
371 : real ({integer}|{decimal})[Ee][-+]?{digit}+
372 : realfail1 ({integer}|{decimal})[Ee]
373 : realfail2 ({integer}|{decimal})[Ee][-+]
374 :
375 : param \${integer}
376 :
377 : other .
378 :
379 : /*
380 : * Dollar quoted strings are totally opaque, and no escaping is done on them.
381 : * Other quoted strings must allow some special characters such as single-quote
382 : * and newline.
383 : * Embedded single-quotes are implemented both in the SQL standard
384 : * style of two adjacent single quotes "''" and in the Postgres/Java style
385 : * of escaped-quote "\'".
386 : * Other embedded escaped characters are matched explicitly and the leading
387 : * backslash is dropped from the string.
388 : * Note that xcstart must appear before operator, as explained above!
389 : * Also whitespace (comment) must appear before operator.
390 : */
391 :
392 : %%
393 :
394 : {whitespace} {
395 : /* ignore: runs of space characters and dash-dash comments produce no token */
396 : }
397 322152 :
398 : {xcstart} {
399 : /* Set location in case of syntax error in comment */
400 221 : SET_YYLLOC();
401 221 : yyextra->xcdepth = 0; /* nesting depth; zero means the outermost comment */
402 221 : BEGIN(xc);
403 : /* Put back any characters past slash-star; see above */
404 221 : yyless(2);
405 : }
406 221 :
407 : <xc>{xcstart} {
408 3 : (yyextra->xcdepth)++; /* a comment opener inside a comment nests one level deeper */
409 : /* Put back any characters past slash-star; see above */
410 3 : yyless(2);
411 : }
412 3 :
413 : <xc>{xcstop} {
414 224 : if (yyextra->xcdepth <= 0)
415 221 : BEGIN(INITIAL); /* closed the outermost comment: resume normal scanning */
416 : else
417 3 : (yyextra->xcdepth)--; /* closed a nested comment: stay in xc, one level up */
418 : }
419 224 :
420 : <xc>{xcinside} {
421 : /* ignore: comment text containing neither star nor slash */
422 : }
423 786 :
424 : <xc>{op_chars} {
425 : /* ignore: a single operator character inside the comment */
426 : }
427 559 :
428 : <xc>\*+ {
429 : /* ignore: a run of stars; when a slash follows, the longer xcstop match wins instead */
430 : }
431 0 :
432 0 : <xc><<EOF>> { yyerror("unterminated /* comment"); }
433 :
434 : {xbstart} {
435 : /* Binary bit type.
436 : * At some point we should simply pass the string
437 : * forward to the parser and label it there.
438 : * In the meantime, place a leading "b" on the string
439 : * to mark it for the input routine as a binary string.
440 : */
441 117 : SET_YYLLOC();
442 117 : BEGIN(xb);
443 117 : startlit(); /* reset the literal length to zero */
444 117 : addlitchar('b', yyscanner);
445 : }
446 117 : <xb>{quotestop} |
447 : <xb>{quotefail} {
448 117 : yyless(1); /* keep only the closing quote; anything matched after it is rescanned */
449 117 : BEGIN(INITIAL);
450 117 : yylval->str = litbufdup(yyscanner);
451 117 : return BCONST;
452 : }
453 : <xh>{xhinside} |
454 : <xb>{xbinside} {
455 144 : addlit(yytext, yyleng, yyscanner); /* contents are not validated here; see the Bit string note in the definitions */
456 : }
457 144 : <xh>{quotecontinue} |
458 : <xb>{quotecontinue} {
459 : /* ignore: quote, whitespace containing a newline, quote; the literal simply continues */
460 : }
461 0 : <xb><<EOF>> { yyerror("unterminated bit string literal"); }
462 :
463 : {xhstart} {
464 : /* Hexadecimal bit type.
465 : * At some point we should simply pass the string
466 : * forward to the parser and label it there.
467 : * In the meantime, place a leading "x" on the string
468 : * to mark it for the input routine as a hex string.
469 : */
470 32 : SET_YYLLOC();
471 32 : BEGIN(xh);
472 32 : startlit(); /* reset the literal length to zero */
473 32 : addlitchar('x', yyscanner);
474 : }
475 32 : <xh>{quotestop} |
476 : <xh>{quotefail} {
477 32 : yyless(1); /* keep only the closing quote; anything matched after it is rescanned */
478 32 : BEGIN(INITIAL);
479 32 : yylval->str = litbufdup(yyscanner);
480 32 : return XCONST;
481 : }
482 0 : <xh><<EOF>> { yyerror("unterminated hexadecimal string literal"); }
483 :
484 : {xnstart} {
485 : /* National character.
486 : * We will pass this along as a normal character string,
487 : * but preceded with an internally-generated "NCHAR".
488 : */
489 : const ScanKeyword *keyword;
490 :
491 0 : SET_YYLLOC();
492 0 : yyless(1); /* eat only 'n' this time; the quote is put back and scanned as the start of an ordinary string */
493 :
494 0 : keyword = ScanKeywordLookup("nchar",
495 0 : yyextra->keywords,
496 0 : yyextra->num_keywords);
497 0 : if (keyword != NULL)
498 : {
499 0 : yylval->keyword = keyword->name; /* report the keyword exactly as the identifier rule would */
500 0 : return keyword->value;
501 : }
502 : else
503 : {
504 : /* If NCHAR isn't a keyword, just return "n" */
505 0 : yylval->str = pstrdup("n");
506 0 : return IDENT;
507 : }
508 : }
509 :
510 : {xqstart} {
511 26094 : yyextra->warn_on_first_escape = true; /* NOTE(review): presumably read by the escape-warning helpers declared above; confirm there */
512 26094 : yyextra->saw_non_ascii = false; /* set by the octal and hex escape rules, tested when the string ends */
513 26094 : SET_YYLLOC();
514 26094 : if (yyextra->standard_conforming_strings)
515 26039 : BEGIN(xq); /* standard string: no backslash escape rules apply */
516 : else
517 55 : BEGIN(xe); /* plain quote is scanned with the backslash escape rules */
518 26094 : startlit();
519 : }
520 26094 : {xestart} {
521 233 : yyextra->warn_on_first_escape = false; /* differs from the plain-quote rule, which sets this to true */
522 233 : yyextra->saw_non_ascii = false;
523 233 : SET_YYLLOC();
524 233 : BEGIN(xe); /* explicit E prefix always uses the backslash escape rules */
525 233 : startlit();
526 : }
527 233 : {xusstart} {
528 12 : SET_YYLLOC(); /* set first so that lexer_errposition() below points at this token */
529 12 : if (!yyextra->standard_conforming_strings)
530 6 : ereport(ERROR,
531 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
532 : errmsg("unsafe use of string constant with Unicode escapes"),
533 : errdetail("String constants with Unicode escapes cannot be used when standard_conforming_strings is off."),
534 : lexer_errposition()));
535 6 : BEGIN(xus);
536 6 : startlit();
537 : }
538 6 : <xq,xe>{quotestop} |
539 : <xq,xe>{quotefail} {
540 26327 : yyless(1); /* consume only the closing quote; trailing whitespace or dash is rescanned */
541 26327 : BEGIN(INITIAL);
542 : /*
543 : * check that the data remains valid if it might have been
544 : * made invalid by unescaping any chars.
545 : */
546 26327 : if (yyextra->saw_non_ascii)
547 0 : pg_verifymbstr(yyextra->literalbuf,
548 0 : yyextra->literallen,
549 : false);
550 26327 : yylval->str = litbufdup(yyscanner);
551 26327 : return SCONST;
552 : }
553 : <xus>{quotestop} |
554 : <xus>{quotefail} {
555 : /* throw back all but the quote */
556 6 : yyless(1);
557 : /* xusend state looks for possible UESCAPE */
558 6 : BEGIN(xusend); /* no token is returned yet; the xusend rules finish the literal */
559 : }
560 6 : <xusend>{whitespace} {
561 : /* stay in xusend state over whitespace */
562 : }
563 4 : <xusend><<EOF>> |
564 : <xusend>{other} |
565 : <xusend>{xustop1} {
566 : /* no UESCAPE after the quote, throw back everything */
567 3 : yyless(0);
568 3 : BEGIN(INITIAL);
569 3 : yylval->str = litbuf_udeescape('\\', yyscanner);
570 1 : return SCONST; /* the literal was de-escaped with backslash as the escape character */
571 : }
572 : <xusend>{xustop2} {
573 : /* found UESCAPE after the end quote */
574 3 : BEGIN(INITIAL);
575 3 : if (!check_uescapechar(yytext[yyleng - 2])) /* the match ends quote, char, quote, so yyleng - 2 indexes the escape character */
576 : {
577 1 : SET_YYLLOC();
578 1 : ADVANCE_YYLLOC(yyleng - 2); /* move the reported location onto the rejected escape character */
579 1 : yyerror("invalid Unicode escape character");
580 : }
581 2 : yylval->str = litbuf_udeescape(yytext[yyleng - 2],
582 : yyscanner);
583 2 : return SCONST;
584 : }
585 : <xq,xe,xus>{xqdouble} {
586 372 : addlitchar('\'', yyscanner);
587 : }
588 372 : <xq,xus>{xqinside} {
589 24909 : addlit(yytext, yyleng, yyscanner); /* any run of non-quote characters, backslashes included */
590 : }
591 24909 : <xe>{xeinside} {
592 311 : addlit(yytext, yyleng, yyscanner); /* run containing neither quote nor backslash */
593 : }
594 311 : <xe>{xeunicode} {
595 0 : pg_wchar c = strtoul(yytext + 2, NULL, 16); /* skip the backslash and the u or U, then read the hex digits */
596 :
597 0 : check_escape_warning(yyscanner);
598 :
599 0 : if (is_utf16_surrogate_first(c))
600 : {
601 0 : yyextra->utf16_first_part = c; /* remember the first half; state xeu requires the second half next */
602 0 : BEGIN(xeu);
603 : }
604 0 : else if (is_utf16_surrogate_second(c))
605 0 : yyerror("invalid Unicode surrogate pair");
606 : else
607 0 : addunicode(c, yyscanner);
608 : }
609 0 : <xeu>{xeunicode} {
610 0 : pg_wchar c = strtoul(yytext + 2, NULL, 16);
611 :
612 0 : if (!is_utf16_surrogate_second(c))
613 0 : yyerror("invalid Unicode surrogate pair");
614 :
615 0 : c = surrogate_pair_to_codepoint(yyextra->utf16_first_part, c); /* combine the saved first half with this second half */
616 :
617 0 : addunicode(c, yyscanner);
618 :
619 0 : BEGIN(xe); /* pair complete: back to ordinary extended-string scanning */
620 : }
621 0 : <xeu>. { yyerror("invalid Unicode surrogate pair"); }
622 0 : <xeu>\n { yyerror("invalid Unicode surrogate pair"); }
623 0 : <xeu><<EOF>> { yyerror("invalid Unicode surrogate pair"); }
624 : <xe,xeu>{xeunicodefail} {
625 0 : ereport(ERROR,
626 : (errcode(ERRCODE_INVALID_ESCAPE_SEQUENCE),
627 : errmsg("invalid Unicode escape"),
628 : errhint("Unicode escapes must be \\uXXXX or \\UXXXXXXXX."),
629 : lexer_errposition()));
630 : }
631 : <xe>{xeescape} {
632 224 : if (yytext[1] == '\'')
633 : {
634 12 : if (yyextra->backslash_quote == BACKSLASH_QUOTE_OFF ||
635 12 : (yyextra->backslash_quote == BACKSLASH_QUOTE_SAFE_ENCODING &&
636 6 : PG_ENCODING_IS_CLIENT_ONLY(pg_get_client_encoding())))
637 0 : ereport(ERROR,
638 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
639 : errmsg("unsafe use of \\' in a string literal"),
640 : errhint("Use '' to write quotes in strings. \\' is insecure in client-only encodings."),
641 : lexer_errposition()));
642 : }
643 224 : check_string_escape_warning(yytext[1], yyscanner); /* yytext[1] is the character following the backslash */
644 224 : addlitchar(unescape_single_char(yytext[1], yyscanner),
645 : yyscanner);
646 : }
647 224 : <xe>{xeoctesc} {
648 14 : unsigned char c = strtoul(yytext + 1, NULL, 8); /* NOTE(review): three octal digits can exceed 0377; such values are narrowed by this assignment */
649 :
650 14 : check_escape_warning(yyscanner);
651 14 : addlitchar(c, yyscanner);
652 14 : if (c == '\0' || IS_HIGHBIT_SET(c))
653 0 : yyextra->saw_non_ascii = true; /* makes the end-of-string rule call pg_verifymbstr */
654 : }
655 14 : <xe>{xehexesc} {
656 0 : unsigned char c = strtoul(yytext + 2, NULL, 16); /* skip the backslash and the x; at most two hex digits */
657 :
658 0 : check_escape_warning(yyscanner);
659 0 : addlitchar(c, yyscanner);
660 0 : if (c == '\0' || IS_HIGHBIT_SET(c))
661 0 : yyextra->saw_non_ascii = true; /* makes the end-of-string rule call pg_verifymbstr */
662 : }
663 0 : <xq,xe,xus>{quotecontinue} {
664 : /* ignore: quote, whitespace containing a newline, quote; the literal simply continues */
665 : }
666 3 : <xe>. {
667 : /* This is only needed for \ just before EOF */
668 0 : addlitchar(yytext[0], yyscanner);
669 : }
670 0 : <xq,xe,xus><<EOF>> { yyerror("unterminated quoted string"); }
671 :
672 : {dolqdelim} {
673 510 : SET_YYLLOC();
674 510 : yyextra->dolqstart = pstrdup(yytext); /* keep the opening delimiter to compare against candidate closers */
675 510 : BEGIN(xdolq);
676 510 : startlit();
677 : }
678 510 : {dolqfailed} {
679 0 : SET_YYLLOC();
680 : /* throw back all but the initial "$" */
681 0 : yyless(1);
682 : /* and treat it as {other} */
683 0 : return yytext[0];
684 : }
685 : <xdolq>{dolqdelim} {
686 538 : if (strcmp(yytext, yyextra->dolqstart) == 0)
687 : {
688 510 : pfree(yyextra->dolqstart); /* exact match with the opener: the literal is complete */
689 510 : yyextra->dolqstart = NULL;
690 510 : BEGIN(INITIAL);
691 510 : yylval->str = litbufdup(yyscanner);
692 510 : return SCONST;
693 : }
694 : else
695 : {
696 : /*
697 : * When we fail to match $...$ to dolqstart, transfer
698 : * the $... part to the output, but put back the final
699 : * $ for rescanning. Consider $delim$...$junk$delim$
700 : */
701 28 : addlit(yytext, yyleng - 1, yyscanner);
702 28 : yyless(yyleng - 1); /* the put-back dollar sign may begin the real closing delimiter */
703 : }
704 : }
705 28 : <xdolq>{dolqinside} {
706 811 : addlit(yytext, yyleng, yyscanner); /* run of characters containing no dollar sign */
707 : }
708 811 : <xdolq>{dolqfailed} {
709 7 : addlit(yytext, yyleng, yyscanner); /* dollar sign plus identifier characters with no closing dollar: ordinary text */
710 : }
711 7 : <xdolq>. {
712 : /* This is only needed for $ inside the quoted text */
713 294 : addlitchar(yytext[0], yyscanner);
714 : }
715 294 : <xdolq><<EOF>> { yyerror("unterminated dollar-quoted string"); }
716 :
717 : {xdstart} {
718 2077 : SET_YYLLOC();
719 2077 : BEGIN(xd);
720 2077 : startlit(); /* reset the literal length to zero */
721 : }
722 2077 : {xuistart} {
723 4 : SET_YYLLOC();
724 4 : BEGIN(xui);
725 4 : startlit(); /* reset the literal length to zero */
726 : }
727 4 : <xd>{xdstop} {
728 : char *ident;
729 :
730 2077 : BEGIN(INITIAL);
731 2077 : if (yyextra->literallen == 0)
732 0 : yyerror("zero-length delimited identifier");
733 2077 : ident = litbufdup(yyscanner);
734 2077 : if (yyextra->literallen >= NAMEDATALEN)
735 0 : truncate_identifier(ident, yyextra->literallen, true);
736 2077 : yylval->str = ident; /* unlike the bare identifier rule, no downcasing call is made here */
737 2077 : return IDENT;
738 : }
739 : <xui>{dquote} {
740 4 : yyless(1); /* the match is a single character, so nothing is actually put back */
741 : /* xuiend state looks for possible UESCAPE */
742 4 : BEGIN(xuiend);
743 : }
744 4 : <xuiend>{whitespace} {
745 : /* stay in xuiend state over whitespace */
746 : }
747 3 : <xuiend><<EOF>> |
748 : <xuiend>{other} |
749 : <xuiend>{xustop1} {
750 : /* no UESCAPE after the quote, throw back everything */
751 : char *ident;
752 : int identlen;
753 :
754 1 : yyless(0);
755 :
756 1 : BEGIN(INITIAL);
757 1 : if (yyextra->literallen == 0)
758 0 : yyerror("zero-length delimited identifier");
759 1 : ident = litbuf_udeescape('\\', yyscanner);
760 1 : identlen = strlen(ident); /* measured after de-escaping with the default backslash escape character */
761 1 : if (identlen >= NAMEDATALEN)
762 0 : truncate_identifier(ident, identlen, true);
763 1 : yylval->str = ident;
764 1 : return IDENT;
765 : }
766 : <xuiend>{xustop2} {
767 : /* found UESCAPE after the end quote */
768 : char *ident;
769 : int identlen;
770 :
771 3 : BEGIN(INITIAL);
772 3 : if (yyextra->literallen == 0)
773 0 : yyerror("zero-length delimited identifier");
774 3 : if (!check_uescapechar(yytext[yyleng - 2])) /* the match ends quote, char, quote, so yyleng - 2 indexes the escape character */
775 : {
776 0 : SET_YYLLOC();
777 0 : ADVANCE_YYLLOC(yyleng - 2); /* move the reported location onto the rejected escape character */
778 0 : yyerror("invalid Unicode escape character");
779 : }
780 3 : ident = litbuf_udeescape(yytext[yyleng - 2], yyscanner);
781 3 : identlen = strlen(ident);
782 3 : if (identlen >= NAMEDATALEN)
783 0 : truncate_identifier(ident, identlen, true);
784 3 : yylval->str = ident;
785 3 : return IDENT;
786 : }
787 : <xd,xui>{xddouble} {
788 3 : addlitchar('"', yyscanner);
789 : }
790 3 : <xd,xui>{xdinside} {
791 2083 : addlit(yytext, yyleng, yyscanner);
792 : }
793 2083 : <xd,xui><<EOF>> { yyerror("unterminated quoted identifier"); }
794 :
795 : {xufailed} {
796 : char *ident;
797 :
798 0 : SET_YYLLOC();
799 : /* throw back all but the initial u/U */
800 0 : yyless(1); /* the ampersand is rescanned by a later call */
801 : /* and treat it as {identifier} */
802 0 : ident = downcase_truncate_identifier(yytext, yyleng, true); /* after yyless(1) the token text is just the single letter */
803 0 : yylval->str = ident;
804 0 : return IDENT;
805 : }
806 :
807 : {typecast} {
808 7341 : SET_YYLLOC(); /* each fixed token below records its start offset, then returns its own token code */
809 7341 : return TYPECAST;
810 : }
811 :
812 : {dot_dot} {
813 32 : SET_YYLLOC();
814 32 : return DOT_DOT;
815 : }
816 :
817 : {colon_equals} {
818 441 : SET_YYLLOC();
819 441 : return COLON_EQUALS;
820 : }
821 :
822 : {equals_greater} {
823 20 : SET_YYLLOC(); /* this and the comparison tokens below also match the operator pattern; being listed first wins the equal-length tie */
824 20 : return EQUALS_GREATER;
825 : }
826 :
827 : {less_equals} {
828 253 : SET_YYLLOC();
829 253 : return LESS_EQUALS;
830 : }
831 :
832 : {greater_equals} {
833 311 : SET_YYLLOC();
834 311 : return GREATER_EQUALS;
835 : }
836 :
837 : {less_greater} {
838 : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
839 368 : SET_YYLLOC();
840 368 : return NOT_EQUALS;
841 : }
842 :
843 : {not_equals} {
844 : /* We accept both "<>" and "!=" as meaning NOT_EQUALS */
845 906 : SET_YYLLOC();
846 906 : return NOT_EQUALS;
847 : }
848 :
849 : {self} {
850 217221 : SET_YYLLOC();
851 217221 : return yytext[0]; /* the token code is the character itself */
852 : }
853 :
854 : {operator} {
855 : /*
856 : * Check for embedded slash-star or dash-dash; those
857 : * are comment starts, so operator must stop there.
858 : * Note that slash-star or dash-dash at the first
859 : * character will match a prior rule, not this one.
860 : */
861 3684 : int nchars = yyleng;
862 3684 : char *slashstar = strstr(yytext, "/*");
863 3684 : char *dashdash = strstr(yytext, "--");
864 :
865 3684 : if (slashstar && dashdash)
866 : {
867 : /* if both appear, take the first one */
868 0 : if (slashstar > dashdash)
869 0 : slashstar = dashdash;
870 : }
871 3684 : else if (!slashstar)
872 3684 : slashstar = dashdash; /* from here on slashstar means the earliest comment start of either kind, or NULL */
873 3684 : if (slashstar)
874 0 : nchars = slashstar - yytext; /* the operator ends just before the comment start */
875 :
876 : /*
877 : * For SQL compatibility, '+' and '-' cannot be the
878 : * last char of a multi-char operator unless the operator
879 : * contains chars that are not in SQL operators.
880 : * The idea is to lex '=-' as two operators, but not
881 : * to forbid operator names like '?-' that could not be
882 : * sequences of SQL operators.
883 : */
884 10736 : while (nchars > 1 &&
885 6436 : (yytext[nchars - 1] == '+' ||
886 3218 : yytext[nchars - 1] == '-'))
887 : {
888 : int ic;
889 :
890 321 : for (ic = nchars - 2; ic >= 0; ic--) /* scan the characters before the trailing sign */
891 : {
892 171 : if (strchr("~!@#^&|`?%", yytext[ic]))
893 21 : break;
894 : }
895 171 : if (ic >= 0)
896 21 : break; /* found a char that makes it OK */
897 150 : nchars--; /* else remove the +/-, and check again */
898 : }
899 :
900 3684 : SET_YYLLOC();
901 :
902 3684 : if (nchars < yyleng)
903 : {
904 : /* Strip the unwanted chars from the token */
905 150 : yyless(nchars);
906 : /*
907 : * If what we have left is only one char, and it's
908 : * one of the characters matching "self", then
909 : * return it as a character token the same way
910 : * that the "self" rule would have.
911 : */
912 300 : if (nchars == 1 &&
913 150 : strchr(",()[].;:+-*/%^<>=", yytext[0]))
914 150 : return yytext[0];
915 : }
916 :
917 : /*
918 : * Complain if operator is too long. Unlike the case
919 : * for identifiers, we make this an error not a notice-
920 : * and-truncate, because the odds are we are looking at
921 : * a syntactic mistake anyway.
922 : */
923 3534 : if (nchars >= NAMEDATALEN)
924 0 : yyerror("operator too long");
925 :
926 3534 : yylval->str = pstrdup(yytext); /* after any yyless() above, the token text holds only the retained characters */
927 3534 : return Op;
928 : }
929 :
{param} {
    /*
     * Positional parameter reference: a dollar sign followed by one
     * or more decimal digits (see the "param" definition).
     *
     * The number is accumulated digit by digit so that a value too
     * large for an int is rejected.  The previous atol() call had no
     * way to report that, so an oversized number could be silently
     * narrowed into a different parameter number.
     *
     * NOTE(review): assumes yylval->ival is an int, and that
     * yyerror() does not return, as other rules in this file
     * already rely on -- confirm both.
     */
    const char *digitp;
    long long   paramno = 0;

    SET_YYLLOC();
    for (digitp = yytext + 1; *digitp != '\0'; digitp++)
    {
        paramno = paramno * 10 + (*digitp - '0');
        /* Checked after every digit, so paramno itself cannot overflow. */
        if (paramno != (long long) ((int) paramno))
            yyerror("parameter number too large");
    }
    yylval->ival = (int) paramno;
    return PARAM;
}
935 :
936 : {integer} {
937 18942 : SET_YYLLOC();
938 18942 : return process_integer_literal(yytext, yylval); /* the helper fills yylval and chooses the token code */
939 : }
940 : {decimal} {
941 734 : SET_YYLLOC();
942 734 : yylval->str = pstrdup(yytext); /* the digits are passed on as text; no numeric conversion happens here */
943 734 : return FCONST;
944 : }
945 : {decimalfail} {
946 : /* throw back the .., and treat as integer */
947 5 : yyless(yyleng - 2);
948 5 : SET_YYLLOC();
949 5 : return process_integer_literal(yytext, yylval);
950 : }
951 : {real} {
952 24 : SET_YYLLOC();
953 24 : yylval->str = pstrdup(yytext); /* exponent form is also passed on as text */
954 24 : return FCONST;
955 : }
956 : {realfail1} {
957 : /*
958 : * throw back the [Ee], and treat as {decimal}. Note
959 : * that it is possible the input is actually {integer},
960 : * but since this case will almost certainly lead to a
961 : * syntax error anyway, we don't bother to distinguish.
962 : */
963 0 : yyless(yyleng - 1);
964 0 : SET_YYLLOC();
965 0 : yylval->str = pstrdup(yytext);
966 0 : return FCONST;
967 : }
968 : {realfail2} {
969 : /* throw back the [Ee][+-], and proceed as above */
970 0 : yyless(yyleng - 2);
971 0 : SET_YYLLOC();
972 0 : yylval->str = pstrdup(yytext);
973 0 : return FCONST;
974 : }
975 :
976 :
{identifier}	{
					const ScanKeyword *kw;

					SET_YYLLOC();

					/* Look the token up in the keyword table first. */
					kw = ScanKeywordLookup(yytext,
										   yyextra->keywords,
										   yyextra->num_keywords);
					if (kw == NULL)
					{
						/*
						 * Not a keyword: fold the identifier to lower
						 * case, truncating it if necessary.
						 */
						yylval->str = downcase_truncate_identifier(yytext,
																   yyleng,
																   true);
						return IDENT;
					}

					/* Keyword: hand back its name and its token code. */
					yylval->keyword = kw->name;
					return kw->value;
				}
1001 :
{other}			{
					/* the token code is simply the first matched character */
					SET_YYLLOC();
					return yytext[0];
				}

<<EOF>>			{
					/* record the location, then end the scan */
					SET_YYLLOC();
					yyterminate();
				}
1011 :
1012 0 : %%
1013 0 :
/*
 * The subroutines of yylex() defined below need access to yyextra too.
 * Each of them is expected to take a parameter named yyscanner.  Instead of
 * going through the yyget_xxx functions (which the compiler might or might
 * not inline), we cheat a little: cast yyscanner to the scanner's struct
 * type and read the field directly.
 */
#undef yyextra
#define yyextra  (((struct yyguts_t *) yyscanner)->yyextra_r)

/* The same trick gives access to the token location and token length. */
#undef yylloc
#define yylloc	(((struct yyguts_t *) yyscanner)->yylloc_r)
#undef yyleng
#define yyleng	(((struct yyguts_t *) yyscanner)->yyleng_r)
1028 :
1029 :
1030 : /*
1031 : * scanner_errposition
1032 : * Report a lexer or grammar error cursor position, if possible.
1033 : *
1034 : * This is expected to be used within an ereport() call. The return value
1035 : * is a dummy (always 0, in fact).
1036 : *
1037 : * Note that this can only be used for messages emitted during raw parsing
1038 : * (essentially, scan.l and gram.y), since it requires the yyscanner struct
1039 : * to still be available.
1040 : */
1041 : int
1042 143 : scanner_errposition(int location, core_yyscan_t yyscanner)
1043 : {
1044 : int pos;
1045 :
1046 143 : if (location < 0)
1047 0 : return 0; /* no-op if location is unknown */
1048 :
1049 : /* Convert byte offset to character number */
1050 143 : pos = pg_mbstrlen_with_len(yyextra->scanbuf, location) + 1;
1051 : /* And pass it to the ereport mechanism */
1052 143 : return errposition(pos);
1053 : }
1054 :
1055 : /*
1056 : * scanner_yyerror
1057 : * Report a lexer or grammar error.
1058 : *
1059 : * The message's cursor position is whatever YYLLOC was last set to,
1060 : * ie, the start of the current token if called within yylex(), or the
1061 : * most recently lexed token if called from the grammar.
1062 : * This is OK for syntax error messages from the Bison parser, because Bison
1063 : * parsers report error as soon as the first unparsable token is reached.
1064 : * Beware of using yyerror for other purposes, as the cursor position might
1065 : * be misleading!
1066 : */
1067 : void
1068 103 : scanner_yyerror(const char *message, core_yyscan_t yyscanner)
1069 : {
1070 103 : const char *loc = yyextra->scanbuf + *yylloc;
1071 :
1072 103 : if (*loc == YY_END_OF_BUFFER_CHAR)
1073 : {
1074 1 : ereport(ERROR,
1075 : (errcode(ERRCODE_SYNTAX_ERROR),
1076 : /* translator: %s is typically the translation of "syntax error" */
1077 : errmsg("%s at end of input", _(message)),
1078 : lexer_errposition()));
1079 : }
1080 : else
1081 : {
1082 102 : ereport(ERROR,
1083 : (errcode(ERRCODE_SYNTAX_ERROR),
1084 : /* translator: first %s is typically the translation of "syntax error" */
1085 : errmsg("%s at or near \"%s\"", _(message), loc),
1086 : lexer_errposition()));
1087 : }
1088 : }
1089 :
1090 :
1091 : /*
1092 : * Called before any actual parsing is done
1093 : */
1094 : core_yyscan_t
1095 38376 : scanner_init(const char *str,
1096 : core_yy_extra_type *yyext,
1097 : const ScanKeyword *keywords,
1098 : int num_keywords)
1099 : {
1100 38376 : Size slen = strlen(str);
1101 : yyscan_t scanner;
1102 :
1103 38376 : if (yylex_init(&scanner) != 0)
1104 0 : elog(ERROR, "yylex_init() failed: %m");
1105 :
1106 38376 : core_yyset_extra(yyext, scanner);
1107 :
1108 38376 : yyext->keywords = keywords;
1109 38376 : yyext->num_keywords = num_keywords;
1110 :
1111 38376 : yyext->backslash_quote = backslash_quote;
1112 38376 : yyext->escape_string_warning = escape_string_warning;
1113 38376 : yyext->standard_conforming_strings = standard_conforming_strings;
1114 :
1115 : /*
1116 : * Make a scan buffer with special termination needed by flex.
1117 : */
1118 38376 : yyext->scanbuf = (char *) palloc(slen + 2);
1119 38376 : yyext->scanbuflen = slen;
1120 38376 : memcpy(yyext->scanbuf, str, slen);
1121 38376 : yyext->scanbuf[slen] = yyext->scanbuf[slen + 1] = YY_END_OF_BUFFER_CHAR;
1122 38376 : yy_scan_buffer(yyext->scanbuf, slen + 2, scanner);
1123 :
1124 : /* initialize literal buffer to a reasonable but expansible size */
1125 38376 : yyext->literalalloc = 1024;
1126 38376 : yyext->literalbuf = (char *) palloc(yyext->literalalloc);
1127 38376 : yyext->literallen = 0;
1128 :
1129 38376 : return scanner;
1130 : }
1131 :
1132 :
1133 : /*
1134 : * Called after parsing is done to clean up after scanner_init()
1135 : */
1136 : void
1137 38220 : scanner_finish(core_yyscan_t yyscanner)
1138 : {
1139 : /*
1140 : * We don't bother to call yylex_destroy(), because all it would do is
1141 : * pfree a small amount of control storage. It's cheaper to leak the
1142 : * storage until the parsing context is destroyed. The amount of space
1143 : * involved is usually negligible compared to the output parse tree
1144 : * anyway.
1145 : *
1146 : * We do bother to pfree the scanbuf and literal buffer, but only if they
1147 : * represent a nontrivial amount of space. The 8K cutoff is arbitrary.
1148 : */
1149 38220 : if (yyextra->scanbuflen >= 8192)
1150 0 : pfree(yyextra->scanbuf);
1151 38220 : if (yyextra->literalalloc >= 8192)
1152 0 : pfree(yyextra->literalbuf);
1153 38220 : }
1154 :
1155 :
1156 : static void
1157 28293 : addlit(char *ytext, int yleng, core_yyscan_t yyscanner)
1158 : {
1159 : /* enlarge buffer if needed */
1160 28293 : if ((yyextra->literallen + yleng) >= yyextra->literalalloc)
1161 : {
1162 : do
1163 : {
1164 12 : yyextra->literalalloc *= 2;
1165 12 : } while ((yyextra->literallen + yleng) >= yyextra->literalalloc);
1166 24 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1167 12 : yyextra->literalalloc);
1168 : }
1169 : /* append new data */
1170 28293 : memcpy(yyextra->literalbuf + yyextra->literallen, ytext, yleng);
1171 28293 : yyextra->literallen += yleng;
1172 28293 : }
1173 :
1174 :
1175 : static void
1176 1056 : addlitchar(unsigned char ychar, core_yyscan_t yyscanner)
1177 : {
1178 : /* enlarge buffer if needed */
1179 1056 : if ((yyextra->literallen + 1) >= yyextra->literalalloc)
1180 : {
1181 0 : yyextra->literalalloc *= 2;
1182 0 : yyextra->literalbuf = (char *) repalloc(yyextra->literalbuf,
1183 0 : yyextra->literalalloc);
1184 : }
1185 : /* append new data */
1186 1056 : yyextra->literalbuf[yyextra->literallen] = ychar;
1187 1056 : yyextra->literallen += 1;
1188 1056 : }
1189 :
1190 :
1191 : /*
1192 : * Create a palloc'd copy of literalbuf, adding a trailing null.
1193 : */
1194 : static char *
1195 29063 : litbufdup(core_yyscan_t yyscanner)
1196 : {
1197 29063 : int llen = yyextra->literallen;
1198 : char *new;
1199 :
1200 29063 : new = palloc(llen + 1);
1201 29063 : memcpy(new, yyextra->literalbuf, llen);
1202 29063 : new[llen] = '\0';
1203 29063 : return new;
1204 : }
1205 :
1206 : static int
1207 18947 : process_integer_literal(const char *token, YYSTYPE *lval)
1208 : {
1209 : long val;
1210 : char *endptr;
1211 :
1212 18947 : errno = 0;
1213 18947 : val = strtol(token, &endptr, 10);
1214 18947 : if (*endptr != '\0' || errno == ERANGE
1215 : #ifdef HAVE_LONG_INT_64
1216 : /* if long > 32 bits, check for overflow of int4 */
1217 : || val != (long) ((int32) val)
1218 : #endif
1219 : )
1220 : {
1221 : /* integer too large, treat it as a float */
1222 84 : lval->str = pstrdup(token);
1223 84 : return FCONST;
1224 : }
1225 18863 : lval->ival = val;
1226 18863 : return ICONST;
1227 : }
1228 :
1229 : static unsigned int
1230 28 : hexval(unsigned char c)
1231 : {
1232 28 : if (c >= '0' && c <= '9')
1233 28 : return c - '0';
1234 0 : if (c >= 'a' && c <= 'f')
1235 0 : return c - 'a' + 0xA;
1236 0 : if (c >= 'A' && c <= 'F')
1237 0 : return c - 'A' + 0xA;
1238 0 : elog(ERROR, "invalid hexadecimal digit");
1239 : return 0; /* not reached */
1240 : }
1241 :
1242 : static void
1243 6 : check_unicode_value(pg_wchar c, char *loc, core_yyscan_t yyscanner)
1244 : {
1245 6 : if (GetDatabaseEncoding() == PG_UTF8)
1246 12 : return;
1247 :
1248 0 : if (c > 0x7F)
1249 : {
1250 0 : ADVANCE_YYLLOC(loc - yyextra->literalbuf + 3); /* 3 for U&" */
1251 0 : yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1252 : }
1253 : }
1254 :
1255 : static bool
1256 6 : is_utf16_surrogate_first(pg_wchar c)
1257 : {
1258 6 : return (c >= 0xD800 && c <= 0xDBFF);
1259 : }
1260 :
1261 : static bool
1262 6 : is_utf16_surrogate_second(pg_wchar c)
1263 : {
1264 6 : return (c >= 0xDC00 && c <= 0xDFFF);
1265 : }
1266 :
1267 : static pg_wchar
1268 0 : surrogate_pair_to_codepoint(pg_wchar first, pg_wchar second)
1269 : {
1270 0 : return ((first & 0x3FF) << 10) + 0x10000 + (second & 0x3FF);
1271 : }
1272 :
1273 : static void
1274 0 : addunicode(pg_wchar c, core_yyscan_t yyscanner)
1275 : {
1276 : char buf[8];
1277 :
1278 0 : if (c == 0 || c > 0x10FFFF)
1279 0 : yyerror("invalid Unicode escape value");
1280 0 : if (c > 0x7F)
1281 : {
1282 0 : if (GetDatabaseEncoding() != PG_UTF8)
1283 0 : yyerror("Unicode escape values cannot be used for code point values above 007F when the server encoding is not UTF8");
1284 0 : yyextra->saw_non_ascii = true;
1285 : }
1286 0 : unicode_to_utf8(c, (unsigned char *) buf);
1287 0 : addlit(buf, pg_mblen(buf), yyscanner);
1288 0 : }
1289 :
1290 : /* is 'escape' acceptable as Unicode escape character (UESCAPE syntax) ? */
1291 : static bool
1292 6 : check_uescapechar(unsigned char escape)
1293 : {
1294 6 : if (isxdigit(escape)
1295 6 : || escape == '+'
1296 5 : || escape == '\''
1297 5 : || escape == '"'
1298 5 : || scanner_isspace(escape))
1299 : {
1300 1 : return false;
1301 : }
1302 : else
1303 5 : return true;
1304 : }
1305 :
1306 : /* like litbufdup, but handle unicode escapes */
1307 : static char *
1308 9 : litbuf_udeescape(unsigned char escape, core_yyscan_t yyscanner)
1309 : {
1310 : char *new;
1311 : char *litbuf,
1312 : *in,
1313 : *out;
1314 9 : pg_wchar pair_first = 0;
1315 :
1316 : /* Make literalbuf null-terminated to simplify the scanning loop */
1317 9 : litbuf = yyextra->literalbuf;
1318 9 : litbuf[yyextra->literallen] = '\0';
1319 :
1320 : /*
1321 : * This relies on the subtle assumption that a UTF-8 expansion cannot be
1322 : * longer than its escaped representation.
1323 : */
1324 9 : new = palloc(yyextra->literallen + 1);
1325 :
1326 9 : in = litbuf;
1327 9 : out = new;
1328 66 : while (*in)
1329 : {
1330 50 : if (in[0] == escape)
1331 : {
1332 8 : if (in[1] == escape)
1333 : {
1334 0 : if (pair_first)
1335 : {
1336 0 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1337 0 : yyerror("invalid Unicode surrogate pair");
1338 : }
1339 0 : *out++ = escape;
1340 0 : in += 2;
1341 : }
1342 13 : else if (isxdigit((unsigned char) in[1]) &&
1343 10 : isxdigit((unsigned char) in[2]) &&
1344 10 : isxdigit((unsigned char) in[3]) &&
1345 5 : isxdigit((unsigned char) in[4]))
1346 4 : {
1347 : pg_wchar unicode;
1348 :
1349 12 : unicode = (hexval(in[1]) << 12) +
1350 8 : (hexval(in[2]) << 8) +
1351 4 : (hexval(in[3]) << 4) +
1352 4 : hexval(in[4]);
1353 4 : check_unicode_value(unicode, in, yyscanner);
1354 4 : if (pair_first)
1355 : {
1356 0 : if (is_utf16_surrogate_second(unicode))
1357 : {
1358 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1359 0 : pair_first = 0;
1360 : }
1361 : else
1362 : {
1363 0 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1364 0 : yyerror("invalid Unicode surrogate pair");
1365 : }
1366 : }
1367 4 : else if (is_utf16_surrogate_second(unicode))
1368 0 : yyerror("invalid Unicode surrogate pair");
1369 :
1370 4 : if (is_utf16_surrogate_first(unicode))
1371 0 : pair_first = unicode;
1372 : else
1373 : {
1374 4 : unicode_to_utf8(unicode, (unsigned char *) out);
1375 4 : out += pg_mblen(out);
1376 : }
1377 4 : in += 5;
1378 : }
1379 7 : else if (in[1] == '+' &&
1380 6 : isxdigit((unsigned char) in[2]) &&
1381 6 : isxdigit((unsigned char) in[3]) &&
1382 6 : isxdigit((unsigned char) in[4]) &&
1383 6 : isxdigit((unsigned char) in[5]) &&
1384 5 : isxdigit((unsigned char) in[6]) &&
1385 2 : isxdigit((unsigned char) in[7]))
1386 2 : {
1387 : pg_wchar unicode;
1388 :
1389 6 : unicode = (hexval(in[2]) << 20) +
1390 4 : (hexval(in[3]) << 16) +
1391 4 : (hexval(in[4]) << 12) +
1392 4 : (hexval(in[5]) << 8) +
1393 2 : (hexval(in[6]) << 4) +
1394 2 : hexval(in[7]);
1395 2 : check_unicode_value(unicode, in, yyscanner);
1396 2 : if (pair_first)
1397 : {
1398 0 : if (is_utf16_surrogate_second(unicode))
1399 : {
1400 0 : unicode = surrogate_pair_to_codepoint(pair_first, unicode);
1401 0 : pair_first = 0;
1402 : }
1403 : else
1404 : {
1405 0 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1406 0 : yyerror("invalid Unicode surrogate pair");
1407 : }
1408 : }
1409 2 : else if (is_utf16_surrogate_second(unicode))
1410 0 : yyerror("invalid Unicode surrogate pair");
1411 :
1412 2 : if (is_utf16_surrogate_first(unicode))
1413 0 : pair_first = unicode;
1414 : else
1415 : {
1416 2 : unicode_to_utf8(unicode, (unsigned char *) out);
1417 2 : out += pg_mblen(out);
1418 : }
1419 2 : in += 8;
1420 : }
1421 : else
1422 : {
1423 2 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1424 2 : yyerror("invalid Unicode escape value");
1425 : }
1426 : }
1427 : else
1428 : {
1429 42 : if (pair_first)
1430 : {
1431 0 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1432 0 : yyerror("invalid Unicode surrogate pair");
1433 : }
1434 42 : *out++ = *in++;
1435 : }
1436 : }
1437 :
1438 : /* unfinished surrogate pair? */
1439 7 : if (pair_first)
1440 : {
1441 0 : ADVANCE_YYLLOC(in - litbuf + 3); /* 3 for U&" */
1442 0 : yyerror("invalid Unicode surrogate pair");
1443 : }
1444 :
1445 7 : *out = '\0';
1446 :
1447 : /*
1448 : * We could skip pg_verifymbstr if we didn't process any non-7-bit-ASCII
1449 : * codes; but it's probably not worth the trouble, since this isn't likely
1450 : * to be a performance-critical path.
1451 : */
1452 7 : pg_verifymbstr(new, out - new, false);
1453 7 : return new;
1454 : }
1455 :
1456 : static unsigned char
1457 224 : unescape_single_char(unsigned char c, core_yyscan_t yyscanner)
1458 : {
1459 224 : switch (c)
1460 : {
1461 : case 'b':
1462 0 : return '\b';
1463 : case 'f':
1464 0 : return '\f';
1465 : case 'n':
1466 95 : return '\n';
1467 : case 'r':
1468 4 : return '\r';
1469 : case 't':
1470 1 : return '\t';
1471 : default:
1472 : /* check for backslash followed by non-7-bit-ASCII */
1473 124 : if (c == '\0' || IS_HIGHBIT_SET(c))
1474 0 : yyextra->saw_non_ascii = true;
1475 :
1476 124 : return c;
1477 : }
1478 : }
1479 :
1480 : static void
1481 224 : check_string_escape_warning(unsigned char ychar, core_yyscan_t yyscanner)
1482 : {
1483 224 : if (ychar == '\'')
1484 : {
1485 6 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1486 0 : ereport(WARNING,
1487 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1488 : errmsg("nonstandard use of \\' in a string literal"),
1489 : errhint("Use '' to write quotes in strings, or use the escape string syntax (E'...')."),
1490 : lexer_errposition()));
1491 6 : yyextra->warn_on_first_escape = false; /* warn only once per string */
1492 : }
1493 218 : else if (ychar == '\\')
1494 : {
1495 118 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1496 10 : ereport(WARNING,
1497 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1498 : errmsg("nonstandard use of \\\\ in a string literal"),
1499 : errhint("Use the escape string syntax for backslashes, e.g., E'\\\\'."),
1500 : lexer_errposition()));
1501 118 : yyextra->warn_on_first_escape = false; /* warn only once per string */
1502 : }
1503 : else
1504 100 : check_escape_warning(yyscanner);
1505 224 : }
1506 :
1507 : static void
1508 114 : check_escape_warning(core_yyscan_t yyscanner)
1509 : {
1510 114 : if (yyextra->warn_on_first_escape && yyextra->escape_string_warning)
1511 0 : ereport(WARNING,
1512 : (errcode(ERRCODE_NONSTANDARD_USE_OF_ESCAPE_CHARACTER),
1513 : errmsg("nonstandard use of escape in a string literal"),
1514 : errhint("Use the escape string syntax for escapes, e.g., E'\\r\\n'."),
1515 : lexer_errposition()));
1516 114 : yyextra->warn_on_first_escape = false; /* warn only once per string */
1517 114 : }
1518 :
1519 : /*
1520 : * Interface functions to make flex use palloc() instead of malloc().
1521 : * It'd be better to make these static, but flex insists otherwise.
1522 : */
1523 :
1524 : void *
1525 115128 : core_yyalloc(yy_size_t bytes, core_yyscan_t yyscanner)
1526 : {
1527 115128 : return palloc(bytes);
1528 : }
1529 :
1530 : void *
1531 0 : core_yyrealloc(void *ptr, yy_size_t bytes, core_yyscan_t yyscanner)
1532 : {
1533 0 : if (ptr)
1534 0 : return repalloc(ptr, bytes);
1535 : else
1536 0 : return palloc(bytes);
1537 : }
1538 :
1539 : void
1540 0 : core_yyfree(void *ptr, core_yyscan_t yyscanner)
1541 : {
1542 0 : if (ptr)
1543 0 : pfree(ptr);
1544 0 : }
|