Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * scansup.c
4 : * support routines for the lex/flex scanner, used by both the normal
5 : * backend as well as the bootstrap backend
6 : *
7 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 : * Portions Copyright (c) 1994, Regents of the University of California
9 : *
10 : *
11 : * IDENTIFICATION
12 : * src/backend/parser/scansup.c
13 : *
14 : *-------------------------------------------------------------------------
15 : */
16 : #include "postgres.h"
17 :
18 : #include <ctype.h>
19 :
20 : #include "parser/scansup.h"
21 : #include "mb/pg_wchar.h"
22 :
23 :
/* ----------------
 *		scanstr
 *
 * Build a de-escaped copy of the input string:
 *	- a single quote is dropped and the character following it is kept
 *	  (the scanner only lets quotes through in pairs, so '' becomes ');
 *	- backslash followed by b, f, n, r or t becomes the matching control
 *	  character;
 *	- backslash followed by one to three octal digits becomes the byte
 *	  with that value (converted via a cast to char);
 *	- backslash followed by anything else yields that character as-is.
 *
 * A NULL or empty input yields a freshly allocated empty string.
 *
 * The result is palloc'd; the caller is expected to pfree it eventually.
 * ----------------
 */

char *
scanstr(const char *s)
{
	char	   *out;
	int			srclen;
	int			rd;
	int			wr = 0;

	if (s == NULL || *s == '\0')
		return pstrdup("");

	srclen = strlen(s);

	/* De-escaping never lengthens the string, so this is always enough. */
	out = palloc(srclen + 1);

	for (rd = 0; rd < srclen; rd++)
	{
		char		c = s[rd];

		if (c == '\'')
		{
			/*
			 * With a correctly working scanner, unescaped quotes arrive only
			 * in pairs, so a second quote must follow.  The bootstrap parser
			 * is less careful, hence the assertion.
			 */
			rd++;
			Assert(s[rd] == '\'');
			c = s[rd];
		}
		else if (c == '\\')
		{
			/* Step onto the character that selects the escape. */
			c = s[++rd];

			if (c >= '0' && c <= '7')
			{
				/* Octal escape: consume at most three octal digits. */
				long		value = 0;
				int			ndigits = 0;

				while (s[rd + ndigits] >= '0' &&
					   s[rd + ndigits] <= '7' &&
					   ndigits < 3)
				{
					value = (value << 3) + (s[rd + ndigits] - '0');
					ndigits++;
				}
				/* Leave rd on the last digit; the loop header advances it. */
				rd += ndigits - 1;
				c = (char) value;
			}
			else
			{
				switch (c)
				{
					case 'b':
						c = '\b';
						break;
					case 'f':
						c = '\f';
						break;
					case 'n':
						c = '\n';
						break;
					case 'r':
						c = '\r';
						break;
					case 't':
						c = '\t';
						break;
					default:
						/* unrecognized escape: keep the character itself */
						break;
				}
			}
		}

		out[wr++] = c;
	}

	out[wr] = '\0';
	return out;
}
115 :
116 :
/*
 * downcase_truncate_identifier() --- downcase an unquoted identifier and
 * truncate it if it is too long, optionally warning about the truncation.
 *
 * This is downcase_identifier() with truncation always enabled.
 *
 * Returns a palloc'd string containing the adjusted identifier.
 *
 * Note: in some usages the passed string is not null-terminated, which is
 * why the length is passed explicitly.
 *
 * Note: the API of this function is designed to allow for downcasing
 * transformations that increase the string length, but we don't yet
 * support that.  If you want to implement it, you'll need to fix
 * SplitIdentifierString() in utils/adt/varlena.c.
 */
char *
downcase_truncate_identifier(const char *ident, int len, bool warn)
{
	const bool	do_truncate = true;

	return downcase_identifier(ident, len, warn, do_truncate);
}
135 :
136 : /*
137 : * a workhorse for downcase_truncate_identifier
138 : */
139 : char *
140 189041 : downcase_identifier(const char *ident, int len, bool warn, bool truncate)
141 : {
142 : char *result;
143 : int i;
144 : bool enc_is_single_byte;
145 :
146 189041 : result = palloc(len + 1);
147 189041 : enc_is_single_byte = pg_database_encoding_max_length() == 1;
148 :
149 : /*
150 : * SQL99 specifies Unicode-aware case normalization, which we don't yet
151 : * have the infrastructure for. Instead we use tolower() to provide a
152 : * locale-aware translation. However, there are some locales where this
153 : * is not right either (eg, Turkish may do strange things with 'i' and
154 : * 'I'). Our current compromise is to use tolower() for characters with
155 : * the high bit set, as long as they aren't part of a multi-byte
156 : * character, and use an ASCII-only downcasing for 7-bit characters.
157 : */
158 1457173 : for (i = 0; i < len; i++)
159 : {
160 1268132 : unsigned char ch = (unsigned char) ident[i];
161 :
162 1268132 : if (ch >= 'A' && ch <= 'Z')
163 43784 : ch += 'a' - 'A';
164 1224348 : else if (enc_is_single_byte && IS_HIGHBIT_SET(ch) && isupper(ch))
165 0 : ch = tolower(ch);
166 1268132 : result[i] = (char) ch;
167 : }
168 189041 : result[i] = '\0';
169 :
170 189041 : if (i >= NAMEDATALEN && truncate)
171 2 : truncate_identifier(result, i, warn);
172 :
173 189041 : return result;
174 : }
175 :
176 :
177 : /*
178 : * truncate_identifier() --- truncate an identifier to NAMEDATALEN-1 bytes.
179 : *
180 : * The given string is modified in-place, if necessary. A warning is
181 : * issued if requested.
182 : *
183 : * We require the caller to pass in the string length since this saves a
184 : * strlen() call in some common usages.
185 : */
186 : void
187 6646 : truncate_identifier(char *ident, int len, bool warn)
188 : {
189 6646 : if (len >= NAMEDATALEN)
190 : {
191 2 : len = pg_mbcliplen(ident, len, NAMEDATALEN - 1);
192 2 : if (warn)
193 : {
194 : /*
195 : * We avoid using %.*s here because it can misbehave if the data
196 : * is not valid in what libc thinks is the prevailing encoding.
197 : */
198 : char buf[NAMEDATALEN];
199 :
200 2 : memcpy(buf, ident, len);
201 2 : buf[len] = '\0';
202 2 : ereport(NOTICE,
203 : (errcode(ERRCODE_NAME_TOO_LONG),
204 : errmsg("identifier \"%s\" will be truncated to \"%s\"",
205 : ident, buf)));
206 : }
207 2 : ident[len] = '\0';
208 : }
209 6646 : }
210 :
/*
 * scanner_isspace() --- return true if flex scanner considers char whitespace
 *
 * Use this in place of the potentially locale-dependent isspace() whenever
 * the result has to agree with the lexer's behavior.
 *
 * In principle similar functions might be needed for isalnum etc, but for
 * the moment only isspace seems needed.
 */
bool
scanner_isspace(char ch)
{
	/* This must match scan.l's list of {space} characters */
	switch (ch)
	{
		case ' ':
		case '\t':
		case '\n':
		case '\r':
		case '\f':
			return true;
		default:
			return false;
	}
}
|