Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * ts_locale.c
4 : * locale compatibility layer for tsearch
5 : *
6 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
7 : *
8 : *
9 : * IDENTIFICATION
10 : * src/backend/tsearch/ts_locale.c
11 : *
12 : *-------------------------------------------------------------------------
13 : */
14 : #include "postgres.h"
15 :
16 : #include "catalog/pg_collation.h"
17 : #include "storage/fd.h"
18 : #include "tsearch/ts_locale.h"
19 : #include "tsearch/ts_public.h"
20 :
21 : static void tsearch_readline_callback(void *arg);
22 :
23 :
24 : #ifdef USE_WIDE_UPPER_LOWER
25 :
26 : int
27 643 : t_isdigit(const char *ptr)
28 : {
29 643 : int clen = pg_mblen(ptr);
30 : wchar_t character[2];
31 643 : Oid collation = DEFAULT_COLLATION_OID; /* TODO */
32 643 : pg_locale_t mylocale = 0; /* TODO */
33 :
34 643 : if (clen == 1 || lc_ctype_is_c(collation))
35 643 : return isdigit(TOUCHAR(ptr));
36 :
37 0 : char2wchar(character, 2, ptr, clen, mylocale);
38 :
39 0 : return iswdigit((wint_t) character[0]);
40 : }
41 :
42 : int
43 135177 : t_isspace(const char *ptr)
44 : {
45 135177 : int clen = pg_mblen(ptr);
46 : wchar_t character[2];
47 135177 : Oid collation = DEFAULT_COLLATION_OID; /* TODO */
48 135177 : pg_locale_t mylocale = 0; /* TODO */
49 :
50 135177 : if (clen == 1 || lc_ctype_is_c(collation))
51 135177 : return isspace(TOUCHAR(ptr));
52 :
53 0 : char2wchar(character, 2, ptr, clen, mylocale);
54 :
55 0 : return iswspace((wint_t) character[0]);
56 : }
57 :
58 : int
59 984 : t_isalpha(const char *ptr)
60 : {
61 984 : int clen = pg_mblen(ptr);
62 : wchar_t character[2];
63 984 : Oid collation = DEFAULT_COLLATION_OID; /* TODO */
64 984 : pg_locale_t mylocale = 0; /* TODO */
65 :
66 984 : if (clen == 1 || lc_ctype_is_c(collation))
67 984 : return isalpha(TOUCHAR(ptr));
68 :
69 0 : char2wchar(character, 2, ptr, clen, mylocale);
70 :
71 0 : return iswalpha((wint_t) character[0]);
72 : }
73 :
74 : int
75 357 : t_isprint(const char *ptr)
76 : {
77 357 : int clen = pg_mblen(ptr);
78 : wchar_t character[2];
79 357 : Oid collation = DEFAULT_COLLATION_OID; /* TODO */
80 357 : pg_locale_t mylocale = 0; /* TODO */
81 :
82 357 : if (clen == 1 || lc_ctype_is_c(collation))
83 357 : return isprint(TOUCHAR(ptr));
84 :
85 0 : char2wchar(character, 2, ptr, clen, mylocale);
86 :
87 0 : return iswprint((wint_t) character[0]);
88 : }
89 : #endif /* USE_WIDE_UPPER_LOWER */
90 :
91 :
92 : /*
93 : * Set up to read a file using tsearch_readline(). This facility is
94 : * better than just reading the file directly because it provides error
95 : * context pointing to the specific line where a problem is detected.
96 : *
97 : * Expected usage is:
98 : *
99 : * tsearch_readline_state trst;
100 : *
101 : * if (!tsearch_readline_begin(&trst, filename))
102 : * ereport(ERROR,
103 : * (errcode(ERRCODE_CONFIG_FILE_ERROR),
104 : * errmsg("could not open stop-word file \"%s\": %m",
105 : * filename)));
106 : * while ((line = tsearch_readline(&trst)) != NULL)
107 : * process line;
108 : * tsearch_readline_end(&trst);
109 : *
110 : * Note that the caller supplies the ereport() for file open failure;
111 : * this is so that a custom message can be provided. The filename string
112 : * passed to tsearch_readline_begin() must remain valid through
113 : * tsearch_readline_end().
114 : */
115 : bool
116 55 : tsearch_readline_begin(tsearch_readline_state *stp,
117 : const char *filename)
118 : {
119 55 : if ((stp->fp = AllocateFile(filename, "r")) == NULL)
120 0 : return false;
121 55 : stp->filename = filename;
122 55 : stp->lineno = 0;
123 55 : stp->curline = NULL;
124 : /* Setup error traceback support for ereport() */
125 55 : stp->cb.callback = tsearch_readline_callback;
126 55 : stp->cb.arg = (void *) stp;
127 55 : stp->cb.previous = error_context_stack;
128 55 : error_context_stack = &stp->cb;
129 55 : return true;
130 : }
131 :
132 : /*
133 : * Read the next line from a tsearch data file (expected to be in UTF-8), and
134 : * convert it to database encoding if needed. The returned string is palloc'd.
135 : * NULL return means EOF.
136 : */
137 : char *
138 1566 : tsearch_readline(tsearch_readline_state *stp)
139 : {
140 : char *result;
141 :
142 1566 : stp->lineno++;
143 1566 : stp->curline = NULL;
144 1566 : result = t_readline(stp->fp);
145 1566 : stp->curline = result;
146 1566 : return result;
147 : }
148 :
149 : /*
150 : * Close down after reading a file with tsearch_readline()
151 : */
152 : void
153 55 : tsearch_readline_end(tsearch_readline_state *stp)
154 : {
155 55 : FreeFile(stp->fp);
156 : /* Pop the error context stack */
157 55 : error_context_stack = stp->cb.previous;
158 55 : }
159 :
160 : /*
161 : * Error context callback for errors occurring while reading a tsearch
162 : * configuration file.
163 : */
164 : static void
165 0 : tsearch_readline_callback(void *arg)
166 : {
167 0 : tsearch_readline_state *stp = (tsearch_readline_state *) arg;
168 :
169 : /*
170 : * We can't include the text of the config line for errors that occur
171 : * during t_readline() itself. This is only partly a consequence of our
172 : * arms-length use of that routine: the major cause of such errors is
173 : * encoding violations, and we daren't try to print error messages
174 : * containing badly-encoded data.
175 : */
176 0 : if (stp->curline)
177 0 : errcontext("line %d of configuration file \"%s\": \"%s\"",
178 : stp->lineno,
179 : stp->filename,
180 : stp->curline);
181 : else
182 0 : errcontext("line %d of configuration file \"%s\"",
183 : stp->lineno,
184 : stp->filename);
185 0 : }
186 :
187 :
188 : /*
189 : * Read the next line from a tsearch data file (expected to be in UTF-8), and
190 : * convert it to database encoding if needed. The returned string is palloc'd.
191 : * NULL return means EOF.
192 : *
193 : * Note: direct use of this function is now deprecated. Go through
194 : * tsearch_readline() to provide better error reporting.
195 : */
196 : char *
197 1566 : t_readline(FILE *fp)
198 : {
199 : int len;
200 : char *recoded;
201 : char buf[4096]; /* lines must not be longer than this */
202 :
203 1566 : if (fgets(buf, sizeof(buf), fp) == NULL)
204 46 : return NULL;
205 :
206 1520 : len = strlen(buf);
207 :
208 : /* Make sure the input is valid UTF-8 */
209 1520 : (void) pg_verify_mbstr(PG_UTF8, buf, len, false);
210 :
211 : /* And convert */
212 1520 : recoded = pg_any_to_server(buf, len, PG_UTF8);
213 1520 : if (recoded == buf)
214 : {
215 : /*
216 : * conversion didn't pstrdup, so we must. We can use the length of the
217 : * original string, because no conversion was done.
218 : */
219 1520 : recoded = pnstrdup(recoded, len);
220 : }
221 :
222 1520 : return recoded;
223 : }
224 :
/*
 * lowerstr --- produce a lower-cased copy of a null-terminated string
 *
 * Convenience wrapper that measures the string and hands it to
 * lowerstr_with_len().  The result is palloc'd.
 */
char *
lowerstr(const char *str)
{
	int			nbytes = strlen(str);

	return lowerstr_with_len(str, nbytes);
}
235 :
/*
 * lowerstr_with_len --- produce a lower-cased copy of a counted string
 *
 * str need not be null-terminated; len is its length in bytes.
 *
 * The result is a palloc'd, null-terminated string.  A zero-length input
 * yields a palloc'd empty string.
 */
char *
lowerstr_with_len(const char *str, int len)
{
	char	   *result;
	const char *src;
	char	   *dst;

#ifdef USE_WIDE_UPPER_LOWER
	Oid			collation = DEFAULT_COLLATION_OID;	/* TODO */
	pg_locale_t mylocale = 0;	/* TODO */
#endif

	if (len == 0)
		return pstrdup("");

#ifdef USE_WIDE_UPPER_LOWER

	/*
	 * Take the wide-character route only for a multibyte encoding combined
	 * with a non-C ctype.  Some operating systems misbehave with multibyte
	 * encodings under a C locale, and with a C locale there is nothing to
	 * gain from multibyte processing anyway.  (Approach borrowed from
	 * backend/utils/adt/oracle_compat.c.)
	 */
	if (pg_database_encoding_max_length() > 1 && !lc_ctype_is_c(collation))
	{
		wchar_t    *wbuf;
		wchar_t    *wp;
		int			nwchars;
		int			outsize;
		int			nwritten;

		/*
		 * Worst case is one wchar_t per input byte, since the byte count is
		 * never less than the character count.  One extra slot holds the
		 * terminating zero, which wchar2char requires.
		 */
		wbuf = (wchar_t *) palloc(sizeof(wchar_t) * (len + 1));

		nwchars = char2wchar(wbuf, len + 1, str, len, mylocale);
		Assert(nwchars <= len);

		/* Fold each wide character in place, up to the terminator */
		for (wp = wbuf; *wp != 0; wp++)
			*wp = towlower((wint_t) *wp);

		/* Size the output for the worst case, plus the trailing '\0' */
		outsize = pg_database_encoding_max_length() * nwchars + 1;
		result = (char *) palloc(outsize);

		nwritten = wchar2char(result, wbuf, outsize, mylocale);

		pfree(wbuf);

		if (nwritten < 0)
			ereport(ERROR,
					(errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
					 errmsg("conversion from wchar_t to server encoding failed: %m")));
		Assert(nwritten < outsize);

		return result;
	}
#endif							/* USE_WIDE_UPPER_LOWER */

	/*
	 * Byte-at-a-time path: fold each byte with tolower(), stopping at len
	 * bytes or at an embedded null, whichever comes first.
	 */
	result = (char *) palloc(sizeof(char) * (len + 1));
	dst = result;
	for (src = str; (src - str) < len && *src; src++)
		*dst++ = tolower(TOUCHAR(src));
	*dst = '\0';

	return result;
}
|