Line data Source code
1 : /*-------------------------------------------------------------------------
2 : *
3 : * mbutils.c
4 : * This file contains functions for encoding conversion.
5 : *
6 : * The string-conversion functions in this file share some API quirks.
7 : * Note the following:
8 : *
9 : * The functions return a palloc'd, null-terminated string if conversion
10 : * is required. However, if no conversion is performed, the given source
11 : * string pointer is returned as-is.
12 : *
13 : * Although the presence of a length argument means that callers can pass
14 : * non-null-terminated strings, care is required because the same string
15 : * will be passed back if no conversion occurs. Such callers *must* check
16 : * whether result == src and handle that case differently.
17 : *
18 : * If the source and destination encodings are the same, the source string
19 : * is returned without any verification; it's assumed to be valid data.
20 : * If that might not be the case, the caller is responsible for validating
21 : * the string using a separate call to pg_verify_mbstr(). Whenever the
22 : * source and destination encodings are different, the functions ensure that
23 : * the result is validly encoded according to the destination encoding.
24 : *
25 : *
26 : * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
27 : * Portions Copyright (c) 1994, Regents of the University of California
28 : *
29 : *
30 : * IDENTIFICATION
31 : * src/backend/utils/mb/mbutils.c
32 : *
33 : *-------------------------------------------------------------------------
34 : */
35 : #include "postgres.h"
36 :
37 : #include "access/xact.h"
38 : #include "catalog/namespace.h"
39 : #include "mb/pg_wchar.h"
40 : #include "utils/builtins.h"
41 : #include "utils/memutils.h"
42 : #include "utils/syscache.h"
43 :
44 : /*
45 : * When converting strings between different encodings, we assume that the
46 : * converted result needs at most 4-to-1 growth in the worst case. The growth
47 : * rates for currently supported encoding pairs are within 3 (SJIS JIS X0201
48 : * half-width kana -> UTF8 is the worst case), so "4" should be enough for now.
49 : *
50 : * Note that this is not the same as the maximum character width in any
51 : * particular encoding.
52 : */
53 : #define MAX_CONVERSION_GROWTH 4
54 :
55 : /*
56 : * We maintain a simple linked list caching the fmgr lookup info for the
57 : * currently selected conversion functions, as well as any that have been
58 : * selected previously in the current session. (We remember previous
59 : * settings because we must be able to restore a previous setting during
60 : * transaction rollback, without doing any fresh catalog accesses.)
61 : *
62 : * Since we'll never release this data, we just keep it in TopMemoryContext.
63 : */
64 : typedef struct ConvProcInfo
65 : {
66 : int s_encoding; /* server and client encoding IDs */
67 : int c_encoding;
68 : FmgrInfo to_server_info; /* lookup info for conversion procs */
69 : FmgrInfo to_client_info;
70 : } ConvProcInfo;
71 :
72 : static List *ConvProcList = NIL; /* List of ConvProcInfo */
73 :
74 : /*
75 : * These variables point to the currently active conversion functions,
76 : * or are NULL when no conversion is needed.
77 : */
78 : static FmgrInfo *ToServerConvProc = NULL;
79 : static FmgrInfo *ToClientConvProc = NULL;
80 :
81 : /*
82 : * These variables track the currently-selected encodings.
83 : */
84 : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
85 : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
86 : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
87 :
88 : /*
89 : * During backend startup we can't set client encoding because we (a)
90 : * can't look up the conversion functions, and (b) may not know the database
91 : * encoding yet either. So SetClientEncoding() just accepts anything and
92 : * remembers it for InitializeClientEncoding() to apply later.
93 : */
94 : static bool backend_startup_complete = false;
95 : static int pending_client_encoding = PG_SQL_ASCII;
96 :
97 :
98 : /* Internal functions */
99 : static char *perform_default_encoding_conversion(const char *src,
100 : int len, bool is_client_to_server);
101 : static int cliplen(const char *str, int len, int limit);
102 :
103 :
104 : /*
105 : * Prepare for a future call to SetClientEncoding. Success should mean
106 : * that SetClientEncoding is guaranteed to succeed for this encoding request.
107 : *
108 : * (But note that success before backend_startup_complete does not guarantee
109 : * success after ...)
110 : *
111 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
112 : */
113 : int
114 908 : PrepareClientEncoding(int encoding)
115 : {
116 : int current_server_encoding;
117 : ListCell *lc;
118 :
119 908 : if (!PG_VALID_FE_ENCODING(encoding))
120 0 : return -1;
121 :
122 : /* Can't do anything during startup, per notes above */
123 908 : if (!backend_startup_complete)
124 342 : return 0;
125 :
126 566 : current_server_encoding = GetDatabaseEncoding();
127 :
128 : /*
129 : * Check for cases that require no conversion function.
130 : */
131 566 : if (current_server_encoding == encoding ||
132 230 : current_server_encoding == PG_SQL_ASCII ||
133 : encoding == PG_SQL_ASCII)
134 566 : return 0;
135 :
136 0 : if (IsTransactionState())
137 : {
138 : /*
139 : * If we're in a live transaction, it's safe to access the catalogs,
140 : * so look up the functions. We repeat the lookup even if the info is
141 : * already cached, so that we can react to changes in the contents of
142 : * pg_conversion.
143 : */
144 : Oid to_server_proc,
145 : to_client_proc;
146 : ConvProcInfo *convinfo;
147 : MemoryContext oldcontext;
148 :
149 0 : to_server_proc = FindDefaultConversionProc(encoding,
150 : current_server_encoding);
151 0 : if (!OidIsValid(to_server_proc))
152 0 : return -1;
153 0 : to_client_proc = FindDefaultConversionProc(current_server_encoding,
154 : encoding);
155 0 : if (!OidIsValid(to_client_proc))
156 0 : return -1;
157 :
158 : /*
159 : * Load the fmgr info into TopMemoryContext (could still fail here)
160 : */
161 0 : convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
162 : sizeof(ConvProcInfo));
163 0 : convinfo->s_encoding = current_server_encoding;
164 0 : convinfo->c_encoding = encoding;
165 0 : fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
166 : TopMemoryContext);
167 0 : fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
168 : TopMemoryContext);
169 :
170 : /* Attach new info to head of list */
171 0 : oldcontext = MemoryContextSwitchTo(TopMemoryContext);
172 0 : ConvProcList = lcons(convinfo, ConvProcList);
173 0 : MemoryContextSwitchTo(oldcontext);
174 :
175 : /*
176 : * We cannot yet remove any older entry for the same encoding pair,
177 : * since it could still be in use. SetClientEncoding will clean up.
178 : */
179 :
180 0 : return 0; /* success */
181 : }
182 : else
183 : {
184 : /*
185 : * If we're not in a live transaction, the only thing we can do is
186 : * restore a previous setting using the cache. This covers all
187 : * transaction-rollback cases. The only case it might not work for is
188 : * trying to change client_encoding on the fly by editing
189 : * postgresql.conf and SIGHUP'ing. Which would probably be a stupid
190 : * thing to do anyway.
191 : */
192 0 : foreach(lc, ConvProcList)
193 : {
194 0 : ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
195 :
196 0 : if (oldinfo->s_encoding == current_server_encoding &&
197 0 : oldinfo->c_encoding == encoding)
198 0 : return 0;
199 : }
200 :
201 0 : return -1; /* it's not cached, so fail */
202 : }
203 : }
204 :
205 : /*
206 : * Set the active client encoding and set up the conversion-function pointers.
207 : * PrepareClientEncoding should have been called previously for this encoding.
208 : *
209 : * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
210 : */
211 : int
212 678 : SetClientEncoding(int encoding)
213 : {
214 : int current_server_encoding;
215 : bool found;
216 : ListCell *lc;
217 : ListCell *prev;
218 : ListCell *next;
219 :
220 678 : if (!PG_VALID_FE_ENCODING(encoding))
221 0 : return -1;
222 :
223 : /* Can't do anything during startup, per notes above */
224 678 : if (!backend_startup_complete)
225 : {
226 227 : pending_client_encoding = encoding;
227 227 : return 0;
228 : }
229 :
230 451 : current_server_encoding = GetDatabaseEncoding();
231 :
232 : /*
233 : * Check for cases that require no conversion function.
234 : */
235 451 : if (current_server_encoding == encoding ||
236 115 : current_server_encoding == PG_SQL_ASCII ||
237 : encoding == PG_SQL_ASCII)
238 : {
239 451 : ClientEncoding = &pg_enc2name_tbl[encoding];
240 451 : ToServerConvProc = NULL;
241 451 : ToClientConvProc = NULL;
242 451 : return 0;
243 : }
244 :
245 : /*
246 : * Search the cache for the entry previously prepared by
247 : * PrepareClientEncoding; if there isn't one, we lose. While at it,
248 : * release any duplicate entries so that repeated Prepare/Set cycles don't
249 : * leak memory.
250 : */
251 0 : found = false;
252 0 : prev = NULL;
253 0 : for (lc = list_head(ConvProcList); lc; lc = next)
254 : {
255 0 : ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
256 :
257 0 : next = lnext(lc);
258 :
259 0 : if (convinfo->s_encoding == current_server_encoding &&
260 0 : convinfo->c_encoding == encoding)
261 : {
262 0 : if (!found)
263 : {
264 : /* Found newest entry, so set up */
265 0 : ClientEncoding = &pg_enc2name_tbl[encoding];
266 0 : ToServerConvProc = &convinfo->to_server_info;
267 0 : ToClientConvProc = &convinfo->to_client_info;
268 0 : found = true;
269 : }
270 : else
271 : {
272 : /* Duplicate entry, release it */
273 0 : ConvProcList = list_delete_cell(ConvProcList, lc, prev);
274 0 : pfree(convinfo);
275 0 : continue; /* prev mustn't advance */
276 : }
277 : }
278 :
279 0 : prev = lc;
280 : }
281 :
282 0 : if (found)
283 0 : return 0; /* success */
284 : else
285 0 : return -1; /* it's not cached, so fail */
286 : }
287 :
288 : /*
289 : * Initialize client encoding conversions.
290 : * Called from InitPostgres() once during backend startup.
291 : */
292 : void
293 336 : InitializeClientEncoding(void)
294 : {
295 336 : Assert(!backend_startup_complete);
296 336 : backend_startup_complete = true;
297 :
298 672 : if (PrepareClientEncoding(pending_client_encoding) < 0 ||
299 336 : SetClientEncoding(pending_client_encoding) < 0)
300 : {
301 : /*
302 : * Oops, the requested conversion is not available. We couldn't fail
303 : * before, but we can now.
304 : */
305 0 : ereport(FATAL,
306 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
307 : errmsg("conversion between %s and %s is not supported",
308 : pg_enc2name_tbl[pending_client_encoding].name,
309 : GetDatabaseEncodingName())));
310 : }
311 336 : }
312 :
313 : /*
314 : * returns the current client encoding
315 : */
316 : int
317 183 : pg_get_client_encoding(void)
318 : {
319 183 : return ClientEncoding->encoding;
320 : }
321 :
322 : /*
323 : * returns the current client encoding name
324 : */
325 : const char *
326 0 : pg_get_client_encoding_name(void)
327 : {
328 0 : return ClientEncoding->name;
329 : }
330 :
331 : /*
332 : * Convert src string to another encoding (general case).
333 : *
334 : * See the notes about string conversion functions at the top of this file.
335 : */
336 : unsigned char *
337 132 : pg_do_encoding_conversion(unsigned char *src, int len,
338 : int src_encoding, int dest_encoding)
339 : {
340 : unsigned char *result;
341 : Oid proc;
342 :
343 132 : if (len <= 0)
344 0 : return src; /* empty string is always valid */
345 :
346 132 : if (src_encoding == dest_encoding)
347 0 : return src; /* no conversion required, assume valid */
348 :
349 132 : if (dest_encoding == PG_SQL_ASCII)
350 2 : return src; /* any string is valid in SQL_ASCII */
351 :
352 130 : if (src_encoding == PG_SQL_ASCII)
353 : {
354 : /* No conversion is possible, but we must validate the result */
355 2 : (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
356 2 : return src;
357 : }
358 :
359 128 : if (!IsTransactionState()) /* shouldn't happen */
360 0 : elog(ERROR, "cannot perform encoding conversion outside a transaction");
361 :
362 128 : proc = FindDefaultConversionProc(src_encoding, dest_encoding);
363 128 : if (!OidIsValid(proc))
364 0 : ereport(ERROR,
365 : (errcode(ERRCODE_UNDEFINED_FUNCTION),
366 : errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
367 : pg_encoding_to_char(src_encoding),
368 : pg_encoding_to_char(dest_encoding))));
369 :
370 : /*
371 : * Allocate space for conversion result, being wary of integer overflow
372 : */
373 128 : if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
374 0 : ereport(ERROR,
375 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
376 : errmsg("out of memory"),
377 : errdetail("String of %d bytes is too long for encoding conversion.",
378 : len)));
379 :
380 128 : result = palloc(len * MAX_CONVERSION_GROWTH + 1);
381 :
382 128 : OidFunctionCall5(proc,
383 : Int32GetDatum(src_encoding),
384 : Int32GetDatum(dest_encoding),
385 : CStringGetDatum(src),
386 : CStringGetDatum(result),
387 : Int32GetDatum(len));
388 128 : return result;
389 : }
390 :
391 : /*
392 : * Convert string to encoding encoding_name. The source
393 : * encoding is the DB encoding.
394 : *
395 : * BYTEA convert_to(TEXT string, NAME encoding_name) */
396 : Datum
397 0 : pg_convert_to(PG_FUNCTION_ARGS)
398 : {
399 0 : Datum string = PG_GETARG_DATUM(0);
400 0 : Datum dest_encoding_name = PG_GETARG_DATUM(1);
401 0 : Datum src_encoding_name = DirectFunctionCall1(namein,
402 : CStringGetDatum(DatabaseEncoding->name));
403 : Datum result;
404 :
405 : /*
406 : * pg_convert expects a bytea as its first argument. We're passing it a
407 : * text argument here, relying on the fact that they are both in fact
408 : * varlena types, and thus structurally identical.
409 : */
410 0 : result = DirectFunctionCall3(pg_convert, string,
411 : src_encoding_name, dest_encoding_name);
412 :
413 0 : PG_RETURN_DATUM(result);
414 : }
415 :
416 : /*
417 : * Convert string from encoding encoding_name. The destination
418 : * encoding is the DB encoding.
419 : *
420 : * TEXT convert_from(BYTEA string, NAME encoding_name) */
421 : Datum
422 0 : pg_convert_from(PG_FUNCTION_ARGS)
423 : {
424 0 : Datum string = PG_GETARG_DATUM(0);
425 0 : Datum src_encoding_name = PG_GETARG_DATUM(1);
426 0 : Datum dest_encoding_name = DirectFunctionCall1(namein,
427 : CStringGetDatum(DatabaseEncoding->name));
428 : Datum result;
429 :
430 0 : result = DirectFunctionCall3(pg_convert, string,
431 : src_encoding_name, dest_encoding_name);
432 :
433 : /*
434 : * pg_convert returns a bytea, which we in turn return as text, relying on
435 : * the fact that they are both in fact varlena types, and thus
436 : * structurally identical. Although not all bytea values are valid text,
437 : * in this case it will be because we've told pg_convert to return one
438 : * that is valid as text in the current database encoding.
439 : */
440 0 : PG_RETURN_DATUM(result);
441 : }
442 :
443 : /*
444 : * Convert string between two arbitrary encodings.
445 : *
446 : * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
447 : */
448 : Datum
449 132 : pg_convert(PG_FUNCTION_ARGS)
450 : {
451 132 : bytea *string = PG_GETARG_BYTEA_PP(0);
452 132 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
453 132 : int src_encoding = pg_char_to_encoding(src_encoding_name);
454 132 : char *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
455 132 : int dest_encoding = pg_char_to_encoding(dest_encoding_name);
456 : const char *src_str;
457 : char *dest_str;
458 : bytea *retval;
459 : int len;
460 :
461 132 : if (src_encoding < 0)
462 0 : ereport(ERROR,
463 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
464 : errmsg("invalid source encoding name \"%s\"",
465 : src_encoding_name)));
466 132 : if (dest_encoding < 0)
467 0 : ereport(ERROR,
468 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
469 : errmsg("invalid destination encoding name \"%s\"",
470 : dest_encoding_name)));
471 :
472 : /* make sure that source string is valid */
473 132 : len = VARSIZE_ANY_EXHDR(string);
474 132 : src_str = VARDATA_ANY(string);
475 132 : pg_verify_mbstr_len(src_encoding, src_str, len, false);
476 :
477 : /* perform conversion */
478 132 : dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
479 : len,
480 : src_encoding,
481 : dest_encoding);
482 :
483 : /* update len if conversion actually happened */
484 132 : if (dest_str != src_str)
485 128 : len = strlen(dest_str);
486 :
487 : /*
488 : * build bytea data type structure.
489 : */
490 132 : retval = (bytea *) palloc(len + VARHDRSZ);
491 132 : SET_VARSIZE(retval, len + VARHDRSZ);
492 132 : memcpy(VARDATA(retval), dest_str, len);
493 :
494 132 : if (dest_str != src_str)
495 128 : pfree(dest_str);
496 :
497 : /* free memory if allocated by the toaster */
498 132 : PG_FREE_IF_COPY(string, 0);
499 :
500 132 : PG_RETURN_BYTEA_P(retval);
501 : }
502 :
503 : /*
504 : * get the length of the string considered as text in the specified
505 : * encoding. Raises an error if the data is not valid in that
506 : * encoding.
507 : *
508 : * INT4 length (BYTEA string, NAME src_encoding_name)
509 : */
510 : Datum
511 0 : length_in_encoding(PG_FUNCTION_ARGS)
512 : {
513 0 : bytea *string = PG_GETARG_BYTEA_PP(0);
514 0 : char *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
515 0 : int src_encoding = pg_char_to_encoding(src_encoding_name);
516 : const char *src_str;
517 : int len;
518 : int retval;
519 :
520 0 : if (src_encoding < 0)
521 0 : ereport(ERROR,
522 : (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
523 : errmsg("invalid encoding name \"%s\"",
524 : src_encoding_name)));
525 :
526 0 : len = VARSIZE_ANY_EXHDR(string);
527 0 : src_str = VARDATA_ANY(string);
528 :
529 0 : retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
530 :
531 0 : PG_RETURN_INT32(retval);
532 : }
533 :
534 : /*
535 : * Get maximum multibyte character length in the specified encoding.
536 : *
537 : * Note encoding is specified numerically, not by name as above.
538 : */
539 : Datum
540 0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
541 : {
542 0 : int encoding = PG_GETARG_INT32(0);
543 :
544 0 : if (PG_VALID_ENCODING(encoding))
545 0 : PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
546 : else
547 0 : PG_RETURN_NULL();
548 : }
549 :
550 : /*
551 : * Convert client encoding to server encoding.
552 : *
553 : * See the notes about string conversion functions at the top of this file.
554 : */
555 : char *
556 27073 : pg_client_to_server(const char *s, int len)
557 : {
558 27073 : return pg_any_to_server(s, len, ClientEncoding->encoding);
559 : }
560 :
561 : /*
562 : * Convert any encoding to server encoding.
563 : *
564 : * See the notes about string conversion functions at the top of this file.
565 : *
566 : * Unlike the other string conversion functions, this will apply validation
567 : * even if encoding == DatabaseEncoding->encoding. This is because this is
568 : * used to process data coming in from outside the database, and we never
569 : * want to just assume validity.
570 : */
571 : char *
572 137634 : pg_any_to_server(const char *s, int len, int encoding)
573 : {
574 137634 : if (len <= 0)
575 142 : return (char *) s; /* empty string is always valid */
576 :
577 137492 : if (encoding == DatabaseEncoding->encoding ||
578 : encoding == PG_SQL_ASCII)
579 : {
580 : /*
581 : * No conversion is needed, but we must still validate the data.
582 : */
583 137492 : (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
584 137492 : return (char *) s;
585 : }
586 :
587 0 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
588 : {
589 : /*
590 : * No conversion is possible, but we must still validate the data,
591 : * because the client-side code might have done string escaping using
592 : * the selected client_encoding. If the client encoding is ASCII-safe
593 : * then we just do a straight validation under that encoding. For an
594 : * ASCII-unsafe encoding we have a problem: we dare not pass such data
595 : * to the parser but we have no way to convert it. We compromise by
596 : * rejecting the data if it contains any non-ASCII characters.
597 : */
598 0 : if (PG_VALID_BE_ENCODING(encoding))
599 0 : (void) pg_verify_mbstr(encoding, s, len, false);
600 : else
601 : {
602 : int i;
603 :
604 0 : for (i = 0; i < len; i++)
605 : {
606 0 : if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
607 0 : ereport(ERROR,
608 : (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
609 : errmsg("invalid byte value for encoding \"%s\": 0x%02x",
610 : pg_enc2name_tbl[PG_SQL_ASCII].name,
611 : (unsigned char) s[i])));
612 : }
613 : }
614 0 : return (char *) s;
615 : }
616 :
617 : /* Fast path if we can use cached conversion function */
618 0 : if (encoding == ClientEncoding->encoding)
619 0 : return perform_default_encoding_conversion(s, len, true);
620 :
621 : /* General case ... will not work outside transactions */
622 0 : return (char *) pg_do_encoding_conversion((unsigned char *) s,
623 : len,
624 : encoding,
625 0 : DatabaseEncoding->encoding);
626 : }
627 :
628 : /*
629 : * Convert server encoding to client encoding.
630 : *
631 : * See the notes about string conversion functions at the top of this file.
632 : */
633 : char *
634 176576 : pg_server_to_client(const char *s, int len)
635 : {
636 176576 : return pg_server_to_any(s, len, ClientEncoding->encoding);
637 : }
638 :
639 : /*
640 : * Convert server encoding to any encoding.
641 : *
642 : * See the notes about string conversion functions at the top of this file.
643 : */
644 : char *
645 209205 : pg_server_to_any(const char *s, int len, int encoding)
646 : {
647 209205 : if (len <= 0)
648 9001 : return (char *) s; /* empty string is always valid */
649 :
650 200204 : if (encoding == DatabaseEncoding->encoding ||
651 : encoding == PG_SQL_ASCII)
652 200204 : return (char *) s; /* assume data is valid */
653 :
654 0 : if (DatabaseEncoding->encoding == PG_SQL_ASCII)
655 : {
656 : /* No conversion is possible, but we must validate the result */
657 0 : (void) pg_verify_mbstr(encoding, s, len, false);
658 0 : return (char *) s;
659 : }
660 :
661 : /* Fast path if we can use cached conversion function */
662 0 : if (encoding == ClientEncoding->encoding)
663 0 : return perform_default_encoding_conversion(s, len, false);
664 :
665 : /* General case ... will not work outside transactions */
666 0 : return (char *) pg_do_encoding_conversion((unsigned char *) s,
667 : len,
668 0 : DatabaseEncoding->encoding,
669 : encoding);
670 : }
671 :
672 : /*
673 : * Perform default encoding conversion using cached FmgrInfo. Since
674 : * this function does not access database at all, it is safe to call
675 : * outside transactions. If the conversion has not been set up by
676 : * SetClientEncoding(), no conversion is performed.
677 : */
678 : static char *
679 0 : perform_default_encoding_conversion(const char *src, int len,
680 : bool is_client_to_server)
681 : {
682 : char *result;
683 : int src_encoding,
684 : dest_encoding;
685 : FmgrInfo *flinfo;
686 :
687 0 : if (is_client_to_server)
688 : {
689 0 : src_encoding = ClientEncoding->encoding;
690 0 : dest_encoding = DatabaseEncoding->encoding;
691 0 : flinfo = ToServerConvProc;
692 : }
693 : else
694 : {
695 0 : src_encoding = DatabaseEncoding->encoding;
696 0 : dest_encoding = ClientEncoding->encoding;
697 0 : flinfo = ToClientConvProc;
698 : }
699 :
700 0 : if (flinfo == NULL)
701 0 : return (char *) src;
702 :
703 : /*
704 : * Allocate space for conversion result, being wary of integer overflow
705 : */
706 0 : if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
707 0 : ereport(ERROR,
708 : (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
709 : errmsg("out of memory"),
710 : errdetail("String of %d bytes is too long for encoding conversion.",
711 : len)));
712 :
713 0 : result = palloc(len * MAX_CONVERSION_GROWTH + 1);
714 :
715 0 : FunctionCall5(flinfo,
716 : Int32GetDatum(src_encoding),
717 : Int32GetDatum(dest_encoding),
718 : CStringGetDatum(src),
719 : CStringGetDatum(result),
720 : Int32GetDatum(len));
721 0 : return result;
722 : }
723 :
724 :
725 : /* convert a multibyte string to a wchar */
726 : int
727 0 : pg_mb2wchar(const char *from, pg_wchar *to)
728 : {
729 0 : return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
730 : }
731 :
732 : /* convert a multibyte string to a wchar with a limited length */
733 : int
734 32749 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
735 : {
736 32749 : return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
737 : }
738 :
739 : /* same, with any encoding */
740 : int
741 0 : pg_encoding_mb2wchar_with_len(int encoding,
742 : const char *from, pg_wchar *to, int len)
743 : {
744 0 : return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
745 : }
746 :
747 : /* convert a wchar string to a multibyte */
748 : int
749 0 : pg_wchar2mb(const pg_wchar *from, char *to)
750 : {
751 0 : return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
752 : }
753 :
754 : /* convert a wchar string to a multibyte with a limited length */
755 : int
756 1210 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
757 : {
758 1210 : return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
759 : }
760 :
761 : /* same, with any encoding */
762 : int
763 0 : pg_encoding_wchar2mb_with_len(int encoding,
764 : const pg_wchar *from, char *to, int len)
765 : {
766 0 : return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
767 : }
768 :
769 : /* returns the byte length of a multibyte character */
770 : int
771 25029945 : pg_mblen(const char *mbstr)
772 : {
773 25029945 : return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
774 : }
775 :
776 : /* returns the display length of a multibyte character */
777 : int
778 1027 : pg_dsplen(const char *mbstr)
779 : {
780 1027 : return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
781 : }
782 :
/*
 * Return the length, counted in characters, of a null-terminated multibyte
 * string in the database encoding.
 */
int
pg_mbstrlen(const char *mbstr)
{
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return strlen(mbstr);

	/* Otherwise step through the string one character at a time */
	for (nchars = 0; *mbstr != '\0'; nchars++)
		mbstr += pg_mblen(mbstr);

	return nchars;
}
800 :
/*
 * Return the length, counted in characters, of a multibyte string in the
 * database encoding that is not necessarily null-terminated.  At most
 * "limit" bytes are examined.
 *
 * In a single-byte encoding "limit" itself is returned without looking at
 * the data.  Otherwise counting also stops at a zero byte.
 */
int
pg_mbstrlen_with_len(const char *mbstr, int limit)
{
	int			nchars;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return limit;

	for (nchars = 0; limit > 0 && *mbstr != '\0'; nchars++)
	{
		int			width = pg_mblen(mbstr);

		mbstr += width;
		limit -= width;
	}

	return nchars;
}
823 :
824 : /*
825 : * returns the byte length of a multibyte string
826 : * (not necessarily NULL terminated)
827 : * that is no longer than limit.
828 : * this function does not break multibyte character boundary.
829 : */
830 : int
831 28917 : pg_mbcliplen(const char *mbstr, int len, int limit)
832 : {
833 28917 : return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
834 : len, limit);
835 : }
836 :
837 : /*
838 : * pg_mbcliplen with specified encoding
839 : */
840 : int
841 28917 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
842 : int len, int limit)
843 : {
844 : mblen_converter mblen_fn;
845 28917 : int clen = 0;
846 : int l;
847 :
848 : /* optimization for single byte encoding */
849 28917 : if (pg_encoding_max_length(encoding) == 1)
850 6 : return cliplen(mbstr, len, limit);
851 :
852 28911 : mblen_fn = pg_wchar_table[encoding].mblen;
853 :
854 2654246 : while (len > 0 && *mbstr)
855 : {
856 2597526 : l = (*mblen_fn) ((const unsigned char *) mbstr);
857 2597526 : if ((clen + l) > limit)
858 4 : break;
859 2597522 : clen += l;
860 2597522 : if (clen == limit)
861 1098 : break;
862 2596424 : len -= l;
863 2596424 : mbstr += l;
864 : }
865 28911 : return clen;
866 : }
867 :
/*
 * Like pg_mbcliplen, except that "limit" is a number of characters rather
 * than a number of bytes.  The return value is still a byte length.
 */
int
pg_mbcharcliplen(const char *mbstr, int len, int limit)
{
	int			kept_bytes = 0;
	int			seen_chars = 0;

	/* In a single-byte encoding, characters and bytes coincide */
	if (pg_database_encoding_max_length() == 1)
		return cliplen(mbstr, len, limit);

	while (len > 0 && *mbstr != '\0')
	{
		int			width = pg_mblen(mbstr);

		/* Stop as soon as this character would be one too many */
		if (++seen_chars > limit)
			break;

		kept_bytes += width;
		mbstr += width;
		len -= width;
	}

	return kept_bytes;
}
895 :
/*
 * mbcliplen for any single-byte encoding.
 *
 * Returns the number of bytes before the first zero byte of str, capped at
 * the smaller of len and limit.
 */
static int
cliplen(const char *str, int len, int limit)
{
	int			bound = (len < limit) ? len : limit;
	int			n;

	for (n = 0; n < bound && str[n] != '\0'; n++)
		;

	return n;
}
907 :
908 : void
909 335 : SetDatabaseEncoding(int encoding)
910 : {
911 335 : if (!PG_VALID_BE_ENCODING(encoding))
912 0 : elog(ERROR, "invalid database encoding: %d", encoding);
913 :
914 335 : DatabaseEncoding = &pg_enc2name_tbl[encoding];
915 335 : Assert(DatabaseEncoding->encoding == encoding);
916 335 : }
917 :
918 : void
919 341 : SetMessageEncoding(int encoding)
920 : {
921 : /* Some calls happen before we can elog()! */
922 341 : Assert(PG_VALID_ENCODING(encoding));
923 :
924 341 : MessageEncoding = &pg_enc2name_tbl[encoding];
925 341 : Assert(MessageEncoding->encoding == encoding);
926 341 : }
927 :
928 : #ifdef ENABLE_NLS
929 : /*
930 : * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
931 : * codeset. Fails for MULE_INTERNAL, an encoding unknown to gettext; can also
932 : * fail for gettext-internal causes like out-of-memory.
933 : */
934 : static bool
935 : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
936 : {
937 : bool elog_ok = (CurrentMemoryContext != NULL);
938 : int i;
939 :
940 : for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
941 : {
942 : if (pg_enc2gettext_tbl[i].encoding == encoding)
943 : {
944 : if (bind_textdomain_codeset(domainname,
945 : pg_enc2gettext_tbl[i].name) != NULL)
946 : return true;
947 :
948 : if (elog_ok)
949 : elog(LOG, "bind_textdomain_codeset failed");
950 : else
951 : write_stderr("bind_textdomain_codeset failed");
952 :
953 : break;
954 : }
955 : }
956 :
957 : return false;
958 : }
959 :
960 : /*
961 : * Bind a gettext message domain to the codeset corresponding to the database
962 : * encoding. For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
963 : * Return the MessageEncoding implied by the new settings.
964 : *
965 : * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
966 : * When that matches the database encoding, we don't need to do anything. In
967 : * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
968 : * database encoding, except for the C locale. (On Windows, we also permit a
969 : * discrepancy under the UTF8 encoding.) For the C locale, explicitly bind
970 : * gettext to the right codeset.
971 : *
972 : * On Windows, gettext defaults to the Windows ANSI code page. This is a
973 : * convenient departure for software that passes the strings to Windows ANSI
974 : * APIs, but we don't do that. Compel gettext to use database encoding or,
975 : * failing that, the LC_CTYPE encoding as it would on other platforms.
976 : *
977 : * This function is called before elog() and palloc() are usable.
978 : */
979 : int
980 : pg_bind_textdomain_codeset(const char *domainname)
981 : {
982 : bool elog_ok = (CurrentMemoryContext != NULL);
983 : int encoding = GetDatabaseEncoding();
984 : int new_msgenc;
985 :
986 : #ifndef WIN32
987 : const char *ctype = setlocale(LC_CTYPE, NULL);
988 :
989 : if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
990 : #endif
991 : if (encoding != PG_SQL_ASCII &&
992 : raw_pg_bind_textdomain_codeset(domainname, encoding))
993 : return encoding;
994 :
995 : new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
996 : if (new_msgenc < 0)
997 : new_msgenc = PG_SQL_ASCII;
998 :
999 : #ifdef WIN32
1000 : if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
1001 : /* On failure, the old message encoding remains valid. */
1002 : return GetMessageEncoding();
1003 : #endif
1004 :
1005 : return new_msgenc;
1006 : }
1007 : #endif
1008 :
1009 : /*
1010 : * The database encoding, also called the server encoding, represents the
1011 : * encoding of data stored in text-like data types. Affected types include
1012 : * cstring, text, varchar, name, xml, and json.
1013 : */
1014 : int
1015 414813 : GetDatabaseEncoding(void)
1016 : {
1017 414813 : return DatabaseEncoding->encoding;
1018 : }
1019 :
1020 : const char *
1021 670 : GetDatabaseEncodingName(void)
1022 : {
1023 670 : return DatabaseEncoding->name;
1024 : }
1025 :
1026 : Datum
1027 0 : getdatabaseencoding(PG_FUNCTION_ARGS)
1028 : {
1029 0 : return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
1030 : }
1031 :
1032 : Datum
1033 0 : pg_client_encoding(PG_FUNCTION_ARGS)
1034 : {
1035 0 : return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
1036 : }
1037 :
1038 : /*
1039 : * gettext() returns messages in this encoding. This often matches the
1040 : * database encoding, but it differs for SQL_ASCII databases, for processes
1041 : * not attached to a database, and under a database encoding lacking iconv
1042 : * support (MULE_INTERNAL).
1043 : */
1044 : int
1045 0 : GetMessageEncoding(void)
1046 : {
1047 0 : return MessageEncoding->encoding;
1048 : }
1049 :
1050 : #ifdef WIN32
1051 : /*
1052 : * Result is palloc'ed null-terminated utf16 string. The character length
1053 : * is also passed to utf16len if not null. Returns NULL iff failed.
1054 : */
1055 : WCHAR *
1056 : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
1057 : {
1058 : WCHAR *utf16;
1059 : int dstlen;
1060 : UINT codepage;
1061 :
1062 : codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
1063 :
1064 : /*
1065 : * Use MultiByteToWideChar directly if there is a corresponding codepage,
1066 : * or double conversion through UTF8 if not. Double conversion is needed,
1067 : * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
1068 : */
1069 : if (codepage != 0)
1070 : {
1071 : utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1072 : dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
1073 : utf16[dstlen] = (WCHAR) 0;
1074 : }
1075 : else
1076 : {
1077 : char *utf8;
1078 :
1079 : /*
1080 : * XXX pg_do_encoding_conversion() requires a transaction. In the
1081 : * absence of one, hope for the input to be valid UTF8.
1082 : */
1083 : if (IsTransactionState())
1084 : {
1085 : utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
1086 : len,
1087 : GetMessageEncoding(),
1088 : PG_UTF8);
1089 : if (utf8 != str)
1090 : len = strlen(utf8);
1091 : }
1092 : else
1093 : utf8 = (char *) str;
1094 :
1095 : utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
1096 : dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
1097 : utf16[dstlen] = (WCHAR) 0;
1098 :
1099 : if (utf8 != str)
1100 : pfree(utf8);
1101 : }
1102 :
1103 : if (dstlen == 0 && len > 0)
1104 : {
1105 : pfree(utf16);
1106 : return NULL; /* error */
1107 : }
1108 :
1109 : if (utf16len)
1110 : *utf16len = dstlen;
1111 : return utf16;
1112 : }
1113 :
1114 : #endif
|