LCOV - PostgreSQL - src/backend/utils/mb/mbutils.c

LCOV - code coverage report

Current view:	top level - src/backend/utils/mb - mbutils.c (source / functions)		Hit	Total	Coverage
Test:	PostgreSQL	Lines:	152	284	53.5 %
Date:	2017-09-29 15:12:54	Functions:	24	37	64.9 %
Legend:	Lines: hit not hit

          Line data    Source code

       1             : /*-------------------------------------------------------------------------
       2             :  *
       3             :  * mbutils.c
       4             :  *    This file contains functions for encoding conversion.
       5             :  *
       6             :  * The string-conversion functions in this file share some API quirks.
       7             :  * Note the following:
       8             :  *
       9             :  * The functions return a palloc'd, null-terminated string if conversion
      10             :  * is required.  However, if no conversion is performed, the given source
      11             :  * string pointer is returned as-is.
      12             :  *
      13             :  * Although the presence of a length argument means that callers can pass
      14             :  * non-null-terminated strings, care is required because the same string
      15             :  * will be passed back if no conversion occurs.  Such callers *must* check
      16             :  * whether result == src and handle that case differently.
      17             :  *
      18             :  * If the source and destination encodings are the same, the source string
      19             :  * is returned without any verification; it's assumed to be valid data.
      20             :  * If that might not be the case, the caller is responsible for validating
      21             :  * the string using a separate call to pg_verify_mbstr().  Whenever the
      22             :  * source and destination encodings are different, the functions ensure that
      23             :  * the result is validly encoded according to the destination encoding.
      24             :  *
      25             :  *
      26             :  * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
      27             :  * Portions Copyright (c) 1994, Regents of the University of California
      28             :  *
      29             :  *
      30             :  * IDENTIFICATION
      31             :  *    src/backend/utils/mb/mbutils.c
      32             :  *
      33             :  *-------------------------------------------------------------------------
      34             :  */
      35             : #include "postgres.h"
      36             : 
      37             : #include "access/xact.h"
      38             : #include "catalog/namespace.h"
      39             : #include "mb/pg_wchar.h"
      40             : #include "utils/builtins.h"
      41             : #include "utils/memutils.h"
      42             : #include "utils/syscache.h"
      43             : 
      44             : /*
      45             :  * When converting strings between different encodings, we assume that the
      46             :  * converted result needs at most 4-to-1 growth in the worst case. The growth
      47             :  * rates for currently supported encoding pairs are within 3 (SJIS JIS X0201
      48             :  * half-width kana -> UTF8 is the worst case).  So "4" should be enough for now.
      49             :  *
      50             :  * Note that this is not the same as the maximum character width in any
      51             :  * particular encoding.
      52             :  */
      53             : #define MAX_CONVERSION_GROWTH  4
      54             : 
      55             : /*
      56             :  * We maintain a simple linked list caching the fmgr lookup info for the
      57             :  * currently selected conversion functions, as well as any that have been
      58             :  * selected previously in the current session.  (We remember previous
      59             :  * settings because we must be able to restore a previous setting during
      60             :  * transaction rollback, without doing any fresh catalog accesses.)
      61             :  *
      62             :  * Since we'll never release this data, we just keep it in TopMemoryContext.
      63             :  */
      64             : typedef struct ConvProcInfo
      65             : {
      66             :     int         s_encoding;     /* server and client encoding IDs */
      67             :     int         c_encoding;
      68             :     FmgrInfo    to_server_info; /* lookup info for conversion procs */
      69             :     FmgrInfo    to_client_info;
      70             : } ConvProcInfo;
      71             : 
      72             : static List *ConvProcList = NIL;    /* List of ConvProcInfo */
      73             : 
      74             : /*
      75             :  * These variables point to the currently active conversion functions,
      76             :  * or are NULL when no conversion is needed.
      77             :  */
      78             : static FmgrInfo *ToServerConvProc = NULL;
      79             : static FmgrInfo *ToClientConvProc = NULL;
      80             : 
      81             : /*
      82             :  * These variables track the currently-selected encodings.
      83             :  */
      84             : static const pg_enc2name *ClientEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      85             : static const pg_enc2name *DatabaseEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      86             : static const pg_enc2name *MessageEncoding = &pg_enc2name_tbl[PG_SQL_ASCII];
      87             : 
      88             : /*
      89             :  * During backend startup we can't set client encoding because we (a)
      90             :  * can't look up the conversion functions, and (b) may not know the database
      91             :  * encoding yet either.  So SetClientEncoding() just accepts anything and
      92             :  * remembers it for InitializeClientEncoding() to apply later.
      93             :  */
      94             : static bool backend_startup_complete = false;
      95             : static int  pending_client_encoding = PG_SQL_ASCII;
      96             : 
      97             : 
      98             : /* Internal functions */
      99             : static char *perform_default_encoding_conversion(const char *src,
     100             :                                     int len, bool is_client_to_server);
     101             : static int  cliplen(const char *str, int len, int limit);
     102             : 
     103             : 
     104             : /*
     105             :  * Prepare for a future call to SetClientEncoding.  Success should mean
     106             :  * that SetClientEncoding is guaranteed to succeed for this encoding request.
     107             :  *
     108             :  * (But note that success before backend_startup_complete does not guarantee
     109             :  * success after ...)
     110             :  *
     111             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     112             :  */
     113             : int
     114         908 : PrepareClientEncoding(int encoding)
     115             : {
     116             :     int         current_server_encoding;
     117             :     ListCell   *lc;
     118             : 
     119         908 :     if (!PG_VALID_FE_ENCODING(encoding))
     120           0 :         return -1;
     121             : 
     122             :     /* Can't do anything during startup, per notes above */
     123         908 :     if (!backend_startup_complete)
     124         342 :         return 0;
     125             : 
     126         566 :     current_server_encoding = GetDatabaseEncoding();
     127             : 
     128             :     /*
     129             :      * Check for cases that require no conversion function.
     130             :      */
     131         566 :     if (current_server_encoding == encoding ||
     132         230 :         current_server_encoding == PG_SQL_ASCII ||
     133             :         encoding == PG_SQL_ASCII)
     134         566 :         return 0;
     135             : 
     136           0 :     if (IsTransactionState())
     137             :     {
     138             :         /*
     139             :          * If we're in a live transaction, it's safe to access the catalogs,
     140             :          * so look up the functions.  We repeat the lookup even if the info is
     141             :          * already cached, so that we can react to changes in the contents of
     142             :          * pg_conversion.
     143             :          */
     144             :         Oid         to_server_proc,
     145             :                     to_client_proc;
     146             :         ConvProcInfo *convinfo;
     147             :         MemoryContext oldcontext;
     148             : 
     149           0 :         to_server_proc = FindDefaultConversionProc(encoding,
     150             :                                                    current_server_encoding);
     151           0 :         if (!OidIsValid(to_server_proc))
     152           0 :             return -1;
     153           0 :         to_client_proc = FindDefaultConversionProc(current_server_encoding,
     154             :                                                    encoding);
     155           0 :         if (!OidIsValid(to_client_proc))
     156           0 :             return -1;
     157             : 
     158             :         /*
     159             :          * Load the fmgr info into TopMemoryContext (could still fail here)
     160             :          */
     161           0 :         convinfo = (ConvProcInfo *) MemoryContextAlloc(TopMemoryContext,
     162             :                                                        sizeof(ConvProcInfo));
     163           0 :         convinfo->s_encoding = current_server_encoding;
     164           0 :         convinfo->c_encoding = encoding;
     165           0 :         fmgr_info_cxt(to_server_proc, &convinfo->to_server_info,
     166             :                       TopMemoryContext);
     167           0 :         fmgr_info_cxt(to_client_proc, &convinfo->to_client_info,
     168             :                       TopMemoryContext);
     169             : 
     170             :         /* Attach new info to head of list */
     171           0 :         oldcontext = MemoryContextSwitchTo(TopMemoryContext);
     172           0 :         ConvProcList = lcons(convinfo, ConvProcList);
     173           0 :         MemoryContextSwitchTo(oldcontext);
     174             : 
     175             :         /*
     176             :          * We cannot yet remove any older entry for the same encoding pair,
     177             :          * since it could still be in use.  SetClientEncoding will clean up.
     178             :          */
     179             : 
     180           0 :         return 0;               /* success */
     181             :     }
     182             :     else
     183             :     {
     184             :         /*
     185             :          * If we're not in a live transaction, the only thing we can do is
     186             :          * restore a previous setting using the cache.  This covers all
     187             :          * transaction-rollback cases.  The only case it might not work for is
     188             :          * trying to change client_encoding on the fly by editing
     189             :          * postgresql.conf and SIGHUP'ing, which would probably be a stupid
     190             :          * thing to do anyway.
     191             :          */
     192           0 :         foreach(lc, ConvProcList)
     193             :         {
     194           0 :             ConvProcInfo *oldinfo = (ConvProcInfo *) lfirst(lc);
     195             : 
     196           0 :             if (oldinfo->s_encoding == current_server_encoding &&
     197           0 :                 oldinfo->c_encoding == encoding)
     198           0 :                 return 0;
     199             :         }
     200             : 
     201           0 :         return -1;              /* it's not cached, so fail */
     202             :     }
     203             : }
     204             : 
     205             : /*
     206             :  * Set the active client encoding and set up the conversion-function pointers.
     207             :  * PrepareClientEncoding should have been called previously for this encoding.
     208             :  *
     209             :  * Returns 0 if okay, -1 if not (bad encoding or can't support conversion)
     210             :  */
     211             : int
     212         678 : SetClientEncoding(int encoding)
     213             : {
     214             :     int         current_server_encoding;
     215             :     bool        found;
     216             :     ListCell   *lc;
     217             :     ListCell   *prev;
     218             :     ListCell   *next;
     219             : 
     220         678 :     if (!PG_VALID_FE_ENCODING(encoding))
     221           0 :         return -1;
     222             : 
     223             :     /* Can't do anything during startup, per notes above */
     224         678 :     if (!backend_startup_complete)
     225             :     {
     226         227 :         pending_client_encoding = encoding;
     227         227 :         return 0;
     228             :     }
     229             : 
     230         451 :     current_server_encoding = GetDatabaseEncoding();
     231             : 
     232             :     /*
     233             :      * Check for cases that require no conversion function.
     234             :      */
     235         451 :     if (current_server_encoding == encoding ||
     236         115 :         current_server_encoding == PG_SQL_ASCII ||
     237             :         encoding == PG_SQL_ASCII)
     238             :     {
     239         451 :         ClientEncoding = &pg_enc2name_tbl[encoding];
     240         451 :         ToServerConvProc = NULL;
     241         451 :         ToClientConvProc = NULL;
     242         451 :         return 0;
     243             :     }
     244             : 
     245             :     /*
     246             :      * Search the cache for the entry previously prepared by
     247             :      * PrepareClientEncoding; if there isn't one, we lose.  While at it,
     248             :      * release any duplicate entries so that repeated Prepare/Set cycles don't
     249             :      * leak memory.
     250             :      */
     251           0 :     found = false;
     252           0 :     prev = NULL;
     253           0 :     for (lc = list_head(ConvProcList); lc; lc = next)
     254             :     {
     255           0 :         ConvProcInfo *convinfo = (ConvProcInfo *) lfirst(lc);
     256             : 
     257           0 :         next = lnext(lc);
     258             : 
     259           0 :         if (convinfo->s_encoding == current_server_encoding &&
     260           0 :             convinfo->c_encoding == encoding)
     261             :         {
     262           0 :             if (!found)
     263             :             {
     264             :                 /* Found newest entry, so set up */
     265           0 :                 ClientEncoding = &pg_enc2name_tbl[encoding];
     266           0 :                 ToServerConvProc = &convinfo->to_server_info;
     267           0 :                 ToClientConvProc = &convinfo->to_client_info;
     268           0 :                 found = true;
     269             :             }
     270             :             else
     271             :             {
     272             :                 /* Duplicate entry, release it */
     273           0 :                 ConvProcList = list_delete_cell(ConvProcList, lc, prev);
     274           0 :                 pfree(convinfo);
     275           0 :                 continue;       /* prev mustn't advance */
     276             :             }
     277             :         }
     278             : 
     279           0 :         prev = lc;
     280             :     }
     281             : 
     282           0 :     if (found)
     283           0 :         return 0;               /* success */
     284             :     else
     285           0 :         return -1;              /* it's not cached, so fail */
     286             : }
     287             : 
     288             : /*
     289             :  * Initialize client encoding conversions.
     290             :  *      Called from InitPostgres() once during backend startup.
     291             :  */
     292             : void
     293         336 : InitializeClientEncoding(void)
     294             : {
     295         336 :     Assert(!backend_startup_complete);
     296         336 :     backend_startup_complete = true;
     297             : 
     298         672 :     if (PrepareClientEncoding(pending_client_encoding) < 0 ||
     299         336 :         SetClientEncoding(pending_client_encoding) < 0)
     300             :     {
     301             :         /*
     302             :          * Oops, the requested conversion is not available. We couldn't fail
     303             :          * before, but we can now.
     304             :          */
     305           0 :         ereport(FATAL,
     306             :                 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
     307             :                  errmsg("conversion between %s and %s is not supported",
     308             :                         pg_enc2name_tbl[pending_client_encoding].name,
     309             :                         GetDatabaseEncodingName())));
     310             :     }
     311         336 : }
     312             : 
     313             : /*
     314             :  * returns the current client encoding
     315             :  */
     316             : int
     317         183 : pg_get_client_encoding(void)
     318             : {
     319         183 :     return ClientEncoding->encoding;
     320             : }
     321             : 
     322             : /*
     323             :  * returns the current client encoding name
     324             :  */
     325             : const char *
     326           0 : pg_get_client_encoding_name(void)
     327             : {
     328           0 :     return ClientEncoding->name;
     329             : }
     330             : 
     331             : /*
     332             :  * Convert src string to another encoding (general case).
     333             :  *
     334             :  * See the notes about string conversion functions at the top of this file.
     335             :  */
     336             : unsigned char *
     337         132 : pg_do_encoding_conversion(unsigned char *src, int len,
     338             :                           int src_encoding, int dest_encoding)
     339             : {
     340             :     unsigned char *result;
     341             :     Oid         proc;
     342             : 
     343         132 :     if (len <= 0)
     344           0 :         return src;             /* empty string is always valid */
     345             : 
     346         132 :     if (src_encoding == dest_encoding)
     347           0 :         return src;             /* no conversion required, assume valid */
     348             : 
     349         132 :     if (dest_encoding == PG_SQL_ASCII)
     350           2 :         return src;             /* any string is valid in SQL_ASCII */
     351             : 
     352         130 :     if (src_encoding == PG_SQL_ASCII)
     353             :     {
     354             :         /* No conversion is possible, but we must validate the result */
     355           2 :         (void) pg_verify_mbstr(dest_encoding, (const char *) src, len, false);
     356           2 :         return src;
     357             :     }
     358             : 
     359         128 :     if (!IsTransactionState())  /* shouldn't happen */
     360           0 :         elog(ERROR, "cannot perform encoding conversion outside a transaction");
     361             : 
     362         128 :     proc = FindDefaultConversionProc(src_encoding, dest_encoding);
     363         128 :     if (!OidIsValid(proc))
     364           0 :         ereport(ERROR,
     365             :                 (errcode(ERRCODE_UNDEFINED_FUNCTION),
     366             :                  errmsg("default conversion function for encoding \"%s\" to \"%s\" does not exist",
     367             :                         pg_encoding_to_char(src_encoding),
     368             :                         pg_encoding_to_char(dest_encoding))));
     369             : 
     370             :     /*
     371             :      * Allocate space for conversion result, being wary of integer overflow
     372             :      */
     373         128 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     374           0 :         ereport(ERROR,
     375             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     376             :                  errmsg("out of memory"),
     377             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     378             :                            len)));
     379             : 
     380         128 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     381             : 
     382         128 :     OidFunctionCall5(proc,
     383             :                      Int32GetDatum(src_encoding),
     384             :                      Int32GetDatum(dest_encoding),
     385             :                      CStringGetDatum(src),
     386             :                      CStringGetDatum(result),
     387             :                      Int32GetDatum(len));
     388         128 :     return result;
     389             : }
     390             : 
     391             : /*
     392             :  * Convert string to encoding encoding_name. The source
     393             :  * encoding is the DB encoding.
     394             :  *
     395             :  * BYTEA convert_to(TEXT string, NAME encoding_name) */
     396             : Datum
     397           0 : pg_convert_to(PG_FUNCTION_ARGS)
     398             : {
     399           0 :     Datum       string = PG_GETARG_DATUM(0);
     400           0 :     Datum       dest_encoding_name = PG_GETARG_DATUM(1);
     401           0 :     Datum       src_encoding_name = DirectFunctionCall1(namein,
     402             :                                                         CStringGetDatum(DatabaseEncoding->name));
     403             :     Datum       result;
     404             : 
     405             :     /*
     406             :      * pg_convert expects a bytea as its first argument. We're passing it a
     407             :      * text argument here, relying on the fact that they are both in fact
     408             :      * varlena types, and thus structurally identical.
     409             :      */
     410           0 :     result = DirectFunctionCall3(pg_convert, string,
     411             :                                  src_encoding_name, dest_encoding_name);
     412             : 
     413           0 :     PG_RETURN_DATUM(result);
     414             : }
     415             : 
     416             : /*
     417             :  * Convert string from encoding encoding_name. The destination
     418             :  * encoding is the DB encoding.
     419             :  *
     420             :  * TEXT convert_from(BYTEA string, NAME encoding_name) */
     421             : Datum
     422           0 : pg_convert_from(PG_FUNCTION_ARGS)
     423             : {
     424           0 :     Datum       string = PG_GETARG_DATUM(0);
     425           0 :     Datum       src_encoding_name = PG_GETARG_DATUM(1);
     426           0 :     Datum       dest_encoding_name = DirectFunctionCall1(namein,
     427             :                                                          CStringGetDatum(DatabaseEncoding->name));
     428             :     Datum       result;
     429             : 
     430           0 :     result = DirectFunctionCall3(pg_convert, string,
     431             :                                  src_encoding_name, dest_encoding_name);
     432             : 
     433             :     /*
     434             :      * pg_convert returns a bytea, which we in turn return as text, relying on
     435             :      * the fact that they are both in fact varlena types, and thus
     436             :      * structurally identical. Although not all bytea values are valid text,
     437             :      * in this case it will be because we've told pg_convert to return one
     438             :      * that is valid as text in the current database encoding.
     439             :      */
     440           0 :     PG_RETURN_DATUM(result);
     441             : }
     442             : 
     443             : /*
     444             :  * Convert string between two arbitrary encodings.
     445             :  *
     446             :  * BYTEA convert(BYTEA string, NAME src_encoding_name, NAME dest_encoding_name)
     447             :  */
     448             : Datum
     449         132 : pg_convert(PG_FUNCTION_ARGS)
     450             : {
     451         132 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     452         132 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     453         132 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     454         132 :     char       *dest_encoding_name = NameStr(*PG_GETARG_NAME(2));
     455         132 :     int         dest_encoding = pg_char_to_encoding(dest_encoding_name);
     456             :     const char *src_str;
     457             :     char       *dest_str;
     458             :     bytea      *retval;
     459             :     int         len;
     460             : 
     461         132 :     if (src_encoding < 0)
     462           0 :         ereport(ERROR,
     463             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     464             :                  errmsg("invalid source encoding name \"%s\"",
     465             :                         src_encoding_name)));
     466         132 :     if (dest_encoding < 0)
     467           0 :         ereport(ERROR,
     468             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     469             :                  errmsg("invalid destination encoding name \"%s\"",
     470             :                         dest_encoding_name)));
     471             : 
     472             :     /* make sure that source string is valid */
     473         132 :     len = VARSIZE_ANY_EXHDR(string);
     474         132 :     src_str = VARDATA_ANY(string);
     475         132 :     pg_verify_mbstr_len(src_encoding, src_str, len, false);
     476             : 
     477             :     /* perform conversion */
     478         132 :     dest_str = (char *) pg_do_encoding_conversion((unsigned char *) src_str,
     479             :                                                   len,
     480             :                                                   src_encoding,
     481             :                                                   dest_encoding);
     482             : 
     483             :     /* update len if conversion actually happened */
     484         132 :     if (dest_str != src_str)
     485         128 :         len = strlen(dest_str);
     486             : 
     487             :     /*
     488             :      * build bytea data type structure.
     489             :      */
     490         132 :     retval = (bytea *) palloc(len + VARHDRSZ);
     491         132 :     SET_VARSIZE(retval, len + VARHDRSZ);
     492         132 :     memcpy(VARDATA(retval), dest_str, len);
     493             : 
     494         132 :     if (dest_str != src_str)
     495         128 :         pfree(dest_str);
     496             : 
     497             :     /* free memory if allocated by the toaster */
     498         132 :     PG_FREE_IF_COPY(string, 0);
     499             : 
     500         132 :     PG_RETURN_BYTEA_P(retval);
     501             : }
     502             : 
     503             : /*
     504             :  * get the length of the string considered as text in the specified
     505             :  * encoding. Raises an error if the data is not valid in that
     506             :  * encoding.
     507             :  *
     508             :  * INT4 length (BYTEA string, NAME src_encoding_name)
     509             :  */
     510             : Datum
     511           0 : length_in_encoding(PG_FUNCTION_ARGS)
     512             : {
     513           0 :     bytea      *string = PG_GETARG_BYTEA_PP(0);
     514           0 :     char       *src_encoding_name = NameStr(*PG_GETARG_NAME(1));
     515           0 :     int         src_encoding = pg_char_to_encoding(src_encoding_name);
     516             :     const char *src_str;
     517             :     int         len;
     518             :     int         retval;
     519             : 
     520           0 :     if (src_encoding < 0)
     521           0 :         ereport(ERROR,
     522             :                 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
     523             :                  errmsg("invalid encoding name \"%s\"",
     524             :                         src_encoding_name)));
     525             : 
     526           0 :     len = VARSIZE_ANY_EXHDR(string);
     527           0 :     src_str = VARDATA_ANY(string);
     528             : 
     529           0 :     retval = pg_verify_mbstr_len(src_encoding, src_str, len, false);
     530             : 
     531           0 :     PG_RETURN_INT32(retval);
     532             : }
     533             : 
     534             : /*
     535             :  * Get maximum multibyte character length in the specified encoding.
     536             :  *
     537             :  * Note encoding is specified numerically, not by name as above.
     538             :  */
     539             : Datum
     540           0 : pg_encoding_max_length_sql(PG_FUNCTION_ARGS)
     541             : {
     542           0 :     int         encoding = PG_GETARG_INT32(0);
     543             : 
     544           0 :     if (PG_VALID_ENCODING(encoding))
     545           0 :         PG_RETURN_INT32(pg_wchar_table[encoding].maxmblen);
     546             :     else
     547           0 :         PG_RETURN_NULL();
     548             : }
     549             : 
     550             : /*
     551             :  * Convert client encoding to server encoding.
     552             :  *
     553             :  * See the notes about string conversion functions at the top of this file.
     554             :  */
     555             : char *
     556       27073 : pg_client_to_server(const char *s, int len)
     557             : {
     558       27073 :     return pg_any_to_server(s, len, ClientEncoding->encoding);
     559             : }
     560             : 
     561             : /*
     562             :  * Convert len bytes at s from the given encoding to the server encoding.
     563             :  * The input pointer s itself is returned whenever no converted copy is made.
     564             :  * See the notes about string conversion functions at the top of this file.
     565             :  *
     566             :  * Unlike the other string conversion functions, this will apply validation
     567             :  * even if encoding == DatabaseEncoding->encoding.  This is because this is
     568             :  * used to process data coming in from outside the database, and we never
     569             :  * want to just assume validity.
     570             :  */
     571             : char *
     572      137634 : pg_any_to_server(const char *s, int len, int encoding)
     573             : {
     574      137634 :     if (len <= 0)
     575         142 :         return (char *) s;      /* empty string is always valid */
     576             : 
     577      137492 :     if (encoding == DatabaseEncoding->encoding ||
     578             :         encoding == PG_SQL_ASCII)
     579             :     {
     580             :         /*
     581             :          * No conversion is needed, but we must still validate the data; note it is checked under the server encoding.
     582             :          */
     583      137492 :         (void) pg_verify_mbstr(DatabaseEncoding->encoding, s, len, false);
     584      137492 :         return (char *) s;
     585             :     }
     586             : 
     587           0 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     588             :     {
     589             :         /*
     590             :          * No conversion is possible, but we must still validate the data,
     591             :          * because the client-side code might have done string escaping using
     592             :          * the selected client_encoding.  If the client encoding is ASCII-safe
     593             :          * then we just do a straight validation under that encoding.  For an
     594             :          * ASCII-unsafe encoding we have a problem: we dare not pass such data
     595             :          * to the parser but we have no way to convert it.  We compromise by
     596             :          * rejecting the data if it contains any non-ASCII characters.
     597             :          */
     598           0 :         if (PG_VALID_BE_ENCODING(encoding))
     599           0 :             (void) pg_verify_mbstr(encoding, s, len, false);
     600             :         else
     601             :         {
     602             :             int         i;
     603             :             /* Reject the first zero byte or high-bit-set byte found. */
     604           0 :             for (i = 0; i < len; i++)
     605             :             {
     606           0 :                 if (s[i] == '\0' || IS_HIGHBIT_SET(s[i]))
     607           0 :                     ereport(ERROR,
     608             :                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
     609             :                              errmsg("invalid byte value for encoding \"%s\": 0x%02x",
     610             :                                     pg_enc2name_tbl[PG_SQL_ASCII].name,
     611             :                                     (unsigned char) s[i])));
     612             :             }
     613             :         }
     614           0 :         return (char *) s;
     615             :     }
     616             : 
     617             :     /* Fast path: the source is the client encoding, so the cached conversion function applies */
     618           0 :     if (encoding == ClientEncoding->encoding)
     619           0 :         return perform_default_encoding_conversion(s, len, true);
     620             : 
     621             :     /* General case ... will not work outside transactions */
     622           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) s,
     623             :                                               len,
     624             :                                               encoding,
     625           0 :                                               DatabaseEncoding->encoding);
     626             : }
     627             : 
     628             : /*
     629             :  * Convert len bytes at s from the server encoding to the client encoding.
     630             :  * Thin wrapper: hands ClientEncoding->encoding to pg_server_to_any().
     631             :  * See the notes about string conversion functions at the top of this file.
     632             :  */
     633             : char *
     634      176576 : pg_server_to_client(const char *s, int len)
     635             : {
     636      176576 :     return pg_server_to_any(s, len, ClientEncoding->encoding);
     637             : }
     638             : 
     639             : /*
     640             :  * Convert len bytes at s from the server encoding to the given encoding.
     641             :  * The input pointer s itself is returned whenever no converted copy is made.
     642             :  * See the notes about string conversion functions at the top of this file.
     643             :  */
     644             : char *
     645      209205 : pg_server_to_any(const char *s, int len, int encoding)
     646             : {
     647      209205 :     if (len <= 0)
     648        9001 :         return (char *) s;      /* empty string is always valid */
     649             : 
     650      200204 :     if (encoding == DatabaseEncoding->encoding ||
     651             :         encoding == PG_SQL_ASCII)
     652      200204 :         return (char *) s;      /* assume data is valid */
     653             : 
     654           0 :     if (DatabaseEncoding->encoding == PG_SQL_ASCII)
     655             :     {
     656             :         /* No conversion is possible, but we must validate the result under the target encoding */
     657           0 :         (void) pg_verify_mbstr(encoding, s, len, false);
     658           0 :         return (char *) s;
     659             :     }
     660             : 
     661             :     /* Fast path: the target is the client encoding, so the cached conversion function applies */
     662           0 :     if (encoding == ClientEncoding->encoding)
     663           0 :         return perform_default_encoding_conversion(s, len, false);
     664             : 
     665             :     /* General case ... will not work outside transactions */
     666           0 :     return (char *) pg_do_encoding_conversion((unsigned char *) s,
     667             :                                               len,
     668           0 :                                               DatabaseEncoding->encoding,
     669             :                                               encoding);
     670             : }
     671             : 
     672             : /*
     673             :  * Convert between client and server encodings using the cached FmgrInfo
     674             :  * (ToServerConvProc or ToClientConvProc, per is_client_to_server).  This
     675             :  * does not access the database, so it is safe outside transactions.  If
     676             :  * SetClientEncoding() has not set it up (NULL), src is returned unchanged.
     677             :  */
     678             : static char *
     679           0 : perform_default_encoding_conversion(const char *src, int len,
     680             :                                     bool is_client_to_server)
     681             : {
     682             :     char       *result;
     683             :     int         src_encoding,
     684             :                 dest_encoding;
     685             :     FmgrInfo   *flinfo;
     686             : 
     687           0 :     if (is_client_to_server)
     688             :     {
     689           0 :         src_encoding = ClientEncoding->encoding;
     690           0 :         dest_encoding = DatabaseEncoding->encoding;
     691           0 :         flinfo = ToServerConvProc;
     692             :     }
     693             :     else
     694             :     {
     695           0 :         src_encoding = DatabaseEncoding->encoding;
     696           0 :         dest_encoding = ClientEncoding->encoding;
     697           0 :         flinfo = ToClientConvProc;
     698             :     }
     699             : 
     700           0 :     if (flinfo == NULL)
     701           0 :         return (char *) src;
     702             : 
     703             :     /*
     704             :      * Allocate space for conversion result, being wary of integer overflow
     705             :      */
     706           0 :     if ((Size) len >= (MaxAllocSize / (Size) MAX_CONVERSION_GROWTH))
     707           0 :         ereport(ERROR,
     708             :                 (errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
     709             :                  errmsg("out of memory"),
     710             :                  errdetail("String of %d bytes is too long for encoding conversion.",
     711             :                            len)));
     712             :     /* Worst case: MAX_CONVERSION_GROWTH output bytes per input byte, plus one more (presumably the terminator). */
     713           0 :     result = palloc(len * MAX_CONVERSION_GROWTH + 1);
     714             :     /* Arguments: source encoding, destination encoding, input, output buffer, input length. */
     715           0 :     FunctionCall5(flinfo,
     716             :                   Int32GetDatum(src_encoding),
     717             :                   Int32GetDatum(dest_encoding),
     718             :                   CStringGetDatum(src),
     719             :                   CStringGetDatum(result),
     720             :                   Int32GetDatum(len));
     721           0 :     return result;
     722             : }
     723             : 
     724             : 
     725             : /* Convert a null-terminated multibyte string (database encoding) to pg_wchar; the input length is taken with strlen() */
     726             : int
     727           0 : pg_mb2wchar(const char *from, pg_wchar *to)
     728             : {
     729           0 :     return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, strlen(from));
     730             : }
     731             : 
     732             : /* Convert the first len bytes of a multibyte string (database encoding) to pg_wchar; no null terminator is needed on input */
     733             : int
     734       32749 : pg_mb2wchar_with_len(const char *from, pg_wchar *to, int len)
     735             : {
     736       32749 :     return (*pg_wchar_table[DatabaseEncoding->encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
     737             : }
     738             : 
     739             : /* Same as pg_mb2wchar_with_len, but with the given encoding's routine; encoding indexes pg_wchar_table without a validity check here */
     740             : int
     741           0 : pg_encoding_mb2wchar_with_len(int encoding,
     742             :                               const char *from, pg_wchar *to, int len)
     743             : {
     744           0 :     return (*pg_wchar_table[encoding].mb2wchar_with_len) ((const unsigned char *) from, to, len);
     745             : }
     746             : 
     747             : /* Convert a pg_wchar string to multibyte (database encoding); the input length is taken with pg_wchar_strlen() */
     748             : int
     749           0 : pg_wchar2mb(const pg_wchar *from, char *to)
     750             : {
     751           0 :     return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, pg_wchar_strlen(from));
     752             : }
     753             : 
     754             : /* Convert a pg_wchar string of the given length to multibyte (database encoding) */
     755             : int
     756        1210 : pg_wchar2mb_with_len(const pg_wchar *from, char *to, int len)
     757             : {
     758        1210 :     return (*pg_wchar_table[DatabaseEncoding->encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
     759             : }
     760             : 
     761             : /* Same as pg_wchar2mb_with_len, but with the given encoding's routine; encoding indexes pg_wchar_table without a validity check here */
     762             : int
     763           0 : pg_encoding_wchar2mb_with_len(int encoding,
     764             :                               const pg_wchar *from, char *to, int len)
     765             : {
     766           0 :     return (*pg_wchar_table[encoding].wchar2mb_with_len) (from, (unsigned char *) to, len);
     767             : }
     768             : 
     769             : /* Return the byte length of the multibyte character starting at mbstr, per the database encoding's mblen routine */
     770             : int
     771    25029945 : pg_mblen(const char *mbstr)
     772             : {
     773    25029945 :     return ((*pg_wchar_table[DatabaseEncoding->encoding].mblen) ((const unsigned char *) mbstr));
     774             : }
     775             : 
     776             : /* Return the display length of the multibyte character starting at mbstr, per the database encoding's dsplen routine */
     777             : int
     778        1027 : pg_dsplen(const char *mbstr)
     779             : {
     780        1027 :     return ((*pg_wchar_table[DatabaseEncoding->encoding].dsplen) ((const unsigned char *) mbstr));
     781             : }
     782             : 
     783             : /* Return the length, counted in characters (wchars), of a null-terminated multibyte string in the database encoding */
     784             : int
     785          14 : pg_mbstrlen(const char *mbstr)
     786             : {
     787          14 :     int         len = 0;
     788             : 
     789             :     /* Single-byte encoding: the character count is just strlen() */
     790          14 :     if (pg_database_encoding_max_length() == 1)
     791           0 :         return strlen(mbstr);
     792             :     /* Otherwise advance one character at a time, using pg_mblen() as the step size */
     793          79 :     while (*mbstr)
     794             :     {
     795          51 :         mbstr += pg_mblen(mbstr);
     796          51 :         len++;
     797             :     }
     798          14 :     return len;
     799             : }
     800             : 
     801             : /* Return the length, counted in characters (wchars), of the first
     802             :  * 'limit' bytes of a multibyte string (not necessarily null-terminated)
     803             :  */
     804             : int
     805       14318 : pg_mbstrlen_with_len(const char *mbstr, int limit)
     806             : {
     807       14318 :     int         len = 0;
     808             : 
     809             :     /* Single-byte encoding: the answer is simply limit (no scan for a zero byte) */
     810       14318 :     if (pg_database_encoding_max_length() == 1)
     811           0 :         return limit;
     812             :     /* Count characters until the byte budget is used up or a zero byte is reached */
     813    23819860 :     while (limit > 0 && *mbstr)
     814             :     {
     815    23791224 :         int         l = pg_mblen(mbstr);
     816             : 
     817    23791224 :         limit -= l;
     818    23791224 :         mbstr += l;
     819    23791224 :         len++;
     820             :     }
     821       14318 :     return len;
     822             : }
     823             : 
     824             : /*
     825             :  * Return the byte length of the longest prefix of mbstr (len bytes, not
     826             :  * necessarily null-terminated) that is no longer than limit bytes and does
     827             :  * not break a multibyte character boundary.  Character boundaries follow
     828             :  * the database encoding; the work is done by pg_encoding_mbcliplen().
     829             :  */
     830             : int
     831       28917 : pg_mbcliplen(const char *mbstr, int len, int limit)
     832             : {
     833       28917 :     return pg_encoding_mbcliplen(DatabaseEncoding->encoding, mbstr,
     834             :                                  len, limit);
     835             : }
     836             : 
     837             : /*
     838             :  * Same as pg_mbcliplen, but character boundaries follow the specified encoding
     839             :  */
     840             : int
     841       28917 : pg_encoding_mbcliplen(int encoding, const char *mbstr,
     842             :                       int len, int limit)
     843             : {
     844             :     mblen_converter mblen_fn;
     845       28917 :     int         clen = 0;
     846             :     int         l;
     847             : 
     848             :     /* Single-byte encoding: no character can be split, so clip bytewise */
     849       28917 :     if (pg_encoding_max_length(encoding) == 1)
     850           6 :         return cliplen(mbstr, len, limit);
     851             : 
     852       28911 :     mblen_fn = pg_wchar_table[encoding].mblen;
     853             :     /* Accept whole characters until the next one would exceed limit, limit is hit exactly, or input ends */
     854     2654246 :     while (len > 0 && *mbstr)
     855             :     {
     856     2597526 :         l = (*mblen_fn) ((const unsigned char *) mbstr);
     857     2597526 :         if ((clen + l) > limit)
     858           4 :             break;
     859     2597522 :         clen += l;
     860     2597522 :         if (clen == limit)
     861        1098 :             break;
     862     2596424 :         len -= l;
     863     2596424 :         mbstr += l;
     864             :     }
     865       28911 :     return clen;
     866             : }
     867             : 
     868             : /*
     869             :  * Similar to pg_mbcliplen except that limit counts characters, not bytes:
     870             :  * returns the byte length of at most limit leading characters of mbstr.
     871             :  */
     872             : int
     873          46 : pg_mbcharcliplen(const char *mbstr, int len, int limit)
     874             : {
     875          46 :     int         clen = 0;
     876          46 :     int         nch = 0;
     877             :     int         l;
     878             : 
     879             :     /* Single-byte encoding: characters and bytes coincide, so clip bytewise */
     880          46 :     if (pg_database_encoding_max_length() == 1)
     881           0 :         return cliplen(mbstr, len, limit);
     882             :     /* nch counts characters seen; stop before adding the one that would exceed limit */
     883         253 :     while (len > 0 && *mbstr)
     884             :     {
     885         202 :         l = pg_mblen(mbstr);
     886         202 :         nch++;
     887         202 :         if (nch > limit)
     888          41 :             break;
     889         161 :         clen += l;
     890         161 :         len -= l;
     891         161 :         mbstr += l;
     892             :     }
     893          46 :     return clen;
     894             : }
     895             : 
     896             : /* mbcliplen for any single-byte encoding: count the bytes before a zero byte, capped at Min(len, limit) */
     897             : static int
     898           6 : cliplen(const char *str, int len, int limit)
     899             : {
     900           6 :     int         l = 0;
     901             :     /* Neither the buffer length nor the caller's limit may be exceeded */
     902           6 :     len = Min(len, limit);
     903          12 :     while (l < len && str[l])
     904           0 :         l++;
     905           6 :     return l;
     906             : }
     907             : /* Make 'encoding' the database encoding; raises ERROR via elog unless it passes PG_VALID_BE_ENCODING */
     908             : void
     909         335 : SetDatabaseEncoding(int encoding)
     910             : {
     911         335 :     if (!PG_VALID_BE_ENCODING(encoding))
     912           0 :         elog(ERROR, "invalid database encoding: %d", encoding);
     913             :     /* DatabaseEncoding points into pg_enc2name_tbl; the Assert checks that the entry at this index carries the same ID */
     914         335 :     DatabaseEncoding = &pg_enc2name_tbl[encoding];
     915         335 :     Assert(DatabaseEncoding->encoding == encoding);
     916         335 : }
     917             : /* Make 'encoding' the message encoding; its validity is only Assert-checked, never reported through elog */
     918             : void
     919         341 : SetMessageEncoding(int encoding)
     920             : {
     921             :     /* Some calls happen before we can elog(), so an Assert is used instead */
     922         341 :     Assert(PG_VALID_ENCODING(encoding));
     923             :     /* MessageEncoding points into pg_enc2name_tbl; the Assert checks that the entry at this index carries the same ID */
     924         341 :     MessageEncoding = &pg_enc2name_tbl[encoding];
     925         341 :     Assert(MessageEncoding->encoding == encoding);
     926         341 : }
     927             : 
     928             : #ifdef ENABLE_NLS
     929             : /*
     930             :  * Make one bind_textdomain_codeset() call, translating a pg_enc to a gettext
     931             :  * codeset.  Returns true on success.  Returns false for MULE_INTERNAL, an
     932             :  * encoding unknown to gettext, or on gettext-internal causes like out-of-memory.
     933             :  */
     934             : static bool
     935             : raw_pg_bind_textdomain_codeset(const char *domainname, int encoding)
     936             : {
     937             :     bool        elog_ok = (CurrentMemoryContext != NULL);
     938             :     int         i;
     939             :     /* pg_enc2gettext_tbl is terminated by an entry whose name is NULL */
     940             :     for (i = 0; pg_enc2gettext_tbl[i].name != NULL; i++)
     941             :     {
     942             :         if (pg_enc2gettext_tbl[i].encoding == encoding)
     943             :         {
     944             :             if (bind_textdomain_codeset(domainname,
     945             :                                         pg_enc2gettext_tbl[i].name) != NULL)
     946             :                 return true;
     947             :             /* Binding failed: report through elog only if a memory context exists, else straight to stderr */
     948             :             if (elog_ok)
     949             :                 elog(LOG, "bind_textdomain_codeset failed");
     950             :             else
     951             :                 write_stderr("bind_textdomain_codeset failed");
     952             :             /* Stop at the first matching entry even though it failed */
     953             :             break;
     954             :         }
     955             :     }
     956             : 
     957             :     return false;
     958             : }
     959             : 
     960             : /*
     961             :  * Bind a gettext message domain to the codeset corresponding to the database
     962             :  * encoding.  For SQL_ASCII, instead bind to the codeset implied by LC_CTYPE.
     963             :  * Return the MessageEncoding implied by the new settings.
     964             :  *
     965             :  * On most platforms, gettext defaults to the codeset implied by LC_CTYPE.
     966             :  * When that matches the database encoding, we don't need to do anything.  In
     967             :  * CREATE DATABASE, we enforce or trust that the locale's codeset matches the
     968             :  * database encoding, except for the C locale.  (On Windows, we also permit a
     969             :  * discrepancy under the UTF8 encoding.)  For the C locale, explicitly bind
     970             :  * gettext to the right codeset.
     971             :  *
     972             :  * On Windows, gettext defaults to the Windows ANSI code page.  This is a
     973             :  * convenient departure for software that passes the strings to Windows ANSI
     974             :  * APIs, but we don't do that.  Compel gettext to use database encoding or,
     975             :  * failing that, the LC_CTYPE encoding as it would on other platforms.
     976             :  *
     977             :  * This function may be called before elog() and palloc() are usable, hence elog_ok.
     978             :  */
     979             : int
     980             : pg_bind_textdomain_codeset(const char *domainname)
     981             : {
     982             :     bool        elog_ok = (CurrentMemoryContext != NULL);
     983             :     int         encoding = GetDatabaseEncoding();
     984             :     int         new_msgenc;
     985             :     /* Outside Windows the explicit bind below is attempted only under the C/POSIX locale; on Windows it is always attempted */
     986             : #ifndef WIN32
     987             :     const char *ctype = setlocale(LC_CTYPE, NULL);
     988             : 
     989             :     if (pg_strcasecmp(ctype, "C") == 0 || pg_strcasecmp(ctype, "POSIX") == 0)
     990             : #endif
     991             :         if (encoding != PG_SQL_ASCII &&
     992             :             raw_pg_bind_textdomain_codeset(domainname, encoding))
     993             :             return encoding;
     994             :     /* Otherwise use the encoding pg_get_encoding_from_locale() reports, or SQL_ASCII if it reports none */
     995             :     new_msgenc = pg_get_encoding_from_locale(NULL, elog_ok);
     996             :     if (new_msgenc < 0)
     997             :         new_msgenc = PG_SQL_ASCII;
     998             : 
     999             : #ifdef WIN32
    1000             :     if (!raw_pg_bind_textdomain_codeset(domainname, new_msgenc))
    1001             :         /* On failure, the old message encoding remains valid. */
    1002             :         return GetMessageEncoding();
    1003             : #endif
    1004             : 
    1005             :     return new_msgenc;
    1006             : }
    1007             : #endif
    1008             : 
    1009             : /*
    1010             :  * Return the ID of the database encoding, also called the server encoding:
    1011             :  * the encoding of data stored in text-like data types.  Affected types
    1012             :  * include cstring, text, varchar, name, xml, and json.
    1013             :  */
    1014             : int
    1015      414813 : GetDatabaseEncoding(void)
    1016             : {
    1017      414813 :     return DatabaseEncoding->encoding;
    1018             : }
    1019             : /* Return the database encoding's name string (DatabaseEncoding->name itself, not a copy) */
    1020             : const char *
    1021         670 : GetDatabaseEncodingName(void)
    1022             : {
    1023         670 :     return DatabaseEncoding->name;
    1024             : }
    1025             : /* SQL-callable: return the database encoding's name, produced by passing DatabaseEncoding->name through namein */
    1026             : Datum
    1027           0 : getdatabaseencoding(PG_FUNCTION_ARGS)
    1028             : {
    1029           0 :     return DirectFunctionCall1(namein, CStringGetDatum(DatabaseEncoding->name));
    1030             : }
    1031             : /* SQL-callable: return the client encoding's name, produced by passing ClientEncoding->name through namein */
    1032             : Datum
    1033           0 : pg_client_encoding(PG_FUNCTION_ARGS)
    1034             : {
    1035           0 :     return DirectFunctionCall1(namein, CStringGetDatum(ClientEncoding->name));
    1036             : }
    1037             : 
    1038             : /*
    1039             :  * Return the ID of the encoding in which gettext() returns messages.  This
    1040             :  * often matches the database encoding, but it differs for SQL_ASCII
    1041             :  * databases, for processes not attached to a database, and under a database
    1042             :  * encoding lacking iconv support (MULE_INTERNAL).
    1043             :  */
    1044             : int
    1045           0 : GetMessageEncoding(void)
    1046             : {
    1047           0 :     return MessageEncoding->encoding;
    1048             : }
    1049             : 
    1050             : #ifdef WIN32
    1051             : /*
    1052             :  * Convert len bytes of str (in the message encoding) to a palloc'ed, null-terminated UTF-16 string.
    1053             :  * The character length is also stored in *utf16len if not NULL.  Returns NULL iff conversion failed.
    1054             :  */
    1055             : WCHAR *
    1056             : pgwin32_message_to_UTF16(const char *str, int len, int *utf16len)
    1057             : {
    1058             :     WCHAR      *utf16;
    1059             :     int         dstlen;
    1060             :     UINT        codepage;
    1061             :     /* A codepage of 0 in the table entry selects the double-conversion branch below */
    1062             :     codepage = pg_enc2name_tbl[GetMessageEncoding()].codepage;
    1063             : 
    1064             :     /*
    1065             :      * Use MultiByteToWideChar directly if there is a corresponding codepage,
    1066             :      * or double conversion through UTF8 if not.  Double conversion is needed,
    1067             :      * for example, in an ENCODING=LATIN8, LC_CTYPE=C database.
    1068             :      */
    1069             :     if (codepage != 0)
    1070             :     {
    1071             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1072             :         dstlen = MultiByteToWideChar(codepage, 0, str, len, utf16, len);
    1073             :         utf16[dstlen] = (WCHAR) 0;
    1074             :     }
    1075             :     else
    1076             :     {
    1077             :         char       *utf8;
    1078             : 
    1079             :         /*
    1080             :          * XXX pg_do_encoding_conversion() requires a transaction.  In the
    1081             :          * absence of one, hope for the input to be valid UTF8.
    1082             :          */
    1083             :         if (IsTransactionState())
    1084             :         {
    1085             :             utf8 = (char *) pg_do_encoding_conversion((unsigned char *) str,
    1086             :                                                       len,
    1087             :                                                       GetMessageEncoding(),
    1088             :                                                       PG_UTF8);
    1089             :             if (utf8 != str)
    1090             :                 len = strlen(utf8);
    1091             :         }
    1092             :         else
    1093             :             utf8 = (char *) str;
    1094             : 
    1095             :         utf16 = (WCHAR *) palloc(sizeof(WCHAR) * (len + 1));
    1096             :         dstlen = MultiByteToWideChar(CP_UTF8, 0, utf8, len, utf16, len);
    1097             :         utf16[dstlen] = (WCHAR) 0;
    1098             :         /* Free the intermediate UTF8 copy only if the conversion actually allocated one */
    1099             :         if (utf8 != str)
    1100             :             pfree(utf8);
    1101             :     }
    1102             :     /* Zero output from non-empty input means MultiByteToWideChar() failed */
    1103             :     if (dstlen == 0 && len > 0)
    1104             :     {
    1105             :         pfree(utf16);
    1106             :         return NULL;            /* error */
    1107             :     }
    1108             : 
    1109             :     if (utf16len)
    1110             :         *utf16len = dstlen;
    1111             :     return utf16;
    1112             : }
    1113             : 
    1114             : #endif

Generated by: LCOV version 1.11