LCOV - code coverage report
Current view: top level - src/interfaces/libpq - wchar.c (source / functions) Hit Total Coverage
Test: PostgreSQL Lines: 34 671 5.1 %
Date: 2017-09-29 15:12:54 Functions: 8 66 12.1 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /*
       2             :  * conversion functions between pg_wchar and multibyte streams.
       3             :  * Tatsuo Ishii
       4             :  * src/backend/utils/mb/wchar.c
       5             :  *
       6             :  */
       7             : /* can be used in either frontend or backend */
       8             : #ifdef FRONTEND
       9             : #include "postgres_fe.h"
      10             : #else
      11             : #include "postgres.h"
      12             : #endif
      13             : 
      14             : #include "mb/pg_wchar.h"
      15             : 
      16             : 
      17             : /*
      18             :  * conversion to pg_wchar is done by "table driven."
      19             :  * to add an encoding support, define mb2wchar_with_len(), mblen(), dsplen()
      20             :  * for the particular encoding. Note that if the encoding is only
      21             :  * supported in the client, you don't need to define
      22             :  * mb2wchar_with_len() function (SJIS is the case).
      23             :  *
      24             :  * These functions generally assume that their input is validly formed.
      25             :  * The "verifier" functions, further down in the file, have to be more
      26             :  * paranoid.  We expect that mblen() does not need to examine more than
      27             :  * the first byte of the character to discover the correct length.
      28             :  *
      29             :  * Note: for the display output of psql to work properly, the return values
      30             :  * of the dsplen functions must conform to the Unicode standard. In particular
      31             :  * the NUL character is zero width and control characters are generally
      32             :  * width -1. It is recommended that non-ASCII encodings refer their ASCII
      33             :  * subset to the ASCII routines to ensure consistency.
      34             :  */
      35             : 
      36             : /*
      37             :  * SQL/ASCII
      38             :  */
      39             : static int
      40           0 : pg_ascii2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      41             : {
      42           0 :     int         cnt = 0;
      43             : 
      44           0 :     while (len > 0 && *from)
      45             :     {
      46           0 :         *to++ = *from++;
      47           0 :         len--;
      48           0 :         cnt++;
      49             :     }
      50           0 :     *to = 0;
      51           0 :     return cnt;
      52             : }
      53             : 
      54             : static int
      55           0 : pg_ascii_mblen(const unsigned char *s)
      56             : {
      57           0 :     return 1;
      58             : }
      59             : 
      60             : static int
      61           0 : pg_ascii_dsplen(const unsigned char *s)
      62             : {
      63           0 :     if (*s == '\0')
      64           0 :         return 0;
      65           0 :     if (*s < 0x20 || *s == 0x7f)
      66           0 :         return -1;
      67             : 
      68           0 :     return 1;
      69             : }
      70             : 
      71             : /*
      72             :  * EUC
      73             :  */
      74             : static int
      75           0 : pg_euc2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
      76             : {
      77           0 :     int         cnt = 0;
      78             : 
      79           0 :     while (len > 0 && *from)
      80             :     {
      81           0 :         if (*from == SS2 && len >= 2)    /* JIS X 0201 (so called "1 byte
      82             :                                          * KANA") */
      83             :         {
      84           0 :             from++;
      85           0 :             *to = (SS2 << 8) | *from++;
      86           0 :             len -= 2;
      87             :         }
      88           0 :         else if (*from == SS3 && len >= 3)   /* JIS X 0212 KANJI */
      89             :         {
      90           0 :             from++;
      91           0 :             *to = (SS3 << 16) | (*from++ << 8);
      92           0 :             *to |= *from++;
      93           0 :             len -= 3;
      94             :         }
      95           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* JIS X 0208 KANJI */
      96             :         {
      97           0 :             *to = *from++ << 8;
      98           0 :             *to |= *from++;
      99           0 :             len -= 2;
     100             :         }
     101             :         else                    /* must be ASCII */
     102             :         {
     103           0 :             *to = *from++;
     104           0 :             len--;
     105             :         }
     106           0 :         to++;
     107           0 :         cnt++;
     108             :     }
     109           0 :     *to = 0;
     110           0 :     return cnt;
     111             : }
     112             : 
     113             : static inline int
     114           0 : pg_euc_mblen(const unsigned char *s)
     115             : {
     116             :     int         len;
     117             : 
     118           0 :     if (*s == SS2)
     119           0 :         len = 2;
     120           0 :     else if (*s == SS3)
     121           0 :         len = 3;
     122           0 :     else if (IS_HIGHBIT_SET(*s))
     123           0 :         len = 2;
     124             :     else
     125           0 :         len = 1;
     126           0 :     return len;
     127             : }
     128             : 
     129             : static inline int
     130           0 : pg_euc_dsplen(const unsigned char *s)
     131             : {
     132             :     int         len;
     133             : 
     134           0 :     if (*s == SS2)
     135           0 :         len = 2;
     136           0 :     else if (*s == SS3)
     137           0 :         len = 2;
     138           0 :     else if (IS_HIGHBIT_SET(*s))
     139           0 :         len = 2;
     140             :     else
     141           0 :         len = pg_ascii_dsplen(s);
     142           0 :     return len;
     143             : }
     144             : 
     145             : /*
     146             :  * EUC_JP
     147             :  */
     148             : static int
     149           0 : pg_eucjp2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     150             : {
     151           0 :     return pg_euc2wchar_with_len(from, to, len);
     152             : }
     153             : 
     154             : static int
     155           0 : pg_eucjp_mblen(const unsigned char *s)
     156             : {
     157           0 :     return pg_euc_mblen(s);
     158             : }
     159             : 
     160             : static int
     161           0 : pg_eucjp_dsplen(const unsigned char *s)
     162             : {
     163             :     int         len;
     164             : 
     165           0 :     if (*s == SS2)
     166           0 :         len = 1;
     167           0 :     else if (*s == SS3)
     168           0 :         len = 2;
     169           0 :     else if (IS_HIGHBIT_SET(*s))
     170           0 :         len = 2;
     171             :     else
     172           0 :         len = pg_ascii_dsplen(s);
     173           0 :     return len;
     174             : }
     175             : 
     176             : /*
     177             :  * EUC_KR
     178             :  */
     179             : static int
     180           0 : pg_euckr2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     181             : {
     182           0 :     return pg_euc2wchar_with_len(from, to, len);
     183             : }
     184             : 
     185             : static int
     186           0 : pg_euckr_mblen(const unsigned char *s)
     187             : {
     188           0 :     return pg_euc_mblen(s);
     189             : }
     190             : 
     191             : static int
     192           0 : pg_euckr_dsplen(const unsigned char *s)
     193             : {
     194           0 :     return pg_euc_dsplen(s);
     195             : }
     196             : 
     197             : /*
     198             :  * EUC_CN
     199             :  *
     200             :  */
     201             : static int
     202           0 : pg_euccn2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     203             : {
     204           0 :     int         cnt = 0;
     205             : 
     206           0 :     while (len > 0 && *from)
     207             :     {
     208           0 :         if (*from == SS2 && len >= 3)    /* code set 2 (unused?) */
     209             :         {
     210           0 :             from++;
     211           0 :             *to = (SS2 << 16) | (*from++ << 8);
     212           0 :             *to |= *from++;
     213           0 :             len -= 3;
     214             :         }
     215           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused ?) */
     216             :         {
     217           0 :             from++;
     218           0 :             *to = (SS3 << 16) | (*from++ << 8);
     219           0 :             *to |= *from++;
     220           0 :             len -= 3;
     221             :         }
     222           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 1 */
     223             :         {
     224           0 :             *to = *from++ << 8;
     225           0 :             *to |= *from++;
     226           0 :             len -= 2;
     227             :         }
     228             :         else
     229             :         {
     230           0 :             *to = *from++;
     231           0 :             len--;
     232             :         }
     233           0 :         to++;
     234           0 :         cnt++;
     235             :     }
     236           0 :     *to = 0;
     237           0 :     return cnt;
     238             : }
     239             : 
     240             : static int
     241           0 : pg_euccn_mblen(const unsigned char *s)
     242             : {
     243             :     int         len;
     244             : 
     245           0 :     if (IS_HIGHBIT_SET(*s))
     246           0 :         len = 2;
     247             :     else
     248           0 :         len = 1;
     249           0 :     return len;
     250             : }
     251             : 
     252             : static int
     253           0 : pg_euccn_dsplen(const unsigned char *s)
     254             : {
     255             :     int         len;
     256             : 
     257           0 :     if (IS_HIGHBIT_SET(*s))
     258           0 :         len = 2;
     259             :     else
     260           0 :         len = pg_ascii_dsplen(s);
     261           0 :     return len;
     262             : }
     263             : 
     264             : /*
     265             :  * EUC_TW
     266             :  *
     267             :  */
     268             : static int
     269           0 : pg_euctw2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     270             : {
     271           0 :     int         cnt = 0;
     272             : 
     273           0 :     while (len > 0 && *from)
     274             :     {
     275           0 :         if (*from == SS2 && len >= 4)    /* code set 2 */
     276             :         {
     277           0 :             from++;
     278           0 :             *to = (((uint32) SS2) << 24) | (*from++ << 16);
     279           0 :             *to |= *from++ << 8;
     280           0 :             *to |= *from++;
     281           0 :             len -= 4;
     282             :         }
     283           0 :         else if (*from == SS3 && len >= 3)   /* code set 3 (unused?) */
     284             :         {
     285           0 :             from++;
     286           0 :             *to = (SS3 << 16) | (*from++ << 8);
     287           0 :             *to |= *from++;
     288           0 :             len -= 3;
     289             :         }
     290           0 :         else if (IS_HIGHBIT_SET(*from) && len >= 2) /* code set 2 */
     291             :         {
     292           0 :             *to = *from++ << 8;
     293           0 :             *to |= *from++;
     294           0 :             len -= 2;
     295             :         }
     296             :         else
     297             :         {
     298           0 :             *to = *from++;
     299           0 :             len--;
     300             :         }
     301           0 :         to++;
     302           0 :         cnt++;
     303             :     }
     304           0 :     *to = 0;
     305           0 :     return cnt;
     306             : }
     307             : 
     308             : static int
     309           0 : pg_euctw_mblen(const unsigned char *s)
     310             : {
     311             :     int         len;
     312             : 
     313           0 :     if (*s == SS2)
     314           0 :         len = 4;
     315           0 :     else if (*s == SS3)
     316           0 :         len = 3;
     317           0 :     else if (IS_HIGHBIT_SET(*s))
     318           0 :         len = 2;
     319             :     else
     320           0 :         len = 1;
     321           0 :     return len;
     322             : }
     323             : 
     324             : static int
     325           0 : pg_euctw_dsplen(const unsigned char *s)
     326             : {
     327             :     int         len;
     328             : 
     329           0 :     if (*s == SS2)
     330           0 :         len = 2;
     331           0 :     else if (*s == SS3)
     332           0 :         len = 2;
     333           0 :     else if (IS_HIGHBIT_SET(*s))
     334           0 :         len = 2;
     335             :     else
     336           0 :         len = pg_ascii_dsplen(s);
     337           0 :     return len;
     338             : }
     339             : 
     340             : /*
     341             :  * Convert pg_wchar to EUC_* encoding.
     342             :  * caller must allocate enough space for "to", including a trailing zero!
     343             :  * len: length of from.
     344             :  * "from" not necessarily null terminated.
     345             :  */
     346             : static int
     347           0 : pg_wchar2euc_with_len(const pg_wchar *from, unsigned char *to, int len)
     348             : {
     349           0 :     int         cnt = 0;
     350             : 
     351           0 :     while (len > 0 && *from)
     352             :     {
     353             :         unsigned char c;
     354             : 
     355           0 :         if ((c = (*from >> 24)))
     356             :         {
     357           0 :             *to++ = c;
     358           0 :             *to++ = (*from >> 16) & 0xff;
     359           0 :             *to++ = (*from >> 8) & 0xff;
     360           0 :             *to++ = *from & 0xff;
     361           0 :             cnt += 4;
     362             :         }
     363           0 :         else if ((c = (*from >> 16)))
     364             :         {
     365           0 :             *to++ = c;
     366           0 :             *to++ = (*from >> 8) & 0xff;
     367           0 :             *to++ = *from & 0xff;
     368           0 :             cnt += 3;
     369             :         }
     370           0 :         else if ((c = (*from >> 8)))
     371             :         {
     372           0 :             *to++ = c;
     373           0 :             *to++ = *from & 0xff;
     374           0 :             cnt += 2;
     375             :         }
     376             :         else
     377             :         {
     378           0 :             *to++ = *from;
     379           0 :             cnt++;
     380             :         }
     381           0 :         from++;
     382           0 :         len--;
     383             :     }
     384           0 :     *to = 0;
     385           0 :     return cnt;
     386             : }
     387             : 
     388             : 
     389             : /*
     390             :  * JOHAB
     391             :  */
     392             : static int
     393           0 : pg_johab_mblen(const unsigned char *s)
     394             : {
     395           0 :     return pg_euc_mblen(s);
     396             : }
     397             : 
     398             : static int
     399           0 : pg_johab_dsplen(const unsigned char *s)
     400             : {
     401           0 :     return pg_euc_dsplen(s);
     402             : }
     403             : 
     404             : /*
     405             :  * convert UTF8 string to pg_wchar (UCS-4)
     406             :  * caller must allocate enough space for "to", including a trailing zero!
     407             :  * len: length of from.
     408             :  * "from" not necessarily null terminated.
     409             :  */
     410             : static int
     411           0 : pg_utf2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     412             : {
     413           0 :     int         cnt = 0;
     414             :     uint32      c1,
     415             :                 c2,
     416             :                 c3,
     417             :                 c4;
     418             : 
     419           0 :     while (len > 0 && *from)
     420             :     {
     421           0 :         if ((*from & 0x80) == 0)
     422             :         {
     423           0 :             *to = *from++;
     424           0 :             len--;
     425             :         }
     426           0 :         else if ((*from & 0xe0) == 0xc0)
     427             :         {
     428           0 :             if (len < 2)
     429           0 :                 break;          /* drop trailing incomplete char */
     430           0 :             c1 = *from++ & 0x1f;
     431           0 :             c2 = *from++ & 0x3f;
     432           0 :             *to = (c1 << 6) | c2;
     433           0 :             len -= 2;
     434             :         }
     435           0 :         else if ((*from & 0xf0) == 0xe0)
     436             :         {
     437           0 :             if (len < 3)
     438           0 :                 break;          /* drop trailing incomplete char */
     439           0 :             c1 = *from++ & 0x0f;
     440           0 :             c2 = *from++ & 0x3f;
     441           0 :             c3 = *from++ & 0x3f;
     442           0 :             *to = (c1 << 12) | (c2 << 6) | c3;
     443           0 :             len -= 3;
     444             :         }
     445           0 :         else if ((*from & 0xf8) == 0xf0)
     446             :         {
     447           0 :             if (len < 4)
     448           0 :                 break;          /* drop trailing incomplete char */
     449           0 :             c1 = *from++ & 0x07;
     450           0 :             c2 = *from++ & 0x3f;
     451           0 :             c3 = *from++ & 0x3f;
     452           0 :             c4 = *from++ & 0x3f;
     453           0 :             *to = (c1 << 18) | (c2 << 12) | (c3 << 6) | c4;
     454           0 :             len -= 4;
     455             :         }
     456             :         else
     457             :         {
     458             :             /* treat a bogus char as length 1; not ours to raise error */
     459           0 :             *to = *from++;
     460           0 :             len--;
     461             :         }
     462           0 :         to++;
     463           0 :         cnt++;
     464             :     }
     465           0 :     *to = 0;
     466           0 :     return cnt;
     467             : }
     468             : 
     469             : 
     470             : /*
     471             :  * Map a Unicode code point to UTF-8.  utf8string must have 4 bytes of
     472             :  * space allocated.
     473             :  */
     474             : unsigned char *
     475           0 : unicode_to_utf8(pg_wchar c, unsigned char *utf8string)
     476             : {
     477           0 :     if (c <= 0x7F)
     478             :     {
     479           0 :         utf8string[0] = c;
     480             :     }
     481           0 :     else if (c <= 0x7FF)
     482             :     {
     483           0 :         utf8string[0] = 0xC0 | ((c >> 6) & 0x1F);
     484           0 :         utf8string[1] = 0x80 | (c & 0x3F);
     485             :     }
     486           0 :     else if (c <= 0xFFFF)
     487             :     {
     488           0 :         utf8string[0] = 0xE0 | ((c >> 12) & 0x0F);
     489           0 :         utf8string[1] = 0x80 | ((c >> 6) & 0x3F);
     490           0 :         utf8string[2] = 0x80 | (c & 0x3F);
     491             :     }
     492             :     else
     493             :     {
     494           0 :         utf8string[0] = 0xF0 | ((c >> 18) & 0x07);
     495           0 :         utf8string[1] = 0x80 | ((c >> 12) & 0x3F);
     496           0 :         utf8string[2] = 0x80 | ((c >> 6) & 0x3F);
     497           0 :         utf8string[3] = 0x80 | (c & 0x3F);
     498             :     }
     499             : 
     500           0 :     return utf8string;
     501             : }
     502             : 
     503             : /*
     504             :  * Trivial conversion from pg_wchar to UTF-8.
     505             :  * caller should allocate enough space for "to"
     506             :  * len: length of from.
     507             :  * "from" not necessarily null terminated.
     508             :  */
     509             : static int
     510           0 : pg_wchar2utf_with_len(const pg_wchar *from, unsigned char *to, int len)
     511             : {
     512           0 :     int         cnt = 0;
     513             : 
     514           0 :     while (len > 0 && *from)
     515             :     {
     516             :         int         char_len;
     517             : 
     518           0 :         unicode_to_utf8(*from, to);
     519           0 :         char_len = pg_utf_mblen(to);
     520           0 :         cnt += char_len;
     521           0 :         to += char_len;
     522           0 :         from++;
     523           0 :         len--;
     524             :     }
     525           0 :     *to = 0;
     526           0 :     return cnt;
     527             : }
     528             : 
     529             : /*
     530             :  * Return the byte length of a UTF8 character pointed to by s
     531             :  *
     532             :  * Note: in the current implementation we do not support UTF8 sequences
     533             :  * of more than 4 bytes; hence do NOT return a value larger than 4.
     534             :  * We return "1" for any leading byte that is either flat-out illegal or
     535             :  * indicates a length larger than we support.
     536             :  *
     537             :  * pg_utf2wchar_with_len(), utf8_to_unicode(), pg_utf8_islegal(), and perhaps
     538             :  * other places would need to be fixed to change this.
     539             :  */
     540             : int
     541     3541765 : pg_utf_mblen(const unsigned char *s)
     542             : {
     543             :     int         len;
     544             : 
     545     3541765 :     if ((*s & 0x80) == 0)
     546     3541753 :         len = 1;
     547          12 :     else if ((*s & 0xe0) == 0xc0)
     548          12 :         len = 2;
     549           0 :     else if ((*s & 0xf0) == 0xe0)
     550           0 :         len = 3;
     551           0 :     else if ((*s & 0xf8) == 0xf0)
     552           0 :         len = 4;
     553             : #ifdef NOT_USED
     554             :     else if ((*s & 0xfc) == 0xf8)
     555             :         len = 5;
     556             :     else if ((*s & 0xfe) == 0xfc)
     557             :         len = 6;
     558             : #endif
     559             :     else
     560           0 :         len = 1;
     561     3541765 :     return len;
     562             : }
     563             : 
     564             : /*
     565             :  * This is an implementation of wcwidth() and wcswidth() as defined in
     566             :  * "The Single UNIX Specification, Version 2, The Open Group, 1997"
     567             :  * <http://www.UNIX-systems.org/online.html>
     568             :  *
     569             :  * Markus Kuhn -- 2001-09-08 -- public domain
     570             :  *
     571             :  * customised for PostgreSQL
     572             :  *
     573             :  * original available at : http://www.cl.cam.ac.uk/~mgk25/ucs/wcwidth.c
     574             :  */
     575             : 
     576             : struct mbinterval
     577             : {
     578             :     unsigned short first;
     579             :     unsigned short last;
     580             : };
     581             : 
     582             : /* auxiliary function for binary search in interval table */
     583             : static int
     584     3535323 : mbbisearch(pg_wchar ucs, const struct mbinterval *table, int max)
     585             : {
     586     3535323 :     int         min = 0;
     587             :     int         mid;
     588             : 
     589     3535323 :     if (ucs < table[0].first || ucs > table[max].last)
     590     3535323 :         return 0;
     591           0 :     while (max >= min)
     592             :     {
     593           0 :         mid = (min + max) / 2;
     594           0 :         if (ucs > table[mid].last)
     595           0 :             min = mid + 1;
     596           0 :         else if (ucs < table[mid].first)
     597           0 :             max = mid - 1;
     598             :         else
     599           0 :             return 1;
     600             :     }
     601             : 
     602           0 :     return 0;
     603             : }
     604             : 
     605             : 
     606             : /* The following functions define the column width of an ISO 10646
     607             :  * character as follows:
     608             :  *
     609             :  *    - The null character (U+0000) has a column width of 0.
     610             :  *
     611             :  *    - Other C0/C1 control characters and DEL will lead to a return
     612             :  *      value of -1.
     613             :  *
     614             :  *    - Non-spacing and enclosing combining characters (general
     615             :  *      category code Mn or Me in the Unicode database) have a
     616             :  *      column width of 0.
     617             :  *
     618             :  *    - Other format characters (general category code Cf in the Unicode
     619             :  *      database) and ZERO WIDTH SPACE (U+200B) have a column width of 0.
     620             :  *
     621             :  *    - Hangul Jamo medial vowels and final consonants (U+1160-U+11FF)
     622             :  *      have a column width of 0.
     623             :  *
     624             :  *    - Spacing characters in the East Asian Wide (W) or East Asian
     625             :  *      FullWidth (F) category as defined in Unicode Technical
     626             :  *      Report #11 have a column width of 2.
     627             :  *
     628             :  *    - All remaining characters (including all printable
     629             :  *      ISO 8859-1 and WGL4 characters, Unicode control characters,
     630             :  *      etc.) have a column width of 1.
     631             :  *
     632             :  * This implementation assumes that wchar_t characters are encoded
     633             :  * in ISO 10646.
     634             :  */
     635             : 
     636             : static int
     637     3538881 : ucs_wcwidth(pg_wchar ucs)
     638             : {
     639             :     /* sorted list of non-overlapping intervals of non-spacing characters */
     640             :     static const struct mbinterval combining[] = {
     641             :         {0x0300, 0x034E}, {0x0360, 0x0362}, {0x0483, 0x0486},
     642             :         {0x0488, 0x0489}, {0x0591, 0x05A1}, {0x05A3, 0x05B9},
     643             :         {0x05BB, 0x05BD}, {0x05BF, 0x05BF}, {0x05C1, 0x05C2},
     644             :         {0x05C4, 0x05C4}, {0x064B, 0x0655}, {0x0670, 0x0670},
     645             :         {0x06D6, 0x06E4}, {0x06E7, 0x06E8}, {0x06EA, 0x06ED},
     646             :         {0x070F, 0x070F}, {0x0711, 0x0711}, {0x0730, 0x074A},
     647             :         {0x07A6, 0x07B0}, {0x0901, 0x0902}, {0x093C, 0x093C},
     648             :         {0x0941, 0x0948}, {0x094D, 0x094D}, {0x0951, 0x0954},
     649             :         {0x0962, 0x0963}, {0x0981, 0x0981}, {0x09BC, 0x09BC},
     650             :         {0x09C1, 0x09C4}, {0x09CD, 0x09CD}, {0x09E2, 0x09E3},
     651             :         {0x0A02, 0x0A02}, {0x0A3C, 0x0A3C}, {0x0A41, 0x0A42},
     652             :         {0x0A47, 0x0A48}, {0x0A4B, 0x0A4D}, {0x0A70, 0x0A71},
     653             :         {0x0A81, 0x0A82}, {0x0ABC, 0x0ABC}, {0x0AC1, 0x0AC5},
     654             :         {0x0AC7, 0x0AC8}, {0x0ACD, 0x0ACD}, {0x0B01, 0x0B01},
     655             :         {0x0B3C, 0x0B3C}, {0x0B3F, 0x0B3F}, {0x0B41, 0x0B43},
     656             :         {0x0B4D, 0x0B4D}, {0x0B56, 0x0B56}, {0x0B82, 0x0B82},
     657             :         {0x0BC0, 0x0BC0}, {0x0BCD, 0x0BCD}, {0x0C3E, 0x0C40},
     658             :         {0x0C46, 0x0C48}, {0x0C4A, 0x0C4D}, {0x0C55, 0x0C56},
     659             :         {0x0CBF, 0x0CBF}, {0x0CC6, 0x0CC6}, {0x0CCC, 0x0CCD},
     660             :         {0x0D41, 0x0D43}, {0x0D4D, 0x0D4D}, {0x0DCA, 0x0DCA},
     661             :         {0x0DD2, 0x0DD4}, {0x0DD6, 0x0DD6}, {0x0E31, 0x0E31},
     662             :         {0x0E34, 0x0E3A}, {0x0E47, 0x0E4E}, {0x0EB1, 0x0EB1},
     663             :         {0x0EB4, 0x0EB9}, {0x0EBB, 0x0EBC}, {0x0EC8, 0x0ECD},
     664             :         {0x0F18, 0x0F19}, {0x0F35, 0x0F35}, {0x0F37, 0x0F37},
     665             :         {0x0F39, 0x0F39}, {0x0F71, 0x0F7E}, {0x0F80, 0x0F84},
     666             :         {0x0F86, 0x0F87}, {0x0F90, 0x0F97}, {0x0F99, 0x0FBC},
     667             :         {0x0FC6, 0x0FC6}, {0x102D, 0x1030}, {0x1032, 0x1032},
     668             :         {0x1036, 0x1037}, {0x1039, 0x1039}, {0x1058, 0x1059},
     669             :         {0x1160, 0x11FF}, {0x17B7, 0x17BD}, {0x17C6, 0x17C6},
     670             :         {0x17C9, 0x17D3}, {0x180B, 0x180E}, {0x18A9, 0x18A9},
     671             :         {0x200B, 0x200F}, {0x202A, 0x202E}, {0x206A, 0x206F},
     672             :         {0x20D0, 0x20E3}, {0x302A, 0x302F}, {0x3099, 0x309A},
     673             :         {0xFB1E, 0xFB1E}, {0xFE20, 0xFE23}, {0xFEFF, 0xFEFF},
     674             :         {0xFFF9, 0xFFFB}
     675             :     };
     676             : 
     677             :     /* test for 8-bit control characters */
     678     3538881 :     if (ucs == 0)
     679           0 :         return 0;
     680             : 
     681     3538881 :     if (ucs < 0x20 || (ucs >= 0x7f && ucs < 0xa0) || ucs > 0x0010ffff)
     682        3558 :         return -1;
     683             : 
     684             :     /* binary search in table of non-spacing characters */
     685     3535323 :     if (mbbisearch(ucs, combining,
     686             :                    sizeof(combining) / sizeof(struct mbinterval) - 1))
     687           0 :         return 0;
     688             : 
     689             :     /*
     690             :      * if we arrive here, ucs is not a combining or C0/C1 control character
     691             :      */
     692             : 
     693     3535323 :     return 1 +
     694     3535323 :         (ucs >= 0x1100 &&
     695           0 :          (ucs <= 0x115f ||       /* Hangul Jamo init. consonants */
     696           0 :           (ucs >= 0x2e80 && ucs <= 0xa4cf && (ucs & ~0x0011) != 0x300a &&
     697           0 :            ucs != 0x303f) ||    /* CJK ... Yi */
     698           0 :           (ucs >= 0xac00 && ucs <= 0xd7a3) || /* Hangul Syllables */
     699           0 :           (ucs >= 0xf900 && ucs <= 0xfaff) || /* CJK Compatibility
     700             :                                                  * Ideographs */
     701           0 :           (ucs >= 0xfe30 && ucs <= 0xfe6f) || /* CJK Compatibility Forms */
     702           0 :           (ucs >= 0xff00 && ucs <= 0xff5f) || /* Fullwidth Forms */
     703           0 :           (ucs >= 0xffe0 && ucs <= 0xffe6) ||
     704           0 :           (ucs >= 0x20000 && ucs <= 0x2ffff)));
     705             : }
     706             : 
     707             : /*
     708             :  * Convert a UTF-8 character to a Unicode code point.
     709             :  * This is a one-character version of pg_utf2wchar_with_len.
     710             :  *
     711             :  * No error checks here, c must point to a long-enough string.
                     :  *
                     :  * The first byte selects the sequence length (1 to 4 bytes).  The payload
                     :  * bits of the first byte and the low 6 bits of each following byte are
                     :  * concatenated, most significant first.  The following bytes are not
                     :  * checked for the 10xxxxxx form.  A first byte that matches none of the
                     :  * four patterns yields 0xffffffff.
     712             :  */
     713             : pg_wchar
     714     3538881 : utf8_to_unicode(const unsigned char *c)
     715             : {
     716     3538881 :     if ((*c & 0x80) == 0)               /* 0xxxxxxx: single byte */
     717     3538869 :         return (pg_wchar) c[0];
     718          12 :     else if ((*c & 0xe0) == 0xc0)       /* 110xxxxx: two bytes */
     719          24 :         return (pg_wchar) (((c[0] & 0x1f) << 6) |
     720          12 :                            (c[1] & 0x3f));
     721           0 :     else if ((*c & 0xf0) == 0xe0)       /* 1110xxxx: three bytes */
     722           0 :         return (pg_wchar) (((c[0] & 0x0f) << 12) |
     723           0 :                            ((c[1] & 0x3f) << 6) |
     724           0 :                            (c[2] & 0x3f));
     725           0 :     else if ((*c & 0xf8) == 0xf0)       /* 11110xxx: four bytes */
     726           0 :         return (pg_wchar) (((c[0] & 0x07) << 18) |
     727           0 :                            ((c[1] & 0x3f) << 12) |
     728           0 :                            ((c[2] & 0x3f) << 6) |
     729           0 :                            (c[3] & 0x3f));
     730             :     else
     731             :         /* that is an invalid code on purpose */
     732           0 :         return 0xffffffff;
     733             : }
     734             : /*
                     :  * Display width of the UTF-8 character starting at s: the character is
                     :  * decoded with utf8_to_unicode() and the result of ucs_wcwidth() for that
                     :  * code point is returned unchanged.
                     :  */
     735             : static int
     736     3538881 : pg_utf_dsplen(const unsigned char *s)
     737             : {
     738     3538881 :     return ucs_wcwidth(utf8_to_unicode(s));
     739             : }
     740             : 
     741             : /*
     742             :  * convert mule internal code to pg_wchar
     743             :  * caller should allocate enough space for "to"
     744             :  * len: length of from.
     745             :  * "from" not necessarily null terminated.
                     :  *
                     :  * Conversion stops when len bytes have been consumed or a zero byte is
                     :  * reached.  In each output pg_wchar the charset-identifying byte is placed
                     :  * in bits 16-23 and the one or two bytes after it below that.  In the
                     :  * IS_LCPRV1/IS_LCPRV2 cases the first byte is skipped and the byte after
                     :  * it is used as the charset byte.  A multibyte form is decoded only when
                     :  * enough input bytes remain; otherwise the later branches are tried,
                     :  * ending with the single-byte case.
                     :  *
                     :  * A terminating 0 is stored in "to".  Returns the number of pg_wchars
                     :  * written, not counting that terminator.
     746             :  */
     747             : static int
     748           0 : pg_mule2wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     749             : {
     750           0 :     int         cnt = 0;
     751             : 
     752           0 :     while (len > 0 && *from)
     753             :     {
     754           0 :         if (IS_LC1(*from) && len >= 2)      /* charset byte + 1 byte */
     755             :         {
     756           0 :             *to = *from++ << 16;
     757           0 :             *to |= *from++;
     758           0 :             len -= 2;
     759             :         }
     760           0 :         else if (IS_LCPRV1(*from) && len >= 3)  /* prefix + charset + 1 byte */
     761             :         {
     762           0 :             from++;
     763           0 :             *to = *from++ << 16;
     764           0 :             *to |= *from++;
     765           0 :             len -= 3;
     766             :         }
     767           0 :         else if (IS_LC2(*from) && len >= 3) /* charset byte + 2 bytes */
     768             :         {
     769           0 :             *to = *from++ << 16;
     770           0 :             *to |= *from++ << 8;
     771           0 :             *to |= *from++;
     772           0 :             len -= 3;
     773             :         }
     774           0 :         else if (IS_LCPRV2(*from) && len >= 4)  /* prefix + charset + 2 bytes */
     775             :         {
     776           0 :             from++;
     777           0 :             *to = *from++ << 16;
     778           0 :             *to |= *from++ << 8;
     779           0 :             *to |= *from++;
     780           0 :             len -= 4;
     781             :         }
     782             :         else
     783             :         {                       /* assume ASCII */
     784           0 :             *to = (unsigned char) *from++;
     785           0 :             len--;
     786             :         }
     787           0 :         to++;
     788           0 :         cnt++;
     789             :     }
     790           0 :     *to = 0;
     791           0 :     return cnt;
     792             : }
     793             : 
     794             : /*
     795             :  * convert pg_wchar to mule internal code
     796             :  * caller should allocate enough space for "to"
     797             :  * len: length of from.
     798             :  * "from" not necessarily null terminated.
                     :  *
                     :  * Here len counts pg_wchars.  Conversion stops after len of them or at a
                     :  * zero pg_wchar.  Bits 16-23 of each pg_wchar (lb) select the output
                     :  * form: 2 or 3 bytes starting with lb itself, or, for the private ranges,
                     :  * 3 or 4 bytes starting with the matching LCPRV* prefix byte followed by
                     :  * lb.  Any other value is emitted as its low byte only.
                     :  *
                     :  * A terminating 0 byte is stored in "to".  Returns the number of bytes
                     :  * written, not counting that terminator.
     799             :  */
     800             : static int
     801           0 : pg_wchar2mule_with_len(const pg_wchar *from, unsigned char *to, int len)
     802             : {
     803           0 :     int         cnt = 0;
     804             : 
     805           0 :     while (len > 0 && *from)
     806             :     {
     807             :         unsigned char lb;
     808             : 
     809           0 :         lb = (*from >> 16) & 0xff;  /* charset-identifying byte */
     810           0 :         if (IS_LC1(lb))
     811             :         {
     812           0 :             *to++ = lb;
     813           0 :             *to++ = *from & 0xff;
     814           0 :             cnt += 2;
     815             :         }
     816           0 :         else if (IS_LC2(lb))
     817             :         {
     818           0 :             *to++ = lb;
     819           0 :             *to++ = (*from >> 8) & 0xff;
     820           0 :             *to++ = *from & 0xff;
     821           0 :             cnt += 3;
     822             :         }
     823           0 :         else if (IS_LCPRV1_A_RANGE(lb))
     824             :         {
     825           0 :             *to++ = LCPRV1_A;
     826           0 :             *to++ = lb;
     827           0 :             *to++ = *from & 0xff;
     828           0 :             cnt += 3;
     829             :         }
     830           0 :         else if (IS_LCPRV1_B_RANGE(lb))
     831             :         {
     832           0 :             *to++ = LCPRV1_B;
     833           0 :             *to++ = lb;
     834           0 :             *to++ = *from & 0xff;
     835           0 :             cnt += 3;
     836             :         }
     837           0 :         else if (IS_LCPRV2_A_RANGE(lb))
     838             :         {
     839           0 :             *to++ = LCPRV2_A;
     840           0 :             *to++ = lb;
     841           0 :             *to++ = (*from >> 8) & 0xff;
     842           0 :             *to++ = *from & 0xff;
     843           0 :             cnt += 4;
     844             :         }
     845           0 :         else if (IS_LCPRV2_B_RANGE(lb))
     846             :         {
     847           0 :             *to++ = LCPRV2_B;
     848           0 :             *to++ = lb;
     849           0 :             *to++ = (*from >> 8) & 0xff;
     850           0 :             *to++ = *from & 0xff;
     851           0 :             cnt += 4;
     852             :         }
     853             :         else
     854             :         {
     855           0 :             *to++ = *from & 0xff;   /* no charset byte matched: low byte only */
     856           0 :             cnt += 1;
     857             :         }
     858           0 :         from++;
     859           0 :         len--;
     860             :     }
     861           0 :     *to = 0;
     862           0 :     return cnt;
     863             : }
     864             : /*
                     :  * Byte length of the mule internal code character starting at s, decided
                     :  * from the first byte alone: 2 for IS_LC1, 3 for IS_LCPRV1 or IS_LC2,
                     :  * 4 for IS_LCPRV2, and 1 for anything else.
                     :  */
     865             : int
     866           0 : pg_mule_mblen(const unsigned char *s)
     867             : {
     868             :     int         len;
     869             : 
     870           0 :     if (IS_LC1(*s))
     871           0 :         len = 2;
     872           0 :     else if (IS_LCPRV1(*s))
     873           0 :         len = 3;
     874           0 :     else if (IS_LC2(*s))
     875           0 :         len = 3;
     876           0 :     else if (IS_LCPRV2(*s))
     877           0 :         len = 4;
     878             :     else
     879           0 :         len = 1;                /* assume ASCII */
     880           0 :     return len;
     881             : }
     882             : /*
                     :  * Display width of the mule internal code character starting at s,
                     :  * decided from the first byte alone: 1 for IS_LC1 and IS_LCPRV1, 2 for
                     :  * IS_LC2 and IS_LCPRV2, and 1 for anything else.
                     :  */
     883             : static int
     884           0 : pg_mule_dsplen(const unsigned char *s)
     885             : {
     886             :     int         len;
     887             : 
     888             :     /*
     889             :      * Note: it's not really appropriate to assume that all multibyte charsets
     890             :      * are double-wide on screen.  But this seems an okay approximation for
     891             :      * the MULE charsets we currently support.
     892             :      */
     893             : 
     894           0 :     if (IS_LC1(*s))
     895           0 :         len = 1;
     896           0 :     else if (IS_LCPRV1(*s))
     897           0 :         len = 1;
     898           0 :     else if (IS_LC2(*s))
     899           0 :         len = 2;
     900           0 :     else if (IS_LCPRV2(*s))
     901           0 :         len = 2;
     902             :     else
     903           0 :         len = 1;                /* assume ASCII */
     904             : 
     905           0 :     return len;
     906             : }
     907             : 
     908             : /*
     909             :  * ISO8859-1
                     :  *
                     :  * pg_latin12wchar_with_len: copy each input byte into one pg_wchar.
                     :  * Copying stops after len bytes or at a zero byte.  A terminating 0 is
                     :  * stored in "to", which the caller must have sized accordingly.  Returns
                     :  * the number of pg_wchars written, not counting the terminator.
     910             :  */
     911             : static int
     912           0 : pg_latin12wchar_with_len(const unsigned char *from, pg_wchar *to, int len)
     913             : {
     914           0 :     int         cnt = 0;
     915             : 
     916           0 :     while (len > 0 && *from)
     917             :     {
     918           0 :         *to++ = *from++;
     919           0 :         len--;
     920           0 :         cnt++;
     921             :     }
     922           0 :     *to = 0;
     923           0 :     return cnt;
     924             : }
     925             : 
     926             : /*
     927             :  * Trivial conversion from pg_wchar to single byte encoding. Just ignores
     928             :  * high bits.
     929             :  * caller should allocate enough space for "to"
     930             :  * len: length of from.
     931             :  * "from" not necessarily null terminated.
                     :  *
                     :  * Copying stops after len pg_wchars or at a zero pg_wchar.  A terminating
                     :  * 0 byte is stored in "to".  Returns the number of bytes written, not
                     :  * counting the terminator.
     932             :  */
     933             : static int
     934           0 : pg_wchar2single_with_len(const pg_wchar *from, unsigned char *to, int len)
     935             : {
     936           0 :     int         cnt = 0;
     937             : 
     938           0 :     while (len > 0 && *from)
     939             :     {
     940           0 :         *to++ = *from++;        /* assignment to unsigned char drops the high bits */
     941           0 :         len--;
     942           0 :         cnt++;
     943             :     }
     944           0 :     *to = 0;
     945           0 :     return cnt;
     946             : }
     947             : /*
                     :  * Byte length of a character in a single-byte encoding: always 1.  The
                     :  * argument is not examined.
                     :  */
     948             : static int
     949           0 : pg_latin1_mblen(const unsigned char *s)
     950             : {
     951           0 :     return 1;
     952             : }
     953             : /*
                     :  * Display width of the character at s; simply delegates to
                     :  * pg_ascii_dsplen().
                     :  */
     954             : static int
     955           0 : pg_latin1_dsplen(const unsigned char *s)
     956             : {
     957           0 :     return pg_ascii_dsplen(s);
     958             : }
     959             : 
     960             : /*
     961             :  * SJIS
                     :  *
                     :  * pg_sjis_mblen: byte length of the character starting at s, decided from
                     :  * the first byte alone: 1 for 0xa1..0xdf, 2 for any other byte with the
                     :  * high bit set, and 1 otherwise.
     962             :  */
     963             : static int
     964           0 : pg_sjis_mblen(const unsigned char *s)
     965             : {
     966             :     int         len;
     967             : 
     968           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     969           0 :         len = 1;                /* 1 byte kana? */
     970           0 :     else if (IS_HIGHBIT_SET(*s))
     971           0 :         len = 2;                /* kanji? */
     972             :     else
     973           0 :         len = 1;                /* should be ASCII */
     974           0 :     return len;
     975             : }
     976             : /*
                     :  * Display width of the SJIS character starting at s: 1 for a first byte
                     :  * in 0xa1..0xdf, 2 for any other byte with the high bit set, and the
                     :  * result of pg_ascii_dsplen() otherwise.
                     :  */
     977             : static int
     978           0 : pg_sjis_dsplen(const unsigned char *s)
     979             : {
     980             :     int         len;
     981             : 
     982           0 :     if (*s >= 0xa1 && *s <= 0xdf)
     983           0 :         len = 1;                /* 1 byte kana? */
     984           0 :     else if (IS_HIGHBIT_SET(*s))
     985           0 :         len = 2;                /* kanji? */
     986             :     else
     987           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
     988           0 :     return len;
     989             : }
     990             : 
     991             : /*
     992             :  * Big5
                     :  *
                     :  * pg_big5_mblen: byte length of the character starting at s: 2 when the
                     :  * first byte has its high bit set, 1 otherwise.
     993             :  */
     994             : static int
     995           0 : pg_big5_mblen(const unsigned char *s)
     996             : {
     997             :     int         len;
     998             : 
     999           0 :     if (IS_HIGHBIT_SET(*s))
    1000           0 :         len = 2;                /* kanji? */
    1001             :     else
    1002           0 :         len = 1;                /* should be ASCII */
    1003           0 :     return len;
    1004             : }
    1005             : /*
                     :  * Display width of the Big5 character starting at s: 2 when the first
                     :  * byte has its high bit set, else the result of pg_ascii_dsplen().
                     :  */
    1006             : static int
    1007           0 : pg_big5_dsplen(const unsigned char *s)
    1008             : {
    1009             :     int         len;
    1010             : 
    1011           0 :     if (IS_HIGHBIT_SET(*s))
    1012           0 :         len = 2;                /* kanji? */
    1013             :     else
    1014           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1015           0 :     return len;
    1016             : }
    1017             : 
    1018             : /*
    1019             :  * GBK
                     :  *
                     :  * pg_gbk_mblen: byte length of the character starting at s: 2 when the
                     :  * first byte has its high bit set, 1 otherwise.
    1020             :  */
    1021             : static int
    1022           0 : pg_gbk_mblen(const unsigned char *s)
    1023             : {
    1024             :     int         len;
    1025             : 
    1026           0 :     if (IS_HIGHBIT_SET(*s))
    1027           0 :         len = 2;                /* kanji? */
    1028             :     else
    1029           0 :         len = 1;                /* should be ASCII */
    1030           0 :     return len;
    1031             : }
    1032             : /*
                     :  * Display width of the GBK character starting at s: 2 when the first
                     :  * byte has its high bit set, else the result of pg_ascii_dsplen().
                     :  */
    1033             : static int
    1034           0 : pg_gbk_dsplen(const unsigned char *s)
    1035             : {
    1036             :     int         len;
    1037             : 
    1038           0 :     if (IS_HIGHBIT_SET(*s))
    1039           0 :         len = 2;                /* kanji? */
    1040             :     else
    1041           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1042           0 :     return len;
    1043             : }
    1044             : 
    1045             : /*
    1046             :  * UHC
                     :  *
                     :  * pg_uhc_mblen: byte length of the character starting at s: 2 when the
                     :  * first byte has its high bit set, 1 otherwise.
    1047             :  */
    1048             : static int
    1049           0 : pg_uhc_mblen(const unsigned char *s)
    1050             : {
    1051             :     int         len;
    1052             : 
    1053           0 :     if (IS_HIGHBIT_SET(*s))
    1054           0 :         len = 2;                /* 2byte? */
    1055             :     else
    1056           0 :         len = 1;                /* should be ASCII */
    1057           0 :     return len;
    1058             : }
    1059             : /*
                     :  * Display width of the UHC character starting at s: 2 when the first
                     :  * byte has its high bit set, else the result of pg_ascii_dsplen().
                     :  */
    1060             : static int
    1061           0 : pg_uhc_dsplen(const unsigned char *s)
    1062             : {
    1063             :     int         len;
    1064             : 
    1065           0 :     if (IS_HIGHBIT_SET(*s))
    1066           0 :         len = 2;                /* 2byte? */
    1067             :     else
    1068           0 :         len = pg_ascii_dsplen(s);   /* should be ASCII */
    1069           0 :     return len;
    1070             : }
    1071             : 
    1072             : /*
    1073             :  * GB18030
    1074             :  *  Added by Bill Huang <bhuang@redhat.com>,<bill_huanghb@ybb.ne.jp>
                     :  *
                     :  * pg_gb18030_mblen: byte length of the character starting at s.  A first
                     :  * byte without the high bit is 1 byte (ASCII).  Otherwise a second byte
                     :  * in 0x30..0x39 marks a 4-byte character and anything else a 2-byte one.
                     :  *
                     :  * NOTE(review): unlike what the comment at the top of this file expects
                     :  * of mblen() functions, this one examines s[1].  Presumably callers must
                     :  * guarantee that a second byte is readable whenever the first byte has
                     :  * its high bit set -- confirm against the callers.
    1075             :  */
    1076             : static int
    1077           0 : pg_gb18030_mblen(const unsigned char *s)
    1078             : {
    1079             :     int         len;
    1080             : 
    1081           0 :     if (!IS_HIGHBIT_SET(*s))
    1082           0 :         len = 1;                /* ASCII */
    1083           0 :     else if (*(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1084           0 :         len = 4;
    1085             :     else
    1086           0 :         len = 2;
    1087           0 :     return len;
    1088             : }
    1089             : /*
                     :  * Display width of the GB18030 character starting at s: 2 when the first
                     :  * byte has its high bit set (no distinction is made between the 2-byte
                     :  * and 4-byte forms), else the result of pg_ascii_dsplen().
                     :  */
    1090             : static int
    1091           0 : pg_gb18030_dsplen(const unsigned char *s)
    1092             : {
    1093             :     int         len;
    1094             : 
    1095           0 :     if (IS_HIGHBIT_SET(*s))
    1096           0 :         len = 2;
    1097             :     else
    1098           0 :         len = pg_ascii_dsplen(s);   /* ASCII */
    1099           0 :     return len;
    1100             : }
    1101             : 
    1102             : /*
    1103             :  *-------------------------------------------------------------------
    1104             :  * multibyte sequence validators
    1105             :  *
    1106             :  * These functions accept "s", a pointer to the first byte of a string,
    1107             :  * and "len", the remaining length of the string.  If there is a validly
    1108             :  * encoded character beginning at *s, return its length in bytes; else
    1109             :  * return -1.
    1110             :  *
    1111             :  * The functions can assume that len > 0 and that *s != '\0', but they must
    1112             :  * test for and reject zeroes in any additional bytes of a multibyte character.
    1113             :  *
    1114             :  * Note that this definition allows the function for a single-byte
    1115             :  * encoding to be just "return 1".
    1116             :  *-------------------------------------------------------------------
    1117             :  */
    1118             : /*
                     :  * Verifier for a single-byte encoding: always reports a 1-byte
                     :  * character.  Neither s nor len is examined.
                     :  */
    1119             : static int
    1120           0 : pg_ascii_verifier(const unsigned char *s, int len)
    1121             : {
    1122           0 :     return 1;
    1123             : }
    1124             : /*
                     :  * True when byte c lies in 0xa1..0xfe inclusive.  The argument is
                     :  * evaluated twice, so it must not have side effects.
                     :  */
    1125             : #define IS_EUC_RANGE_VALID(c)   ((c) >= 0xa1 && (c) <= 0xfe)
    1126             : /*
                     :  * Validate one EUC-JP character starting at s, with len bytes available.
                     :  *
                     :  * Accepted forms:
                     :  *   SS2 followed by one byte in 0xa1..0xdf        (2 bytes)
                     :  *   SS3 followed by two bytes in 0xa1..0xfe       (3 bytes)
                     :  *   two bytes, both in 0xa1..0xfe                 (2 bytes)
                     :  *   a byte without the high bit                   (1 byte)
                     :  *
                     :  * Returns the character's length in bytes, or -1 if fewer than that many
                     :  * bytes are available or any byte is out of range.
                     :  */
    1127             : static int
    1128           0 : pg_eucjp_verifier(const unsigned char *s, int len)
    1129             : {
    1130             :     int         l;
    1131             :     unsigned char c1,
    1132             :                 c2;
    1133             : 
    1134           0 :     c1 = *s++;
    1135             : 
    1136           0 :     switch (c1)
    1137             :     {
    1138             :         case SS2:               /* JIS X 0201 */
    1139           0 :             l = 2;
    1140           0 :             if (l > len)
    1141           0 :                 return -1;
    1142           0 :             c2 = *s++;
    1143           0 :             if (c2 < 0xa1 || c2 > 0xdf)
    1144           0 :                 return -1;
    1145           0 :             break;
    1146             : 
    1147             :         case SS3:               /* JIS X 0212 */
    1148           0 :             l = 3;
    1149           0 :             if (l > len)
    1150           0 :                 return -1;
    1151           0 :             c2 = *s++;
    1152           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1153           0 :                 return -1;
    1154           0 :             c2 = *s++;          /* c2 is reused for the third byte */
    1155           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1156           0 :                 return -1;
    1157           0 :             break;
    1158             : 
    1159             :         default:
    1160           0 :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1161             :             {
    1162           0 :                 l = 2;
    1163           0 :                 if (l > len)
    1164           0 :                     return -1;
    1165           0 :                 if (!IS_EUC_RANGE_VALID(c1))
    1166           0 :                     return -1;
    1167           0 :                 c2 = *s++;
    1168           0 :                 if (!IS_EUC_RANGE_VALID(c2))
    1169           0 :                     return -1;
    1170             :             }
    1171             :             else
    1172             :                 /* must be ASCII */
    1173             :             {
    1174           0 :                 l = 1;
    1175             :             }
    1176           0 :             break;
    1177             :     }
    1178             : 
    1179           0 :     return l;
    1180             : }
    1181             : /*
                     :  * Validate one EUC-KR character starting at s, with len bytes available.
                     :  *
                     :  * A first byte with the high bit set starts a 2-byte character whose two
                     :  * bytes must both lie in 0xa1..0xfe; any other first byte is a 1-byte
                     :  * character.  Returns the length in bytes, or -1 if the input is too
                     :  * short or a byte is out of range.
                     :  */
    1182             : static int
    1183           0 : pg_euckr_verifier(const unsigned char *s, int len)
    1184             : {
    1185             :     int         l;
    1186             :     unsigned char c1,
    1187             :                 c2;
    1188             : 
    1189           0 :     c1 = *s++;
    1190             : 
    1191           0 :     if (IS_HIGHBIT_SET(c1))
    1192             :     {
    1193           0 :         l = 2;
    1194           0 :         if (l > len)
    1195           0 :             return -1;
    1196           0 :         if (!IS_EUC_RANGE_VALID(c1))
    1197           0 :             return -1;
    1198           0 :         c2 = *s++;
    1199           0 :         if (!IS_EUC_RANGE_VALID(c2))
    1200           0 :             return -1;
    1201             :     }
    1202             :     else
    1203             :         /* must be ASCII */
    1204             :     {
    1205           0 :         l = 1;
    1206             :     }
    1207             : 
    1208           0 :     return l;
    1209             : }
    1210             : 
    1211             : /* EUC-CN byte sequences are exactly the same as EUC-KR, so reuse its verifier */
    1212             : #define pg_euccn_verifier   pg_euckr_verifier
    1213             : /*
                     :  * Validate one EUC-TW character starting at s, with len bytes available.
                     :  *
                     :  * Accepted forms:
                     :  *   SS2, a byte in 0xa1..0xa7, then two bytes in 0xa1..0xfe   (4 bytes)
                     :  *   a high-bit first byte followed by a byte in 0xa1..0xfe    (2 bytes)
                     :  *   a byte without the high bit                               (1 byte)
                     :  * A first byte of SS3 is always rejected.  In the 2-byte form only the
                     :  * second byte is range-checked.
                     :  *
                     :  * Returns the character's length in bytes, or -1 if the input is too
                     :  * short or a checked byte is out of range.
                     :  */
    1214             : static int
    1215           0 : pg_euctw_verifier(const unsigned char *s, int len)
    1216             : {
    1217             :     int         l;
    1218             :     unsigned char c1,
    1219             :                 c2;
    1220             : 
    1221           0 :     c1 = *s++;
    1222             : 
    1223           0 :     switch (c1)
    1224             :     {
    1225             :         case SS2:               /* CNS 11643 Plane 1-7 */
    1226           0 :             l = 4;
    1227           0 :             if (l > len)
    1228           0 :                 return -1;
    1229           0 :             c2 = *s++;
    1230           0 :             if (c2 < 0xa1 || c2 > 0xa7)
    1231           0 :                 return -1;
    1232           0 :             c2 = *s++;          /* c2 is reused for the third byte */
    1233           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1234           0 :                 return -1;
    1235           0 :             c2 = *s++;          /* ... and for the fourth byte */
    1236           0 :             if (!IS_EUC_RANGE_VALID(c2))
    1237           0 :                 return -1;
    1238           0 :             break;
    1239             : 
    1240             :         case SS3:               /* unused */
    1241           0 :             return -1;
    1242             : 
    1243             :         default:
    1244           0 :             if (IS_HIGHBIT_SET(c1)) /* CNS 11643 Plane 1 */
    1245             :             {
    1246           0 :                 l = 2;
    1247           0 :                 if (l > len)
    1248           0 :                     return -1;
    1249             :                 /* no further range check on c1? */
    1250           0 :                 c2 = *s++;
    1251           0 :                 if (!IS_EUC_RANGE_VALID(c2))
    1252           0 :                     return -1;
    1253             :             }
    1254             :             else
    1255             :                 /* must be ASCII */
    1256             :             {
    1257           0 :                 l = 1;
    1258             :             }
    1259           0 :             break;
    1260             :     }
    1261           0 :     return l;
    1262             : }
    1263             : /*
                     :  * Validate one JOHAB character starting at s, with len bytes available.
                     :  *
                     :  * The expected length comes from pg_johab_mblen().  If the first byte
                     :  * has no high bit, that length is returned without further checks;
                     :  * otherwise every byte after the first must lie in 0xa1..0xfe.  Returns
                     :  * the length in bytes, or -1 if the input is too short or a trailing
                     :  * byte is out of range.
                     :  */
    1264             : static int
    1265           0 : pg_johab_verifier(const unsigned char *s, int len)
    1266             : {
    1267             :     int         l,
    1268             :                 mbl;
    1269             :     unsigned char c;
    1270             : 
    1271           0 :     l = mbl = pg_johab_mblen(s);
    1272             : 
    1273           0 :     if (len < l)
    1274           0 :         return -1;
    1275             : 
    1276           0 :     if (!IS_HIGHBIT_SET(*s))
    1277           0 :         return mbl;
    1278             : 
    1279           0 :     while (--l > 0)             /* visit the mbl - 1 trailing bytes */
    1280             :     {
    1281           0 :         c = *++s;
    1282           0 :         if (!IS_EUC_RANGE_VALID(c))
    1283           0 :             return -1;
    1284             :     }
    1285           0 :     return mbl;
    1286             : }
    1287             : /*
                     :  * Validate one mule internal code character starting at s, with len
                     :  * bytes available.
                     :  *
                     :  * The expected length comes from pg_mule_mblen().  Every byte after the
                     :  * first must have its high bit set.  Returns the length in bytes, or -1
                     :  * if the input is too short or a trailing byte lacks the high bit.
                     :  */
    1288             : static int
    1289           0 : pg_mule_verifier(const unsigned char *s, int len)
    1290             : {
    1291             :     int         l,
    1292             :                 mbl;
    1293             :     unsigned char c;
    1294             : 
    1295           0 :     l = mbl = pg_mule_mblen(s);
    1296             : 
    1297           0 :     if (len < l)
    1298           0 :         return -1;
    1299             : 
    1300           0 :     while (--l > 0)             /* visit the mbl - 1 trailing bytes */
    1301             :     {
    1302           0 :         c = *++s;
    1303           0 :         if (!IS_HIGHBIT_SET(c))
    1304           0 :             return -1;
    1305             :     }
    1306           0 :     return mbl;
    1307             : }
    1308             : /*
                     :  * Verifier for a single-byte encoding: always reports a 1-byte
                     :  * character.  Neither s nor len is examined.
                     :  */
    1309             : static int
    1310           0 : pg_latin1_verifier(const unsigned char *s, int len)
    1311             : {
    1312           0 :     return 1;
    1313             : }
    1314             : /*
                     :  * Validate one SJIS character starting at s, with len bytes available.
                     :  *
                     :  * The expected length comes from pg_sjis_mblen().  A 1-byte character is
                     :  * accepted as is; a 2-byte character must satisfy ISSJISHEAD() on its
                     :  * first byte and ISSJISTAIL() on its second.  Returns the length in
                     :  * bytes, or -1 if the input is too short or either test fails.
                     :  */
    1315             : static int
    1316           0 : pg_sjis_verifier(const unsigned char *s, int len)
    1317             : {
    1318             :     int         l,
    1319             :                 mbl;
    1320             :     unsigned char c1,
    1321             :                 c2;
    1322             : 
    1323           0 :     l = mbl = pg_sjis_mblen(s);
    1324             : 
    1325           0 :     if (len < l)
    1326           0 :         return -1;
    1327             : 
    1328           0 :     if (l == 1)                 /* pg_sjis_mblen already verified it */
    1329           0 :         return mbl;
    1330             : 
    1331           0 :     c1 = *s++;
    1332           0 :     c2 = *s;
    1333           0 :     if (!ISSJISHEAD(c1) || !ISSJISTAIL(c2))
    1334           0 :         return -1;
    1335           0 :     return mbl;
    1336             : }
    1337             : /*
                     :  * Validate one Big5 character starting at s, with len bytes available.
                     :  *
                     :  * The expected length comes from pg_big5_mblen().  The only check made
                     :  * on bytes after the first is that they are not zero.  Returns the
                     :  * length in bytes, or -1 if the input is too short or a trailing byte
                     :  * is zero.
                     :  */
    1338             : static int
    1339           0 : pg_big5_verifier(const unsigned char *s, int len)
    1340             : {
    1341             :     int         l,
    1342             :                 mbl;
    1343             : 
    1344           0 :     l = mbl = pg_big5_mblen(s);
    1345             : 
    1346           0 :     if (len < l)
    1347           0 :         return -1;
    1348             : 
    1349           0 :     while (--l > 0)             /* visit the mbl - 1 trailing bytes */
    1350             :     {
    1351           0 :         if (*++s == '\0')
    1352           0 :             return -1;
    1353             :     }
    1354             : 
    1355           0 :     return mbl;
    1356             : }
    1357             : /*
                     :  * Validate one GBK character starting at s, with len bytes available.
                     :  *
                     :  * The expected length comes from pg_gbk_mblen().  The only check made
                     :  * on bytes after the first is that they are not zero.  Returns the
                     :  * length in bytes, or -1 if the input is too short or a trailing byte
                     :  * is zero.
                     :  */
    1358             : static int
    1359           0 : pg_gbk_verifier(const unsigned char *s, int len)
    1360             : {
    1361             :     int         l,
    1362             :                 mbl;
    1363             : 
    1364           0 :     l = mbl = pg_gbk_mblen(s);
    1365             : 
    1366           0 :     if (len < l)
    1367           0 :         return -1;
    1368             : 
    1369           0 :     while (--l > 0)             /* visit the mbl - 1 trailing bytes */
    1370             :     {
    1371           0 :         if (*++s == '\0')
    1372           0 :             return -1;
    1373             :     }
    1374             : 
    1375           0 :     return mbl;
    1376             : }
    1377             : /*
                     :  * Validate one UHC character starting at s, with len bytes available.
                     :  *
                     :  * The expected length comes from pg_uhc_mblen().  The only check made
                     :  * on bytes after the first is that they are not zero.  Returns the
                     :  * length in bytes, or -1 if the input is too short or a trailing byte
                     :  * is zero.
                     :  */
    1378             : static int
    1379           0 : pg_uhc_verifier(const unsigned char *s, int len)
    1380             : {
    1381             :     int         l,
    1382             :                 mbl;
    1383             : 
    1384           0 :     l = mbl = pg_uhc_mblen(s);
    1385             : 
    1386           0 :     if (len < l)
    1387           0 :         return -1;
    1388             : 
    1389           0 :     while (--l > 0)             /* visit the mbl - 1 trailing bytes */
    1390             :     {
    1391           0 :         if (*++s == '\0')
    1392           0 :             return -1;
    1393             :     }
    1394             : 
    1395           0 :     return mbl;
    1396             : }
    1397             : /*
                     :  * Validate one GB18030 character starting at s, with len bytes available.
                     :  *
                     :  * Accepted forms:
                     :  *   a byte without the high bit                                (1 byte)
                     :  *   0x81..0xfe, 0x30..0x39, 0x81..0xfe, 0x30..0x39             (4 bytes)
                     :  *   0x81..0xfe followed by 0x40..0x7e or 0x80..0xfe            (2 bytes)
                     :  *
                     :  * The len tests come first in each condition, so s[1] is only read when
                     :  * at least two bytes are available.  A 4-byte candidate with fewer than
                     :  * four bytes available falls through to the 2-byte test, which rejects
                     :  * it because its second byte is a digit.  Returns the length in bytes,
                     :  * or -1 if no form matches.
                     :  */
    1398             : static int
    1399           0 : pg_gb18030_verifier(const unsigned char *s, int len)
    1400             : {
    1401             :     int         l;
    1402             : 
    1403           0 :     if (!IS_HIGHBIT_SET(*s))
    1404           0 :         l = 1;                  /* ASCII */
    1405           0 :     else if (len >= 4 && *(s + 1) >= 0x30 && *(s + 1) <= 0x39)
    1406             :     {
    1407             :         /* Should be 4-byte, validate remaining bytes */
    1408           0 :         if (*s >= 0x81 && *s <= 0xfe &&
    1409           0 :             *(s + 2) >= 0x81 && *(s + 2) <= 0xfe &&
    1410           0 :             *(s + 3) >= 0x30 && *(s + 3) <= 0x39)
    1411           0 :             l = 4;
    1412             :         else
    1413           0 :             l = -1;
    1414             :     }
    1415           0 :     else if (len >= 2 && *s >= 0x81 && *s <= 0xfe)
    1416             :     {
    1417             :         /* Should be 2-byte, validate */
    1418           0 :         if ((*(s + 1) >= 0x40 && *(s + 1) <= 0x7e) ||
    1419           0 :             (*(s + 1) >= 0x80 && *(s + 1) <= 0xfe))
    1420           0 :             l = 2;
    1421             :         else
    1422           0 :             l = -1;
    1423             :     }
    1424             :     else
    1425           0 :         l = -1;
    1426           0 :     return l;
    1427             : }
    1428             : 
    /*
     * Verify the UTF-8 character starting at s, given that len bytes are
     * available.  Returns its byte length, or -1 if it is truncated or is not
     * accepted by pg_utf8_islegal().
     */
    static int
    pg_utf8_verifier(const unsigned char *s, int len)
    {
        const int   nbytes = pg_utf_mblen(s);

        /* only inspect the bytes once we know they are all present */
        if (nbytes <= len && pg_utf8_islegal(s, nbytes))
            return nbytes;

        return -1;
    }
    1442             : 
    /*
     * Check for validity of a single UTF-8 encoded character
     *
     * This follows the rules of RFC3629.  Besides requiring every trailing
     * byte to be a continuation byte (0x80..0xBF), the second byte is further
     * restricted for certain lead bytes so that a character has only one
     * possible encoding: a longer-than-necessary sequence with high-order zero
     * bits is refused, since accepting it would be a security hazard (eg, an
     * apparent non-ASCII character that decodes to plain ASCII).
     *
     * length is assumed to have been obtained by pg_utf_mblen(), and the
     * caller must have checked that that many bytes are present in the buffer.
     * Lengths other than 1..4 are rejected (this includes 5 and 6, for now).
     */
    bool
    pg_utf8_islegal(const unsigned char *source, int length)
    {
        unsigned char lead;
        unsigned char second_min = 0x80;
        unsigned char second_max = 0xBF;
        int         i;

        if (length < 1 || length > 4)
            return false;

        lead = source[0];

        /* third and fourth bytes, if any, must be plain continuation bytes */
        for (i = length - 1; i >= 2; i--)
        {
            if (source[i] < 0x80 || source[i] > 0xBF)
                return false;
        }

        /* the allowed range of the second byte depends on the lead byte */
        if (length >= 2)
        {
            if (lead == 0xE0)
                second_min = 0xA0;
            else if (lead == 0xED)
                second_max = 0x9F;
            else if (lead == 0xF0)
                second_min = 0x90;
            else if (lead == 0xF4)
                second_max = 0x8F;

            if (source[1] < second_min || source[1] > second_max)
                return false;
        }

        /* finally, the lead byte itself: 0x80..0xC1 and above 0xF4 are illegal */
        if (lead >= 0x80 && lead < 0xC2)
            return false;
        if (lead > 0xF4)
            return false;

        return true;
    }
    1513             : 
    1514             : #ifndef FRONTEND
    1515             : 
    1516             : /*
    1517             :  * Generic character incrementer function.
    1518             :  *
    1519             :  * Not knowing anything about the properties of the encoding in use, we just
    1520             :  * keep incrementing the last byte until we get a validly-encoded result,
    1521             :  * or we run out of values to try.  We don't bother to try incrementing
    1522             :  * higher-order bytes, so there's no growth in runtime for wider characters.
    1523             :  * (If we did try to do that, we'd need to consider the likelihood that 255
    1524             :  * is not a valid final byte in the encoding.)
    1525             :  */
    1526             : static bool
    1527             : pg_generic_charinc(unsigned char *charptr, int len)
    1528             : {
    1529             :     unsigned char *lastbyte = charptr + len - 1;
    1530             :     mbverifier  mbverify;
    1531             : 
    1532             :     /* We can just invoke the character verifier directly. */
    1533             :     mbverify = pg_wchar_table[GetDatabaseEncoding()].mbverify;
    1534             : 
    1535             :     while (*lastbyte < (unsigned char) 255)
    1536             :     {
    1537             :         (*lastbyte)++;
    1538             :         if ((*mbverify) (charptr, len) == len)
    1539             :             return true;
    1540             :     }
    1541             : 
    1542             :     return false;
    1543             : }
    1544             : 
    /*
     * UTF-8 character incrementer function.
     *
     * Working from the last byte towards the first, increment the first byte
     * found that is still below its ceiling and stop.  The ceiling is 0xBF for
     * the third and fourth bytes.  For the second byte it is also 0xBF, except
     * after lead byte 0xED (ceiling 0x9F, which keeps us out of the region
     * used for surrogate pairs) and after lead byte 0xF4 (ceiling 0x8F).  If
     * no trailing byte can be raised, the lead byte is incremented, unless it
     * is already 0x7F, 0xDF, 0xEF or 0xF4, in which case we fail.
     *
     * Note that lower-order bytes are not reset to their minimums, since we
     * can't afford to make an exhaustive search (see make_greater_string).
     *
     * Lengths other than 1..4 are rejected (this includes 5 and 6, for now).
     */
    static bool
    pg_utf8_increment(unsigned char *charptr, int length)
    {
        unsigned char lead;
        int         pos;

        if (length < 1 || length > 4)
            return false;

        /* fourth and third bytes: any value below 0xBF can simply be bumped */
        for (pos = length - 1; pos >= 2; pos--)
        {
            if (charptr[pos] < 0xBF)
            {
                charptr[pos]++;
                return true;
            }
        }

        /* second byte: its ceiling depends on the lead byte */
        if (length >= 2)
        {
            unsigned char ceiling = 0xBF;

            if (charptr[0] == 0xED)
                ceiling = 0x9F;
            else if (charptr[0] == 0xF4)
                ceiling = 0x8F;

            if (charptr[1] < ceiling)
            {
                charptr[1]++;
                return true;
            }
        }

        /* lead byte: fail if it is at the top of its length class */
        lead = charptr[0];
        if (lead == 0x7F || lead == 0xDF || lead == 0xEF || lead == 0xF4)
            return false;

        charptr[0]++;
        return true;
    }
    1617             : 
    1618             : /*
    1619             :  * EUC-JP character incrementer function.
    1620             :  *
    1621             :  * If the sequence starts with SS2 (0x8e), it must be a two-byte sequence
    1622             :  * representing JIS X 0201 characters with the second byte ranging between
    1623             :  * 0xa1 and 0xdf.  We just increment the last byte if it's less than 0xdf,
    1624             :  * and otherwise rewrite the whole sequence to 0xa1 0xa1.
    1625             :  *
    1626             :  * If the sequence starts with SS3 (0x8f), it must be a three-byte sequence
    1627             :  * in which the last two bytes range between 0xa1 and 0xfe.  The last byte
    1628             :  * is incremented if possible, otherwise the second-to-last byte.
    1629             :  *
    1630             :  * If the sequence starts with a value other than the above and its MSB
    1631             :  * is set, it must be a two-byte sequence representing JIS X 0208 characters
    1632             :  * with both bytes ranging between 0xa1 and 0xfe.  The last byte is
    1633             :  * incremented if possible, otherwise the second-to-last byte.
    1634             :  *
    1635             :  * Otherwise, the sequence is a single-byte ASCII character. It is
    1636             :  * incremented up to 0x7f.
    1637             :  */
    1638             : static bool
    1639             : pg_eucjp_increment(unsigned char *charptr, int length)
    1640             : {
    1641             :     unsigned char c1,
    1642             :                 c2;
    1643             :     int         i;
    1644             : 
    1645             :     c1 = *charptr;
    1646             : 
    1647             :     switch (c1)
    1648             :     {
    1649             :         case SS2:               /* JIS X 0201 */
    1650             :             if (length != 2)
    1651             :                 return false;
    1652             : 
    1653             :             c2 = charptr[1];
    1654             : 
    1655             :             if (c2 >= 0xdf)
    1656             :                 charptr[0] = charptr[1] = 0xa1;
    1657             :             else if (c2 < 0xa1)
    1658             :                 charptr[1] = 0xa1;
    1659             :             else
    1660             :                 charptr[1]++;
    1661             :             break;
    1662             : 
    1663             :         case SS3:               /* JIS X 0212 */
    1664             :             if (length != 3)
    1665             :                 return false;
    1666             : 
    1667             :             for (i = 2; i > 0; i--)
    1668             :             {
    1669             :                 c2 = charptr[i];
    1670             :                 if (c2 < 0xa1)
    1671             :                 {
    1672             :                     charptr[i] = 0xa1;
    1673             :                     return true;
    1674             :                 }
    1675             :                 else if (c2 < 0xfe)
    1676             :                 {
    1677             :                     charptr[i]++;
    1678             :                     return true;
    1679             :                 }
    1680             :             }
    1681             : 
    1682             :             /* Out of 3-byte code region */
    1683             :             return false;
    1684             : 
    1685             :         default:
    1686             :             if (IS_HIGHBIT_SET(c1)) /* JIS X 0208? */
    1687             :             {
    1688             :                 if (length != 2)
    1689             :                     return false;
    1690             : 
    1691             :                 for (i = 1; i >= 0; i--)
    1692             :                 {
    1693             :                     c2 = charptr[i];
    1694             :                     if (c2 < 0xa1)
    1695             :                     {
    1696             :                         charptr[i] = 0xa1;
    1697             :                         return true;
    1698             :                     }
    1699             :                     else if (c2 < 0xfe)
    1700             :                     {
    1701             :                         charptr[i]++;
    1702             :                         return true;
    1703             :                     }
    1704             :                 }
    1705             : 
    1706             :                 /* Out of 2 byte code region */
    1707             :                 return false;
    1708             :             }
    1709             :             else
    1710             :             {                   /* ASCII, single byte */
    1711             :                 if (c1 > 0x7e)
    1712             :                     return false;
    1713             :                 (*charptr)++;
    1714             :             }
    1715             :             break;
    1716             :     }
    1717             : 
    1718             :     return true;
    1719             : }
    1720             : #endif                          /* !FRONTEND */
    1721             : 
    1722             : 
    1723             : /*
    1724             :  *-------------------------------------------------------------------
    1725             :  * encoding info table
    1726             :  * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
                     :  *
                     :  * The table is indexed by encoding ID.  Each row holds six entries.
                     :  * Judging by the function names, the first two are the converters
                     :  * from a multibyte string to pg_wchar and back; they are 0 for the
                     :  * encodings at the end of the table (the file header notes that
                     :  * client-only encodings such as SJIS need no mb2wchar_with_len()).
                     :  * The remaining four are the entries this file reads as .mblen,
                     :  * .dsplen, .mbverify and .maxmblen: byte length of a character,
                     :  * display length of a character, single-character verifier, and
                     :  * maximum byte length of a character in the encoding.
    1727             :  *-------------------------------------------------------------------
    1728             :  */
    1729             : const pg_wchar_tbl pg_wchar_table[] = {
    1730             :     {pg_ascii2wchar_with_len, pg_wchar2single_with_len, pg_ascii_mblen, pg_ascii_dsplen, pg_ascii_verifier, 1}, /* PG_SQL_ASCII */
    1731             :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JP */
    1732             :     {pg_euccn2wchar_with_len, pg_wchar2euc_with_len, pg_euccn_mblen, pg_euccn_dsplen, pg_euccn_verifier, 2},    /* PG_EUC_CN */
    1733             :     {pg_euckr2wchar_with_len, pg_wchar2euc_with_len, pg_euckr_mblen, pg_euckr_dsplen, pg_euckr_verifier, 3},    /* PG_EUC_KR */
    1734             :     {pg_euctw2wchar_with_len, pg_wchar2euc_with_len, pg_euctw_mblen, pg_euctw_dsplen, pg_euctw_verifier, 4},    /* PG_EUC_TW */
    1735             :     {pg_eucjp2wchar_with_len, pg_wchar2euc_with_len, pg_eucjp_mblen, pg_eucjp_dsplen, pg_eucjp_verifier, 3},    /* PG_EUC_JIS_2004 */
    1736             :     {pg_utf2wchar_with_len, pg_wchar2utf_with_len, pg_utf_mblen, pg_utf_dsplen, pg_utf8_verifier, 4},   /* PG_UTF8 */
    1737             :     {pg_mule2wchar_with_len, pg_wchar2mule_with_len, pg_mule_mblen, pg_mule_dsplen, pg_mule_verifier, 4},   /* PG_MULE_INTERNAL */
    1738             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN1 */
    1739             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN2 */
    1740             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN3 */
    1741             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN4 */
    1742             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN5 */
    1743             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN6 */
    1744             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN7 */
    1745             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN8 */
    1746             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN9 */
    1747             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_LATIN10 */
    1748             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1256 */
    1749             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1258 */
    1750             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN866 */
    1751             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN874 */
    1752             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8R */
    1753             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1251 */
    1754             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1252 */
    1755             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-5 */
    1756             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-6 */
    1757             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-7 */
    1758             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* ISO-8859-8 */
    1759             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1250 */
    1760             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1253 */
    1761             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1254 */
    1762             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1255 */
    1763             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_WIN1257 */
    1764             :     {pg_latin12wchar_with_len, pg_wchar2single_with_len, pg_latin1_mblen, pg_latin1_dsplen, pg_latin1_verifier, 1}, /* PG_KOI8U */
    1765             :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}, /* PG_SJIS */
    1766             :     {0, 0, pg_big5_mblen, pg_big5_dsplen, pg_big5_verifier, 2}, /* PG_BIG5 */
    1767             :     {0, 0, pg_gbk_mblen, pg_gbk_dsplen, pg_gbk_verifier, 2},    /* PG_GBK */
    1768             :     {0, 0, pg_uhc_mblen, pg_uhc_dsplen, pg_uhc_verifier, 2},    /* PG_UHC */
    1769             :     {0, 0, pg_gb18030_mblen, pg_gb18030_dsplen, pg_gb18030_verifier, 4},    /* PG_GB18030 */
    1770             :     {0, 0, pg_johab_mblen, pg_johab_dsplen, pg_johab_verifier, 3},  /* PG_JOHAB */
    1771             :     {0, 0, pg_sjis_mblen, pg_sjis_dsplen, pg_sjis_verifier, 2}  /* PG_SHIFT_JIS_2004 */
    1772             : };
    1773             : 
    /*
     * Returns the byte length of the character at mbstr in mule internal
     * code; this is simply what pg_mule_mblen() reports.
     */
    int
    pg_mic_mblen(const unsigned char *mbstr)
    {
        const int   nbytes = pg_mule_mblen(mbstr);

        return nbytes;
    }
    1780             : 
    1781             : /*
    1782             :  * Returns the byte length of a multibyte character.
    1783             :  */
    1784             : int
    1785     3541765 : pg_encoding_mblen(int encoding, const char *mbstr)
    1786             : {
    1787     7083530 :     return (PG_VALID_ENCODING(encoding) ?
    1788     7083530 :             ((*pg_wchar_table[encoding].mblen) ((const unsigned char *) mbstr)) :
    1789           0 :             ((*pg_wchar_table[PG_SQL_ASCII].mblen) ((const unsigned char *) mbstr)));
    1790             : }
    1791             : 
    1792             : /*
    1793             :  * Returns the display length of a multibyte character.
    1794             :  */
    1795             : int
    1796     3538881 : pg_encoding_dsplen(int encoding, const char *mbstr)
    1797             : {
    1798     7077762 :     return (PG_VALID_ENCODING(encoding) ?
    1799     7077762 :             ((*pg_wchar_table[encoding].dsplen) ((const unsigned char *) mbstr)) :
    1800           0 :             ((*pg_wchar_table[PG_SQL_ASCII].dsplen) ((const unsigned char *) mbstr)));
    1801             : }
    1802             : 
    1803             : /*
    1804             :  * Verify the first multibyte character of the given string.
    1805             :  * Return its byte length if good, -1 if bad.  (See comments above for
    1806             :  * full details of the mbverify API.)
    1807             :  */
    1808             : int
    1809           0 : pg_encoding_verifymb(int encoding, const char *mbstr, int len)
    1810             : {
    1811           0 :     return (PG_VALID_ENCODING(encoding) ?
    1812           0 :             ((*pg_wchar_table[encoding].mbverify) ((const unsigned char *) mbstr, len)) :
    1813           0 :             ((*pg_wchar_table[PG_SQL_ASCII].mbverify) ((const unsigned char *) mbstr, len)));
    1814             : }
    1815             : 
    1816             : /*
    1817             :  * fetch maximum length of a given encoding
    1818             :  */
    1819             : int
    1820         891 : pg_encoding_max_length(int encoding)
    1821             : {
    1822         891 :     Assert(PG_VALID_ENCODING(encoding));
    1823             : 
    1824         891 :     return pg_wchar_table[encoding].maxmblen;
    1825             : }
    1826             : 
    1827             : #ifndef FRONTEND
    1828             : 
    1829             : /*
    1830             :  * fetch maximum length of the encoding for the current database
    1831             :  */
    1832             : int
    1833             : pg_database_encoding_max_length(void)
    1834             : {
    1835             :     return pg_wchar_table[GetDatabaseEncoding()].maxmblen;
    1836             : }
    1837             : 
    1838             : /*
    1839             :  * get the character incrementer for the encoding for the current database
    1840             :  */
    1841             : mbcharacter_incrementer
    1842             : pg_database_encoding_character_incrementer(void)
    1843             : {
    1844             :     /*
    1845             :      * Eventually it might be best to add a field to pg_wchar_table[], but for
    1846             :      * now we just use a switch.
    1847             :      */
    1848             :     switch (GetDatabaseEncoding())
    1849             :     {
    1850             :         case PG_UTF8:
    1851             :             return pg_utf8_increment;
    1852             : 
    1853             :         case PG_EUC_JP:
    1854             :             return pg_eucjp_increment;
    1855             : 
    1856             :         default:
    1857             :             return pg_generic_charinc;
    1858             :     }
    1859             : }
    1860             : 
    /*
     * Verify that mbstr (len bytes) is validly encoded in the current
     * database encoding.  Otherwise same as pg_verify_mbstr().
     *
     * Returns true when pg_verify_mbstr_len() reports a non-negative length.
     */
    bool
    pg_verifymbstr(const char *mbstr, int len, bool noError)
    {
        const int   nchars = pg_verify_mbstr_len(GetDatabaseEncoding(),
                                                 mbstr, len, noError);

        return nchars >= 0;
    }
    1871             : 
    /*
     * Verify that mbstr (len bytes) is validly encoded in the specified
     * encoding.
     *
     * Returns true when pg_verify_mbstr_len() reports a non-negative length.
     */
    bool
    pg_verify_mbstr(int encoding, const char *mbstr, int len, bool noError)
    {
        const int   nchars = pg_verify_mbstr_len(encoding, mbstr, len, noError);

        return nchars >= 0;
    }
    1881             : 
    1882             : /*
    1883             :  * Verify mbstr to make sure that it is validly encoded in the specified
    1884             :  * encoding.
    1885             :  *
    1886             :  * mbstr is not necessarily zero terminated; length of mbstr is
    1887             :  * specified by len.
    1888             :  *
    1889             :  * If OK, return length of string in the encoding.
    1890             :  * If a problem is found, return -1 when noError is
    1891             :  * true; when noError is false, ereport() a descriptive message.
    1892             :  */
    1893             : int
    1894             : pg_verify_mbstr_len(int encoding, const char *mbstr, int len, bool noError)
    1895             : {
    1896             :     mbverifier  mbverify;
    1897             :     int         mb_len;
    1898             : 
    1899             :     Assert(PG_VALID_ENCODING(encoding));
    1900             : 
    1901             :     /*
    1902             :      * In single-byte encodings, we need only reject nulls (\0).
    1903             :      */
    1904             :     if (pg_encoding_max_length(encoding) <= 1)
    1905             :     {
    1906             :         const char *nullpos = memchr(mbstr, 0, len);
    1907             : 
    1908             :         if (nullpos == NULL)
    1909             :             return len;
    1910             :         if (noError)
    1911             :             return -1;
    1912             :         report_invalid_encoding(encoding, nullpos, 1);
    1913             :     }
    1914             : 
    1915             :     /* fetch function pointer just once */
    1916             :     mbverify = pg_wchar_table[encoding].mbverify;
    1917             : 
    1918             :     mb_len = 0;
    1919             : 
    1920             :     while (len > 0)
    1921             :     {
    1922             :         int         l;
    1923             : 
    1924             :         /* fast path for ASCII-subset characters */
    1925             :         if (!IS_HIGHBIT_SET(*mbstr))
    1926             :         {
    1927             :             if (*mbstr != '\0')
    1928             :             {
    1929             :                 mb_len++;
    1930             :                 mbstr++;
    1931             :                 len--;
    1932             :                 continue;
    1933             :             }
    1934             :             if (noError)
    1935             :                 return -1;
    1936             :             report_invalid_encoding(encoding, mbstr, len);
    1937             :         }
    1938             : 
    1939             :         l = (*mbverify) ((const unsigned char *) mbstr, len);
    1940             : 
    1941             :         if (l < 0)
    1942             :         {
    1943             :             if (noError)
    1944             :                 return -1;
    1945             :             report_invalid_encoding(encoding, mbstr, len);
    1946             :         }
    1947             : 
    1948             :         mbstr += l;
    1949             :         len -= l;
    1950             :         mb_len++;
    1951             :     }
    1952             :     return mb_len;
    1953             : }
    1954             : 
    1955             : /*
    1956             :  * check_encoding_conversion_args: check arguments of a conversion function
    1957             :  *
    1958             :  * "expected" arguments can be either an encoding ID or -1 to indicate that
    1959             :  * the caller will check whether it accepts the ID.
    1960             :  *
    1961             :  * Note: the errors here are not really user-facing, so elog instead of
    1962             :  * ereport seems sufficient.  Also, we trust that the "expected" encoding
    1963             :  * arguments are valid encoding IDs, but we don't trust the actuals.
    1964             :  */
    1965             : void
    1966             : check_encoding_conversion_args(int src_encoding,
    1967             :                                int dest_encoding,
    1968             :                                int len,
    1969             :                                int expected_src_encoding,
    1970             :                                int expected_dest_encoding)
    1971             : {
    1972             :     if (!PG_VALID_ENCODING(src_encoding))
    1973             :         elog(ERROR, "invalid source encoding ID: %d", src_encoding);
    1974             :     if (src_encoding != expected_src_encoding && expected_src_encoding >= 0)
    1975             :         elog(ERROR, "expected source encoding \"%s\", but got \"%s\"",
    1976             :              pg_enc2name_tbl[expected_src_encoding].name,
    1977             :              pg_enc2name_tbl[src_encoding].name);
    1978             :     if (!PG_VALID_ENCODING(dest_encoding))
    1979             :         elog(ERROR, "invalid destination encoding ID: %d", dest_encoding);
    1980             :     if (dest_encoding != expected_dest_encoding && expected_dest_encoding >= 0)
    1981             :         elog(ERROR, "expected destination encoding \"%s\", but got \"%s\"",
    1982             :              pg_enc2name_tbl[expected_dest_encoding].name,
    1983             :              pg_enc2name_tbl[dest_encoding].name);
    1984             :     if (len < 0)
    1985             :         elog(ERROR, "encoding conversion length must not be negative");
    1986             : }
    1987             : 
    1988             : /*
    1989             :  * report_invalid_encoding: complain about invalid multibyte character
    1990             :  *
    1991             :  * note: len is remaining length of string, not length of character;
    1992             :  * len must be greater than zero, as we always examine the first byte.
    1993             :  */
    1994             : void
    1995             : report_invalid_encoding(int encoding, const char *mbstr, int len)
    1996             : {
    1997             :     int         l = pg_encoding_mblen(encoding, mbstr);
    1998             :     char        buf[8 * 5 + 1];
    1999             :     char       *p = buf;
    2000             :     int         j,
    2001             :                 jlimit;
    2002             : 
    2003             :     jlimit = Min(l, len);
    2004             :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    2005             : 
    2006             :     for (j = 0; j < jlimit; j++)
    2007             :     {
    2008             :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    2009             :         if (j < jlimit - 1)
    2010             :             p += sprintf(p, " ");
    2011             :     }
    2012             : 
    2013             :     ereport(ERROR,
    2014             :             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
    2015             :              errmsg("invalid byte sequence for encoding \"%s\": %s",
    2016             :                     pg_enc2name_tbl[encoding].name,
    2017             :                     buf)));
    2018             : }
    2019             : 
    2020             : /*
    2021             :  * report_untranslatable_char: complain about untranslatable character
    2022             :  *
    2023             :  * note: len is remaining length of string, not length of character;
    2024             :  * len must be greater than zero, as we always examine the first byte.
    2025             :  */
    2026             : void
    2027             : report_untranslatable_char(int src_encoding, int dest_encoding,
    2028             :                            const char *mbstr, int len)
    2029             : {
    2030             :     int         l = pg_encoding_mblen(src_encoding, mbstr);
    2031             :     char        buf[8 * 5 + 1];
    2032             :     char       *p = buf;
    2033             :     int         j,
    2034             :                 jlimit;
    2035             : 
    2036             :     jlimit = Min(l, len);
    2037             :     jlimit = Min(jlimit, 8);    /* prevent buffer overrun */
    2038             : 
    2039             :     for (j = 0; j < jlimit; j++)
    2040             :     {
    2041             :         p += sprintf(p, "0x%02x", (unsigned char) mbstr[j]);
    2042             :         if (j < jlimit - 1)
    2043             :             p += sprintf(p, " ");
    2044             :     }
    2045             : 
    2046             :     ereport(ERROR,
    2047             :             (errcode(ERRCODE_UNTRANSLATABLE_CHARACTER),
    2048             :              errmsg("character with byte sequence %s in encoding \"%s\" has no equivalent in encoding \"%s\"",
    2049             :                     buf,
    2050             :                     pg_enc2name_tbl[src_encoding].name,
    2051             :                     pg_enc2name_tbl[dest_encoding].name)));
    2052             : }
    2053             : 
    2054             : #endif                          /* !FRONTEND */

Generated by: LCOV version 1.11