Line data Source code
1 : /*
2 : * Encoding names and routines for working with them. Everything
3 : * in this file is shared between the frontend (FE) and the backend (BE).
4 : *
5 : * src/backend/utils/mb/encnames.c
6 : */
7 : #ifdef FRONTEND
8 : #include "postgres_fe.h"
9 : #else
10 : #include "postgres.h"
11 : #include "utils/builtins.h"
12 : #endif
13 :
14 : #include <ctype.h>
15 : #include <unistd.h>
16 :
17 : #include "mb/pg_wchar.h"
18 :
19 :
20 : /* ----------
21 : * All encoding names, sorted: *** A L P H A B E T I C ***
22 : *
23 : * All names must be without irrelevant chars, search routines use
24 : * isalnum() chars only. It means ISO-8859-1, iso_8859-1 and Iso8859_1
25 : * are always converted to 'iso88591'. All must be lower case.
26 : *
27 : * The table doesn't contain 'cs' aliases (like csISOLatin1). Are they needed?
28 : *
29 : * Karel Zak, Aug 2001
30 : * ----------
31 : */
32 : typedef struct pg_encname
33 : {
34 : const char *name;
35 : pg_enc encoding;
36 : } pg_encname;
37 :
38 : static const pg_encname pg_encname_tbl[] =
39 : {
40 : {
41 : "abc", PG_WIN1258
42 : }, /* alias for WIN1258 */
43 : {
44 : "alt", PG_WIN866
45 : }, /* IBM866 */
46 : {
47 : "big5", PG_BIG5
48 : }, /* Big5; Chinese for Taiwan multibyte set */
49 : {
50 : "euccn", PG_EUC_CN
51 : }, /* EUC-CN; Extended Unix Code for simplified
52 : * Chinese */
53 : {
54 : "eucjis2004", PG_EUC_JIS_2004
55 : }, /* EUC-JIS-2004; Extended UNIX Code fixed
56 : * Width for Japanese, standard JIS X 0213 */
57 : {
58 : "eucjp", PG_EUC_JP
59 : }, /* EUC-JP; Extended UNIX Code fixed Width for
60 : * Japanese, standard OSF */
61 : {
62 : "euckr", PG_EUC_KR
63 : }, /* EUC-KR; Extended Unix Code for Korean , KS
64 : * X 1001 standard */
65 : {
66 : "euctw", PG_EUC_TW
67 : }, /* EUC-TW; Extended Unix Code for
68 : *
69 : * traditional Chinese */
70 : {
71 : "gb18030", PG_GB18030
72 : }, /* GB18030;GB18030 */
73 : {
74 : "gbk", PG_GBK
75 : }, /* GBK; Chinese Windows CodePage 936
76 : * simplified Chinese */
77 : {
78 : "iso88591", PG_LATIN1
79 : }, /* ISO-8859-1; RFC1345,KXS2 */
80 : {
81 : "iso885910", PG_LATIN6
82 : }, /* ISO-8859-10; RFC1345,KXS2 */
83 : {
84 : "iso885913", PG_LATIN7
85 : }, /* ISO-8859-13; RFC1345,KXS2 */
86 : {
87 : "iso885914", PG_LATIN8
88 : }, /* ISO-8859-14; RFC1345,KXS2 */
89 : {
90 : "iso885915", PG_LATIN9
91 : }, /* ISO-8859-15; RFC1345,KXS2 */
92 : {
93 : "iso885916", PG_LATIN10
94 : }, /* ISO-8859-16; RFC1345,KXS2 */
95 : {
96 : "iso88592", PG_LATIN2
97 : }, /* ISO-8859-2; RFC1345,KXS2 */
98 : {
99 : "iso88593", PG_LATIN3
100 : }, /* ISO-8859-3; RFC1345,KXS2 */
101 : {
102 : "iso88594", PG_LATIN4
103 : }, /* ISO-8859-4; RFC1345,KXS2 */
104 : {
105 : "iso88595", PG_ISO_8859_5
106 : }, /* ISO-8859-5; RFC1345,KXS2 */
107 : {
108 : "iso88596", PG_ISO_8859_6
109 : }, /* ISO-8859-6; RFC1345,KXS2 */
110 : {
111 : "iso88597", PG_ISO_8859_7
112 : }, /* ISO-8859-7; RFC1345,KXS2 */
113 : {
114 : "iso88598", PG_ISO_8859_8
115 : }, /* ISO-8859-8; RFC1345,KXS2 */
116 : {
117 : "iso88599", PG_LATIN5
118 : }, /* ISO-8859-9; RFC1345,KXS2 */
119 : {
120 : "johab", PG_JOHAB
121 : }, /* JOHAB; Extended Unix Code for simplified
122 : * Chinese */
123 : {
124 : "koi8", PG_KOI8R
125 : }, /* _dirty_ alias for KOI8-R (backward
126 : * compatibility) */
127 : {
128 : "koi8r", PG_KOI8R
129 : }, /* KOI8-R; RFC1489 */
130 : {
131 : "koi8u", PG_KOI8U
132 : }, /* KOI8-U; RFC2319 */
133 : {
134 : "latin1", PG_LATIN1
135 : }, /* alias for ISO-8859-1 */
136 : {
137 : "latin10", PG_LATIN10
138 : }, /* alias for ISO-8859-16 */
139 : {
140 : "latin2", PG_LATIN2
141 : }, /* alias for ISO-8859-2 */
142 : {
143 : "latin3", PG_LATIN3
144 : }, /* alias for ISO-8859-3 */
145 : {
146 : "latin4", PG_LATIN4
147 : }, /* alias for ISO-8859-4 */
148 : {
149 : "latin5", PG_LATIN5
150 : }, /* alias for ISO-8859-9 */
151 : {
152 : "latin6", PG_LATIN6
153 : }, /* alias for ISO-8859-10 */
154 : {
155 : "latin7", PG_LATIN7
156 : }, /* alias for ISO-8859-13 */
157 : {
158 : "latin8", PG_LATIN8
159 : }, /* alias for ISO-8859-14 */
160 : {
161 : "latin9", PG_LATIN9
162 : }, /* alias for ISO-8859-15 */
163 : {
164 : "mskanji", PG_SJIS
165 : }, /* alias for Shift_JIS */
166 : {
167 : "muleinternal", PG_MULE_INTERNAL
168 : },
169 : {
170 : "shiftjis", PG_SJIS
171 : }, /* Shift_JIS; JIS X 0202-1991 */
172 :
173 : {
174 : "shiftjis2004", PG_SHIFT_JIS_2004
175 : }, /* SHIFT-JIS-2004; Shift JIS for Japanese,
176 : * standard JIS X 0213 */
177 : {
178 : "sjis", PG_SJIS
179 : }, /* alias for Shift_JIS */
180 : {
181 : "sqlascii", PG_SQL_ASCII
182 : },
183 : {
184 : "tcvn", PG_WIN1258
185 : }, /* alias for WIN1258 */
186 : {
187 : "tcvn5712", PG_WIN1258
188 : }, /* alias for WIN1258 */
189 : {
190 : "uhc", PG_UHC
191 : }, /* UHC; Korean Windows CodePage 949 */
192 : {
193 : "unicode", PG_UTF8
194 : }, /* alias for UTF8 */
195 : {
196 : "utf8", PG_UTF8
197 : }, /* alias for UTF8 */
198 : {
199 : "vscii", PG_WIN1258
200 : }, /* alias for WIN1258 */
201 : {
202 : "win", PG_WIN1251
203 : }, /* _dirty_ alias for windows-1251 (backward
204 : * compatibility) */
205 : {
206 : "win1250", PG_WIN1250
207 : }, /* alias for Windows-1250 */
208 : {
209 : "win1251", PG_WIN1251
210 : }, /* alias for Windows-1251 */
211 : {
212 : "win1252", PG_WIN1252
213 : }, /* alias for Windows-1252 */
214 : {
215 : "win1253", PG_WIN1253
216 : }, /* alias for Windows-1253 */
217 : {
218 : "win1254", PG_WIN1254
219 : }, /* alias for Windows-1254 */
220 : {
221 : "win1255", PG_WIN1255
222 : }, /* alias for Windows-1255 */
223 : {
224 : "win1256", PG_WIN1256
225 : }, /* alias for Windows-1256 */
226 : {
227 : "win1257", PG_WIN1257
228 : }, /* alias for Windows-1257 */
229 : {
230 : "win1258", PG_WIN1258
231 : }, /* alias for Windows-1258 */
232 : {
233 : "win866", PG_WIN866
234 : }, /* IBM866 */
235 : {
236 : "win874", PG_WIN874
237 : }, /* alias for Windows-874 */
238 : {
239 : "win932", PG_SJIS
240 : }, /* alias for Shift_JIS */
241 : {
242 : "win936", PG_GBK
243 : }, /* alias for GBK */
244 : {
245 : "win949", PG_UHC
246 : }, /* alias for UHC */
247 : {
248 : "win950", PG_BIG5
249 : }, /* alias for BIG5 */
250 : {
251 : "windows1250", PG_WIN1250
252 : }, /* Windows-1251; Microsoft */
253 : {
254 : "windows1251", PG_WIN1251
255 : }, /* Windows-1251; Microsoft */
256 : {
257 : "windows1252", PG_WIN1252
258 : }, /* Windows-1252; Microsoft */
259 : {
260 : "windows1253", PG_WIN1253
261 : }, /* Windows-1253; Microsoft */
262 : {
263 : "windows1254", PG_WIN1254
264 : }, /* Windows-1254; Microsoft */
265 : {
266 : "windows1255", PG_WIN1255
267 : }, /* Windows-1255; Microsoft */
268 : {
269 : "windows1256", PG_WIN1256
270 : }, /* Windows-1256; Microsoft */
271 : {
272 : "windows1257", PG_WIN1257
273 : }, /* Windows-1257; Microsoft */
274 : {
275 : "windows1258", PG_WIN1258
276 : }, /* Windows-1258; Microsoft */
277 : {
278 : "windows866", PG_WIN866
279 : }, /* IBM866 */
280 : {
281 : "windows874", PG_WIN874
282 : }, /* Windows-874; Microsoft */
283 : {
284 : "windows932", PG_SJIS
285 : }, /* alias for Shift_JIS */
286 : {
287 : "windows936", PG_GBK
288 : }, /* alias for GBK */
289 : {
290 : "windows949", PG_UHC
291 : }, /* alias for UHC */
292 : {
293 : "windows950", PG_BIG5
294 : } /* alias for BIG5 */
295 : };
296 :
297 : /* ----------
298 : * These are "official" encoding names.
299 : * XXX must be sorted by the same order as enum pg_enc (in mb/pg_wchar.h)
300 : * ----------
301 : */
302 : #ifndef WIN32
303 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name }
304 : #else
305 : #define DEF_ENC2NAME(name, codepage) { #name, PG_##name, codepage }
306 : #endif
307 : const pg_enc2name pg_enc2name_tbl[] =
308 : {
309 : DEF_ENC2NAME(SQL_ASCII, 0),
310 : DEF_ENC2NAME(EUC_JP, 20932),
311 : DEF_ENC2NAME(EUC_CN, 20936),
312 : DEF_ENC2NAME(EUC_KR, 51949),
313 : DEF_ENC2NAME(EUC_TW, 0),
314 : DEF_ENC2NAME(EUC_JIS_2004, 20932),
315 : DEF_ENC2NAME(UTF8, 65001),
316 : DEF_ENC2NAME(MULE_INTERNAL, 0),
317 : DEF_ENC2NAME(LATIN1, 28591),
318 : DEF_ENC2NAME(LATIN2, 28592),
319 : DEF_ENC2NAME(LATIN3, 28593),
320 : DEF_ENC2NAME(LATIN4, 28594),
321 : DEF_ENC2NAME(LATIN5, 28599),
322 : DEF_ENC2NAME(LATIN6, 0),
323 : DEF_ENC2NAME(LATIN7, 0),
324 : DEF_ENC2NAME(LATIN8, 0),
325 : DEF_ENC2NAME(LATIN9, 28605),
326 : DEF_ENC2NAME(LATIN10, 0),
327 : DEF_ENC2NAME(WIN1256, 1256),
328 : DEF_ENC2NAME(WIN1258, 1258),
329 : DEF_ENC2NAME(WIN866, 866),
330 : DEF_ENC2NAME(WIN874, 874),
331 : DEF_ENC2NAME(KOI8R, 20866),
332 : DEF_ENC2NAME(WIN1251, 1251),
333 : DEF_ENC2NAME(WIN1252, 1252),
334 : DEF_ENC2NAME(ISO_8859_5, 28595),
335 : DEF_ENC2NAME(ISO_8859_6, 28596),
336 : DEF_ENC2NAME(ISO_8859_7, 28597),
337 : DEF_ENC2NAME(ISO_8859_8, 28598),
338 : DEF_ENC2NAME(WIN1250, 1250),
339 : DEF_ENC2NAME(WIN1253, 1253),
340 : DEF_ENC2NAME(WIN1254, 1254),
341 : DEF_ENC2NAME(WIN1255, 1255),
342 : DEF_ENC2NAME(WIN1257, 1257),
343 : DEF_ENC2NAME(KOI8U, 21866),
344 : DEF_ENC2NAME(SJIS, 932),
345 : DEF_ENC2NAME(BIG5, 950),
346 : DEF_ENC2NAME(GBK, 936),
347 : DEF_ENC2NAME(UHC, 949),
348 : DEF_ENC2NAME(GB18030, 54936),
349 : DEF_ENC2NAME(JOHAB, 0),
350 : DEF_ENC2NAME(SHIFT_JIS_2004, 932)
351 : };
352 :
353 : /* ----------
354 : * These are encoding names for gettext.
355 : *
356 : * This covers all encodings except MULE_INTERNAL, which is alien to gettext.
357 : * ----------
358 : */
359 : const pg_enc2gettext pg_enc2gettext_tbl[] =
360 : {
361 : {PG_SQL_ASCII, "US-ASCII"},
362 : {PG_UTF8, "UTF-8"},
363 : {PG_LATIN1, "LATIN1"},
364 : {PG_LATIN2, "LATIN2"},
365 : {PG_LATIN3, "LATIN3"},
366 : {PG_LATIN4, "LATIN4"},
367 : {PG_ISO_8859_5, "ISO-8859-5"},
368 : {PG_ISO_8859_6, "ISO_8859-6"},
369 : {PG_ISO_8859_7, "ISO-8859-7"},
370 : {PG_ISO_8859_8, "ISO-8859-8"},
371 : {PG_LATIN5, "LATIN5"},
372 : {PG_LATIN6, "LATIN6"},
373 : {PG_LATIN7, "LATIN7"},
374 : {PG_LATIN8, "LATIN8"},
375 : {PG_LATIN9, "LATIN-9"},
376 : {PG_LATIN10, "LATIN10"},
377 : {PG_KOI8R, "KOI8-R"},
378 : {PG_KOI8U, "KOI8-U"},
379 : {PG_WIN1250, "CP1250"},
380 : {PG_WIN1251, "CP1251"},
381 : {PG_WIN1252, "CP1252"},
382 : {PG_WIN1253, "CP1253"},
383 : {PG_WIN1254, "CP1254"},
384 : {PG_WIN1255, "CP1255"},
385 : {PG_WIN1256, "CP1256"},
386 : {PG_WIN1257, "CP1257"},
387 : {PG_WIN1258, "CP1258"},
388 : {PG_WIN866, "CP866"},
389 : {PG_WIN874, "CP874"},
390 : {PG_EUC_CN, "EUC-CN"},
391 : {PG_EUC_JP, "EUC-JP"},
392 : {PG_EUC_KR, "EUC-KR"},
393 : {PG_EUC_TW, "EUC-TW"},
394 : {PG_EUC_JIS_2004, "EUC-JP"},
395 : {PG_SJIS, "SHIFT-JIS"},
396 : {PG_BIG5, "BIG5"},
397 : {PG_GBK, "GBK"},
398 : {PG_UHC, "UHC"},
399 : {PG_GB18030, "GB18030"},
400 : {PG_JOHAB, "JOHAB"},
401 : {PG_SHIFT_JIS_2004, "SHIFT_JISX0213"},
402 : {0, NULL}
403 : };
404 :
405 :
406 : #ifndef FRONTEND
407 :
408 : /*
409 : * Table of encoding names for ICU
410 : *
411 : * Reference: <https://ssl.icu-project.org/icu-bin/convexp>
412 : *
413 : * NULL entries are not supported by ICU, or their mapping is unclear.
414 : */
415 : static const char *const pg_enc2icu_tbl[] =
416 : {
417 : NULL, /* PG_SQL_ASCII */
418 : "EUC-JP", /* PG_EUC_JP */
419 : "EUC-CN", /* PG_EUC_CN */
420 : "EUC-KR", /* PG_EUC_KR */
421 : "EUC-TW", /* PG_EUC_TW */
422 : NULL, /* PG_EUC_JIS_2004 */
423 : "UTF-8", /* PG_UTF8 */
424 : NULL, /* PG_MULE_INTERNAL */
425 : "ISO-8859-1", /* PG_LATIN1 */
426 : "ISO-8859-2", /* PG_LATIN2 */
427 : "ISO-8859-3", /* PG_LATIN3 */
428 : "ISO-8859-4", /* PG_LATIN4 */
429 : "ISO-8859-9", /* PG_LATIN5 */
430 : "ISO-8859-10", /* PG_LATIN6 */
431 : "ISO-8859-13", /* PG_LATIN7 */
432 : "ISO-8859-14", /* PG_LATIN8 */
433 : "ISO-8859-15", /* PG_LATIN9 */
434 : NULL, /* PG_LATIN10 */
435 : "CP1256", /* PG_WIN1256 */
436 : "CP1258", /* PG_WIN1258 */
437 : "CP866", /* PG_WIN866 */
438 : NULL, /* PG_WIN874 */
439 : "KOI8-R", /* PG_KOI8R */
440 : "CP1251", /* PG_WIN1251 */
441 : "CP1252", /* PG_WIN1252 */
442 : "ISO-8859-5", /* PG_ISO_8859_5 */
443 : "ISO-8859-6", /* PG_ISO_8859_6 */
444 : "ISO-8859-7", /* PG_ISO_8859_7 */
445 : "ISO-8859-8", /* PG_ISO_8859_8 */
446 : "CP1250", /* PG_WIN1250 */
447 : "CP1253", /* PG_WIN1253 */
448 : "CP1254", /* PG_WIN1254 */
449 : "CP1255", /* PG_WIN1255 */
450 : "CP1257", /* PG_WIN1257 */
451 : "KOI8-U", /* PG_KOI8U */
452 : };
453 :
454 : bool
455 : is_encoding_supported_by_icu(int encoding)
456 : {
457 : return (pg_enc2icu_tbl[encoding] != NULL);
458 : }
459 :
460 : const char *
461 : get_encoding_name_for_icu(int encoding)
462 : {
463 : const char *icu_encoding_name;
464 :
465 : StaticAssertStmt(lengthof(pg_enc2icu_tbl) == PG_ENCODING_BE_LAST + 1,
466 : "pg_enc2icu_tbl incomplete");
467 :
468 : icu_encoding_name = pg_enc2icu_tbl[encoding];
469 :
470 : if (!icu_encoding_name)
471 : ereport(ERROR,
472 : (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
473 : errmsg("encoding \"%s\" not supported by ICU",
474 : pg_encoding_to_char(encoding))));
475 :
476 : return icu_encoding_name;
477 : }
478 :
479 : #endif /* not FRONTEND */
480 :
481 :
/* ----------
 * Encoding checks: return the encoding ID, or -1 on error
 * ----------
 */

/*
 * Look up an encoding by name and accept it only if it satisfies
 * PG_VALID_FE_ENCODING.  Returns -1 if the name is unknown or the encoding
 * fails that check.
 */
int
pg_valid_client_encoding(const char *name)
{
	int			encoding_id = pg_char_to_encoding(name);

	if (encoding_id < 0 || !PG_VALID_FE_ENCODING(encoding_id))
		return -1;

	return encoding_id;
}
499 :
/*
 * Look up an encoding by name and accept it only if it satisfies
 * PG_VALID_BE_ENCODING.  Returns the encoding ID, or -1 if the name is
 * unknown or the encoding fails that check.
 */
int
pg_valid_server_encoding(const char *name)
{
	int			encoding_id = pg_char_to_encoding(name);

	if (encoding_id >= 0 && PG_VALID_BE_ENCODING(encoding_id))
		return encoding_id;

	return -1;
}
513 :
/*
 * Function wrapper around the PG_VALID_BE_ENCODING macro.
 *
 * Note that the return value is the result of that test (presumably nonzero
 * when the ID is acceptable -- the macro is defined elsewhere), not an
 * encoding ID.
 */
int
pg_valid_server_encoding_id(int encoding)
{
	int			is_valid = PG_VALID_BE_ENCODING(encoding);

	return is_valid;
}
519 :
/* ----------
 * Normalize an encoding name for table lookup
 *
 * Copies "key" into "newkey", keeping only characters for which isalnum()
 * is true and folding ASCII upper-case letters to lower case.  The result is
 * NUL-terminated and never longer than the input, so "newkey" must have room
 * for at least strlen(key) + 1 bytes.  Returns "newkey".
 * ----------
 */
static char *
clean_encoding_name(const char *key, char *newkey)
{
	const char *in;
	char	   *out = newkey;

	for (in = key; *in != '\0'; in++)
	{
		char		c = *in;

		/* drop punctuation, whitespace and anything else non-alphanumeric */
		if (!isalnum((unsigned char) c))
			continue;

		/* fold ASCII capitals only; other characters are copied unchanged */
		if (c >= 'A' && c <= 'Z')
			c += 'a' - 'A';

		*out++ = c;
	}
	*out = '\0';

	return newkey;
}
543 :
544 : /* ----------
545 : * Search encoding by encoding name
546 : *
547 : * Returns encoding ID, or -1 for error
548 : * ----------
549 : */
550 : int
551 0 : pg_char_to_encoding(const char *name)
552 : {
553 0 : unsigned int nel = lengthof(pg_encname_tbl);
554 0 : const pg_encname *base = pg_encname_tbl,
555 0 : *last = base + nel - 1,
556 : *position;
557 : int result;
558 : char buff[NAMEDATALEN],
559 : *key;
560 :
561 0 : if (name == NULL || *name == '\0')
562 0 : return -1;
563 :
564 0 : if (strlen(name) >= NAMEDATALEN)
565 : {
566 : #ifdef FRONTEND
567 0 : fprintf(stderr, "encoding name too long\n");
568 0 : return -1;
569 : #else
570 : ereport(ERROR,
571 : (errcode(ERRCODE_NAME_TOO_LONG),
572 : errmsg("encoding name too long")));
573 : #endif
574 : }
575 0 : key = clean_encoding_name(name, buff);
576 :
577 0 : while (last >= base)
578 : {
579 0 : position = base + ((last - base) >> 1);
580 0 : result = key[0] - position->name[0];
581 :
582 0 : if (result == 0)
583 : {
584 0 : result = strcmp(key, position->name);
585 0 : if (result == 0)
586 0 : return position->encoding;
587 : }
588 0 : if (result < 0)
589 0 : last = position - 1;
590 : else
591 0 : base = position + 1;
592 : }
593 0 : return -1;
594 : }
595 :
596 : #ifndef FRONTEND
597 : Datum
598 : PG_char_to_encoding(PG_FUNCTION_ARGS)
599 : {
600 : Name s = PG_GETARG_NAME(0);
601 :
602 : PG_RETURN_INT32(pg_char_to_encoding(NameStr(*s)));
603 : }
604 : #endif
605 :
606 : const char *
607 1 : pg_encoding_to_char(int encoding)
608 : {
609 1 : if (PG_VALID_ENCODING(encoding))
610 : {
611 1 : const pg_enc2name *p = &pg_enc2name_tbl[encoding];
612 :
613 1 : Assert(encoding == p->encoding);
614 1 : return p->name;
615 : }
616 0 : return "";
617 : }
618 :
619 : #ifndef FRONTEND
620 : Datum
621 : PG_encoding_to_char(PG_FUNCTION_ARGS)
622 : {
623 : int32 encoding = PG_GETARG_INT32(0);
624 : const char *encoding_name = pg_encoding_to_char(encoding);
625 :
626 : return DirectFunctionCall1(namein, CStringGetDatum(encoding_name));
627 : }
628 :
629 : #endif
|