Line data Source code
1 : /*-------------------------------------------------------------------------
2 : * unicode_norm.c
3 : * Normalize a Unicode string to NFKC form
4 : *
5 : * This implements Unicode normalization, per the documentation at
6 : * http://www.unicode.org/reports/tr15/.
7 : *
8 : * Portions Copyright (c) 2017, PostgreSQL Global Development Group
9 : *
10 : * IDENTIFICATION
11 : * src/common/unicode_norm.c
12 : *
13 : *-------------------------------------------------------------------------
14 : */
15 : #ifndef FRONTEND
16 : #include "postgres.h"
17 : #else
18 : #include "postgres_fe.h"
19 : #endif
20 :
21 : #include "common/unicode_norm.h"
22 : #include "common/unicode_norm_table.h"
23 :
24 : #ifndef FRONTEND
25 : #define ALLOC(size) palloc(size)
26 : #define FREE(size) pfree(size)
27 : #else
28 : #define ALLOC(size) malloc(size)
29 : #define FREE(size) free(size)
30 : #endif
31 :
32 : /* Constants for calculations with Hangul characters */
33 : #define SBASE 0xAC00 /* U+AC00 */
34 : #define LBASE 0x1100 /* U+1100 */
35 : #define VBASE 0x1161 /* U+1161 */
36 : #define TBASE 0x11A7 /* U+11A7 */
37 : #define LCOUNT 19
38 : #define VCOUNT 21
39 : #define TCOUNT 28
40 : #define NCOUNT VCOUNT * TCOUNT
41 : #define SCOUNT LCOUNT * NCOUNT
42 :
43 : /* comparison routine for bsearch() of decomposition lookup table. */
44 : static int
45 0 : conv_compare(const void *p1, const void *p2)
46 : {
47 : uint32 v1,
48 : v2;
49 :
50 0 : v1 = *(const uint32 *) p1;
51 0 : v2 = ((const pg_unicode_decomposition *) p2)->codepoint;
52 0 : return (v1 > v2) ? 1 : ((v1 == v2) ? 0 : -1);
53 : }
54 :
55 : /*
56 : * Get the entry corresponding to code in the decomposition lookup table.
57 : */
58 : static pg_unicode_decomposition *
59 0 : get_code_entry(pg_wchar code)
60 : {
61 0 : return bsearch(&(code),
62 : (void *) UnicodeDecompMain,
63 : lengthof(UnicodeDecompMain),
64 : sizeof(pg_unicode_decomposition),
65 : conv_compare);
66 : }
67 :
68 : /*
69 : * Given a decomposition entry looked up earlier, get the decomposed
70 : * characters.
71 : *
72 : * Note: the returned pointer can point to statically allocated buffer, and
73 : * is only valid until next call to this function!
74 : */
75 : static const pg_wchar *
76 0 : get_code_decomposition(pg_unicode_decomposition *entry, int *dec_size)
77 : {
78 : static pg_wchar x;
79 :
80 0 : if (DECOMPOSITION_IS_INLINE(entry))
81 : {
82 0 : Assert(DECOMPOSITION_SIZE(entry) == 1);
83 0 : x = (pg_wchar) entry->dec_index;
84 0 : *dec_size = 1;
85 0 : return &x;
86 : }
87 : else
88 : {
89 0 : *dec_size = DECOMPOSITION_SIZE(entry);
90 0 : return &UnicodeDecomp_codepoints[entry->dec_index];
91 : }
92 : }
93 :
94 : /*
95 : * Calculate how many characters a given character will decompose to.
96 : *
97 : * This needs to recurse, if the character decomposes into characters that
98 : * are, in turn, decomposable.
99 : */
100 : static int
101 0 : get_decomposed_size(pg_wchar code)
102 : {
103 : pg_unicode_decomposition *entry;
104 0 : int size = 0;
105 : int i;
106 : const uint32 *decomp;
107 : int dec_size;
108 :
109 : /*
110 : * Fast path for Hangul characters not stored in tables to save memory as
111 : * decomposition is algorithmic. See
112 : * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
113 : * the matter.
114 : */
115 0 : if (code >= SBASE && code < SBASE + SCOUNT)
116 : {
117 : uint32 tindex,
118 : sindex;
119 :
120 0 : sindex = code - SBASE;
121 0 : tindex = sindex % TCOUNT;
122 :
123 0 : if (tindex != 0)
124 0 : return 3;
125 0 : return 2;
126 : }
127 :
128 0 : entry = get_code_entry(code);
129 :
130 : /*
131 : * Just count current code if no other decompositions. A NULL entry is
132 : * equivalent to a character with class 0 and no decompositions.
133 : */
134 0 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
135 0 : return 1;
136 :
137 : /*
138 : * If this entry has other decomposition codes look at them as well. First
139 : * get its decomposition in the list of tables available.
140 : */
141 0 : decomp = get_code_decomposition(entry, &dec_size);
142 0 : for (i = 0; i < dec_size; i++)
143 : {
144 0 : uint32 lcode = decomp[i];
145 :
146 0 : size += get_decomposed_size(lcode);
147 : }
148 :
149 0 : return size;
150 : }
151 :
152 : /*
153 : * Recompose a set of characters. For hangul characters, the calculation
154 : * is algorithmic. For others, an inverse lookup at the decomposition
155 : * table is necessary. Returns true if a recomposition can be done, and
156 : * false otherwise.
157 : */
158 : static bool
159 0 : recompose_code(uint32 start, uint32 code, uint32 *result)
160 : {
161 : /*
162 : * Handle Hangul characters algorithmically, per the Unicode spec.
163 : *
164 : * Check if two current characters are L and V.
165 : */
166 0 : if (start >= LBASE && start < LBASE + LCOUNT &&
167 0 : code >= VBASE && code < VBASE + VCOUNT)
168 : {
169 : /* make syllable of form LV */
170 0 : uint32 lindex = start - LBASE;
171 0 : uint32 vindex = code - VBASE;
172 :
173 0 : *result = SBASE + (lindex * VCOUNT + vindex) * TCOUNT;
174 0 : return true;
175 : }
176 : /* Check if two current characters are LV and T */
177 0 : else if (start >= SBASE && start < (SBASE + SCOUNT) &&
178 0 : ((start - SBASE) % TCOUNT) == 0 &&
179 0 : code >= TBASE && code < (TBASE + TCOUNT))
180 : {
181 : /* make syllable of from LVT */
182 0 : uint32 tindex = code - TBASE;
183 :
184 0 : *result = start + tindex;
185 0 : return true;
186 : }
187 : else
188 : {
189 : int i;
190 :
191 : /*
192 : * Do an inverse lookup of the decomposition tables to see if anything
193 : * matches. The comparison just needs to be a perfect match on the
194 : * sub-table of size two, because the start character has already been
195 : * recomposed partially.
196 : */
197 0 : for (i = 0; i < lengthof(UnicodeDecompMain); i++)
198 : {
199 0 : const pg_unicode_decomposition *entry = &UnicodeDecompMain[i];
200 :
201 0 : if (DECOMPOSITION_SIZE(entry) != 2)
202 0 : continue;
203 :
204 0 : if (DECOMPOSITION_NO_COMPOSE(entry))
205 0 : continue;
206 :
207 0 : if (start == UnicodeDecomp_codepoints[entry->dec_index] &&
208 0 : code == UnicodeDecomp_codepoints[entry->dec_index + 1])
209 : {
210 0 : *result = entry->codepoint;
211 0 : return true;
212 : }
213 : }
214 : }
215 :
216 0 : return false;
217 : }
218 :
219 : /*
220 : * Decompose the given code into the array given by caller. The
221 : * decomposition begins at the position given by caller, saving one
222 : * lookup on the decomposition table. The current position needs to be
223 : * updated here to let the caller know from where to continue filling
224 : * in the array result.
225 : */
226 : static void
227 0 : decompose_code(pg_wchar code, pg_wchar **result, int *current)
228 : {
229 : pg_unicode_decomposition *entry;
230 : int i;
231 : const uint32 *decomp;
232 : int dec_size;
233 :
234 : /*
235 : * Fast path for Hangul characters not stored in tables to save memory as
236 : * decomposition is algorithmic. See
237 : * http://unicode.org/reports/tr15/tr15-18.html, annex 10 for details on
238 : * the matter.
239 : */
240 0 : if (code >= SBASE && code < SBASE + SCOUNT)
241 : {
242 : uint32 l,
243 : v,
244 : tindex,
245 : sindex;
246 0 : pg_wchar *res = *result;
247 :
248 0 : sindex = code - SBASE;
249 0 : l = LBASE + sindex / (VCOUNT * TCOUNT);
250 0 : v = VBASE + (sindex % (VCOUNT * TCOUNT)) / TCOUNT;
251 0 : tindex = sindex % TCOUNT;
252 :
253 0 : res[*current] = l;
254 0 : (*current)++;
255 0 : res[*current] = v;
256 0 : (*current)++;
257 :
258 0 : if (tindex != 0)
259 : {
260 0 : res[*current] = TBASE + tindex;
261 0 : (*current)++;
262 : }
263 :
264 0 : return;
265 : }
266 :
267 0 : entry = get_code_entry(code);
268 :
269 : /*
270 : * Just fill in with the current decomposition if there are no
271 : * decomposition codes to recurse to. A NULL entry is equivalent to a
272 : * character with class 0 and no decompositions, so just leave also in
273 : * this case.
274 : */
275 0 : if (entry == NULL || DECOMPOSITION_SIZE(entry) == 0)
276 : {
277 0 : pg_wchar *res = *result;
278 :
279 0 : res[*current] = code;
280 0 : (*current)++;
281 0 : return;
282 : }
283 :
284 : /*
285 : * If this entry has other decomposition codes look at them as well.
286 : */
287 0 : decomp = get_code_decomposition(entry, &dec_size);
288 0 : for (i = 0; i < dec_size; i++)
289 : {
290 0 : pg_wchar lcode = (pg_wchar) decomp[i];
291 :
292 : /* Leave if no more decompositions */
293 0 : decompose_code(lcode, result, current);
294 : }
295 : }
296 :
297 : /*
298 : * unicode_normalize_kc - Normalize a Unicode string to NFKC form.
299 : *
300 : * The input is a 0-terminated array of codepoints.
301 : *
302 : * In frontend, returns a 0-terminated array of codepoints, allocated with
303 : * malloc. Or NULL if we run out of memory. In frontend, the returned
304 : * string is palloc'd instead, and OOM is reported with ereport().
305 : */
306 : pg_wchar *
307 0 : unicode_normalize_kc(const pg_wchar *input)
308 : {
309 : pg_wchar *decomp_chars;
310 : pg_wchar *recomp_chars;
311 : int decomp_size,
312 : current_size;
313 : int count;
314 : const pg_wchar *p;
315 :
316 : /* variables for recomposition */
317 : int last_class;
318 : int starter_pos;
319 : int target_pos;
320 : uint32 starter_ch;
321 :
322 : /* First, do character decomposition */
323 :
324 : /*
325 : * Calculate how many characters long the decomposed version will be.
326 : */
327 0 : decomp_size = 0;
328 0 : for (p = input; *p; p++)
329 0 : decomp_size += get_decomposed_size(*p);
330 :
331 0 : decomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
332 0 : if (decomp_chars == NULL)
333 0 : return NULL;
334 :
335 : /*
336 : * Now fill in each entry recursively. This needs a second pass on the
337 : * decomposition table.
338 : */
339 0 : current_size = 0;
340 0 : for (p = input; *p; p++)
341 0 : decompose_code(*p, &decomp_chars, ¤t_size);
342 0 : decomp_chars[decomp_size] = '\0';
343 0 : Assert(decomp_size == current_size);
344 :
345 : /*
346 : * Now apply canonical ordering.
347 : */
348 0 : for (count = 1; count < decomp_size; count++)
349 : {
350 0 : pg_wchar prev = decomp_chars[count - 1];
351 0 : pg_wchar next = decomp_chars[count];
352 : pg_wchar tmp;
353 0 : pg_unicode_decomposition *prevEntry = get_code_entry(prev);
354 0 : pg_unicode_decomposition *nextEntry = get_code_entry(next);
355 :
356 : /*
357 : * If no entries are found, the character used is either an Hangul
358 : * character or a character with a class of 0 and no decompositions,
359 : * so move to next result.
360 : */
361 0 : if (prevEntry == NULL || nextEntry == NULL)
362 0 : continue;
363 :
364 : /*
365 : * Per Unicode (http://unicode.org/reports/tr15/tr15-18.html) annex 4,
366 : * a sequence of two adjacent characters in a string is an
367 : * exchangeable pair if the combining class (from the Unicode
368 : * Character Database) for the first character is greater than the
369 : * combining class for the second, and the second is not a starter. A
370 : * character is a starter if its combining class is 0.
371 : */
372 0 : if (nextEntry->comb_class == 0x0 || prevEntry->comb_class == 0x0)
373 0 : continue;
374 :
375 0 : if (prevEntry->comb_class <= nextEntry->comb_class)
376 0 : continue;
377 :
378 : /* exchange can happen */
379 0 : tmp = decomp_chars[count - 1];
380 0 : decomp_chars[count - 1] = decomp_chars[count];
381 0 : decomp_chars[count] = tmp;
382 :
383 : /* backtrack to check again */
384 0 : if (count > 1)
385 0 : count -= 2;
386 : }
387 :
388 : /*
389 : * The last phase of NFKC is the recomposition of the reordered Unicode
390 : * string using combining classes. The recomposed string cannot be longer
391 : * than the decomposed one, so make the allocation of the output string
392 : * based on that assumption.
393 : */
394 0 : recomp_chars = (pg_wchar *) ALLOC((decomp_size + 1) * sizeof(pg_wchar));
395 0 : if (!recomp_chars)
396 : {
397 0 : FREE(decomp_chars);
398 0 : return NULL;
399 : }
400 :
401 0 : last_class = -1; /* this eliminates a special check */
402 0 : starter_pos = 0;
403 0 : target_pos = 1;
404 0 : starter_ch = recomp_chars[0] = decomp_chars[0];
405 :
406 0 : for (count = 1; count < decomp_size; count++)
407 : {
408 0 : pg_wchar ch = decomp_chars[count];
409 0 : pg_unicode_decomposition *ch_entry = get_code_entry(ch);
410 0 : int ch_class = (ch_entry == NULL) ? 0 : ch_entry->comb_class;
411 : pg_wchar composite;
412 :
413 0 : if (last_class < ch_class &&
414 0 : recompose_code(starter_ch, ch, &composite))
415 : {
416 0 : recomp_chars[starter_pos] = composite;
417 0 : starter_ch = composite;
418 : }
419 0 : else if (ch_class == 0)
420 : {
421 0 : starter_pos = target_pos;
422 0 : starter_ch = ch;
423 0 : last_class = -1;
424 0 : recomp_chars[target_pos++] = ch;
425 : }
426 : else
427 : {
428 0 : last_class = ch_class;
429 0 : recomp_chars[target_pos++] = ch;
430 : }
431 : }
432 0 : recomp_chars[target_pos] = (pg_wchar) '\0';
433 :
434 0 : FREE(decomp_chars);
435 :
436 0 : return recomp_chars;
437 : }
|