Line data Source code
1 : /* decode.c - implementation of IDNA2008 decoding functions
2 : Copyright (C) 2011-2024 Simon Josefsson
3 :
4 : Libidn2 is free software: you can redistribute it and/or modify it
5 : under the terms of either:
6 :
7 : * the GNU Lesser General Public License as published by the Free
8 : Software Foundation; either version 3 of the License, or (at
9 : your option) any later version.
10 :
11 : or
12 :
13 : * the GNU General Public License as published by the Free
14 : Software Foundation; either version 2 of the License, or (at
15 : your option) any later version.
16 :
17 : or both in parallel, as here.
18 :
19 : This program is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 : GNU General Public License for more details.
23 :
24 : You should have received copies of the GNU General Public License and
25 : the GNU Lesser General Public License along with this program. If
26 : not, see <http://www.gnu.org/licenses/>.
27 : */
28 :
29 : #include <config.h>
30 :
31 : #include "idn2.h"
32 :
33 : #include <errno.h> /* errno */
34 : #include <stdlib.h> /* malloc, free */
35 :
36 : #include <unitypes.h>
37 : #include <uniconv.h> /* u8_strconv_from_locale */
38 : #include <unistr.h> /* u8_to_u32, u32_cpy, ... */
39 :
40 : /**
41 : * idn2_to_unicode_8z4z:
42 : * @input: Input zero-terminated UTF-8 string.
43 : * @output: Newly allocated UTF-32/UCS-4 output string.
44 : * @flags: Currently unused.
45 : *
46 : * Converts a possibly ACE encoded domain name in UTF-8 format into a
47 : * UTF-32 string (punycode decoding). The output buffer will be zero-terminated
48 : * and must be deallocated by the caller.
49 : *
50 : * @output may be NULL to test lookup of @input without allocating memory.
51 : *
52 : * Returns:
53 : * %IDN2_OK: The conversion was successful.
54 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
55 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
56 : * %IDN2_ENCODING_ERROR: Character conversion failed.
57 : * %IDN2_MALLOC: Memory allocation failed.
58 : *
59 : * Since: 2.0.0
60 : **/
61 : int
62 290 : idn2_to_unicode_8z4z (const char *input, uint32_t **output,
63 : G_GNUC_UNUSED int flags)
64 : {
65 : uint32_t *domain_u32;
66 : int rc;
67 :
68 290 : if (!input)
69 : {
70 6 : if (output)
71 5 : *output = NULL;
72 6 : return IDN2_OK;
73 : }
74 :
75 : /* split into labels and check */
76 : uint32_t out_u32[IDN2_DOMAIN_MAX_LENGTH + 1];
77 284 : size_t out_len = 0;
78 : const char *e, *s;
79 :
80 1171 : for (e = s = input; *e; s = e)
81 : {
82 : uint32_t label_u32[IDN2_LABEL_MAX_LENGTH];
83 940 : size_t label_len = IDN2_LABEL_MAX_LENGTH;
84 :
85 10863 : while (*e && *e != '.')
86 9923 : e++;
87 :
88 940 : if (e - s >= 4 && (s[0] == 'x' || s[0] == 'X')
89 586 : && (s[1] == 'n' || s[1] == 'N') && s[2] == '-' && s[3] == '-')
90 : {
91 458 : s += 4;
92 :
93 458 : rc = idn2_punycode_decode ((char *) s, e - s,
94 : label_u32, &label_len);
95 458 : if (rc)
96 53 : return rc;
97 :
98 411 : if (out_len + label_len + (*e == '.') > IDN2_DOMAIN_MAX_LENGTH)
99 1 : return IDN2_TOO_BIG_DOMAIN;
100 :
101 410 : u32_cpy (out_u32 + out_len, label_u32, label_len);
102 : }
103 : else
104 : {
105 : /* convert UTF-8 input to UTF-32 */
106 482 : if (!
107 : (domain_u32 =
108 482 : u8_to_u32 ((uint8_t *) s, e - s, NULL, &label_len)))
109 : {
110 3 : if (errno == ENOMEM)
111 0 : return IDN2_MALLOC;
112 3 : return IDN2_ENCODING_ERROR;
113 : }
114 :
115 479 : if (label_len > IDN2_LABEL_MAX_LENGTH)
116 : {
117 1 : free (domain_u32);
118 1 : return IDN2_TOO_BIG_LABEL;
119 : }
120 :
121 478 : if (out_len + label_len + (*e == '.') > IDN2_DOMAIN_MAX_LENGTH)
122 : {
123 1 : free (domain_u32);
124 1 : return IDN2_TOO_BIG_DOMAIN;
125 : }
126 :
127 477 : u32_cpy (out_u32 + out_len, domain_u32, label_len);
128 477 : free (domain_u32);
129 : }
130 :
131 887 : out_len += label_len;
132 887 : if (*e)
133 : {
134 661 : out_u32[out_len++] = '.';
135 661 : e++;
136 : }
137 : }
138 :
139 231 : if (output)
140 : {
141 : uint32_t *_out;
142 :
143 229 : out_u32[out_len] = 0;
144 :
145 229 : _out = u32_cpy_alloc (out_u32, out_len + 1);
146 229 : if (!_out)
147 : {
148 0 : if (errno == ENOMEM)
149 0 : return IDN2_MALLOC;
150 0 : return IDN2_ENCODING_ERROR;
151 : }
152 :
153 229 : *output = _out;
154 : }
155 :
156 231 : return IDN2_OK;
157 : }
158 :
159 : /**
160 : * idn2_to_unicode_4z4z:
161 : * @input: Input zero-terminated UTF-32 string.
162 : * @output: Newly allocated UTF-32 output string.
163 : * @flags: Currently unused.
164 : *
165 : * Converts a possibly ACE encoded domain name in UTF-32 format into a
166 : * UTF-32 string (punycode decoding). The output buffer will be zero-terminated
167 : * and must be deallocated by the caller.
168 : *
169 : * @output may be NULL to test lookup of @input without allocating memory.
170 : *
171 : * Returns:
172 : * %IDN2_OK: The conversion was successful.
173 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
174 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
175 : * %IDN2_ENCODING_ERROR: Character conversion failed.
176 : * %IDN2_MALLOC: Memory allocation failed.
177 : *
178 : * Since: 2.0.0
179 : **/
180 : int
181 91 : idn2_to_unicode_4z4z (const uint32_t *input, uint32_t **output, int flags)
182 : {
183 : uint8_t *input_u8;
184 : uint32_t *output_u32;
185 : size_t length;
186 : int rc;
187 :
188 91 : if (!input)
189 : {
190 2 : if (output)
191 1 : *output = NULL;
192 2 : return IDN2_OK;
193 : }
194 :
195 89 : input_u8 = u32_to_u8 (input, u32_strlen (input) + 1, NULL, &length);
196 89 : if (!input_u8)
197 : {
198 9 : if (errno == ENOMEM)
199 0 : return IDN2_MALLOC;
200 9 : return IDN2_ENCODING_ERROR;
201 : }
202 :
203 80 : rc = idn2_to_unicode_8z4z ((char *) input_u8, &output_u32, flags);
204 80 : free (input_u8);
205 :
206 80 : if (rc == IDN2_OK)
207 : {
208 69 : if (output)
209 67 : *output = output_u32;
210 : else
211 2 : free (output_u32);
212 : }
213 :
214 80 : return rc;
215 : }
216 :
217 : /**
218 : * idn2_to_unicode_44i:
219 : * @in: Input array with UTF-32 code points.
220 : * @inlen: number of code points of input array
221 : * @out: output array with UTF-32 code points.
222 : * @outlen: on input, maximum size of output array with UTF-32 code points,
223 : * on exit, actual size of output array with UTF-32 code points.
224 : * @flags: Currently unused.
225 : *
226 : * The ToUnicode operation takes a sequence of UTF-32 code points
227 : * that make up one domain label and returns a sequence of UTF-32
228 : * code points. If the input sequence is a label in ACE form, then the
229 : * result is an equivalent internationalized label that is not in ACE
230 : * form, otherwise the original sequence is returned unaltered.
231 : *
232 : * @output may be NULL to test lookup of @input without allocating memory.
233 : *
234 : * Returns:
235 : * %IDN2_OK: The conversion was successful.
236 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
237 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
238 : * %IDN2_ENCODING_ERROR: Character conversion failed.
239 : * %IDN2_MALLOC: Memory allocation failed.
240 : *
241 : * Since: 2.0.0
242 : **/
243 : int
244 58 : idn2_to_unicode_44i (const uint32_t *in, size_t inlen, uint32_t *out,
245 : size_t *outlen, int flags)
246 : {
247 : uint32_t *input_u32;
248 : uint32_t *output_u32;
249 : size_t len;
250 : int rc;
251 :
252 58 : if (!in)
253 : {
254 5 : if (outlen)
255 3 : *outlen = 0;
256 5 : return IDN2_OK;
257 : }
258 :
259 53 : input_u32 = (uint32_t *) malloc ((inlen + 1) * sizeof (uint32_t));
260 53 : if (!input_u32)
261 0 : return IDN2_MALLOC;
262 :
263 53 : u32_cpy (input_u32, in, inlen);
264 53 : input_u32[inlen] = 0;
265 :
266 53 : rc = idn2_to_unicode_4z4z (input_u32, &output_u32, flags);
267 53 : free (input_u32);
268 53 : if (rc != IDN2_OK)
269 15 : return rc;
270 :
271 38 : len = u32_strlen (output_u32);
272 38 : if (out && outlen)
273 34 : u32_cpy (out, output_u32, len < *outlen ? len : *outlen);
274 38 : free (output_u32);
275 :
276 38 : if (outlen)
277 35 : *outlen = len;
278 :
279 38 : return IDN2_OK;
280 : }
281 :
282 : /**
283 : * idn2_to_unicode_8z8z:
284 : * @input: Input zero-terminated UTF-8 string.
285 : * @output: Newly allocated UTF-8 output string.
286 : * @flags: Currently unused.
287 : *
288 : * Converts a possibly ACE encoded domain name in UTF-8 format into a
289 : * UTF-8 string (punycode decoding). The output buffer will be zero-terminated
290 : * and must be deallocated by the caller.
291 : *
292 : * @output may be NULL to test lookup of @input without allocating memory.
293 : *
294 : * Returns:
295 : * %IDN2_OK: The conversion was successful.
296 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
297 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
298 : * %IDN2_ENCODING_ERROR: Character conversion failed.
299 : * %IDN2_MALLOC: Memory allocation failed.
300 : *
301 : * Since: 2.0.0
302 : **/
303 : int
304 171 : idn2_to_unicode_8z8z (const char *input, char **output, int flags)
305 : {
306 : uint32_t *output_u32;
307 : uint8_t *output_u8;
308 : size_t length;
309 : int rc;
310 :
311 171 : rc = idn2_to_unicode_8z4z (input, &output_u32, flags);
312 171 : if (rc != IDN2_OK || !input)
313 40 : return rc;
314 :
315 : output_u8 =
316 131 : u32_to_u8 (output_u32, u32_strlen (output_u32) + 1, NULL, &length);
317 131 : free (output_u32);
318 :
319 131 : if (!output_u8)
320 : {
321 1 : if (errno == ENOMEM)
322 0 : return IDN2_MALLOC;
323 1 : return IDN2_ENCODING_ERROR;
324 : }
325 :
326 130 : if (output)
327 128 : *output = (char *) output_u8;
328 : else
329 2 : free (output_u8);
330 :
331 130 : return IDN2_OK;
332 : }
333 :
334 : /**
335 : * idn2_to_unicode_8zlz:
336 : * @input: Input zero-terminated UTF-8 string.
337 : * @output: Newly allocated output string in current locale's character set.
338 : * @flags: Currently unused.
339 : *
340 : * Converts a possibly ACE encoded domain name in UTF-8 format into a
341 : * string encoded in the current locale's character set (punycode
342 : * decoding). The output buffer will be zero-terminated and must be
343 : * deallocated by the caller.
344 : *
345 : * @output may be NULL to test lookup of @input without allocating memory.
346 : *
347 : * Returns:
348 : * %IDN2_OK: The conversion was successful.
349 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
350 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
351 : * %IDN2_ENCODING_ERROR: Character conversion failed.
352 : * %IDN2_MALLOC: Memory allocation failed.
353 : *
354 : * Since: 2.0.0
355 : **/
356 : int
357 132 : idn2_to_unicode_8zlz (const char *input, char **output, int flags)
358 : {
359 : int rc;
360 : uint8_t *output_u8, *output_l8;
361 : const char *encoding;
362 :
363 132 : rc = idn2_to_unicode_8z8z (input, (char **) &output_u8, flags);
364 132 : if (rc != IDN2_OK || !input)
365 33 : return rc;
366 :
367 99 : encoding = locale_charset ();
368 : output_l8 =
369 99 : (uint8_t *) u8_strconv_to_encoding (output_u8, encoding, iconveh_error);
370 :
371 99 : if (!output_l8)
372 : {
373 10 : if (errno == ENOMEM)
374 0 : rc = IDN2_MALLOC;
375 : else
376 10 : rc = IDN2_ENCODING_ERROR;
377 :
378 10 : free (output_l8);
379 : }
380 : else
381 : {
382 89 : if (output)
383 85 : *output = (char *) output_l8;
384 : else
385 4 : free (output_l8);
386 :
387 89 : rc = IDN2_OK;
388 : }
389 :
390 99 : free (output_u8);
391 :
392 99 : return rc;
393 : }
394 :
395 : /**
396 : * idn2_to_unicode_lzlz:
397 : * @input: Input zero-terminated string encoded in the current locale's character set.
398 : * @output: Newly allocated output string in current locale's character set.
399 : * @flags: Currently unused.
400 : *
401 : * Converts a possibly ACE encoded domain name in the locale's character
402 : * set into a string encoded in the current locale's character set (punycode
403 : * decoding). The output buffer will be zero-terminated and must be
404 : * deallocated by the caller.
405 : *
406 : * @output may be NULL to test lookup of @input without allocating memory.
407 : *
408 : * Returns:
409 : * %IDN2_OK: The conversion was successful.
410 : * %IDN2_TOO_BIG_DOMAIN: The domain is too long.
411 : * %IDN2_TOO_BIG_LABEL: A label is would have been too long.
412 : * %IDN2_ENCODING_ERROR: Output character conversion failed.
413 : * %IDN2_ICONV_FAIL: Input character conversion failed.
414 : * %IDN2_MALLOC: Memory allocation failed.
415 : *
416 : * Since: 2.0.0
417 : **/
418 : int
419 98 : idn2_to_unicode_lzlz (const char *input, char **output, int flags)
420 : {
421 : uint8_t *input_l8;
422 : const char *encoding;
423 : int rc;
424 :
425 98 : if (!input)
426 : {
427 2 : if (output)
428 1 : *output = NULL;
429 2 : return IDN2_OK;
430 : }
431 :
432 96 : encoding = locale_charset ();
433 96 : input_l8 = u8_strconv_from_encoding (input, encoding, iconveh_error);
434 :
435 96 : if (!input_l8)
436 : {
437 3 : if (errno == ENOMEM)
438 0 : return IDN2_MALLOC;
439 3 : return IDN2_ICONV_FAIL;
440 : }
441 :
442 93 : rc = idn2_to_unicode_8zlz ((char *) input_l8, output, flags);
443 93 : free (input_l8);
444 :
445 93 : return rc;
446 : }
|