Line data Source code
1 : /* lookup.c - implementation of IDNA2008 lookup functions
2 : Copyright (C) 2011-2024 Simon Josefsson
3 : Copyright (C) 2017-2024 Tim Ruehsen
4 :
5 : Libidn2 is free software: you can redistribute it and/or modify it
6 : under the terms of either:
7 :
8 : * the GNU Lesser General Public License as published by the Free
9 : Software Foundation; either version 3 of the License, or (at
10 : your option) any later version.
11 :
12 : or
13 :
14 : * the GNU General Public License as published by the Free
15 : Software Foundation; either version 2 of the License, or (at
16 : your option) any later version.
17 :
18 : or both in parallel, as here.
19 :
20 : This program is distributed in the hope that it will be useful,
21 : but WITHOUT ANY WARRANTY; without even the implied warranty of
22 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
23 : GNU General Public License for more details.
24 :
25 : You should have received copies of the GNU General Public License and
26 : the GNU Lesser General Public License along with this program. If
27 : not, see <http://www.gnu.org/licenses/>.
28 : */
29 :
30 : #include <config.h>
31 :
32 : #include "idn2.h"
33 :
34 : #include <errno.h> /* errno */
35 : #include <stdlib.h> /* malloc, free */
36 :
37 : #include <unitypes.h>
38 : #include <uniconv.h> /* u8_strconv_from_locale */
39 : #include <uninorm.h> /* u32_normalize */
40 : #include <unistr.h> /* u8_to_u32 */
41 :
42 : #include "idna.h" /* _idn2_label_test */
43 : #include "tr46map.h" /* definition for tr46map.c */
44 :
45 : #ifdef HAVE_LIBUNISTRING
46 : /* copied from gnulib */
47 : # include <limits.h>
/* Expands to the 26 case labels for the ASCII lowercase letters, each
   shifted by N.  With N = 'A' - 'a' this yields the uppercase letters.  */
# define _C_CTYPE_LOWER_N(N) \
   case 'a' + (N): case 'b' + (N): case 'c' + (N): case 'd' + (N): \
   case 'e' + (N): case 'f' + (N): \
   case 'g' + (N): case 'h' + (N): case 'i' + (N): case 'j' + (N): \
   case 'k' + (N): case 'l' + (N): case 'm' + (N): case 'n' + (N): \
   case 'o' + (N): case 'p' + (N): case 'q' + (N): case 'r' + (N): \
   case 's' + (N): case 't' + (N): case 'u' + (N): case 'v' + (N): \
   case 'w' + (N): case 'x' + (N): case 'y' + (N): case 'z' + (N)
# define _C_CTYPE_UPPER _C_CTYPE_LOWER_N ('A' - 'a')

/* Locale-independent lowercase conversion: the letters 'A'..'Z' map to
   'a'..'z'; every other value of C is returned unchanged.  */
static inline int
c_tolower (int c)
{
  int lowered = c;

  switch (c)
    {
    _C_CTYPE_UPPER:
      lowered = c + ('a' - 'A');
      break;
    default:
      break;
    }

  return lowered;
}
68 :
/* Compare at most N bytes of S1 and S2, folding ASCII uppercase letters
   to lowercase via c_tolower.  Comparison stops at the first differing
   byte, at a NUL byte in S1, or after N bytes.  Returns zero when the
   compared prefixes are equal, otherwise a value whose sign reflects the
   ordering of the last pair of (lowercased) bytes examined.  */
static int
c_strncasecmp (const char *s1, const char *s2, size_t n)
{
  const unsigned char *lhs = (const unsigned char *) s1;
  const unsigned char *rhs = (const unsigned char *) s2;
  unsigned char a, b;

  /* Nothing to compare, or both arguments are the very same string.  */
  if (n == 0 || lhs == rhs)
    return 0;

  for (;;)
    {
      a = c_tolower (*lhs++);
      b = c_tolower (*rhs++);
      n--;

      if (n == 0 || a == '\0' || a != b)
	break;
    }

  if (UCHAR_MAX <= INT_MAX)
    return a - b;

  /* On machines where 'char' and 'int' are types of the same size, the
     difference of two 'unsigned char' values - including the sign bit -
     doesn't fit in an 'int'.  */
  return (a > b) - (a < b);
}
100 : #else
101 : # include <c-strcase.h>
102 : #endif
103 :
104 : static int
105 5577587 : set_default_flags (int *flags)
106 : {
107 5577587 : if (((*flags) & IDN2_TRANSITIONAL) && ((*flags) & IDN2_NONTRANSITIONAL))
108 2 : return IDN2_INVALID_FLAGS;
109 :
110 5577585 : if (((*flags) & (IDN2_TRANSITIONAL | IDN2_NONTRANSITIONAL))
111 2239376 : && ((*flags) & IDN2_NO_TR46))
112 2 : return IDN2_INVALID_FLAGS;
113 :
114 5577583 : if (((*flags) & IDN2_ALABEL_ROUNDTRIP)
115 6 : && ((*flags) & IDN2_NO_ALABEL_ROUNDTRIP))
116 0 : return IDN2_INVALID_FLAGS;
117 :
118 5577583 : if (!((*flags) & (IDN2_NO_TR46 | IDN2_TRANSITIONAL)))
119 3345617 : *flags |= IDN2_NONTRANSITIONAL;
120 :
121 5577583 : return IDN2_OK;
122 : }
123 :
124 : static int
125 1714379 : label (const uint8_t *src, size_t srclen, uint8_t *dst, size_t *dstlen,
126 : int flags)
127 : {
128 : size_t plen;
129 1714379 : uint32_t *p = NULL;
130 1714379 : const uint8_t *src_org = NULL;
131 1714379 : uint8_t *src_allocated = NULL;
132 1714379 : int rc, check_roundtrip = 0;
133 1714379 : size_t tmpl, srclen_org = 0;
134 : uint32_t label_u32[IDN2_LABEL_MAX_LENGTH];
135 1714379 : size_t label32_len = IDN2_LABEL_MAX_LENGTH;
136 :
137 1714379 : if (_idn2_ascii_p (src, srclen))
138 : {
139 13066 : if (!(flags & IDN2_NO_ALABEL_ROUNDTRIP) && srclen >= 4
140 2717 : && memcmp (src, "xn--", 4) == 0)
141 : {
142 : /*
143 : If the input to this procedure appears to be an A-label
144 : (i.e., it starts in "xn--", interpreted
145 : case-insensitively), the lookup application MAY attempt to
146 : convert it to a U-label, first ensuring that the A-label is
147 : entirely in lowercase (converting it to lowercase if
148 : necessary), and apply the tests of Section 5.4 and the
149 : conversion of Section 5.5 to that form. */
150 1402 : rc = idn2_punycode_decode ((char *) src + 4, srclen - 4,
151 : label_u32, &label32_len);
152 1402 : if (rc)
153 0 : return rc;
154 :
155 1402 : check_roundtrip = 1;
156 1402 : src_org = src;
157 1402 : srclen_org = srclen;
158 :
159 1402 : srclen = IDN2_LABEL_MAX_LENGTH;
160 1402 : src = src_allocated =
161 1402 : u32_to_u8 (label_u32, label32_len, NULL, &srclen);
162 1402 : if (!src)
163 : {
164 0 : if (errno == ENOMEM)
165 0 : return IDN2_MALLOC;
166 0 : return IDN2_ENCODING_ERROR;
167 : }
168 : }
169 : else
170 : {
171 11664 : if (srclen > IDN2_LABEL_MAX_LENGTH)
172 53 : return IDN2_TOO_BIG_LABEL;
173 11611 : if (srclen > *dstlen)
174 0 : return IDN2_TOO_BIG_DOMAIN;
175 :
176 11611 : memcpy (dst, src, srclen);
177 11611 : *dstlen = srclen;
178 11611 : return IDN2_OK;
179 : }
180 : }
181 :
182 1702715 : rc = _idn2_u8_to_u32_nfc (src, srclen, &p, &plen, flags & IDN2_NFC_INPUT);
183 1702715 : if (rc != IDN2_OK)
184 1 : goto out;
185 :
186 1702714 : if (!(flags & IDN2_TRANSITIONAL))
187 : {
188 1553421 : rc = _idn2_label_test (TEST_NFC |
189 : TEST_2HYPHEN |
190 : TEST_LEADING_COMBINING |
191 : TEST_DISALLOWED |
192 : TEST_CONTEXTJ_RULE |
193 : TEST_CONTEXTO_WITH_RULE |
194 1553421 : TEST_UNASSIGNED | TEST_BIDI |
195 : ((flags & IDN2_NONTRANSITIONAL) ?
196 1553421 : TEST_NONTRANSITIONAL : 0) | ((flags &
197 : IDN2_USE_STD3_ASCII_RULES)
198 1553421 : ? 0 :
199 : TEST_ALLOW_STD3_DISALLOWED),
200 : p, plen);
201 :
202 1553421 : if (rc != IDN2_OK)
203 1009808 : goto out;
204 : }
205 :
206 692906 : dst[0] = 'x';
207 692906 : dst[1] = 'n';
208 692906 : dst[2] = '-';
209 692906 : dst[3] = '-';
210 :
211 692906 : tmpl = *dstlen - 4;
212 692906 : rc = idn2_punycode_encode (p, plen, (char *) dst + 4, &tmpl);
213 692906 : if (rc != IDN2_OK)
214 167 : goto out;
215 :
216 :
217 692739 : *dstlen = 4 + tmpl;
218 :
219 692739 : if (check_roundtrip)
220 : {
221 1111 : if (srclen_org != *dstlen
222 1111 : || c_strncasecmp ((char *) src_org, (char *) dst, srclen_org))
223 : {
224 3 : rc = IDN2_ALABEL_ROUNDTRIP_FAILED;
225 3 : goto out;
226 : }
227 : }
228 691628 : else if (!(flags & IDN2_NO_ALABEL_ROUNDTRIP))
229 : {
230 691628 : rc = idn2_punycode_decode ((char *) dst + 4, *dstlen - 4,
231 : label_u32, &label32_len);
232 691628 : if (rc)
233 : {
234 770 : rc = IDN2_ALABEL_ROUNDTRIP_FAILED;
235 770 : goto out;
236 : }
237 :
238 690858 : if (plen != label32_len || u32_cmp (p, label_u32, label32_len))
239 : {
240 0 : rc = IDN2_ALABEL_ROUNDTRIP_FAILED;
241 0 : goto out;
242 : }
243 : }
244 :
245 691966 : rc = IDN2_OK;
246 :
247 1702715 : out:
248 1702715 : free (p);
249 1702715 : free (src_allocated);
250 1702715 : return rc;
251 : }
252 :
253 : #define TR46_TRANSITIONAL_CHECK \
254 : (TEST_NFC | TEST_2HYPHEN | TEST_HYPHEN_STARTEND | TEST_LEADING_COMBINING | TEST_TRANSITIONAL)
255 : #define TR46_NONTRANSITIONAL_CHECK \
256 : (TEST_NFC | TEST_2HYPHEN | TEST_HYPHEN_STARTEND | TEST_LEADING_COMBINING | TEST_NONTRANSITIONAL)
257 :
258 : static int
259 4465183 : _tr46 (const uint8_t *domain_u8, uint8_t **out, int flags)
260 : {
261 : size_t len, it;
262 : uint32_t *domain_u32;
263 4465183 : int err = IDN2_OK, rc;
264 4465183 : int transitional = 0;
265 : int test_flags;
266 :
267 4465183 : if (flags & IDN2_TRANSITIONAL)
268 1119566 : transitional = 1;
269 :
270 : /* convert UTF-8 to UTF-32 */
271 4465183 : if (!(domain_u32 =
272 4465183 : u8_to_u32 (domain_u8, u8_strlen (domain_u8) + 1, NULL, &len)))
273 : {
274 20 : if (errno == ENOMEM)
275 0 : return IDN2_MALLOC;
276 20 : return IDN2_ENCODING_ERROR;
277 : }
278 :
279 4465163 : size_t len2 = 0;
280 5376652 : for (it = 0; it < len - 1; it++)
281 : {
282 : IDNAMap map;
283 :
284 4768483 : get_idna_map (domain_u32[it], &map);
285 :
286 4768483 : if (map_is (&map, TR46_FLG_DISALLOWED))
287 : {
288 3856994 : if (domain_u32[it])
289 : {
290 3856994 : free (domain_u32);
291 3856994 : return IDN2_DISALLOWED;
292 : }
293 0 : len2++;
294 : }
295 911489 : else if (map_is (&map, TR46_FLG_MAPPED))
296 : {
297 55667 : len2 += map.nmappings;
298 : }
299 855822 : else if (map_is (&map, TR46_FLG_VALID))
300 : {
301 814865 : len2++;
302 : }
303 40957 : else if (map_is (&map, TR46_FLG_IGNORED))
304 : {
305 1910 : continue;
306 : }
307 39047 : else if (map_is (&map, TR46_FLG_DEVIATION))
308 : {
309 6623 : if (transitional)
310 : {
311 2652 : len2 += map.nmappings;
312 : }
313 : else
314 3971 : len2++;
315 : }
316 32424 : else if (!(flags & IDN2_USE_STD3_ASCII_RULES))
317 : {
318 22515 : if (map_is (&map, TR46_FLG_DISALLOWED_STD3_VALID))
319 : {
320 : /* valid because UseSTD3ASCIIRules=false, see #TR46 5 */
321 20895 : len2++;
322 : }
323 1620 : else if (map_is (&map, TR46_FLG_DISALLOWED_STD3_MAPPED))
324 : {
325 : /* mapped because UseSTD3ASCIIRules=false, see #TR46 5 */
326 1620 : len2 += map.nmappings;
327 : }
328 : }
329 : }
330 :
331 : /* Exit early if result is too long.
332 : * This avoids excessive CPU usage in punycode encoding, which is O(N^2). */
333 608169 : if (len2 >= IDN2_DOMAIN_MAX_LENGTH)
334 : {
335 28 : free (domain_u32);
336 28 : return IDN2_TOO_BIG_DOMAIN;
337 : }
338 :
339 608141 : uint32_t *tmp = (uint32_t *) malloc ((len2 + 1) * sizeof (uint32_t));
340 608141 : if (!tmp)
341 : {
342 0 : free (domain_u32);
343 0 : return IDN2_MALLOC;
344 : }
345 :
346 608141 : len2 = 0;
347 1502016 : for (it = 0; it < len - 1; it++)
348 : {
349 893875 : uint32_t c = domain_u32[it];
350 : IDNAMap map;
351 :
352 893875 : get_idna_map (c, &map);
353 :
354 893875 : if (map_is (&map, TR46_FLG_DISALLOWED))
355 : {
356 0 : tmp[len2++] = c;
357 : }
358 893875 : else if (map_is (&map, TR46_FLG_MAPPED))
359 : {
360 53593 : len2 += get_map_data (tmp + len2, &map);
361 : }
362 840282 : else if (map_is (&map, TR46_FLG_VALID))
363 : {
364 804515 : tmp[len2++] = c;
365 : }
366 35767 : else if (map_is (&map, TR46_FLG_IGNORED))
367 : {
368 1676 : continue;
369 : }
370 34091 : else if (map_is (&map, TR46_FLG_DEVIATION))
371 : {
372 5705 : if (transitional)
373 : {
374 2194 : len2 += get_map_data (tmp + len2, &map);
375 : }
376 : else
377 3511 : tmp[len2++] = c;
378 : }
379 28386 : else if (!(flags & IDN2_USE_STD3_ASCII_RULES))
380 : {
381 18603 : if (map_is (&map, TR46_FLG_DISALLOWED_STD3_VALID))
382 : {
383 17007 : tmp[len2++] = c;
384 : }
385 1596 : else if (map_is (&map, TR46_FLG_DISALLOWED_STD3_MAPPED))
386 : {
387 1596 : len2 += get_map_data (tmp + len2, &map);
388 : }
389 : }
390 : }
391 608141 : free (domain_u32);
392 :
393 : /* Normalize to NFC */
394 608141 : tmp[len2] = 0;
395 608141 : domain_u32 = u32_normalize (UNINORM_NFC, tmp, len2 + 1, NULL, &len);
396 608141 : free (tmp);
397 608141 : tmp = NULL;
398 :
399 608141 : if (!domain_u32)
400 : {
401 0 : if (errno == ENOMEM)
402 0 : return IDN2_MALLOC;
403 0 : return IDN2_ENCODING_ERROR;
404 : }
405 :
406 : /* split into labels and check */
407 : uint32_t *e, *s;
408 1238102 : for (e = s = domain_u32; *e; s = e)
409 : {
410 1488135 : while (*e && *e != '.')
411 857275 : e++;
412 :
413 630860 : if (e - s >= 4 && s[0] == 'x' && s[1] == 'n' && s[2] == '-'
414 10284 : && s[3] == '-')
415 9094 : {
416 : /* decode punycode and check result non-transitional */
417 : size_t ace_len;
418 : uint32_t name_u32[IDN2_LABEL_MAX_LENGTH];
419 9993 : size_t name_len = IDN2_LABEL_MAX_LENGTH;
420 : uint8_t *ace;
421 :
422 9993 : ace = u32_to_u8 (s + 4, e - s - 4, NULL, &ace_len);
423 9993 : if (!ace)
424 : {
425 0 : free (domain_u32);
426 0 : if (errno == ENOMEM)
427 899 : return IDN2_MALLOC;
428 0 : return IDN2_ENCODING_ERROR;
429 : }
430 :
431 9993 : rc = idn2_punycode_decode ((char *) ace, ace_len,
432 : name_u32, &name_len);
433 :
434 9993 : free (ace);
435 :
436 9993 : if (rc)
437 : {
438 899 : free (domain_u32);
439 899 : return rc;
440 : }
441 :
442 9094 : test_flags = TR46_NONTRANSITIONAL_CHECK;
443 :
444 9094 : if (!(flags & IDN2_USE_STD3_ASCII_RULES))
445 8733 : test_flags |= TEST_ALLOW_STD3_DISALLOWED;
446 :
447 9094 : if ((rc = _idn2_label_test (test_flags, name_u32, name_len)))
448 5515 : err = rc;
449 : }
450 : else
451 : {
452 620867 : test_flags =
453 620867 : transitional ? TR46_TRANSITIONAL_CHECK :
454 : TR46_NONTRANSITIONAL_CHECK;
455 :
456 620867 : if (!(flags & IDN2_USE_STD3_ASCII_RULES))
457 614950 : test_flags |= TEST_ALLOW_STD3_DISALLOWED;
458 :
459 620867 : if ((rc = _idn2_label_test (test_flags, s, e - s)))
460 12420 : err = rc;
461 : }
462 :
463 629961 : if (*e)
464 24942 : e++;
465 : }
466 :
467 607242 : if (err == IDN2_OK && out)
468 593115 : {
469 593115 : uint8_t *_out = u32_to_u8 (domain_u32, len, NULL, &len);
470 593115 : free (domain_u32);
471 :
472 593115 : if (!_out)
473 : {
474 0 : if (errno == ENOMEM)
475 0 : return IDN2_MALLOC;
476 0 : return IDN2_ENCODING_ERROR;
477 : }
478 :
479 593115 : *out = _out;
480 : }
481 : else
482 14127 : free (domain_u32);
483 :
484 607242 : return err;
485 : }
486 :
487 : /**
488 : * idn2_lookup_u8:
489 : * @src: input zero-terminated UTF-8 string in Unicode NFC normalized form.
490 : * @lookupname: newly allocated output variable with name to lookup in DNS.
491 : * @flags: optional #idn2_flags to modify behaviour.
492 : *
493 : * Perform IDNA2008 lookup string conversion on domain name @src, as
494 : * described in section 5 of RFC 5891. Note that the input string
495 : * must be encoded in UTF-8 and be in Unicode NFC form.
496 : *
497 : * Pass %IDN2_NFC_INPUT in @flags to convert input to NFC form before
498 : * further processing. %IDN2_TRANSITIONAL and %IDN2_NONTRANSITIONAL
499 : * do already imply %IDN2_NFC_INPUT.
500 : *
501 : * Pass %IDN2_ALABEL_ROUNDTRIP in @flags to
502 : * convert any input A-labels to U-labels and perform additional
503 : * testing. This is default since version 2.2.
504 : * To switch this behavior off, pass IDN2_NO_ALABEL_ROUNDTRIP
505 : *
506 : * Pass %IDN2_TRANSITIONAL to enable Unicode TR46
507 : * transitional processing, and %IDN2_NONTRANSITIONAL to enable
508 : * Unicode TR46 non-transitional processing.
509 : *
510 : * Multiple flags may be specified by binary or:ing them together.
511 : *
512 : * After version 2.0.3: %IDN2_USE_STD3_ASCII_RULES disabled by default.
513 : * Previously we were eliminating non-STD3 characters from domain strings
514 : * such as _443._tcp.example.com, or IPs 1.2.3.4/24 provided to libidn2
515 : * functions. That was an unexpected regression for applications switching
516 : * from libidn and thus it is no longer applied by default.
517 : * Use %IDN2_USE_STD3_ASCII_RULES to enable that behavior again.
518 : *
519 : * After version 0.11: @lookupname may be NULL to test lookup of @src
520 : * without allocating memory.
521 : *
522 : * Returns: On successful conversion %IDN2_OK is returned, if the
523 : * output domain or any label would have been too long
524 : * %IDN2_TOO_BIG_DOMAIN or %IDN2_TOO_BIG_LABEL is returned, or
525 : * another error code is returned.
526 : *
527 : * Since: 0.1
528 : **/
529 : int
530 5915510 : idn2_lookup_u8 (const uint8_t *src, uint8_t **lookupname, int flags)
531 : {
532 5915510 : size_t lookupnamelen = 0;
533 : uint8_t _lookupname[IDN2_DOMAIN_MAX_LENGTH + 1];
534 5915510 : uint8_t *src_allocated = NULL;
535 : int rc;
536 :
537 5915510 : if (src == NULL)
538 : {
539 337923 : if (lookupname)
540 337919 : *lookupname = NULL;
541 337923 : return IDN2_OK;
542 : }
543 :
544 5577587 : rc = set_default_flags (&flags);
545 5577587 : if (rc != IDN2_OK)
546 4 : return rc;
547 :
548 5577583 : if (!(flags & IDN2_NO_TR46))
549 : {
550 4465183 : uint8_t *out = NULL;
551 :
552 4465183 : rc = _tr46 (src, &out, flags);
553 4465183 : if (rc != IDN2_OK)
554 3872068 : return rc;
555 :
556 593115 : src = src_allocated = out;
557 : }
558 :
559 : do
560 : {
561 1714379 : const uint8_t *end = (uint8_t *) strchrnul ((const char *) src, '.');
562 : /* XXX Do we care about non-U+002E dots such as U+3002, U+FF0E
563 : and U+FF61 here? Perhaps when IDN2_NFC_INPUT? */
564 1714379 : size_t labellen = end - src;
565 : uint8_t tmp[IDN2_LABEL_MAX_LENGTH];
566 1714379 : size_t tmplen = IDN2_LABEL_MAX_LENGTH;
567 :
568 1714379 : rc = label (src, labellen, tmp, &tmplen, flags);
569 1714379 : if (rc != IDN2_OK)
570 : {
571 1010802 : free (src_allocated);
572 1010859 : return rc;
573 : }
574 :
575 703577 : if (lookupnamelen + tmplen
576 703577 : > IDN2_DOMAIN_MAX_LENGTH - (tmplen == 0 && *end == '\0' ? 1 : 2))
577 : {
578 57 : free (src_allocated);
579 57 : return IDN2_TOO_BIG_DOMAIN;
580 : }
581 :
582 703520 : memcpy (_lookupname + lookupnamelen, tmp, tmplen);
583 703520 : lookupnamelen += tmplen;
584 :
585 703520 : if (*end == '.')
586 : {
587 8864 : if (lookupnamelen + 1 > IDN2_DOMAIN_MAX_LENGTH)
588 : {
589 0 : free (src_allocated);
590 0 : return IDN2_TOO_BIG_DOMAIN;
591 : }
592 :
593 8864 : _lookupname[lookupnamelen] = '.';
594 8864 : lookupnamelen++;
595 : }
596 703520 : _lookupname[lookupnamelen] = '\0';
597 :
598 703520 : src = end;
599 : }
600 703520 : while (*src++);
601 :
602 694656 : free (src_allocated);
603 :
604 694656 : if (lookupname)
605 : {
606 694655 : uint8_t *tmp = (uint8_t *) malloc (lookupnamelen + 1);
607 :
608 694655 : if (tmp == NULL)
609 0 : return IDN2_MALLOC;
610 :
611 694655 : memcpy (tmp, _lookupname, lookupnamelen + 1);
612 694655 : *lookupname = tmp;
613 : }
614 :
615 694656 : return IDN2_OK;
616 : }
617 :
618 : /**
619 : * idn2_lookup_ul:
620 : * @src: input zero-terminated locale encoded string.
621 : * @lookupname: newly allocated output variable with name to lookup in DNS.
622 : * @flags: optional #idn2_flags to modify behaviour.
623 : *
624 : * Perform IDNA2008 lookup string conversion on domain name @src, as
625 : * described in section 5 of RFC 5891. Note that the input is assumed
626 : * to be encoded in the locale's default coding system, and will be
627 : * transcoded to UTF-8 and NFC normalized by this function.
628 : *
629 : * Pass %IDN2_ALABEL_ROUNDTRIP in @flags to
630 : * convert any input A-labels to U-labels and perform additional
631 : * testing. This is default since version 2.2.
632 : * To switch this behavior off, pass IDN2_NO_ALABEL_ROUNDTRIP
633 : *
634 : * Pass %IDN2_TRANSITIONAL to enable Unicode TR46 transitional processing,
635 : * and %IDN2_NONTRANSITIONAL to enable Unicode TR46 non-transitional
636 : * processing.
637 : *
638 : * Multiple flags may be specified by binary or:ing them together, for
639 : * example %IDN2_ALABEL_ROUNDTRIP | %IDN2_NONTRANSITIONAL.
640 : *
641 : * The %IDN2_NFC_INPUT in @flags is always enabled in this function.
642 : *
643 : * After version 0.11: @lookupname may be NULL to test lookup of @src
644 : * without allocating memory.
645 : *
646 : * Returns: On successful conversion %IDN2_OK is returned, if
647 : * conversion from locale to UTF-8 fails then %IDN2_ICONV_FAIL is
648 : * returned, if the output domain or any label would have been too
649 : * long %IDN2_TOO_BIG_DOMAIN or %IDN2_TOO_BIG_LABEL is returned, or
650 : * another error code is returned.
651 : *
652 : * Since: 0.1
653 : **/
654 : int
655 3725 : idn2_lookup_ul (const char *src, char **lookupname, int flags)
656 : {
657 3725 : uint8_t *utf8src = NULL;
658 : int rc;
659 :
660 3725 : if (src)
661 : {
662 3721 : const char *encoding = locale_charset ();
663 :
664 3721 : utf8src = u8_strconv_from_encoding (src, encoding, iconveh_error);
665 :
666 3721 : if (!utf8src)
667 : {
668 3252 : if (errno == ENOMEM)
669 0 : return IDN2_MALLOC;
670 3252 : return IDN2_ICONV_FAIL;
671 : }
672 : }
673 :
674 473 : rc = idn2_lookup_u8 (utf8src, (uint8_t **) lookupname,
675 : flags | IDN2_NFC_INPUT);
676 :
677 473 : free (utf8src);
678 :
679 473 : return rc;
680 : }
681 :
682 : /**
683 : * idn2_to_ascii_4i:
684 : * @input: zero terminated input Unicode (UCS-4) string.
685 : * @inlen: number of elements in @input.
686 : * @output: output zero terminated string that must have room for at
687 : * least 63 characters plus the terminating zero.
688 : * @flags: optional #idn2_flags to modify behaviour.
689 : *
690 : * The ToASCII operation takes a sequence of Unicode code points that make
691 : * up one domain label and transforms it into a sequence of code points in
692 : * the ASCII range (0..7F). If ToASCII succeeds, the original sequence and
693 : * the resulting sequence are equivalent labels.
694 : *
695 : * It is important to note that the ToASCII operation can fail.
696 : * ToASCII fails if any step of it fails. If any step of the
697 : * ToASCII operation fails on any label in a domain name, that domain
698 : * name MUST NOT be used as an internationalized domain name.
699 : * The method for dealing with this failure is application-specific.
700 : *
701 : * The inputs to ToASCII are a sequence of code points.
702 : *
703 : * ToASCII never alters a sequence of code points that are all in the ASCII
704 : * range to begin with (although it could fail). Applying the ToASCII operation multiple
705 : * times has exactly the same effect as applying it just once.
706 : *
707 : * The default behavior of this function (when flags are zero) is to apply
708 : * the IDNA2008 rules without the TR46 amendments. As the TR46
709 : * non-transitional processing is nowadays ubiquitous, when unsure, it is
710 : * recommended to call this function with the %IDN2_NONTRANSITIONAL
711 : * and the %IDN2_NFC_INPUT flags for compatibility with other software.
712 : *
713 : * Warning: With version 2.1.1 until before version 2.3.5 this
714 : * function was deprecated in favor of idn2_to_ascii_4i2(). We still
715 : * encourage you to use idn2_to_ascii_4i2() when appropriate.
716 : *
717 : * Returns: On successful conversion %IDN2_OK is returned; if the
718 : * output label would have been too long %IDN2_TOO_BIG_LABEL is
719 : * returned, or another error code is returned.
720 : *
721 : * Since: 2.0.0
722 : **/
723 : int
724 172 : idn2_to_ascii_4i (const uint32_t *input, size_t inlen, char *output,
725 : int flags)
726 : {
727 : char *out;
728 : int rc;
729 :
730 172 : if (!input)
731 : {
732 0 : if (output)
733 0 : *output = 0;
734 0 : return IDN2_OK;
735 : }
736 :
737 172 : rc = idn2_to_ascii_4i2 (input, inlen, &out, flags);
738 172 : if (rc == IDN2_OK)
739 : {
740 1 : size_t len = strlen (out);
741 :
742 1 : if (len > IDN2_LABEL_MAX_LENGTH)
743 0 : rc = IDN2_TOO_BIG_LABEL;
744 1 : else if (output)
745 1 : strcpy (output, out);
746 :
747 1 : free (out);
748 : }
749 :
750 172 : return rc;
751 : }
752 :
753 : /**
754 : * idn2_to_ascii_4i2:
755 : * @input: zero terminated input Unicode (UCS-4) string.
756 : * @inlen: number of elements in @input.
757 : * @output: pointer to newly allocated zero-terminated output string.
758 : * @flags: optional #idn2_flags to modify behaviour.
759 : *
760 : * The ToASCII operation takes a sequence of Unicode code points that make
761 : * up one domain label and transforms it into a sequence of code points in
762 : * the ASCII range (0..7F). If ToASCII succeeds, the original sequence and
763 : * the resulting sequence are equivalent labels.
764 : *
765 : * It is important to note that the ToASCII operation can fail.
766 : * ToASCII fails if any step of it fails. If any step of the
767 : * ToASCII operation fails on any label in a domain name, that domain
768 : * name MUST NOT be used as an internationalized domain name.
769 : * The method for dealing with this failure is application-specific.
770 : *
771 : * The inputs to ToASCII are a sequence of code points.
772 : *
773 : * ToASCII never alters a sequence of code points that are all in the ASCII
774 : * range to begin with (although it could fail). Applying the ToASCII operation multiple
775 : * times has exactly the same effect as applying it just once.
776 : *
777 : * The default behavior of this function (when flags are zero) is to apply
778 : * the IDNA2008 rules without the TR46 amendments. As the TR46
779 : * non-transitional processing is nowadays ubiquitous, when unsure, it is
780 : * recommended to call this function with the %IDN2_NONTRANSITIONAL
781 : * and the %IDN2_NFC_INPUT flags for compatibility with other software.
782 : *
783 : * Returns: On successful conversion %IDN2_OK is returned; if the
784 : * output label would have been too long %IDN2_TOO_BIG_LABEL is
785 : * returned, or another error code is returned.
786 : *
787 : * Since: 2.1.1
788 : **/
789 : int
790 1208 : idn2_to_ascii_4i2 (const uint32_t *input, size_t inlen, char **output,
791 : int flags)
792 : {
793 : uint32_t *input_u32;
794 : uint8_t *input_u8, *output_u8;
795 : size_t length;
796 : int rc;
797 :
798 1208 : if (!input)
799 : {
800 2 : if (output)
801 2 : *output = NULL;
802 2 : return IDN2_OK;
803 : }
804 :
805 1206 : input_u32 = (uint32_t *) malloc ((inlen + 1) * sizeof (uint32_t));
806 1206 : if (!input_u32)
807 0 : return IDN2_MALLOC;
808 :
809 1206 : u32_cpy (input_u32, input, inlen);
810 1206 : input_u32[inlen] = 0;
811 :
812 1206 : input_u8 = u32_to_u8 (input_u32, inlen + 1, NULL, &length);
813 1206 : free (input_u32);
814 1206 : if (!input_u8)
815 : {
816 1148 : if (errno == ENOMEM)
817 0 : return IDN2_MALLOC;
818 1148 : return IDN2_ENCODING_ERROR;
819 : }
820 :
821 58 : rc = idn2_lookup_u8 (input_u8, &output_u8, flags);
822 58 : free (input_u8);
823 :
824 58 : if (rc == IDN2_OK)
825 : {
826 16 : if (output)
827 16 : *output = (char *) output_u8;
828 : else
829 0 : free (output_u8);
830 : }
831 :
832 58 : return rc;
833 : }
834 :
835 : /**
836 : * idn2_to_ascii_4z:
837 : * @input: zero terminated input Unicode (UCS-4) string.
838 : * @output: pointer to newly allocated zero-terminated output string.
839 : * @flags: optional #idn2_flags to modify behaviour.
840 : *
841 : * Convert UCS-4 domain name to ASCII string using the IDNA2008
842 : * rules. The domain name may contain several labels, separated by dots.
843 : * The output buffer must be deallocated by the caller.
844 : *
845 : * The default behavior of this function (when flags are zero) is to apply
846 : * the IDNA2008 rules without the TR46 amendments. As the TR46
847 : * non-transitional processing is nowadays ubiquitous, when unsure, it is
848 : * recommended to call this function with the %IDN2_NONTRANSITIONAL
849 : * and the %IDN2_NFC_INPUT flags for compatibility with other software.
850 : *
851 : * Return value: Returns %IDN2_OK on success, or error code.
852 : *
853 : * Since: 2.0.0
854 : **/
855 : int
856 1034 : idn2_to_ascii_4z (const uint32_t *input, char **output, int flags)
857 : {
858 : uint8_t *input_u8;
859 : size_t length;
860 : int rc;
861 :
862 1034 : if (!input)
863 : {
864 0 : if (output)
865 0 : *output = NULL;
866 0 : return IDN2_OK;
867 : }
868 :
869 1034 : input_u8 = u32_to_u8 (input, u32_strlen (input) + 1, NULL, &length);
870 1034 : if (!input_u8)
871 : {
872 972 : if (errno == ENOMEM)
873 0 : return IDN2_MALLOC;
874 972 : return IDN2_ENCODING_ERROR;
875 : }
876 :
877 62 : rc = idn2_lookup_u8 (input_u8, (uint8_t **) output, flags);
878 62 : free (input_u8);
879 :
880 62 : return rc;
881 : }
882 :
883 : /**
884 : * idn2_to_ascii_8z:
885 : * @input: zero terminated input UTF-8 string.
886 : * @output: pointer to newly allocated output string.
887 : * @flags: optional #idn2_flags to modify behaviour.
888 : *
889 : * Convert UTF-8 domain name to ASCII string using the IDNA2008
890 : * rules. The domain name may contain several labels, separated by dots.
891 : * The output buffer must be deallocated by the caller.
892 : *
893 : * The default behavior of this function (when flags are zero) is to apply
894 : * the IDNA2008 rules without the TR46 amendments. As the TR46
895 : * non-transitional processing is nowadays ubiquitous, when unsure, it is
896 : * recommended to call this function with the %IDN2_NONTRANSITIONAL
897 : * and the %IDN2_NFC_INPUT flags for compatibility with other software.
898 : *
899 : * Return value: Returns %IDN2_OK on success, or error code.
900 : *
901 : * Since: 2.0.0
902 : **/
int
idn2_to_ascii_8z (const char *input, char **output, int flags)
{
  /* Same operation as idn2_lookup_u8, with plain char pointers.  */
  const uint8_t *utf8_in = (const uint8_t *) input;
  uint8_t **ascii_out = (uint8_t **) output;

  return idn2_lookup_u8 (utf8_in, ascii_out, flags);
}
908 :
909 : /**
910 : * idn2_to_ascii_lz:
911 : * @input: zero terminated input string in the locale's encoding.
912 : * @output: pointer to newly allocated output string.
913 : * @flags: optional #idn2_flags to modify behaviour.
914 : *
915 : * Convert a domain name in locale's encoding to ASCII string using the IDNA2008
916 : * rules. The domain name may contain several labels, separated by dots.
917 : * The output buffer must be deallocated by the caller.
918 : *
919 : * The default behavior of this function (when flags are zero) is to apply
920 : * the IDNA2008 rules without the TR46 amendments. As the TR46
921 : * non-transitional processing is nowadays ubiquitous, when unsure, it is
922 : * recommended to call this function with the %IDN2_NONTRANSITIONAL
923 : * and the %IDN2_NFC_INPUT flags for compatibility with other software.
924 : *
925 : * Returns: %IDN2_OK on success, or error code.
926 : * Same as described in idn2_lookup_ul() documentation.
927 : *
928 : * Since: 2.0.0
929 : **/
int
idn2_to_ascii_lz (const char *input, char **output, int flags)
{
  /* Plain alias: all of the work is done by idn2_lookup_ul.  */
  int status = idn2_lookup_ul (input, output, flags);

  return status;
}
|