Line data Source code
1 : /* punycode.c - punycode encoding/decoding
2 : Copyright (C) 2011-2024 Simon Josefsson
3 :
4 : Libidn2 is free software: you can redistribute it and/or modify it
5 : under the terms of either:
6 :
7 : * the GNU Lesser General Public License as published by the Free
8 : Software Foundation; either version 3 of the License, or (at
9 : your option) any later version.
10 :
11 : or
12 :
13 : * the GNU General Public License as published by the Free
14 : Software Foundation; either version 2 of the License, or (at
15 : your option) any later version.
16 :
17 : or both in parallel, as here.
18 :
19 : This program is distributed in the hope that it will be useful,
20 : but WITHOUT ANY WARRANTY; without even the implied warranty of
21 : MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
22 : GNU General Public License for more details.
23 :
24 : You should have received copies of the GNU General Public License and
25 : the GNU Lesser General Public License along with this program. If
26 : not, see <http://www.gnu.org/licenses/>.
27 : */
28 :
29 : /*
30 : Code copied from http://www.nicemice.net/idn/punycode-spec.gz on
31 : 2011-01-04 with SHA-1 a966a8017f6be579d74a50a226accc7607c40133
32 : labeled punycode-spec 1.0.3 (2006-Mar-23-Thu). It is modified for
33 : Libidn2 by Simon Josefsson and others. Original code license:
34 :
35 : punycode-spec 1.0.3 (2006-Mar-23-Thu)
36 : http://www.nicemice.net/idn/
37 : Adam M. Costello
38 : http://www.nicemice.net/amc/
39 :
40 : B. Disclaimer and license
41 :
42 : Regarding this entire document or any portion of it (including
43 : the pseudocode and C code), the author makes no guarantees and
44 : is not responsible for any damage resulting from its use. The
45 : author grants irrevocable permission to anyone to use, modify,
46 : and distribute it in any way that does not diminish the rights
47 : of anyone else to use, modify, and distribute it, provided that
48 : redistributed derivative works do not contain misleading author or
49 : version information. Derivative works need not be licensed under
50 : similar terms.
51 :
52 : C. Punycode sample implementation
53 :
54 : punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
55 : http://www.nicemice.net/idn/
56 : Adam M. Costello
57 : http://www.nicemice.net/amc/
58 :
59 : This is ANSI C code (C89) implementing Punycode 1.0.x.
60 : */
61 :
62 : #include <config.h>
63 :
64 : #include "idn2.h" /* IDN2_OK, ... */
65 :
66 : #include <string.h>
67 :
68 : /*** Bootstring parameters for Punycode ***/
69 :
70 : enum
71 : { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
72 : initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
73 : };
74 :
75 : /* basic(cp) tests whether cp is a basic code point: */
76 : #define encode_basic(cp) ((uint32_t)(cp) < 0x80)
77 :
78 : /* encode_digit(d,flag) returns the basic code point whose value */
79 : /* (when used for representing integers) is d, which needs to be in */
80 : /* the range 0 to base-1. The lowercase form is used unless flag is */
81 : /* nonzero, in which case the uppercase form is used. The behavior */
82 : /* is undefined if flag is nonzero and digit d has no uppercase form. */
83 :
84 : static char
85 2664250 : encode_digit (uint32_t d, int flag)
86 : {
87 2664250 : return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
88 : /* 0..25 map to ASCII a..z or A..Z */
89 : /* 26..35 map to ASCII 0..9 */
90 : }
91 :
92 : /* basic(cp) tests whether cp is a basic code point: */
93 : #define decode_basic(cp) \
94 : ((cp >= 'a' && cp <= 'z') || (cp >= '0' && cp <='9') \
95 : || (cp >= 'A' && cp <='Z') || cp == '-' || cp == '_')
96 :
97 : /* decode_digit(cp) returns the numeric value of a basic code */
98 : /* point (for use in representing integers) in the range 0 to */
99 : /* base-1, or base if cp does not represent a value. */
100 :
101 : static unsigned
102 2749299 : decode_digit (int cp)
103 : {
104 2749299 : if (cp >= 'a' && cp <= 'z')
105 1757205 : return cp - 'a';
106 992094 : if (cp >= '0' && cp <= '9')
107 988345 : return cp - '0' + 26;
108 3749 : if (cp >= 'A' && cp <= 'Z')
109 3744 : return cp - 'A';
110 :
111 5 : return 0;
112 : }
113 :
114 : /*** Bias adaptation function ***/
115 :
116 : static uint32_t
117 : adapt (uint32_t delta, uint32_t numpoints, int firsttime)
118 : _GL_ATTRIBUTE_CONST;
119 :
120 1454751 : static uint32_t adapt (uint32_t delta, uint32_t numpoints, int firsttime)
121 : {
122 : uint32_t k;
123 :
124 1454751 : delta = firsttime ? delta / damp : delta >> 1;
125 : /* delta >> 1 is a faster way of doing delta / 2 */
126 1454751 : delta += delta / numpoints;
127 :
128 1892940 : for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
129 : {
130 438189 : delta /= base - tmin;
131 : }
132 :
133 1454751 : return k + (base - tmin + 1) * delta / (delta + skew);
134 : }
135 :
136 : /**
137 : * idn2_punycode_encode:
138 : * @input: array of input Unicode (UCS-4) code points
139 : * @input_length: number of code points in the @input array.
140 : * @output: output character array with ASCII code points.
141 : * @output_length: size of @output array
142 : *
143 : * Converts a sequence of code points to Punycode.
144 : *
145 : * Warning: Be aware that is rare for applications to need to perform
146 : * Punycode operations, and you should consider idn2_to_ascii_8z(),
147 : * idn2_to_ascii_lz(), idn2_to_ascii_4z() etc.
148 : *
149 : * Returns: On successful encoding %IDN2_OK is returned, or an error
150 : * codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,
151 : * or %IDN2_PUNYCODE_OVERFLOW.
152 : *
153 : * Since: 2.3.5
154 : **/
155 : int
156 693053 : idn2_punycode_encode (const uint32_t *input,
157 : size_t input_length,
158 : char *output, size_t *output_length)
159 : {
160 : uint32_t our_input_length, n, delta, h, b, bias, j, m, q, k, t;
161 : size_t out, max_out;
162 :
163 : /* The Punycode spec assumes that the input length is the same type */
164 : /* of integer as a code point, so we need to convert the size_t to */
165 : /* a uint32_t, which could overflow. */
166 :
167 693053 : if (input_length > UINT32_MAX)
168 0 : return IDN2_PUNYCODE_OVERFLOW;
169 693053 : our_input_length = (uint32_t) input_length;
170 :
171 : /* Initialize the state: */
172 :
173 693053 : n = initial_n;
174 693053 : delta = 0;
175 693053 : out = 0;
176 693053 : max_out = *output_length;
177 693053 : bias = initial_bias;
178 :
179 : /* Handle the basic code points: */
180 :
181 1426995 : for (j = 0; j < our_input_length; ++j)
182 : {
183 734001 : if (encode_basic (input[j]))
184 : {
185 19337 : if (max_out - out < 2)
186 59 : return IDN2_PUNYCODE_BIG_OUTPUT;
187 19278 : output[out++] = (char) input[j];
188 : }
189 714664 : else if (input[j] > 0x10FFFF
190 714664 : || (input[j] >= 0xD800 && input[j] <= 0xDBFF))
191 0 : return IDN2_PUNYCODE_BAD_INPUT;
192 : }
193 :
194 692994 : h = b = (uint32_t) out;
195 : /* cannot overflow because out <= our_input_length <= UINT32_MAX */
196 :
197 : /* h is the number of code points that have been handled, b is the */
198 : /* number of basic code points, and out is the number of ASCII code */
199 : /* points that have been output. */
200 :
201 692994 : if (b > 0)
202 3382 : output[out++] = delimiter;
203 :
204 : /* Main encoding loop: */
205 :
206 1397772 : while (h < our_input_length)
207 : {
208 : /* All non-basic code points < n have been */
209 : /* handled already. Find the next larger one: */
210 :
211 1530292 : for (m = UINT32_MAX, j = 0; j < our_input_length; ++j)
212 : {
213 : /* if (basic(input[j])) continue; */
214 : /* (not needed for Punycode) */
215 825406 : if (input[j] >= n && input[j] < m)
216 713436 : m = input[j];
217 : }
218 :
219 : /* Increase delta enough to advance the decoder's */
220 : /* <n,i> state to <m,0>, but guard against overflow: */
221 :
222 704886 : if (m - n > (UINT32_MAX - delta) / (h + 1))
223 0 : return IDN2_PUNYCODE_OVERFLOW;
224 704886 : delta += (m - n) * (h + 1);
225 704886 : n = m;
226 :
227 1525690 : for (j = 0; j < our_input_length; ++j)
228 : {
229 : /* Punycode does not need to check whether input[j] is basic: */
230 820912 : if (input[j] < n /* || basic(input[j]) */ )
231 : {
232 64410 : if (++delta == 0)
233 0 : return IDN2_PUNYCODE_OVERFLOW;
234 : }
235 :
236 820912 : if (input[j] == n)
237 : {
238 : /* Represent delta as a generalized variable-length integer: */
239 :
240 713260 : for (q = delta, k = base;; k += base)
241 : {
242 2664358 : if (out >= max_out)
243 108 : return IDN2_PUNYCODE_BIG_OUTPUT;
244 3940585 : t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
245 1276335 : k >= bias + tmax ? tmax : k - bias;
246 2664250 : if (q < t)
247 713152 : break;
248 1951098 : output[out++] = encode_digit (t + (q - t) % (base - t), 0);
249 1951098 : q = (q - t) / (base - t);
250 : }
251 :
252 713152 : output[out++] = encode_digit (q, 0);
253 713152 : bias = adapt (delta, h + 1, h == b);
254 713152 : delta = 0;
255 713152 : ++h;
256 : }
257 : }
258 :
259 704778 : ++delta, ++n;
260 : }
261 :
262 692886 : *output_length = out;
263 692886 : return IDN2_OK;
264 : }
265 :
266 : /**
267 : * idn2_punycode_decode:
268 : * @input: array of ASCII characters (0..7F)
269 : * @input_length: Number of ASCII characters in the @input array.
270 : * @output: output character array with ASCII code points.
271 : * @output_length: on input, guarantee that @output has room
272 : * for this many code points; on output, *@output_length
273 : * holds number of code points in @output.
274 : *
275 : * Converts Punycode to a sequence of code points.
276 : *
277 : * The decoder will never need to output more code points than the
278 : * number of ASCII code points in the input, because of the way the
279 : * encoding is defined.
280 : *
281 : * The number of code points output cannot exceed the maximum possible
282 : * value of a uint32_t, even if the supplied @output_length is greater
283 : * than that.
284 : *
285 : * Warning: Be aware that is rare for applications to need to perform
286 : * Punycode operations, and you should consider
287 : * idn2_to_unicode_8z8z(), idn2_to_unicode_lzlz() etc.
288 : *
289 : * Returns: On successful encoding %IDN2_OK is returned, or an error
290 : * codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,
291 : * or %IDN2_PUNYCODE_OVERFLOW.
292 : *
293 : * Since: 2.3.5
294 : **/
295 : int
296 703898 : idn2_punycode_decode (const char *input,
297 : size_t input_length,
298 : uint32_t *output, size_t *output_length)
299 : {
300 703898 : uint32_t n, out = 0, i, max_out, bias, oldi, w, k, digit, t;
301 703898 : size_t b = 0, j, in;
302 :
303 703898 : if (!input_length)
304 136 : return IDN2_PUNYCODE_BAD_INPUT;
305 :
306 : /* Check that all chars are basic */
307 3480540 : for (j = 0; j < input_length; ++j)
308 : {
309 2778106 : if (!decode_basic (input[j]))
310 1328 : return IDN2_PUNYCODE_BAD_INPUT;
311 2776778 : if (input[j] == delimiter)
312 8054 : b = j;
313 : }
314 :
315 702434 : max_out =
316 702434 : *output_length > UINT32_MAX ? UINT32_MAX : (uint32_t) * output_length;
317 :
318 702434 : if (input[b] == delimiter)
319 : {
320 : /* do not accept leading or trailing delimiter
321 : * - leading delim must be omitted if there is no ASCII char in u-label
322 : * - trailing delim means there where no non-ASCII chars in u-label
323 : */
324 5066 : if (!b || b == input_length - 1)
325 141 : return IDN2_PUNYCODE_BAD_INPUT;
326 :
327 4925 : if (b >= max_out)
328 16 : return IDN2_PUNYCODE_BIG_OUTPUT;
329 :
330 : /* Check that all chars before last delimiter are basic chars */
331 : /* and copy the first b code points to the output. */
332 21160 : for (j = 0; j < b; j++)
333 16251 : output[out++] = input[j];
334 :
335 4909 : b += 1; /* advance to non-basic char encoding */
336 : }
337 :
338 : /* Initialize the state: */
339 702277 : n = initial_n;
340 702277 : i = 0;
341 702277 : bias = initial_bias;
342 :
343 : /* Main decoding loop: Start just after the last delimiter if any */
344 : /* basic code points were copied; start at the beginning otherwise. */
345 1443821 : for (in = b; in < input_length; ++out)
346 : {
347 :
348 : /* in is the index of the next ASCII code point to be consumed, */
349 : /* and out is the number of code points in the output array. */
350 :
351 : /* Decode a generalized variable-length integer into delta, */
352 : /* which gets added to i. The overflow checking is easier */
353 : /* if we increase i as we go, then subtract off its starting */
354 : /* value at the end to obtain delta. */
355 741704 : for (oldi = i, w = 1, k = base;; k += base)
356 : {
357 2749381 : if (in >= input_length)
358 82 : return IDN2_PUNYCODE_BAD_INPUT;
359 2749299 : digit = decode_digit (input[in++]);
360 2749299 : if (digit >= base)
361 0 : return IDN2_PUNYCODE_BAD_INPUT;
362 2749299 : if (digit > (UINT32_MAX - i) / w)
363 23 : return IDN2_PUNYCODE_OVERFLOW;
364 2749276 : i += digit * w;
365 4085850 : t = k <= bias /* + tmin */ ? tmin : /* +tmin not needed */
366 1336574 : k >= bias + tmax ? tmax : k - bias;
367 2749276 : if (digit < t)
368 741599 : break;
369 2007677 : if (w > UINT32_MAX / (base - t))
370 0 : return IDN2_PUNYCODE_OVERFLOW;
371 2007677 : w *= (base - t);
372 : }
373 :
374 741599 : bias = adapt (i - oldi, out + 1, oldi == 0);
375 :
376 : /* i was supposed to wrap around from out+1 to 0, */
377 : /* incrementing n each time, so we'll fix that now: */
378 741599 : if (i / (out + 1) > UINT32_MAX - n)
379 0 : return IDN2_PUNYCODE_OVERFLOW;
380 741599 : n += i / (out + 1);
381 741599 : if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDBFF))
382 37 : return IDN2_PUNYCODE_BAD_INPUT;
383 741562 : i %= (out + 1);
384 :
385 : /* Insert n at position i of the output: */
386 :
387 : /* not needed for Punycode: */
388 : /* if (basic(n)) return IDN2_PUNYCODE_BAD_INPUT; */
389 741562 : if (out >= max_out)
390 18 : return IDN2_PUNYCODE_BIG_OUTPUT;
391 :
392 741544 : memmove (output + i + 1, output + i, (out - i) * sizeof *output);
393 741544 : output[i++] = n;
394 : }
395 :
396 702117 : *output_length = (size_t) out;
397 : /* cannot overflow because out <= old value of *output_length */
398 702117 : return IDN2_OK;
399 : }
400 :
401 : /* We are stuck exporting these old internal interfaces because old
402 : versions of GNUTLS used them, and maybe other software. */
403 :
404 : extern int _IDN2_API _idn2_punycode_decode (size_t input_length,
405 : const char input[],
406 : size_t *output_length,
407 : uint32_t output[]);
408 : extern int _IDN2_API _idn2_punycode_encode (size_t input_length,
409 : const uint32_t input[],
410 : size_t *output_length,
411 : char output[]);
412 :
413 : int
414 1 : _idn2_punycode_decode (size_t input_length,
415 : const char input[],
416 : size_t *output_length, uint32_t output[])
417 : {
418 1 : return idn2_punycode_decode (input, input_length, output, output_length);
419 :
420 : }
421 :
422 : int
423 1 : _idn2_punycode_encode (size_t input_length,
424 : const uint32_t input[],
425 : size_t *output_length, char output[])
426 : {
427 1 : return idn2_punycode_encode (input, input_length, output, output_length);
428 : }
|