LCOV - code coverage report
Current view: top level - builds/libidn/libidn2/lib - punycode.c (source / functions) Hit Total Coverage
Test: Libidn2-2.3.7.3-61ee Code Coverage Lines: 115 122 94.3 %
Date: 2024-12-01 07:08:19 Functions: 7 7 100.0 %
Legend: Lines: hit not hit

          Line data    Source code
       1             : /* punycode.c - punycode encoding/decoding
       2             :    Copyright (C) 2011-2024 Simon Josefsson
       3             : 
       4             :    Libidn2 is free software: you can redistribute it and/or modify it
       5             :    under the terms of either:
       6             : 
       7             :      * the GNU Lesser General Public License as published by the Free
       8             :        Software Foundation; either version 3 of the License, or (at
       9             :        your option) any later version.
      10             : 
      11             :    or
      12             : 
      13             :      * the GNU General Public License as published by the Free
      14             :        Software Foundation; either version 2 of the License, or (at
      15             :        your option) any later version.
      16             : 
      17             :    or both in parallel, as here.
      18             : 
      19             :    This program is distributed in the hope that it will be useful,
      20             :    but WITHOUT ANY WARRANTY; without even the implied warranty of
      21             :    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
      22             :    GNU General Public License for more details.
      23             : 
      24             :    You should have received copies of the GNU General Public License and
      25             :    the GNU Lesser General Public License along with this program.  If
      26             :    not, see <http://www.gnu.org/licenses/>.
      27             : */
      28             : 
      29             : /*
      30             :   Code copied from http://www.nicemice.net/idn/punycode-spec.gz on
      31             :   2011-01-04 with SHA-1 a966a8017f6be579d74a50a226accc7607c40133
      32             :   labeled punycode-spec 1.0.3 (2006-Mar-23-Thu).  It is modified for
      33             :   Libidn2 by Simon Josefsson and others.  Original code license:
      34             : 
      35             :   punycode-spec 1.0.3 (2006-Mar-23-Thu)
      36             :   http://www.nicemice.net/idn/
      37             :   Adam M. Costello
      38             :   http://www.nicemice.net/amc/
      39             : 
      40             :   B. Disclaimer and license
      41             : 
      42             :     Regarding this entire document or any portion of it (including
      43             :     the pseudocode and C code), the author makes no guarantees and
      44             :     is not responsible for any damage resulting from its use.  The
      45             :     author grants irrevocable permission to anyone to use, modify,
      46             :     and distribute it in any way that does not diminish the rights
      47             :     of anyone else to use, modify, and distribute it, provided that
      48             :     redistributed derivative works do not contain misleading author or
      49             :     version information.  Derivative works need not be licensed under
      50             :     similar terms.
      51             : 
      52             :   C. Punycode sample implementation
      53             : 
      54             :   punycode-sample.c 2.0.0 (2004-Mar-21-Sun)
      55             :   http://www.nicemice.net/idn/
      56             :   Adam M. Costello
      57             :   http://www.nicemice.net/amc/
      58             : 
      59             :   This is ANSI C code (C89) implementing Punycode 1.0.x.
      60             : */
      61             : 
      62             : #include <config.h>
      63             : 
      64             : #include "idn2.h"             /* IDN2_OK, ... */
      65             : 
      66             : #include <string.h>
      67             : 
      68             : /*** Bootstring parameters for Punycode ***/
      69             : 
      70             : enum
      71             : { base = 36, tmin = 1, tmax = 26, skew = 38, damp = 700,
      72             :   initial_bias = 72, initial_n = 0x80, delimiter = 0x2D
      73             : };
      74             : 
      75             : /* basic(cp) tests whether cp is a basic code point: */
      76             : #define encode_basic(cp) ((uint32_t)(cp) < 0x80)
      77             : 
      78             : /* encode_digit(d,flag) returns the basic code point whose value      */
      79             : /* (when used for representing integers) is d, which needs to be in   */
      80             : /* the range 0 to base-1.  The lowercase form is used unless flag is  */
      81             : /* nonzero, in which case the uppercase form is used.  The behavior   */
      82             : /* is undefined if flag is nonzero and digit d has no uppercase form. */
      83             : 
      84             : static char
      85     2664250 : encode_digit (uint32_t d, int flag)
      86             : {
      87     2664250 :   return d + 22 + 75 * (d < 26) - ((flag != 0) << 5);
      88             :   /*  0..25 map to ASCII a..z or A..Z */
      89             :   /* 26..35 map to ASCII 0..9         */
      90             : }
      91             : 
      92             : /* basic(cp) tests whether cp is a basic code point: */
      93             : #define decode_basic(cp)                               \
      94             :   ((cp >= 'a' && cp <= 'z') || (cp >= '0' && cp <='9')              \
      95             :    || (cp >= 'A' && cp <='Z') || cp == '-' || cp == '_')
      96             : 
      97             : /* decode_digit(cp) returns the numeric value of a basic code */
      98             : /* point (for use in representing integers) in the range 0 to */
      99             : /* base-1, or base if cp does not represent a value.          */
     100             : 
     101             : static unsigned
     102     2749299 : decode_digit (int cp)
     103             : {
     104     2749299 :   if (cp >= 'a' && cp <= 'z')
     105     1757205 :     return cp - 'a';
     106      992094 :   if (cp >= '0' && cp <= '9')
     107      988345 :     return cp - '0' + 26;
     108        3749 :   if (cp >= 'A' && cp <= 'Z')
     109        3744 :     return cp - 'A';
     110             : 
     111           5 :   return 0;
     112             : }
     113             : 
     114             : /*** Bias adaptation function ***/
     115             : 
     116             : static uint32_t
     117             : adapt (uint32_t delta, uint32_t numpoints, int firsttime)
     118             :   _GL_ATTRIBUTE_CONST;
     119             : 
     120     1454751 :      static uint32_t adapt (uint32_t delta, uint32_t numpoints, int firsttime)
     121             : {
     122             :   uint32_t k;
     123             : 
     124     1454751 :   delta = firsttime ? delta / damp : delta >> 1;
     125             :   /* delta >> 1 is a faster way of doing delta / 2 */
     126     1454751 :   delta += delta / numpoints;
     127             : 
     128     1892940 :   for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
     129             :     {
     130      438189 :       delta /= base - tmin;
     131             :     }
     132             : 
     133     1454751 :   return k + (base - tmin + 1) * delta / (delta + skew);
     134             : }
     135             : 
     136             : /**
     137             :  * idn2_punycode_encode:
     138             :  * @input: array of input Unicode (UCS-4) code points
     139             :  * @input_length: number of code points in the @input array.
     140             :  * @output: output character array with ASCII code points.
     141             :  * @output_length: size of @output array
     142             :  *
     143             :  * Converts a sequence of code points to Punycode.
     144             :  *
     145             :  * Warning: Be aware that is rare for applications to need to perform
     146             :  *   Punycode operations, and you should consider idn2_to_ascii_8z(),
     147             :  *   idn2_to_ascii_lz(), idn2_to_ascii_4z() etc.
     148             :  *
     149             :  * Returns: On successful encoding %IDN2_OK is returned, or an error
     150             :  *   codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,
     151             :  *   or %IDN2_PUNYCODE_OVERFLOW.
     152             :  *
     153             :  * Since: 2.3.5
     154             :  **/
     155             : int
     156      693053 : idn2_punycode_encode (const uint32_t *input,
     157             :                       size_t input_length,
     158             :                       char *output, size_t *output_length)
     159             : {
     160             :   uint32_t our_input_length, n, delta, h, b, bias, j, m, q, k, t;
     161             :   size_t out, max_out;
     162             : 
     163             :   /* The Punycode spec assumes that the input length is the same type */
     164             :   /* of integer as a code point, so we need to convert the size_t to  */
     165             :   /* a uint32_t, which could overflow.                           */
     166             : 
     167      693053 :   if (input_length > UINT32_MAX)
     168           0 :     return IDN2_PUNYCODE_OVERFLOW;
     169      693053 :   our_input_length = (uint32_t) input_length;
     170             : 
     171             :   /* Initialize the state: */
     172             : 
     173      693053 :   n = initial_n;
     174      693053 :   delta = 0;
     175      693053 :   out = 0;
     176      693053 :   max_out = *output_length;
     177      693053 :   bias = initial_bias;
     178             : 
     179             :   /* Handle the basic code points: */
     180             : 
     181     1426995 :   for (j = 0; j < our_input_length; ++j)
     182             :     {
     183      734001 :       if (encode_basic (input[j]))
     184             :         {
     185       19337 :           if (max_out - out < 2)
     186          59 :             return IDN2_PUNYCODE_BIG_OUTPUT;
     187       19278 :           output[out++] = (char) input[j];
     188             :         }
     189      714664 :       else if (input[j] > 0x10FFFF
     190      714664 :                || (input[j] >= 0xD800 && input[j] <= 0xDBFF))
     191           0 :         return IDN2_PUNYCODE_BAD_INPUT;
     192             :     }
     193             : 
     194      692994 :   h = b = (uint32_t) out;
     195             :   /* cannot overflow because out <= our_input_length <= UINT32_MAX */
     196             : 
     197             :   /* h is the number of code points that have been handled, b is the  */
     198             :   /* number of basic code points, and out is the number of ASCII code */
     199             :   /* points that have been output.                                    */
     200             : 
     201      692994 :   if (b > 0)
     202        3382 :     output[out++] = delimiter;
     203             : 
     204             :   /* Main encoding loop: */
     205             : 
     206     1397772 :   while (h < our_input_length)
     207             :     {
     208             :       /* All non-basic code points < n have been     */
     209             :       /* handled already.  Find the next larger one: */
     210             : 
     211     1530292 :       for (m = UINT32_MAX, j = 0; j < our_input_length; ++j)
     212             :         {
     213             :           /* if (basic(input[j])) continue; */
     214             :           /* (not needed for Punycode) */
     215      825406 :           if (input[j] >= n && input[j] < m)
     216      713436 :             m = input[j];
     217             :         }
     218             : 
     219             :       /* Increase delta enough to advance the decoder's    */
     220             :       /* <n,i> state to <m,0>, but guard against overflow: */
     221             : 
     222      704886 :       if (m - n > (UINT32_MAX - delta) / (h + 1))
     223           0 :         return IDN2_PUNYCODE_OVERFLOW;
     224      704886 :       delta += (m - n) * (h + 1);
     225      704886 :       n = m;
     226             : 
     227     1525690 :       for (j = 0; j < our_input_length; ++j)
     228             :         {
     229             :           /* Punycode does not need to check whether input[j] is basic: */
     230      820912 :           if (input[j] < n /* || basic(input[j]) */ )
     231             :             {
     232       64410 :               if (++delta == 0)
     233           0 :                 return IDN2_PUNYCODE_OVERFLOW;
     234             :             }
     235             : 
     236      820912 :           if (input[j] == n)
     237             :             {
     238             :               /* Represent delta as a generalized variable-length integer: */
     239             : 
     240      713260 :               for (q = delta, k = base;; k += base)
     241             :                 {
     242     2664358 :                   if (out >= max_out)
     243         108 :                     return IDN2_PUNYCODE_BIG_OUTPUT;
     244     3940585 :                   t = k <= bias /* + tmin */ ? tmin :        /* +tmin not needed */
     245     1276335 :                     k >= bias + tmax ? tmax : k - bias;
     246     2664250 :                   if (q < t)
     247      713152 :                     break;
     248     1951098 :                   output[out++] = encode_digit (t + (q - t) % (base - t), 0);
     249     1951098 :                   q = (q - t) / (base - t);
     250             :                 }
     251             : 
     252      713152 :               output[out++] = encode_digit (q, 0);
     253      713152 :               bias = adapt (delta, h + 1, h == b);
     254      713152 :               delta = 0;
     255      713152 :               ++h;
     256             :             }
     257             :         }
     258             : 
     259      704778 :       ++delta, ++n;
     260             :     }
     261             : 
     262      692886 :   *output_length = out;
     263      692886 :   return IDN2_OK;
     264             : }
     265             : 
     266             : /**
     267             :  * idn2_punycode_decode:
     268             :  * @input: array of ASCII characters (0..7F)
     269             :  * @input_length: Number of ASCII characters in the @input array.
     270             :  * @output: output character array with ASCII code points.
     271             :  * @output_length: on input, guarantee that @output has room
     272             :  *   for this many code points; on output, *@output_length
     273             :  *   holds number of code points in @output.
     274             :  *
     275             :  * Converts Punycode to a sequence of code points.
     276             :  *
     277             :  * The decoder will never need to output more code points than the
     278             :  * number of ASCII code points in the input, because of the way the
     279             :  * encoding is defined.
     280             :  *
     281             :  * The number of code points output cannot exceed the maximum possible
     282             :  * value of a uint32_t, even if the supplied @output_length is greater
     283             :  * than that.
     284             :  *
     285             :  * Warning: Be aware that is rare for applications to need to perform
     286             :  *   Punycode operations, and you should consider
     287             :  *   idn2_to_unicode_8z8z(), idn2_to_unicode_lzlz() etc.
     288             :  *
     289             :  * Returns: On successful encoding %IDN2_OK is returned, or an error
     290             :  *   codes like %IDN2_PUNYCODE_BAD_INPUT, %IDN2_PUNYCODE_BIG_OUTPUT,
     291             :  *   or %IDN2_PUNYCODE_OVERFLOW.
     292             :  *
     293             :  * Since: 2.3.5
     294             :  **/
     295             : int
     296      703898 : idn2_punycode_decode (const char *input,
     297             :                       size_t input_length,
     298             :                       uint32_t *output, size_t *output_length)
     299             : {
     300      703898 :   uint32_t n, out = 0, i, max_out, bias, oldi, w, k, digit, t;
     301      703898 :   size_t b = 0, j, in;
     302             : 
     303      703898 :   if (!input_length)
     304         136 :     return IDN2_PUNYCODE_BAD_INPUT;
     305             : 
     306             :   /* Check that all chars are basic */
     307     3480540 :   for (j = 0; j < input_length; ++j)
     308             :     {
     309     2778106 :       if (!decode_basic (input[j]))
     310        1328 :         return IDN2_PUNYCODE_BAD_INPUT;
     311     2776778 :       if (input[j] == delimiter)
     312        8054 :         b = j;
     313             :     }
     314             : 
     315      702434 :   max_out =
     316      702434 :     *output_length > UINT32_MAX ? UINT32_MAX : (uint32_t) * output_length;
     317             : 
     318      702434 :   if (input[b] == delimiter)
     319             :     {
     320             :       /* do not accept leading or trailing delimiter
     321             :        *   - leading delim must be omitted if there is no ASCII char in u-label
     322             :        *   - trailing delim means there where no non-ASCII chars in u-label
     323             :        */
     324        5066 :       if (!b || b == input_length - 1)
     325         141 :         return IDN2_PUNYCODE_BAD_INPUT;
     326             : 
     327        4925 :       if (b >= max_out)
     328          16 :         return IDN2_PUNYCODE_BIG_OUTPUT;
     329             : 
     330             :       /* Check that all chars before last delimiter are basic chars */
     331             :       /* and copy the first b code points to the output. */
     332       21160 :       for (j = 0; j < b; j++)
     333       16251 :         output[out++] = input[j];
     334             : 
     335        4909 :       b += 1;                   /* advance to non-basic char encoding */
     336             :     }
     337             : 
     338             :   /* Initialize the state: */
     339      702277 :   n = initial_n;
     340      702277 :   i = 0;
     341      702277 :   bias = initial_bias;
     342             : 
     343             :   /* Main decoding loop:  Start just after the last delimiter if any  */
     344             :   /* basic code points were copied; start at the beginning otherwise. */
     345     1443821 :   for (in = b; in < input_length; ++out)
     346             :     {
     347             : 
     348             :       /* in is the index of the next ASCII code point to be consumed, */
     349             :       /* and out is the number of code points in the output array.    */
     350             : 
     351             :       /* Decode a generalized variable-length integer into delta,  */
     352             :       /* which gets added to i.  The overflow checking is easier   */
     353             :       /* if we increase i as we go, then subtract off its starting */
     354             :       /* value at the end to obtain delta.                         */
     355      741704 :       for (oldi = i, w = 1, k = base;; k += base)
     356             :         {
     357     2749381 :           if (in >= input_length)
     358          82 :             return IDN2_PUNYCODE_BAD_INPUT;
     359     2749299 :           digit = decode_digit (input[in++]);
     360     2749299 :           if (digit >= base)
     361           0 :             return IDN2_PUNYCODE_BAD_INPUT;
     362     2749299 :           if (digit > (UINT32_MAX - i) / w)
     363          23 :             return IDN2_PUNYCODE_OVERFLOW;
     364     2749276 :           i += digit * w;
     365     4085850 :           t = k <= bias /* + tmin */ ? tmin :        /* +tmin not needed */
     366     1336574 :             k >= bias + tmax ? tmax : k - bias;
     367     2749276 :           if (digit < t)
     368      741599 :             break;
     369     2007677 :           if (w > UINT32_MAX / (base - t))
     370           0 :             return IDN2_PUNYCODE_OVERFLOW;
     371     2007677 :           w *= (base - t);
     372             :         }
     373             : 
     374      741599 :       bias = adapt (i - oldi, out + 1, oldi == 0);
     375             : 
     376             :       /* i was supposed to wrap around from out+1 to 0,   */
     377             :       /* incrementing n each time, so we'll fix that now: */
     378      741599 :       if (i / (out + 1) > UINT32_MAX - n)
     379           0 :         return IDN2_PUNYCODE_OVERFLOW;
     380      741599 :       n += i / (out + 1);
     381      741599 :       if (n > 0x10FFFF || (n >= 0xD800 && n <= 0xDBFF))
     382          37 :         return IDN2_PUNYCODE_BAD_INPUT;
     383      741562 :       i %= (out + 1);
     384             : 
     385             :       /* Insert n at position i of the output: */
     386             : 
     387             :       /* not needed for Punycode: */
     388             :       /* if (basic(n)) return IDN2_PUNYCODE_BAD_INPUT; */
     389      741562 :       if (out >= max_out)
     390          18 :         return IDN2_PUNYCODE_BIG_OUTPUT;
     391             : 
     392      741544 :       memmove (output + i + 1, output + i, (out - i) * sizeof *output);
     393      741544 :       output[i++] = n;
     394             :     }
     395             : 
     396      702117 :   *output_length = (size_t) out;
     397             :   /* cannot overflow because out <= old value of *output_length */
     398      702117 :   return IDN2_OK;
     399             : }
     400             : 
     401             : /* We are stuck exporting these old internal interfaces because old
     402             :    versions of GNUTLS used them, and maybe other software. */
     403             : 
     404             : extern int _IDN2_API _idn2_punycode_decode (size_t input_length,
     405             :                                             const char input[],
     406             :                                             size_t *output_length,
     407             :                                             uint32_t output[]);
     408             : extern int _IDN2_API _idn2_punycode_encode (size_t input_length,
     409             :                                             const uint32_t input[],
     410             :                                             size_t *output_length,
     411             :                                             char output[]);
     412             : 
     413             : int
     414           1 : _idn2_punycode_decode (size_t input_length,
     415             :                        const char input[],
     416             :                        size_t *output_length, uint32_t output[])
     417             : {
     418           1 :   return idn2_punycode_decode (input, input_length, output, output_length);
     419             : 
     420             : }
     421             : 
     422             : int
     423           1 : _idn2_punycode_encode (size_t input_length,
     424             :                        const uint32_t input[],
     425             :                        size_t *output_length, char output[])
     426             : {
     427           1 :   return idn2_punycode_encode (input, input_length, output, output_length);
     428             : }

Generated by: LCOV version 1.16