Watt-32 tcp/ip  2.2 dev-rel.10
punycode.c
1 /*
2  * punycode.c from RFC 3492
3  * http://www.nicemice.net/idn/
4  * Adam M. Costello
5  * http://www.nicemice.net/amc/
6  *
7  * This is ANSI C code (C89) implementing Punycode (RFC 3492).
8  */
9 
10 #include <stdio.h>
11 #include <stdlib.h>
12 #include <limits.h>
13 #include <string.h>
14 
15 #include "wattcp.h"
16 #include "misc.h"
17 #include "strings.h"
18 #include "punycode.h"
19 
20 #if defined(USE_IDNA) || defined(TEST_PROG)
21 
22 /*** Bootstring parameters for Punycode ***/
23 
24 enum {
25  base = 36, tmin = 1, tmax = 26,
26  skew = 38, damp = 700,
27  initial_bias = 72,
28  initial_n = 0x80,
29  delimiter = 0x2D
30 };
31 
32 /* basic(cp) tests whether cp is a basic code point:
33  */
34 #define basic(cp) ((DWORD)(cp) < 0x80)
35 
36 /* delim(cp) tests whether cp is a delimiter:
37  */
38 #define delim(cp) ((cp) == delimiter)
39 
40 /*
41  * decode_digit(cp) returns the numeric value of a basic code
42  * point (for use in representing integers) in the range 0 to
43  * base-1, or base if cp is does not represent a value.
44  */
45 static DWORD decode_digit (DWORD cp)
46 {
47  return (cp - 48 < 10 ?
48  cp - 22 : cp - 65 < 26 ?
49  cp - 65 : cp - 97 < 26 ?
50  cp - 97 : base);
51 }
52 
53 /*
54  * encode_digit(d,flag) returns the basic code point whose value
55  * (when used for representing integers) is d, which needs to be in
56  * the range 0 to base-1. The lowercase form is used unless flag is
57  * nonzero, in which case the uppercase form is used. The behavior
58  * is undefined if flag is nonzero and digit d has no uppercase form.
59  */
60 static char encode_digit (DWORD d, int flag)
61 {
62  return (char) (d + 22 + 75 * (d < 26) - ((flag != 0) << 5));
63  /* 0..25 map to ASCII a..z or A..Z */
64  /* 26..35 map to ASCII 0..9 */
65 }
66 
67 /* flagged(bcp) tests whether a basic code point is flagged
68  * (uppercase). The behavior is undefined if bcp is not a
69  * basic code point.
70  */
71 #define flagged(bcp) ((DWORD)(bcp) - 65 < 26)
72 
73 /*
74  * encode_basic(bcp,flag) forces a basic code point to lowercase
75  * if flag is zero, uppercase if flag is nonzero, and returns
76  * the resulting code point. The code point is unchanged if it
77  * is caseless. The behavior is undefined if bcp is not a basic
78  * code point.
79  */
80 static char encode_basic (DWORD bcp, int flag)
81 {
82  bcp -= (bcp - 97 < 26) << 5;
83  return (char) (bcp + ((!flag && (bcp - 65 < 26)) << 5));
84 }
85 
86 /* maxint is the maximum value of a DWORD variable:
87  */
88 static const DWORD maxint = (DWORD)-1;
89 
90 static DWORD adapt (DWORD delta, DWORD numpoints, int firsttime)
91 {
92  DWORD k;
93 
94  delta = firsttime ? delta / damp : delta >> 1;
95  /* delta >> 1 is a faster way of doing delta / 2
96  */
97  delta += delta / numpoints;
98 
99  for (k = 0; delta > ((base - tmin) * tmax) / 2; k += base)
100  delta /= base - tmin;
101  return k + (base - tmin + 1) * delta / (delta + skew);
102 }
103 
104 /*
105  *Main encode function
106  */
107 enum punycode_status punycode_encode (size_t input_length,
108  const DWORD *input,
109  const BYTE *case_flags,
110  size_t *output_length,
111  char *output)
112 {
113  DWORD n, delta, h, b, out, max_out, bias, j, m, q, k, t;
114 
115  /* Initialize the state: */
116 
117  n = initial_n;
118  delta = out = 0;
119  max_out = *output_length;
120  bias = initial_bias;
121 
122  /* Handle the basic code points: */
123  for (j = 0; j < input_length; ++j)
124  {
125  if (basic (input[j]))
126  {
127  if (max_out - out < 2)
128  return (punycode_big_output);
129  output[out++] = case_flags ? encode_basic (input[j], case_flags[j]) :
130  (char)input[j];
131  }
132 #if 0
133  else if (input[j] < n)
134  return (punycode_bad_input);
135  /* (not needed for Punycode with unsigned code points) */
136 #endif
137  }
138 
139  h = b = out;
140 
141  /* h is the number of code points that have been handled, b is the
142  * number of basic code points, and out is the number of characters
143  * that have been output.
144  */
145  if (b > 0)
146  output[out++] = delimiter;
147 
148  /* Main encoding loop:
149  */
150  while (h < input_length)
151  {
152  /* All non-basic code points < n have been
153  * handled already. Find the next larger one:
154  */
155  for (m = maxint, j = 0; j < input_length; ++j)
156  {
157 #if 0
158  if (basic(input[j]))
159  continue;
160  /* (not needed for Punycode) */
161 #endif
162  if (input[j] >= n && input[j] < m)
163  m = input[j];
164  }
165 
166  /* Increase delta enough to advance the decoder's
167  * <n,i> state to <m,0>, but guard against overflow:
168  */
169  if (m - n > (maxint - delta) / (h + 1))
170  return (punycode_overflow);
171 
172  delta += (m - n) * (h + 1);
173  n = m;
174 
175  for (j = 0; j < input_length; ++j)
176  {
177  if (input[j] < n)
178  {
179  if (++delta == 0)
180  return (punycode_overflow);
181  }
182 
183  if (input[j] == n)
184  {
185  /* Represent delta as a generalized variable-length integer:
186  */
187  for (q = delta, k = base;; k += base)
188  {
189  if (out >= max_out)
190  return (punycode_big_output);
191 
192  t = k <= bias ? tmin :
193  k >= bias + tmax ? tmax :
194  k - bias;
195  if (q < t)
196  break;
197  output[out++] = encode_digit (t + (q - t) % (base - t), 0);
198  q = (q - t) / (base - t);
199  }
200  output[out++] = encode_digit (q, case_flags && case_flags[j]);
201  bias = adapt (delta, h + 1, h == b);
202  delta = 0;
203  ++h;
204  }
205  }
206  ++delta;
207  ++n;
208  }
209 
210  *output_length = out;
211  return (punycode_success);
212 }
213 
214 /*** Main decode function ***/
215 
216 enum punycode_status punycode_decode (DWORD input_length,
217  const char *input,
218  size_t *output_length,
219  DWORD *output,
220  BYTE *case_flags)
221 {
222  DWORD n, out, i, max_out, bias, b, j, in, oldi, w, k, digit, t;
223 
224  /* Initialize the state: */
225 
226  n = initial_n;
227  out = i = 0;
228  max_out = *output_length;
229  bias = initial_bias;
230 
231  /* Handle the basic code points: Let b be the number of input code
232  * points before the last delimiter, or 0 if there is none, then
233  * copy the first b code points to the output.
234  */
235  for (b = j = 0; j < input_length; ++j)
236  if (delim (input[j]))
237  b = j;
238  if (b > max_out)
239  return (punycode_big_output);
240 
241  for (j = 0; j < b; ++j)
242  {
243  if (case_flags)
244  case_flags[out] = flagged (input[j]);
245  if (!basic (input[j]))
246  return (punycode_bad_input);
247  output[out++] = input[j];
248  }
249 
250  /* Main decoding loop: Start just after the last delimiter if any
251  * basic code points were copied; start at the beginning otherwise.
252  */
253  for (in = (b > 0) ? (b + 1) : 0; in < input_length; ++out)
254  {
255  /* in is the index of the next character to be consumed, and
256  * out is the number of code points in the output array.
257  */
258 
259  /* Decode a generalized variable-length integer into delta,
260  * which gets added to i. The overflow checking is easier
261  * if we increase i as we go, then subtract off its starting
262  * value at the end to obtain delta.
263  */
264  for (oldi = i, w = 1, k = base;; k += base)
265  {
266  if (in >= input_length)
267  return (punycode_bad_input);
268 
269  digit = decode_digit (input[in++]);
270  if (digit >= base)
271  return (punycode_bad_input);
272 
273  if (digit > ((maxint - i) / w))
274  return (punycode_overflow);
275 
276  i += digit * w;
277  t = k <= bias ? tmin :
278  k >= bias + tmax ? tmax :
279  k - bias;
280  if (digit < t)
281  break;
282  if (w > maxint / (base - t))
283  return (punycode_overflow);
284 
285  w *= (base - t);
286  }
287 
288  bias = adapt (i - oldi, out + 1, oldi == 0);
289 
290  /* i was supposed to wrap around from out+1 to 0,
291  * incrementing n each time, so we'll fix that now:
292  */
293  if (i / (out + 1) > maxint - n)
294  return (punycode_overflow);
295 
296  n += i / (out + 1);
297  i %= (out + 1);
298 
299  /* Insert n at position i of the output:
300  */
301 #if 0
302  /* not needed for Punycode: */
303  if (decode_digit(n) <= base)
304  return (punycode_invalid_input);
305 #endif
306  if (out >= max_out)
307  return (punycode_big_output);
308 
309  if (case_flags)
310  {
311  memmove (case_flags + i + 1, case_flags + i, out - i);
312 
313  /* Case of last character determines uppercase flag:
314  */
315  case_flags[i] = flagged (input[in - 1]);
316  }
317  memmove (output + i + 1, output + i, (out - i) * sizeof *output);
318  output[i++] = n;
319  }
320 
321  *output_length = out;
322  return (punycode_success);
323 }
324 #endif /* USE_IDNA || TEST_PROG */
325 
326 
327 #if defined(TEST_PROG)
328 
329 /* For testing, we'll just set some compile-time limits rather than
330  * use malloc(), and set a compile-time option rather than using a
331  * command-line option.
332  */
333 #define UNICODE_MAX_LENGTH 256
334 #define ACE_MAX_LENGTH 256
335 
336 static void usage (char **argv)
337 {
338  fprintf (stderr,
339  "\n"
340  "%s -e reads code points and writes a Punycode string.\n"
341  "%s -d reads a Punycode string and writes code points.\n"
342  "\n"
343  "Input and output are plain text in the native character set.\n"
344  "Code points are in the form u+hex separated by whitespace.\n"
345  "Although the specification allows Punycode strings to contain\n"
346  "any characters from the ASCII repertoire, this test code\n"
347  "supports only the printable characters, and needs the Punycode\n"
348  "string to be followed by a newline.\n"
349  "The case of the u in u+hex is the force-to-uppercase flag.\n",
350  argv[0], argv[0]);
351  exit (EXIT_FAILURE);
352 }
353 
354 #define fail(msg) __fail (__LINE__, msg)
355 static void __fail (unsigned line, const char *msg)
356 {
357  fprintf (stderr, "line %u: %s", line, msg);
358  exit (EXIT_FAILURE);
359 }
360 
361 static const char too_big[] = "input or output is too large, recompile with larger limits\n";
362 static const char invalid_input[] = "invalid input\n";
363 static const char overflow[] = "arithmetic overflow\n";
364 static const char io_error[] = "I/O error\n";
365 
366 /* The following string is used to convert printable
367  * characters between ASCII and the native charset:
368  */
369 static const char print_ascii[] = "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
370  "\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n"
371  " !\"#$%&'()*+,-./" "0123456789:;<=>?"
372  "@ABCDEFGHIJKLMNO"
373  "PQRSTUVWXYZ[\\]^_"
374  "`abcdefghijklmno"
375  "pqrstuvwxyz{|}~\n";
376 
377 int main (int argc, char **argv)
378 {
379  enum punycode_status status;
380  size_t input_length, output_length, j;
381  BYTE case_flags [UNICODE_MAX_LENGTH];
382  int r;
383 
384  if (argc != 2 || argv[1][0] != '-' || argv[1][2] != 0)
385  usage (argv);
386 
387  if (argv[1][1] == 'e')
388  {
389  DWORD input [UNICODE_MAX_LENGTH], codept;
390  char output [ACE_MAX_LENGTH+1], uplus[3];
391  int c;
392 
393  /* Read the input code points:
394  */
395  input_length = 0;
396 
397  for (;;)
398  {
399  r = scanf ("%2s%lx", uplus, &codept);
400  if (ferror (stdin))
401  fail (io_error);
402  if (r == EOF || r == 0)
403  break;
404 
405  if (r != 2 || uplus[1] != '+' || codept > (DWORD) - 1)
406  fail (invalid_input);
407 
408  if (input_length == UNICODE_MAX_LENGTH)
409  fail (too_big);
410 
411  if (uplus[0] == 'u')
412  case_flags[input_length] = 0;
413  else if (uplus[0] == 'U')
414  case_flags[input_length] = 1;
415  else fail (invalid_input);
416 
417  input[input_length++] = codept;
418  }
419 
420  /* Encode:
421  */
422  output_length = ACE_MAX_LENGTH;
423  status = punycode_encode (input_length, input, case_flags,
424  &output_length, output);
425  if (status == punycode_bad_input)
426  fail (invalid_input);
427  if (status == punycode_big_output)
428  fail (too_big);
429  if (status == punycode_overflow)
430  fail (overflow);
431  WATT_ASSERT (status == punycode_success);
432 
433  /* Convert to native charset and output:
434  */
435  for (j = 0; j < output_length; ++j)
436  {
437  c = output[j];
438  WATT_ASSERT (c >= 0 && c <= 127);
439  if (print_ascii[c] == 0)
440  fail (invalid_input);
441  output[j] = print_ascii[c];
442  }
443 
444  output[j] = '\0';
445  r = puts (output);
446  if (r == EOF)
447  fail (io_error);
448  return (EXIT_SUCCESS);
449  }
450 
451  if (argv[1][1] == 'd')
452  {
453  char input [ACE_MAX_LENGTH+2], *p, *pp;
454  DWORD output [UNICODE_MAX_LENGTH];
455 
456  /* Read the Punycode input string and convert to ASCII: */
457 
458  fgets (input, ACE_MAX_LENGTH+2, stdin);
459  if (ferror (stdin))
460  fail (io_error);
461  if (feof (stdin))
462  fail (invalid_input);
463  input_length = strlen (input) - 1;
464  if (input[input_length] != '\n')
465  fail (too_big);
466  input[input_length] = '\0';
467 
468  for (p = input; *p != 0; ++p)
469  {
470  pp = strchr (print_ascii, *p);
471  if (pp == 0)
472  fail (invalid_input);
473  *p = pp - print_ascii;
474  }
475 
476  /* Decode:
477  */
478  output_length = UNICODE_MAX_LENGTH;
479  status = punycode_decode (input_length, input, &output_length,
480  output, case_flags);
481  if (status == punycode_bad_input)
482  fail (invalid_input);
483  if (status == punycode_big_output)
484  fail (too_big);
485  if (status == punycode_overflow)
486  fail (overflow);
487  WATT_ASSERT (status == punycode_success);
488 
489  /* Output the result:
490  */
491  for (j = 0; j < output_length; ++j)
492  printf ("%s+%04lX\n", case_flags[j] ? "U" : "u", output[j]);
493  return (EXIT_SUCCESS);
494  }
495  usage (argv);
496  return (EXIT_SUCCESS);
497 }
498 #endif /* TEST_PROG */
Core definitions.
int main(int argc, char **argv)
Definition: echo.c:223