Watt-32 tcp/ip  2.2 dev-rel.10
idna.c
1 /*
2  * Code for enabling lookup of names with non-ASCII letters via
3  * ACE and IDNA (Internationalizing Domain Names in Applications)
4  * Ref. RFC-3490.
5  *
6  */
7 
8 /* \version 0.1: Mar 19, 2004 :
9  * G. Vanem - Created.
10  *
11  */
12 
13 #include <stdio.h>
14 #include <stdlib.h>
15 #include <limits.h>
16 #include <string.h>
17 #include <ctype.h>
18 
19 #include "wattcp.h"
20 #include "misc.h"
21 #include "strings.h"
22 #include "pcdbug.h"
23 #include "punycode.h"
24 #include "idna.h"
25 
26 #if defined(USE_IDNA)
27 
28 #define _FLUSHWC_H /* normal_flushwc() not needed */
29 
30 #ifdef _MSC_VER
31 #pragma warning (disable:4244)
32 #endif
33 
34 #include "iconv/ascii.h"
35 #include "iconv/jisx0201.h"
36 #include "iconv/jisx0208.h"
37 #include "iconv/cp437.h"
38 #include "iconv/cp737.h"
39 #include "iconv/cp775.h"
40 #include "iconv/cp850.h"
41 #include "iconv/cp852.h"
42 #include "iconv/cp853.h"
43 #include "iconv/cp855.h"
44 #include "iconv/cp856.h"
45 #include "iconv/cp857.h"
46 #include "iconv/cp858.h"
47 #include "iconv/cp860.h"
48 #include "iconv/cp861.h"
49 #include "iconv/cp862.h"
50 #include "iconv/cp863.h"
51 #include "iconv/cp864.h"
52 #include "iconv/cp865.h"
53 #include "iconv/cp866.h"
54 #include "iconv/cp869.h"
55 #include "iconv/cp874.h"
56 #include "iconv/cp922.h"
57 #include "iconv/cp932.h"
58 #include "iconv/cp943.h"
59 #include "iconv/ksc5601.h"
60 #include "iconv/cp949.h"
61 #include "iconv/big5.h"
62 #include "iconv/cp950.h"
63 #include "iconv/cp1046.h"
64 #include "iconv/cp1124.h"
65 #include "iconv/cp1125.h"
66 #include "iconv/cp1129.h"
67 #include "iconv/cp1133.h"
68 #include "iconv/cp1161.h"
69 #include "iconv/cp1162.h"
70 #include "iconv/cp1163.h"
71 #include "iconv/cp1250.h"
72 #include "iconv/cp1251.h"
73 #include "iconv/cp1252.h"
74 #include "iconv/cp1253.h"
75 #include "iconv/cp1254.h"
76 #include "iconv/cp1255.h"
77 #include "iconv/cp1256.h"
78 #include "iconv/cp1257.h"
79 #include "iconv/cp1258.h"
80 
81 typedef int (*toUnicode) (conv_t, ucs4_t *, const unsigned char *, int);
82 typedef int (*toAscii) (conv_t, unsigned char *, ucs4_t, int);
83 
84 struct iconv_table {
85  const char *name;
86  WORD codepage;
87  toUnicode mbtowc;
88  toAscii wctomb;
89  };
90 
91 static const struct iconv_table mappings[] = {
92  { "CP437", 437, cp437_mbtowc, cp437_wctomb },
93  { "CP737", 737, cp737_mbtowc, cp737_wctomb },
94  { "CP775", 775, cp775_mbtowc, cp775_wctomb },
95  { "CP850", 850, cp850_mbtowc, cp850_wctomb },
96  { "CP852", 852, cp852_mbtowc, cp852_wctomb },
97  { "CP853", 853, cp853_mbtowc, cp853_wctomb },
98  { "CP855", 855, cp855_mbtowc, cp855_wctomb },
99  { "CP856", 856, cp856_mbtowc, cp856_wctomb },
100  { "CP857", 857, cp857_mbtowc, cp857_wctomb },
101  { "CP858", 858, cp858_mbtowc, cp858_wctomb },
102  { "CP860", 860, cp860_mbtowc, cp860_wctomb },
103  { "CP861", 861, cp861_mbtowc, cp861_wctomb },
104  { "CP862", 862, cp862_mbtowc, cp862_wctomb },
105  { "CP863", 863, cp863_mbtowc, cp863_wctomb },
106  { "CP864", 864, cp864_mbtowc, cp864_wctomb },
107  { "CP865", 865, cp865_mbtowc, cp865_wctomb },
108  { "CP866", 866, cp866_mbtowc, cp866_wctomb },
109  { "CP869", 869, cp869_mbtowc, cp869_wctomb },
110  { "CP874", 874, cp874_mbtowc, cp874_wctomb },
111  { "CP922", 922, cp922_mbtowc, cp922_wctomb },
112  { "CP932", 932, cp932_mbtowc, cp932_wctomb },
113  { "CP943", 943, cp943_mbtowc, cp943_wctomb },
114  { "CP949", 949, cp949_mbtowc, cp949_wctomb },
115  { "CP950", 950, cp950_mbtowc, cp950_wctomb },
116  { "CP1046", 1046, cp1046_mbtowc, cp1046_wctomb },
117  { "CP1124", 1124, cp1124_mbtowc, cp1124_wctomb },
118  { "CP1125", 1125, cp1125_mbtowc, cp1125_wctomb },
119  { "CP1129", 1129, cp1129_mbtowc, cp1129_wctomb },
120  { "CP1133", 1133, cp1133_mbtowc, cp1133_wctomb },
121  { "CP1161", 1161, cp1161_mbtowc, cp1161_wctomb },
122  { "CP1162", 1162, cp1162_mbtowc, cp1162_wctomb },
123  { "CP1163", 1163, cp1163_mbtowc, cp1163_wctomb },
124  { "CP1250", 1250, cp1250_mbtowc, cp1250_wctomb },
125  { "CP1251", 1251, cp1251_mbtowc, cp1251_wctomb },
126  { "CP1252", 1252, cp1252_mbtowc, cp1252_wctomb },
127  { "CP1253", 1253, cp1253_mbtowc, cp1253_wctomb },
128  { "CP1254", 1254, cp1254_mbtowc, cp1254_wctomb },
129  { "CP1255", 1255, cp1255_mbtowc, cp1255_wctomb },
130  { "CP1256", 1256, cp1256_mbtowc, cp1256_wctomb },
131  { "CP1257", 1257, cp1257_mbtowc, cp1257_wctomb },
132  { "CP1258", 1258, cp1258_mbtowc, cp1258_wctomb },
133  { NULL, 0, NULL, NULL }
134  };
135 
136 static const struct iconv_table *curr_mapping = NULL;
137 static conv_t iconv;
138 
/*
 * Lookup table indexed by an ASCII code (0..127) giving the matching
 * character in the native charset. The 32 control codes and DEL (127)
 * all map to '\n'; the printable range 32..126 maps to itself.
 */
static const char print_ascii[] =
       /* 0x00 - 0x0F */ "\n\n\n\n\n\n\n\n" "\n\n\n\n\n\n\n\n"
       /* 0x10 - 0x1F */ "\n\n\n\n\n\n\n\n" "\n\n\n\n\n\n\n\n"
       /* 0x20 - 0x2F */ " !\"#$%&'()*+,-./"
       /* 0x30 - 0x3F */ "0123456789:;<=>?"
       /* 0x40 - 0x4F */ "@ABCDEFGHIJKLMNO"
       /* 0x50 - 0x5F */ "PQRSTUVWXYZ[\\]^_"
       /* 0x60 - 0x6F */ "`abcdefghijklmno"
       /* 0x70 - 0x7F */ "pqrstuvwxyz{|}~\n";
/*
 * GET_CODEPAGE() yields the currently active codepage: GetACP() when
 * WIN32 is defined, GetCodePage() on every other target.
 */
#ifndef WIN32
  #define GET_CODEPAGE()  GetCodePage()
#else
  #define GET_CODEPAGE()  GetACP()
#endif
154 
158 BOOL iconv_init (WORD cp)
159 {
160  int i;
161 
162  if (cp == 0)
163  cp = GET_CODEPAGE();
164 
165  IDNA_DEBUG (2, ("iconv_init: codepage %u\n", cp));
166  if (!cp)
167  return (FALSE);
168 
169  for (i = 0; i < DIM(mappings); i++)
170  if (cp == mappings[i].codepage)
171  break;
172  if (i == DIM(mappings))
173  return (FALSE);
174  curr_mapping = mappings + i;
175  return (TRUE);
176 }
177 
181 const char *iconv_strerror (int rc)
182 {
183  switch (rc)
184  {
185  case RET_ILUNI:
186  return ("Illegal Unicode");
187  case RET_ILSEQ:
188  return ("Illegal sequence");
189  case RET_TOOSMALL:
190  return ("Output buffer too small");
191  case RET_TOOFEW(0):
192  return ("Input sequence too short");
193  case RET_TOOFEW(1):
194  return ("Input sequence 1 byte too short");
195  case RET_TOOFEW(2):
196  return ("Input sequence 2 bytes too short");
197  case RET_TOOFEW(3):
198  return ("Input sequence 3 bytes too short");
199  default:
200  return ("Unknown");
201  }
202 }
203 
207 static int iconv_to_unicode (char ch, ucs4_t *uc)
208 {
209  ucs4_t res = 0;
210  int rc;
211 
212  if (!curr_mapping)
213  return (0);
214  rc = (*curr_mapping->mbtowc) (iconv, &res, (unsigned char*)&ch, 1);
215  if (rc < 1)
216  {
217  IDNA_DEBUG (1, ("iconv_to_unicode failed; %d, %s\n",
218  rc, iconv_strerror(rc)));
219  return (0);
220  }
221  *uc = res;
222  return (rc);
223 }
224 
228 static int iconv_to_ascii (ucs4_t uc, char *ch)
229 {
230  int rc = 0;
231 
232  if (curr_mapping)
233  {
234  unsigned char res[4] = { 0,0,0,0 };
235 
236  rc = (*curr_mapping->wctomb) (iconv, res, uc, sizeof(res));
237  if (rc == 1)
238  *ch = (char) res[0];
239  else if (rc == 2)
240  *(WORD*)ch = *(WORD*)&res;
241  else if (rc > 2)
242  memcpy (ch, res, rc);
243  else
244  IDNA_DEBUG (1, ("iconv_to_ascii failed; %d, %s\n",
245  rc, iconv_strerror(rc)));
246  }
247  return (rc);
248 }
249 
253 static char **split_labels (const char *name)
254 {
255  static char buf [MAX_LABELS][MAX_HOSTLEN];
256  static char *res [MAX_LABELS+1];
257  const char *p = name;
258  int i;
259 
260  for (i = 0; i < MAX_LABELS && *p; i++)
261  {
262  const char *dot = strchr (p, '.');
263 
264  if (!dot)
265  {
266  res[i] = _strlcpy (buf[i], p, sizeof(buf[i]));
267  i++;
268  break;
269  }
270  res[i] = _strlcpy (buf[i], p, dot-p+1);
271  p = ++dot;
272  }
273  res[i] = NULL;
274  IDNA_DEBUG (3, ("split_labels: `%s', %d labels\n", name, i));
275  return (res);
276 }
277 
281 static char *convert_to_ACE (const char *name)
282 {
283  DWORD utf_input[MAX_HOSTLEN];
284  BYTE utf_case [MAX_HOSTLEN];
285  const char *p;
286  size_t in_len, out_len;
287  static char out_buf [2*MAX_HOSTLEN];
288  enum punycode_status status;
289  int i, c;
290 
291  for (i = 0, p = name; *p; i++)
292  {
293  ucs4_t utf = 0;
294 
295  c = (*p++) & 255;
296  iconv_to_unicode (c, &utf);
297  utf_input[i] = utf;
298  utf_case[i] = (BYTE) isupper (c);
299  if (utf > 0xFFFF)
300  IDNA_DEBUG (3, ("%c -> u+%08lX\n", c, utf));
301  else IDNA_DEBUG (3, ("%c -> u+%04lX\n", c, utf));
302  }
303  in_len = i;
304  out_len = sizeof(out_buf);
305  status = punycode_encode (in_len, utf_input, utf_case, &out_len, out_buf);
306 
307  if (status != punycode_success)
308  out_len = 0;
309 
310  for (i = 0; i < (int)out_len; i++)
311  {
312  int c = out_buf[i];
313 
314  if (c < 0 || c > 127)
315  {
316  IDNA_DEBUG (1, ("illegal Punycode result: %c (%d)\n", c, c));
317  return (NULL);
318  }
319  if (!print_ascii[c])
320  {
321  IDNA_DEBUG (1, ("Punycode not ASCII: %c (%d)\n", c, c));
322  return (NULL);
323  }
324  out_buf[i] = print_ascii[c];
325  }
326  out_buf[i] = '\0';
327  IDNA_DEBUG (2, ("punycode_encode: status %d, out_len %lu, out_buf `%s'\n",
328  status, (DWORD)out_len, out_buf));
329  return (status == punycode_success ? out_buf : NULL);
330 }
331 
335 static char *convert_from_ACE (const char *name)
336 {
337  DWORD utf_output[MAX_HOSTLEN];
338  BYTE utf_case [MAX_HOSTLEN];
339  static char out_buf [MAX_HOSTLEN];
340  size_t utf_len, i, j;
341  enum punycode_status status;
342 
343  utf_len = sizeof(utf_output);
344  status = punycode_decode (strlen(name), name, &utf_len, utf_output, utf_case);
345 
346  if (status != punycode_success)
347  utf_len = 0;
348 
349  for (i = j = 0; i < utf_len && j < sizeof(out_buf); i++)
350  {
351  ucs4_t utf = utf_output[i];
352  int len = iconv_to_ascii (utf, out_buf+j);
353 
354  if (len <= 0)
355  break;
356  IDNA_DEBUG (3, ("%c+%04lX -> %*.s\n",
357  utf_case[i] ? 'U' : 'u', utf, len, out_buf+j));
358  j += len;
359  }
360  out_buf[j] = '\0';
361  IDNA_DEBUG (2, ("punycode_decode: status %d, out_len %lu, out_buf `%s'\n",
362  status, (DWORD)utf_len, out_buf));
363  return (status == punycode_success ? out_buf : NULL);
364 }
365 
366 
385  char *name, /* IN/OUT: native ASCII/ACE name */
386  size_t *size) /* IN: length of name buf, */
387 { /* OUT: ACE encoded length */
388  const char *ace;
389  char *in_name = name;
390  char **labels = split_labels (name);
391  int i;
392  size_t len = 0;
393 
394  for (i = 0; labels[i]; i++)
395  {
396  const BYTE *p;
397  const char *label = labels[i];
398 
399  ace = NULL;
400  if (strnicmp(label,"xn--",4)) /* if not already encoded */
401  {
402  for (p = (const BYTE*)label; *p; p++)
403  if (*p >= 0x80)
404  {
405  ace = convert_to_ACE (label);
406  if (!ace)
407  return (FALSE);
408  break;
409  }
410  }
411 
412  if (ace)
413  {
414  if (len + 5 + strlen(ace) > *size)
415  {
416  IDNA_DEBUG (1, ("input length exceeded\n"));
417  return (FALSE);
418  }
419  name += sprintf (name, "xn--%s.", ace);
420  }
421  else /* pass through unchanged */
422  {
423  if (len + 1 + strlen(label) > *size)
424  {
425  IDNA_DEBUG (1, ("input length exceeded\n"));
426  return (FALSE);
427  }
428  name += sprintf (name, "%s.", label);
429  }
430  }
431  if (i > 0) /* drop trailing '.' */
432  name--;
433  len = name - in_name;
434  *name = '\0';
435  *size = len;
436  IDNA_DEBUG (2, ("IDNA_convert_to_ACE: `%s', %lu bytes\n",
437  in_name, (DWORD)len));
438  return (TRUE);
439 }
440 
452  char *name, /* IN/OUT: ACE/native ASCII name */
453  size_t *size) /* IN: ACE raw string length, */
454 { /* OUT: ASCII deccoded length */
455  char *in_name = name;
456  char **labels = split_labels (name);
457  int i;
458 
459  for (i = 0; labels[i]; i++)
460  {
461  const char *ascii = NULL;
462  const char *label = labels[i];
463 
464  if (!strncmp(label,"xn--",4) && label[4])
465  {
466  ascii = convert_from_ACE (label+4);
467  if (!ascii)
468  return (FALSE);
469  }
470  name += sprintf (name, "%s.", ascii ? ascii : label);
471  }
472  *name = '\0';
473  *size = name - in_name;
474  IDNA_DEBUG (2, ("IDNA_convert_from_ACE: `%s', %lu bytes\n",
475  in_name, (DWORD)*size));
476  return (TRUE);
477 }
478 
479 #if defined(TEST_PROG)
480 
481 #include <netdb.h>
482 #include <arpa/inet.h>
483 
484 #include "sock_ini.h"
485 #include "pcdns.h"
486 #include "pcdbug.h"
487 
488 void dump_cp_list (void)
489 {
490  int i;
491 
492  printf ("Supported codepages: ");
493  for (i = 0; mappings[i].name; i++)
494  printf ("%s\n", mappings[i].name);
495  exit (0);
496 }
497 
/*
 * Print the command-line help, including the currently active
 * codepage, and terminate the program with exit code 0.
 */
void usage (void)
{
  printf ("IDNA [-d] [-c codepage] hostname | ip-address\n"
          " -d debug level, \"-dd\" for more details\n"
          " -c select codepage (active is CP%d).\n",
          GET_CODEPAGE());
  fputs (" use \"-c?\" to list supported codepages\n", stdout);
  exit (0);
}
506 
507 int main (int argc, char **argv)
508 {
509  struct in_addr addr;
510  struct hostent *he;
511  const char *host;
512  WORD cp = 0;
513  int ch;
514  int debug = 0;
515 
516  while ((ch = getopt(argc, argv, "c:dh?")) != EOF)
517  switch (ch)
518  {
519  case 'c':
520  if (*optarg == '?')
521  dump_cp_list();
522  cp = atoi (optarg);
523  break;
524  case 'd':
525  debug++;
526  break;
527  case '?':
528  case 'h':
529  default:
530  usage();
531  break;
532  }
533 
534  argc -= optind;
535  argv += optind;
536  if (!*argv)
537  usage();
538 
539  if (debug)
540  dbug_init();
541  sock_init();
542 
543  if (!iconv_init(cp))
544  {
545  printf ("iconv_init() failed for CP %d\n", cp);
546  return (1);
547  }
548 
549  debug_on = debug;
550  dns_do_idna = TRUE;
551  host = argv[0];
552  printf ("Resolving `%s'...", host);
553  fflush (stdout);
554 
555  if (inet_aton(host,&addr))
556  {
557  he = gethostbyaddr ((char*)&addr, sizeof(addr), AF_INET);
558  if (he)
559  printf ("%s\n", he->h_name);
560  else printf ("failed; %s)\n", hstrerror(h_errno));
561  }
562  else
563  {
564  he = gethostbyname (host);
565  if (he)
566  printf ("%s\n", inet_ntoa(*(struct in_addr*)he->h_addr));
567  else printf ("failed; %s\n", hstrerror(h_errno));
568  }
569 
570  debug_on = 0;
571  return (0);
572 }
573 #endif /* TEST_PROG */
574 #endif /* USE_IDNA */
575 
576 
Definition: idna.h:8
BOOL IDNA_convert_from_ACE(char *name, size_t *size)
Convert a possibly ACE-encoded name to a name in native codepage.
Definition: idna.c:451
const char * iconv_strerror(int rc)
Return textual error for 'rc'.
Definition: idna.c:181
Definition: netdb.h:102
Core definitions.
char * _strlcpy(char *dst, const char *src, size_t len)
Similar to strncpy(), but always returns 'dst' with 0-termination.
Definition: strings.c:226
BOOL IDNA_convert_to_ACE(char *name, size_t *size)
Convert a possibly non-ASCII name into ACE-form.
Definition: idna.c:384
Definition: in.h:146
BOOL iconv_init(WORD code_page)
Initialise iconv; find codepage and mapping functions to use.
Definition: idna.c:158
int main(int argc, char **argv)
Definition: echo.c:223