tst-iconv-opt.c 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347
  1. /* Test iconv's TRANSLIT and IGNORE option handling
  2. Copyright (C) 2020-2026 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library; if not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #include <iconv.h>
  16. #include <locale.h>
  17. #include <errno.h>
  18. #include <string.h>
  19. #include <support/support.h>
  20. #include <support/check.h>
  /* Run one iconv test.  Arguments:
       to: destination character set and options
       from: source character set
       input: input string to be converted
       exp_in: expected number of bytes consumed
       exp_ret: expected return value (error or number of irreversible
                conversions)
       exp_out: expected output string
       exp_err: expected value of `errno' after iconv returns.

     If the conversion descriptor cannot be opened, that is reported through
     TEST_VERIFY and the remaining checks are skipped, because (iconv_t) -1
     must not be handed to iconv or iconv_close.  */
  static void
  test_iconv (const char *to, const char *from, char *input, size_t exp_in,
              size_t exp_ret, const char *exp_out, int exp_err)
  {
    iconv_t cd;
    char outbuf[500];
    size_t inlen, outlen;
    char *inptr, *outptr;
    size_t n;

    cd = iconv_open (to, from);
    TEST_VERIFY (cd != (iconv_t) -1);
    if (cd == (iconv_t) -1)
      /* Nothing below can work without a valid descriptor.  */
      return;

    inlen = strlen (input);
    outlen = sizeof (outbuf);
    inptr = input;
    outptr = outbuf;

    /* Clear errno so that the comparison against exp_err below only sees
       what this iconv call stored.  */
    errno = 0;
    n = iconv (cd, &inptr, &inlen, &outptr, &outlen);

    TEST_COMPARE (n, exp_ret);
    /* iconv advances inptr past the input it consumed.  */
    TEST_VERIFY (inptr == input + exp_in);
    TEST_COMPARE (errno, exp_err);
    /* Only the bytes written so far (outbuf up to outptr) are compared.  */
    TEST_COMPARE_BLOB (outbuf, outptr - outbuf, exp_out, strlen (exp_out));
    TEST_VERIFY (iconv_close (cd) == 0);
  }
  /* We test option parsing by converting UTF-8 inputs to ASCII under various
     option combinations.  The UTF-8 inputs fall into three categories:
     - ASCII-only,
     - non-ASCII,
     - non-ASCII with invalid UTF-8 characters.  */

  /* 1.  */
  char ascii[] = "Just some ASCII text";

  /* 2. Valid UTF-8 input and some corresponding expected outputs with various
     options.  The two non-ASCII characters below are accented letters:
     an `a' then an `o'.  */
  char utf8[] = "UTF-8 text with \u00E1 couple \u00F3f non-ASCII characters";
  char u2a[] = "UTF-8 text with ";
  char u2a_translit[] = "UTF-8 text with a couple of non-ASCII characters";
  /* Dropping the accented `a' keeps the space on each side of it, hence the
     two consecutive spaces between "with" and "couple".  */
  char u2a_ignore[] = "UTF-8 text with  couple f non-ASCII characters";

  /* 3. Invalid UTF-8 input and some corresponding expected outputs.  \xff is
     invalid UTF-8.  It's followed by some valid but non-ASCII UTF-8.  */
  char iutf8[] = "Invalid UTF-8 \xff\u27E6text\u27E7";
  char iu2a[] = "Invalid UTF-8 ";
  char iu2a_ignore[] = "Invalid UTF-8 text";
  char iu2a_both[] = "Invalid UTF-8 [|text|]";

  /* 4. Another invalid UTF-8 input and corresponding expected outputs.  This
     time the valid non-ASCII UTF-8 characters appear before the invalid
     \xff.  */
  char jutf8[] = "Invalid \u27E6UTF-8\u27E7 \xfftext";
  char ju2a[] = "Invalid ";
  char ju2a_translit[] = "Invalid [|UTF-8|] ";
  char ju2a_ignore[] = "Invalid UTF-8 text";
  char ju2a_both[] = "Invalid [|UTF-8|] text";

  /* We also test option handling for character set names that have the form
     "A/B".  In this test, we test conversions between "ISO-10646/UTF-8" and
     either ISO-8859-1 or ASCII.  */

  /* 5. Accented 'A' and 'a' characters in ISO-8859-1 and UTF-8, and an
     equivalent ASCII transliteration.  */
  char iso8859_1_a[] = {0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, /* Accented A's.  */
                        0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, /* Accented a's.  */
                        0x00};
  char utf8_a[] = "\u00C0\u00C1\u00C2\u00C3\u00C4\u00C5"
                  "\u00E0\u00E1\u00E2\u00E3\u00E4\u00E5";
  char ascii_a[] = "AAAAAAaaaaaa";

  /* 6. An invalid ASCII string where [0] is invalid and [1] is '~'.  */
  char iascii [] = {0x80, '~', '\0'};
  char empty[] = "";
  char ia2u_ignore[] = "~";
  94. static int
  95. do_test (void)
  96. {
  97. xsetlocale (LC_ALL, "en_US.UTF-8");
  98. /* 0. iconv_open should gracefully fail for invalid character sets. */
  99. TEST_VERIFY (iconv_open ("INVALID", "UTF-8") == (iconv_t) -1);
  100. TEST_VERIFY (iconv_open ("UTF-8", "INVALID") == (iconv_t) -1);
  101. TEST_VERIFY (iconv_open ("INVALID", "INVALID") == (iconv_t) -1);
  102. /* 1. ASCII-only UTF-8 input should convert to ASCII with no changes: */
  103. test_iconv ("ASCII", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
  104. test_iconv ("ASCII//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
  105. test_iconv ("ASCII//TRANSLIT", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
  106. test_iconv ("ASCII//TRANSLIT//", "UTF-8", ascii, strlen (ascii), 0, ascii,
  107. 0);
  108. test_iconv ("ASCII//IGNORE", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
  109. test_iconv ("ASCII//IGNORE//", "UTF-8", ascii, strlen (ascii), 0, ascii, 0);
  110. /* 2. Valid UTF-8 input with non-ASCII characters: */
  111. /* EILSEQ when converted to ASCII. */
  112. test_iconv ("ASCII", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a, EILSEQ);
  113. /* Converted without error with TRANSLIT enabled. */
  114. test_iconv ("ASCII//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2, u2a_translit,
  115. 0);
  116. /* EILSEQ with IGNORE enabled. Non-ASCII chars dropped from output. */
  117. test_iconv ("ASCII//IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
  118. u2a_ignore, EILSEQ);
  119. /* With TRANSLIT and IGNORE enabled, transliterated without error. We test
  120. four combinations. */
  121. test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", utf8, strlen (utf8), 2,
  122. u2a_translit, 0);
  123. test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", utf8, strlen (utf8), 2,
  124. u2a_translit, 0);
  125. test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
  126. u2a_translit, 0);
  127. /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
  128. test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
  129. u2a_translit, 0);
  130. /* Misspellings of TRANSLIT and IGNORE are ignored, but conversion still
  131. works while respecting any other correctly spelled options. */
  132. test_iconv ("ASCII//T", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
  133. EILSEQ);
  134. test_iconv ("ASCII//TRANSLITERATE", "UTF-8", utf8, strlen (u2a), (size_t) -1,
  135. u2a, EILSEQ);
  136. test_iconv ("ASCII//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
  137. EILSEQ);
  138. test_iconv ("ASCII//IGNORED", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
  139. EILSEQ);
  140. test_iconv ("ASCII//TRANSLITERATE//IGNORED", "UTF-8", utf8, strlen (u2a),
  141. (size_t) -1, u2a, EILSEQ);
  142. test_iconv ("ASCII//IGNORED,TRANSLITERATE", "UTF-8", utf8, strlen (u2a),
  143. (size_t) -1, u2a, EILSEQ);
  144. test_iconv ("ASCII//T//I", "UTF-8", utf8, strlen (u2a), (size_t) -1, u2a,
  145. EILSEQ);
  146. test_iconv ("ASCII//TRANSLIT//I", "UTF-8", utf8, strlen (utf8), 2,
  147. u2a_translit, 0);
  148. /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
  149. test_iconv ("ASCII//I//TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
  150. u2a_translit, 0);
  151. test_iconv ("ASCII//IGNORED,TRANSLIT", "UTF-8", utf8, strlen (utf8), 2,
  152. u2a_translit, 0);
  153. test_iconv ("ASCII//TRANSLIT,IGNORED", "UTF-8", utf8, strlen (utf8), 2,
  154. u2a_translit, 0);
  155. test_iconv ("ASCII//IGNORE,T", "UTF-8", utf8, strlen (utf8), (size_t) -1,
  156. u2a_ignore, EILSEQ);
  157. test_iconv ("ASCII//T,IGNORE", "UTF-8", utf8, strlen (utf8), (size_t) -1,
  158. u2a_ignore, EILSEQ);
  159. /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
  160. test_iconv ("ASCII//TRANSLITERATE//IGNORE", "UTF-8", utf8, strlen (utf8),
  161. (size_t) -1, u2a_ignore, EILSEQ);
  162. test_iconv ("ASCII//IGNORE//TRANSLITERATE", "UTF-8", utf8, strlen (utf8),
  163. (size_t) -1, u2a_ignore, EILSEQ);
  164. /* 3. Invalid UTF-8 followed by some valid non-ASCII UTF-8 characters: */
  165. /* EILSEQ; output is truncated at the first invalid UTF-8 character. */
  166. test_iconv ("ASCII", "UTF-8", iutf8, strlen (iu2a), (size_t) -1, iu2a,
  167. EILSEQ);
  168. /* With TRANSLIT enabled: EILSEQ; output still truncated at the first invalid
  169. UTF-8 character. */
  170. test_iconv ("ASCII//TRANSLIT", "UTF-8", iutf8, strlen (iu2a), (size_t) -1,
  171. iu2a, EILSEQ);
  172. /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
  173. valid UTF-8 non-ASCII characters. */
  174. test_iconv ("ASCII//IGNORE", "UTF-8", iutf8, strlen (iutf8), (size_t) -1,
  175. iu2a_ignore, EILSEQ);
  176. /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
  177. characters and transliterates valid non-ASCII UTF-8 characters. We test
  178. four combinations. */
  179. test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
  180. iu2a_both, 0);
  181. /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
  182. test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", iutf8, strlen (iutf8), 2,
  183. iu2a_both, 0);
  184. test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
  185. iu2a_both, 0);
  186. /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
  187. test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", iutf8, strlen (iutf8), 2,
  188. iu2a_both, 0);
  189. /* 4. Invalid UTF-8 with valid non-ASCII UTF-8 chars appearing first: */
  190. /* EILSEQ; output is truncated at the first non-ASCII character. */
  191. test_iconv ("ASCII", "UTF-8", jutf8, strlen (ju2a), (size_t) -1, ju2a,
  192. EILSEQ);
  193. /* With TRANSLIT enabled: EILSEQ; output now truncated at the first invalid
  194. UTF-8 character. */
  195. test_iconv ("ASCII//TRANSLIT", "UTF-8", jutf8, strlen (jutf8) - 5,
  196. (size_t) -1, ju2a_translit, EILSEQ);
  197. test_iconv ("ASCII//translit", "UTF-8", jutf8, strlen (jutf8) - 5,
  198. (size_t) -1, ju2a_translit, EILSEQ);
  199. /* With IGNORE enabled: EILSEQ; output omits invalid UTF-8 characters and
  200. valid UTF-8 non-ASCII characters. */
  201. test_iconv ("ASCII//IGNORE", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
  202. ju2a_ignore, EILSEQ);
  203. test_iconv ("ASCII//ignore", "UTF-8", jutf8, strlen (jutf8), (size_t) -1,
  204. ju2a_ignore, EILSEQ);
  205. /* With TRANSLIT and IGNORE enabled, output omits only invalid UTF-8
  206. characters and transliterates valid non-ASCII UTF-8 characters. We test
  207. several combinations. */
  208. test_iconv ("ASCII//TRANSLIT,IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
  209. ju2a_both, 0);
  210. /* Due to bug 19519, iconv was ignoring IGNORE for the following input. */
  211. test_iconv ("ASCII//TRANSLIT//IGNORE", "UTF-8", jutf8, strlen (jutf8), 2,
  212. ju2a_both, 0);
  213. test_iconv ("ASCII//IGNORE,TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
  214. ju2a_both, 0);
  215. /* Due to bug 19519, iconv was ignoring TRANSLIT for the following input. */
  216. test_iconv ("ASCII//IGNORE//TRANSLIT", "UTF-8", jutf8, strlen (jutf8), 2,
  217. ju2a_both, 0);
  218. test_iconv ("ASCII//translit,ignore", "UTF-8", jutf8, strlen (jutf8), 2,
  219. ju2a_both, 0);
  220. /* Trailing whitespace and separators should be ignored. */
  221. test_iconv ("ASCII//IGNORE,TRANSLIT ", "UTF-8", jutf8, strlen (jutf8), 2,
  222. ju2a_both, 0);
  223. test_iconv ("ASCII//IGNORE,TRANSLIT/", "UTF-8", jutf8, strlen (jutf8), 2,
  224. ju2a_both, 0);
  225. test_iconv ("ASCII//IGNORE,TRANSLIT//", "UTF-8", jutf8, strlen (jutf8), 2,
  226. ju2a_both, 0);
  227. test_iconv ("ASCII//IGNORE,TRANSLIT,", "UTF-8", jutf8, strlen (jutf8), 2,
  228. ju2a_both, 0);
  229. test_iconv ("ASCII//IGNORE,TRANSLIT,,", "UTF-8", jutf8, strlen (jutf8), 2,
  230. ju2a_both, 0);
  231. test_iconv ("ASCII//IGNORE,TRANSLIT /,", "UTF-8", jutf8, strlen (jutf8), 2,
  232. ju2a_both, 0);
  233. /* TRANSLIT or IGNORE suffixes in fromcode should be ignored. */
  234. test_iconv ("ASCII", "UTF-8//TRANSLIT", jutf8, strlen (ju2a), (size_t) -1,
  235. ju2a, EILSEQ);
  236. test_iconv ("ASCII", "UTF-8//IGNORE", jutf8, strlen (ju2a), (size_t) -1,
  237. ju2a, EILSEQ);
  238. test_iconv ("ASCII", "UTF-8//TRANSLIT,IGNORE", jutf8, strlen (ju2a),
  239. (size_t) -1, ju2a, EILSEQ);
  240. /* 5. Charset names of the form "A/B/": */
  241. /* ISO-8859-1 is converted to UTF-8 without needing transliteration. */
  242. test_iconv ("ISO-10646/UTF-8", "ISO-8859-1", iso8859_1_a,
  243. strlen (iso8859_1_a), 0, utf8_a, 0);
  244. test_iconv ("ISO-10646/UTF-8/", "ISO-8859-1", iso8859_1_a,
  245. strlen (iso8859_1_a), 0, utf8_a, 0);
  246. test_iconv ("ISO-10646/UTF-8/IGNORE", "ISO-8859-1", iso8859_1_a,
  247. strlen (iso8859_1_a), 0, utf8_a, 0);
  248. test_iconv ("ISO-10646/UTF-8//IGNORE", "ISO-8859-1", iso8859_1_a,
  249. strlen (iso8859_1_a), 0, utf8_a, 0);
  250. test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ISO-8859-1", iso8859_1_a,
  251. strlen (iso8859_1_a), 0, utf8_a, 0);
  252. test_iconv ("ISO-10646/UTF-8//TRANSLIT", "ISO-8859-1", iso8859_1_a,
  253. strlen (iso8859_1_a), 0, utf8_a, 0);
  254. test_iconv ("ISO-10646/UTF-8//TRANSLIT/IGNORE", "ISO-8859-1", iso8859_1_a,
  255. strlen (iso8859_1_a), 0, utf8_a, 0);
  256. test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ISO-8859-1", iso8859_1_a,
  257. strlen (iso8859_1_a), 0, utf8_a, 0);
  258. test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ISO-8859-1", iso8859_1_a,
  259. strlen (iso8859_1_a), 0, utf8_a, 0);
  260. /* UTF-8 with accented A's is converted to ASCII with transliteration. */
  261. test_iconv ("ASCII", "ISO-10646/UTF-8", utf8_a,
  262. 0, (size_t) -1, empty, EILSEQ);
  263. test_iconv ("ASCII//IGNORE", "ISO-10646/UTF-8", utf8_a,
  264. strlen (utf8_a), (size_t) -1, empty, EILSEQ);
  265. test_iconv ("ASCII//TRANSLIT", "ISO-10646/UTF-8", utf8_a,
  266. strlen (utf8_a), 12, ascii_a, 0);
  267. /* Invalid ASCII is converted to UTF-8 only with IGNORE. */
  268. test_iconv ("ISO-10646/UTF-8", "ASCII", iascii, strlen (empty), (size_t) -1,
  269. empty, EILSEQ);
  270. test_iconv ("ISO-10646/UTF-8/TRANSLIT", "ASCII", iascii, strlen (empty),
  271. (size_t) -1, empty, EILSEQ);
  272. test_iconv ("ISO-10646/UTF-8/IGNORE", "ASCII", iascii, strlen (iascii),
  273. (size_t) -1, ia2u_ignore, EILSEQ);
  274. test_iconv ("ISO-10646/UTF-8/TRANSLIT,IGNORE", "ASCII", iascii,
  275. strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
  276. /* Due to bug 19519, iconv was ignoring IGNORE for the following three
  277. inputs: */
  278. test_iconv ("ISO-10646/UTF-8/TRANSLIT/IGNORE", "ASCII", iascii,
  279. strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
  280. test_iconv ("ISO-10646/UTF-8//TRANSLIT,IGNORE", "ASCII", iascii,
  281. strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
  282. test_iconv ("ISO-10646/UTF-8//TRANSLIT//IGNORE", "ASCII", iascii,
  283. strlen (iascii), (size_t) -1, ia2u_ignore, EILSEQ);
  284. return 0;
  285. }
  286. #include <support/test-driver.c>