utf8_kunit.c 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297
  1. // SPDX-License-Identifier: GPL-2.0-only
  2. /*
  3. * KUnit tests for utf-8 support.
  4. *
  5. * Copyright 2017 Collabora Ltd.
  6. */
  7. #include <linux/unicode.h>
  8. #include <kunit/test.h>
  9. #include "../utf8n.h"
  10. static const struct {
  11. /* UTF-8 strings in this vector _must_ be NULL-terminated. */
  12. unsigned char str[10];
  13. unsigned char dec[10];
  14. } nfdi_test_data[] = {
  15. /* Trivial sequence */
  16. {
  17. /* "ABba" decomposes to itself */
  18. .str = "aBba",
  19. .dec = "aBba",
  20. },
  21. /* Simple equivalent sequences */
  22. {
  23. /* 'VULGAR FRACTION ONE QUARTER' cannot decompose to
  24. 'NUMBER 1' + 'FRACTION SLASH' + 'NUMBER 4' on
  25. canonical decomposition */
  26. .str = {0xc2, 0xbc, 0x00},
  27. .dec = {0xc2, 0xbc, 0x00},
  28. },
  29. {
  30. /* 'LATIN SMALL LETTER A WITH DIAERESIS' decomposes to
  31. 'LETTER A' + 'COMBINING DIAERESIS' */
  32. .str = {0xc3, 0xa4, 0x00},
  33. .dec = {0x61, 0xcc, 0x88, 0x00},
  34. },
  35. {
  36. /* 'LATIN SMALL LETTER LJ' can't decompose to
  37. 'LETTER L' + 'LETTER J' on canonical decomposition */
  38. .str = {0xC7, 0x89, 0x00},
  39. .dec = {0xC7, 0x89, 0x00},
  40. },
  41. {
  42. /* GREEK ANO TELEIA decomposes to MIDDLE DOT */
  43. .str = {0xCE, 0x87, 0x00},
  44. .dec = {0xC2, 0xB7, 0x00}
  45. },
  46. /* Canonical ordering */
  47. {
  48. /* A + 'COMBINING ACUTE ACCENT' + 'COMBINING OGONEK' decomposes
  49. to A + 'COMBINING OGONEK' + 'COMBINING ACUTE ACCENT' */
  50. .str = {0x41, 0xcc, 0x81, 0xcc, 0xa8, 0x0},
  51. .dec = {0x41, 0xcc, 0xa8, 0xcc, 0x81, 0x0},
  52. },
  53. {
  54. /* 'LATIN SMALL LETTER A WITH DIAERESIS' + 'COMBINING OGONEK'
  55. decomposes to
  56. 'LETTER A' + 'COMBINING OGONEK' + 'COMBINING DIAERESIS' */
  57. .str = {0xc3, 0xa4, 0xCC, 0xA8, 0x00},
  58. .dec = {0x61, 0xCC, 0xA8, 0xcc, 0x88, 0x00},
  59. },
  60. };
  61. static const struct {
  62. /* UTF-8 strings in this vector _must_ be NULL-terminated. */
  63. unsigned char str[30];
  64. unsigned char ncf[30];
  65. } nfdicf_test_data[] = {
  66. /* Trivial sequences */
  67. {
  68. /* "ABba" folds to lowercase */
  69. .str = {0x41, 0x42, 0x62, 0x61, 0x00},
  70. .ncf = {0x61, 0x62, 0x62, 0x61, 0x00},
  71. },
  72. {
  73. /* All ASCII folds to lower-case */
  74. .str = "ABCDEFGHIJKLMNOPQRSTUVWXYZ0.1",
  75. .ncf = "abcdefghijklmnopqrstuvwxyz0.1",
  76. },
  77. {
  78. /* LATIN SMALL LETTER SHARP S folds to
  79. LATIN SMALL LETTER S + LATIN SMALL LETTER S */
  80. .str = {0xc3, 0x9f, 0x00},
  81. .ncf = {0x73, 0x73, 0x00},
  82. },
  83. {
  84. /* LATIN CAPITAL LETTER A WITH RING ABOVE folds to
  85. LATIN SMALL LETTER A + COMBINING RING ABOVE */
  86. .str = {0xC3, 0x85, 0x00},
  87. .ncf = {0x61, 0xcc, 0x8a, 0x00},
  88. },
  89. /* Introduced by UTF-8.0.0. */
  90. /* Cherokee letters are interesting test-cases because they fold
  91. to upper-case. Before 8.0.0, Cherokee lowercase were
  92. undefined, thus, the folding from LC is not stable between
  93. 7.0.0 -> 8.0.0, but it is from UC. */
  94. {
  95. /* CHEROKEE SMALL LETTER A folds to CHEROKEE LETTER A */
  96. .str = {0xea, 0xad, 0xb0, 0x00},
  97. .ncf = {0xe1, 0x8e, 0xa0, 0x00},
  98. },
  99. {
  100. /* CHEROKEE SMALL LETTER YE folds to CHEROKEE LETTER YE */
  101. .str = {0xe1, 0x8f, 0xb8, 0x00},
  102. .ncf = {0xe1, 0x8f, 0xb0, 0x00},
  103. },
  104. {
  105. /* OLD HUNGARIAN CAPITAL LETTER AMB folds to
  106. OLD HUNGARIAN SMALL LETTER AMB */
  107. .str = {0xf0, 0x90, 0xb2, 0x83, 0x00},
  108. .ncf = {0xf0, 0x90, 0xb3, 0x83, 0x00},
  109. },
  110. /* Introduced by UTF-9.0.0. */
  111. {
  112. /* OSAGE CAPITAL LETTER CHA folds to
  113. OSAGE SMALL LETTER CHA */
  114. .str = {0xf0, 0x90, 0x92, 0xb5, 0x00},
  115. .ncf = {0xf0, 0x90, 0x93, 0x9d, 0x00},
  116. },
  117. {
  118. /* LATIN CAPITAL LETTER SMALL CAPITAL I folds to
  119. LATIN LETTER SMALL CAPITAL I */
  120. .str = {0xea, 0x9e, 0xae, 0x00},
  121. .ncf = {0xc9, 0xaa, 0x00},
  122. },
  123. /* Introduced by UTF-11.0.0. */
  124. {
  125. /* GEORGIAN SMALL LETTER AN folds to GEORGIAN MTAVRULI
  126. CAPITAL LETTER AN */
  127. .str = {0xe1, 0xb2, 0x90, 0x00},
  128. .ncf = {0xe1, 0x83, 0x90, 0x00},
  129. }
  130. };
  131. static ssize_t utf8len(const struct unicode_map *um, enum utf8_normalization n,
  132. const char *s)
  133. {
  134. return utf8nlen(um, n, s, (size_t)-1);
  135. }
  136. static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um,
  137. enum utf8_normalization n, const char *s)
  138. {
  139. return utf8ncursor(u8c, um, n, s, (unsigned int)-1);
  140. }
  141. static void check_utf8_nfdi(struct kunit *test)
  142. {
  143. int i;
  144. struct utf8cursor u8c;
  145. struct unicode_map *um = test->priv;
  146. for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
  147. int len = strlen(nfdi_test_data[i].str);
  148. int nlen = strlen(nfdi_test_data[i].dec);
  149. int j = 0;
  150. unsigned char c;
  151. int ret;
  152. KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen);
  153. KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len),
  154. nlen);
  155. ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str);
  156. KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
  157. while ((c = utf8byte(&u8c)) > 0) {
  158. KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j],
  159. "Unexpected byte 0x%x should be 0x%x\n",
  160. c, nfdi_test_data[i].dec[j]);
  161. j++;
  162. }
  163. KUNIT_EXPECT_EQ(test, j, nlen);
  164. }
  165. }
  166. static void check_utf8_nfdicf(struct kunit *test)
  167. {
  168. int i;
  169. struct utf8cursor u8c;
  170. struct unicode_map *um = test->priv;
  171. for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
  172. int len = strlen(nfdicf_test_data[i].str);
  173. int nlen = strlen(nfdicf_test_data[i].ncf);
  174. int j = 0;
  175. int ret;
  176. unsigned char c;
  177. KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str),
  178. nlen);
  179. KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len),
  180. nlen);
  181. ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str);
  182. KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n");
  183. while ((c = utf8byte(&u8c)) > 0) {
  184. KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j],
  185. "Unexpected byte 0x%x should be 0x%x\n",
  186. c, nfdicf_test_data[i].ncf[j]);
  187. j++;
  188. }
  189. KUNIT_EXPECT_EQ(test, j, nlen);
  190. }
  191. }
  192. static void check_utf8_comparisons(struct kunit *test)
  193. {
  194. int i;
  195. struct unicode_map *um = test->priv;
  196. for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) {
  197. const struct qstr s1 = {.name = nfdi_test_data[i].str,
  198. .len = sizeof(nfdi_test_data[i].str)};
  199. const struct qstr s2 = {.name = nfdi_test_data[i].dec,
  200. .len = sizeof(nfdi_test_data[i].dec)};
  201. /* strncmp returns 0 when strings are equal */
  202. KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0,
  203. "%s %s comparison mismatch\n", s1.name, s2.name);
  204. }
  205. for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) {
  206. const struct qstr s1 = {.name = nfdicf_test_data[i].str,
  207. .len = sizeof(nfdicf_test_data[i].str)};
  208. const struct qstr s2 = {.name = nfdicf_test_data[i].ncf,
  209. .len = sizeof(nfdicf_test_data[i].ncf)};
  210. /* strncasecmp returns 0 when strings are equal */
  211. KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0,
  212. "%s %s comparison mismatch\n", s1.name, s2.name);
  213. }
  214. }
  215. static void check_supported_versions(struct kunit *test)
  216. {
  217. struct unicode_map *um = test->priv;
  218. /* Unicode 7.0.0 should be supported. */
  219. KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0)));
  220. /* Unicode 9.0.0 should be supported. */
  221. KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0)));
  222. /* Unicode 1x.0.0 (the latest version) should be supported. */
  223. KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST));
  224. /* Next versions don't exist. */
  225. KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0)));
  226. KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0)));
  227. KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1)));
  228. }
  229. static struct kunit_case unicode_normalization_test_cases[] = {
  230. KUNIT_CASE(check_supported_versions),
  231. KUNIT_CASE(check_utf8_comparisons),
  232. KUNIT_CASE(check_utf8_nfdicf),
  233. KUNIT_CASE(check_utf8_nfdi),
  234. {}
  235. };
  236. static int init_test_ucd(struct kunit *test)
  237. {
  238. struct unicode_map *um = utf8_load(UTF8_LATEST);
  239. test->priv = um;
  240. KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0,
  241. "%s: Unable to load utf8 table.\n", __func__);
  242. return 0;
  243. }
  244. static void exit_test_ucd(struct kunit *test)
  245. {
  246. utf8_unload(test->priv);
  247. }
  248. static struct kunit_suite unicode_normalization_test_suite = {
  249. .name = "unicode_normalization",
  250. .test_cases = unicode_normalization_test_cases,
  251. .init = init_test_ucd,
  252. .exit = exit_test_ucd,
  253. };
  254. kunit_test_suite(unicode_normalization_test_suite);
  255. MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>");
  256. MODULE_DESCRIPTION("KUnit tests for utf-8 support.");
  257. MODULE_LICENSE("GPL");