ucs.c 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251
  1. // SPDX-License-Identifier: GPL-2.0
  2. /*
  3. * ucs.c - Universal Character Set processing
  4. */
  5. #include <linux/array_size.h>
  6. #include <linux/bsearch.h>
  7. #include <linux/consolemap.h>
  8. #include <linux/minmax.h>
  9. struct ucs_interval16 {
  10. u16 first;
  11. u16 last;
  12. };
  13. struct ucs_interval32 {
  14. u32 first;
  15. u32 last;
  16. };
  17. #include "ucs_width_table.h"
  18. static int interval16_cmp(const void *key, const void *element)
  19. {
  20. u16 cp = *(u16 *)key;
  21. const struct ucs_interval16 *entry = element;
  22. if (cp < entry->first)
  23. return -1;
  24. if (cp > entry->last)
  25. return 1;
  26. return 0;
  27. }
  28. static int interval32_cmp(const void *key, const void *element)
  29. {
  30. u32 cp = *(u32 *)key;
  31. const struct ucs_interval32 *entry = element;
  32. if (cp < entry->first)
  33. return -1;
  34. if (cp > entry->last)
  35. return 1;
  36. return 0;
  37. }
  38. static bool cp_in_range16(u16 cp, const struct ucs_interval16 *ranges, size_t size)
  39. {
  40. if (cp < ranges[0].first || cp > ranges[size - 1].last)
  41. return false;
  42. return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
  43. interval16_cmp) != NULL;
  44. }
  45. static bool cp_in_range32(u32 cp, const struct ucs_interval32 *ranges, size_t size)
  46. {
  47. if (cp < ranges[0].first || cp > ranges[size - 1].last)
  48. return false;
  49. return __inline_bsearch(&cp, ranges, size, sizeof(*ranges),
  50. interval32_cmp) != NULL;
  51. }
  52. #define UCS_IS_BMP(cp) ((cp) <= 0xffff)
  53. /**
  54. * ucs_is_zero_width() - Determine if a Unicode code point is zero-width.
  55. * @cp: Unicode code point (UCS-4)
  56. *
  57. * Return: true if the character is zero-width, false otherwise
  58. */
  59. bool ucs_is_zero_width(u32 cp)
  60. {
  61. if (UCS_IS_BMP(cp))
  62. return cp_in_range16(cp, ucs_zero_width_bmp_ranges,
  63. ARRAY_SIZE(ucs_zero_width_bmp_ranges));
  64. else
  65. return cp_in_range32(cp, ucs_zero_width_non_bmp_ranges,
  66. ARRAY_SIZE(ucs_zero_width_non_bmp_ranges));
  67. }
  68. /**
  69. * ucs_is_double_width() - Determine if a Unicode code point is double-width.
  70. * @cp: Unicode code point (UCS-4)
  71. *
  72. * Return: true if the character is double-width, false otherwise
  73. */
  74. bool ucs_is_double_width(u32 cp)
  75. {
  76. if (UCS_IS_BMP(cp))
  77. return cp_in_range16(cp, ucs_double_width_bmp_ranges,
  78. ARRAY_SIZE(ucs_double_width_bmp_ranges));
  79. else
  80. return cp_in_range32(cp, ucs_double_width_non_bmp_ranges,
  81. ARRAY_SIZE(ucs_double_width_non_bmp_ranges));
  82. }
  83. /*
  84. * Structure for base with combining mark pairs and resulting recompositions.
  85. * Using u16 to save space since all values are within BMP range.
  86. */
  87. struct ucs_recomposition {
  88. u16 base; /* base character */
  89. u16 mark; /* combining mark */
  90. u16 recomposed; /* corresponding recomposed character */
  91. };
  92. #include "ucs_recompose_table.h"
  93. struct compare_key {
  94. u16 base;
  95. u16 mark;
  96. };
  97. static int recomposition_cmp(const void *key, const void *element)
  98. {
  99. const struct compare_key *search_key = key;
  100. const struct ucs_recomposition *entry = element;
  101. /* Compare base character first */
  102. if (search_key->base < entry->base)
  103. return -1;
  104. if (search_key->base > entry->base)
  105. return 1;
  106. /* Base characters match, now compare combining character */
  107. if (search_key->mark < entry->mark)
  108. return -1;
  109. if (search_key->mark > entry->mark)
  110. return 1;
  111. /* Both match */
  112. return 0;
  113. }
  114. /**
  115. * ucs_recompose() - Attempt to recompose two Unicode characters into a single character.
  116. * @base: Base Unicode code point (UCS-4)
  117. * @mark: Combining mark Unicode code point (UCS-4)
  118. *
  119. * Return: Recomposed Unicode code point, or 0 if no recomposition is possible
  120. */
  121. u32 ucs_recompose(u32 base, u32 mark)
  122. {
  123. /* Check if characters are within the range of our table */
  124. if (base < UCS_RECOMPOSE_MIN_BASE || base > UCS_RECOMPOSE_MAX_BASE ||
  125. mark < UCS_RECOMPOSE_MIN_MARK || mark > UCS_RECOMPOSE_MAX_MARK)
  126. return 0;
  127. struct compare_key key = { base, mark };
  128. struct ucs_recomposition *result =
  129. __inline_bsearch(&key, ucs_recomposition_table,
  130. ARRAY_SIZE(ucs_recomposition_table),
  131. sizeof(*ucs_recomposition_table),
  132. recomposition_cmp);
  133. return result ? result->recomposed : 0;
  134. }
  135. /*
  136. * The fallback table structures implement a 2-level lookup.
  137. */
  138. struct ucs_page_desc {
  139. u8 page; /* Page index (high byte of code points) */
  140. u8 count; /* Number of entries in this page */
  141. u16 start; /* Start index in entries array */
  142. };
  143. struct ucs_page_entry {
  144. u8 offset; /* Offset within page (0-255) */
  145. u8 fallback; /* Fallback character or range start marker */
  146. };
  147. #include "ucs_fallback_table.h"
  148. static int ucs_page_desc_cmp(const void *key, const void *element)
  149. {
  150. u8 page = *(u8 *)key;
  151. const struct ucs_page_desc *entry = element;
  152. if (page < entry->page)
  153. return -1;
  154. if (page > entry->page)
  155. return 1;
  156. return 0;
  157. }
  158. static int ucs_page_entry_cmp(const void *key, const void *element)
  159. {
  160. u8 offset = *(u8 *)key;
  161. const struct ucs_page_entry *entry = element;
  162. if (offset < entry->offset)
  163. return -1;
  164. if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER) {
  165. if (offset > entry[1].offset)
  166. return 1;
  167. } else {
  168. if (offset > entry->offset)
  169. return 1;
  170. }
  171. return 0;
  172. }
  173. /**
  174. * ucs_get_fallback() - Get a substitution for the provided Unicode character
  175. * @cp: Unicode code point (UCS-4)
  176. *
  177. * Get a simpler fallback character for the provided Unicode character.
  178. * This is used for terminal display when corresponding glyph is unavailable.
  179. * The substitution may not be as good as the actual glyph for the original
  180. * character but still way more helpful than a squared question mark.
  181. *
  182. * Return: Fallback Unicode code point, or 0 if none is available
  183. */
  184. u32 ucs_get_fallback(u32 cp)
  185. {
  186. const struct ucs_page_desc *page;
  187. const struct ucs_page_entry *entry;
  188. u8 page_idx = cp >> 8, offset = cp;
  189. if (!UCS_IS_BMP(cp))
  190. return 0;
  191. /*
  192. * Full-width to ASCII mapping (covering all printable ASCII 33-126)
  193. * 0xFF01 (!) to 0xFF5E (~) -> ASCII 33 (!) to 126 (~)
  194. * We process them programmatically to reduce the table size.
  195. */
  196. if (cp >= 0xFF01 && cp <= 0xFF5E)
  197. return cp - 0xFF01 + 33;
  198. page = __inline_bsearch(&page_idx, ucs_fallback_pages,
  199. ARRAY_SIZE(ucs_fallback_pages),
  200. sizeof(*ucs_fallback_pages),
  201. ucs_page_desc_cmp);
  202. if (!page)
  203. return 0;
  204. entry = __inline_bsearch(&offset, ucs_fallback_entries + page->start,
  205. page->count, sizeof(*ucs_fallback_entries),
  206. ucs_page_entry_cmp);
  207. if (!entry)
  208. return 0;
  209. if (entry->fallback == UCS_PAGE_ENTRY_RANGE_MARKER)
  210. entry++;
  211. return entry->fallback;
  212. }