gen_ucs_recompose_table.py 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257
  1. #!/usr/bin/env python3
  2. # SPDX-License-Identifier: GPL-2.0
  3. #
  4. # Leverage Python's unicodedata module to generate ucs_recompose_table.h
  5. #
  6. # The generated table maps base character + combining mark pairs to their
  7. # precomposed equivalents.
  8. #
  9. # Usage:
  10. # python3 gen_ucs_recompose_table.py # Generate with common recomposition pairs
  11. # python3 gen_ucs_recompose_table.py --full # Generate with all recomposition pairs
  12. import unicodedata
  13. import sys
  14. import argparse
  15. import textwrap
from pathlib import Path

# Base name of this script; it is embedded in the generated header's
# "Auto-generated by" line and in the regeneration hint.
this_file = Path(__file__).name

# Default output file name (used when -o/--output is not given)
DEFAULT_OUT_FILE = "ucs_recompose_table.h"

# Human-readable description of the common table, inserted into the
# comments of the generated header.
common_recompose_description = "most commonly used Latin, Greek, and Cyrillic recomposition pairs only"
# Hand-picked recomposition pairs emitted when --full is not given.
#
# Each entry is a (base, combining mark, precomposed) triple of code points.
# The order of entries here does not matter: the list is sorted before the
# table is written, and every entry is checked against the data collected
# from unicodedata by validate_common_pairs().
COMMON_RECOMPOSITION_PAIRS = [
    # Latin letters with accents - uppercase
    (0x0041, 0x0300, 0x00C0),  # A + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER A WITH GRAVE
    (0x0041, 0x0301, 0x00C1),  # A + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER A WITH ACUTE
    (0x0041, 0x0302, 0x00C2),  # A + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER A WITH CIRCUMFLEX
    (0x0041, 0x0303, 0x00C3),  # A + COMBINING TILDE = LATIN CAPITAL LETTER A WITH TILDE
    (0x0041, 0x0308, 0x00C4),  # A + COMBINING DIAERESIS = LATIN CAPITAL LETTER A WITH DIAERESIS
    (0x0041, 0x030A, 0x00C5),  # A + COMBINING RING ABOVE = LATIN CAPITAL LETTER A WITH RING ABOVE
    (0x0043, 0x0327, 0x00C7),  # C + COMBINING CEDILLA = LATIN CAPITAL LETTER C WITH CEDILLA
    (0x0045, 0x0300, 0x00C8),  # E + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER E WITH GRAVE
    (0x0045, 0x0301, 0x00C9),  # E + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER E WITH ACUTE
    (0x0045, 0x0302, 0x00CA),  # E + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER E WITH CIRCUMFLEX
    (0x0045, 0x0308, 0x00CB),  # E + COMBINING DIAERESIS = LATIN CAPITAL LETTER E WITH DIAERESIS
    (0x0049, 0x0300, 0x00CC),  # I + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER I WITH GRAVE
    (0x0049, 0x0301, 0x00CD),  # I + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER I WITH ACUTE
    (0x0049, 0x0302, 0x00CE),  # I + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER I WITH CIRCUMFLEX
    (0x0049, 0x0308, 0x00CF),  # I + COMBINING DIAERESIS = LATIN CAPITAL LETTER I WITH DIAERESIS
    (0x004E, 0x0303, 0x00D1),  # N + COMBINING TILDE = LATIN CAPITAL LETTER N WITH TILDE
    (0x004F, 0x0300, 0x00D2),  # O + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER O WITH GRAVE
    (0x004F, 0x0301, 0x00D3),  # O + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER O WITH ACUTE
    (0x004F, 0x0302, 0x00D4),  # O + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER O WITH CIRCUMFLEX
    (0x004F, 0x0303, 0x00D5),  # O + COMBINING TILDE = LATIN CAPITAL LETTER O WITH TILDE
    (0x004F, 0x0308, 0x00D6),  # O + COMBINING DIAERESIS = LATIN CAPITAL LETTER O WITH DIAERESIS
    (0x0055, 0x0300, 0x00D9),  # U + COMBINING GRAVE ACCENT = LATIN CAPITAL LETTER U WITH GRAVE
    (0x0055, 0x0301, 0x00DA),  # U + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER U WITH ACUTE
    (0x0055, 0x0302, 0x00DB),  # U + COMBINING CIRCUMFLEX ACCENT = LATIN CAPITAL LETTER U WITH CIRCUMFLEX
    (0x0055, 0x0308, 0x00DC),  # U + COMBINING DIAERESIS = LATIN CAPITAL LETTER U WITH DIAERESIS
    (0x0059, 0x0301, 0x00DD),  # Y + COMBINING ACUTE ACCENT = LATIN CAPITAL LETTER Y WITH ACUTE
    # Latin letters with accents - lowercase
    (0x0061, 0x0300, 0x00E0),  # a + COMBINING GRAVE ACCENT = LATIN SMALL LETTER A WITH GRAVE
    (0x0061, 0x0301, 0x00E1),  # a + COMBINING ACUTE ACCENT = LATIN SMALL LETTER A WITH ACUTE
    (0x0061, 0x0302, 0x00E2),  # a + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER A WITH CIRCUMFLEX
    (0x0061, 0x0303, 0x00E3),  # a + COMBINING TILDE = LATIN SMALL LETTER A WITH TILDE
    (0x0061, 0x0308, 0x00E4),  # a + COMBINING DIAERESIS = LATIN SMALL LETTER A WITH DIAERESIS
    (0x0061, 0x030A, 0x00E5),  # a + COMBINING RING ABOVE = LATIN SMALL LETTER A WITH RING ABOVE
    (0x0063, 0x0327, 0x00E7),  # c + COMBINING CEDILLA = LATIN SMALL LETTER C WITH CEDILLA
    (0x0065, 0x0300, 0x00E8),  # e + COMBINING GRAVE ACCENT = LATIN SMALL LETTER E WITH GRAVE
    (0x0065, 0x0301, 0x00E9),  # e + COMBINING ACUTE ACCENT = LATIN SMALL LETTER E WITH ACUTE
    (0x0065, 0x0302, 0x00EA),  # e + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER E WITH CIRCUMFLEX
    (0x0065, 0x0308, 0x00EB),  # e + COMBINING DIAERESIS = LATIN SMALL LETTER E WITH DIAERESIS
    (0x0069, 0x0300, 0x00EC),  # i + COMBINING GRAVE ACCENT = LATIN SMALL LETTER I WITH GRAVE
    (0x0069, 0x0301, 0x00ED),  # i + COMBINING ACUTE ACCENT = LATIN SMALL LETTER I WITH ACUTE
    (0x0069, 0x0302, 0x00EE),  # i + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER I WITH CIRCUMFLEX
    (0x0069, 0x0308, 0x00EF),  # i + COMBINING DIAERESIS = LATIN SMALL LETTER I WITH DIAERESIS
    (0x006E, 0x0303, 0x00F1),  # n + COMBINING TILDE = LATIN SMALL LETTER N WITH TILDE
    (0x006F, 0x0300, 0x00F2),  # o + COMBINING GRAVE ACCENT = LATIN SMALL LETTER O WITH GRAVE
    (0x006F, 0x0301, 0x00F3),  # o + COMBINING ACUTE ACCENT = LATIN SMALL LETTER O WITH ACUTE
    (0x006F, 0x0302, 0x00F4),  # o + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER O WITH CIRCUMFLEX
    (0x006F, 0x0303, 0x00F5),  # o + COMBINING TILDE = LATIN SMALL LETTER O WITH TILDE
    (0x006F, 0x0308, 0x00F6),  # o + COMBINING DIAERESIS = LATIN SMALL LETTER O WITH DIAERESIS
    (0x0075, 0x0300, 0x00F9),  # u + COMBINING GRAVE ACCENT = LATIN SMALL LETTER U WITH GRAVE
    (0x0075, 0x0301, 0x00FA),  # u + COMBINING ACUTE ACCENT = LATIN SMALL LETTER U WITH ACUTE
    (0x0075, 0x0302, 0x00FB),  # u + COMBINING CIRCUMFLEX ACCENT = LATIN SMALL LETTER U WITH CIRCUMFLEX
    (0x0075, 0x0308, 0x00FC),  # u + COMBINING DIAERESIS = LATIN SMALL LETTER U WITH DIAERESIS
    (0x0079, 0x0301, 0x00FD),  # y + COMBINING ACUTE ACCENT = LATIN SMALL LETTER Y WITH ACUTE
    (0x0079, 0x0308, 0x00FF),  # y + COMBINING DIAERESIS = LATIN SMALL LETTER Y WITH DIAERESIS
    # Common Greek characters
    (0x0391, 0x0301, 0x0386),  # Α + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ALPHA WITH TONOS
    (0x0395, 0x0301, 0x0388),  # Ε + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER EPSILON WITH TONOS
    (0x0397, 0x0301, 0x0389),  # Η + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER ETA WITH TONOS
    (0x0399, 0x0301, 0x038A),  # Ι + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER IOTA WITH TONOS
    (0x039F, 0x0301, 0x038C),  # Ο + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMICRON WITH TONOS
    (0x03A5, 0x0301, 0x038E),  # Υ + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER UPSILON WITH TONOS
    (0x03A9, 0x0301, 0x038F),  # Ω + COMBINING ACUTE ACCENT = GREEK CAPITAL LETTER OMEGA WITH TONOS
    (0x03B1, 0x0301, 0x03AC),  # α + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ALPHA WITH TONOS
    (0x03B5, 0x0301, 0x03AD),  # ε + COMBINING ACUTE ACCENT = GREEK SMALL LETTER EPSILON WITH TONOS
    (0x03B7, 0x0301, 0x03AE),  # η + COMBINING ACUTE ACCENT = GREEK SMALL LETTER ETA WITH TONOS
    (0x03B9, 0x0301, 0x03AF),  # ι + COMBINING ACUTE ACCENT = GREEK SMALL LETTER IOTA WITH TONOS
    (0x03BF, 0x0301, 0x03CC),  # ο + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMICRON WITH TONOS
    (0x03C5, 0x0301, 0x03CD),  # υ + COMBINING ACUTE ACCENT = GREEK SMALL LETTER UPSILON WITH TONOS
    (0x03C9, 0x0301, 0x03CE),  # ω + COMBINING ACUTE ACCENT = GREEK SMALL LETTER OMEGA WITH TONOS
    # Common Cyrillic characters
    (0x0418, 0x0306, 0x0419),  # И + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT I
    (0x0438, 0x0306, 0x0439),  # и + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT I
    (0x0423, 0x0306, 0x040E),  # У + COMBINING BREVE = CYRILLIC CAPITAL LETTER SHORT U
    (0x0443, 0x0306, 0x045E),  # у + COMBINING BREVE = CYRILLIC SMALL LETTER SHORT U
]
# Human-readable description of the full table, inserted into the
# comments of the generated header.
full_recompose_description = "all possible recomposition pairs from the Unicode BMP"
  100. def collect_all_recomposition_pairs():
  101. """Collect all possible recomposition pairs from the Unicode data."""
  102. # Map to store recomposition pairs: (base, combining) -> recomposed
  103. recompose_map = {}
  104. # Process all assigned Unicode code points in BMP (Basic Multilingual Plane)
  105. # We limit to BMP (0x0000-0xFFFF) to keep our table smaller with uint16_t
  106. for cp in range(0, 0x10000):
  107. try:
  108. char = chr(cp)
  109. # Skip unassigned or control characters
  110. if not unicodedata.name(char, ''):
  111. continue
  112. # Find decomposition
  113. decomp = unicodedata.decomposition(char)
  114. if not decomp or '<' in decomp: # Skip compatibility decompositions
  115. continue
  116. # Parse the decomposition
  117. parts = decomp.split()
  118. if len(parts) == 2: # Simple base + combining mark
  119. base = int(parts[0], 16)
  120. combining = int(parts[1], 16)
  121. # Only store if both are in BMP
  122. if base < 0x10000 and combining < 0x10000:
  123. recompose_map[(base, combining)] = cp
  124. except (ValueError, TypeError):
  125. continue
  126. # Convert to a list of tuples and sort for binary search
  127. recompose_list = [(base, combining, recomposed)
  128. for (base, combining), recomposed in recompose_map.items()]
  129. recompose_list.sort()
  130. return recompose_list
  131. def validate_common_pairs(full_list):
  132. """Validate that all common pairs are in the full list.
  133. Raises:
  134. ValueError: If any common pair is missing or has a different recomposition
  135. value than what's in the full table.
  136. """
  137. full_pairs = {(base, combining): recomposed for base, combining, recomposed in full_list}
  138. for base, combining, recomposed in COMMON_RECOMPOSITION_PAIRS:
  139. full_recomposed = full_pairs.get((base, combining))
  140. if full_recomposed is None:
  141. error_msg = f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) not found in full data"
  142. print(error_msg)
  143. raise ValueError(error_msg)
  144. elif full_recomposed != recomposed:
  145. error_msg = (f"Error: Common pair (0x{base:04X}, 0x{combining:04X}) has different recomposition: "
  146. f"0x{recomposed:04X} vs 0x{full_recomposed:04X}")
  147. print(error_msg)
  148. raise ValueError(error_msg)
  149. def generate_recomposition_table(use_full_list=False, out_file=DEFAULT_OUT_FILE):
  150. """Generate the recomposition C table."""
  151. # Collect all recomposition pairs for validation
  152. full_recompose_list = collect_all_recomposition_pairs()
  153. # Decide which list to use
  154. if use_full_list:
  155. print("Using full recomposition list...")
  156. recompose_list = full_recompose_list
  157. table_description = full_recompose_description
  158. alt_list = COMMON_RECOMPOSITION_PAIRS
  159. alt_description = common_recompose_description
  160. else:
  161. print("Using common recomposition list...")
  162. # Validate that all common pairs are in the full list
  163. validate_common_pairs(full_recompose_list)
  164. recompose_list = sorted(COMMON_RECOMPOSITION_PAIRS)
  165. table_description = common_recompose_description
  166. alt_list = full_recompose_list
  167. alt_description = full_recompose_description
  168. generation_mode = " --full" if use_full_list else ""
  169. alternative_mode = " --full" if not use_full_list else ""
  170. table_description_detail = f"{table_description} ({len(recompose_list)} entries)"
  171. alt_description_detail = f"{alt_description} ({len(alt_list)} entries)"
  172. # Calculate min/max values for boundary checks
  173. min_base = min(base for base, _, _ in recompose_list)
  174. max_base = max(base for base, _, _ in recompose_list)
  175. min_combining = min(combining for _, combining, _ in recompose_list)
  176. max_combining = max(combining for _, combining, _ in recompose_list)
  177. # Generate implementation file
  178. with open(out_file, 'w') as f:
  179. f.write(f"""\
  180. /* SPDX-License-Identifier: GPL-2.0 */
  181. /*
  182. * {out_file} - Unicode character recomposition
  183. *
  184. * Auto-generated by {this_file}{generation_mode}
  185. *
  186. * Unicode Version: {unicodedata.unidata_version}
  187. *
  188. {textwrap.fill(
  189. f"This file contains a table with {table_description_detail}. " +
  190. f"To generate a table with {alt_description_detail} instead, run:",
  191. width=75, initial_indent=" * ", subsequent_indent=" * ")}
  192. *
  193. * python3 {this_file}{alternative_mode}
  194. */
  195. /*
  196. * Table of {table_description}
  197. * Sorted by base character and then combining mark for binary search
  198. */
  199. static const struct ucs_recomposition ucs_recomposition_table[] = {{
  200. """)
  201. for base, combining, recomposed in recompose_list:
  202. try:
  203. base_name = unicodedata.name(chr(base))
  204. combining_name = unicodedata.name(chr(combining))
  205. recomposed_name = unicodedata.name(chr(recomposed))
  206. comment = f"/* {base_name} + {combining_name} = {recomposed_name} */"
  207. except ValueError:
  208. comment = f"/* U+{base:04X} + U+{combining:04X} = U+{recomposed:04X} */"
  209. f.write(f"\t{{ 0x{base:04X}, 0x{combining:04X}, 0x{recomposed:04X} }}, {comment}\n")
  210. f.write(f"""\
  211. }};
  212. /*
  213. * Boundary values for quick rejection
  214. * These are calculated by analyzing the table during generation
  215. */
  216. #define UCS_RECOMPOSE_MIN_BASE 0x{min_base:04X}
  217. #define UCS_RECOMPOSE_MAX_BASE 0x{max_base:04X}
  218. #define UCS_RECOMPOSE_MIN_MARK 0x{min_combining:04X}
  219. #define UCS_RECOMPOSE_MAX_MARK 0x{max_combining:04X}
  220. """)
  221. if __name__ == "__main__":
  222. parser = argparse.ArgumentParser(description="Generate Unicode recomposition table")
  223. parser.add_argument("--full", action="store_true",
  224. help="Generate a full recomposition table (default: common pairs only)")
  225. parser.add_argument("-o", "--output", dest="output_file", default=DEFAULT_OUT_FILE,
  226. help=f"Output file name (default: {DEFAULT_OUT_FILE})")
  227. args = parser.parse_args()
  228. generate_recomposition_table(use_full_list=args.full, out_file=args.output_file)