gconv_trans.c 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237
  1. /* Transliteration using the locale's data.
  2. Copyright (C) 2000-2026 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library; if not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #include <assert.h>
  16. #include <dlfcn.h>
  17. #include <search.h>
  18. #include <stdint.h>
  19. #include <string.h>
  20. #include <stdlib.h>
  21. #include <libc-lock.h>
  22. #include "gconv_int.h"
  23. #include "../locale/localeinfo.h"
  24. #include <pointer_guard.h>
  25. int
  26. __gconv_transliterate (struct __gconv_step *step,
  27. struct __gconv_step_data *step_data,
  28. const unsigned char *inbufstart,
  29. const unsigned char **inbufp,
  30. const unsigned char *inbufend,
  31. unsigned char **outbufstart, size_t *irreversible)
  32. {
  33. /* Find out about the locale's transliteration. */
  34. uint32_t size;
  35. const uint32_t *from_idx;
  36. const uint32_t *from_tbl;
  37. const uint32_t *to_idx;
  38. const uint32_t *to_tbl;
  39. const uint32_t *winbuf;
  40. const uint32_t *winbufend;
  41. uint32_t low;
  42. uint32_t high;
  43. /* The input buffer. There are actually 4-byte values. */
  44. winbuf = (const uint32_t *) *inbufp;
  45. winbufend = (const uint32_t *) inbufend;
  46. __gconv_fct fct = step->__fct;
  47. if (step->__shlib_handle != NULL)
  48. PTR_DEMANGLE (fct);
  49. /* If there is no transliteration information in the locale don't do
  50. anything and return the error. */
  51. size = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_TAB_SIZE);
  52. if (size == 0)
  53. goto no_rules;
  54. /* Get the rest of the values. */
  55. from_idx =
  56. (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_IDX);
  57. from_tbl =
  58. (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_FROM_TBL);
  59. to_idx =
  60. (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_IDX);
  61. to_tbl =
  62. (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_TO_TBL);
  63. /* Test whether there is enough input. */
  64. if (winbuf + 1 > winbufend)
  65. return (winbuf == winbufend
  66. ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
  67. /* The array starting at FROM_IDX contains indices to the string table
  68. in FROM_TBL. The indices are sorted wrt to the strings. I.e., we
  69. are doing binary search. */
  70. low = 0;
  71. high = size;
  72. while (low < high)
  73. {
  74. uint32_t med = (low + high) / 2;
  75. uint32_t idx;
  76. int cnt;
  77. /* Compare the string at this index with the string at the current
  78. position in the input buffer. */
  79. idx = from_idx[med];
  80. cnt = 0;
  81. do
  82. {
  83. if (from_tbl[idx + cnt] != winbuf[cnt])
  84. /* Does not match. */
  85. break;
  86. ++cnt;
  87. }
  88. while (from_tbl[idx + cnt] != L'\0' && winbuf + cnt < winbufend);
  89. if (cnt > 0 && from_tbl[idx + cnt] == L'\0')
  90. {
  91. /* Found a matching input sequence. Now try to convert the
  92. possible replacements. */
  93. uint32_t idx2 = to_idx[med];
  94. do
  95. {
  96. /* Determine length of replacement. */
  97. unsigned int len = 0;
  98. int res;
  99. const unsigned char *toinptr;
  100. unsigned char *outptr;
  101. while (to_tbl[idx2 + len] != L'\0')
  102. ++len;
  103. /* Try this input text. */
  104. toinptr = (const unsigned char *) &to_tbl[idx2];
  105. outptr = *outbufstart;
  106. res = DL_CALL_FCT (fct,
  107. (step, step_data, &toinptr,
  108. (const unsigned char *) &to_tbl[idx2 + len],
  109. &outptr, NULL, 0, 0));
  110. if (res != __GCONV_ILLEGAL_INPUT)
  111. {
  112. /* If the conversion succeeds we have to increment the
  113. input buffer. */
  114. if (res == __GCONV_EMPTY_INPUT)
  115. {
  116. *inbufp += cnt * sizeof (uint32_t);
  117. ++*irreversible;
  118. res = __GCONV_OK;
  119. }
  120. /* Do not increment the output pointer if we could not
  121. store the entire output. */
  122. if (res != __GCONV_FULL_OUTPUT)
  123. *outbufstart = outptr;
  124. return res;
  125. }
  126. /* Next replacement. */
  127. idx2 += len + 1;
  128. }
  129. while (to_tbl[idx2] != L'\0');
  130. /* Nothing found, continue searching. */
  131. }
  132. else if (cnt > 0 && winbuf + cnt == winbufend)
  133. /* This means that the input buffer contents matches a prefix of
  134. an entry. Since we cannot match it unless we get more input,
  135. we will tell the caller about it. */
  136. return __GCONV_INCOMPLETE_INPUT;
  137. if (winbuf + cnt >= winbufend || from_tbl[idx + cnt] < winbuf[cnt])
  138. low = med + 1;
  139. else
  140. high = med;
  141. }
  142. no_rules:
  143. /* Maybe the character is supposed to be ignored. */
  144. if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN) != 0)
  145. {
  146. int n = _NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE_LEN);
  147. const uint32_t *ranges =
  148. (const uint32_t *) _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_IGNORE);
  149. const uint32_t wc = *(const uint32_t *) (*inbufp);
  150. int i;
  151. /* Test whether there is enough input. */
  152. if (winbuf + 1 > winbufend)
  153. return (winbuf == winbufend
  154. ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
  155. for (i = 0; i < n; ranges += 3, ++i)
  156. if (ranges[0] <= wc && wc <= ranges[1]
  157. && (wc - ranges[0]) % ranges[2] == 0)
  158. {
  159. /* Matches the range. Ignore it. */
  160. *inbufp += 4;
  161. ++*irreversible;
  162. return __GCONV_OK;
  163. }
  164. else if (wc < ranges[0])
  165. /* There cannot be any other matching range since they are
  166. sorted. */
  167. break;
  168. }
  169. /* One last chance: use the default replacement. */
  170. if (_NL_CURRENT_WORD (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN) != 0)
  171. {
  172. const uint32_t *default_missing = (const uint32_t *)
  173. _NL_CURRENT (LC_CTYPE, _NL_CTYPE_TRANSLIT_DEFAULT_MISSING);
  174. const unsigned char *toinptr = (const unsigned char *) default_missing;
  175. uint32_t len = _NL_CURRENT_WORD (LC_CTYPE,
  176. _NL_CTYPE_TRANSLIT_DEFAULT_MISSING_LEN);
  177. unsigned char *outptr;
  178. int res;
  179. /* Test whether there is enough input. */
  180. if (winbuf + 1 > winbufend)
  181. return (winbuf == winbufend
  182. ? __GCONV_EMPTY_INPUT : __GCONV_INCOMPLETE_INPUT);
  183. outptr = *outbufstart;
  184. res = DL_CALL_FCT (fct,
  185. (step, step_data, &toinptr,
  186. (const unsigned char *) (default_missing + len),
  187. &outptr, NULL, 0, 0));
  188. if (res != __GCONV_ILLEGAL_INPUT)
  189. {
  190. /* If the conversion succeeds we have to increment the
  191. input buffer. */
  192. if (res == __GCONV_EMPTY_INPUT)
  193. {
  194. /* This worked but is not reversible. */
  195. ++*irreversible;
  196. *inbufp += 4;
  197. res = __GCONV_OK;
  198. }
  199. *outbufstart = outptr;
  200. return res;
  201. }
  202. }
  203. /* Haven't found a match. */
  204. return __gconv_mark_illegal_input (step_data);
  205. }
  206. libc_hidden_def (__gconv_transliterate)