gconv_simple.c 32 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080
  1. /* Simple transformations functions.
  2. Copyright (C) 1997-2026 Free Software Foundation, Inc.
  3. This file is part of the GNU C Library.
  4. The GNU C Library is free software; you can redistribute it and/or
  5. modify it under the terms of the GNU Lesser General Public
  6. License as published by the Free Software Foundation; either
  7. version 2.1 of the License, or (at your option) any later version.
  8. The GNU C Library is distributed in the hope that it will be useful,
  9. but WITHOUT ANY WARRANTY; without even the implied warranty of
  10. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  11. Lesser General Public License for more details.
  12. You should have received a copy of the GNU Lesser General Public
  13. License along with the GNU C Library; if not, see
  14. <https://www.gnu.org/licenses/>. */
  15. #include <byteswap.h>
  16. #include <dlfcn.h>
  17. #include <endian.h>
  18. #include <errno.h>
  19. #include <gconv.h>
  20. #include <stdint.h>
  21. #include <stdlib.h>
  22. #include <string.h>
  23. #include <wchar.h>
  24. #include <sys/param.h>
  25. #include <gconv_int.h>
  26. #define BUILTIN_ALIAS(s1, s2) /* nothing */
  27. #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
  28. MinF, MaxF, MinT, MaxT) \
  29. extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
  30. const unsigned char **, const unsigned char *, \
  31. unsigned char **, size_t *, int, int);
  32. #include "gconv_builtin.h"
  33. #ifndef EILSEQ
  34. # define EILSEQ EINVAL
  35. #endif
  /* Single byte to INTERNAL conversion that recognizes ASCII only.
     Returns C unchanged when it is below 0x80 and WEOF for every other
     byte.  STEP is not examined.  */
  wint_t
  __gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
  {
    return c < 0x80 ? (wint_t) c : WEOF;
  }
  46. /* Transform from the internal, UCS4-like format, to UCS4. The
  47. difference between the internal ucs4 format and the real UCS4
  48. format is, if any, the endianness. The Unicode/ISO 10646 says that
  49. unless some higher protocol specifies it differently, the byte
  50. order is big endian.*/
  51. #define DEFINE_INIT 0
  52. #define DEFINE_FINI 0
  53. #define MIN_NEEDED_FROM 4
  54. #define MIN_NEEDED_TO 4
  55. #define FROM_DIRECTION 1
  56. #define FROM_LOOP internal_ucs4_loop
  57. #define TO_LOOP internal_ucs4_loop /* This is not used. */
  58. #define FUNCTION_NAME __gconv_transform_internal_ucs4
  59. #define ONE_DIRECTION 0
  60. static inline int
  61. __attribute ((always_inline))
  62. internal_ucs4_loop (struct __gconv_step *step,
  63. struct __gconv_step_data *step_data,
  64. const unsigned char **inptrp, const unsigned char *inend,
  65. unsigned char **outptrp, const unsigned char *outend,
  66. size_t *irreversible)
  67. {
  68. const unsigned char *inptr = *inptrp;
  69. unsigned char *outptr = *outptrp;
  70. size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  71. int result;
  72. #if __BYTE_ORDER == __LITTLE_ENDIAN
  73. /* Sigh, we have to do some real work. */
  74. size_t cnt;
  75. for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
  76. {
  77. uint32_t val = get32 (inptr);
  78. put32 (outptr, bswap_32 (val));
  79. }
  80. *inptrp = inptr;
  81. *outptrp = outptr;
  82. #elif __BYTE_ORDER == __BIG_ENDIAN
  83. /* Simply copy the data. */
  84. *inptrp = inptr + n_convert * 4;
  85. *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
  86. #else
  87. # error "This endianness is not supported."
  88. #endif
  89. /* Determine the status. */
  90. if (*inptrp == inend)
  91. result = __GCONV_EMPTY_INPUT;
  92. else if (*outptrp + 4 > outend)
  93. result = __GCONV_FULL_OUTPUT;
  94. else
  95. result = __GCONV_INCOMPLETE_INPUT;
  96. return result;
  97. }
  98. static inline int
  99. __attribute ((always_inline))
  100. internal_ucs4_loop_single (struct __gconv_step *step,
  101. struct __gconv_step_data *step_data,
  102. const unsigned char **inptrp,
  103. const unsigned char *inend,
  104. unsigned char **outptrp,
  105. const unsigned char *outend,
  106. size_t *irreversible)
  107. {
  108. mbstate_t *state = step_data->__statep;
  109. size_t cnt = state->__count & 7;
  110. while (*inptrp < inend && cnt < 4)
  111. state->__value.__wchb[cnt++] = *(*inptrp)++;
  112. if (__glibc_unlikely (cnt < 4))
  113. {
  114. /* Still not enough bytes. Store the ones in the input buffer. */
  115. state->__count &= ~7;
  116. state->__count |= cnt;
  117. return __GCONV_INCOMPLETE_INPUT;
  118. }
  119. #if __BYTE_ORDER == __LITTLE_ENDIAN
  120. (*outptrp)[0] = state->__value.__wchb[3];
  121. (*outptrp)[1] = state->__value.__wchb[2];
  122. (*outptrp)[2] = state->__value.__wchb[1];
  123. (*outptrp)[3] = state->__value.__wchb[0];
  124. #elif __BYTE_ORDER == __BIG_ENDIAN
  125. /* XXX unaligned */
  126. (*outptrp)[0] = state->__value.__wchb[0];
  127. (*outptrp)[1] = state->__value.__wchb[1];
  128. (*outptrp)[2] = state->__value.__wchb[2];
  129. (*outptrp)[3] = state->__value.__wchb[3];
  130. #else
  131. # error "This endianness is not supported."
  132. #endif
  133. *outptrp += 4;
  134. /* Clear the state buffer. */
  135. state->__count &= ~7;
  136. return __GCONV_OK;
  137. }
  138. #include <iconv/skeleton.c>
  139. /* Transform from UCS4 to the internal, UCS4-like format. Unlike
  140. for the other direction we have to check for correct values here. */
  141. #define DEFINE_INIT 0
  142. #define DEFINE_FINI 0
  143. #define MIN_NEEDED_FROM 4
  144. #define MIN_NEEDED_TO 4
  145. #define FROM_DIRECTION 1
  146. #define FROM_LOOP ucs4_internal_loop
  147. #define TO_LOOP ucs4_internal_loop /* This is not used. */
  148. #define FUNCTION_NAME __gconv_transform_ucs4_internal
  149. #define ONE_DIRECTION 0
  150. static inline int
  151. __attribute ((always_inline))
  152. ucs4_internal_loop (struct __gconv_step *step,
  153. struct __gconv_step_data *step_data,
  154. const unsigned char **inptrp, const unsigned char *inend,
  155. unsigned char **outptrp, const unsigned char *outend,
  156. size_t *irreversible)
  157. {
  158. int flags = step_data->__flags;
  159. const unsigned char *inptr = *inptrp;
  160. unsigned char *outptr = *outptrp;
  161. int result;
  162. for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
  163. {
  164. uint32_t inval = get32 (inptr);
  165. #if __BYTE_ORDER == __LITTLE_ENDIAN
  166. inval = bswap_32 (inval);
  167. #endif
  168. if (__glibc_unlikely (inval > 0x7fffffff))
  169. {
  170. /* The value is too large. We don't try transliteration here since
  171. this is not an error because of the lack of possibilities to
  172. represent the result. This is a genuine bug in the input since
  173. UCS4 does not allow such values. */
  174. if (irreversible == NULL)
  175. /* We are transliterating, don't try to correct anything. */
  176. return __gconv_mark_illegal_input (step_data);
  177. if (flags & __GCONV_IGNORE_ERRORS)
  178. {
  179. /* Just ignore this character. */
  180. ++*irreversible;
  181. continue;
  182. }
  183. *inptrp = inptr;
  184. *outptrp = outptr;
  185. return __gconv_mark_illegal_input (step_data);
  186. }
  187. put32 (outptr, inval);
  188. outptr += sizeof (uint32_t);
  189. }
  190. *inptrp = inptr;
  191. *outptrp = outptr;
  192. /* Determine the status. */
  193. if (*inptrp == inend)
  194. result = __GCONV_EMPTY_INPUT;
  195. else if (*outptrp + 4 > outend)
  196. result = __GCONV_FULL_OUTPUT;
  197. else
  198. result = __GCONV_INCOMPLETE_INPUT;
  199. return result;
  200. }
  201. static inline int
  202. __attribute ((always_inline))
  203. ucs4_internal_loop_single (struct __gconv_step *step,
  204. struct __gconv_step_data *step_data,
  205. const unsigned char **inptrp,
  206. const unsigned char *inend,
  207. unsigned char **outptrp,
  208. const unsigned char *outend,
  209. size_t *irreversible)
  210. {
  211. mbstate_t *state = step_data->__statep;
  212. int flags = step_data->__flags;
  213. size_t cnt = state->__count & 7;
  214. while (*inptrp < inend && cnt < 4)
  215. state->__value.__wchb[cnt++] = *(*inptrp)++;
  216. if (__glibc_unlikely (cnt < 4))
  217. {
  218. /* Still not enough bytes. Store the ones in the input buffer. */
  219. state->__count &= ~7;
  220. state->__count |= cnt;
  221. return __GCONV_INCOMPLETE_INPUT;
  222. }
  223. if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80,
  224. 0))
  225. {
  226. /* The value is too large. We don't try transliteration here since
  227. this is not an error because of the lack of possibilities to
  228. represent the result. This is a genuine bug in the input since
  229. UCS4 does not allow such values. */
  230. if (!(flags & __GCONV_IGNORE_ERRORS))
  231. {
  232. *inptrp -= cnt - (state->__count & 7);
  233. return __gconv_mark_illegal_input (step_data);
  234. }
  235. }
  236. else
  237. {
  238. #if __BYTE_ORDER == __LITTLE_ENDIAN
  239. (*outptrp)[0] = state->__value.__wchb[3];
  240. (*outptrp)[1] = state->__value.__wchb[2];
  241. (*outptrp)[2] = state->__value.__wchb[1];
  242. (*outptrp)[3] = state->__value.__wchb[0];
  243. #elif __BYTE_ORDER == __BIG_ENDIAN
  244. (*outptrp)[0] = state->__value.__wchb[0];
  245. (*outptrp)[1] = state->__value.__wchb[1];
  246. (*outptrp)[2] = state->__value.__wchb[2];
  247. (*outptrp)[3] = state->__value.__wchb[3];
  248. #endif
  249. *outptrp += 4;
  250. }
  251. /* Clear the state buffer. */
  252. state->__count &= ~7;
  253. return __GCONV_OK;
  254. }
  255. #include <iconv/skeleton.c>
  256. /* Similarly for the little endian form. */
  257. #define DEFINE_INIT 0
  258. #define DEFINE_FINI 0
  259. #define MIN_NEEDED_FROM 4
  260. #define MIN_NEEDED_TO 4
  261. #define FROM_DIRECTION 1
  262. #define FROM_LOOP internal_ucs4le_loop
  263. #define TO_LOOP internal_ucs4le_loop /* This is not used. */
  264. #define FUNCTION_NAME __gconv_transform_internal_ucs4le
  265. #define ONE_DIRECTION 0
  266. static inline int
  267. __attribute ((always_inline))
  268. internal_ucs4le_loop (struct __gconv_step *step,
  269. struct __gconv_step_data *step_data,
  270. const unsigned char **inptrp, const unsigned char *inend,
  271. unsigned char **outptrp, const unsigned char *outend,
  272. size_t *irreversible)
  273. {
  274. const unsigned char *inptr = *inptrp;
  275. unsigned char *outptr = *outptrp;
  276. size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
  277. int result;
  278. #if __BYTE_ORDER == __BIG_ENDIAN
  279. /* Sigh, we have to do some real work. */
  280. size_t cnt;
  281. for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
  282. {
  283. uint32_t val = get32 (inptr);
  284. put32 (outptr, bswap_32 (val));
  285. }
  286. *inptrp = inptr;
  287. *outptrp = outptr;
  288. #elif __BYTE_ORDER == __LITTLE_ENDIAN
  289. /* Simply copy the data. */
  290. *inptrp = inptr + n_convert * 4;
  291. *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
  292. #else
  293. # error "This endianness is not supported."
  294. #endif
  295. /* Determine the status. */
  296. if (*inptrp == inend)
  297. result = __GCONV_EMPTY_INPUT;
  298. else if (*outptrp + 4 > outend)
  299. result = __GCONV_FULL_OUTPUT;
  300. else
  301. result = __GCONV_INCOMPLETE_INPUT;
  302. return result;
  303. }
  304. static inline int
  305. __attribute ((always_inline))
  306. internal_ucs4le_loop_single (struct __gconv_step *step,
  307. struct __gconv_step_data *step_data,
  308. const unsigned char **inptrp,
  309. const unsigned char *inend,
  310. unsigned char **outptrp,
  311. const unsigned char *outend,
  312. size_t *irreversible)
  313. {
  314. mbstate_t *state = step_data->__statep;
  315. size_t cnt = state->__count & 7;
  316. while (*inptrp < inend && cnt < 4)
  317. state->__value.__wchb[cnt++] = *(*inptrp)++;
  318. if (__glibc_unlikely (cnt < 4))
  319. {
  320. /* Still not enough bytes. Store the ones in the input buffer. */
  321. state->__count &= ~7;
  322. state->__count |= cnt;
  323. return __GCONV_INCOMPLETE_INPUT;
  324. }
  325. #if __BYTE_ORDER == __BIG_ENDIAN
  326. (*outptrp)[0] = state->__value.__wchb[3];
  327. (*outptrp)[1] = state->__value.__wchb[2];
  328. (*outptrp)[2] = state->__value.__wchb[1];
  329. (*outptrp)[3] = state->__value.__wchb[0];
  330. #else
  331. /* XXX unaligned */
  332. (*outptrp)[0] = state->__value.__wchb[0];
  333. (*outptrp)[1] = state->__value.__wchb[1];
  334. (*outptrp)[2] = state->__value.__wchb[2];
  335. (*outptrp)[3] = state->__value.__wchb[3];
  336. #endif
  337. *outptrp += 4;
  338. /* Clear the state buffer. */
  339. state->__count &= ~7;
  340. return __GCONV_OK;
  341. }
  342. #include <iconv/skeleton.c>
  343. /* And finally from UCS4-LE to the internal encoding. */
  344. #define DEFINE_INIT 0
  345. #define DEFINE_FINI 0
  346. #define MIN_NEEDED_FROM 4
  347. #define MIN_NEEDED_TO 4
  348. #define FROM_DIRECTION 1
  349. #define FROM_LOOP ucs4le_internal_loop
  350. #define TO_LOOP ucs4le_internal_loop /* This is not used. */
  351. #define FUNCTION_NAME __gconv_transform_ucs4le_internal
  352. #define ONE_DIRECTION 0
  353. static inline int
  354. __attribute ((always_inline))
  355. ucs4le_internal_loop (struct __gconv_step *step,
  356. struct __gconv_step_data *step_data,
  357. const unsigned char **inptrp, const unsigned char *inend,
  358. unsigned char **outptrp, const unsigned char *outend,
  359. size_t *irreversible)
  360. {
  361. int flags = step_data->__flags;
  362. const unsigned char *inptr = *inptrp;
  363. unsigned char *outptr = *outptrp;
  364. int result;
  365. for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
  366. {
  367. uint32_t inval = get32 (inptr);
  368. #if __BYTE_ORDER == __BIG_ENDIAN
  369. inval = bswap_32 (inval);
  370. #endif
  371. if (__glibc_unlikely (inval > 0x7fffffff))
  372. {
  373. /* The value is too large. We don't try transliteration here since
  374. this is not an error because of the lack of possibilities to
  375. represent the result. This is a genuine bug in the input since
  376. UCS4 does not allow such values. */
  377. if (irreversible == NULL)
  378. /* We are transliterating, don't try to correct anything. */
  379. return __gconv_mark_illegal_input (step_data);
  380. if (flags & __GCONV_IGNORE_ERRORS)
  381. {
  382. /* Just ignore this character. */
  383. ++*irreversible;
  384. continue;
  385. }
  386. *inptrp = inptr;
  387. *outptrp = outptr;
  388. return __gconv_mark_illegal_input (step_data);
  389. }
  390. put32 (outptr, inval);
  391. outptr += sizeof (uint32_t);
  392. }
  393. *inptrp = inptr;
  394. *outptrp = outptr;
  395. /* Determine the status. */
  396. if (*inptrp == inend)
  397. result = __GCONV_EMPTY_INPUT;
  398. else if (*inptrp + 4 > inend)
  399. result = __GCONV_INCOMPLETE_INPUT;
  400. else
  401. {
  402. assert (*outptrp + 4 > outend);
  403. result = __GCONV_FULL_OUTPUT;
  404. }
  405. return result;
  406. }
  407. static inline int
  408. __attribute ((always_inline))
  409. ucs4le_internal_loop_single (struct __gconv_step *step,
  410. struct __gconv_step_data *step_data,
  411. const unsigned char **inptrp,
  412. const unsigned char *inend,
  413. unsigned char **outptrp,
  414. const unsigned char *outend,
  415. size_t *irreversible)
  416. {
  417. mbstate_t *state = step_data->__statep;
  418. int flags = step_data->__flags;
  419. size_t cnt = state->__count & 7;
  420. while (*inptrp < inend && cnt < 4)
  421. state->__value.__wchb[cnt++] = *(*inptrp)++;
  422. if (__glibc_unlikely (cnt < 4))
  423. {
  424. /* Still not enough bytes. Store the ones in the input buffer. */
  425. state->__count &= ~7;
  426. state->__count |= cnt;
  427. return __GCONV_INCOMPLETE_INPUT;
  428. }
  429. if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80,
  430. 0))
  431. {
  432. /* The value is too large. We don't try transliteration here since
  433. this is not an error because of the lack of possibilities to
  434. represent the result. This is a genuine bug in the input since
  435. UCS4 does not allow such values. */
  436. if (!(flags & __GCONV_IGNORE_ERRORS))
  437. return __gconv_mark_illegal_input (step_data);
  438. }
  439. else
  440. {
  441. #if __BYTE_ORDER == __BIG_ENDIAN
  442. (*outptrp)[0] = state->__value.__wchb[3];
  443. (*outptrp)[1] = state->__value.__wchb[2];
  444. (*outptrp)[2] = state->__value.__wchb[1];
  445. (*outptrp)[3] = state->__value.__wchb[0];
  446. #else
  447. (*outptrp)[0] = state->__value.__wchb[0];
  448. (*outptrp)[1] = state->__value.__wchb[1];
  449. (*outptrp)[2] = state->__value.__wchb[2];
  450. (*outptrp)[3] = state->__value.__wchb[3];
  451. #endif
  452. *outptrp += 4;
  453. }
  454. /* Clear the state buffer. */
  455. state->__count &= ~7;
  456. return __GCONV_OK;
  457. }
  458. #include <iconv/skeleton.c>
  459. /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */
  460. #define DEFINE_INIT 0
  461. #define DEFINE_FINI 0
  462. #define MIN_NEEDED_FROM 1
  463. #define MIN_NEEDED_TO 4
  464. #define FROM_DIRECTION 1
  465. #define FROM_LOOP ascii_internal_loop
  466. #define TO_LOOP ascii_internal_loop /* This is not used. */
  467. #define FUNCTION_NAME __gconv_transform_ascii_internal
  468. #define ONE_DIRECTION 1
  469. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  470. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  471. #define LOOPFCT FROM_LOOP
  472. #define BODY \
  473. { \
  474. if (__glibc_unlikely (*inptr > '\x7f')) \
  475. { \
  476. /* The value is too large. We don't try transliteration here since \
  477. this is not an error because of the lack of possibilities to \
  478. represent the result. This is a genuine bug in the input since \
  479. ASCII does not allow such values. */ \
  480. STANDARD_FROM_LOOP_ERR_HANDLER (1); \
  481. } \
  482. else \
  483. { \
  484. /* It's an one byte sequence. */ \
  485. *((uint32_t *) outptr) = *inptr++; \
  486. outptr += sizeof (uint32_t); \
  487. } \
  488. }
  489. #define LOOP_NEED_FLAGS
  490. #include <iconv/loop.c>
  491. #include <iconv/skeleton.c>
  492. /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */
  493. #define DEFINE_INIT 0
  494. #define DEFINE_FINI 0
  495. #define MIN_NEEDED_FROM 4
  496. #define MIN_NEEDED_TO 1
  497. #define FROM_DIRECTION 1
  498. #define FROM_LOOP internal_ascii_loop
  499. #define TO_LOOP internal_ascii_loop /* This is not used. */
  500. #define FUNCTION_NAME __gconv_transform_internal_ascii
  501. #define ONE_DIRECTION 1
  502. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  503. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  504. #define LOOPFCT FROM_LOOP
  505. #define BODY \
  506. { \
  507. if (__glibc_unlikely (*((const uint32_t *) inptr) > 0x7f)) \
  508. { \
  509. UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \
  510. STANDARD_TO_LOOP_ERR_HANDLER (4); \
  511. } \
  512. else \
  513. { \
  514. /* It's an one byte sequence. */ \
  515. *outptr++ = *((const uint32_t *) inptr); \
  516. inptr += sizeof (uint32_t); \
  517. } \
  518. }
  519. #define LOOP_NEED_FLAGS
  520. #include <iconv/loop.c>
  521. #include <iconv/skeleton.c>
  522. /* Convert from the internal (UCS4-like) format to UTF-8. */
  523. #define DEFINE_INIT 0
  524. #define DEFINE_FINI 0
  525. #define MIN_NEEDED_FROM 4
  526. #define MIN_NEEDED_TO 1
  527. #define MAX_NEEDED_TO 6
  528. #define FROM_DIRECTION 1
  529. #define FROM_LOOP internal_utf8_loop
  530. #define TO_LOOP internal_utf8_loop /* This is not used. */
  531. #define FUNCTION_NAME __gconv_transform_internal_utf8
  532. #define ONE_DIRECTION 1
  533. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  534. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  535. #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
  536. #define LOOPFCT FROM_LOOP
  537. #define BODY \
  538. { \
  539. uint32_t wc = *((const uint32_t *) inptr); \
  540. \
  541. if (__glibc_likely (wc < 0x80)) \
  542. /* It's an one byte sequence. */ \
  543. *outptr++ = (unsigned char) wc; \
  544. else if (__glibc_likely (wc <= 0x7fffffff \
  545. && (wc < 0xd800 || wc > 0xdfff))) \
  546. { \
  547. size_t step; \
  548. unsigned char *start; \
  549. \
  550. for (step = 2; step < 6; ++step) \
  551. if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
  552. break; \
  553. \
  554. if (__glibc_unlikely (outptr + step > outend)) \
  555. { \
  556. /* Too long. */ \
  557. result = __GCONV_FULL_OUTPUT; \
  558. break; \
  559. } \
  560. \
  561. start = outptr; \
  562. *outptr = (unsigned char) (~0xff >> step); \
  563. outptr += step; \
  564. do \
  565. { \
  566. start[--step] = 0x80 | (wc & 0x3f); \
  567. wc >>= 6; \
  568. } \
  569. while (step > 1); \
  570. start[0] |= wc; \
  571. } \
  572. else \
  573. { \
  574. STANDARD_TO_LOOP_ERR_HANDLER (4); \
  575. } \
  576. \
  577. inptr += 4; \
  578. }
  579. #define LOOP_NEED_FLAGS
  580. #include <iconv/loop.c>
  581. #include <iconv/skeleton.c>
  /* UTF-8 -> INTERNAL (UCS4-like).  Sequences of up to six bytes are
     recognized.  */
  #define DEFINE_INIT             0
  #define DEFINE_FINI             0
  #define MIN_NEEDED_FROM         1
  #define MAX_NEEDED_FROM         6
  #define MIN_NEEDED_TO           4
  #define FROM_DIRECTION          1
  #define FROM_LOOP               utf8_internal_loop
  #define TO_LOOP                 utf8_internal_loop /* This is not used.  */
  #define FUNCTION_NAME           __gconv_transform_utf8_internal
  #define ONE_DIRECTION           1

  #define MIN_NEEDED_INPUT        MIN_NEEDED_FROM
  #define MAX_NEEDED_INPUT        MAX_NEEDED_FROM
  #define MIN_NEEDED_OUTPUT       MIN_NEEDED_TO
  #define LOOPFCT                 FROM_LOOP
  /* One conversion step: decode the sequence at INPTR into CH and store
     it as a 32-bit unit.  Invalid lead bytes, bad trail bytes, overlong
     forms and surrogates go to the standard error handler; a sequence
     cut off by the end of the input yields __GCONV_INCOMPLETE_INPUT.  */
  #define BODY \
    { \
      /* Lead byte of the next sequence.  */ \
      uint32_t ch = *inptr; \
      \
      if (__glibc_likely (ch < 0x80)) \
        { \
          /* ASCII: the value is already complete.  */ \
          ++inptr; \
        } \
      else \
        { \
          unsigned int cnt; \
          unsigned int i; \
          \
          if (ch >= 0xc2 && ch < 0xe0) \
            { \
              /* Two bytes.  0xc0 and 0xc1 are excluded because the \
                 character would then fit in a single byte.  */ \
              cnt = 2; \
              ch &= 0x1f; \
            } \
          else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
            { \
              /* Three bytes.  */ \
              cnt = 3; \
              ch &= 0x0f; \
            } \
          else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
            { \
              /* Four bytes.  */ \
              cnt = 4; \
              ch &= 0x07; \
            } \
          else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
            { \
              /* Five bytes.  */ \
              cnt = 5; \
              ch &= 0x03; \
            } \
          else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
            { \
              /* Six bytes.  */ \
              cnt = 6; \
              ch &= 0x01; \
            } \
          else \
            { \
              /* Invalid lead byte.  Find where the ill-formed character \
                 ends: at the next byte with (x & 0xc0) != 0x80, after \
                 at most five bytes.  */ \
              i = 0; \
              do \
                ++i; \
              while (inptr + i < inend \
                     && (inptr[i] & 0xc0) == 0x80 \
                     && i < 5); \
              \
            errout: \
              STANDARD_FROM_LOOP_ERR_HANDLER (i); \
            } \
          \
          if (__glibc_unlikely (inptr + cnt > inend)) \
            { \
              /* The input ends inside the sequence.  Only report it as \
                 incomplete if every byte present is a proper trail \
                 byte.  */ \
              i = 1; \
              while (inptr + i < inend && (inptr[i] & 0xc0) == 0x80) \
                ++i; \
              \
              if (__glibc_likely (inptr + i == inend)) \
                { \
                  result = __GCONV_INCOMPLETE_INPUT; \
                  break; \
                } \
              \
              goto errout; \
            } \
          \
          /* Fold in the trail bytes, stopping at the first bad one.  */ \
          for (i = 1; i < cnt && (inptr[i] & 0xc0) == 0x80; ++i) \
            ch = (ch << 6) | (inptr[i] & 0x3f); \
          \
          /* i < cnt means some trail byte was not in 0x80..0xbf.  With \
             cnt > 2, ch < 2^(5*cnt-4) means fewer than cnt bytes would \
             have been enough, i.e. an overlong form.  */ \
          if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
              /* UTF-16 surrogates are not accepted.  */ \
              || (ch >= 0xd800 && ch <= 0xdfff)) \
            { \
              /* Illegal encoding.  */ \
              goto errout; \
            } \
          \
          inptr += cnt; \
        } \
      \
      /* Store the decoded character and move on.  */ \
      *((uint32_t *) outptr) = ch; \
      outptr += sizeof (uint32_t); \
    }
  #define LOOP_NEED_FLAGS
  /* Save an unfinished sequence in the state.  The bytes left in the
     input are converted to their partial UCS4 value on the way.  The
     first byte in the buffer is assumed to be a valid lead byte that
     asks for more bytes than the input still holds.  Afterwards the low
     byte of __count is the number of bytes seen, the bits from bit 8 up
     hold the length of the whole sequence, and __wch is the partial
     value shifted as if the missing bytes were zero.  */
  #define STORE_REST \
    { \
      wint_t ch = **inptrp; \
      size_t total, missing; \
      \
      state->__count = inend - *inptrp; \
      \
      assert (ch != 0xc0 && ch != 0xc1); \
      if (ch >= 0xc2 && ch < 0xe0) \
        { \
          /* Two bytes.  0xc0 and 0xc1 cannot occur as the character \
             would then fit in a single byte.  */ \
          total = 2; \
          ch &= 0x1f; \
        } \
      else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
        { \
          /* Three bytes.  */ \
          total = 3; \
          ch &= 0x0f; \
        } \
      else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
        { \
          /* Four bytes.  */ \
          total = 4; \
          ch &= 0x07; \
        } \
      else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
        { \
          /* Five bytes.  */ \
          total = 5; \
          ch &= 0x03; \
        } \
      else \
        { \
          /* Six bytes.  */ \
          total = 6; \
          ch &= 0x01; \
        } \
      \
      /* The lead byte is accounted for; fold in each trail byte that is \
         present.  */ \
      missing = total - 1; \
      while (++(*inptrp) < inend) \
        { \
          ch = (ch << 6) | (**inptrp & 0x3f); \
          --missing; \
        } \
      \
      /* Leave room for the bytes that have not arrived yet.  */ \
      ch <<= missing * 6; \
      \
      /* Record the length of the complete sequence.  */ \
      state->__count |= total << 8; \
      \
      /* Record the partial value.  */ \
      state->__value.__wch = ch; \
    }
  771. #define UNPACK_BYTES \
  772. { \
  773. static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
  774. wint_t wch = state->__value.__wch; \
  775. size_t ntotal = state->__count >> 8; \
  776. \
  777. inlen = state->__count & 255; \
  778. \
  779. bytebuf[0] = inmask[ntotal - 2]; \
  780. \
  781. do \
  782. { \
  783. if (--ntotal < inlen) \
  784. bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
  785. wch >>= 6; \
  786. } \
  787. while (ntotal > 1); \
  788. \
  789. bytebuf[0] |= wch; \
  790. }
  791. #define CLEAR_STATE \
  792. state->__count = 0
  793. #include <iconv/loop.c>
  794. #include <iconv/skeleton.c>
  795. /* Convert from UCS2 to the internal (UCS4-like) format. */
  796. #define DEFINE_INIT 0
  797. #define DEFINE_FINI 0
  798. #define MIN_NEEDED_FROM 2
  799. #define MIN_NEEDED_TO 4
  800. #define FROM_DIRECTION 1
  801. #define FROM_LOOP ucs2_internal_loop
  802. #define TO_LOOP ucs2_internal_loop /* This is not used. */
  803. #define FUNCTION_NAME __gconv_transform_ucs2_internal
  804. #define ONE_DIRECTION 1
  805. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  806. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  807. #define LOOPFCT FROM_LOOP
  808. #define BODY \
  809. { \
  810. uint16_t u1 = get16 (inptr); \
  811. \
  812. if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
  813. { \
  814. /* Surrogate characters in UCS-2 input are not valid. Reject \
  815. them. (Catching this here is not security relevant.) */ \
  816. STANDARD_FROM_LOOP_ERR_HANDLER (2); \
  817. } \
  818. \
  819. *((uint32_t *) outptr) = u1; \
  820. outptr += sizeof (uint32_t); \
  821. inptr += 2; \
  822. }
  823. #define LOOP_NEED_FLAGS
  824. #include <iconv/loop.c>
  825. #include <iconv/skeleton.c>
  826. /* Convert from the internal (UCS4-like) format to UCS2. */
  827. #define DEFINE_INIT 0
  828. #define DEFINE_FINI 0
  829. #define MIN_NEEDED_FROM 4
  830. #define MIN_NEEDED_TO 2
  831. #define FROM_DIRECTION 1
  832. #define FROM_LOOP internal_ucs2_loop
  833. #define TO_LOOP internal_ucs2_loop /* This is not used. */
  834. #define FUNCTION_NAME __gconv_transform_internal_ucs2
  835. #define ONE_DIRECTION 1
  836. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  837. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  838. #define LOOPFCT FROM_LOOP
  839. #define BODY \
  840. { \
  841. uint32_t val = *((const uint32_t *) inptr); \
  842. \
  843. if (__glibc_unlikely (val >= 0x10000)) \
  844. { \
  845. UNICODE_TAG_HANDLER (val, 4); \
  846. STANDARD_TO_LOOP_ERR_HANDLER (4); \
  847. } \
  848. else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
  849. { \
  850. /* Surrogate characters in UCS-4 input are not valid. \
  851. We must catch this, because the UCS-2 output might be \
  852. interpreted as UTF-16 by other programs. If we let \
  853. surrogates pass through, attackers could make a security \
  854. hole exploit by synthesizing any desired plane 1-16 \
  855. character. */ \
  856. result = __gconv_mark_illegal_input (step_data); \
  857. if (! ignore_errors_p ()) \
  858. break; \
  859. inptr += 4; \
  860. ++*irreversible; \
  861. continue; \
  862. } \
  863. else \
  864. { \
  865. put16 (outptr, val); \
  866. outptr += sizeof (uint16_t); \
  867. inptr += 4; \
  868. } \
  869. }
  870. #define LOOP_NEED_FLAGS
  871. #include <iconv/loop.c>
  872. #include <iconv/skeleton.c>
  873. /* Convert from UCS2 in other endianness to the internal (UCS4-like) format. */
  874. #define DEFINE_INIT 0
  875. #define DEFINE_FINI 0
  876. #define MIN_NEEDED_FROM 2
  877. #define MIN_NEEDED_TO 4
  878. #define FROM_DIRECTION 1
  879. #define FROM_LOOP ucs2reverse_internal_loop
  880. #define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
  881. #define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
  882. #define ONE_DIRECTION 1
  883. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  884. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  885. #define LOOPFCT FROM_LOOP
  886. #define BODY \
  887. { \
  888. uint16_t u1 = bswap_16 (get16 (inptr)); \
  889. \
  890. if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
  891. { \
  892. /* Surrogate characters in UCS-2 input are not valid. Reject \
  893. them. (Catching this here is not security relevant.) */ \
  894. if (! ignore_errors_p ()) \
  895. { \
  896. result = __gconv_mark_illegal_input (step_data); \
  897. break; \
  898. } \
  899. inptr += 2; \
  900. ++*irreversible; \
  901. continue; \
  902. } \
  903. \
  904. *((uint32_t *) outptr) = u1; \
  905. outptr += sizeof (uint32_t); \
  906. inptr += 2; \
  907. }
  908. #define LOOP_NEED_FLAGS
  909. #include <iconv/loop.c>
  910. #include <iconv/skeleton.c>
  911. /* Convert from the internal (UCS4-like) format to UCS2 in other endianness. */
  912. #define DEFINE_INIT 0
  913. #define DEFINE_FINI 0
  914. #define MIN_NEEDED_FROM 4
  915. #define MIN_NEEDED_TO 2
  916. #define FROM_DIRECTION 1
  917. #define FROM_LOOP internal_ucs2reverse_loop
  918. #define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
  919. #define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
  920. #define ONE_DIRECTION 1
  921. #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
  922. #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
  923. #define LOOPFCT FROM_LOOP
  924. #define BODY \
  925. { \
  926. uint32_t val = *((const uint32_t *) inptr); \
  927. if (__glibc_unlikely (val >= 0x10000)) \
  928. { \
  929. UNICODE_TAG_HANDLER (val, 4); \
  930. STANDARD_TO_LOOP_ERR_HANDLER (4); \
  931. } \
  932. else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
  933. { \
  934. /* Surrogate characters in UCS-4 input are not valid. \
  935. We must catch this, because the UCS-2 output might be \
  936. interpreted as UTF-16 by other programs. If we let \
  937. surrogates pass through, attackers could make a security \
  938. hole exploit by synthesizing any desired plane 1-16 \
  939. character. */ \
  940. if (! ignore_errors_p ()) \
  941. { \
  942. result = __gconv_mark_illegal_input (step_data); \
  943. break; \
  944. } \
  945. inptr += 4; \
  946. ++*irreversible; \
  947. continue; \
  948. } \
  949. else \
  950. { \
  951. put16 (outptr, bswap_16 (val)); \
  952. outptr += sizeof (uint16_t); \
  953. inptr += 4; \
  954. } \
  955. }
  956. #define LOOP_NEED_FLAGS
  957. #include <iconv/loop.c>
  958. #include <iconv/skeleton.c>