| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746747748749750751752753754755756757758759760761762763764765766767768769770771772773774775776777778779780781782783784785786787788789790791792793794795796797798799800801802803804805806807808809810811812813814815816817818819820821822823824825826827828829830831832833834835836837838839840841842843844845846847848849850851852853854855856857858859860861862863864865866867868869870871872873874875876877878879880881882883884885886887888889890891892893894895896897898899900901902903904905906907908909910911912913914915916917918919920921922923924925926927928929930931932933934935936937938939940941942943944945946947948949950951952953954955956957958959960961962963964965966967968969970971972973974975976977978979980981982983984985986987988989990991992993994995996997998999100010011002100310041005100610071008100910101011101210131014101510161017101810191020102110221023102410251026102710281029103010311032103310341035103610371038103910401041104210431044104510461047104810491050105110521053105410551056105710581059106010611062106310641065106610671068106910701071107210731074107510761077107810791080 |
- /* Simple transformation functions.
- Copyright (C) 1997-2026 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
- #include <byteswap.h>
- #include <dlfcn.h>
- #include <endian.h>
- #include <errno.h>
- #include <gconv.h>
- #include <stdint.h>
- #include <stdlib.h>
- #include <string.h>
- #include <wchar.h>
- #include <sys/param.h>
- #include <gconv_int.h>
- #define BUILTIN_ALIAS(s1, s2) /* nothing */
- #define BUILTIN_TRANSFORMATION(From, To, Cost, Name, Fct, BtowcFct, \
- MinF, MaxF, MinT, MaxT) \
- extern int Fct (struct __gconv_step *, struct __gconv_step_data *, \
- const unsigned char **, const unsigned char *, \
- unsigned char **, size_t *, int, int);
- #include "gconv_builtin.h"
- #ifndef EILSEQ
- # define EILSEQ EINVAL
- #endif
/* Single-byte shortcut for converting to INTERNAL when only ASCII is
   recognized: a byte C below 0x80 is returned unchanged as a wide
   character, any other byte yields WEOF.  STEP is not used.  */
wint_t
__gconv_btwoc_ascii (struct __gconv_step *step, unsigned char c)
{
  return c < 0x80 ? (wint_t) c : WEOF;
}
- /* Transform from the internal, UCS4-like format, to UCS4. The
- difference between the internal ucs4 format and the real UCS4
- format is, if any, the endianness. The Unicode/ISO 10646 says that
- unless some higher protocol specifies it differently, the byte
- order is big endian.*/
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_ucs4_loop
- #define TO_LOOP internal_ucs4_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_internal_ucs4
- #define ONE_DIRECTION 0
- static inline int
- __attribute ((always_inline))
- internal_ucs4_loop (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp, const unsigned char *inend,
- unsigned char **outptrp, const unsigned char *outend,
- size_t *irreversible)
- {
- const unsigned char *inptr = *inptrp;
- unsigned char *outptr = *outptrp;
- size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
- int result;
- #if __BYTE_ORDER == __LITTLE_ENDIAN
- /* Sigh, we have to do some real work. */
- size_t cnt;
- for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
- {
- uint32_t val = get32 (inptr);
- put32 (outptr, bswap_32 (val));
- }
- *inptrp = inptr;
- *outptrp = outptr;
- #elif __BYTE_ORDER == __BIG_ENDIAN
- /* Simply copy the data. */
- *inptrp = inptr + n_convert * 4;
- *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
- #else
- # error "This endianness is not supported."
- #endif
- /* Determine the status. */
- if (*inptrp == inend)
- result = __GCONV_EMPTY_INPUT;
- else if (*outptrp + 4 > outend)
- result = __GCONV_FULL_OUTPUT;
- else
- result = __GCONV_INCOMPLETE_INPUT;
- return result;
- }
- static inline int
- __attribute ((always_inline))
- internal_ucs4_loop_single (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp,
- const unsigned char *inend,
- unsigned char **outptrp,
- const unsigned char *outend,
- size_t *irreversible)
- {
- mbstate_t *state = step_data->__statep;
- size_t cnt = state->__count & 7;
- while (*inptrp < inend && cnt < 4)
- state->__value.__wchb[cnt++] = *(*inptrp)++;
- if (__glibc_unlikely (cnt < 4))
- {
- /* Still not enough bytes. Store the ones in the input buffer. */
- state->__count &= ~7;
- state->__count |= cnt;
- return __GCONV_INCOMPLETE_INPUT;
- }
- #if __BYTE_ORDER == __LITTLE_ENDIAN
- (*outptrp)[0] = state->__value.__wchb[3];
- (*outptrp)[1] = state->__value.__wchb[2];
- (*outptrp)[2] = state->__value.__wchb[1];
- (*outptrp)[3] = state->__value.__wchb[0];
- #elif __BYTE_ORDER == __BIG_ENDIAN
- /* XXX unaligned */
- (*outptrp)[0] = state->__value.__wchb[0];
- (*outptrp)[1] = state->__value.__wchb[1];
- (*outptrp)[2] = state->__value.__wchb[2];
- (*outptrp)[3] = state->__value.__wchb[3];
- #else
- # error "This endianness is not supported."
- #endif
- *outptrp += 4;
- /* Clear the state buffer. */
- state->__count &= ~7;
- return __GCONV_OK;
- }
- #include <iconv/skeleton.c>
- /* Transform from UCS4 to the internal, UCS4-like format. Unlike
- for the other direction we have to check for correct values here. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP ucs4_internal_loop
- #define TO_LOOP ucs4_internal_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_ucs4_internal
- #define ONE_DIRECTION 0
- static inline int
- __attribute ((always_inline))
- ucs4_internal_loop (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp, const unsigned char *inend,
- unsigned char **outptrp, const unsigned char *outend,
- size_t *irreversible)
- {
- int flags = step_data->__flags;
- const unsigned char *inptr = *inptrp;
- unsigned char *outptr = *outptrp;
- int result;
- for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
- {
- uint32_t inval = get32 (inptr);
- #if __BYTE_ORDER == __LITTLE_ENDIAN
- inval = bswap_32 (inval);
- #endif
- if (__glibc_unlikely (inval > 0x7fffffff))
- {
- /* The value is too large. We don't try transliteration here since
- this is not an error because of the lack of possibilities to
- represent the result. This is a genuine bug in the input since
- UCS4 does not allow such values. */
- if (irreversible == NULL)
- /* We are transliterating, don't try to correct anything. */
- return __gconv_mark_illegal_input (step_data);
- if (flags & __GCONV_IGNORE_ERRORS)
- {
- /* Just ignore this character. */
- ++*irreversible;
- continue;
- }
- *inptrp = inptr;
- *outptrp = outptr;
- return __gconv_mark_illegal_input (step_data);
- }
- put32 (outptr, inval);
- outptr += sizeof (uint32_t);
- }
- *inptrp = inptr;
- *outptrp = outptr;
- /* Determine the status. */
- if (*inptrp == inend)
- result = __GCONV_EMPTY_INPUT;
- else if (*outptrp + 4 > outend)
- result = __GCONV_FULL_OUTPUT;
- else
- result = __GCONV_INCOMPLETE_INPUT;
- return result;
- }
- static inline int
- __attribute ((always_inline))
- ucs4_internal_loop_single (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp,
- const unsigned char *inend,
- unsigned char **outptrp,
- const unsigned char *outend,
- size_t *irreversible)
- {
- mbstate_t *state = step_data->__statep;
- int flags = step_data->__flags;
- size_t cnt = state->__count & 7;
- while (*inptrp < inend && cnt < 4)
- state->__value.__wchb[cnt++] = *(*inptrp)++;
- if (__glibc_unlikely (cnt < 4))
- {
- /* Still not enough bytes. Store the ones in the input buffer. */
- state->__count &= ~7;
- state->__count |= cnt;
- return __GCONV_INCOMPLETE_INPUT;
- }
- if (__builtin_expect (((unsigned char *) state->__value.__wchb)[0] > 0x80,
- 0))
- {
- /* The value is too large. We don't try transliteration here since
- this is not an error because of the lack of possibilities to
- represent the result. This is a genuine bug in the input since
- UCS4 does not allow such values. */
- if (!(flags & __GCONV_IGNORE_ERRORS))
- {
- *inptrp -= cnt - (state->__count & 7);
- return __gconv_mark_illegal_input (step_data);
- }
- }
- else
- {
- #if __BYTE_ORDER == __LITTLE_ENDIAN
- (*outptrp)[0] = state->__value.__wchb[3];
- (*outptrp)[1] = state->__value.__wchb[2];
- (*outptrp)[2] = state->__value.__wchb[1];
- (*outptrp)[3] = state->__value.__wchb[0];
- #elif __BYTE_ORDER == __BIG_ENDIAN
- (*outptrp)[0] = state->__value.__wchb[0];
- (*outptrp)[1] = state->__value.__wchb[1];
- (*outptrp)[2] = state->__value.__wchb[2];
- (*outptrp)[3] = state->__value.__wchb[3];
- #endif
- *outptrp += 4;
- }
- /* Clear the state buffer. */
- state->__count &= ~7;
- return __GCONV_OK;
- }
- #include <iconv/skeleton.c>
- /* Similarly for the little endian form. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_ucs4le_loop
- #define TO_LOOP internal_ucs4le_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_internal_ucs4le
- #define ONE_DIRECTION 0
- static inline int
- __attribute ((always_inline))
- internal_ucs4le_loop (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp, const unsigned char *inend,
- unsigned char **outptrp, const unsigned char *outend,
- size_t *irreversible)
- {
- const unsigned char *inptr = *inptrp;
- unsigned char *outptr = *outptrp;
- size_t n_convert = MIN (inend - inptr, outend - outptr) / 4;
- int result;
- #if __BYTE_ORDER == __BIG_ENDIAN
- /* Sigh, we have to do some real work. */
- size_t cnt;
- for (cnt = 0; cnt < n_convert; ++cnt, inptr += 4, outptr += 4)
- {
- uint32_t val = get32 (inptr);
- put32 (outptr, bswap_32 (val));
- }
- *inptrp = inptr;
- *outptrp = outptr;
- #elif __BYTE_ORDER == __LITTLE_ENDIAN
- /* Simply copy the data. */
- *inptrp = inptr + n_convert * 4;
- *outptrp = __mempcpy (outptr, inptr, n_convert * 4);
- #else
- # error "This endianness is not supported."
- #endif
- /* Determine the status. */
- if (*inptrp == inend)
- result = __GCONV_EMPTY_INPUT;
- else if (*outptrp + 4 > outend)
- result = __GCONV_FULL_OUTPUT;
- else
- result = __GCONV_INCOMPLETE_INPUT;
- return result;
- }
- static inline int
- __attribute ((always_inline))
- internal_ucs4le_loop_single (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp,
- const unsigned char *inend,
- unsigned char **outptrp,
- const unsigned char *outend,
- size_t *irreversible)
- {
- mbstate_t *state = step_data->__statep;
- size_t cnt = state->__count & 7;
- while (*inptrp < inend && cnt < 4)
- state->__value.__wchb[cnt++] = *(*inptrp)++;
- if (__glibc_unlikely (cnt < 4))
- {
- /* Still not enough bytes. Store the ones in the input buffer. */
- state->__count &= ~7;
- state->__count |= cnt;
- return __GCONV_INCOMPLETE_INPUT;
- }
- #if __BYTE_ORDER == __BIG_ENDIAN
- (*outptrp)[0] = state->__value.__wchb[3];
- (*outptrp)[1] = state->__value.__wchb[2];
- (*outptrp)[2] = state->__value.__wchb[1];
- (*outptrp)[3] = state->__value.__wchb[0];
- #else
- /* XXX unaligned */
- (*outptrp)[0] = state->__value.__wchb[0];
- (*outptrp)[1] = state->__value.__wchb[1];
- (*outptrp)[2] = state->__value.__wchb[2];
- (*outptrp)[3] = state->__value.__wchb[3];
- #endif
- *outptrp += 4;
- /* Clear the state buffer. */
- state->__count &= ~7;
- return __GCONV_OK;
- }
- #include <iconv/skeleton.c>
- /* And finally from UCS4-LE to the internal encoding. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP ucs4le_internal_loop
- #define TO_LOOP ucs4le_internal_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_ucs4le_internal
- #define ONE_DIRECTION 0
- static inline int
- __attribute ((always_inline))
- ucs4le_internal_loop (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp, const unsigned char *inend,
- unsigned char **outptrp, const unsigned char *outend,
- size_t *irreversible)
- {
- int flags = step_data->__flags;
- const unsigned char *inptr = *inptrp;
- unsigned char *outptr = *outptrp;
- int result;
- for (; inptr + 4 <= inend && outptr + 4 <= outend; inptr += 4)
- {
- uint32_t inval = get32 (inptr);
- #if __BYTE_ORDER == __BIG_ENDIAN
- inval = bswap_32 (inval);
- #endif
- if (__glibc_unlikely (inval > 0x7fffffff))
- {
- /* The value is too large. We don't try transliteration here since
- this is not an error because of the lack of possibilities to
- represent the result. This is a genuine bug in the input since
- UCS4 does not allow such values. */
- if (irreversible == NULL)
- /* We are transliterating, don't try to correct anything. */
- return __gconv_mark_illegal_input (step_data);
- if (flags & __GCONV_IGNORE_ERRORS)
- {
- /* Just ignore this character. */
- ++*irreversible;
- continue;
- }
- *inptrp = inptr;
- *outptrp = outptr;
- return __gconv_mark_illegal_input (step_data);
- }
- put32 (outptr, inval);
- outptr += sizeof (uint32_t);
- }
- *inptrp = inptr;
- *outptrp = outptr;
- /* Determine the status. */
- if (*inptrp == inend)
- result = __GCONV_EMPTY_INPUT;
- else if (*inptrp + 4 > inend)
- result = __GCONV_INCOMPLETE_INPUT;
- else
- {
- assert (*outptrp + 4 > outend);
- result = __GCONV_FULL_OUTPUT;
- }
- return result;
- }
- static inline int
- __attribute ((always_inline))
- ucs4le_internal_loop_single (struct __gconv_step *step,
- struct __gconv_step_data *step_data,
- const unsigned char **inptrp,
- const unsigned char *inend,
- unsigned char **outptrp,
- const unsigned char *outend,
- size_t *irreversible)
- {
- mbstate_t *state = step_data->__statep;
- int flags = step_data->__flags;
- size_t cnt = state->__count & 7;
- while (*inptrp < inend && cnt < 4)
- state->__value.__wchb[cnt++] = *(*inptrp)++;
- if (__glibc_unlikely (cnt < 4))
- {
- /* Still not enough bytes. Store the ones in the input buffer. */
- state->__count &= ~7;
- state->__count |= cnt;
- return __GCONV_INCOMPLETE_INPUT;
- }
- if (__builtin_expect (((unsigned char *) state->__value.__wchb)[3] > 0x80,
- 0))
- {
- /* The value is too large. We don't try transliteration here since
- this is not an error because of the lack of possibilities to
- represent the result. This is a genuine bug in the input since
- UCS4 does not allow such values. */
- if (!(flags & __GCONV_IGNORE_ERRORS))
- return __gconv_mark_illegal_input (step_data);
- }
- else
- {
- #if __BYTE_ORDER == __BIG_ENDIAN
- (*outptrp)[0] = state->__value.__wchb[3];
- (*outptrp)[1] = state->__value.__wchb[2];
- (*outptrp)[2] = state->__value.__wchb[1];
- (*outptrp)[3] = state->__value.__wchb[0];
- #else
- (*outptrp)[0] = state->__value.__wchb[0];
- (*outptrp)[1] = state->__value.__wchb[1];
- (*outptrp)[2] = state->__value.__wchb[2];
- (*outptrp)[3] = state->__value.__wchb[3];
- #endif
- *outptrp += 4;
- }
- /* Clear the state buffer. */
- state->__count &= ~7;
- return __GCONV_OK;
- }
- #include <iconv/skeleton.c>
- /* Convert from ISO 646-IRV to the internal (UCS4-like) format. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 1
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP ascii_internal_loop
- #define TO_LOOP ascii_internal_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_ascii_internal
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- if (__glibc_unlikely (*inptr > '\x7f')) \
- { \
- /* The value is too large. We don't try transliteration here since \
- this is not an error because of the lack of possibilities to \
- represent the result. This is a genuine bug in the input since \
- ASCII does not allow such values. */ \
- STANDARD_FROM_LOOP_ERR_HANDLER (1); \
- } \
- else \
- { \
- /* It's an one byte sequence. */ \
- *((uint32_t *) outptr) = *inptr++; \
- outptr += sizeof (uint32_t); \
- } \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from the internal (UCS4-like) format to ISO 646-IRV. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 1
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_ascii_loop
- #define TO_LOOP internal_ascii_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_internal_ascii
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- if (__glibc_unlikely (*((const uint32_t *) inptr) > 0x7f)) \
- { \
- UNICODE_TAG_HANDLER (*((const uint32_t *) inptr), 4); \
- STANDARD_TO_LOOP_ERR_HANDLER (4); \
- } \
- else \
- { \
- /* It's an one byte sequence. */ \
- *outptr++ = *((const uint32_t *) inptr); \
- inptr += sizeof (uint32_t); \
- } \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from the internal (UCS4-like) format to UTF-8. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 1
- #define MAX_NEEDED_TO 6
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_utf8_loop
- #define TO_LOOP internal_utf8_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_internal_utf8
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define MAX_NEEDED_OUTPUT MAX_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- uint32_t wc = *((const uint32_t *) inptr); \
- \
- if (__glibc_likely (wc < 0x80)) \
- /* It's an one byte sequence. */ \
- *outptr++ = (unsigned char) wc; \
- else if (__glibc_likely (wc <= 0x7fffffff \
- && (wc < 0xd800 || wc > 0xdfff))) \
- { \
- size_t step; \
- unsigned char *start; \
- \
- for (step = 2; step < 6; ++step) \
- if ((wc & (~(uint32_t)0 << (5 * step + 1))) == 0) \
- break; \
- \
- if (__glibc_unlikely (outptr + step > outend)) \
- { \
- /* Too long. */ \
- result = __GCONV_FULL_OUTPUT; \
- break; \
- } \
- \
- start = outptr; \
- *outptr = (unsigned char) (~0xff >> step); \
- outptr += step; \
- do \
- { \
- start[--step] = 0x80 | (wc & 0x3f); \
- wc >>= 6; \
- } \
- while (step > 1); \
- start[0] |= wc; \
- } \
- else \
- { \
- STANDARD_TO_LOOP_ERR_HANDLER (4); \
- } \
- \
- inptr += 4; \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from UTF-8 to the internal (UCS4-like) format. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 1
- #define MAX_NEEDED_FROM 6
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP utf8_internal_loop
- #define TO_LOOP utf8_internal_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_utf8_internal
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MAX_NEEDED_INPUT MAX_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- /* Next input byte. */ \
- uint32_t ch = *inptr; \
- \
- if (__glibc_likely (ch < 0x80)) \
- { \
- /* One byte sequence. */ \
- ++inptr; \
- } \
- else \
- { \
- unsigned int cnt; \
- unsigned int i; \
- \
- if (ch >= 0xc2 && ch < 0xe0) \
- { \
- /* We expect two bytes. The first byte cannot be 0xc0 or 0xc1, \
- otherwise the wide character could have been represented \
- using a single byte. */ \
- cnt = 2; \
- ch &= 0x1f; \
- } \
- else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
- { \
- /* We expect three bytes. */ \
- cnt = 3; \
- ch &= 0x0f; \
- } \
- else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
- { \
- /* We expect four bytes. */ \
- cnt = 4; \
- ch &= 0x07; \
- } \
- else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
- { \
- /* We expect five bytes. */ \
- cnt = 5; \
- ch &= 0x03; \
- } \
- else if (__glibc_likely ((ch & 0xfe) == 0xfc)) \
- { \
- /* We expect six bytes. */ \
- cnt = 6; \
- ch &= 0x01; \
- } \
- else \
- { \
- /* Search the end of this ill-formed UTF-8 character. This \
- is the next byte with (x & 0xc0) != 0x80. */ \
- i = 0; \
- do \
- ++i; \
- while (inptr + i < inend \
- && (*(inptr + i) & 0xc0) == 0x80 \
- && i < 5); \
- \
- errout: \
- STANDARD_FROM_LOOP_ERR_HANDLER (i); \
- } \
- \
- if (__glibc_unlikely (inptr + cnt > inend)) \
- { \
- /* We don't have enough input. But before we report that check \
- that all the bytes are correct. */ \
- for (i = 1; inptr + i < inend; ++i) \
- if ((inptr[i] & 0xc0) != 0x80) \
- break; \
- \
- if (__glibc_likely (inptr + i == inend)) \
- { \
- result = __GCONV_INCOMPLETE_INPUT; \
- break; \
- } \
- \
- goto errout; \
- } \
- \
- /* Read the possible remaining bytes. */ \
- for (i = 1; i < cnt; ++i) \
- { \
- uint32_t byte = inptr[i]; \
- \
- if ((byte & 0xc0) != 0x80) \
- /* This is an illegal encoding. */ \
- break; \
- \
- ch <<= 6; \
- ch |= byte & 0x3f; \
- } \
- \
- /* If i < cnt, some trail byte was not >= 0x80, < 0xc0. \
- If cnt > 2 and ch < 2^(5*cnt-4), the wide character ch could \
- have been represented with fewer than cnt bytes. */ \
- if (i < cnt || (cnt > 2 && (ch >> (5 * cnt - 4)) == 0) \
- /* Do not accept UTF-16 surrogates. */ \
- || (ch >= 0xd800 && ch <= 0xdfff)) \
- { \
- /* This is an illegal encoding. */ \
- goto errout; \
- } \
- \
- inptr += cnt; \
- } \
- \
- /* Now adjust the pointers and store the result. */ \
- *((uint32_t *) outptr) = ch; \
- outptr += sizeof (uint32_t); \
- }
- #define LOOP_NEED_FLAGS
- #define STORE_REST \
- { \
- /* We store the remaining bytes while converting them into the UCS4 \
- format. We can assume that the first byte in the buffer is \
- correct and that it requires a larger number of bytes than there \
- are in the input buffer. */ \
- wint_t ch = **inptrp; \
- size_t cnt, r; \
- \
- state->__count = inend - *inptrp; \
- \
- assert (ch != 0xc0 && ch != 0xc1); \
- if (ch >= 0xc2 && ch < 0xe0) \
- { \
- /* We expect two bytes. The first byte cannot be 0xc0 or \
- 0xc1, otherwise the wide character could have been \
- represented using a single byte. */ \
- cnt = 2; \
- ch &= 0x1f; \
- } \
- else if (__glibc_likely ((ch & 0xf0) == 0xe0)) \
- { \
- /* We expect three bytes. */ \
- cnt = 3; \
- ch &= 0x0f; \
- } \
- else if (__glibc_likely ((ch & 0xf8) == 0xf0)) \
- { \
- /* We expect four bytes. */ \
- cnt = 4; \
- ch &= 0x07; \
- } \
- else if (__glibc_likely ((ch & 0xfc) == 0xf8)) \
- { \
- /* We expect five bytes. */ \
- cnt = 5; \
- ch &= 0x03; \
- } \
- else \
- { \
- /* We expect six bytes. */ \
- cnt = 6; \
- ch &= 0x01; \
- } \
- \
- /* The first byte is already consumed. */ \
- r = cnt - 1; \
- while (++(*inptrp) < inend) \
- { \
- ch <<= 6; \
- ch |= **inptrp & 0x3f; \
- --r; \
- } \
- \
- /* Shift for the so far missing bytes. */ \
- ch <<= r * 6; \
- \
- /* Store the number of bytes expected for the entire sequence. */ \
- state->__count |= cnt << 8; \
- \
- /* Store the value. */ \
- state->__value.__wch = ch; \
- }
- #define UNPACK_BYTES \
- { \
- static const unsigned char inmask[5] = { 0xc0, 0xe0, 0xf0, 0xf8, 0xfc }; \
- wint_t wch = state->__value.__wch; \
- size_t ntotal = state->__count >> 8; \
- \
- inlen = state->__count & 255; \
- \
- bytebuf[0] = inmask[ntotal - 2]; \
- \
- do \
- { \
- if (--ntotal < inlen) \
- bytebuf[ntotal] = 0x80 | (wch & 0x3f); \
- wch >>= 6; \
- } \
- while (ntotal > 1); \
- \
- bytebuf[0] |= wch; \
- }
- #define CLEAR_STATE \
- state->__count = 0
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from UCS2 to the internal (UCS4-like) format. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 2
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP ucs2_internal_loop
- #define TO_LOOP ucs2_internal_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_ucs2_internal
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- uint16_t u1 = get16 (inptr); \
- \
- if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
- { \
- /* Surrogate characters in UCS-2 input are not valid. Reject \
- them. (Catching this here is not security relevant.) */ \
- STANDARD_FROM_LOOP_ERR_HANDLER (2); \
- } \
- \
- *((uint32_t *) outptr) = u1; \
- outptr += sizeof (uint32_t); \
- inptr += 2; \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from the internal (UCS4-like) format to UCS2. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 2
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_ucs2_loop
- #define TO_LOOP internal_ucs2_loop /* This is not used. */
- #define FUNCTION_NAME __gconv_transform_internal_ucs2
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- uint32_t val = *((const uint32_t *) inptr); \
- \
- if (__glibc_unlikely (val >= 0x10000)) \
- { \
- UNICODE_TAG_HANDLER (val, 4); \
- STANDARD_TO_LOOP_ERR_HANDLER (4); \
- } \
- else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
- { \
- /* Surrogate characters in UCS-4 input are not valid. \
- We must catch this, because the UCS-2 output might be \
- interpreted as UTF-16 by other programs. If we let \
- surrogates pass through, attackers could make a security \
- hole exploit by synthesizing any desired plane 1-16 \
- character. */ \
- result = __gconv_mark_illegal_input (step_data); \
- if (! ignore_errors_p ()) \
- break; \
- inptr += 4; \
- ++*irreversible; \
- continue; \
- } \
- else \
- { \
- put16 (outptr, val); \
- outptr += sizeof (uint16_t); \
- inptr += 4; \
- } \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from UCS2 in other endianness to the internal (UCS4-like) format. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 2
- #define MIN_NEEDED_TO 4
- #define FROM_DIRECTION 1
- #define FROM_LOOP ucs2reverse_internal_loop
- #define TO_LOOP ucs2reverse_internal_loop/* This is not used.*/
- #define FUNCTION_NAME __gconv_transform_ucs2reverse_internal
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- uint16_t u1 = bswap_16 (get16 (inptr)); \
- \
- if (__glibc_unlikely (u1 >= 0xd800 && u1 < 0xe000)) \
- { \
- /* Surrogate characters in UCS-2 input are not valid. Reject \
- them. (Catching this here is not security relevant.) */ \
- if (! ignore_errors_p ()) \
- { \
- result = __gconv_mark_illegal_input (step_data); \
- break; \
- } \
- inptr += 2; \
- ++*irreversible; \
- continue; \
- } \
- \
- *((uint32_t *) outptr) = u1; \
- outptr += sizeof (uint32_t); \
- inptr += 2; \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
- /* Convert from the internal (UCS4-like) format to UCS2 in other endianness. */
- #define DEFINE_INIT 0
- #define DEFINE_FINI 0
- #define MIN_NEEDED_FROM 4
- #define MIN_NEEDED_TO 2
- #define FROM_DIRECTION 1
- #define FROM_LOOP internal_ucs2reverse_loop
- #define TO_LOOP internal_ucs2reverse_loop/* This is not used.*/
- #define FUNCTION_NAME __gconv_transform_internal_ucs2reverse
- #define ONE_DIRECTION 1
- #define MIN_NEEDED_INPUT MIN_NEEDED_FROM
- #define MIN_NEEDED_OUTPUT MIN_NEEDED_TO
- #define LOOPFCT FROM_LOOP
- #define BODY \
- { \
- uint32_t val = *((const uint32_t *) inptr); \
- if (__glibc_unlikely (val >= 0x10000)) \
- { \
- UNICODE_TAG_HANDLER (val, 4); \
- STANDARD_TO_LOOP_ERR_HANDLER (4); \
- } \
- else if (__glibc_unlikely (val >= 0xd800 && val < 0xe000)) \
- { \
- /* Surrogate characters in UCS-4 input are not valid. \
- We must catch this, because the UCS-2 output might be \
- interpreted as UTF-16 by other programs. If we let \
- surrogates pass through, attackers could make a security \
- hole exploit by synthesizing any desired plane 1-16 \
- character. */ \
- if (! ignore_errors_p ()) \
- { \
- result = __gconv_mark_illegal_input (step_data); \
- break; \
- } \
- inptr += 4; \
- ++*irreversible; \
- continue; \
- } \
- else \
- { \
- put16 (outptr, bswap_16 (val)); \
- outptr += sizeof (uint16_t); \
- inptr += 4; \
- } \
- }
- #define LOOP_NEED_FLAGS
- #include <iconv/loop.c>
- #include <iconv/skeleton.c>
|