| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
703704705706707708709710711712713714715716717718719720721722723724725726727728729730731732733734735736737738739740741742743744745746 |
- /* Copyright (C) 1995-2026 Free Software Foundation, Inc.
- This file is part of the GNU C Library.
- The GNU C Library is free software; you can redistribute it and/or
- modify it under the terms of the GNU Lesser General Public
- License as published by the Free Software Foundation; either
- version 2.1 of the License, or (at your option) any later version.
- The GNU C Library is distributed in the hope that it will be useful,
- but WITHOUT ANY WARRANTY; without even the implied warranty of
- MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
- Lesser General Public License for more details.
- You should have received a copy of the GNU Lesser General Public
- License along with the GNU C Library; if not, see
- <https://www.gnu.org/licenses/>. */
- #include <assert.h>
- #include <langinfo.h>
- #include <locale.h>
- #include <stddef.h>
- #include <stdint.h>
- #include <stdlib.h>
- #include <string.h>
- #include <sys/param.h>
/* Default configuration: operate on `char' strings.  Every macro below
   is only defined when STRING_TYPE has not been set beforehand, so a
   file that includes this one can substitute its own character type,
   helper functions, weight header and table-name suffix
   (presumably the wide character variant -- see WIDE_CHAR_VERSION).  */
#ifndef STRING_TYPE
# define STRING_TYPE char
# define USTRING_TYPE unsigned char
# define STRXFRM __strxfrm_l
# define STRLEN strlen
# define STPNCPY __stpncpy
# define WEIGHT_H "../locale/weight.h"
# define SUFFIX MB
# define L(arg) arg
#endif

/* Token pasting in two steps so that macro arguments such as SUFFIX
   are expanded before they are glued together.  */
#define CONCAT(a,b) CONCAT1(a,b)
#define CONCAT1(a,b) a##b

/* Longest string (counted in collation elements) that is transformed
   with cached indices.  Right now this is an arbitrary value open to
   optimizations.  SMALL_STR_SIZE * 4 has to be lower than
   __MAX_ALLOCA_CUTOFF.  Keep localedata/xfrm-test.c in sync.  */
#define SMALL_STR_SIZE 4095
- #include "../locale/localeinfo.h"
- #include WEIGHT_H
- /* Group locale data for shorter parameter lists. */
- typedef struct
- {
- uint32_t nrules;
- unsigned char *rulesets;
- USTRING_TYPE *weights;
- int32_t *table;
- USTRING_TYPE *extra;
- int32_t *indirect;
- } locale_data_t;
- #ifndef WIDE_CHAR_VERSION
/* Store VAL in BUF using the UTF-8 encoding scheme (extended to
   sequences of up to six bytes) and return the number of bytes
   written.  No terminator is appended.  Values below 0x80, negative
   ones included, are stored as one byte.  */
static int
utf8_encode (char *buf, int val)
{
  /* Single byte case: the value is stored verbatim.  */
  if (val < 0x80)
    {
      buf[0] = (char) val;
      return 1;
    }

  /* Pick the shortest sequence whose 5 * NBYTES + 1 payload bits can
     hold VAL; six bytes is the upper limit.  */
  int nbytes = 2;
  while (nbytes < 6 && (val & (~(uint32_t)0 << (5 * nbytes + 1))) != 0)
    ++nbytes;

  /* Lead byte marker: the NBYTES most significant bits are set.  */
  buf[0] = (unsigned char) (~0xff >> nbytes);

  /* Fill the continuation bytes from the last one down to the second,
     six payload bits each.  */
  for (int slot = nbytes - 1; slot > 0; --slot)
    {
      buf[slot] = 0x80 | (val & 0x3f);
      val >>= 6;
    }

  /* Whatever is left belongs into the lead byte.  */
  buf[0] |= val;

  return nbytes;
}
- #endif
- /* Find next weight and rule index. Inlined since called for every char. */
- static __always_inline size_t
- find_idx (const USTRING_TYPE **us, int32_t *weight_idx,
- unsigned char *rule_idx, const locale_data_t *l_data, const int pass)
- {
- int32_t tmp = findidx (l_data->table, l_data->indirect, l_data->extra, us,
- -1);
- *rule_idx = tmp >> 24;
- int32_t idx = tmp & 0xffffff;
- size_t len = l_data->weights[idx++];
- /* Skip over indices of previous levels. */
- for (int i = 0; i < pass; i++)
- {
- idx += len;
- len = l_data->weights[idx++];
- }
- *weight_idx = idx;
- return len;
- }
- static int
- find_position (const USTRING_TYPE *us, const locale_data_t *l_data,
- const int pass)
- {
- int32_t weight_idx;
- unsigned char rule_idx;
- const USTRING_TYPE *usrc = us;
- find_idx (&usrc, &weight_idx, &rule_idx, l_data, pass);
- return l_data->rulesets[rule_idx * l_data->nrules + pass] & sort_position;
- }
- /* Do the transformation. */
- static size_t
- do_xfrm (const USTRING_TYPE *usrc, STRING_TYPE *dest, size_t n,
- const locale_data_t *l_data)
- {
- int32_t weight_idx;
- unsigned char rule_idx;
- uint32_t pass;
- size_t needed = 0;
- size_t last_needed;
- /* Now the passes over the weights. */
- for (pass = 0; pass < l_data->nrules; ++pass)
- {
- size_t backw_len = 0;
- last_needed = needed;
- const USTRING_TYPE *cur = usrc;
- const USTRING_TYPE *backw_start = NULL;
- /* We assume that if a rule has defined `position' in one section
- this is true for all of them. */
- int position = find_position (cur, l_data, pass);
- if (position == 0)
- {
- while (*cur != L('\0'))
- {
- const USTRING_TYPE *pos = cur;
- size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
- pass);
- int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
- if ((rule & sort_forward) != 0)
- {
- /* Handle the pushed backward sequence. */
- if (backw_start != NULL)
- {
- for (size_t i = backw_len; i > 0; )
- {
- int32_t weight_idx;
- unsigned char rule_idx;
- size_t len = find_idx (&backw_start, &weight_idx,
- &rule_idx, l_data, pass);
- if (needed + i < n)
- for (size_t j = len; j > 0; j--)
- dest[needed + i - j] =
- l_data->weights[weight_idx++];
- i -= len;
- }
- needed += backw_len;
- backw_start = NULL;
- backw_len = 0;
- }
- /* Now handle the forward element. */
- if (needed + len < n)
- while (len-- > 0)
- dest[needed++] = l_data->weights[weight_idx++];
- else
- /* No more characters fit into the buffer. */
- needed += len;
- }
- else
- {
- /* Remember start of the backward sequence & track length. */
- if (backw_start == NULL)
- backw_start = pos;
- backw_len += len;
- }
- }
- /* Handle the pushed backward sequence. */
- if (backw_start != NULL)
- {
- for (size_t i = backw_len; i > 0; )
- {
- size_t len = find_idx (&backw_start, &weight_idx, &rule_idx,
- l_data, pass);
- if (needed + i < n)
- for (size_t j = len; j > 0; j--)
- dest[needed + i - j] =
- l_data->weights[weight_idx++];
- i -= len;
- }
- needed += backw_len;
- }
- }
- else
- {
- int val = 1;
- #ifndef WIDE_CHAR_VERSION
- char buf[7];
- size_t buflen;
- #endif
- size_t i;
- while (*cur != L('\0'))
- {
- const USTRING_TYPE *pos = cur;
- size_t len = find_idx (&cur, &weight_idx, &rule_idx, l_data,
- pass);
- int rule = l_data->rulesets[rule_idx * l_data->nrules + pass];
- if ((rule & sort_forward) != 0)
- {
- /* Handle the pushed backward sequence. */
- if (backw_start != NULL)
- {
- for (size_t p = backw_len; p > 0; p--)
- {
- size_t len;
- int32_t weight_idx;
- unsigned char rule_idx;
- const USTRING_TYPE *backw_cur = backw_start;
- /* To prevent a warning init the used vars. */
- len = find_idx (&backw_cur, &weight_idx,
- &rule_idx, l_data, pass);
- for (i = 1; i < p; i++)
- len = find_idx (&backw_cur, &weight_idx,
- &rule_idx, l_data, pass);
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1 + len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- l_data->weights[weight_idx + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- l_data->weights[weight_idx + i];
- }
- needed += buflen + len;
- #endif
- val = 1;
- }
- else
- ++val;
- }
- backw_start = NULL;
- backw_len = 0;
- }
- /* Now handle the forward element. */
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1 + len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- l_data->weights[weight_idx + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- l_data->weights[weight_idx + i];
- }
- needed += buflen + len;
- #endif
- val = 1;
- }
- else
- ++val;
- }
- else
- {
- /* Remember start of the backward sequence & track length. */
- if (backw_start == NULL)
- backw_start = pos;
- backw_len++;
- }
- }
- /* Handle the pushed backward sequence. */
- if (backw_start != NULL)
- {
- for (size_t p = backw_len; p > 0; p--)
- {
- size_t len;
- int32_t weight_idx;
- unsigned char rule_idx;
- const USTRING_TYPE *backw_cur = backw_start;
- /* To prevent a warning init the used vars. */
- len = find_idx (&backw_cur, &weight_idx,
- &rule_idx, l_data, pass);
- for (i = 1; i < p; i++)
- len = find_idx (&backw_cur, &weight_idx,
- &rule_idx, l_data, pass);
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1 + len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- l_data->weights[weight_idx + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- l_data->weights[weight_idx + i];
- }
- needed += buflen + len;
- #endif
- val = 1;
- }
- else
- ++val;
- }
- }
- }
- /* Finally store the byte to separate the passes or terminate
- the string. */
- if (needed < n)
- dest[needed] = pass + 1 < l_data->nrules ? L('\1') : L('\0');
- ++needed;
- }
- /* This is a little optimization: many collation specifications have
- a `position' rule at the end and if no non-ignored character
- is found the last \1 byte is immediately followed by a \0 byte
- signalling this. We can avoid the \1 byte(s). */
- if (needed > 2 && needed == last_needed + 1)
- {
- /* Remove the \1 byte. */
- if (--needed <= n)
- dest[needed - 1] = L('\0');
- }
- /* Return the number of bytes/words we need, but don't count the NUL
- byte/word at the end. */
- return needed - 1;
- }
- /* Do the transformation using weight-index and rule cache. */
- static size_t
- do_xfrm_cached (STRING_TYPE *dest, size_t n, const locale_data_t *l_data,
- size_t idxmax, int32_t *idxarr, const unsigned char *rulearr)
- {
- uint32_t nrules = l_data->nrules;
- unsigned char *rulesets = l_data->rulesets;
- USTRING_TYPE *weights = l_data->weights;
- uint32_t pass;
- size_t needed = 0;
- size_t last_needed;
- size_t idxcnt;
- /* Now the passes over the weights. */
- for (pass = 0; pass < nrules; ++pass)
- {
- size_t backw_stop = ~0ul;
- int rule = rulesets[rulearr[0] * nrules + pass];
- /* We assume that if a rule has defined `position' in one section
- this is true for all of them. */
- int position = rule & sort_position;
- last_needed = needed;
- if (position == 0)
- {
- for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
- {
- if ((rule & sort_forward) != 0)
- {
- size_t len;
- if (backw_stop != ~0ul)
- {
- /* Handle the pushed elements now. */
- size_t backw;
- for (backw = idxcnt; backw > backw_stop; )
- {
- --backw;
- len = weights[idxarr[backw]++];
- if (needed + len < n)
- while (len-- > 0)
- dest[needed++] = weights[idxarr[backw]++];
- else
- {
- /* No more characters fit into the buffer. */
- needed += len;
- idxarr[backw] += len;
- }
- }
- backw_stop = ~0ul;
- }
- /* Now handle the forward element. */
- len = weights[idxarr[idxcnt]++];
- if (needed + len < n)
- while (len-- > 0)
- dest[needed++] = weights[idxarr[idxcnt]++];
- else
- {
- /* No more characters fit into the buffer. */
- needed += len;
- idxarr[idxcnt] += len;
- }
- }
- else
- {
- /* Remember where the backwards series started. */
- if (backw_stop == ~0ul)
- backw_stop = idxcnt;
- }
- rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
- }
- if (backw_stop != ~0ul)
- {
- /* Handle the pushed elements now. */
- size_t backw;
- backw = idxcnt;
- while (backw > backw_stop)
- {
- size_t len = weights[idxarr[--backw]++];
- if (needed + len < n)
- while (len-- > 0)
- dest[needed++] = weights[idxarr[backw]++];
- else
- {
- /* No more characters fit into the buffer. */
- needed += len;
- idxarr[backw] += len;
- }
- }
- }
- }
- else
- {
- int val = 1;
- #ifndef WIDE_CHAR_VERSION
- char buf[7];
- size_t buflen;
- #endif
- size_t i;
- for (idxcnt = 0; idxcnt < idxmax; ++idxcnt)
- {
- if ((rule & sort_forward) != 0)
- {
- size_t len;
- if (backw_stop != ~0ul)
- {
- /* Handle the pushed elements now. */
- size_t backw;
- for (backw = idxcnt; backw > backw_stop; )
- {
- --backw;
- len = weights[idxarr[backw]++];
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1 + len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- weights[idxarr[backw] + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- weights[idxarr[backw] + i];
- }
- needed += buflen + len;
- #endif
- idxarr[backw] += len;
- val = 1;
- }
- else
- ++val;
- }
- backw_stop = ~0ul;
- }
- /* Now handle the forward element. */
- len = weights[idxarr[idxcnt]++];
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1+ len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- weights[idxarr[idxcnt] + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- weights[idxarr[idxcnt] + i];
- }
- needed += buflen + len;
- #endif
- idxarr[idxcnt] += len;
- val = 1;
- }
- else
- /* Note that we don't have to increment `idxarr[idxcnt]'
- since the length is zero. */
- ++val;
- }
- else
- {
- /* Remember where the backwards series started. */
- if (backw_stop == ~0ul)
- backw_stop = idxcnt;
- }
- rule = rulesets[rulearr[idxcnt + 1] * nrules + pass];
- }
- if (backw_stop != ~0ul)
- {
- /* Handle the pushed elements now. */
- size_t backw;
- backw = idxmax - 1;
- while (backw > backw_stop)
- {
- size_t len = weights[idxarr[--backw]++];
- if (len != 0)
- {
- #ifdef WIDE_CHAR_VERSION
- if (needed + 1 + len < n)
- {
- dest[needed] = val;
- for (i = 0; i < len; ++i)
- dest[needed + 1 + i] =
- weights[idxarr[backw] + i];
- }
- needed += 1 + len;
- #else
- buflen = utf8_encode (buf, val);
- if (needed + buflen + len < n)
- {
- for (i = 0; i < buflen; ++i)
- dest[needed + i] = buf[i];
- for (i = 0; i < len; ++i)
- dest[needed + buflen + i] =
- weights[idxarr[backw] + i];
- }
- needed += buflen + len;
- #endif
- idxarr[backw] += len;
- val = 1;
- }
- else
- ++val;
- }
- }
- }
- /* Finally store the byte to separate the passes or terminate
- the string. */
- if (needed < n)
- dest[needed] = pass + 1 < nrules ? L('\1') : L('\0');
- ++needed;
- }
- /* This is a little optimization: many collation specifications have
- a `position' rule at the end and if no non-ignored character
- is found the last \1 byte is immediately followed by a \0 byte
- signalling this. We can avoid the \1 byte(s). */
- if (needed > 2 && needed == last_needed + 1)
- {
- /* Remove the \1 byte. */
- if (--needed <= n)
- dest[needed - 1] = L('\0');
- }
- /* Return the number of bytes/words we need, but don't count the NUL
- byte/word at the end. */
- return needed - 1;
- }
- size_t
- STRXFRM (STRING_TYPE *dest, const STRING_TYPE *src, size_t n, locale_t l)
- {
- locale_data_t l_data;
- struct __locale_data *current = l->__locales[LC_COLLATE];
- l_data.nrules = current->values[_NL_ITEM_INDEX (_NL_COLLATE_NRULES)].word;
- /* Handle byte comparison case. */
- if (l_data.nrules == 0)
- {
- size_t srclen = STRLEN (src);
- if (n != 0)
- STPNCPY (dest, src, MIN (srclen + 1, n));
- return srclen;
- }
- /* Handle an empty string, code hereafter relies on strlen (src) > 0. */
- if (*src == L('\0'))
- {
- if (n != 0)
- *dest = L('\0');
- return 0;
- }
- /* Get the locale data. */
- l_data.rulesets = (unsigned char *)
- current->values[_NL_ITEM_INDEX (_NL_COLLATE_RULESETS)].string;
- l_data.table = (int32_t *)
- current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_TABLE,SUFFIX))].string;
- l_data.weights = (USTRING_TYPE *)
- current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_WEIGHT,SUFFIX))].string;
- l_data.extra = (USTRING_TYPE *)
- current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_EXTRA,SUFFIX))].string;
- l_data.indirect = (int32_t *)
- current->values[_NL_ITEM_INDEX (CONCAT(_NL_COLLATE_INDIRECT,SUFFIX))].string;
- assert (((uintptr_t) l_data.table) % __alignof__ (l_data.table[0]) == 0);
- assert (((uintptr_t) l_data.weights) % __alignof__ (l_data.weights[0]) == 0);
- assert (((uintptr_t) l_data.extra) % __alignof__ (l_data.extra[0]) == 0);
- assert (((uintptr_t) l_data.indirect) % __alignof__ (l_data.indirect[0]) == 0);
- /* We need the elements of the string as unsigned values since they
- are used as indices. */
- const USTRING_TYPE *usrc = (const USTRING_TYPE *) src;
- /* Allocate cache for small strings on the stack and fill it with weight and
- rule indices. If the cache size is not sufficient, continue with the
- uncached xfrm version. */
- size_t idxmax = 0;
- const USTRING_TYPE *cur = usrc;
- int32_t *idxarr = alloca (SMALL_STR_SIZE * sizeof (int32_t));
- unsigned char *rulearr = alloca (SMALL_STR_SIZE + 1);
- do
- {
- int32_t tmp = findidx (l_data.table, l_data.indirect, l_data.extra, &cur,
- -1);
- rulearr[idxmax] = tmp >> 24;
- idxarr[idxmax] = tmp & 0xffffff;
- ++idxmax;
- }
- while (*cur != L('\0') && idxmax < SMALL_STR_SIZE);
- /* This element is only read, the value never used but to determine
- another value which then is ignored. */
- rulearr[idxmax] = '\0';
- /* Do the transformation. */
- if (*cur == L('\0'))
- return do_xfrm_cached (dest, n, &l_data, idxmax, idxarr, rulearr);
- else
- return do_xfrm (usrc, dest, n, &l_data);
- }
- libc_hidden_def (STRXFRM)
- #ifndef WIDE_CHAR_VERSION
- weak_alias (__strxfrm_l, strxfrm_l)
- #endif
|