| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668669670671672673674675676677678679680681682683684685686687688689690691692693694695696697698699700701702
70370470570670770870971071171271371471571671771871972072172272372472572672772872973073173273373473573673773873974074174274374474574674774874975075175275375475575675775875976076176276376476576676776876977077177277377477577677777877978078178278378478578678778878979079179279379479579679779879980080180280380480580680780880981081181281381481581681781881982082182282382482582682782882983083183283383483583683783883984084184284384484584684784884985085185285385485585685785885986086186286386486586686786886987087187287387487587687787887988088188288388488588688788888989089189289389489589689789889990090190290390490590690790890991091191291391491591691791891992092192292392492592692792892993093193293393493593693793893994094194294394494594694794894995095195295395495595695795895996096196296396496596696796896997097197297397497597697797897998098198298398498598698798898999099199299399499599699799899910001001100210031004100510061007100810091010101110121013101410151016101710181019102010211022102310241025102610271028102910301031103210331034103510361037103810391040104110421043104410451046104710481049105010511052105310541055105610571058105910601061106210631064106510661067106810691070107110721073107410751076107710781079108010811082108310841085108610871088108910901091109210931094109510961097109810991100110111021103110411051106110711081109111011111112111311141115111611171118111911201121112211231124112511261127112811291130113111321133113411351136113711381139114011411142114311441145114611471148114911501151115211531154115511561157115811591160116111621163116411651166116711681169117011711172117311741175117611771178117911801181118211831184118511861187118811891190119111921193119411951196119711981199120012011202120312041205120612071208120912101211121212131214121512161217121812191220122112221223122412251226122712281229123012311232123312341235123612371238123912401241124212431244124512461247124812491250125112521253125412551256125712581259126012611262126312641265126612671268126912701271127212731274127512761
27712781279128012811282128312841285128612871288128912901291129212931294129512961297129812991300130113021303130413051306130713081309131013111312131313141315131613171318131913201321132213231324132513261327132813291330133113321333133413351336133713381339134013411342134313441345134613471348134913501351135213531354135513561357135813591360136113621363136413651366136713681369137013711372137313741375137613771378137913801381138213831384138513861387138813891390139113921393139413951396139713981399140014011402140314041405140614071408140914101411141214131414141514161417141814191420142114221423142414251426142714281429143014311432143314341435143614371438143914401441144214431444144514461447144814491450145114521453145414551456145714581459146014611462146314641465146614671468146914701471147214731474147514761477147814791480148114821483148414851486148714881489149014911492149314941495149614971498149915001501150215031504150515061507150815091510151115121513151415151516151715181519152015211522152315241525152615271528152915301531153215331534153515361537153815391540154115421543154415451546154715481549155015511552155315541555155615571558155915601561156215631564156515661567156815691570157115721573157415751576157715781579158015811582158315841585158615871588158915901591159215931594159515961597159815991600160116021603160416051606160716081609161016111612161316141615161616171618161916201621162216231624162516261627162816291630163116321633163416351636163716381639164016411642164316441645164616471648164916501651165216531654165516561657165816591660166116621663166416651666166716681669167016711672167316741675167616771678167916801681168216831684168516861687168816891690169116921693169416951696169716981699170017011702170317041705170617071708170917101711171217131714171517161717171817191720172117221723172417251726172717281729173017311732173317341735173617371738173917401741174217431744174517461747174817491750175117521753175417551756175717581759176017611762176317641765176617671768176917701771177217731774177517761
77717781779178017811782178317841785178617871788178917901791179217931794179517961797179817991800180118021803180418051806180718081809181018111812181318141815181618171818181918201821182218231824182518261827182818291830183118321833183418351836183718381839184018411842184318441845184618471848184918501851185218531854185518561857185818591860186118621863186418651866186718681869187018711872187318741875187618771878187918801881188218831884188518861887188818891890189118921893189418951896189718981899190019011902190319041905190619071908190919101911191219131914191519161917191819191920192119221923192419251926192719281929193019311932193319341935193619371938193919401941194219431944194519461947194819491950195119521953195419551956195719581959196019611962196319641965196619671968196919701971197219731974197519761977197819791980198119821983198419851986198719881989199019911992199319941995199619971998199920002001200220032004200520062007200820092010201120122013201420152016201720182019202020212022202320242025202620272028202920302031203220332034203520362037203820392040204120422043204420452046204720482049205020512052205320542055205620572058205920602061206220632064206520662067206820692070207120722073207420752076207720782079208020812082208320842085208620872088208920902091209220932094209520962097209820992100210121022103210421052106210721082109211021112112211321142115211621172118211921202121212221232124212521262127212821292130213121322133213421352136213721382139214021412142214321442145214621472148214921502151215221532154215521562157215821592160216121622163216421652166216721682169217021712172217321742175217621772178217921802181218221832184218521862187218821892190219121922193219421952196219721982199220022012202220322042205220622072208220922102211221222132214221522162217221822192220222122222223222422252226222722282229223022312232223322342235223622372238223922402241224222432244224522462247224822492250225122522253225422552256225722582259226022612262226322642265226622672268226922702271227222732274227522762
27722782279228022812282228322842285228622872288228922902291229222932294229522962297229822992300230123022303230423052306230723082309231023112312231323142315231623172318231923202321232223232324232523262327232823292330233123322333233423352336233723382339234023412342234323442345234623472348234923502351235223532354235523562357235823592360236123622363236423652366236723682369237023712372237323742375237623772378237923802381238223832384238523862387238823892390239123922393239423952396239723982399240024012402240324042405240624072408240924102411241224132414241524162417241824192420242124222423242424252426242724282429243024312432243324342435243624372438243924402441244224432444244524462447244824492450245124522453245424552456245724582459246024612462246324642465246624672468246924702471247224732474247524762477247824792480248124822483248424852486248724882489249024912492249324942495249624972498249925002501250225032504250525062507250825092510251125122513251425152516251725182519252025212522252325242525252625272528252925302531253225332534253525362537253825392540254125422543254425452546254725482549255025512552255325542555255625572558255925602561256225632564256525662567256825692570257125722573257425752576257725782579258025812582258325842585258625872588258925902591259225932594259525962597259825992600260126022603260426052606260726082609261026112612261326142615261626172618261926202621262226232624262526262627262826292630263126322633263426352636263726382639264026412642264326442645264626472648264926502651265226532654265526562657265826592660266126622663266426652666266726682669267026712672267326742675267626772678267926802681268226832684268526862687268826892690269126922693269426952696269726982699270027012702270327042705270627072708270927102711271227132714271527162717271827192720272127222723272427252726272727282729273027312732273327342735273627372738273927402741274227432744274527462747274827492750275127522753275427552756275727582759276027612762276327642765276627672768276927702771277227732774277527762
7772778277927802781278227832784278527862787278827892790279127922793279427952796279727982799280028012802280328042805280628072808280928102811281228132814281528162817281828192820282128222823282428252826282728282829283028312832283328342835283628372838283928402841284228432844284528462847284828492850285128522853285428552856285728582859286028612862286328642865286628672868286928702871287228732874287528762877287828792880288128822883288428852886288728882889289028912892289328942895289628972898289929002901290229032904290529062907290829092910291129122913291429152916291729182919292029212922292329242925292629272928292929302931293229332934293529362937293829392940294129422943294429452946294729482949295029512952295329542955295629572958295929602961 |
- @node Character Set Handling, Locales, String and Array Utilities, Top
- @c %MENU% Support for extended character sets
- @chapter Character Set Handling
- @ifnottex
- @macro cal{text}
- \text\
- @end macro
- @end ifnottex
- Character sets used in the early days of computing had only six, seven,
- or eight bits for each character: there was never a case where more than
- eight bits (one byte) were used to represent a single character. The
- limitations of this approach became more apparent as more people
- grappled with non-Roman character sets, where not all the characters
- that make up a language's character set can be represented by @math{2^8}
- choices. This chapter shows the functionality that was added to the C
- library to support multiple character sets.
- @menu
- * Extended Char Intro:: Introduction to Extended Characters.
- * Charset Function Overview:: Overview about Character Handling
- Functions.
- * Restartable multibyte conversion:: Restartable multibyte conversion
- Functions.
- * Non-reentrant Conversion:: Non-reentrant Conversion Function.
- * Generic Charset Conversion:: Generic Charset Conversion.
- @end menu
- @node Extended Char Intro
- @section Introduction to Extended Characters
- A variety of solutions are available to overcome the differences between
- character sets with a 1:1 relation between bytes and characters and
- character sets with ratios of 2:1 or 4:1. The remainder of this
- section gives a few examples to help understand the design decisions
- made while developing the functionality of the @w{C library}.
- @cindex internal representation
- A distinction we have to make right away is between internal and
- external representation. @dfn{Internal representation} means the
- representation used by a program while keeping the text in memory.
- External representations are used when text is stored or transmitted
- through some communication channel. Examples of external
- representations include files waiting in a directory to be
- read and parsed.
- Traditionally there has been no difference between the two representations.
- It was equally comfortable and useful to use the same single-byte
- representation internally and externally. This comfort level decreases
- with more and larger character sets.
- One of the problems to overcome with the internal representation is
- handling text that is externally encoded using different character
- sets. Assume a program that reads two texts and compares them using
- some metric. The comparison can be usefully done only if the texts are
- internally kept in a common format.
- @cindex wide character
- For such a common format (@math{=} character set) eight bits are certainly
- no longer enough. So the smallest entity will have to grow: @dfn{wide
- characters} will now be used. Instead of one byte per character, two or
- four will be used. (Three bytes are awkward to address in memory and
- more than four bytes do not seem to be necessary.)
- @cindex Unicode
- @cindex ISO 10646
- As shown in some other part of this manual,
- @c !!! Ahem, wide char string functions are not yet covered -- drepper
- a completely new family has been created of functions that can handle wide
- character texts in memory. The most commonly used character sets for such
- internal wide character representations are Unicode and @w{ISO 10646}
- (also known as UCS for Universal Character Set). Unicode was originally
- planned as a 16-bit character set, whereas @w{ISO 10646} was designed to
- be a 31-bit large code space. The two standards are practically identical.
- They have the same character repertoire and code table, but Unicode specifies
- added semantics. At the moment, only characters in the first @code{0x10000}
- code positions (the so-called Basic Multilingual Plane, BMP) have been
- assigned, but the assignment of more specialized characters outside this
- 16-bit space is already in progress. A number of encodings have been
- defined for Unicode and @w{ISO 10646} characters:
- @cindex UCS-2
- @cindex UCS-4
- @cindex UTF-8
- @cindex UTF-16
- UCS-2 is a 16-bit word that can only represent characters
- from the BMP, UCS-4 is a 32-bit word that can represent any Unicode
- and @w{ISO 10646} character, UTF-8 is an ASCII compatible encoding where
- ASCII characters are represented by ASCII bytes and non-ASCII characters
- by sequences of 2-6 non-ASCII bytes, and finally UTF-16 is an extension
- of UCS-2 in which pairs of certain UCS-2 words can be used to encode
- non-BMP characters up to @code{0x10ffff}.
- To represent wide characters the @code{char} type is not suitable. For
- this reason the @w{ISO C} standard introduces a new type that is
- designed to keep one character of a wide character string. To maintain
- the similarity there is also a type corresponding to @code{int} for
- those functions that take a single wide character.
- @deftp {Data type} wchar_t
- @standards{ISO, stddef.h}
- This data type is used as the base type for wide character strings.
- In other words, arrays of objects of this type are the equivalent of
- @code{char[]} for multibyte character strings. The type is defined in
- @file{stddef.h}.
- The @w{ISO C90} standard, where @code{wchar_t} was introduced, does not
- say anything specific about the representation. It only requires that
- this type is capable of storing all elements of the basic character set.
- Therefore it would be legitimate to define @code{wchar_t} as @code{char},
- which might make sense for embedded systems.
- But in @theglibc{} @code{wchar_t} is always 32 bits wide and, therefore,
- capable of representing all UCS-4 values and, therefore, covering all of
- @w{ISO 10646}. Some Unix systems define @code{wchar_t} as a 16-bit type
- and thereby follow Unicode very strictly. This definition is perfectly
- fine with the standard, but it also means that to represent all
- characters from Unicode and @w{ISO 10646} one has to use UTF-16 surrogate
- characters, which is in fact a multi-wide-character encoding. But
- resorting to multi-wide-character encoding contradicts the purpose of the
- @code{wchar_t} type.
- @end deftp
- @deftp {Data type} wint_t
- @standards{ISO, wchar.h}
- @code{wint_t} is a data type used for parameters and variables that
- contain a single wide character. As the name suggests this type is the
- equivalent of @code{int} when using the normal @code{char} strings. The
- types @code{wchar_t} and @code{wint_t} often have the same
- representation if their size is 32 bits wide but if @code{wchar_t} is
- defined as @code{char} the type @code{wint_t} must be defined as
- @code{int} due to the parameter promotion.
- @pindex wchar.h
- This type is defined in @file{wchar.h} and was introduced in
- @w{Amendment 1} to @w{ISO C90}.
- @end deftp
- As for the @code{char} data type, macros are available for
- specifying the minimum and maximum value representable in an object of
- type @code{wchar_t}.
- @deftypevr Macro wint_t WCHAR_MIN
- @standards{ISO, wchar.h}
- The macro @code{WCHAR_MIN} evaluates to the minimum value representable
- by an object of type @code{wchar_t}.
- This macro was introduced in @w{Amendment 1} to @w{ISO C90}.
- @end deftypevr
- @deftypevr Macro wint_t WCHAR_MAX
- @standards{ISO, wchar.h}
- The macro @code{WCHAR_MAX} evaluates to the maximum value representable
- by an object of type @code{wchar_t}.
- This macro was introduced in @w{Amendment 1} to @w{ISO C90}.
- @end deftypevr
- Another special wide character value is the equivalent to @code{EOF}.
- @deftypevr Macro wint_t WEOF
- @standards{ISO, wchar.h}
- The macro @code{WEOF} evaluates to a constant expression of type
- @code{wint_t} whose value is different from any member of the extended
- character set.
- @code{WEOF} need not be the same value as @code{EOF} and unlike
- @code{EOF} it also need @emph{not} be negative. In other words, sloppy
- code like
- @smallexample
- @{
- int c;
- @dots{}
- while ((c = getc (fp)) >= 0)
- @dots{}
- @}
- @end smallexample
- @noindent
- has to be rewritten to use @code{WEOF} explicitly when wide characters
- are used:
- @smallexample
- @{
- wint_t c;
- @dots{}
- while ((c = getwc (fp)) != WEOF)
- @dots{}
- @}
- @end smallexample
- @pindex wchar.h
- This macro was introduced in @w{Amendment 1} to @w{ISO C90} and is
- defined in @file{wchar.h}.
- @end deftypevr
- These internal representations present problems when it comes to storage
- and transmittal. Because each single wide character consists of more
- than one byte, they are affected by byte-ordering. Thus, machines with
- different endiannesses would see different values when accessing the same
- data. This byte ordering concern also applies for communication protocols
- that are all byte-based and therefore require that the sender has to
- decide about splitting the wide character in bytes. A last (but not least
- important) point is that wide characters often require more storage space
- than a customized byte-oriented character set.
- @cindex multibyte character
- @cindex EBCDIC
- For all the above reasons, an external encoding that is different from
- the internal encoding is often used if the latter is UCS-2 or UCS-4.
- The external encoding is byte-based and can be chosen appropriately for
- the environment and for the texts to be handled. A variety of different
- character sets can be used for this external encoding (information that
- will not be exhaustively presented here--instead, a description of the
- major groups will suffice). All of the ASCII-based character sets
- fulfill one requirement: they are ``filesystem safe.'' This means that
- the character @code{'/'} is used in the encoding @emph{only} to
- represent itself. Things are a bit different for character sets like
- EBCDIC (Extended Binary Coded Decimal Interchange Code, a character set
- family used by IBM), but if the operating system does not understand
- EBCDIC directly, the parameters to system calls have to be converted
- first anyhow.
- @itemize @bullet
- @item
- The simplest character sets are single-byte character sets. There can
- be only up to 256 characters (for @w{8 bit} character sets), which is
- not sufficient to cover all languages but might be sufficient to handle
- a specific text. Handling of @w{8 bit} character sets is simple. This
- is not true for other kinds presented later, and therefore, the
- application one uses might require the use of @w{8 bit} character sets.
- @cindex ISO 2022
- @item
- The @w{ISO 2022} standard defines a mechanism for extended character
- sets where one character @emph{can} be represented by more than one
- byte. This is achieved by associating a state with the text.
- Characters that can be used to change the state can be embedded in the
- text. Each byte in the text might have a different interpretation in each
- state. The state might even influence whether a given byte stands for a
- character on its own or whether it has to be combined with some more
- bytes.
- @cindex EUC
- @cindex Shift_JIS
- @cindex SJIS
- In most uses of @w{ISO 2022} the defined character sets do not allow
- state changes that cover more than the next character. This has the
- big advantage that whenever one can identify the beginning of the byte
- sequence of a character one can interpret a text correctly. Examples of
- character sets using this policy are the various EUC character sets
- (used by Sun's operating systems, EUC-JP, EUC-KR, EUC-TW, and EUC-CN)
- or Shift_JIS (SJIS, a Japanese encoding).
- But there are also character sets using a state that is valid for more
- than one character and has to be changed by another byte sequence.
- Examples for this are ISO-2022-JP, ISO-2022-KR, and ISO-2022-CN.
- @item
- @cindex ISO 6937
- Early attempts to fix 8 bit character sets for other languages using the
- Roman alphabet led to character sets like @w{ISO 6937}. Here bytes
- representing characters like the acute accent do not produce output
- themselves: one has to combine them with other characters to get the
- desired result. For example, one writes the byte sequence @code{0xc2 0x61}
- (non-spacing acute accent, followed by lower-case `a') to get the ``small
- a with acute'' character. To get the acute accent character on its own,
- one has to write @code{0xc2 0x20} (the non-spacing acute followed by a
- space).
- Character sets like @w{ISO 6937} are used in some embedded systems such
- as teletex.
- @item
- @cindex UTF-8
- Instead of converting the Unicode or @w{ISO 10646} text used internally,
- it is often also sufficient to simply use an encoding different from
- UCS-2/UCS-4. The Unicode and @w{ISO 10646} standards even specify such an
- encoding: UTF-8. This encoding is able to represent all 31 bits of
- @w{ISO 10646} in a byte string of length one to six.
- @cindex UTF-7
- There were a few other attempts to encode @w{ISO 10646} such as UTF-7,
- but UTF-8 is today the only encoding that should be used. In fact, with
- any luck UTF-8 will soon be the only external encoding that has to be
- supported. It proves to be universally usable and its only disadvantage
- is that it favors Roman languages by making the byte string
- representation of other scripts (Cyrillic, Greek, Asian scripts) longer
- than necessary if using a specific character set for these scripts.
- Methods like the Unicode compression scheme can alleviate these
- problems.
- @end itemize
- The question remaining is: how to select the character set or encoding
- to use. The answer: you cannot decide about it yourself; it is decided
- by the developers of the system or the majority of the users. Since the
- goal is interoperability one has to use whatever the other people one
- works with use. If there are no constraints, the selection is based on
- the requirements the expected circle of users will have. In other words,
- if a project is expected to be used in only, say, Russia it is fine to use
- KOI8-R or a similar character set. But if at the same time people from,
- say, Greece are participating one should use a character set that allows
- all people to collaborate.
- The most widely useful solution seems to be: go with the most general
- character set, namely @w{ISO 10646}. Use UTF-8 as the external encoding
- and problems about users not being able to use their own language
- adequately are a thing of the past.
- One final comment about the choice of the wide character representation
- is necessary at this point. We have said above that the natural choice
- is using Unicode or @w{ISO 10646}. This is not required, but at least
- encouraged, by the @w{ISO C} standard. The standard defines at least a
- macro @code{__STDC_ISO_10646__} that is only defined on systems where
- the @code{wchar_t} type encodes @w{ISO 10646} characters. If this
- symbol is not defined one should avoid making assumptions about the wide
- character representation. If the programmer uses only the functions
- provided by the C library to handle wide character strings there should
- be no compatibility problems with other systems.
- @node Charset Function Overview
- @section Overview about Character Handling Functions
- A Unix @w{C library} contains three different sets of functions in two
- families to handle character set conversion. One of the function families
- (the most commonly used) is specified in the @w{ISO C90} standard and,
- therefore, is portable even beyond the Unix world. Unfortunately this
- family is the least useful one. These functions should be avoided
- whenever possible, especially when developing libraries (as opposed to
- applications).
- The second family of functions got introduced in the early Unix standards
- (XPG2) and is still part of the latest and greatest Unix standard:
- @w{Unix 98}. It is also the most powerful and useful set of functions.
- But we will start with the functions defined in @w{Amendment 1} to
- @w{ISO C90}.
- @node Restartable multibyte conversion
- @section Restartable Multibyte Conversion Functions
- The @w{ISO C} standard defines functions to convert strings from a
- multibyte representation to wide character strings. There are a number
- of peculiarities:
- @itemize @bullet
- @item
- The character set assumed for the multibyte encoding is not specified
- as an argument to the functions. Instead the character set specified by
- the @code{LC_CTYPE} category of the current locale is used; see
- @ref{Locale Categories}.
- @item
- The functions handling more than one character at a time require
- NUL-terminated strings as the argument (i.e., converting blocks of text
- does not work unless one can add a NUL byte at an appropriate place).
- @Theglibc{} contains some extensions to the standard that allow
- specifying a size, but basically they also expect terminated strings.
- @end itemize
- Despite these limitations the @w{ISO C} functions can be used in many
- contexts. In graphical user interfaces, for instance, it is not
- uncommon to have functions that require text to be displayed in a wide
- character string if the text is not simple ASCII. The text itself might
- come from a file with translations and the user should decide about the
- current locale, which determines the translation and therefore also the
- external encoding used. In such a situation (and many others) the
- functions described here are perfect. If more freedom while performing
- the conversion is necessary take a look at the @code{iconv} functions
- (@pxref{Generic Charset Conversion}).
- @menu
- * Selecting the Conversion:: Selecting the conversion and its properties.
- * Keeping the state:: Representing the state of the conversion.
- * Converting a Character:: Converting Single Characters.
- * Converting Strings:: Converting Multibyte and Wide Character
- Strings.
- * Multibyte Conversion Example:: A Complete Multibyte Conversion Example.
- @end menu
- @node Selecting the Conversion
- @subsection Selecting the conversion and its properties
- We already said above that the currently selected locale for the
- @code{LC_CTYPE} category decides the conversion that is performed
- by the functions we are about to describe. Each locale uses its own
- character set (given as an argument to @code{localedef}) and this is the
- one assumed as the external multibyte encoding. The wide character
- set is always UCS-4 in @theglibc{}.
- A characteristic of each multibyte character set is the maximum number
- of bytes that can be necessary to represent one character. This
- information is quite important when writing code that uses the
- conversion functions (as shown in the examples below).
- The @w{ISO C} standard defines two macros that provide this information.
- @deftypevr Macro int MB_LEN_MAX
- @standards{ISO, limits.h}
- @code{MB_LEN_MAX} specifies the maximum number of bytes in the multibyte
- sequence for a single character in any of the supported locales. It is
- a compile-time constant and is defined in @file{limits.h}.
- @pindex limits.h
- @end deftypevr
- @deftypevr Macro int MB_CUR_MAX
- @standards{ISO, stdlib.h}
- @code{MB_CUR_MAX} expands into a positive integer expression that is the
- maximum number of bytes in a multibyte character in the current locale.
- The value is never greater than @code{MB_LEN_MAX}. Unlike
- @code{MB_LEN_MAX} this macro need not be a compile-time constant, and in
- @theglibc{} it is not.
- @pindex stdlib.h
- @code{MB_CUR_MAX} is defined in @file{stdlib.h}.
- @end deftypevr
- Two different macros are necessary since strict @w{ISO C90} compilers
- do not allow variable-length array definitions, but still it is desirable
- to avoid dynamic allocation. This incomplete piece of code shows the
- problem:
- @smallexample
- @{
- char buf[MB_LEN_MAX];
- ssize_t len = 0;
- while (! feof (fp))
- @{
- fread (&buf[len], 1, MB_CUR_MAX - len, fp);
- /* @r{@dots{} process} buf */
- len -= used;
- @}
- @}
- @end smallexample
- The code in the inner loop is expected to always have enough bytes in
- the array @var{buf} to convert one multibyte character. The array
- @var{buf} has to be sized statically since many compilers do not allow a
- variable size. The @code{fread} call makes sure that @code{MB_CUR_MAX}
- bytes are always available in @var{buf}. Note that it isn't
- a problem if @code{MB_CUR_MAX} is not a compile-time constant.
- @node Keeping the state
- @subsection Representing the state of the conversion
- @cindex stateful
- In the introduction of this chapter it was said that certain character
- sets use a @dfn{stateful} encoding. That is, the encoded values depend
- in some way on the previous bytes in the text.
- Since the conversion functions allow converting a text in more than one
- step we must have a way to pass this information from one call of the
- functions to another.
- @deftp {Data type} mbstate_t
- @standards{ISO, wchar.h}
- @cindex shift state
- A variable of type @code{mbstate_t} can contain all the information
- about the @dfn{shift state} needed from one call to a conversion
- function to another.
- @pindex wchar.h
- @code{mbstate_t} is defined in @file{wchar.h}. It was introduced in
- @w{Amendment 1} to @w{ISO C90}.
- @end deftp
- To use objects of type @code{mbstate_t} the programmer has to define such
- objects (normally as local variables on the stack) and pass a pointer to
- the object to the conversion functions. This way the conversion function
- can update the object if the current multibyte character set is stateful.
- There is no specific function or initializer to put the state object in
- any specific state. The rules are that the object should always
- represent the initial state before the first use, and this is achieved by
- clearing the whole variable with code such as follows:
- @smallexample
- @{
- mbstate_t state;
- memset (&state, '\0', sizeof (state));
- /* @r{from now on @var{state} can be used.} */
- @dots{}
- @}
- @end smallexample
- When using the conversion functions to generate output it is often
- necessary to test whether the current state corresponds to the initial
- state. This is necessary, for example, to decide whether to emit
- escape sequences to set the state to the initial state at certain
- sequence points. Communication protocols often require this.
- @deftypefun int mbsinit (const mbstate_t *@var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtsafe{}@assafe{}@acsafe{}}
- @c ps is dereferenced once, unguarded. This would call for @mtsrace:ps,
- @c but since a single word-sized field is (atomically) accessed, any
- @c race here would be harmless. Other functions that take an optional
- @c mbstate_t* argument named ps are marked with @mtasurace:<func>/!ps,
- @c to indicate that the function uses a static buffer if ps is NULL.
- @c These could also have been marked with @mtsrace:ps, but we'll omit
- @c that for brevity, for it's somewhat redundant with the @mtasurace.
- The @code{mbsinit} function determines whether the state object pointed
- to by @var{ps} is in the initial state. If @var{ps} is a null pointer or
- the object is in the initial state the return value is nonzero. Otherwise
- it is zero.
- @pindex wchar.h
- @code{mbsinit} was introduced in @w{Amendment 1} to @w{ISO C90} and is
- declared in @file{wchar.h}.
- @end deftypefun
- Code using @code{mbsinit} often looks similar to this:
- @c Fix the example to explicitly say how to generate the escape sequence
- @c to restore the initial state.
- @smallexample
- @{
- mbstate_t state;
- memset (&state, '\0', sizeof (state));
- /* @r{Use @var{state}.} */
- @dots{}
- if (! mbsinit (&state))
- @{
- /* @r{Emit code to return to initial state.} */
- const wchar_t empty[] = L"";
- const wchar_t *srcp = empty;
- wcsrtombs (outbuf, &srcp, outbuflen, &state);
- @}
- @dots{}
- @}
- @end smallexample
- The code to emit the escape sequence to get back to the initial state is
- interesting. The @code{wcsrtombs} function can be used to determine the
- necessary output code (@pxref{Converting Strings}). Please note that with
- @theglibc{} it is not necessary to perform this extra action for the
- conversion from multibyte text to wide character text since the wide
- character encoding is not stateful. But there is nothing mentioned in
- any standard that prohibits making @code{wchar_t} use a stateful
- encoding.
- @node Converting a Character
- @subsection Converting Single Characters
- The most fundamental of the conversion functions are those dealing with
- single characters. Please note that this does not always mean single
- bytes. But since there is very often a subset of the multibyte
- character set that consists of single byte sequences, there are
- functions to help with converting bytes. Frequently, ASCII is a subset
- of the multibyte character set. In such a scenario, each ASCII character
- stands for itself, and all other characters have at least a first byte
- that is beyond the range @math{0} to @math{127}.
- @deftypefun wint_t btowc (int @var{c})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtsafe{}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- @c Calls btowc_fct or __fct; reads from locale, and from the
- @c get_gconv_fcts result multiple times. get_gconv_fcts calls
- @c __wcsmbs_load_conv to initialize the ctype if it's null.
- @c wcsmbs_load_conv takes a non-recursive wrlock before allocating
- @c memory for the fcts structure, initializing it, and then storing it
- @c in the locale object. The initialization involves dlopening and a
- @c lot more.
- The @code{btowc} function (``byte to wide character'') converts a valid
- single byte character @var{c} in the initial shift state into the wide
- character equivalent using the conversion rules from the currently
- selected locale of the @code{LC_CTYPE} category.
- If @code{(unsigned char) @var{c}} is no valid single byte multibyte
- character or if @var{c} is @code{EOF}, the function returns @code{WEOF}.
- Please note the restriction of @var{c} being tested for validity only in
- the initial shift state. No @code{mbstate_t} object is used from
- which the state information is taken, and the function also does not use
- any static state.
- @pindex wchar.h
- The @code{btowc} function was introduced in @w{Amendment 1} to @w{ISO C90}
- and is declared in @file{wchar.h}.
- @end deftypefun
- Despite the limitation that the single byte value is always interpreted
- in the initial state, this function is actually useful most of the time.
- Most character sets are either entirely single-byte character sets or they
- are extensions to ASCII. But then it is possible to write code like this
- (not that this specific example is very useful):
- @smallexample
- wchar_t *
- itow (unsigned long int val)
- @{
- static wchar_t buf[30];
- wchar_t *wcp = &buf[29];
- *wcp = L'\0';
- while (val != 0)
- @{
- *--wcp = btowc ('0' + val % 10);
- val /= 10;
- @}
- if (wcp == &buf[29])
- *--wcp = L'0';
- return wcp;
- @}
- @end smallexample
- Why is it necessary to use such a complicated implementation and not
- simply cast @code{'0' + val % 10} to a wide character? The answer is
- that there is no guarantee that one can perform this kind of arithmetic
- on the characters of the character set used for @code{wchar_t}
- representation. In other situations the bytes are not constant at
- compile time and so the compiler cannot do the work. In situations like
- this, using @code{btowc} is required.
- @noindent
- There is also a function for the conversion in the other direction.
- @deftypefun int wctob (wint_t @var{c})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtsafe{}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{wctob} function (``wide character to byte'') takes as the
- parameter a valid wide character. If the multibyte representation for
- this character in the initial state is exactly one byte long, the return
- value of this function is this character. Otherwise the return value is
- @code{EOF}.
- @pindex wchar.h
- @code{wctob} was introduced in @w{Amendment 1} to @w{ISO C90} and
- is declared in @file{wchar.h}.
- @end deftypefun
- There are more general functions to convert single characters from
- multibyte representation to wide characters and vice versa. These
- functions pose no limit on the length of the multibyte representation
- and they also do not require it to be in the initial state.
- @deftypefun size_t mbrtowc (wchar_t *restrict @var{pwc}, const char *restrict @var{s}, size_t @var{n}, mbstate_t *restrict @var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:mbrtowc/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- @cindex stateful
- The @code{mbrtowc} function (``multibyte restartable to wide
- character'') converts the next multibyte character in the string pointed
- to by @var{s} into a wide character and stores it in the location
- pointed to by @var{pwc}. The conversion is performed according
- to the locale currently selected for the @code{LC_CTYPE} category. If
- the conversion for the character set used in the locale requires a state,
- the multibyte string is interpreted in the state represented by the
- object pointed to by @var{ps}. If @var{ps} is a null pointer, a static,
- internal state variable used only by the @code{mbrtowc} function is
- used.
- If the next multibyte character corresponds to the null wide character,
- the return value of the function is @math{0} and the state object is
- afterwards in the initial state. If the next @var{n} or fewer bytes
- form a correct multibyte character, the return value is the number of
- bytes starting from @var{s} that form the multibyte character. The
- conversion state is updated according to the bytes consumed in the
- conversion. In both cases the wide character (either the @code{L'\0'}
- or the one found in the conversion) is stored in the string pointed to
- by @var{pwc} if @var{pwc} is not null.
- If the first @var{n} bytes of the multibyte string possibly form a valid
- multibyte character but there are more than @var{n} bytes needed to
- complete it, the return value of the function is @code{(size_t) -2} and
- no value is stored in @code{*@var{pwc}}. The conversion state is
- updated and all @var{n} input bytes are consumed and should not be
- submitted again. Please note that this can happen even if @var{n} has a
- value greater than or equal to @code{MB_CUR_MAX} since the input might
- contain redundant shift sequences.
- If the first @var{n} bytes of the multibyte string cannot possibly form
- a valid multibyte character, no value is stored, the global variable
- @code{errno} is set to the value @code{EILSEQ}, and the function returns
- @code{(size_t) -1}. The conversion state is afterwards undefined.
- As specified, the @code{mbrtowc} function could deal with multibyte
- sequences which contain embedded null bytes (which happens in Unicode
- encodings such as UTF-16), but @theglibc{} does not support such
- multibyte encodings. When encountering a null input byte, the function
- will either return zero, or return @code{(size_t) -1} and report an
- @code{EILSEQ} error. The @code{iconv} function can be used for
- converting between arbitrary encodings. @xref{Generic Conversion
- Interface}.
- @pindex wchar.h
- @code{mbrtowc} was introduced in @w{Amendment 1} to @w{ISO C90} and
- is declared in @file{wchar.h}.
- @end deftypefun
- A function that copies a multibyte string into a wide character string
- while at the same time converting all lowercase characters into
- uppercase could look like this:
- @smallexample
- @include mbstouwcs.c.texi
- @end smallexample
- In the inner loop, a single wide character is stored in @code{wc}, and
- the number of consumed bytes is stored in the variable @code{nbytes}.
- If the conversion is successful, the uppercase variant of the wide
- character is stored in the @code{result} array and the pointer to the
- input string and the number of available bytes is adjusted. If the
- @code{mbrtowc} function returns zero, the null input byte has not been
- converted, so it must be stored explicitly in the result.
- The above code uses the fact that there can never be more wide
- characters in the converted result than there are bytes in the multibyte
- input string. This method yields a pessimistic guess about the size of
- the result, and if many wide character strings have to be constructed
- this way or if the strings are long, the extra memory required to be
- allocated because the input string contains multibyte characters might
- be significant. The allocated memory block can be resized to the
- correct size before returning it, but a better solution might be to
- allocate just the right amount of space for the result right away.
- Unfortunately there is no function to compute the length of the wide
- character string directly from the multibyte string. There is, however,
- a function that does part of the work.
- @deftypefun size_t mbrlen (const char *restrict @var{s}, size_t @var{n}, mbstate_t *@var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:mbrlen/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{mbrlen} function (``multibyte restartable length'') computes
- the number of at most @var{n} bytes starting at @var{s}, which form the
- next valid and complete multibyte character.
- If the next multibyte character corresponds to the NUL wide character,
- the return value is @math{0}. If the next @var{n} bytes form a valid
- multibyte character, the number of bytes belonging to this multibyte
- character byte sequence is returned.
- If the first @var{n} bytes possibly form a valid multibyte
- character but the character is incomplete, the return value is
- @code{(size_t) -2}. Otherwise the multibyte character sequence is invalid
- and the return value is @code{(size_t) -1}.
- The multibyte sequence is interpreted in the state represented by the
- object pointed to by @var{ps}. If @var{ps} is a null pointer, a state
- object local to @code{mbrlen} is used.
- @pindex wchar.h
- @code{mbrlen} was introduced in @w{Amendment 1} to @w{ISO C90} and
- is declared in @file{wchar.h}.
- @end deftypefun
- The attentive reader now will note that @code{mbrlen} can be implemented
- as
- @smallexample
- mbrtowc (NULL, s, n, ps != NULL ? ps : &internal)
- @end smallexample
- This is true and in fact is mentioned in the official specification.
- How can this function be used to determine the length of the wide
- character string created from a multibyte character string? It is not
- directly usable, but we can define a function @code{mbslen} using it:
- @smallexample
- size_t
- mbslen (const char *s)
- @{
- mbstate_t state;
- size_t result = 0;
- size_t nbytes;
- memset (&state, '\0', sizeof (state));
- while ((nbytes = mbrlen (s, MB_LEN_MAX, &state)) > 0)
- @{
- if (nbytes >= (size_t) -2)
- /* @r{Something is wrong.} */
- return (size_t) -1;
- s += nbytes;
- ++result;
- @}
- return result;
- @}
- @end smallexample
- This function simply calls @code{mbrlen} for each multibyte character
- in the string and counts the number of function calls. Please note that
- we here use @code{MB_LEN_MAX} as the size argument in the @code{mbrlen}
- call. This is acceptable since a) this value is larger than the length of
- the longest multibyte character sequence and b) we know that the string
- @var{s} ends with a NUL byte, which cannot be part of any other multibyte
- character sequence but the one representing the NUL wide character.
- Therefore, the @code{mbrlen} function will never read invalid memory.
- Now that this function is available (just to make this clear, this
- function is @emph{not} part of @theglibc{}) we can compute the
- number of bytes required to store the wide character string converted
- from the multibyte character string @var{s} using
- @smallexample
- wcs_bytes = (mbslen (s) + 1) * sizeof (wchar_t);
- @end smallexample
- Please note that the @code{mbslen} function is quite inefficient. The
- implementation of @code{mbstouwcs} with @code{mbslen} would have to
- perform the conversion of the multibyte character input string twice, and
- this conversion might be quite expensive. So it is necessary to think
- about the consequences of using the easier but imprecise method before
- doing the work twice.
- @deftypefun size_t wcrtomb (char *restrict @var{s}, wchar_t @var{wc}, mbstate_t *restrict @var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:wcrtomb/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- @c wcrtomb uses a static, non-thread-local unguarded state variable when
- @c PS is NULL. When a state is passed in, and it's not used
- @c concurrently in other threads, this function behaves safely as long
- @c as gconv modules don't bring MT safety issues of their own.
- @c Attempting to load gconv modules or to build conversion chains in
- @c signal handlers may encounter gconv databases or caches in a
- @c partially-updated state, and asynchronous cancellation may leave them
- @c in such states, besides leaking the lock that guards them.
- @c get_gconv_fcts ok
- @c wcsmbs_load_conv ok
- @c norm_add_slashes ok
- @c wcsmbs_getfct ok
- @c gconv_find_transform ok
- @c gconv_read_conf (libc_once)
- @c gconv_lookup_cache ok
- @c find_module_idx ok
- @c find_module ok
- @c gconv_find_shlib (ok)
- @c ->init_fct (assumed ok)
- @c gconv_get_builtin_trans ok
- @c gconv_release_step ok
- @c do_lookup_alias ok
- @c find_derivation ok
- @c derivation_lookup ok
- @c increment_counter ok
- @c gconv_find_shlib ok
- @c step->init_fct (assumed ok)
- @c gen_steps ok
- @c gconv_find_shlib ok
- @c dlopen (presumed ok)
- @c dlsym (presumed ok)
- @c step->init_fct (assumed ok)
- @c step->end_fct (assumed ok)
- @c gconv_get_builtin_trans ok
- @c gconv_release_step ok
- @c add_derivation ok
- @c gconv_close_transform ok
- @c gconv_release_step ok
- @c step->end_fct (assumed ok)
- @c gconv_release_shlib ok
- @c dlclose (presumed ok)
- @c gconv_release_cache ok
- @c ->tomb->__fct (assumed ok)
- The @code{wcrtomb} function (``wide character restartable to
- multibyte'') converts a single wide character into a multibyte string
- corresponding to that wide character.
- If @var{s} is a null pointer, the function resets the state stored in
- the object pointed to by @var{ps} (or the internal @code{mbstate_t}
- object) to the initial state. This can also be achieved by a call like
- this:
- @smallexample
- wcrtomb (temp_buf, L'\0', ps)
- @end smallexample
- @noindent
- since, if @var{s} is a null pointer, @code{wcrtomb} performs as if it
- writes into an internal buffer, which is guaranteed to be large enough.
- If @var{wc} is the NUL wide character, @code{wcrtomb} emits, if
- necessary, a shift sequence to get the state @var{ps} into the initial
- state followed by a single NUL byte, which is stored in the string
- @var{s}.
- Otherwise a byte sequence (possibly including shift sequences) is written
- into the string @var{s}. This only happens if @var{wc} is a valid wide
- character (i.e., it has a multibyte representation in the character set
- selected by the locale of the @code{LC_CTYPE} category). If @var{wc} is not a
- valid wide character, nothing is stored in the string @var{s},
- @code{errno} is set to @code{EILSEQ}, the conversion state in @var{ps}
- is undefined and the return value is @code{(size_t) -1}.
- If no error occurred the function returns the number of bytes stored in
- the string @var{s}. This includes all bytes representing shift
- sequences.
- One word about the interface of the function: there is no parameter
- specifying the length of the array @var{s}, so the caller has to make sure
- that there is enough space available, otherwise buffer overruns can occur.
- This version of @theglibc{} does not assume that @var{s} is at least
- @code{MB_CUR_MAX} bytes long, but programs that need to run on @glibcadj{}
- versions that have this assumption documented in the manual must comply
- with this limit.
- @pindex wchar.h
- @code{wcrtomb} was introduced in @w{Amendment 1} to @w{ISO C90} and is
- declared in @file{wchar.h}.
- @end deftypefun
- Using @code{wcrtomb} is as easy as using @code{mbrtowc}. The following
- example appends a wide character string to a multibyte character string.
- Again, the code is not really useful (or correct), it is simply here to
- demonstrate the use and some problems.
- @smallexample
- char *
- mbscatwcs (char *s, size_t len, const wchar_t *ws)
- @{
- mbstate_t state;
- /* @r{Find the end of the existing string.} */
- char *wp = strchr (s, '\0');
- len -= wp - s;
- memset (&state, '\0', sizeof (state));
- do
- @{
- size_t nbytes;
- if (len < MB_CUR_MAX)
- @{
- /* @r{We cannot guarantee that the next}
- @r{character fits into the buffer, so}
- @r{return an error.} */
- errno = E2BIG;
- return NULL;
- @}
- nbytes = wcrtomb (wp, *ws, &state);
- if (nbytes == (size_t) -1)
- /* @r{Error in the conversion.} */
- return NULL;
- len -= nbytes;
- wp += nbytes;
- @}
- while (*ws++ != L'\0');
- return s;
- @}
- @end smallexample
- First the function has to find the end of the string currently in the
- array @var{s}. The @code{strchr} call does this very efficiently since a
- requirement for multibyte character representations is that the NUL byte
- is never used except to represent itself (and in this context, the end
- of the string).
- After initializing the state object the loop is entered where the first
- task is to make sure there is enough room in the array @var{s}. We
- abort if there are not at least @code{MB_CUR_MAX} bytes available. This
- is not always optimal but we have no other choice. We might have fewer
- than @code{MB_CUR_MAX} bytes available but the next multibyte character
- might also be only one byte long. At the time the @code{wcrtomb} call
- returns it is too late to decide whether the buffer was large enough. If
- this solution is unsuitable, there is a very slow but more accurate
- solution.
- @smallexample
- @dots{}
- if (len < MB_CUR_MAX)
- @{
- mbstate_t temp_state;
- memcpy (&temp_state, &state, sizeof (state));
- if (wcrtomb (NULL, *ws, &temp_state) > len)
- @{
- /* @r{We cannot guarantee that the next}
- @r{character fits into the buffer, so}
- @r{return an error.} */
- errno = E2BIG;
- return NULL;
- @}
- @}
- @dots{}
- @end smallexample
- Here we perform the conversion that might overflow the buffer so that
- we are afterwards in the position to make an exact decision about the
- buffer size. Please note the @code{NULL} argument for the destination
- buffer in the new @code{wcrtomb} call; since we are not interested in the
- converted text at this point, this is a nice way to express this. The
- most unusual thing about this piece of code certainly is the duplication
- of the conversion state object, but if a change of the state is necessary
- to emit the next multibyte character, we want to have the same shift state
- change performed in the real conversion. Therefore, we have to preserve
- the initial shift state information.
- There are certainly many more and even better solutions to this problem.
- This example is only provided for educational purposes.
- @node Converting Strings
- @subsection Converting Multibyte and Wide Character Strings
- The functions described in the previous section only convert a single
- character at a time. Most operations to be performed in real-world
- programs include strings and therefore the @w{ISO C} standard also
- defines conversions on entire strings. However, the defined set of
- functions is quite limited; therefore, @theglibc{} contains a few
- extensions that can help in some important situations.
- @deftypefun size_t mbsrtowcs (wchar_t *restrict @var{dst}, const char **restrict @var{src}, size_t @var{len}, mbstate_t *restrict @var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:mbsrtowcs/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{mbsrtowcs} function (``multibyte string restartable to wide
- character string'') converts the NUL-terminated multibyte character
- string at @code{*@var{src}} into an equivalent wide character string,
- including the NUL wide character at the end. The conversion is started
- using the state information from the object pointed to by @var{ps} or
- from an internal object of @code{mbsrtowcs} if @var{ps} is a null
- pointer. Before returning, the state object is updated to match the state
- after the last converted character. The state is the initial state if the
- terminating NUL byte is reached and converted.
- If @var{dst} is not a null pointer, the result is stored in the array
- pointed to by @var{dst}; otherwise, the conversion result is not
- available since it is stored in an internal buffer.
- If @var{len} wide characters are stored in the array @var{dst} before
- reaching the end of the input string, the conversion stops and @var{len}
- is returned. If @var{dst} is a null pointer, @var{len} is never checked.
- Another reason for a premature return from the function call is if the
- input string contains an invalid multibyte sequence. In this case the
- global variable @code{errno} is set to @code{EILSEQ} and the function
- returns @code{(size_t) -1}.
- @c XXX The ISO C9x draft seems to have a problem here. It says that PS
- @c is not updated if DST is NULL. This is not said straightforward and
- @c none of the other functions is described like this. It would make sense
- @c to define the function this way but I don't think it is meant like this.
- In all other cases the function returns the number of wide characters
- converted during this call. If @var{dst} is not null, @code{mbsrtowcs}
- stores in the pointer pointed to by @var{src} either a null pointer (if
- the NUL byte in the input string was reached) or the address of the byte
- following the last converted multibyte character.
- Like @code{mbstowcs} the @var{dst} parameter may be a null pointer and
- the function can be used to count the number of wide characters that
- would be required.
- @pindex wchar.h
- @code{mbsrtowcs} was introduced in @w{Amendment 1} to @w{ISO C90} and is
- declared in @file{wchar.h}.
- @end deftypefun
- The definition of the @code{mbsrtowcs} function has one important
- limitation. The requirement that @code{*@var{src}} has to be a NUL-terminated
- string causes problems if one wants to convert buffers with text. A
- buffer is not normally a collection of NUL-terminated strings but instead a
- continuous collection of lines, separated by newline characters. Now
- assume that a function to convert one line from a buffer is needed. Since
- the line is not NUL-terminated, the source pointer cannot directly point
- into the unmodified text buffer. This means, either one inserts the NUL
- byte at the appropriate place for the time of the @code{mbsrtowcs}
- function call (which is not doable for a read-only buffer or in a
- multi-threaded application) or one copies the line in an extra buffer
- where it can be terminated by a NUL byte. Note that it is not in general
- possible to limit the number of characters to convert by setting the
- parameter @var{len} to any specific value. Since it is not known how
- many bytes each multibyte character sequence is in length, one can only
- guess.
- @cindex stateful
- There is still a problem with the method of NUL-terminating a line right
- after the newline character, which could lead to very strange results.
- As said in the description of the @code{mbsrtowcs} function above, the
- conversion state is guaranteed to be in the initial shift state after
- processing the NUL byte at the end of the input string. But this NUL
- byte is not really part of the text (i.e., the conversion state after
- the newline in the original text could be something different than the
- initial shift state and therefore the first character of the next line
- is encoded using this state). But the state in question is never
- accessible to the user since the conversion stops after the NUL byte
- (which resets the state). Most stateful character sets in use today
- require that the shift state after a newline be the initial state--but
- this is not a strict guarantee. Therefore, simply NUL-terminating a
- piece of a running text is not always an adequate solution and,
- therefore, should never be used in generally used code.
- The generic conversion interface (@pxref{Generic Charset Conversion})
- does not have this limitation (it simply works on buffers, not
- strings), and @theglibc{} contains a set of functions that take
- additional parameters specifying the maximal number of bytes that are
- consumed from the input string. This way the problem of
- @code{mbsrtowcs}'s example above could be solved by determining the line
- length and passing this length to the function.
- @deftypefun size_t wcsrtombs (char *restrict @var{dst}, const wchar_t **restrict @var{src}, size_t @var{len}, mbstate_t *restrict @var{ps})
- @standards{ISO, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:wcsrtombs/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{wcsrtombs} function (``wide character string restartable to
- multibyte string'') converts the NUL-terminated wide character string at
- @code{*@var{src}} into an equivalent multibyte character string and
- stores the result in the array pointed to by @var{dst}. The NUL wide
- character is also converted. The conversion starts in the state
- described in the object pointed to by @var{ps} or by a state object
- local to @code{wcsrtombs} in case @var{ps} is a null pointer. If
- @var{dst} is a null pointer, the conversion is performed as usual but the
- result is not available. If all characters of the input string were
- successfully converted and if @var{dst} is not a null pointer, the
- pointer pointed to by @var{src} gets assigned a null pointer.
- If one of the wide characters in the input string has no valid multibyte
- character equivalent, the conversion stops early, sets the global
- variable @code{errno} to @code{EILSEQ}, and returns @code{(size_t) -1}.
- Another reason for a premature stop is if @var{dst} is not a null
- pointer and the next converted character would require more than
- @var{len} bytes in total in the array @var{dst}. In this case (and if
- @var{dst} is not a null pointer) the pointer pointed to by @var{src} is
- assigned a value pointing to the wide character right after the last one
- successfully converted.
- Except in the case of an encoding error the return value of the
- @code{wcsrtombs} function is the number of bytes in all the multibyte
- character sequences which were or would have been (if @var{dst} was
- not a null pointer) stored in @var{dst}. Before returning, the state in the
- object pointed to by @var{ps} (or the internal object in case @var{ps}
- is a null pointer) is updated to reflect the state after the last
- conversion. The state is the initial shift state in case the
- terminating NUL wide character was converted.
- @pindex wchar.h
- The @code{wcsrtombs} function was introduced in @w{Amendment 1} to
- @w{ISO C90} and is declared in @file{wchar.h}.
- @end deftypefun
- The restriction mentioned above for the @code{mbsrtowcs} function applies
- here also. There is no possibility of directly controlling the number of
- input characters. One has to place the NUL wide character at the correct
- place or control the consumed input indirectly via the available output
- array size (the @var{len} parameter).
- @deftypefun size_t mbsnrtowcs (wchar_t *restrict @var{dst}, const char **restrict @var{src}, size_t @var{nmc}, size_t @var{len}, mbstate_t *restrict @var{ps})
- @standards{GNU, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:mbsnrtowcs/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{mbsnrtowcs} function is very similar to the @code{mbsrtowcs}
- function. All the parameters are the same except for @var{nmc}, which is
- new. The return value is the same as for @code{mbsrtowcs}.
- This new parameter specifies how many bytes at most can be used from the
- multibyte character string. In other words, the multibyte character
- string @code{*@var{src}} need not be NUL-terminated. But if a NUL byte
- is found within the @var{nmc} first bytes of the string, the conversion
- stops there.
- Like @code{mbstowcs} the @var{dst} parameter may be a null pointer and
- the function can be used to count the number of wide characters that
- would be required.
- This function is a GNU extension. It is meant to work around the
- problems mentioned above. Now it is possible to convert a buffer with
- multibyte character text piece by piece without having to care about
- inserting NUL bytes and the effect of NUL bytes on the conversion state.
- @end deftypefun
- A function to convert a multibyte string into a wide character string
- and display it could be written like this (this is not a really useful
- example):
- @smallexample
- void
- showmbs (const char *src, FILE *fp)
- @{
- mbstate_t state;
- int cnt = 0;
- memset (&state, '\0', sizeof (state));
- while (1)
- @{
- wchar_t linebuf[100];
- const char *endp = strchr (src, '\n');
- size_t n;
- /* @r{Exit if there is no more line.} */
- if (endp == NULL)
- break;
- n = mbsnrtowcs (linebuf, &src, endp - src, 99, &state);
- linebuf[n] = L'\0';
- fprintf (fp, "line %d: \"%S\"\n", ++cnt, linebuf);
- @}
- @}
- @end smallexample
- There is no problem with the state after a call to @code{mbsnrtowcs}.
- Since we don't insert characters in the strings that were not in there
- right from the beginning and we use @var{state} only for the conversion
- of the given buffer, there is no problem with altering the state.
- @deftypefun size_t wcsnrtombs (char *restrict @var{dst}, const wchar_t **restrict @var{src}, size_t @var{nwc}, size_t @var{len}, mbstate_t *restrict @var{ps})
- @standards{GNU, wchar.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{:wcsnrtombs/!ps}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{wcsnrtombs} function implements the conversion from wide
- character strings to multibyte character strings. It is similar to
- @code{wcsrtombs} but, just like @code{mbsnrtowcs}, it takes an extra
- parameter, which specifies the length of the input string.
- No more than @var{nwc} wide characters from the input string
- @code{*@var{src}} are converted. If the input string contains a NUL
- wide character in the first @var{nwc} characters, the conversion stops at
- this place.
- The @code{wcsnrtombs} function is a GNU extension and just like
- @code{mbsnrtowcs} helps in situations where no NUL-terminated input
- strings are available.
- @end deftypefun
- @node Multibyte Conversion Example
- @subsection A Complete Multibyte Conversion Example
- The example programs given in the last sections are only brief and do
- not contain all the error checking, etc. Presented here is a complete
- and documented example. It features the @code{mbrtowc} function but it
- should be easy to derive versions using the other functions.
- @smallexample
- int
- file_mbsrtowcs (int input, int output)
- @{
- /* @r{Note the use of @code{MB_LEN_MAX}.}
- @r{@code{MB_CUR_MAX} cannot portably be used here.} */
- char buffer[BUFSIZ + MB_LEN_MAX];
- mbstate_t state;
- int filled = 0;
- int eof = 0;
- /* @r{Initialize the state.} */
- memset (&state, '\0', sizeof (state));
- while (!eof)
- @{
- ssize_t nread;
- ssize_t nwrite;
- char *inp = buffer;
- wchar_t outbuf[BUFSIZ];
- wchar_t *outp = outbuf;
- /* @r{Fill up the buffer from the input file.} */
- nread = read (input, buffer + filled, BUFSIZ);
- if (nread < 0)
- @{
- perror ("read");
- return 0;
- @}
- /* @r{If we reach end of file, make a note to read no more.} */
- if (nread == 0)
- eof = 1;
- /* @r{@code{filled} is now the number of bytes in @code{buffer}.} */
- filled += nread;
- /* @r{Convert those bytes to wide characters--as many as we can.} */
- while (1)
- @{
- size_t thislen = mbrtowc (outp, inp, filled, &state);
- /* @r{Stop converting at invalid character;}
- @r{this can mean we have read just the first part}
- @r{of a valid character.} */
- if (thislen == (size_t) -1)
- break;
- /* @r{We want to handle embedded NUL bytes}
- @r{but the return value is 0. Correct this.} */
- if (thislen == 0)
- thislen = 1;
- /* @r{Advance past this character.} */
- inp += thislen;
- filled -= thislen;
- ++outp;
- @}
- /* @r{Write the wide characters we just made.} */
- nwrite = write (output, outbuf,
- (outp - outbuf) * sizeof (wchar_t));
- if (nwrite < 0)
- @{
- perror ("write");
- return 0;
- @}
- /* @r{See if we have a @emph{real} invalid character.} */
- if ((eof && filled > 0) || filled >= MB_CUR_MAX)
- @{
- error (0, 0, "invalid multibyte character");
- return 0;
- @}
- /* @r{If any characters must be carried forward,}
- @r{put them at the beginning of @code{buffer}.} */
- if (filled > 0)
- memmove (buffer, inp, filled);
- @}
- return 1;
- @}
- @end smallexample
- @node Non-reentrant Conversion
- @section Non-reentrant Conversion Function
- The functions described in the previous section are defined in
- @w{Amendment 1} to @w{ISO C90}, but the original @w{ISO C90} standard
- also contained functions for character set conversion. The reason that
- these original functions are not described first is that they are almost
- entirely useless.
- The problem is that all the conversion functions described in the
- original @w{ISO C90} use a local state. Using a local state implies that
- multiple conversions at the same time (not only when using threads)
- cannot be done, and that you cannot first convert single characters and
- then strings since you cannot tell the conversion functions which state
- to use.
- These original functions are therefore usable only in a very limited set
- of situations. One must complete converting the entire string before
- starting a new one, and each string/text must be converted with the same
- function (there is no problem with the library itself; it is guaranteed
- that no library function changes the state of any of these functions).
- @strong{For the above reasons it is strongly recommended that the functions
- described in the previous section be used in place of the non-reentrant
- conversion functions.}
- @menu
- * Non-reentrant Character Conversion:: Non-reentrant Conversion of Single
- Characters.
- * Non-reentrant String Conversion:: Non-reentrant Conversion of Strings.
- * Shift State:: States in Non-reentrant Functions.
- @end menu
- @node Non-reentrant Character Conversion
- @subsection Non-reentrant Conversion of Single Characters
- @deftypefun int mbtowc (wchar_t *restrict @var{result}, const char *restrict @var{string}, size_t @var{size})
- @standards{ISO, stdlib.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{mbtowc} (``multibyte to wide character'') function when called
- with non-null @var{string} converts the first multibyte character
- beginning at @var{string} to its corresponding wide character code. It
- stores the result in @code{*@var{result}}.
- @code{mbtowc} never examines more than @var{size} bytes. (The idea is
- to supply for @var{size} the number of bytes of data you have in hand.)
- @code{mbtowc} with non-null @var{string} distinguishes three
- possibilities: the first @var{size} bytes at @var{string} start with
- valid multibyte characters, they start with an invalid byte sequence or
- just part of a character, or @var{string} points to an empty string (a
- null character).
- For a valid multibyte character, @code{mbtowc} converts it to a wide
- character and stores that in @code{*@var{result}}, and returns the
- number of bytes in that character (always at least @math{1} and never
- more than @var{size}).
- For an invalid byte sequence, @code{mbtowc} returns @math{-1}. For an
- empty string, it returns @math{0}, also storing @code{'\0'} in
- @code{*@var{result}}.
- If the multibyte character code uses shift characters, then
- @code{mbtowc} maintains and updates a shift state as it scans. If you
- call @code{mbtowc} with a null pointer for @var{string}, that
- initializes the shift state to its standard initial value. It also
- returns nonzero if the multibyte character code in use actually has a
- shift state. @xref{Shift State}.
- @end deftypefun
- @deftypefun int wctomb (char *@var{string}, wchar_t @var{wchar})
- @standards{ISO, stdlib.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{wctomb} (``wide character to multibyte'') function converts
- the wide character code @var{wchar} to its corresponding multibyte
- character sequence, and stores the result in bytes starting at
- @var{string}. At most @code{MB_CUR_MAX} characters are stored.
- @code{wctomb} with non-null @var{string} distinguishes three
- possibilities for @var{wchar}: a valid wide character code (one that can
- be translated to a multibyte character), an invalid code, and
- @code{L'\0'}.
- Given a valid code, @code{wctomb} converts it to a multibyte character,
- storing the bytes starting at @var{string}. Then it returns the number
- of bytes in that character (always at least @math{1} and never more
- than @code{MB_CUR_MAX}).
- If @var{wchar} is an invalid wide character code, @code{wctomb} returns
- @math{-1}. If @var{wchar} is @code{L'\0'}, it returns @code{0}, also
- storing @code{'\0'} in @code{*@var{string}}.
- If the multibyte character code uses shift characters, then
- @code{wctomb} maintains and updates a shift state as it scans. If you
- call @code{wctomb} with a null pointer for @var{string}, that
- initializes the shift state to its standard initial value. It also
- returns nonzero if the multibyte character code in use actually has a
- shift state. @xref{Shift State}.
- Calling this function with a @var{wchar} argument of zero when
- @var{string} is not null has the side-effect of reinitializing the
- stored shift state @emph{as well as} storing the multibyte character
- @code{'\0'} and returning @math{0}.
- @end deftypefun
- Similar to @code{mbrlen} there is also a non-reentrant function that
- computes the length of a multibyte character. It can be defined in
- terms of @code{mbtowc}.
- @deftypefun int mblen (const char *@var{string}, size_t @var{size})
- @standards{ISO, stdlib.h}
- @safety{@prelim{}@mtunsafe{@mtasurace{}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{mblen} function with a non-null @var{string} argument returns
- the number of bytes that make up the multibyte character beginning at
- @var{string}, never examining more than @var{size} bytes. (The idea is
- to supply for @var{size} the number of bytes of data you have in hand.)
- The return value of @code{mblen} distinguishes three possibilities: the
- first @var{size} bytes at @var{string} start with valid multibyte
- characters, they start with an invalid byte sequence or just part of a
- character, or @var{string} points to an empty string (a null character).
- For a valid multibyte character, @code{mblen} returns the number of
- bytes in that character (always at least @code{1} and never more than
- @var{size}). For an invalid byte sequence, @code{mblen} returns
- @math{-1}. For an empty string, it returns @math{0}.
- If the multibyte character code uses shift characters, then @code{mblen}
- maintains and updates a shift state as it scans. If you call
- @code{mblen} with a null pointer for @var{string}, that initializes the
- shift state to its standard initial value. It also returns a nonzero
- value if the multibyte character code in use actually has a shift state.
- @xref{Shift State}.
- @pindex stdlib.h
- The function @code{mblen} is declared in @file{stdlib.h}.
- @end deftypefun
- @node Non-reentrant String Conversion
- @subsection Non-reentrant Conversion of Strings
- For convenience the @w{ISO C90} standard also defines functions to
- convert entire strings instead of single characters. These functions
- suffer from the same problems as their reentrant counterparts from
- @w{Amendment 1} to @w{ISO C90}; see @ref{Converting Strings}.
- @deftypefun size_t mbstowcs (wchar_t *@var{wstring}, const char *@var{string}, size_t @var{size})
- @standards{ISO, stdlib.h}
- @safety{@prelim{}@mtsafe{}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- @c Odd... Although this was supposed to be non-reentrant, the internal
- @c state is not a static buffer, but an automatic variable.
- The @code{mbstowcs} (``multibyte string to wide character string'')
- function converts the null-terminated string of multibyte characters
- @var{string} to an array of wide character codes, storing not more than
- @var{size} wide characters into the array beginning at @var{wstring}.
- The terminating null character counts towards the size, so if @var{size}
- is less than the actual number of wide characters resulting from
- @var{string}, no terminating null character is stored.
- The conversion of characters from @var{string} begins in the initial
- shift state.
- If an invalid multibyte character sequence is found, the @code{mbstowcs}
- function returns a value of @math{-1}. Otherwise, it returns the number
- of wide characters stored in the array @var{wstring}. This number does
- not include the terminating null character, which is present if the
- number is less than @var{size}.
- Here is an example showing how to convert a string of multibyte
- characters, allocating enough space for the result.
- @smallexample
- wchar_t *
- mbstowcs_alloc (const char *string)
- @{
- size_t size = strlen (string) + 1;
- wchar_t *buf = xmalloc (size * sizeof (wchar_t));
- size = mbstowcs (buf, string, size);
- if (size == (size_t) -1)
- return NULL;
- buf = xreallocarray (buf, size + 1, sizeof *buf);
- return buf;
- @}
- @end smallexample
- If @var{wstring} is a null pointer then no output is written and the
- conversion proceeds as above, and the result is returned. In practice
- such behaviour is useful for calculating the exact number of wide
- characters required to convert @var{string}. This behaviour of
- accepting a null pointer for @var{wstring} is an @w{XPG4.2} extension
- that is not specified in @w{ISO C} and is optional in @w{POSIX}.
- @end deftypefun
- @deftypefun size_t wcstombs (char *@var{string}, const wchar_t *@var{wstring}, size_t @var{size})
- @standards{ISO, stdlib.h}
- @safety{@prelim{}@mtsafe{}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- The @code{wcstombs} (``wide character string to multibyte string'')
- function converts the null-terminated wide character array @var{wstring}
- into a string containing multibyte characters, storing not more than
- @var{size} bytes starting at @var{string}, followed by a terminating
- null character if there is room. The conversion of characters begins in
- the initial shift state.
- The terminating null character counts towards the size, so if @var{size}
- is less than or equal to the number of bytes needed for @var{wstring}, no
- terminating null character is stored.
- If a code that does not correspond to a valid multibyte character is
- found, the @code{wcstombs} function returns a value of @math{-1}.
- Otherwise, the return value is the number of bytes stored in the array
- @var{string}. This number does not include the terminating null character,
- which is present if the number is less than @var{size}.
- @end deftypefun
- @node Shift State
- @subsection States in Non-reentrant Functions
- In some multibyte character codes, the @emph{meaning} of any particular
- byte sequence is not fixed; it depends on what other sequences have come
- earlier in the same string. Typically there are just a few sequences that
- can change the meaning of other sequences; these few are called
- @dfn{shift sequences} and we say that they set the @dfn{shift state} for
- other sequences that follow.
- To illustrate shift state and shift sequences, suppose we decide that
- the sequence @code{0200} (just one byte) enters Japanese mode, in which
- pairs of bytes in the range from @code{0240} to @code{0377} are single
- characters, while @code{0201} enters Latin-1 mode, in which single bytes
- in the range from @code{0240} to @code{0377} are characters, and
- interpreted according to the ISO Latin-1 character set. This is a
- multibyte code that has two alternative shift states (``Japanese mode''
- and ``Latin-1 mode''), and two shift sequences that specify particular
- shift states.
- When the multibyte character code in use has shift states, then
- @code{mblen}, @code{mbtowc}, and @code{wctomb} must maintain and update
- the current shift state as they scan the string. To make this work
- properly, you must follow these rules:
- @itemize @bullet
- @item
- Before starting to scan a string, call the function with a null pointer
- for the multibyte character address---for example, @code{mblen (NULL,
- 0)}. This initializes the shift state to its standard initial value.
- @item
- Scan the string one character at a time, in order. Do not ``back up''
- and rescan characters already scanned, and do not intersperse the
- processing of different strings.
- @end itemize
- Here is an example of using @code{mblen} following these rules:
- @smallexample
- void
- scan_string (char *s)
- @{
- int length = strlen (s);
- /* @r{Initialize shift state.} */
- mblen (NULL, 0);
- while (1)
- @{
- int thischar = mblen (s, length);
- /* @r{Deal with end of string and invalid characters.} */
- if (thischar == 0)
- break;
- if (thischar == -1)
- @{
- error (0, 0, "invalid multibyte character");
- break;
- @}
- /* @r{Advance past this character.} */
- s += thischar;
- length -= thischar;
- @}
- @}
- @end smallexample
- The functions @code{mblen}, @code{mbtowc} and @code{wctomb} are not
- reentrant when using a multibyte code that uses a shift state. However,
- no other library functions call these functions, so you don't have to
- worry that the shift state will be changed mysteriously.
- @node Generic Charset Conversion
- @section Generic Charset Conversion
- The conversion functions mentioned so far in this chapter all had in
- common that they operate on character sets that are not directly
- specified by the functions. The multibyte encoding used is specified by
- the currently selected locale for the @code{LC_CTYPE} category. The
- wide character set is fixed by the implementation (in the case of @theglibc{}
- it is always UCS-4 encoded @w{ISO 10646}).
- This has of course several problems when it comes to general character
- conversion:
- @itemize @bullet
- @item
- For every conversion where neither the source nor the destination
- character set is the character set of the locale for the @code{LC_CTYPE}
- category, one has to change the @code{LC_CTYPE} locale using
- @code{setlocale}.
- Changing the @code{LC_CTYPE} locale introduces major problems for the rest
- of the program since several more functions (e.g., the character
- classification functions, @pxref{Classification of Characters}) use the
- @code{LC_CTYPE} category.
- @item
- Parallel conversions to and from different character sets are not
- possible since the @code{LC_CTYPE} selection is global and shared by all
- threads.
- @item
- If neither the source nor the destination character set is the character
- set used for @code{wchar_t} representation, there is at least a two-step
- process necessary to convert a text using the functions above. One would
- have to select the source character set as the multibyte encoding,
- convert the text into a @code{wchar_t} text, select the destination
- character set as the multibyte encoding, and convert the wide character
- text to the multibyte (@math{=} destination) character set.
- Even if this is possible (which is not guaranteed) it is very tiring
- work. It also suffers even more from the other two points raised above,
- due to the constant changing of the locale.
- @end itemize
- The XPG2 standard defines a completely new set of functions, which has
- none of these limitations. They are not at all coupled to the selected
- locales, and they have no constraints on the character sets selected for
- source and destination. Only the set of available conversions limits
- them. The standard does not specify that any conversion at all must be
- available. Such availability is a measure of the quality of the
- implementation.
- The following text first describes the interface to @code{iconv} and
- then the conversion function. Comparisons with other
- implementations will show what obstacles stand in the way of portable
- applications. Finally, the implementation is described in so far as might
- interest the advanced user who wants to extend conversion capabilities.
- @menu
- * Generic Conversion Interface:: Generic Character Set Conversion Interface.
- * iconv Examples:: A complete @code{iconv} example.
- * Other iconv Implementations:: Some Details about other @code{iconv}
- Implementations.
- * glibc iconv Implementation:: The @code{iconv} Implementation in the GNU C
- library.
- @end menu
- @node Generic Conversion Interface
- @subsection Generic Character Set Conversion Interface
- This set of functions follows the traditional cycle of using a resource:
- open--use--close. The interface consists of three functions, each of
- which implements one step.
- Before the interfaces are described it is necessary to introduce a
- data type. Just like other open--use--close interfaces the functions
- introduced here work using handles and the @file{iconv.h} header
- defines a special type for the handles used.
- @deftp {Data Type} iconv_t
- @standards{XPG2, iconv.h}
- This data type is an abstract type defined in @file{iconv.h}. The user
- must not assume anything about the definition of this type; it must be
- completely opaque.
- Objects of this type can be assigned handles for the conversions using
- the @code{iconv} functions. The objects themselves need not be freed, but
- the conversions for which the handles stand have to be.
- @end deftp
- @noindent
- The first step is the function to create a handle.
- @deftypefun iconv_t iconv_open (const char *@var{tocode}, const char *@var{fromcode})
- @standards{XPG2, iconv.h}
- @safety{@prelim{}@mtsafe{@mtslocale{}}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{} @acsfd{}}}
- @c Calls malloc if tocode and/or fromcode are too big for alloca. Calls
- @c strip and upstr on both, then gconv_open. strip and upstr call
- @c isalnum_l and toupper_l with the C locale. gconv_open may MT-safely
- @c tokenize toset, replace unspecified codesets with the current locale
- @c (possibly two different accesses), and finally it calls
- @c gconv_find_transform and initializes the gconv_t result with all the
- @c steps in the conversion sequence, running each one's initializer,
- @c destructing and releasing them all if anything fails.
- The @code{iconv_open} function has to be used before starting a
- conversion. The two parameters this function takes determine the
- source and destination character set for the conversion, and if the
- implementation has the possibility to perform such a conversion, the
- function returns a handle.
- If the wanted conversion is not available, the @code{iconv_open} function
- returns @code{(iconv_t) -1}. In this case the global variable
- @code{errno} can have the following values:
- @table @code
- @item EMFILE
- The process already has @code{OPEN_MAX} file descriptors open.
- @item ENFILE
- The system limit of open files is reached.
- @item ENOMEM
- Not enough memory to carry out the operation.
- @item EINVAL
- The conversion from @var{fromcode} to @var{tocode} is not supported.
- @end table
- It is not possible to use the same descriptor in different threads to
- perform independent conversions. The data structures associated
- with the descriptor include information about the conversion state.
- This must not be messed up by using it in different conversions.
- An @code{iconv} descriptor is like a file descriptor in that a new
- descriptor must be created for every use. The descriptor does not stand
- for all of the conversions from @var{fromcode} to @var{tocode}.
- The @glibcadj{} implementation of @code{iconv_open} has one
- significant extension to other implementations. To ease the extension
- of the set of available conversions, the implementation allows storing
- the necessary files with data and code in an arbitrary number of
- directories. How this extension must be written will be explained below
- (@pxref{glibc iconv Implementation}). Here it is only important to say
- that all directories mentioned in the @code{GCONV_PATH} environment
- variable are considered only if they contain a file @file{gconv-modules}.
- These directories need not necessarily be created by the system
- administrator. In fact, this extension is introduced to help users
- writing and using their own, new conversions. Of course, this does not
- work for security reasons in SUID binaries; in this case only the system
- directory is considered and this normally is
- @file{@var{prefix}/lib/gconv}. The @code{GCONV_PATH} environment
- variable is examined exactly once at the first call of the
- @code{iconv_open} function. Later modifications of the variable have no
- effect.
- @pindex iconv.h
- The @code{iconv_open} function was introduced early in the X/Open
- Portability Guide, @w{version 2}. It is supported by all commercial
- Unices as it is required for the Unix branding. However, the quality and
- completeness of the implementation varies widely. The @code{iconv_open}
- function is declared in @file{iconv.h}.
- @end deftypefun
- The @code{iconv} implementation can associate large data structures with
- the handle returned by @code{iconv_open}. Therefore, it is crucial to
- free all the resources once all conversions are carried out and the
- conversion is not needed anymore.
- @deftypefun int iconv_close (iconv_t @var{cd})
- @standards{XPG2, iconv.h}
- @safety{@prelim{}@mtsafe{}@asunsafe{@asucorrupt{} @ascuheap{} @asulock{} @ascudlopen{}}@acunsafe{@acucorrupt{} @aculock{} @acsmem{}}}
- @c Calls gconv_close to destruct and release each of the conversion
- @c steps, release the gconv_t object, then call gconv_close_transform.
- @c Access to the gconv_t object is not guarded, but calling iconv_close
- @c concurrently with any other use is undefined.
- The @code{iconv_close} function frees all resources associated with the
- handle @var{cd}, which must have been returned by a successful call to
- the @code{iconv_open} function.
- If the function call was successful the return value is @math{0}.
- Otherwise it is @math{-1} and @code{errno} is set appropriately.
- Defined errors are:
- @table @code
- @item EBADF
- The conversion descriptor is invalid.
- @end table
- @pindex iconv.h
- The @code{iconv_close} function was introduced together with the rest
- of the @code{iconv} functions in XPG2 and is declared in @file{iconv.h}.
- @end deftypefun
- The standard defines only one actual conversion function. This has,
- therefore, the most general interface: it allows conversion from one
- buffer to another. Conversion from a file to a buffer, vice versa, or
- even file to file can be implemented on top of it.
- @deftypefun size_t iconv (iconv_t @var{cd}, char **@var{inbuf}, size_t *@var{inbytesleft}, char **@var{outbuf}, size_t *@var{outbytesleft})
- @standards{XPG2, iconv.h}
- @safety{@prelim{}@mtsafe{@mtsrace{:cd}}@assafe{}@acunsafe{@acucorrupt{}}}
- @c Without guarding access to the iconv_t object pointed to by cd, call
- @c the conversion function to convert inbuf or flush the internal
- @c conversion state.
- @cindex stateful
- The @code{iconv} function converts the text in the input buffer
- according to the rules associated with the descriptor @var{cd} and
- stores the result in the output buffer. It is possible to call the
- function for the same text several times in a row since for stateful
- character sets the necessary state information is kept in the data
- structures associated with the descriptor.
- The input buffer is specified by @code{*@var{inbuf}} and it contains
- @code{*@var{inbytesleft}} bytes. The extra indirection is necessary for
- communicating the used input back to the caller (see below). It is
- important to note that the buffer pointer is of type @code{char} and the
- length is measured in bytes even if the input text is encoded in wide
- characters.
- The output buffer is specified in a similar way. @code{*@var{outbuf}}
- points to the beginning of the buffer with at least
- @code{*@var{outbytesleft}} bytes room for the result. The buffer
- pointer again is of type @code{char} and the length is measured in
- bytes. If @var{outbuf} or @code{*@var{outbuf}} is a null pointer, the
- conversion is performed but no output is available.
- If @var{inbuf} is a null pointer, the @code{iconv} function performs the
- necessary action to put the state of the conversion into the initial
- state. This is obviously a no-op for non-stateful encodings, but if the
- encoding has a state, such a function call might put some byte sequences
- in the output buffer, which perform the necessary state changes. The
- next call with @var{inbuf} not being a null pointer then simply goes on
- from the initial state. It is important that the programmer never makes
- any assumption as to whether the conversion has to deal with states.
- Even if the input and output character sets are not stateful, the
- implementation might still have to keep states. This is due to the
- implementation chosen for @theglibc{} as it is described below.
- Therefore an @code{iconv} call to reset the state should always be
- performed if some protocol requires this for the output text.
- The conversion stops for one of three reasons. The first is that all
- characters from the input buffer are converted. This actually can mean
- two things: either all bytes from the input buffer are consumed or
- there are some bytes at the end of the buffer that possibly can form a
- complete character but the input is incomplete. The second reason for a
- stop is that the output buffer is full. And the third reason is that
- the input contains invalid characters.
- In all of these cases the buffer pointers after the last successful
- conversion, for the input and output buffers, are stored in @var{inbuf} and
- @var{outbuf}, and the available room in each buffer is stored in
- @var{inbytesleft} and @var{outbytesleft}.
- Since the character sets selected in the @code{iconv_open} call can be
- almost arbitrary, there can be situations where the input buffer contains
- valid characters, which have no identical representation in the output
- character set. The behavior in this situation is undefined. The
- @emph{current} behavior of @theglibc{} in this situation is to
- return with an error immediately. This certainly is not the most
- desirable solution; therefore, future versions will provide better ones,
- but they are not yet finished.
- If all input from the input buffer is successfully converted and stored
- in the output buffer, the function returns the number of non-reversible
- conversions performed. In all other cases the return value is
- @code{(size_t) -1} and @code{errno} is set appropriately. In such cases
- the value pointed to by @var{inbytesleft} is nonzero.
- @table @code
- @item EILSEQ
- The conversion stopped because of an invalid byte sequence in the input.
- After the call, @code{*@var{inbuf}} points at the first byte of the
- invalid byte sequence.
- @item E2BIG
- The conversion stopped because it ran out of space in the output buffer.
- @item EINVAL
- The conversion stopped because of an incomplete byte sequence at the end
- of the input buffer.
- @item EBADF
- The @var{cd} argument is invalid.
- @end table
- @pindex iconv.h
- The @code{iconv} function was introduced in the XPG2 standard and is
- declared in the @file{iconv.h} header.
- @end deftypefun
- The definition of the @code{iconv} function is quite good overall. It
- provides quite flexible functionality. The only problems lie in the
- boundary cases, which are incomplete byte sequences at the end of the
- input buffer and invalid input. A third problem, which is not really
- a design problem, is the way conversions are selected. The standard
- does not say anything about the legitimate names or about a minimal set
- of available conversions. We will see below how this negatively impacts
- other implementations.
- @node iconv Examples
- @subsection A complete @code{iconv} example
- The example below features a solution for a common problem. Given that
- one knows the internal encoding used by the system for @code{wchar_t}
- strings, one often is in the position to read text from a file and store
- it in wide character buffers. One can do this using @code{mbsrtowcs},
- but then we run into the problems discussed above.
- @smallexample
- int
- file2wcs (int fd, const char *charset, wchar_t *outbuf, size_t avail)
- @{
- char inbuf[BUFSIZ];
- size_t insize = 0;
- char *wrptr = (char *) outbuf;
- int result = 0;
- iconv_t cd;
- cd = iconv_open ("WCHAR_T", charset);
- if (cd == (iconv_t) -1)
- @{
- /* @r{Something went wrong.} */
- if (errno == EINVAL)
- error (0, 0, "conversion from '%s' to wchar_t not available",
- charset);
- else
- perror ("iconv_open");
- /* @r{Terminate the output string.} */
- *outbuf = L'\0';
- return -1;
- @}
- while (avail > 0)
- @{
- size_t nread;
- size_t nconv;
- char *inptr = inbuf;
- /* @r{Read more input.} */
- nread = read (fd, inbuf + insize, sizeof (inbuf) - insize);
- if (nread == 0)
- @{
- /* @r{When we come here the file is completely read.}
- @r{This still could mean there are some unused}
- @r{characters in the @code{inbuf}. Put them back.} */
- if (lseek (fd, -insize, SEEK_CUR) == -1)
- result = -1;
- /* @r{Now write out the byte sequence to get into the}
- @r{initial state if this is necessary.} */
- iconv (cd, NULL, NULL, &wrptr, &avail);
- break;
- @}
- insize += nread;
- /* @r{Do the conversion.} */
- nconv = iconv (cd, &inptr, &insize, &wrptr, &avail);
- if (nconv == (size_t) -1)
- @{
- /* @r{Not everything went right. It might only be}
- @r{an unfinished byte sequence at the end of the}
- @r{buffer. Or it is a real problem.} */
- if (errno == EINVAL)
- /* @r{This is harmless. Simply move the unused}
- @r{bytes to the beginning of the buffer so that}
- @r{they can be used in the next round.} */
- memmove (inbuf, inptr, insize);
- else
- @{
- /* @r{It is a real problem. Maybe we ran out of}
- @r{space in the output buffer or we have invalid}
- @r{input. In any case back the file pointer to}
- @r{the position of the last processed byte.} */
- lseek (fd, -insize, SEEK_CUR);
- result = -1;
- break;
- @}
- @}
- @}
- /* @r{Terminate the output string.} */
- if (avail >= sizeof (wchar_t))
- *((wchar_t *) wrptr) = L'\0';
- if (iconv_close (cd) != 0)
- perror ("iconv_close");
- return (wchar_t *) wrptr - outbuf;
- @}
- @end smallexample
- @cindex stateful
- This example shows the most important aspects of using the @code{iconv}
- functions. It shows how successive calls to @code{iconv} can be used to
- convert large amounts of text. The user does not have to care about
- stateful encodings as the functions take care of everything.
- An interesting point is the case where @code{iconv} returns an error and
- @code{errno} is set to @code{EINVAL}. This is not really an error in the
- transformation. It can happen whenever the input character set contains
- byte sequences of more than one byte for some character and texts are not
- processed in one piece. In this case there is a chance that a multibyte
- sequence is cut. The caller can then simply read the remainder of the
- text and feed the offending bytes together with new characters from the
- input to @code{iconv} and continue the work. The internal state kept in
- the descriptor is @emph{not} unspecified after such an event as is the
- case with the conversion functions from the @w{ISO C} standard.
- The example also shows the problem of using wide character strings with
- @code{iconv}. As explained in the description of the @code{iconv}
- function above, the function always takes a pointer to a @code{char}
- array and the available space is measured in bytes. In the example, the
- output buffer is a wide character buffer; therefore, we use a local
- variable @var{wrptr} of type @code{char *}, which is used in the
- @code{iconv} calls.
- This looks rather innocent but can lead to problems on platforms that
- have tight restrictions on alignment. Therefore the caller of @code{iconv}
- has to make sure that the pointers passed are suitable for access of
- characters from the appropriate character set. Since, in the
- above case, the input parameter to the function is a @code{wchar_t}
- pointer, this is the case (unless the user violates alignment when
- computing the parameter). But in other situations, especially when
- writing generic functions where one does not know what type of character
- set one uses and, therefore, treats text as a sequence of bytes, it might
- become tricky.
- @node Other iconv Implementations
- @subsection Some Details about other @code{iconv} Implementations
- This is not really the place to discuss the @code{iconv} implementation
- of other systems but it is necessary to know a bit about them to write
- portable programs. The above mentioned problems with the specification
- of the @code{iconv} functions can lead to portability issues.
- The first thing to notice is that, due to the large number of character
- sets in use, it is certainly not practical to encode the conversions
- directly in the C library. Therefore, the conversion information must
- come from files outside the C library. This is usually done in one or
- both of the following ways:
- @itemize @bullet
- @item
- The C library contains a set of generic conversion functions that can
- read the needed conversion tables and other information from data files.
- These files get loaded when necessary.
- This solution is problematic as it requires a great deal of effort to
- apply to all character sets (potentially an infinite set). The
- differences in the structure of the different character sets are so large
- that many different variants of the table-processing functions must be
- developed. In addition, the generic nature of these functions makes them
- slower than specifically implemented functions.
- @item
- The C library only contains a framework that can dynamically load
- object files and execute the conversion functions contained therein.
- This solution provides much more flexibility. The C library itself
- contains only very little code and therefore reduces the general memory
- footprint. Also, with a documented interface between the C library and
- the loadable modules it is possible for third parties to extend the set
- of available conversion modules. A drawback of this solution is that
- dynamic loading must be available.
- @end itemize
- Some implementations in commercial Unices implement a mixture of these
- possibilities; the majority implement only the second solution. Using
- loadable modules moves the code out of the library itself and keeps
- the door open for extensions and improvements, but this design is also
- limiting on some platforms since not many platforms support dynamic
- loading in statically linked programs. On platforms without this
- capability it is therefore not possible to use this interface in
- statically linked programs. @Theglibc{} has, on ELF platforms, no
- problems with dynamic loading in these situations; therefore, this
- point is moot. The danger is that one gets acquainted with this
- situation and forgets about the restrictions on other systems.
- A second thing to know about other @code{iconv} implementations is that
- the number of available conversions is often very limited. Some
- implementations provide, in the standard release (not special
- international or developer releases), at most 100 to 200 conversion
- possibilities. This does not mean 200 different character sets are
- supported; for example, conversions from one character set to a set of 10
- others might count as 10 conversions. Together with the other direction
- this makes 20 conversion possibilities used up by one character set. One
- can imagine the thin coverage these platforms provide. Some Unix vendors
- even provide only a handful of conversions, which renders them useless for
- almost all uses.
- This directly leads to a third and probably the most problematic point.
- With the way the @code{iconv} conversion functions are implemented on all
- known Unix systems, the availability of the conversion functions from
- character set @math{@cal{A}} to @math{@cal{B}} and the conversion from
- @math{@cal{B}} to @math{@cal{C}} does @emph{not} imply that the
- conversion from @math{@cal{A}} to @math{@cal{C}} is available.
- This might not seem unreasonable and problematic at first, but it is a
- quite big problem as one will notice shortly after hitting it. To show
- the problem, assume we write a program that has to convert from
- @math{@cal{A}} to @math{@cal{C}}. A call like
- @smallexample
- cd = iconv_open ("@math{@cal{C}}", "@math{@cal{A}}");
- @end smallexample
- @noindent
- fails according to the assumption above. But what does the program
- do now? The conversion is necessary; therefore, simply giving up is not
- an option.
- This is a nuisance. The @code{iconv} function should take care of this.
- But how should the program proceed from here on? If it tries to convert
- to character set @math{@cal{B}} first, the two @code{iconv_open}
- calls
- @smallexample
- cd1 = iconv_open ("@math{@cal{B}}", "@math{@cal{A}}");
- @end smallexample
- @noindent
- and
- @smallexample
- cd2 = iconv_open ("@math{@cal{C}}", "@math{@cal{B}}");
- @end smallexample
- @noindent
- will succeed, but how to find @math{@cal{B}}?
- Unfortunately, the answer is: there is no general solution. On some
- systems guessing might help. On those systems most character sets can
- convert to and from UTF-8 encoded @w{ISO 10646} or Unicode text. Besides
- this only some very system-specific methods can help. Since the
- conversion functions come from loadable modules and these modules must
- be stored somewhere in the filesystem, one @emph{could} try to find them
- and determine from the available file which conversions are available
- and whether there is an indirect route from @math{@cal{A}} to
- @math{@cal{C}}.
- This example shows one of the design errors of @code{iconv} mentioned
- above. It should at least be possible to determine the list of available
- conversions programmatically so that if @code{iconv_open} says there is no
- such conversion, one could make sure this also is true for indirect
- routes.
- @node glibc iconv Implementation
- @subsection The @code{iconv} Implementation in @theglibc{}
- After reading about the problems of @code{iconv} implementations in the
- last section it is certainly good to note that the implementation in
- @theglibc{} has none of the problems mentioned above. What
- follows is a step-by-step analysis of the points raised above. The
- evaluation is based on the current state of the development (as of
- January 1999). The development of the @code{iconv} functions is not
- complete, but basic functionality has solidified.
- @Theglibc{}'s @code{iconv} implementation uses shared loadable
- modules to implement the conversions. A very small number of
- conversions are built into the library itself but these are only rather
- trivial conversions.
- All the benefits of loadable modules are available in the @glibcadj{}
- implementation. This is especially appealing since the interface is
- well documented (see below), and it, therefore, is easy to write new
- conversion modules. The drawback of using loadable objects is not a
- problem in @theglibc{}, at least on ELF systems. Since the
- library is able to load shared objects even in statically linked
- binaries, static linking need not be forbidden in case one wants to use
- @code{iconv}.
- The second mentioned problem is the number of supported conversions.
- Currently, @theglibc{} supports more than 150 character sets. Given the
- way the implementation is designed, the number of supported conversions
- is greater than 22350 (@math{150} times @math{149}). If any conversion
- from or to a character set is missing, it can be added easily.
- Particularly impressive as it may be, this high number is due to the
- fact that the @glibcadj{} implementation of @code{iconv} does not have
- the third problem mentioned above (i.e., whenever there is a conversion
- from a character set @math{@cal{A}} to @math{@cal{B}} and from
- @math{@cal{B}} to @math{@cal{C}} it is always possible to convert from
- @math{@cal{A}} to @math{@cal{C}} directly). If @code{iconv_open}
- returns an error and sets @code{errno} to @code{EINVAL}, there is no
- known way, directly or indirectly, to perform the wanted conversion.
- @cindex triangulation
- Triangulation is achieved by providing for each character set a
- conversion from and to UCS-4 encoded @w{ISO 10646}. Using @w{ISO 10646}
- as an intermediate representation it is possible to @dfn{triangulate}
- (i.e., convert with an intermediate representation).
- There is no inherent requirement to provide a conversion to @w{ISO
- 10646} for a new character set, and it is also possible to provide other
- conversions where neither source nor destination character set is @w{ISO
- 10646}. The existing set of conversions is simply meant to cover all
- conversions that might be of interest.
- @cindex ISO-2022-JP
- @cindex EUC-JP
- All currently available conversions use the triangulation method above,
- making conversion run unnecessarily slow. If, for example, somebody
- often needs the conversion from ISO-2022-JP to EUC-JP, a quicker solution
- would involve direct conversion between the two character sets, skipping
- the conversion of the input to @w{ISO 10646} first. The two character sets of interest
- are much more similar to each other than to @w{ISO 10646}.
- In such a situation one easily can write a new conversion and provide it
- as a better alternative. The @glibcadj{} @code{iconv} implementation
- would automatically use the module implementing the conversion if it is
- specified to be more efficient.
- @subsubsection Format of @file{gconv-modules} files
- All information about the available conversions comes from a file named
- @file{gconv-modules}, which can be found in any of the directories along
- the @code{GCONV_PATH}. The @file{gconv-modules} files are line-oriented
- text files, where each of the lines has one of the following formats:
- @itemize @bullet
- @item
- If the first non-whitespace character is a @kbd{#} the line contains only
- comments and is ignored.
- @item
- Lines starting with @code{alias} define an alias name for a character
- set. Two more words are expected on the line. The first word
- defines the alias name, and the second defines the original name of the
- character set. The effect is that it is possible to use the alias name
- in the @var{fromset} or @var{toset} parameters of @code{iconv_open} and
- achieve the same result as when using the real character set name.
- This is quite important as a character set often has many different
- names. There is normally an official name but this need not correspond to
- the most popular name. Besides this many character sets have special
- names that are somehow constructed. For example, all character sets
- specified by the ISO have an alias of the form @code{ISO-IR-@var{nnn}}
- where @var{nnn} is the registration number. This allows programs that
- know about the registration number to construct character set names and
- use them in @code{iconv_open} calls. More on the available names and
- aliases follows below.
- @item
- Lines starting with @code{module} introduce an available conversion
- module. These lines must contain three or four more words.
- The first word specifies the source character set, the second word the
- destination character set of conversion implemented in this module, and
- the third word is the name of the loadable module. The filename is
- constructed by appending the usual shared object suffix (normally
- @file{.so}) and this file is then supposed to be found in the same
- directory the @file{gconv-modules} file is in. The last word on the line,
- which is optional, is a numeric value representing the cost of the
- conversion. If this word is missing, a cost of @math{1} is assumed. The
- numeric value itself does not matter that much; what counts are the
- relative values of the sums of costs for all possible conversion paths.
- Below is a more precise description of the use of the cost value.
- @end itemize
- Let us return to the example above where one has written a module to directly
- convert from ISO-2022-JP to EUC-JP and back. All that has to be done is
- to put the new module, let its name be ISO2022JP-EUCJP.so, in a directory
- and add a file @file{gconv-modules} with the following content in the
- same directory:
- @smallexample
- module ISO-2022-JP// EUC-JP// ISO2022JP-EUCJP 1
- module EUC-JP// ISO-2022-JP// ISO2022JP-EUCJP 1
- @end smallexample
- To see why this is sufficient, it is necessary to understand how the
- conversion used by @code{iconv} (and described in the descriptor) is
- selected. The approach to this problem is quite simple.
- At the first call of the @code{iconv_open} function the program reads
- all available @file{gconv-modules} files and builds up two tables: one
- containing all the known aliases and another that contains the
- information about the conversions and which shared object implements
- them.
- @subsubsection Finding the conversion path in @code{iconv}
- The set of available conversions forms a directed graph with weighted
- edges. The weights on the edges are the costs specified in the
- @file{gconv-modules} files. The @code{iconv_open} function uses an
- algorithm suitable for searching for the best path in such a graph and so
- constructs a list of conversions that must be performed in succession
- to get the transformation from the source to the destination character
- set.
- Explaining why the above @file{gconv-modules} file allows the
- @code{iconv} implementation to resolve the specific ISO-2022-JP to
- EUC-JP conversion module instead of the conversion coming with the
- library itself is straightforward. Since the latter conversion takes two
- steps (from ISO-2022-JP to @w{ISO 10646} and then from @w{ISO 10646} to
- EUC-JP), the cost is @math{1+1 = 2}. The above @file{gconv-modules}
- file, however, specifies that the new conversion modules can perform this
- conversion with only the cost of @math{1}.
- A mysterious item about the @file{gconv-modules} file above (and also
- the file coming with @theglibc{}) are the names of the character
- sets specified in the @code{module} lines. Why do almost all the names
- end in @code{//}? And this is not all: the names can actually be
- regular expressions. At this point in time this mystery should not be
- revealed, unless you have the relevant spell-casting materials: ashes
- from an original @w{DOS 6.2} boot disk burnt in effigy, a crucifix
- blessed by St.@: Emacs, assorted herbal roots from Central America, sand
- from Cebu, etc. Sorry! @strong{The part of the implementation where
- this is used is not yet finished. For now please simply follow the
- existing examples. It'll become clearer once it is. --drepper}
- A last remark about the @file{gconv-modules} is about the names not
- ending with @code{//}. A character set named @code{INTERNAL} is often
- mentioned. From the discussion above and the chosen name it should have
- become clear that this is the name for the representation used in the
- intermediate step of the triangulation. We have said that this is UCS-4
- but actually that is not quite right. The UCS-4 specification also
- includes the specification of the byte ordering used. Since a UCS-4 value
- consists of four bytes, a stored value is affected by byte ordering. The
- internal representation is @emph{not} the same as UCS-4 in case the byte
- ordering of the processor (or at least the running process) is not the
- same as the one required for UCS-4. This is done for performance reasons
- as one does not want to perform unnecessary byte-swapping operations if
- one is not interested in actually seeing the result in UCS-4. To avoid
- trouble with endianness, the internal representation consistently is named
- @code{INTERNAL} even on big-endian systems where the representations are
- identical.
- @subsubsection @code{iconv} module data structures
- So far this section has described how modules are located and considered
- to be used. What remains to be described is the interface of the modules
- so that one can write new ones. This section describes the interface as
- it is in use in January 1999. The interface will change a bit in the
- future but, with luck, only in an upwardly compatible way.
- The definitions necessary to write new modules are publicly available
- in the non-standard header @file{gconv.h}. The following text,
- therefore, describes the definitions from this header file. First,
- however, it is necessary to get an overview.
- From the perspective of the user of @code{iconv} the interface is quite
- simple: the @code{iconv_open} function returns a handle that can be used
- in calls to @code{iconv}, and finally the handle is freed with a call to
- @code{iconv_close}. The problem is that the handle has to be able to
- represent the possibly long sequences of conversion steps and also the
- state of each conversion since the handle is all that is passed to the
- @code{iconv} function. Therefore, the data structures are really the
- elements necessary to understand the implementation.
- We need two different kinds of data structures. The first describes the
- conversion and the second describes the state etc. There are really two
- type definitions like this in @file{gconv.h}.
- @pindex gconv.h
- @deftp {Data type} {struct __gconv_step}
- @standards{GNU, gconv.h}
- This data structure describes one conversion a module can perform. For
- each function in a loaded module with conversion functions there is
- exactly one object of this type. This object is shared by all users of
- the conversion (i.e., this object does not contain any information
- corresponding to an actual conversion; it only describes the conversion
- itself).
- @table @code
- @item struct __gconv_loaded_object *__shlib_handle
- @itemx const char *__modname
- @itemx int __counter
- All these elements of the structure are used internally in the C library
- to coordinate loading and unloading the shared object. One must not expect any
- of the other elements to be available or initialized.
- @item const char *__from_name
- @itemx const char *__to_name
- @code{__from_name} and @code{__to_name} contain the names of the source and
- destination character sets. They can be used to identify the actual
- conversion to be carried out since one module might implement conversions
- for more than one character set and/or direction.
- @item __gconv_fct __fct
- @itemx __gconv_init_fct __init_fct
- @itemx __gconv_end_fct __end_fct
- These elements contain pointers to the functions in the loadable module.
- The interface will be explained below.
- @item int __min_needed_from
- @itemx int __max_needed_from
- @itemx int __min_needed_to
- @itemx int __max_needed_to
- These values have to be supplied in the init function of the module. The
- @code{__min_needed_from} value specifies how many bytes a character of
- the source character set at least needs. The @code{__max_needed_from}
- specifies the maximum value that also includes possible shift sequences.
- The @code{__min_needed_to} and @code{__max_needed_to} values serve the
- same purpose as @code{__min_needed_from} and @code{__max_needed_from} but
- this time for the destination character set.
- It is crucial that these values be accurate since otherwise the
- conversion functions will have problems or not work at all.
- @item int __stateful
- This element must also be initialized by the init function.
- @code{int __stateful} is nonzero if the source character set is stateful.
- Otherwise it is zero.
- @item void *__data
- This element can be used freely by the conversion functions in the
- module. @code{void *__data} can be used to communicate extra information
- from one call to another. @code{void *__data} need not be initialized if
- not needed at all. If the @code{void *__data} element is assigned a pointer
- to dynamically allocated memory (presumably in the init function) it has
- to be made sure that the end function deallocates the memory. Otherwise
- the application will leak memory.
- It is important to be aware that this data structure is shared by all
- users of this specific conversion and therefore the @code{__data}
- element must not contain data specific to one specific use of the
- conversion function.
- @end table
- @end deftp
- @deftp {Data type} {struct __gconv_step_data}
- @standards{GNU, gconv.h}
- This is the data structure that contains the information specific to
- each use of the conversion functions.
- @table @code
- @item char *__outbuf
- @itemx char *__outbufend
- These elements specify the output buffer for the conversion step. The
- @code{__outbuf} element points to the beginning of the buffer, and
- @code{__outbufend} points to the byte following the last byte in the
- buffer. The conversion function must not assume anything about the size
- of the buffer but it can be safely assumed there is room for at
- least one complete character in the output buffer.
- Once the conversion is finished, if the conversion is the last step, the
- @code{__outbuf} element must be modified to point after the last byte
- written into the buffer to signal how much output is available. If this
- conversion step is not the last one, the element must not be modified.
- The @code{__outbufend} element must not be modified.
- @item int __flags
- This field is a set of flags. The @code{__GCONV_IS_LAST} bit is set if
- this conversion step is the last one. This information is necessary for
- the recursion. See the description of the conversion function internals
- below. This element must never be modified.
- @item int __invocation_counter
- The conversion function can use this element to see how many calls of
- the conversion function already happened. Some character sets require a
- certain prolog when generating output, and by comparing this value with
- zero, one can find out whether it is the first call and whether,
- therefore, the prolog should be emitted. This element must never be
- modified.
- @item int __internal_use
- This element is another one rarely used but needed in certain
- situations. It is assigned a nonzero value in case the conversion
- functions are used to implement @code{mbsrtowcs} et al.@: (i.e., the
- function is not used directly through the @code{iconv} interface).
- This sometimes makes a difference as it is expected that the
- @code{iconv} functions are used to translate entire texts while the
- @code{mbsrtowcs} functions are normally used only to convert single
- strings and might be used multiple times to convert entire texts.
- But in this situation we would have a problem complying with some rules of
- the character set specification. Some character sets require a prolog,
- which must appear exactly once for an entire text. If a number of
- @code{mbsrtowcs} calls are used to convert the text, only the first call
- must add the prolog. However, because there is no communication between the
- different calls of @code{mbsrtowcs}, the conversion functions have no
- possibility to find this out. The situation is different for sequences
- of @code{iconv} calls since the handle allows access to the needed
- information.
- The @code{int __internal_use} element is mostly used together with
- @code{__invocation_counter} as follows:
- @smallexample
- if (!data->__internal_use
- && data->__invocation_counter == 0)
- /* @r{Emit prolog.} */
- @dots{}
- @end smallexample
- This element must never be modified.
- @item mbstate_t *__statep
- The @code{__statep} element points to an object of type @code{mbstate_t}
- (@pxref{Keeping the state}). The conversion of a stateful character
- set must use the object pointed to by @code{__statep} to store
- information about the conversion state. The @code{__statep} element
- itself must never be modified.
- @item mbstate_t __state
- This element must @emph{never} be used directly. It is only part of
- this structure to have the needed space allocated.
- @end table
- @end deftp
- @subsubsection @code{iconv} module interfaces
- With the knowledge about the data structures we now can describe the
- conversion function itself. To understand the interface a bit of
- knowledge is necessary about the functionality in the C library that
- loads the objects with the conversions.
- It is often the case that one conversion is used more than once (i.e.,
- there are several @code{iconv_open} calls for the same set of character
- sets during one program run). The @code{mbsrtowcs} et al.@: functions in
- @theglibc{} also use the @code{iconv} functionality, which
- increases the number of uses of the same functions even more.
- Because of this multiple use of conversions, the modules do not get
- loaded exclusively for one conversion. Instead a module once loaded can
- be used by an arbitrary number of @code{iconv} or @code{mbsrtowcs} calls
- at the same time. The splitting of the information between
- conversion-function-specific information and conversion data makes this
- possible. The last section showed the two data structures used to do this.
- This is of course also reflected in the interface and semantics of the
- functions that the modules must provide. There are three functions that
- must have the following names:
- @table @code
- @item gconv_init
- The @code{gconv_init} function initializes the conversion function
- specific data structure. This very same object is shared by all
- conversions that use this conversion and, therefore, no state information
- about the conversion itself must be stored in here. If a module
- implements more than one conversion, the @code{gconv_init} function will
- be called multiple times.
- @item gconv_end
- The @code{gconv_end} function is responsible for freeing all resources
- allocated by the @code{gconv_init} function. If there is nothing to do,
- this function can be missing. Special care must be taken if the module
- implements more than one conversion and the @code{gconv_init} function
- does not allocate the same resources for all conversions.
- @item gconv
- This is the actual conversion function. It is called to convert one
- block of text. It gets passed the conversion step information
- initialized by @code{gconv_init} and the conversion data, specific to
- this use of the conversion functions.
- @end table
- There are three data types defined for the three module interface
- functions and these define the interface.
- @deftypevr {Data type} int {(*__gconv_init_fct)} (struct __gconv_step *)
- @standards{GNU, gconv.h}
- This specifies the interface of the initialization function of the
- module. It is called exactly once for each conversion the module
- implements.
- As explained in the description of the @code{struct __gconv_step} data
- structure above the initialization function has to initialize parts of
- it.
- @table @code
- @item __min_needed_from
- @itemx __max_needed_from
- @itemx __min_needed_to
- @itemx __max_needed_to
- These elements must be initialized to the exact numbers of the minimum
- and maximum number of bytes used by one character in the source and
- destination character sets, respectively. If the characters all have the
- same size, the minimum and maximum values are the same.
- @item __stateful
- This element must be initialized to a nonzero value if the source
- character set is stateful. Otherwise it must be zero.
- @end table
- If the initialization function needs to communicate some information
- to the conversion function, this communication can happen using the
- @code{__data} element of the @code{__gconv_step} structure. But since
- this data is shared by all the conversions, it must not be modified by
- the conversion function. The example below shows how this can be used.
- @smallexample
- #define MIN_NEEDED_FROM 1
- #define MAX_NEEDED_FROM 4
- #define MIN_NEEDED_TO 4
- #define MAX_NEEDED_TO 4
- int
- gconv_init (struct __gconv_step *step)
- @{
- /* @r{Determine which direction.} */
- struct iso2022jp_data *new_data;
- enum direction dir = illegal_dir;
- enum variant var = illegal_var;
- int result;
- if (__strcasecmp (step->__from_name, "ISO-2022-JP//") == 0)
- @{
- dir = from_iso2022jp;
- var = iso2022jp;
- @}
- else if (__strcasecmp (step->__to_name, "ISO-2022-JP//") == 0)
- @{
- dir = to_iso2022jp;
- var = iso2022jp;
- @}
- else if (__strcasecmp (step->__from_name, "ISO-2022-JP-2//") == 0)
- @{
- dir = from_iso2022jp;
- var = iso2022jp2;
- @}
- else if (__strcasecmp (step->__to_name, "ISO-2022-JP-2//") == 0)
- @{
- dir = to_iso2022jp;
- var = iso2022jp2;
- @}
- result = __GCONV_NOCONV;
- if (dir != illegal_dir)
- @{
- new_data = (struct iso2022jp_data *)
- malloc (sizeof (struct iso2022jp_data));
- result = __GCONV_NOMEM;
- if (new_data != NULL)
- @{
- new_data->dir = dir;
- new_data->var = var;
- step->__data = new_data;
- if (dir == from_iso2022jp)
- @{
- step->__min_needed_from = MIN_NEEDED_FROM;
- step->__max_needed_from = MAX_NEEDED_FROM;
- step->__min_needed_to = MIN_NEEDED_TO;
- step->__max_needed_to = MAX_NEEDED_TO;
- @}
- else
- @{
- step->__min_needed_from = MIN_NEEDED_TO;
- step->__max_needed_from = MAX_NEEDED_TO;
- step->__min_needed_to = MIN_NEEDED_FROM;
- step->__max_needed_to = MAX_NEEDED_FROM + 2;
- @}
- /* @r{Yes, this is a stateful encoding.} */
- step->__stateful = 1;
- result = __GCONV_OK;
- @}
- @}
- return result;
- @}
- @end smallexample
- The function first checks which conversion is wanted. The module from
- which this function is taken implements four different conversions;
- which one is selected can be determined by comparing the names. The
- comparison should always be done without paying attention to the case.
- Next, a data structure, which contains the necessary information about
- which conversion is selected, is allocated. The data structure
- @code{struct iso2022jp_data} is locally defined since, outside the
- module, this data is not used at all. Please note that if all four
- conversions this module supports are requested there are four data
- blocks.
- One interesting thing is the initialization of the @code{__min_} and
- @code{__max_} elements of the step data object. A single ISO-2022-JP
- character can consist of one to four bytes. Therefore the
- @code{MIN_NEEDED_FROM} and @code{MAX_NEEDED_FROM} macros are defined
- this way. The output is always the @code{INTERNAL} character set (aka
- UCS-4) and therefore each character consists of exactly four bytes. For
- the conversion from @code{INTERNAL} to ISO-2022-JP we have to take into
- account that escape sequences might be necessary to switch the character
- sets. Therefore the @code{__max_needed_to} element for this direction
- gets assigned @code{MAX_NEEDED_FROM + 2}. This takes into account the
- two bytes needed for the escape sequences to signal the switching. The
- asymmetry in the maximum values for the two directions can be explained
- easily: when reading ISO-2022-JP text, escape sequences can be handled
- alone (i.e., it is not necessary to process a real character since the
- effect of the escape sequence can be recorded in the state information).
- The situation is different for the other direction. Since it is in
- general not known which character comes next, one cannot emit escape
- sequences to change the state in advance. This means the escape
- sequences have to be emitted together with the next character.
- Therefore one needs more room than only for the character itself.
- The possible return values of the initialization function are:
- @table @code
- @item __GCONV_OK
- The initialization succeeded.
- @item __GCONV_NOCONV
- The requested conversion is not supported in the module. This can
- happen if the @file{gconv-modules} file has errors.
- @item __GCONV_NOMEM
- Memory required to store additional information could not be allocated.
- @end table
- @end deftypevr
- The function called before the module is unloaded is significantly
- easier. It often has nothing at all to do; in which case it can be left
- out completely.
- @deftypevr {Data type} void {(*__gconv_end_fct)} (struct __gconv_step *)
- @standards{GNU, gconv.h}
- The task of this function is to free all resources allocated in the
- initialization function. Therefore only the @code{__data} element of
- the object pointed to by the argument is of interest. Continuing the
- example from the initialization function, the finalization function
- looks like this:
- @smallexample
- void
- gconv_end (struct __gconv_step *data)
- @{
- free (data->__data);
- @}
- @end smallexample
- @end deftypevr
- The most important function is the conversion function itself, which can
- get quite complicated for complex character sets. But since this is not
- of interest here, we will only describe a possible skeleton for the
- conversion function.
- @deftypevr {Data type} int {(*__gconv_fct)} (struct __gconv_step *, struct __gconv_step_data *, const char **, const char *, size_t *, int)
- @standards{GNU, gconv.h}
- The conversion function can be called for two basic reasons: to convert
- text or to reset the state. From the description of the @code{iconv}
- function it can be seen why the flushing mode is necessary. What mode
- is selected is determined by the sixth argument, an integer. This
- argument being nonzero means that flushing is selected.
- Common to both modes is where the output buffer can be found. The
- information about this buffer is stored in the conversion step data. A
- pointer to this information is passed as the second argument to this
- function. The description of the @code{struct __gconv_step_data}
- structure has more information on the conversion step data.
- @cindex stateful
- What has to be done for flushing depends on the source character set.
- If the source character set is not stateful, nothing has to be done.
- Otherwise the function has to emit a byte sequence to bring the state
- object into the initial state. Once all of this has happened, the other
- conversion modules in the chain of conversions have to get the same
- chance. Whether another step follows can be determined from the
- @code{__GCONV_IS_LAST} flag in the @code{__flags} field of the step
- data structure to which the second parameter points.
- The more interesting mode is when actual text has to be converted. The
- first step in this case is to convert as much text as possible from the
- input buffer and store the result in the output buffer. The start of the
- input buffer is determined by the third argument, which is a pointer to a
- pointer variable referencing the beginning of the buffer. The fourth
- argument is a pointer to the byte right after the last byte in the buffer.
- The conversion has to be performed according to the current state if the
- character set is stateful. The state is stored in an object pointed to
- by the @code{__statep} element of the step data (second argument). Once
- either the input buffer is empty or the output buffer is full the
- conversion stops. At this point, the pointer variable referenced by the
- third parameter must point to the byte following the last processed
- byte (i.e., if all of the input is consumed, this pointer and the fourth
- parameter have the same value).
- What now happens depends on whether this step is the last one. If it is
- the last step, the only thing that has to be done is to update the
- @code{__outbuf} element of the step data structure to point after the
- last written byte. This update gives the caller the information on how
- much text is available in the output buffer. In addition, the variable
- pointed to by the fifth parameter, which is of type @code{size_t}, must
- be incremented by the number of characters (@emph{not bytes}) that were
- converted in a non-reversible way. Then, the function can return.
- In case the step is not the last one, the later conversion functions have
- to get a chance to do their work. Therefore, the appropriate conversion
- function has to be called. The information about the functions is
- stored in the conversion data structures, passed as the first parameter.
- This information and the step data are stored in arrays, so the next
- element in both cases can be found by simple pointer arithmetic:
- @smallexample
- int
- gconv (struct __gconv_step *step, struct __gconv_step_data *data,
- const char **inbuf, const char *inbufend, size_t *written,
- int do_flush)
- @{
- struct __gconv_step *next_step = step + 1;
- struct __gconv_step_data *next_data = data + 1;
- @dots{}
- @end smallexample
- The @code{next_step} pointer references the next step information and
- @code{next_data} the next data record. The call of the next function
- therefore will look similar to this:
- @smallexample
- next_step->__fct (next_step, next_data, &outerr, outbuf,
- written, 0)
- @end smallexample
- But this is not yet all. Once the function call returns the conversion
- function might have some more to do. If the return value of the function
- is @code{__GCONV_EMPTY_INPUT}, more room is available in the output
- buffer. Unless the input buffer is empty, the conversion functions start
- all over again and process the rest of the input buffer. If the return
- value is not @code{__GCONV_EMPTY_INPUT}, something went wrong and we have
- to recover from this.
- A requirement for the conversion function is that the input buffer
- pointer (the third argument) always point to the last character that
- was put in converted form into the output buffer. This is trivially
- true after the conversion performed in the current step, but if the
- conversion functions deeper downstream stop prematurely, not all
- characters from the output buffer are consumed and, therefore, the input
- buffer pointers must be backed off to the right position.
- Correcting the input buffers is easy to do if the input and output
- character sets have a fixed width for all characters. In this situation
- we can compute how many characters are left in the output buffer and,
- therefore, can correct the input buffer pointer appropriately with a
- similar computation. Things get tricky if either character set
- has characters represented with variable length byte sequences, and it
- gets even more complicated if the conversion has to take care of the
- state. In these cases the conversion has to be performed once again, from
- the known state before the initial conversion (i.e., if necessary the
- state of the conversion has to be reset and the conversion loop has to be
- executed again). The difference now is that it is known how much output
- must be created, and the conversion can stop before converting the first
- unused character. Once this is done the input buffer pointers must be
- updated again and the function can return.
- One final thing should be mentioned. If it is necessary for the
- conversion to know whether it is the first invocation (in case a prolog
- has to be emitted), the conversion function should increment the
- @code{__invocation_counter} element of the step data structure just
- before returning to the caller. See the description of the @code{struct
- __gconv_step_data} structure above for more information on how this can
- be used.
- The return value must be one of the following values:
- @table @code
- @item __GCONV_EMPTY_INPUT
- All input was consumed and there is room left in the output buffer.
- @item __GCONV_FULL_OUTPUT
- No more room in the output buffer. In case this is not the last step
- this value is propagated down from the call of the next conversion
- function in the chain.
- @item __GCONV_INCOMPLETE_INPUT
- The input buffer is not entirely empty since it contains an incomplete
- character sequence.
- @end table
- The following example provides a framework for a conversion function.
- In case a new conversion has to be written the holes in this
- implementation have to be filled and that is it.
- @smallexample
- int
- gconv (struct __gconv_step *step, struct __gconv_step_data *data,
- const char **inbuf, const char *inbufend, size_t *written,
- int do_flush)
- @{
- struct __gconv_step *next_step = step + 1;
- struct __gconv_step_data *next_data = data + 1;
- gconv_fct fct = next_step->__fct;
- int status;
- /* @r{If the function is called with no input this means we have}
- @r{to reset to the initial state. The possibly partly}
- @r{converted input is dropped.} */
- if (do_flush)
- @{
- status = __GCONV_OK;
- /* @r{Possibly emit a byte sequence which puts the state object}
- @r{into the initial state.} */
- /* @r{Call the steps down the chain if there are any but only}
- @r{if we successfully emitted the escape sequence.} */
- if (status == __GCONV_OK && ! (data->__flags & __GCONV_IS_LAST))
- status = fct (next_step, next_data, NULL, NULL,
- written, 1);
- @}
- else
- @{
- /* @r{We preserve the initial values of the pointer variables.} */
- const char *inptr = *inbuf;
- char *outbuf = data->__outbuf;
- char *outend = data->__outbufend;
- char *outptr;
- do
- @{
- /* @r{Remember the start value for this round.} */
- inptr = *inbuf;
- /* @r{The outbuf buffer is empty.} */
- outptr = outbuf;
- /* @r{For stateful encodings the state must be saved here.} */
- /* @r{Run the conversion loop. @code{status} is set}
- @r{appropriately afterwards.} */
- /* @r{If this is the last step, leave the loop. There is}
- @r{nothing we can do.} */
- if (data->__flags & __GCONV_IS_LAST)
- @{
- /* @r{Store information about how many bytes are}
- @r{available.} */
- data->__outbuf = outbuf;
- /* @r{If any non-reversible conversions were performed,}
- @r{add the number to @code{*written}.} */
- break;
- @}
- /* @r{Write out all output that was produced.} */
- if (outbuf > outptr)
- @{
- const char *outerr = data->__outbuf;
- int result;
- result = fct (next_step, next_data, &outerr,
- outbuf, written, 0);
- if (result != __GCONV_EMPTY_INPUT)
- @{
- if (outerr != outbuf)
- @{
- /* @r{Reset the input buffer pointer. We}
- @r{document here the complex case.} */
- size_t nstatus;
- /* @r{Reload the pointers.} */
- *inbuf = inptr;
- outbuf = outptr;
- /* @r{Possibly reset the state.} */
- /* @r{Redo the conversion, but this time}
- @r{the end of the output buffer is at}
- @r{@code{outerr}.} */
- @}
- /* @r{Change the status.} */
- status = result;
- @}
- else
- /* @r{All the output is consumed, we can make}
- @r{ another run if everything was ok.} */
- if (status == __GCONV_FULL_OUTPUT)
- status = __GCONV_OK;
- @}
- @}
- while (status == __GCONV_OK);
- /* @r{We finished one use of this step.} */
- ++data->__invocation_counter;
- @}
- return status;
- @}
- @end smallexample
- @end deftypevr
- This information should be sufficient to write new modules. Anybody
- doing so should also take a look at the available source code in the
- @glibcadj{} sources. It contains many examples of working and optimized
- modules.
- @c File charset.texi edited October 2001 by Dennis Grace, IBM Corporation
|