Ignore:
Timestamp:
Mar 16, 2002, 8:53:47 AM (23 years ago)
Author:
umoeller
Message:

Misc updates for Unicode.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/src/helpers/encodings.c

    r97 r147  
    33 *@@sourcefile encodings.c:
    44 *      character encoding translations.
     5 *
     6 *      See encCreateCodec for an introduction.
    57 *
    68 *@@header "encodings\base.h"
     
    3133#include "setup.h"                      // code generation and debugging options
    3234
    33 #include "encodings\base.h"             // includes all other encodings
     35#include "helpers\standards.h"
     36
     37#include "encodings\base.h"
     38#include "encodings\alltables.h"
     39// #include "encodings\collate.h"
    3440
    3541#pragma hdrstop
    3642
    37 typedef struct _ENCODINGTABLE
    38 {
    39     XWPENCODINGID   EncodingID;
    40 
    41     unsigned short  cEntries;
    42     unsigned short  ausEntries[1];        // variable size
    43 } ENCODINGTABLE, *PENCODINGTABLE;
    44 
    45 
    46 /*
    47  *@@ encRegisterEncoding:
    48  *      registers a new proprietary encoding with the engine.
    49  *
    50  *      Before you can translate encodings with this engine,
    51  *      you have to register them. This makes sure that the
    52  *      big encoding tables will only be linked to the executable
    53  *      code if they are explicitly referenced. As a result, you
    54  *      have to #include "encodings\base.h" and pass a pointer to
    55  *      one of the global tables in the header files to this
    56  *      function.
    57  *
    58  *      This returns an encoding handle that can then be used
    59  *      with the other encoding functions.
    60  *
    61  *      Example:
    62  *
    63  +          #include "encodings\base.h"
    64  +          #include "encodings\alltables.h"     // or a specific table only
    65  +
    66  +          int rc = encRegisterEncoding(&G_iso8859_1,
    67  +                                       sizeof(G_iso8859_1) / sizeof(G_iso8859_1[0]),
    68  +                                       enc_iso8859_1);    // ID to register with
    69  */
    70 
    71 long encRegisterEncoding(PXWPENCODINGMAP pEncodingMap,
    72                          unsigned long cArrayEntries,    // count of array items
    73                          XWPENCODINGID EncodingID)       // enum from encodings\base.h
    74 {
    75     long lrc = 0;
    76 
    77     unsigned short usHighest = 0;
     43#define ENCODINGENTRY(id)   enc_ ## id, G_ ## id, ARRAYITEMCOUNT(G_ ## id)
     44
     45/*
     46 *@@ G_aEncodings:
     47 *      list of all encodings supported by this engine
     48 *      (i.e. we have a corresponding codepage in
     49 *      include\encodings\*.h) together with some
     50 *      additional information for each encoding,
     51 *      such as the corresponding OS/2 codepage
     52 *      number and a descriptive string.
     53 *
     54 *@@added V [umoeller]
     55 */
     56
     57struct
     58{
     59    ENCID               id;                 // engine ID (enum)
     60    PXWPENCODINGMAP     pMap;               // ptr to map from include\encodings\*.h
     61    unsigned long       cEntries;           // entries in map (array item count)
     62    unsigned short      usCodepageOS2;      // corresponding OS/2 codepage or 0 if none
     63    ENCBYTECOUNT        bc;
     64    const char          *pcszDescription;   // description
     65} G_aEncodings[] =
     66    {
     67        ENCODINGENTRY(cp437), 437, SINGLE, "DOS Latin US",
     68        ENCODINGENTRY(cp737), 737, SINGLE, "DOS Greek",
     69        ENCODINGENTRY(cp775), 775, SINGLE, "DOS BaltRim",
     70        ENCODINGENTRY(cp850), 850, SINGLE, "DOS Latin 1",
     71        ENCODINGENTRY(cp852), 852, SINGLE, "DOS Latin 2",               // default in Hungary,
     72                                                                // Romania, Poland
     73        ENCODINGENTRY(cp855), 855, SINGLE, "DOS Cyrillic",
     74        ENCODINGENTRY(cp857), 857, SINGLE, "DOS Latin 5 (Turkish)",
     75        ENCODINGENTRY(cp860), 860, SINGLE, "DOS Portuguese",
     76        ENCODINGENTRY(cp861), 861, SINGLE, "DOS Icelandic",
     77        ENCODINGENTRY(cp862), 862, SINGLE, "DOS Hebrew",
     78        ENCODINGENTRY(cp863), 863, SINGLE, "DOS Canadian French",
     79        ENCODINGENTRY(cp864), 864, SINGLE, "DOS Arabic",                // default in Egypt
     80        ENCODINGENTRY(cp865), 865, SINGLE, "DOS Nordic",
     81        ENCODINGENTRY(cp866), 866, SINGLE, "DOS Cyrillic Russian",      // default in Russia
     82        ENCODINGENTRY(cp869), 869, SINGLE, "DOS Greek2",
     83        ENCODINGENTRY(cp874), 874, SINGLE, "DOS Thai (TIS-620)",        // default in Thailand
     84        // ENCODINGENTRY(cp932), 932 or 943?, DOUBLE, "Japanese Windows",
     85        // ENCODINGENTRY(cp936), 936 or 946?, DOUBLE, "Chinese",
     86        // ENCODINGENTRY(cp949), 951 or 949?, DOUBLE, "Korean",
     87        // ENCODINGENTRY(cp950), 947 or 950?, DOUBLE, "Taiwan Big-5",           // default in China?
     88        ENCODINGENTRY(cp1004), 1004, SINGLE, "Windows Extended",
     89        ENCODINGENTRY(cp1250), 1250, SINGLE, "Windows Latin 2",
     90        ENCODINGENTRY(cp1251), 1251, SINGLE, "Windows Cyrillic",
     91        ENCODINGENTRY(cp1252), 1252, SINGLE, "Windows Latin 1",
     92        ENCODINGENTRY(cp1253), 1253, SINGLE, "Windows Greek",
     93        ENCODINGENTRY(cp1254), 1254, SINGLE, "Windows Turkish",
     94        ENCODINGENTRY(cp1255), 1255, SINGLE, "Windows Hebrew",
     95        ENCODINGENTRY(cp1256), 1256, SINGLE, "Windows Arabic",
     96        ENCODINGENTRY(cp1257), 1257, SINGLE, "Windows Latin-4",
     97        ENCODINGENTRY(cp1258), 1258, UNKNOWN, "unknown",
     98        ENCODINGENTRY(iso8859_1), 819, SINGLE, "ISO/IEC 8859-1:1998 (Latin-1)",
     99        ENCODINGENTRY(iso8859_2), 912, SINGLE, "ISO 8859-2:1999 (Latin-2)",
     100        ENCODINGENTRY(iso8859_3), 913, SINGLE, "ISO/IEC 8859-3:1999 (Latin-3)",
     101        ENCODINGENTRY(iso8859_4), 914, SINGLE, "ISO/IEC 8859-4:1998 (Latin-4)",
     102        ENCODINGENTRY(iso8859_5), 915, SINGLE, "ISO 8859-5:1999 (Cyrillic)",
     103        ENCODINGENTRY(iso8859_6), 1089, SINGLE, "ISO 8859-6:1999 (Arabic)",
     104        ENCODINGENTRY(iso8859_7), 813, SINGLE, "ISO 8859-7:1987 (Greek)",   // default in Greece
     105        ENCODINGENTRY(iso8859_8), 916, SINGLE, "ISO/IEC 8859-8:1999 (Hebrew)",
     106        ENCODINGENTRY(iso8859_9), 920, SINGLE, "ISO/IEC 8859-9:1999 (Latin-5)",
     107        ENCODINGENTRY(iso8859_10), 0, SINGLE, "ISO/IEC 8859-10:1998",
     108        ENCODINGENTRY(iso8859_13), 0, SINGLE, "ISO/IEC 8859-13:1998",
     109        ENCODINGENTRY(iso8859_14), 0, SINGLE, "ISO/IEC 8859-14:1998",
     110        ENCODINGENTRY(iso8859_15), 923, SINGLE, "ISO/IEC 8859-15:1999",
     111
     112        UNSUPPORTED, NULL, 0, 1200, MULTI_UNICODE, "Unicode UCS-2",
     113        UNSUPPORTED, NULL, 0, 1208, MULTI_UNICODE, "Unicode UTF-8"
     114    };
     115
     116/*
     117 *@@ FindEntry:
     118 *
     119 *@@added V0.9.18 (2002-03-08) [umoeller]
     120 */
     121
     122static int FindEntry(ENCID id,
     123                     PXWPENCODINGMAP *ppMap,
     124                     unsigned long *pcEntries)
     125{
    78126    unsigned long ul;
    79 
    80     // step 1:
    81     // run through the table and calculate the highest
    82     // character entry used
    83127    for (ul = 0;
    84          ul < cArrayEntries;
     128         ul < ARRAYITEMCOUNT(G_aEncodings);
    85129         ul++)
    86130    {
    87         unsigned short usFrom = pEncodingMap[ul].usFrom;
    88         if (usFrom > usHighest)
    89             usHighest = usFrom;
     131        if (G_aEncodings[ul].id == id)
     132        {
     133            *ppMap = G_aEncodings[ul].pMap;
     134            *pcEntries = G_aEncodings[ul].cEntries;
     135            return (1);
     136        }
    90137    }
    91138
    92     // step 2: allocate encoding table
    93     if (usHighest)
     139    return (0);
     140}
     141
     142/*
     143 *@@ encFindIdForCodepage:
     144 *      returns the ENCID for the given OS/2
     145 *      codepage, or UNSUPPORTED if there's none.
     146 *
     147 *@@added V0.9.18 (2002-03-08) [umoeller]
     148 */
     149
     150ENCID encFindIdForCodepage(unsigned short usCodepage,
     151                           const char **ppcszDescription,   // out: codepage description; ptr can be NULL
     152                           ENCBYTECOUNT *pByteCount)
     153{
     154    unsigned long ul;
     155    for (ul = 0;
     156         ul < ARRAYITEMCOUNT(G_aEncodings);
     157         ul++)
    94158    {
    95         // allocate memory as needed
    96         unsigned long cb =   sizeof(ENCODINGTABLE)
    97                            + (   (usHighest - 1)
    98                                * sizeof(unsigned short)
    99                              );
    100 
    101         PENCODINGTABLE pTableNew = (PENCODINGTABLE)malloc(cb);
    102         if (pTableNew)
    103         {
    104             memset(pTableNew, -1, cb);
    105             pTableNew->cEntries = usHighest;        // array size
    106 
    107             // step 3: fill encoding table
    108             // this only has the Unicode target USHORTs;
    109             // the source is simply the offset. So to
    110             // get Unicode for character 123 in the specific encoding,
    111             // do pTableNew->ausEntries[123].
    112             // If you get 0xFFFF, the encoding is undefined.
    113 
    114             for (ul = 0;
    115                  ul < cArrayEntries;
    116                  ul++)
     159        if (G_aEncodings[ul].usCodepageOS2 == usCodepage)
     160
     161        {
     162            if (ppcszDescription)
     163                *ppcszDescription = G_aEncodings[ul].pcszDescription;
     164            if (pByteCount)
     165                *pByteCount = G_aEncodings[ul].bc;
     166            return G_aEncodings[ul].id;
     167        }
     168    }
     169
     170    return (UNSUPPORTED);
     171}
     172
     173/*
     174 *@@ encCreateCodec:
     175 *      creates a codec that can be used for conversion between
     176 *      Unicode and codepaged characters (and vice versa).
     177 *
     178 *      A codec essentially consists of two tables which can
     179 *      be used for quick index-based lookups in both directions.
     180 *      This function goes through the tables provided in
     181 *      include\encodings\*.h and builds the codec tables
     182 *      from them.
     183 *
     184 *      This function takes an encoding ID as input. Each
     185 *      codepage table in include\encodings\*.h has one
     186 *      of those IDs assigned. Use encFindIdForCodepage
     187 *      to find the ID for a given OS/2 codepage.
     188 *
     189 *      Use codecs carefully and only when they are really
     190 *      needed for a specific conversion. Building a codec
     191 *      is expensive, so you should create a codec once
     192 *      and reuse it for future conversions. In addition,
     193 *      create codecs only for the codepages that are
     194 *      actually used. Each codec will take up about
     195 *      (m + n) * sizeof(USHORT) bytes, where m and n are the highest
     196 *      codepage and Unicode characters used in the codepage.
     197 *
     198 *      Remarks:
     199 *
     200 *      --  All codepages share the first 128 characters
     201 *          (0-0x7F) with ASCII.
     202 *
     203 *      --  Since the first 128 characters (0-0x7F) in
     204 *          Unicode are equivalent to ASCII also, codecs
     205 *          are not needed if you process ASCII strings
     206 *          only.
     207 *
     208 *      --  Since the next 128 characters (0x80-0xFF) in
     209 *          Unicode are equivalent to ISO/IEC 8859-1
     210 *          (Latin-1), codecs aren't needed for those
     211 *          strings either.
     212 *
     213 *          Note that codepoints 0x80-0x9F are undefined
     214 *          in Latin-1 but used as control sequences in
     215 *          Unicode.
     216 *
     217 *      --  As far as I know, codepage 1252, which is
     218 *          used per default under Windows, is equivalent
     219 *          to Latin 1 except that it also defines
     220 *          codepoints 0x80-0x9F to certain DTP characters.
     221 *
     222 *      --  From my testing, codepage 1004 (which is
     223 *          described as "Windows-compatible" in most OS/2
     224 *          docs) is the same as codepage 1252, except for
     225 *          character 0xAF.
     226 *
     227 *      Unfortunately, OS/2 uses codepage 850 on most
     228 *      systems (and Windows uses OS/2 codepage 1252),
     229 *      so for conversion between those, codecs are needed.
     230 */
     231
     232PCONVERSION encCreateCodec(ENCID id)
     233{
     234    PXWPENCODINGMAP pEncodingMap;
     235    unsigned long cArrayEntries;
     236
     237    if (FindEntry(id,
     238                  &pEncodingMap,
     239                  &cArrayEntries))
     240    {
     241        unsigned short usHighestCP = 0,
     242                       usHighestUni = 0;
     243        unsigned long ul;
     244
     245        // step 1:
     246        // run through the table and calculate the highest
     247        // character entry used
     248        for (ul = 0;
     249             ul < cArrayEntries;
     250             ul++)
     251        {
     252            if (pEncodingMap[ul].usCP > usHighestCP)
     253                usHighestCP = pEncodingMap[ul].usCP;
     254            if (pEncodingMap[ul].usUni > usHighestUni)
     255                usHighestUni = pEncodingMap[ul].usUni;
     256        }
     257
     258        // step 2: allocate encoding table
     259        if (usHighestCP && usHighestUni)
     260        {
     261            PCONVERSION pTableNew;
     262            if (pTableNew = NEW(CONVERSION))
    117263            {
    118                 PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
    119                 pTableNew->ausEntries[pEntry->usFrom] = pEntry->usUni;
     264                unsigned long cbEntriesUniFromCP
     265                    = (usHighestCP + 1) * sizeof(unsigned short);
     266                unsigned long cbEntriesCPFromUni
     267                    = (usHighestUni + 1) * sizeof(unsigned short);
     268
     269                ZERO(pTableNew);
     270
     271                pTableNew->usHighestCP = usHighestCP;
     272                pTableNew->usHighestUni = usHighestUni;
     273
     274                if (    (pTableNew->ausEntriesUniFromCP
     275                            = (unsigned short*)malloc(cbEntriesUniFromCP))
     276                     && (pTableNew->ausEntriesCPFromUni
     277                            = (unsigned short*)malloc(cbEntriesCPFromUni))
     278                   )
     279                {
     280                    // step 3: fill encoding tables
     281
     282                    memset(pTableNew->ausEntriesUniFromCP,
     283                           0xFF,
     284                           cbEntriesUniFromCP);
     285                    memset(pTableNew->ausEntriesCPFromUni,
     286                           0xFF,
     287                           cbEntriesCPFromUni);
     288
     289                    for (ul = 0;
     290                         ul < cArrayEntries;
     291                         ul++)
     292                    {
     293                        PXWPENCODINGMAP pEntry = &pEncodingMap[ul];
     294
     295                        pTableNew->ausEntriesUniFromCP[pEntry->usCP] = pEntry->usUni;
     296
     297                        pTableNew->ausEntriesCPFromUni[pEntry->usUni] = pEntry->usCP;
     298                    }
     299
     300                    return (pTableNew);
     301                }
     302
     303                free(pTableNew);
    120304            }
    121 
    122             lrc = (long)pTableNew;
    123305        }
    124306    }
    125307
    126     return (lrc);
     308    return (NULL);
     309}
     310
     311/*
     312 *@@ encFreeCodec:
     313 *      frees a codec created with encCreateCodec
     314 *      and sets the given pointer to NULL.
     315 *
     316 *@@added V0.9.18 (2002-03-08) [umoeller]
     317 */
     318
     319void encFreeCodec(PCONVERSION *ppTable)         // in: ptr to codec ptr returned by encCreateCodec
     320{
     321    PCONVERSION pTable;
     322    if (pTable = *ppTable)
     323    {
     324        if (pTable->ausEntriesUniFromCP)
     325            free(pTable->ausEntriesUniFromCP);
     326        if (pTable->ausEntriesCPFromUni)
     327            free(pTable->ausEntriesCPFromUni);
     328        free(pTable);
     329        *ppTable = NULL;
     330    }
     331}
     332
     333/*
     334 *@@ encChar2Uni:
     335 *      converts a codepage-specific character
     336 *      to Unicode, using the given conversion
     337 *      table from encCreateCodec().
     338 *
     339 *      Returns 0xFFFF if pTable is NULL or if the
     340 *      character has no Unicode mapping in the codec.
     341 *
     342 *@@added V0.9.18 (2002-03-08) [umoeller]
     343 */
     344
     345unsigned long encChar2Uni(PCONVERSION pTable,
     346                          unsigned short c)
     347{
     348    if (    (pTable)
     349         && (c <= pTable->usHighestCP)
     350       )
     351        return (pTable->ausEntriesUniFromCP[c]);
     352
     353    return (0xFFFF);
     354}
     355
     356/*
     357 *@@ encUni2Char:
     358 *      converts a Unicode character to the
     359 *      codepage specified by the given
     360 *      conversion table from encCreateCodec().
     361 *
     362 *      Returns 0xFFFF if the Unicode character
     363 *      has no codepage equivalent.
     364 *
     365 *@@added V0.9.18 (2002-03-08) [umoeller]
     366 */
     367
     368unsigned short encUni2Char(PCONVERSION pTable,
     369                           unsigned long ulUni)
     370{
     371    if (    (pTable)
     372         && (ulUni <= pTable->usHighestUni)
     373       )
     374        return (pTable->ausEntriesCPFromUni[ulUni]);
     375
     376    return (0xFFFF);
    127377}
    128378
     
    246496}
    247497
     498#if 0
     499
     500/*
     501 *@@ encCodepageToUTF8:
     502 *
     503 *@@added V0.9.18 (2002-03-08) [umoeller]
     504 */
     505
     506void encCodepageToUTF8(const char **ppch)
     507{
     508
     509}
     510
     511putwchar(c)
     512{
     513  if (c < 0x80) {
     514    putchar (c);
     515  }
     516  else if (c < 0x800) {
     517    putchar (0xC0 | c>>6);
     518    putchar (0x80 | c & 0x3F);
     519  }
     520  else if (c < 0x10000) {
     521    putchar (0xE0 | c>>12);
     522    putchar (0x80 | c>>6 & 0x3F);
     523    putchar (0x80 | c & 0x3F);
     524  }
     525  else if (c < 0x200000) {
     526    putchar (0xF0 | c>>18);
     527    putchar (0x80 | c>>12 & 0x3F);
     528    putchar (0x80 | c>>6 & 0x3F);
     529    putchar (0x80 | c & 0x3F);
     530  }
     531}
     532
     533#endif
     534
Note: See TracChangeset for help on using the changeset viewer.