| 1 | /* Determine a canonical name for the current locale's character encoding.
 | 
|---|
| 2 | 
 | 
|---|
| 3 |    Copyright (C) 2000-2002 Free Software Foundation, Inc.
 | 
|---|
| 4 | 
 | 
|---|
| 5 |    This program is free software; you can redistribute it and/or modify it
 | 
|---|
| 6 |    under the terms of the GNU Library General Public License as published
 | 
|---|
| 7 |    by the Free Software Foundation; either version 2, or (at your option)
 | 
|---|
| 8 |    any later version.
 | 
|---|
| 9 | 
 | 
|---|
| 10 |    This program is distributed in the hope that it will be useful,
 | 
|---|
| 11 |    but WITHOUT ANY WARRANTY; without even the implied warranty of
 | 
|---|
| 12 |    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
 | 
|---|
| 13 |    Library General Public License for more details.
 | 
|---|
| 14 | 
 | 
|---|
| 15 |    You should have received a copy of the GNU Library General Public
 | 
|---|
| 16 |    License along with this program; if not, write to the Free Software
 | 
|---|
| 17 |    Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307,
 | 
|---|
| 18 |    USA.  */
 | 
|---|
| 19 | 
 | 
|---|
| 20 | /* Written by Bruno Haible <haible@clisp.cons.org>.  */
 | 
|---|
| 21 | 
 | 
|---|
| 22 | #ifdef HAVE_CONFIG_H
 | 
|---|
| 23 | # include <config.h>
 | 
|---|
| 24 | #endif
 | 
|---|
| 25 | 
 | 
|---|
| 26 | #if HAVE_STDDEF_H
 | 
|---|
| 27 | # include <stddef.h>
 | 
|---|
| 28 | #endif
 | 
|---|
| 29 | 
 | 
|---|
| 30 | #include <stdio.h>
 | 
|---|
| 31 | #if HAVE_STRING_H
 | 
|---|
| 32 | # include <string.h>
 | 
|---|
| 33 | #else
 | 
|---|
| 34 | # include <strings.h>
 | 
|---|
| 35 | #endif
 | 
|---|
| 36 | #if HAVE_STDLIB_H
 | 
|---|
| 37 | # include <stdlib.h>
 | 
|---|
| 38 | #endif
 | 
|---|
| 39 | 
 | 
|---|
| 40 | #if defined _WIN32 || defined __WIN32__
 | 
|---|
| 41 | # undef WIN32   /* avoid warning on mingw32 */
 | 
|---|
| 42 | # define WIN32
 | 
|---|
| 43 | #endif
 | 
|---|
| 44 | 
 | 
|---|
| 45 | #if defined __EMX__
 | 
|---|
| 46 | /* Assume EMX program runs on OS/2, even if compiled under DOS.  */
 | 
|---|
| 47 | # define OS2
 | 
|---|
| 48 | #endif
 | 
|---|
| 49 | 
 | 
|---|
| 50 | #if !defined WIN32
 | 
|---|
| 51 | # if HAVE_LANGINFO_CODESET
 | 
|---|
| 52 | #  include <langinfo.h>
 | 
|---|
| 53 | # else
 | 
|---|
| 54 | #  if HAVE_SETLOCALE
 | 
|---|
| 55 | #   include <locale.h>
 | 
|---|
| 56 | #  endif
 | 
|---|
| 57 | # endif
 | 
|---|
| 58 | #elif defined WIN32
 | 
|---|
| 59 | # define WIN32_LEAN_AND_MEAN
 | 
|---|
| 60 | # include <windows.h>
 | 
|---|
| 61 | #endif
 | 
|---|
| 62 | #if defined OS2
 | 
|---|
| 63 | # define INCL_DOS
 | 
|---|
| 64 | # include <os2.h>
 | 
|---|
| 65 | #endif
 | 
|---|
| 66 | 
 | 
|---|
| 67 | #if defined _WIN32 || defined __WIN32__ || defined __EMX__ || defined __DJGPP__
 | 
|---|
| 68 |   /* Win32, OS/2, DOS */
 | 
|---|
| 69 | # define ISSLASH(C) ((C) == '/' || (C) == '\\')
 | 
|---|
| 70 | #endif
 | 
|---|
| 71 | 
 | 
|---|
| 72 | #ifndef DIRECTORY_SEPARATOR
 | 
|---|
| 73 | # define DIRECTORY_SEPARATOR '/'
 | 
|---|
| 74 | #endif
 | 
|---|
| 75 | 
 | 
|---|
| 76 | #ifndef ISSLASH
 | 
|---|
| 77 | # define ISSLASH(C) ((C) == DIRECTORY_SEPARATOR)
 | 
|---|
| 78 | #endif
 | 
|---|
| 79 | 
 | 
|---|
| 80 | #ifdef HAVE_GETC_UNLOCKED
 | 
|---|
| 81 | # undef getc
 | 
|---|
| 82 | # define getc getc_unlocked
 | 
|---|
| 83 | #endif
 | 
|---|
| 84 | 
 | 
|---|
| 85 | /* The following static variable is declared 'volatile' to avoid a
 | 
|---|
| 86 |    possible multithread problem in the function get_charset_aliases. If we
 | 
|---|
| 87 |    are running in a threaded environment, and if two threads initialize
 | 
|---|
| 88 |    'charset_aliases' simultaneously, both will produce the same value,
 | 
|---|
| 89 |    and everything will be ok if the two assignments to 'charset_aliases'
 | 
|---|
| 90 |    are atomic. But I don't know what will happen if the two assignments mix.  */
 | 
|---|
| 91 | #if __STDC__ != 1
 | 
|---|
| 92 | # define volatile /* empty */
 | 
|---|
| 93 | #endif
 | 
|---|
| 94 | /* Pointer to the contents of the charset.alias file, if it has already been
 | 
|---|
| 95 |    read, else NULL.  Its format is:
 | 
|---|
| 96 |    ALIAS_1 '\0' CANONICAL_1 '\0' ... ALIAS_n '\0' CANONICAL_n '\0' '\0'  */
 | 
|---|
| 97 | static const char * volatile charset_aliases;
 | 
|---|
| 98 | 
 | 
|---|
| 99 | /* Return a pointer to the contents of the charset.alias file.  */
 | 
|---|
| 100 | static const char *
 | 
|---|
| 101 | get_charset_aliases ()
 | 
|---|
| 102 | {
 | 
|---|
| 103 |   const char *cp;
 | 
|---|
| 104 | 
 | 
|---|
| 105 |   cp = charset_aliases;
 | 
|---|
| 106 |   if (cp == NULL)
 | 
|---|
| 107 |     {
 | 
|---|
| 108 | #if !defined WIN32
 | 
|---|
| 109 |       FILE *fp;
 | 
|---|
| 110 |       const char *dir = LIBDIR;
 | 
|---|
| 111 |       const char *base = "charset.alias";
 | 
|---|
| 112 |       char *file_name;
 | 
|---|
| 113 | 
 | 
|---|
| 114 |       /* Concatenate dir and base into freshly allocated file_name.  */
 | 
|---|
| 115 |       {
 | 
|---|
| 116 |         size_t dir_len = strlen (dir);
 | 
|---|
| 117 |         size_t base_len = strlen (base);
 | 
|---|
| 118 |         int add_slash = (dir_len > 0 && !ISSLASH (dir[dir_len - 1]));
 | 
|---|
| 119 |         file_name = (char *) malloc (dir_len + add_slash + base_len + 1);
 | 
|---|
| 120 |         if (file_name != NULL)
 | 
|---|
| 121 |           {
 | 
|---|
| 122 |             memcpy (file_name, dir, dir_len);
 | 
|---|
| 123 |             if (add_slash)
 | 
|---|
| 124 |               file_name[dir_len] = DIRECTORY_SEPARATOR;
 | 
|---|
| 125 |             memcpy (file_name + dir_len + add_slash, base, base_len + 1);
 | 
|---|
| 126 |           }
 | 
|---|
| 127 |       }
 | 
|---|
| 128 | 
 | 
|---|
| 129 |       if (file_name == NULL || (fp = fopen (file_name, "r")) == NULL)
 | 
|---|
| 130 |         /* Out of memory or file not found, treat it as empty.  */
 | 
|---|
| 131 |         cp = "";
 | 
|---|
| 132 |       else
 | 
|---|
| 133 |         {
 | 
|---|
| 134 |           /* Parse the file's contents.  */
 | 
|---|
| 135 |           int c;
 | 
|---|
| 136 |           char buf1[50+1];
 | 
|---|
| 137 |           char buf2[50+1];
 | 
|---|
| 138 |           char *res_ptr = NULL;
 | 
|---|
| 139 |           size_t res_size = 0;
 | 
|---|
| 140 |           size_t l1, l2;
 | 
|---|
| 141 | 
 | 
|---|
| 142 |           for (;;)
 | 
|---|
| 143 |             {
 | 
|---|
| 144 |               c = getc (fp);
 | 
|---|
| 145 |               if (c == EOF)
 | 
|---|
| 146 |                 break;
 | 
|---|
| 147 |               if (c == '\n' || c == ' ' || c == '\t')
 | 
|---|
| 148 |                 continue;
 | 
|---|
| 149 |               if (c == '#')
 | 
|---|
| 150 |                 {
 | 
|---|
| 151 |                   /* Skip comment, to end of line.  */
 | 
|---|
| 152 |                   do
 | 
|---|
| 153 |                     c = getc (fp);
 | 
|---|
| 154 |                   while (!(c == EOF || c == '\n'));
 | 
|---|
| 155 |                   if (c == EOF)
 | 
|---|
| 156 |                     break;
 | 
|---|
| 157 |                   continue;
 | 
|---|
| 158 |                 }
 | 
|---|
| 159 |               ungetc (c, fp);
 | 
|---|
| 160 |               if (fscanf (fp, "%50s %50s", buf1, buf2) < 2)
 | 
|---|
| 161 |                 break;
 | 
|---|
| 162 |               l1 = strlen (buf1);
 | 
|---|
| 163 |               l2 = strlen (buf2);
 | 
|---|
| 164 |               if (res_size == 0)
 | 
|---|
| 165 |                 {
 | 
|---|
| 166 |                   res_size = l1 + 1 + l2 + 1;
 | 
|---|
| 167 |                   res_ptr = (char *) malloc (res_size + 1);
 | 
|---|
| 168 |                 }
 | 
|---|
| 169 |               else
 | 
|---|
| 170 |                 {
 | 
|---|
| 171 |                   res_size += l1 + 1 + l2 + 1;
 | 
|---|
| 172 |                   res_ptr = (char *) realloc (res_ptr, res_size + 1);
 | 
|---|
| 173 |                 }
 | 
|---|
| 174 |               if (res_ptr == NULL)
 | 
|---|
| 175 |                 {
 | 
|---|
| 176 |                   /* Out of memory. */
 | 
|---|
| 177 |                   res_size = 0;
 | 
|---|
| 178 |                   break;
 | 
|---|
| 179 |                 }
 | 
|---|
| 180 |               strcpy (res_ptr + res_size - (l2 + 1) - (l1 + 1), buf1);
 | 
|---|
| 181 |               strcpy (res_ptr + res_size - (l2 + 1), buf2);
 | 
|---|
| 182 |             }
 | 
|---|
| 183 |           fclose (fp);
 | 
|---|
| 184 |           if (res_size == 0)
 | 
|---|
| 185 |             cp = "";
 | 
|---|
| 186 |           else
 | 
|---|
| 187 |             {
 | 
|---|
| 188 |               *(res_ptr + res_size) = '\0';
 | 
|---|
| 189 |               cp = res_ptr;
 | 
|---|
| 190 |             }
 | 
|---|
| 191 |         }
 | 
|---|
| 192 | 
 | 
|---|
| 193 |       if (file_name != NULL)
 | 
|---|
| 194 |         free (file_name);
 | 
|---|
| 195 | 
 | 
|---|
| 196 | #else
 | 
|---|
| 197 | 
 | 
|---|
| 198 |       /* To avoid the troubles of installing a separate file in the same
 | 
|---|
| 199 |          directory as the DLL and of retrieving the DLL's directory at
 | 
|---|
| 200 |          runtime, simply inline the aliases here.  */
 | 
|---|
| 201 | 
 | 
|---|
| 202 | # if defined WIN32
 | 
|---|
| 203 |       cp = "CP936" "\0" "GBK" "\0"
 | 
|---|
| 204 |            "CP1361" "\0" "JOHAB" "\0";
 | 
|---|
| 205 | # endif
 | 
|---|
| 206 | #endif
 | 
|---|
| 207 | 
 | 
|---|
| 208 |       charset_aliases = cp;
 | 
|---|
| 209 |     }
 | 
|---|
| 210 | 
 | 
|---|
| 211 |   return cp;
 | 
|---|
| 212 | }
 | 
|---|
| 213 | 
 | 
|---|
/* Determine the current locale's character encoding, and canonicalize it
   into one of the canonical names listed in config.charset.

   The platform-specific name is obtained from nl_langinfo (CODESET), from
   the LC_ALL / LC_CTYPE / LANG environment variables, or from the active
   code page (Win32, OS/2), and is then looked up in the alias table
   returned by get_charset_aliases ().

   The result must not be freed or modified by the caller: depending on the
   path taken it points into a static buffer, the alias table, an
   environment string, the value returned by nl_langinfo, or a string
   literal.
   If the canonical name cannot be determined, the result is a non-canonical
   name (possibly the empty string).  */

#ifdef STATIC
STATIC
#endif
const char *
locale_charset ()
{
  const char *codeset;
  const char *aliases;

#if !(defined WIN32 || defined OS2)

# if HAVE_LANGINFO_CODESET

  /* Most systems support nl_langinfo (CODESET) nowadays.  */
  codeset = nl_langinfo (CODESET);

# else

  /* On old systems which lack it, use setlocale or getenv.  */
  const char *locale = NULL;

  /* But most old systems don't have a complete set of locales.  Some
     (like SunOS 4 or DJGPP) have only the C locale.  Therefore we don't
     use setlocale here; it would return "C" when it doesn't support the
     locale name the user has set.  */
#  if HAVE_SETLOCALE && 0
  locale = setlocale (LC_CTYPE, NULL);
#  endif
  /* Environment lookup in the usual precedence order:
     LC_ALL, then LC_CTYPE, then LANG.  */
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_ALL");
      if (locale == NULL || locale[0] == '\0')
        {
          locale = getenv ("LC_CTYPE");
          if (locale == NULL || locale[0] == '\0')
            locale = getenv ("LANG");
        }
    }

  /* On some old systems, one used to set locale = "iso8859_1". On others,
     you set it to "language_COUNTRY.charset". In any case, we resolve it
     through the charset.alias file.  */
  codeset = locale;

# endif

#elif defined WIN32

  /* "CP" + up to 10 decimal digits + NUL.  */
  static char buf[2 + 10 + 1];

  /* Win32 has a function returning the locale's codepage as a number.  */
  sprintf (buf, "CP%u", GetACP ());
  codeset = buf;

#elif defined OS2

  const char *locale;
  /* "CP" + up to 10 decimal digits + NUL.  */
  static char buf[2 + 10 + 1];
  ULONG cp[3];
  ULONG cplen;

  /* Allow user to override the codeset, as set in the operating system,
     with standard language environment variables.  */
  locale = getenv ("LC_ALL");
  if (locale == NULL || locale[0] == '\0')
    {
      locale = getenv ("LC_CTYPE");
      if (locale == NULL || locale[0] == '\0')
        locale = getenv ("LANG");
    }
  if (locale != NULL && locale[0] != '\0')
    {
      /* If the locale name contains an encoding after the dot, return it.
         Note that these early returns bypass the alias lookup below.  */
      const char *dot = strchr (locale, '.');

      if (dot != NULL)
        {
          const char *modifier;

          dot++;
          /* Look for the possible @... trailer and remove it, if any.  */
          modifier = strchr (dot, '@');
          if (modifier == NULL)
            return dot;
          /* modifier was found at or after dot, so the difference is
             non-negative; compare as unsigned.  If the encoding does not
             fit into buf, fall through and use the whole locale name.  */
          if ((size_t) (modifier - dot) < sizeof (buf))
            {
              memcpy (buf, dot, modifier - dot);
              buf [modifier - dot] = '\0';
              return buf;
            }
        }

      /* Resolve through the charset.alias file.  */
      codeset = locale;
    }
  else
    {
      /* OS/2 has a function returning the locale's codepage as a number.
         A nonzero return value is treated as failure.  */
      if (DosQueryCp (sizeof (cp), cp, &cplen))
        codeset = "";
      else
        {
          /* cp[0] has type ULONG; convert it explicitly so that the
             argument type matches the "%u" conversion.  */
          sprintf (buf, "CP%u", (unsigned int) cp[0]);
          codeset = buf;
        }
    }

#endif

  if (codeset == NULL)
    /* The canonical name cannot be determined.  */
    codeset = "";

  /* Resolve alias.  The table is a sequence of (alias, canonical) string
     pairs ended by an empty string; each step skips one whole pair.  An
     alias of "*" matches any codeset.  */
  for (aliases = get_charset_aliases ();
       *aliases != '\0';
       aliases += strlen (aliases) + 1, aliases += strlen (aliases) + 1)
    if (strcmp (codeset, aliases) == 0
        || (aliases[0] == '*' && aliases[1] == '\0'))
      {
        /* The canonical name directly follows the alias.  */
        codeset = aliases + strlen (aliases) + 1;
        break;
      }

  return codeset;
}
 | 
|---|