| 1 | /* locale information | 
|---|
| 2 |  | 
|---|
| 3 | Copyright 2016-2022 Free Software Foundation, Inc. | 
|---|
| 4 |  | 
|---|
| 5 | This program is free software; you can redistribute it and/or modify | 
|---|
| 6 | it under the terms of the GNU General Public License as published by | 
|---|
| 7 | the Free Software Foundation, either version 3, or (at your option) | 
|---|
| 8 | any later version. | 
|---|
| 9 |  | 
|---|
| 10 | This program is distributed in the hope that it will be useful, | 
|---|
| 11 | but WITHOUT ANY WARRANTY; without even the implied warranty of | 
|---|
| 12 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the | 
|---|
| 13 | GNU General Public License for more details. | 
|---|
| 14 |  | 
|---|
| 15 | You should have received a copy of the GNU General Public License | 
|---|
| 16 | along with this program; if not, write to the Free Software | 
|---|
| 17 | Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA | 
|---|
| 18 | 02110-1301, USA.  */ | 
|---|
| 19 |  | 
|---|
| 20 | /* Written by Paul Eggert.  */ | 
|---|
| 21 |  | 
|---|
| 22 | #include <config.h> | 
|---|
| 23 |  | 
|---|
| 24 | #include <localeinfo.h> | 
|---|
| 25 |  | 
|---|
| 26 | #include <limits.h> | 
|---|
| 27 | #include <locale.h> | 
|---|
| 28 | #include <stdlib.h> | 
|---|
| 29 | #include <string.h> | 
|---|
| 30 | #include <wctype.h> | 
|---|
| 31 |  | 
|---|
/* The sbclen implementation depends on every multibyte character
   length (at most MB_LEN_MAX) being no greater than SCHAR_MAX.  */
static_assert (SCHAR_MAX >= MB_LEN_MAX);
|---|
| 34 |  | 
|---|
/* Report whether the current locale's encoding is UTF-8.

   The check decodes the two-byte sequence 0xC4 0x80 with mbrtowc and
   succeeds only if both bytes are consumed as one character whose
   wide value is 0x100.  */

static bool
is_using_utf8 (void)
{
  static char const probe[] = "\xc4\x80";
  mbstate_t state = {0};
  wchar_t decoded;

  /* Anything other than "consumed exactly two bytes" means the
     sequence was not decoded as a single two-byte character, and
     DECODED must not be examined.  */
  if (mbrtowc (&decoded, probe, 2, &state) != 2)
    return false;

  return decoded == 0x100;
}
|---|
| 44 |  | 
|---|
/* Report whether the current locale is close enough to the C locale
   to be treated as simple: it is single-byte, bytes sort in
   collating-sequence order, and there are no multi-character
   collating elements.  MULTIBYTE says whether the locale is
   multibyte; a multibyte locale is never simple.  */

static bool
using_simple_locale (bool multibyte)
{
  /* Nonzero if the native character set is known to be compatible
     with the C locale.  The test is imperfect but good enough in
     practice: only ASCII and EBCDIC are in common use, and it
     correctly accepts ASCII and rejects EBCDIC.  */
  enum
    {
      charset_is_c_compatible =
        (/* Control characters.  */
         '\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11
         && '\f' == 12 && '\r' == 13
         /* Space and punctuation below the digits.  */
         && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
         && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40
         && ')' == 41 && '*' == 42 && '+' == 43 && ',' == 44
         && '-' == 45 && '.' == 46 && '/' == 47
         /* Digits and the punctuation that follows them.  */
         && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
         && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63
         /* Upper case and the punctuation that follows it.  */
         && 'A' == 65 && 'Z' == 90 && '[' == 91 && '\\' == 92
         && ']' == 93 && '^' == 94 && '_' == 95
         /* Lower case and the punctuation that follows it.  */
         && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
         && '}' == 125 && '~' == 126)
    };

  if (multibyte)
    return false;
  if (!charset_is_c_compatible)
    return false;

  /* Heuristic: ask strcoll whether each byte value collates strictly
     before its successor.  If collation agrees with byte order the
     locale should be simple.  This should work for all known
     practical locales, though it would be fooled by an artificial
     locale whose collating sequence matches byte order but which
     nevertheless has multi-character collating elements.  */
  char lo[2] = {0, 0};
  char hi[2] = {0, 0};
  for (int b = 0; b < UCHAR_MAX; b++)
    {
      lo[0] = b;
      hi[0] = b + 1;
      if (strcoll (lo, hi) >= 0)
        return false;
    }

  return true;
}
|---|
| 83 |  | 
|---|
| 84 | /* Initialize *LOCALEINFO from the current locale.  */ | 
|---|
| 85 |  | 
|---|
| 86 | void | 
|---|
| 87 | init_localeinfo (struct localeinfo *localeinfo) | 
|---|
| 88 | { | 
|---|
| 89 | localeinfo->multibyte = MB_CUR_MAX > 1; | 
|---|
| 90 | localeinfo->simple = using_simple_locale (localeinfo->multibyte); | 
|---|
| 91 | localeinfo->using_utf8 = is_using_utf8 (); | 
|---|
| 92 |  | 
|---|
| 93 | for (int i = CHAR_MIN; i <= CHAR_MAX; i++) | 
|---|
| 94 | { | 
|---|
| 95 | char c = i; | 
|---|
| 96 | unsigned char uc = i; | 
|---|
| 97 | mbstate_t s = {0}; | 
|---|
| 98 | wchar_t wc; | 
|---|
| 99 | size_t len = mbrtowc (&wc, &c, 1, &s); | 
|---|
| 100 | localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len; | 
|---|
| 101 | localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF; | 
|---|
| 102 | } | 
|---|
| 103 | } | 
|---|
| 104 |  | 
|---|
/* Wide character values C for which some useful locale has
   C != towupper (C) and also C != towlower (towupper (C)); that is,
   converting C to upper case and back does not return C.
   An example is 0x00B5 (U+00B5 MICRO SIGN):
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU), but
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
static short const lonesome_lower[] =
  {
    0x00B5,
    0x0131, 0x017F,
    0x01C5, 0x01C8, 0x01CB, 0x01F2,
    0x0345,
    0x03C2,
    0x03D0, 0x03D1, 0x03D5, 0x03D6,
    0x03F0, 0x03F1,

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL has no specific uppercase
       counterpart in locales that predate Unicode 4.0.0
       (April 2003).  */
    0x03F2,

    0x03F5,
    0x1E9B,
    0x1FBE,
  };
|---|
| 121 |  | 
|---|
| 122 | /* Verify that the worst case fits.  This is 1 for towupper, 1 for | 
|---|
| 123 | towlower, and 1 for each entry in LONESOME_LOWER.  */ | 
|---|
| 124 | static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower | 
|---|
| 125 | <= CASE_FOLDED_BUFSIZE); | 
|---|
| 126 |  | 
|---|
| 127 | /* Find the characters equal to C after case-folding, other than C | 
|---|
| 128 | itself, and store them into FOLDED.  Return the number of characters | 
|---|
| 129 | stored; this is zero if C is WEOF.  */ | 
|---|
| 130 |  | 
|---|
| 131 | int | 
|---|
| 132 | case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE]) | 
|---|
| 133 | { | 
|---|
| 134 | int i; | 
|---|
| 135 | int n = 0; | 
|---|
| 136 | wint_t uc = towupper (c); | 
|---|
| 137 | wint_t lc = towlower (uc); | 
|---|
| 138 | if (uc != c) | 
|---|
| 139 | folded[n++] = uc; | 
|---|
| 140 | if (lc != uc && lc != c && towupper (lc) == uc) | 
|---|
| 141 | folded[n++] = lc; | 
|---|
| 142 | for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++) | 
|---|
| 143 | { | 
|---|
| 144 | wint_t li = lonesome_lower[i]; | 
|---|
| 145 | if (li != lc && li != uc && li != c && towupper (li) == uc) | 
|---|
| 146 | folded[n++] = li; | 
|---|
| 147 | } | 
|---|
| 148 | return n; | 
|---|
| 149 | } | 
|---|