source: trunk/src/sed/lib/localeinfo.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 10 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 5.0 KB
Line 
1/* locale information
2
3 Copyright 2016-2022 Free Software Foundation, Inc.
4
5 This program is free software; you can redistribute it and/or modify
6 it under the terms of the GNU General Public License as published by
7 the Free Software Foundation, either version 3, or (at your option)
8 any later version.
9
10 This program is distributed in the hope that it will be useful,
11 but WITHOUT ANY WARRANTY; without even the implied warranty of
12 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
13 GNU General Public License for more details.
14
15 You should have received a copy of the GNU General Public License
16 along with this program; if not, write to the Free Software
17 Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
18 02110-1301, USA. */
19
20/* Written by Paul Eggert. */
21
22#include <config.h>
23
24#include <localeinfo.h>
25
26#include <limits.h>
27#include <locale.h>
28#include <stdlib.h>
29#include <string.h>
30#include <wctype.h>
31
32/* The sbclen implementation relies on this. */
33static_assert (MB_LEN_MAX <= SCHAR_MAX);
34
/* Report whether the current locale's multibyte encoding is UTF-8.

   The probe decodes the two bytes C4 80, which in UTF-8 encode
   U+0100.  The locale is taken to be UTF-8 only if mbrtowc consumes
   exactly both bytes and yields the wide character 0x100.  */

static bool
is_using_utf8 (void)
{
  char const probe[] = "\xc4\x80";
  mbstate_t state = {0};
  wchar_t decoded;

  /* Anything other than "two bytes consumed" (error, incomplete
     sequence, or a one-byte character) rules out UTF-8.  */
  if (mbrtowc (&decoded, probe, 2, &state) != 2)
    return false;

  return decoded == 0x100;
}
44
/* Report whether the current locale is close enough to the C locale
   to be treated as "simple": single-byte, with bytes sorting in
   collating-sequence order and no multi-character collating
   elements.  MULTIBYTE says whether the locale is multibyte; a
   multibyte locale is never simple.  */

static bool
using_simple_locale (bool multibyte)
{
  /* Compile-time check that the native character set matches the
     one assumed for the C locale.  The test is not exhaustive, but
     only ASCII and EBCDIC are in common use, and it accepts the
     former while rejecting the latter.  */
  enum
    {
      controls_match =
        ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11
         && '\f' == 12 && '\r' == 13),
      low_punct_match =
        (' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35 && '%' == 37
         && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
         && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45
         && '.' == 46 && '/' == 47),
      digits_match =
        ('0' == 48 && '9' == 57),
      mid_punct_match =
        (':' == 58 && ';' == 59 && '<' == 60 && '=' == 61 && '>' == 62
         && '?' == 63),
      upper_match =
        ('A' == 65 && 'Z' == 90),
      bracket_punct_match =
        ('[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94 && '_' == 95),
      lower_match =
        ('a' == 97 && 'z' == 122),
      high_punct_match =
        ('{' == 123 && '|' == 124 && '}' == 125 && '~' == 126),
      native_c_charset =
        (controls_match && low_punct_match && digits_match
         && mid_punct_match && upper_match && bracket_punct_match
         && lower_match && high_punct_match)
    };

  if (multibyte)
    return false;
  if (!native_c_charset)
    return false;

  /* Heuristic: ask strcoll to order each pair of adjacent byte
     values.  If every byte collates strictly before its successor,
     collation agrees with byte order and the locale is assumed to be
     simple.  This holds for all known practical locales; it would be
     fooled only by an artificial locale whose collating order equals
     byte order yet which has multi-character collating elements.  */
  for (int b = 0; b < UCHAR_MAX; b++)
    {
      char this_byte[] = {b, 0};
      char next_byte[] = {b + 1, 0};
      if (strcoll (this_byte, next_byte) >= 0)
        return false;
    }

  return true;
}
83
84/* Initialize *LOCALEINFO from the current locale. */
85
86void
87init_localeinfo (struct localeinfo *localeinfo)
88{
89 localeinfo->multibyte = MB_CUR_MAX > 1;
90 localeinfo->simple = using_simple_locale (localeinfo->multibyte);
91 localeinfo->using_utf8 = is_using_utf8 ();
92
93 for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
94 {
95 char c = i;
96 unsigned char uc = i;
97 mbstate_t s = {0};
98 wchar_t wc;
99 size_t len = mbrtowc (&wc, &c, 1, &s);
100 localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
101 localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
102 }
103}
104
/* Wide characters C for which some useful locale has both
   C != towupper (C) and C != towlower (towupper (C)); that is,
   characters that a towupper/towlower round trip does not reach.
   For instance 0x00B5 (U+00B5 MICRO SIGN) belongs here because
   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU) while
   towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).
   Every value fits in a short.  */
static short const lonesome_lower[] =
  {
    0x00B5, /* MICRO SIGN */
    0x0131, /* LATIN SMALL LETTER DOTLESS I */
    0x017F, /* LATIN SMALL LETTER LONG S */
    0x01C5, /* LATIN CAPITAL LETTER D WITH SMALL LETTER Z WITH CARON */
    0x01C8, /* LATIN CAPITAL LETTER L WITH SMALL LETTER J */
    0x01CB, /* LATIN CAPITAL LETTER N WITH SMALL LETTER J */
    0x01F2, /* LATIN CAPITAL LETTER D WITH SMALL LETTER Z */
    0x0345, /* COMBINING GREEK YPOGEGRAMMENI */
    0x03C2, /* GREEK SMALL LETTER FINAL SIGMA */
    0x03D0, /* GREEK BETA SYMBOL */
    0x03D1, /* GREEK THETA SYMBOL */
    0x03D5, /* GREEK PHI SYMBOL */
    0x03D6, /* GREEK PI SYMBOL */
    0x03F0, /* GREEK KAPPA SYMBOL */
    0x03F1, /* GREEK RHO SYMBOL */

    /* U+03F2 GREEK LUNATE SIGMA SYMBOL lacks a specific uppercase
       counterpart in locales predating Unicode 4.0.0 (April 2003).  */
    0x03F2,

    0x03F5, /* GREEK LUNATE EPSILON SYMBOL */
    0x1E9B, /* LATIN SMALL LETTER LONG S WITH DOT ABOVE */
    0x1FBE, /* GREEK PROSGEGRAMMENI */
  };
121
122/* Verify that the worst case fits. This is 1 for towupper, 1 for
123 towlower, and 1 for each entry in LONESOME_LOWER. */
124static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
125 <= CASE_FOLDED_BUFSIZE);
126
127/* Find the characters equal to C after case-folding, other than C
128 itself, and store them into FOLDED. Return the number of characters
129 stored; this is zero if C is WEOF. */
130
131int
132case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
133{
134 int i;
135 int n = 0;
136 wint_t uc = towupper (c);
137 wint_t lc = towlower (uc);
138 if (uc != c)
139 folded[n++] = uc;
140 if (lc != uc && lc != c && towupper (lc) == uc)
141 folded[n++] = lc;
142 for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
143 {
144 wint_t li = lonesome_lower[i];
145 if (li != lc && li != uc && li != c && towupper (li) == uc)
146 folded[n++] = li;
147 }
148 return n;
149}
Note: See TracBrowser for help on using the repository browser.