Context Navigation

localeinfo.c

Last change on this file was 3613, checked in by bird, 10 months ago
src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge ^{/vendor/sed/4.1.5}/vendor/sed/current .)
File size: 5.0 KB

Rev	Line
[3611]	1	/* locale information
	2
	3	Copyright 2016-2022 Free Software Foundation, Inc.
	4
	5	This program is free software; you can redistribute it and/or modify
	6	it under the terms of the GNU General Public License as published by
	7	the Free Software Foundation, either version 3, or (at your option)
	8	any later version.
	9
	10	This program is distributed in the hope that it will be useful,
	11	but WITHOUT ANY WARRANTY; without even the implied warranty of
	12	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
	13	GNU General Public License for more details.
	14
	15	You should have received a copy of the GNU General Public License
	16	along with this program; if not, write to the Free Software
	17	Foundation, Inc., 51 Franklin Street - Fifth Floor, Boston, MA
	18	02110-1301, USA. */
	19
	20	/* Written by Paul Eggert. */
	21
	22	#include <config.h>
	23
	24	#include <localeinfo.h>
	25
	26	#include <limits.h>
	27	#include <locale.h>
	28	#include <stdlib.h>
	29	#include <string.h>
	30	#include <wctype.h>
	31
	/* The sbclen implementation relies on this bound.  The explicit
	   message keeps the assertion valid under C11, where static_assert
	   takes two arguments; the one-argument form is only standard
	   from C23 on.  */
	static_assert (MB_LEN_MAX <= SCHAR_MAX,
	               "MB_LEN_MAX must not exceed SCHAR_MAX");
	34
	/* Report whether the current locale's multibyte encoding is UTF-8.
	   The probe decodes the two bytes C4 80 and checks that they form
	   exactly one character whose wide value is 0x100.  */

	static bool
	is_using_utf8 (void)
	{
	  static char const probe[] = "\xc4\x80";
	  mbstate_t state = {0};
	  wchar_t decoded;

	  /* Anything other than "consumed both bytes" means not UTF-8.  */
	  if (mbrtowc (&decoded, probe, 2, &state) != 2)
	    return false;

	  return decoded == 0x100;
	}
	44
	/* Return true if the locale is compatible enough with the C locale so
	   that the locale is single-byte, bytes are in collating-sequence
	   order, and there are no multi-character collating elements.
	   MULTIBYTE says whether the locale is multibyte; if it is, the
	   result is always false.  */

	static bool
	using_simple_locale (bool multibyte)
	{
	  /* Nonzero if the native character set is known to be compatible
	     with the C locale.  The following test isn't perfect, but it's
	     good enough in practice, as only ASCII and EBCDIC are in common
	     use and this test correctly accepts ASCII and rejects EBCDIC.  */
	  enum { native_c_charset =
	    ('\b' == 8 && '\t' == 9 && '\n' == 10 && '\v' == 11 && '\f' == 12
	     && '\r' == 13 && ' ' == 32 && '!' == 33 && '"' == 34 && '#' == 35
	     && '%' == 37 && '&' == 38 && '\'' == 39 && '(' == 40 && ')' == 41
	     && '*' == 42 && '+' == 43 && ',' == 44 && '-' == 45 && '.' == 46
	     && '/' == 47 && '0' == 48 && '9' == 57 && ':' == 58 && ';' == 59
	     && '<' == 60 && '=' == 61 && '>' == 62 && '?' == 63 && 'A' == 65
	     && 'Z' == 90 && '[' == 91 && '\\' == 92 && ']' == 93 && '^' == 94
	     && '_' == 95 && 'a' == 97 && 'z' == 122 && '{' == 123 && '|' == 124
	     && '}' == 125 && '~' == 126)
	  };

	  if (!native_c_charset || multibyte)
	    return false;

	  /* As a heuristic, use strcoll to compare native character order.
	     If this agrees with byte order the locale should be simple.
	     This heuristic should work for all known practical locales,
	     although it would be invalid for artificially-constructed locales
	     where the native order is the collating-sequence order but there
	     are multi-character collating elements.  */
	  for (int i = 0; i < UCHAR_MAX; i++)
	    /* Each one-byte string must collate strictly before its
	       successor; a zero or positive result breaks byte order.  */
	    if (0 <= strcoll (((char []) {i, 0}), ((char []) {i + 1, 0})))
	      return false;

	  return true;
	}
	83
	84	/* Initialize LOCALEINFO from the current locale. /
	85
	86	void
	87	init_localeinfo (struct localeinfo *localeinfo)
	88	{
	89	localeinfo->multibyte = MB_CUR_MAX > 1;
	90	localeinfo->simple = using_simple_locale (localeinfo->multibyte);
	91	localeinfo->using_utf8 = is_using_utf8 ();
	92
	93	for (int i = CHAR_MIN; i <= CHAR_MAX; i++)
	94	{
	95	char c = i;
	96	unsigned char uc = i;
	97	mbstate_t s = {0};
	98	wchar_t wc;
	99	size_t len = mbrtowc (&wc, &c, 1, &s);
	100	localeinfo->sbclen[uc] = len <= 1 ? 1 : - (int) - len;
	101	localeinfo->sbctowc[uc] = len <= 1 ? wc : WEOF;
	102	}
	103	}
	104
	/* Table of wchar_t values C for which some useful locale has both
	   C != towupper (C) and C != towlower (towupper (C)).
	   As an example, 0x00B5 (U+00B5 MICRO SIGN) is listed because
	   towupper (0x00B5) == 0x039C (U+039C GREEK CAPITAL LETTER MU),
	   whereas towlower (0x039C) == 0x03BC (U+03BC GREEK SMALL LETTER MU).  */
	static const short lonesome_lower[] =
	  {
	    0x00B5, 0x0131, 0x017F, 0x01C5,
	    0x01C8, 0x01CB, 0x01F2, 0x0345,
	    0x03C2, 0x03D0, 0x03D1, 0x03D5,
	    0x03D6, 0x03F0, 0x03F1,

	    /* In locales predating Unicode 4.0.0 (April 2003), U+03F2 GREEK
	       LUNATE SIGMA SYMBOL has no specific uppercase counterpart.  */
	    0x03F2,

	    0x03F5, 0x1E9B, 0x1FBE
	  };
	121
	122	/* Verify that the worst case fits. This is 1 for towupper, 1 for
	123	towlower, and 1 for each entry in LONESOME_LOWER. */
	124	static_assert (1 + 1 + sizeof lonesome_lower / sizeof *lonesome_lower
	125	<= CASE_FOLDED_BUFSIZE);
	126
	127	/* Find the characters equal to C after case-folding, other than C
	128	itself, and store them into FOLDED. Return the number of characters
	129	stored; this is zero if C is WEOF. */
	130
	131	int
	132	case_folded_counterparts (wint_t c, wchar_t folded[CASE_FOLDED_BUFSIZE])
	133	{
	134	int i;
	135	int n = 0;
	136	wint_t uc = towupper (c);
	137	wint_t lc = towlower (uc);
	138	if (uc != c)
	139	folded[n++] = uc;
	140	if (lc != uc && lc != c && towupper (lc) == uc)
	141	folded[n++] = lc;
	142	for (i = 0; i < sizeof lonesome_lower / sizeof *lonesome_lower; i++)
	143	{
	144	wint_t li = lonesome_lower[i];
	145	if (li != lc && li != uc && li != c && towupper (li) == uc)
	146	folded[n++] = li;
	147	}
	148	return n;
	149	}

Note: See TracBrowser for help on using the repository browser.

Context Navigation

source: trunk/src/sed/lib/localeinfo.c

Download in other formats: