source: trunk/src/sed/testsuite/test-mbrtowc.c@ 3613

Last change on this file since 3613 was 3613, checked in by bird, 10 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

File size: 4.8 KB
Line 
1/* Auxiliary program to test mbrtowc(3) behaviour.
2 Copyright 2016-2022 Free Software Foundation, Inc.
3
4 This program is free software; you can redistribute it and/or modify
5 it under the terms of the GNU General Public License as published by
6 the Free Software Foundation; either version 3, or (at your option)
7 any later version.
8
9 This program is distributed in the hope that it will be useful,
10 but WITHOUT ANY WARRANTY; without even the implied warranty of
11 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
12 GNU General Public License for more details.
13
14 You should have received a copy of the GNU General Public License
15 along with this program; If not, see <https://www.gnu.org/licenses/>. */
16
17/* Test the operating-system's native mbrtowc(3) function,
18 by feeding it multibyte sequences one byte at a time,
19 and reporting the result.
20
21 The program prints the following values after each mbrtowc invocation,
22 separated by commas:
23
24 -2 the octet contributes to a valid yet incomplete multibyte sequence
25 in the current locale.
26
27 -1 the octet causes an encoding error.
28
29 0 the octet represents a NUL byte
30
31 1 the octet is a valid single-byte character, OR
32 completes a valid multibyte sequence.
33
34 Because the program invokes mbrtowc(3) byte-by-byte, the reported
35 result should never be larger than 1.
36
37 Example of typical output with UTF-8 encoding
38 ---------------------------------------------
39
40 The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
41 hex: 0xE2 0x88 0x91
42 oct: 342 210 211
43
44 Decoding the valid sequence byte-by-byte gives:
45 $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
46 -2,-2,1
47
48 '\210' is not a valid leading byte in UTF-8,
49 thus the first byte gives -1, and the 'X' is treated
50 as a valid single-byte character:
51
52 $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
53 -1,1
54
55 '\342' is a valid yet incomplete multibyte sequence.
56 Passing it to mbrtowc results in value '-2'.
57 The following value 'X' gives an encoding error '-1'
58 (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
59
60 $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
61 -2,-1
62
63
64 Detecting implementation bugs in mbrtowc
65 ----------------------------------------
66
67 UTF-8 implementation is correct on most operating systems.
68 Other multibyte locales might present more difficulties.
69 An example is the Japanese SHIFT-JIS locale under Mac OS X.
70 NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
71 under Ubuntu. 'ja_JP.sjis' was also found on some systems.
72
73 Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
74 UTF-8: hex: 0xE3 0x82 0xBC
75 Shift-jis hex: 0x83 0x5B
76 oct: 203 133
77
78 The following is a valid multibyte sequence in SHIFT-JIS,
79 the first byte should result in '-2' (valid yet incomplete),
80 and the second byte should result in '1' (a valid multibyte sequence
81 completed):
82
83 $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
84 -2,1
85
86 The following is an INVALID multibyte sequence in SHIFT-JIS
87 (The byte ':' is not valid as a second octet).
88 Buggy implementations will accept this as a valid multibyte sequence:
89
90 # NOTE: this result indicates a buggy mbrtowc
91 $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
92 -2,1
93
94 A correct implementation should report '-1' for the second byte (i.e.
95 an encoding error):
96
97 $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
98 -2,-1
99
100
101 Expected results with correct implementations
102 ---------------------------------------------
103
104 In GNU Sed some tests purposely use invalid multibyte sequences
105 to test sed's behaviour. A buggy implementation of mbrtowc
106 would result in false-alarm failures.
107
108 The following are expected results in correct implementations:
109 (locale names are from Mac OS X):
110
111 $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
112 -2,1
113 $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
114 -2,-1
115 $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
116 -2,-1
117*/
118
119#include <config.h>
120#include <locale.h>
121#include <stdio.h>
122#include <stdlib.h>
123#include <wchar.h>
124
125#include "closeout.h"
126#include "error.h"
127#include "progname.h"
128
/* Stub replacement for the non-standard err(3): report MSG on stderr
   and terminate the program with EXIT_FAILURE.

   This function never returns.  The int return type is kept so that
   existing callers are unaffected; no value is ever produced.

   GNU error() already prefixes the program name and appends the
   trailing newline, so neither is repeated in the format string
   (doing so printed the name twice followed by a blank line).  */
static int
die (const char *msg)
{
  error (0, 0, "error: %s", msg);
  exit (EXIT_FAILURE);
}
136
/* Read standard input one byte at a time, pass each byte on its own to
   mbrtowc(3) under the locale selected by the environment, and print
   the return values as a comma-separated list on a single line.

   Exits with EXIT_FAILURE (via die) if the locale cannot be set, if
   reading stdin fails, or if the input is empty; EXIT_SUCCESS
   otherwise.  */
int
main (int argc, char **argv)
{
  int c;
  int first = 1;

  set_program_name (argv[0]);
  if (!setlocale (LC_ALL, ""))
    die ("failed to set locale");

  while ((c = getchar ()) != EOF)
    {
      wchar_t wc;
      char ch = (unsigned char) c;
      /* A NULL state pointer makes mbrtowc use its own internal
         conversion state, so partial sequences carry over between
         successive one-byte calls.  */
      int i = (int) mbrtowc (&wc, &ch, 1, NULL);

      if (!first)
        putchar (',');
      first = 0;

      printf ("%d", i);
    }

  /* Check for a read failure before the empty-input test: getchar
     returns EOF in both cases, and a failure on the very first byte
     must not be misreported as "empty input".  */
  if (ferror (stdin))
    {
      /* Terminate any partial result line already written.  */
      if (!first)
        putchar ('\n');
      die ("read error");
    }

  if (first)
    die ("empty input");

  putchar ('\n');

  close_stdout ();

  exit (EXIT_SUCCESS);
}
Note: See TracBrowser for help on using the repository browser.