1 | /* Auxiliary program to test mbrtowc(3) behaviour.
|
---|
2 | Copyright 2016-2022 Free Software Foundation, Inc.
|
---|
3 |
|
---|
4 | This program is free software; you can redistribute it and/or modify
|
---|
5 | it under the terms of the GNU General Public License as published by
|
---|
6 | the Free Software Foundation; either version 3, or (at your option)
|
---|
7 | any later version.
|
---|
8 |
|
---|
9 | This program is distributed in the hope that it will be useful,
|
---|
10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
12 | GNU General Public License for more details.
|
---|
13 |
|
---|
14 | You should have received a copy of the GNU General Public License
|
---|
15 | along with this program; If not, see <https://www.gnu.org/licenses/>. */
|
---|
16 |
|
---|
17 | /* Test the operating-system's native mbrtowc(3) function,
|
---|
18 | by feeding it multibyte sequences one byte at a time,
|
---|
19 | and reporting the result.
|
---|
20 |
|
---|
21 | The program prints the following values after each mbrtowc invocation,
|
---|
22 | separated by commas:
|
---|
23 |
|
---|
24 | -2 the octet contributes to a valid yet incomplete multibyte sequence
|
---|
25 | in the current locale.
|
---|
26 |
|
---|
27 | -1 the octet causes an encoding error.
|
---|
28 |
|
---|
29 | 0 the octet represents a NUL byte
|
---|
30 |
|
---|
31 | 1 the octet is a valid single-byte character, OR
|
---|
32 | completes a valid multibyte sequence.
|
---|
33 |
|
---|
34 | Because the program invokes mbrtowc(3) byte-by-byte, the reported
|
---|
35 | result should never be larger than 1.
|
---|
36 |
|
---|
37 | Example of typical output with UTF-8 encoding
|
---|
38 | ---------------------------------------------
|
---|
39 |
|
---|
40 | The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
|
---|
41 | hex: 0xE2 0x88 0x91
|
---|
42 | oct: 342 210 211
|
---|
43 |
|
---|
44 | Decoding the valid sequence byte-by-byte gives:
|
---|
45 | $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
46 | -2,-2,1
|
---|
47 |
|
---|
48 | '\210' is not a valid leading byte in UTF-8,
|
---|
49 | thus the first byte gives -1, and the 'X' is treated
|
---|
50 | as a valid single-byte character:
|
---|
51 |
|
---|
52 | $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
53 | -1,1
|
---|
54 |
|
---|
55 | '\342' is a valid yet incomplete multibyte sequence.
|
---|
56 | Passing it to mbrtowc results in value '-2'.
|
---|
57 | The following value 'X' gives an encoding error '-1'
|
---|
58 | (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
|
---|
59 |
|
---|
60 | $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
61 | -2,-1
|
---|
62 |
|
---|
63 |
|
---|
64 | Detecting implementation bugs in mbrtowc
|
---|
65 | ----------------------------------------
|
---|
66 |
|
---|
67 | UTF-8 implementation is correct on most operating systems.
|
---|
68 | Other multibyte locales might present more difficulties.
|
---|
69 | An example is the Japanese SHIFT-JIS locale under Mac OS X.
|
---|
70 | NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
|
---|
71 | under Ubuntu. 'ja_JP.sjis' was also found on some systems.
|
---|
72 |
|
---|
73 | Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
|
---|
74 | UTF-8: hex: 0xE3 0x82 0xBC
|
---|
75 | Shift-jis hex: 0x83 0x5B
|
---|
76 | oct: 203 133
|
---|
77 |
|
---|
78 | The following is a valid multibyte sequence in SHIFT-JIS,
|
---|
79 | the first byte should result in '-2' (valid yet incomplete),
|
---|
80 | and the second byte should result in '1' (a valid multibyte sequence
|
---|
81 | completed):
|
---|
82 |
|
---|
83 | $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
84 | -2,1
|
---|
85 |
|
---|
86 | The following is an INVALID multibyte sequence in SHIFT-JIS
|
---|
87 | (The byte ':' is not valid as a second octet).
|
---|
88 | Buggy implementations will accept this as a valid multibyte sequence:
|
---|
89 |
|
---|
90 | # NOTE: this result indicates a buggy mbrtowc
|
---|
91 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
92 | -2,1
|
---|
93 |
|
---|
94 | A correct implementation should report '-1' for the second byte (i.e.
|
---|
95 | an encoding error):
|
---|
96 |
|
---|
97 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
98 | -2,-1
|
---|
99 |
|
---|
100 |
|
---|
101 | Expected results with correct implementations
|
---|
102 | ---------------------------------------------
|
---|
103 |
|
---|
104 | In GNU Sed some tests purposely use invalid multibyte sequences
|
---|
105 | to test sed's behaviour. A buggy implementation of mbrtowc
|
---|
106 | would result in false-alarm failures.
|
---|
107 |
|
---|
108 | The following are expected results in correct implementations:
|
---|
109 | (locale names are from Mac OS X):
|
---|
110 |
|
---|
111 | $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
112 | -2,1
|
---|
113 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
114 | -2,-1
|
---|
115 | $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
|
---|
116 | -2,-1
|
---|
117 | */
|
---|
118 |
|
---|
119 | #include <config.h>
|
---|
120 | #include <locale.h>
|
---|
121 | #include <stdio.h>
|
---|
122 | #include <stdlib.h>
|
---|
123 | #include <wchar.h>
|
---|
124 |
|
---|
125 | #include "closeout.h"
|
---|
126 | #include "error.h"
|
---|
127 | #include "progname.h"
|
---|
128 |
|
---|
/* Stub replacement for the non-standard err(3).

   Report MSG as a fatal error on standard error and terminate the
   program with EXIT_FAILURE.  This function never returns; the 'int'
   return type is kept only for compatibility with existing callers.

   error() itself prefixes the message with the program name and
   appends a newline, so the format string must not repeat either.  */
static int
die (const char *msg)
{
  error (0, 0, "error: %s", msg);
  exit (EXIT_FAILURE);
}
136 |
|
---|
/* Read standard input one byte at a time, feed each byte to mbrtowc(3)
   under the locale selected by the environment, and print the return
   values as a comma-separated list terminated by a newline.

   Exits with EXIT_FAILURE (via die) when the locale cannot be set,
   when the input is empty, or when reading standard input fails.  */
int
main (int argc, char **argv)
{
  /* A zero-valued mbstate_t describes the initial conversion state;
     objects with static storage duration are zero-initialized.  */
  static const mbstate_t initial_state;
  mbstate_t state = initial_state;
  int first = 1;
  int c;

  set_program_name (argv[0]);
  if (!setlocale (LC_ALL, ""))
    die ("failed to set locale");

  while ((c = getchar ()) != EOF)
    {
      wchar_t wc;
      char ch = (char) c;
      size_t n = mbrtowc (&wc, &ch, 1, &state);
      int result;

      /* Map the two special size_t return values explicitly instead of
         relying on an implementation-defined size_t -> int conversion.  */
      if (n == (size_t) -1)
        {
          /* The conversion state is unspecified after an encoding
             error; start afresh so that the bytes that follow are
             decoded from a well-defined state.  */
          state = initial_state;
          result = -1;
        }
      else if (n == (size_t) -2)
        result = -2;
      else
        result = (int) n;

      if (!first)
        putchar (',');
      first = 0;

      printf ("%d", result);
    }

  /* Nothing was read: tell a failed read apart from truly empty input.  */
  if (first)
    die (ferror (stdin) ? "read error" : "empty input");

  putchar ('\n');

  if (ferror (stdin))
    die ("read error");
  close_stdout ();

  exit (EXIT_SUCCESS);
}