[3611] | 1 | /* Auxiliary program to test mbrtowc(3) behaviour.
|
---|
| 2 | Copyright 2016-2022 Free Software Foundation, Inc.
|
---|
| 3 |
|
---|
| 4 | This program is free software; you can redistribute it and/or modify
|
---|
| 5 | it under the terms of the GNU General Public License as published by
|
---|
| 6 | the Free Software Foundation; either version 3, or (at your option)
|
---|
| 7 | any later version.
|
---|
| 8 |
|
---|
| 9 | This program is distributed in the hope that it will be useful,
|
---|
| 10 | but WITHOUT ANY WARRANTY; without even the implied warranty of
|
---|
| 11 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
---|
| 12 | GNU General Public License for more details.
|
---|
| 13 |
|
---|
| 14 | You should have received a copy of the GNU General Public License
|
---|
| 15 | along with this program; If not, see <https://www.gnu.org/licenses/>. */
|
---|
| 16 |
|
---|
| 17 | /* Test the operating-system's native mbrtowc(3) function,
|
---|
| 18 | by feeding it multibyte sequences one byte at a time,
|
---|
| 19 | and reporting the result.
|
---|
| 20 |
|
---|
| 21 | The program prints the following values after each mbrtowc invocation,
|
---|
| 22 | separated by commas:
|
---|
| 23 |
|
---|
| 24 | -2 the octet contributes to a valid yet incomplete multibyte sequence
|
---|
| 25 | in the current locale.
|
---|
| 26 |
|
---|
| 27 | -1 the octet causes an encoding error.
|
---|
| 28 |
|
---|
| 29 | 0 the octet represents a NUL byte
|
---|
| 30 |
|
---|
| 31 | 1 the octet is a valid single-byte character, OR
|
---|
| 32 | completes a valid multibyte sequence.
|
---|
| 33 |
|
---|
| 34 | Because the program invokes mbrtowc(3) byte-by-byte, the reported
|
---|
| 35 | result should never be larger than 1.
|
---|
| 36 |
|
---|
| 37 | Example of typical output with UTF-8 encoding
|
---|
| 38 | ---------------------------------------------
|
---|
| 39 |
|
---|
| 40 | The unicode character 'N-ARY SUMMATION' (U+2211), encoded in UTF-8 as:
|
---|
| 41 | hex: 0xE2 0x88 0x91
|
---|
| 42 | oct: 342 210 211
|
---|
| 43 |
|
---|
| 44 | Decoding the valid sequence byte-by-byte gives:
|
---|
| 45 | $ printf '\342\210\221' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
| 46 | -2,-2,1
|
---|
| 47 |
|
---|
| 48 | '\210' is not a valid leading byte in UTF-8,
|
---|
| 49 | thus the first byte gives -1, and the 'X' is treated
|
---|
| 50 | as a valid single-byte character:
|
---|
| 51 |
|
---|
| 52 | $ printf '\210X' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
| 53 | -1,1
|
---|
| 54 |
|
---|
| 55 | '\342' is a valid yet incomplete multibyte sequence.
|
---|
| 56 | Passing it to mbrtowc results in value '-2'.
|
---|
| 57 | The following value 'X' gives an encoding error '-1'
|
---|
| 58 | (as 'X' is not a valid trailing byte in a multibyte UTF-8 sequence):
|
---|
| 59 |
|
---|
| 60 | $ printf '\342X' | LC_ALL=en_US.UTF-8 test-mbrtowc
|
---|
| 61 | -2,-1
|
---|
| 62 |
|
---|
| 63 |
|
---|
| 64 | Detecting implementation bugs in mbrtowc
|
---|
| 65 | ----------------------------------------
|
---|
| 66 |
|
---|
| 67 | UTF-8 implementation is correct on most operating systems.
|
---|
| 68 | Other multibyte locales might present more difficulties.
|
---|
| 69 | An example is the Japanese SHIFT-JIS locale under Mac OS X.
|
---|
| 70 | NOTE: The locale is 'ja_JP.SJIS' under Mac OS X, 'ja_JP.shiftjis'
|
---|
| 71 | under Ubuntu. 'ja_JP.sjis' was also found on some systems.
|
---|
| 72 |
|
---|
| 73 | Using unicode character 'KATAKANA LETTER ZE' (U+30BC)
|
---|
| 74 | UTF-8: hex: 0xE3 0x82 0xBC
|
---|
| 75 | Shift-jis hex: 0x83 0x5B
|
---|
| 76 | oct: 203 133
|
---|
| 77 |
|
---|
| 78 | The following is a valid multibyte sequence in SHIFT-JIS,
|
---|
| 79 | the first byte should result in '-2' (valid yet incomplete),
|
---|
| 80 | and the second byte should result in '1' (a valid multibyte sequence
|
---|
| 81 | completed):
|
---|
| 82 |
|
---|
| 83 | $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
| 84 | -2,1
|
---|
| 85 |
|
---|
| 86 | The following is an INVALID multibyte sequence in SHIFT-JIS
|
---|
| 87 | (The byte ':' is not valid as a second octet).
|
---|
| 88 | Buggy implementations will accept this as a valid multibyte sequence:
|
---|
| 89 |
|
---|
| 90 | # NOTE: this result indicates a buggy mbrtowc
|
---|
| 91 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
| 92 | -2,1
|
---|
| 93 |
|
---|
| 94 | A correct implementation should report '-1' for the second byte (i.e.
|
---|
| 95 | an encoding error):
|
---|
| 96 |
|
---|
| 97 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
| 98 | -2,-1
|
---|
| 99 |
|
---|
| 100 |
|
---|
| 101 | Expected results with correct implementations
|
---|
| 102 | ---------------------------------------------
|
---|
| 103 |
|
---|
| 104 | In GNU Sed some tests purposely use invalid multibyte sequences
|
---|
| 105 | to test sed's behaviour. A buggy implementation of mbrtowc
|
---|
| 106 | would result in false-alarm failures.
|
---|
| 107 |
|
---|
| 108 | The following are expected results in correct implementations:
|
---|
| 109 | (locale names are from Mac OS X):
|
---|
| 110 |
|
---|
| 111 | $ printf '\203\133' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
| 112 | -2,1
|
---|
| 113 | $ printf '\203:' | LC_ALL=ja_JP.SJIS test-mbrtowc
|
---|
| 114 | -2,-1
|
---|
| 115 | $ printf '\262C' | LC_ALL=ja_JP.eucJP test-mbrtowc
|
---|
| 116 | -2,-1
|
---|
| 117 | */
|
---|
| 118 |
|
---|
| 119 | #include <config.h>
|
---|
| 120 | #include <locale.h>
|
---|
| 121 | #include <stdio.h>
|
---|
| 122 | #include <stdlib.h>
|
---|
| 123 | #include <wchar.h>
|
---|
| 124 |
|
---|
| 125 | #include "closeout.h"
|
---|
| 126 | #include "error.h"
|
---|
| 127 | #include "progname.h"
|
---|
| 128 |
|
---|
/* Stub replacement for the non-standard err(3): report MSG as a fatal
   error on stderr and terminate the program with EXIT_FAILURE.

   error() itself prefixes the program name and appends a newline, so
   neither is repeated in the format string (doing so would print the
   program name twice and leave a blank line after the message).

   This function never returns; the 'int' return type is retained only
   so that the existing declaration and callers stay unchanged.  */
static int
die (const char *msg)
{
  error (0, 0, "error: %s", msg);
  exit (EXIT_FAILURE);
}
|
---|
| 136 |
|
---|
/* Read standard input one byte at a time, pass each byte to mbrtowc(3)
   in the locale selected by the environment, and print the return
   values as a single comma-separated line on standard output.

   Exits with EXIT_FAILURE (via die) if the locale cannot be set, if
   the input is empty, or if reading standard input fails.  */
int
main (int argc, char **argv)
{
  /* A zero-valued mbstate_t describes the initial conversion state;
     objects with static storage duration are zero-initialized.  */
  static const mbstate_t initial_state;
  mbstate_t state = initial_state;
  int c;
  int first = 1;

  (void) argc;

  set_program_name (argv[0]);
  if (!setlocale (LC_ALL, ""))
    die ("failed to set locale");

  while ((c = getchar ()) != EOF)
    {
      wchar_t wc;
      char ch = (char) c;
      size_t n = mbrtowc (&wc, &ch, 1, &state);

      /* After an encoding error the conversion state is unspecified,
         so start the next byte from the initial state instead of
         relying on whatever the implementation left behind.  */
      if (n == (size_t) -1)
        state = initial_state;

      if (!first)
        putchar (',');
      first = 0;

      /* (size_t) -1 and (size_t) -2 are reported as -1 and -2.  */
      printf ("%d", (int) n);
    }

  /* Nothing was read: distinguish a genuine read failure from an
     input stream that was simply empty.  */
  if (first)
    die (ferror (stdin) ? "read error" : "empty input");

  putchar ('\n');

  if (ferror (stdin))
    die ("read error");
  close_stdout ();

  exit (EXIT_SUCCESS);
}
|
---|