source: trunk/src/sed/testsuite/mb-charclass-non-utf8.sh@ 3634

Last change on this file since 3634 was 3613, checked in by bird, 14 months ago

src/sed: Merged in changes between 4.1.5 and 4.9 from the vendor branch. (svn merge /vendor/sed/4.1.5 /vendor/sed/current .)

  • Property svn:executable set to *
File size: 4.3 KB
Line 
1#!/bin/sh
2# Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
3# This is a stateful locale. Same byte value can be either
4# a single-byte character, or the second byte of a multibyte
5# character.
6
7# Copyright (C) 2016-2022 Free Software Foundation, Inc.
8
9# This program is free software: you can redistribute it and/or modify
10# it under the terms of the GNU General Public License as published by
11# the Free Software Foundation, either version 3 of the License, or
12# (at your option) any later version.
13
14# This program is distributed in the hope that it will be useful,
15# but WITHOUT ANY WARRANTY; without even the implied warranty of
16# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17# GNU General Public License for more details.
18
19# You should have received a copy of the GNU General Public License
20# along with this program. If not, see <https://www.gnu.org/licenses/>.
21. "${srcdir=.}/testsuite/init.sh"; path_prepend_ ./sed  # source the shared test helpers (srcdir defaults to "."); path_prepend_ presumably puts ./sed first on PATH
22print_ver_ sed  # helper from init.sh; presumably logs which sed binary/version is under test
23
24# If found, LOCALE_JA_SJIS will contain the locale name.
25require_ja_shiftjis_locale_
26
27# Ensure the implementation is not buggy (skip otherwise)
28require_valid_ja_shiftjis_locale_ "$LOCALE_JA_SJIS"
29
30# This test uses two characters:
31# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
32# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
33#
34# In SHIFT-JIS locale, these multibyte characters contain
35# open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
36#
37# See also:
38# https://en.wikipedia.org/wiki/Shift_JIS
39# http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
40
41# Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
42#
43# UTF-8: hex: 0xE3 0x82 0xBC
44# bin: 11100011 10000010 10111100
45#
46# Shift-jis hex: 0x83 0x5B
47# oct: 203 133
48# bin: 10000011 01011011
49#
50# Conversion example:
51# $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
52# 0000000 e3 82 bc
53# 343 202 274
54# 343 202 274
55
56# Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
57#
58# UTF-8: hex: 0xE3 0x82 0xBE
59# bin: 11100011 10000010 10111110
60#
61# Shift-jis hex: 0x83 0x5D
62# oct: 203 135
63# bin: 10000011 01011101
64#
65# Conversion example:
66# $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
67# 0000000 e3 82 be
68# 343 202 276
69# 343 202 276
70#
71
72
73#
74# Tests 1,2: Test y/// command with multibyte, non-utf8 sequence.
75# Implementation notes: str_append() has special code path for non-utf8 cases.
76#
77
78# Test 1: valid multibyte sequence
79printf 'y/a/\203\133/' > p1 || framework_failure_  # sed script: y/// mapping "a" to octal bytes 203 133 (KATAKANA LETTER ZE in Shift-JIS)
80echo Xa > in1 || framework_failure_  # one input line containing the character to transliterate
81printf 'X\203\133\n' > exp1 || framework_failure_  # expected output: "a" replaced by the two-byte character
82
83LC_ALL="$LOCALE_JA_SJIS" sed -f p1 <in1 >out1 || fail=1  # run sed under the Shift-JIS locale; a non-zero exit marks the test failed
84compare_ exp1 out1 || fail=1  # helper from init.sh; presumably reports any difference between expected and actual output
85
86# Test 2: invalid multibyte sequence, treated as two single-byte characters.
87printf 'y/aa/\203\060/' > p2 || framework_failure_  # sed script: two source chars "aa" mapped to octal bytes 203 060, which do not form a multibyte character
88LC_ALL="$LOCALE_JA_SJIS" sed -f p2 </dev/null 2>out2 || fail=1  # empty input: only script parsing is exercised; stderr is captured in out2
89compare_ /dev/null out2 || fail=1  # out2 must be empty, i.e. sed emitted no diagnostics
90
91#
92# Test 3: multibyte character class with these characters.
93#
94# Before sed-4.3, snarf_char_class would parse it incorrectly,
95# treating the first closing-bracket as closing the character-class,
96# instead of being part of a multibyte sequence.
97
98printf '/[\203]/]/p' > p3 || framework_failure_  # sed script: bracket expression holding octal 203 + "]" (KATAKANA LETTER ZO in Shift-JIS) and "/"
99LC_ALL="$LOCALE_JA_SJIS" sed -f p3 </dev/null >out3 || fail=1  # empty input: the test passes if sed parses the script and exits 0
100compare_ /dev/null out3 || fail=1  # with no input lines, stdout must be empty
101
102# Test 4:
103# Same as test 3, but with the other multibyte character.
104# (this did not cause a failure before sed-4.3, but the code was incorrect).
105# Keep this test for code-coverage purposes.
106printf '/[\203[/]/p' > p4 || framework_failure_  # sed script: bracket expression holding octal 203 + "[" (KATAKANA LETTER ZE in Shift-JIS) and "/"
107LC_ALL="$LOCALE_JA_SJIS" sed -f p4 </dev/null >out4 || fail=1  # empty input: the test passes if sed parses the script and exits 0
108compare_ /dev/null out4 || fail=1  # with no input lines, stdout must be empty
109
110# TODO: Find a locale in which ':.=' can be part of a valid multibyte octet.
111#
112# snarf_char_class specifically tests for five bytes: ':.=[]' .
113# '[' and ']' are tested above, yet '.:=' are not valid as part of a
114# multibyte shift-jis sequence.
115#
116# valid:
117# $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
118# $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
119#
120# invalid:
121# $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
122# iconv: (stdin):1:0: cannot convert
123#
124# $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
125# iconv: (stdin):1:0: cannot convert
126#
127# $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
128# iconv: (stdin):0:0: cannot convert
129
130Exit $fail
Note: See TracBrowser for help on using the repository browser.