| 1 | #!/bin/sh
|
|---|
| 2 | # Test multibyte locale which is not UTF-8 (ja_JP.shift_jis)
|
|---|
| 3 | # The encoding is context-dependent: the same byte value can be either
|
|---|
| 4 | # a single-byte character, or the second byte of a multibyte
|
|---|
| 5 | # character.
|
|---|
| 6 |
|
|---|
| 7 | # Copyright (C) 2016-2022 Free Software Foundation, Inc.
|
|---|
| 8 |
|
|---|
| 9 | # This program is free software: you can redistribute it and/or modify
|
|---|
| 10 | # it under the terms of the GNU General Public License as published by
|
|---|
| 11 | # the Free Software Foundation, either version 3 of the License, or
|
|---|
| 12 | # (at your option) any later version.
|
|---|
| 13 |
|
|---|
| 14 | # This program is distributed in the hope that it will be useful,
|
|---|
| 15 | # but WITHOUT ANY WARRANTY; without even the implied warranty of
|
|---|
| 16 | # MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
|
|---|
| 17 | # GNU General Public License for more details.
|
|---|
| 18 |
|
|---|
| 19 | # You should have received a copy of the GNU General Public License
|
|---|
| 20 | # along with this program. If not, see <https://www.gnu.org/licenses/>.
|
|---|
| 21 | . "${srcdir=.}/testsuite/init.sh"; path_prepend_ ./sed
|
|---|
| 22 | print_ver_ sed
|
|---|
| 23 |
|
|---|
| 24 | # Require a Japanese Shift-JIS locale. If found, LOCALE_JA_SJIS will contain the locale name.
|
|---|
| 25 | require_ja_shiftjis_locale_
|
|---|
| 26 |
|
|---|
| 27 | # Ensure the locale implementation is not buggy (skip the test otherwise).
|
|---|
| 28 | require_valid_ja_shiftjis_locale_ "$LOCALE_JA_SJIS"
|
|---|
| 29 |
|
|---|
| 30 | # This test uses two characters:
|
|---|
| 31 | # Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
|
|---|
| 32 | # Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
|
|---|
| 33 | #
|
|---|
| 34 | # In SHIFT-JIS locale, these multibyte characters contain
|
|---|
| 35 | # open/close brackets (ASCII 0x5B/0x5D) as the trailing byte.
|
|---|
| 36 | #
|
|---|
| 37 | # See also:
|
|---|
| 38 | # https://en.wikipedia.org/wiki/Shift_JIS
|
|---|
| 39 | # http://www.rikai.com/library/kanjitables/kanji_codes.sjis.shtml
|
|---|
| 40 |
|
|---|
| 41 | # Unicode Character 'KATAKANA LETTER ZE' (U+30BC)
|
|---|
| 42 | #
|
|---|
| 43 | # UTF-8: hex: 0xE3 0x82 0xBC
|
|---|
| 44 | # bin: 11100011 10000010 10111100
|
|---|
| 45 | #
|
|---|
| 46 | # Shift-jis hex: 0x83 0x5B
|
|---|
| 47 | # oct: 203 133
|
|---|
| 48 | # bin: 10000011 01011011
|
|---|
| 49 | #
|
|---|
| 50 | # Conversion example:
|
|---|
| 51 | # $ printf '\x83\x5B' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
|
|---|
| 52 | # 0000000 e3 82 bc
|
|---|
| 53 | # 343 202 274
|
|---|
| 54 | # 343 202 274
|
|---|
| 55 |
|
|---|
| 56 | # Unicode Character 'KATAKANA LETTER ZO' (U+30BE)
|
|---|
| 57 | #
|
|---|
| 58 | # UTF-8: hex: 0xE3 0x82 0xBE
|
|---|
| 59 | # bin: 11100011 10000010 10111110
|
|---|
| 60 | #
|
|---|
| 61 | # Shift-jis hex: 0x83 0x5D
|
|---|
| 62 | # oct: 203 135
|
|---|
| 63 | # bin: 10000011 01011101
|
|---|
| 64 | #
|
|---|
| 65 | # Conversion example:
|
|---|
| 66 | # $ printf '\x83\x5D' | iconv -f SHIFT-JIS -t UTF-8 | od -tx1o1c
|
|---|
| 67 | # 0000000 e3 82 be
|
|---|
| 68 | # 343 202 276
|
|---|
| 69 | # 343 202 276
|
|---|
| 70 | #
|
|---|
| 71 |
|
|---|
| 72 |
|
|---|
| 73 | #
|
|---|
| 74 | # Tests 1,2: Test y/// command with multibyte, non-utf8 sequence.
|
|---|
| 75 | # Implementation notes: str_append() has special code path for non-utf8 cases.
|
|---|
| 76 | #
|
|---|
| 77 |
|
|---|
| 78 | # Test 1: valid multibyte sequence. y/// maps 'a' to the two-byte character \203\133, so the input "Xa" must come out as "X\203\133".
|
|---|
| 79 | printf 'y/a/\203\133/' > p1 || framework_failure_
|
|---|
| 80 | echo Xa > in1 || framework_failure_
|
|---|
| 81 | printf 'X\203\133\n' > exp1 || framework_failure_
|
|---|
| 82 |
|
|---|
| 83 | LC_ALL="$LOCALE_JA_SJIS" sed -f p1 <in1 >out1 || fail=1
|
|---|
| 84 | compare_ exp1 out1 || fail=1
|
|---|
| 85 |
|
|---|
| 86 | # Test 2: invalid multibyte sequence (\203\060), treated as two single-byte characters. sed must exit successfully on empty input and its captured stderr (out2) is compared with /dev/null.
|
|---|
| 87 | printf 'y/aa/\203\060/' > p2 || framework_failure_
|
|---|
| 88 | LC_ALL="$LOCALE_JA_SJIS" sed -f p2 </dev/null 2>out2 || fail=1
|
|---|
| 89 | compare_ /dev/null out2 || fail=1
|
|---|
| 90 |
|
|---|
| 91 | #
|
|---|
| 92 | # Test 3: bracket expression containing the multibyte character \203] (trailing byte is ASCII ']'), followed by '/' and the real closing ']'.
|
|---|
| 93 | #
|
|---|
| 94 | # Before sed-4.3, snarf_char_class would parse it incorrectly,
|
|---|
| 95 | # treating the first closing-bracket as closing the character-class,
|
|---|
| 96 | # instead of being part of a multibyte sequence.
|
|---|
| 97 |
|
|---|
| 98 | printf '/[\203]/]/p' > p3 || framework_failure_
|
|---|
| 99 | LC_ALL="$LOCALE_JA_SJIS" sed -f p3 </dev/null >out3 || fail=1
|
|---|
| 100 | compare_ /dev/null out3 || fail=1
|
|---|
| 101 |
|
|---|
| 102 | # Test 4:
|
|---|
| 103 | # Same as test 3, but with the other multibyte character (\203[, whose trailing byte is ASCII '[').
|
|---|
| 104 | # (This did not cause a failure before sed-4.3, but the code was incorrect.)
|
|---|
| 105 | # Keep this test for code-coverage purposes.
|
|---|
| 106 | printf '/[\203[/]/p' > p4 || framework_failure_
|
|---|
| 107 | LC_ALL="$LOCALE_JA_SJIS" sed -f p4 </dev/null >out4 || fail=1
|
|---|
| 108 | compare_ /dev/null out4 || fail=1
|
|---|
| 109 |
|
|---|
| 110 | # TODO: Find a locale in which ':', '.' or '=' can be a byte of a valid multibyte character.
|
|---|
| 111 | #
|
|---|
| 112 | # snarf_char_class specifically tests for five bytes: ':.=[]'.
|
|---|
| 113 | # '[' and ']' are tested above, yet '.:=' are not valid as part of a
|
|---|
| 114 | # multibyte Shift-JIS sequence.
|
|---|
| 115 | #
|
|---|
| 116 | # valid:
|
|---|
| 117 | # $ printf '\203]' | iconv -f SHIFT-JIS -t utf-8
|
|---|
| 118 | # $ printf '\203[' | iconv -f SHIFT-JIS -t utf-8
|
|---|
| 119 | #
|
|---|
| 120 | # invalid:
|
|---|
| 121 | # $ printf '\203:' | iconv -f SHIFT-JIS -t utf-8
|
|---|
| 122 | # iconv: (stdin):1:0: cannot convert
|
|---|
| 123 | #
|
|---|
| 124 | # $ printf '\203=' | iconv -f SHIFT-JIS -t utf-8
|
|---|
| 125 | # iconv: (stdin):1:0: cannot convert
|
|---|
| 126 | #
|
|---|
| 127 | # $ printf '\203.' | iconv -f SHIFT-JIS -t utf-8
|
|---|
| 128 | # iconv: (stdin):0:0: cannot convert
|
|---|
| 129 |
|
|---|
| 130 | Exit $fail
|
|---|