#! /bin/sh
# Test whether \s matches SP and UTF-8 multi-byte white space characters.
#
# Copyright (C) 2013-2021 Free Software Foundation, Inc.
#
# Copying and distribution of this file, with or without modification,
# are permitted in any medium without royalty provided the copyright
# notice and this notice are preserved.

# Load the shared test helpers and put the freshly built programs in
# ../src first on PATH.
. "${srcdir=.}/init.sh"; path_prepend_ ../src

# Helper from init.sh; presumably skips the test when no en_US.UTF-8
# locale is available -- confirm against init.sh.
require_en_utf8_locale_

# Everything below runs in the UTF-8 locale.
LC_ALL=en_US.UTF-8
export LC_ALL

# It would have been nice to be able to use all UTF8 characters
# with the Unicode WSpace=Y character property,
# https://en.wikipedia.org/wiki/Whitespace_character, but that
# would currently cause distracting failures everywhere I've tried.
# Instead, I've listed each with an indicator column, telling what
# this test should do if the system's locale/tools produce the
# wrong answer.

# The values in that column:
#   X  required on all systems (fail if \s or \S fail to work as expected)
#   x  required on "modern enough" systems
#   O  optional: \s or \S misbehavior elicits a warning, but never failure

# Each table row "U+NNNN Name:  <flag> <hex bytes...>" is turned into one
# word of the form <flag>\xHH[\xHH...]:
#   1. drop everything up to and including the colon and following spaces;
#   2. replace every run of ONE OR MORE spaces with a literal "\x".
# The second pattern is written as "[ ][ ]*" so that it can never match the
# empty string (a bare " *" would insert "\x" between every character).
utf8_space_characters=$(sed 's/.*: *//;s/[ ][ ]*/\\x/g' <<\EOF
U+0009 Horizontal Tab:            X 09
U+000A Line feed:                 O 0a
U+000B Vertical Tab:              X 0b
U+000C Form feed:                 X 0c
U+000D Carriage return:           X 0d
U+0020 SPACE:                     X 20
U+0085 Next line:                 O 85
U+00A0 NO-BREAK SPACE:            O c2 a0
U+1680 OGHAM SPACE MARK:          x e1 9a 80
U+2000 EN QUAD:                   x e2 80 80
U+2001 EM QUAD:                   x e2 80 81
U+2002 EN SPACE:                  x e2 80 82
U+2003 EM SPACE:                  x e2 80 83
U+2004 THREE-PER-EM SPACE:        x e2 80 84
U+2005 FOUR-PER-EM SPACE:         x e2 80 85
U+2006 SIX-PER-EM SPACE:          x e2 80 86
U+2007 FIGURE SPACE:              O e2 80 87
U+2008 PUNCTUATION SPACE:         x e2 80 88
U+2009 THIN SPACE:                x e2 80 89
U+200A HAIR SPACE:                x e2 80 8a
U+200B ZERO WIDTH SPACE:          O e2 80 8b
U+202F NARROW NO-BREAK SPACE:     O e2 80 af
U+205F MEDIUM MATHEMATICAL SPACE: x e2 81 9f
U+3000 IDEOGRAPHIC SPACE:         x e3 80 80
EOF
)

# Overall test status; set to 1 by any fatal check below.
fail=0

# On systems that are not "modern enough," simply warn when an "x"-marked
# character is not classified as white space.  Too many systems
# have inadequate UTF-8 tables in this respect, and that lack should not
# discourage/confuse those who consider whether to install grep.

# As for what constitutes "modern enough", I've arbitrarily started
# with "Fedora 20 or newer".  Tested additions welcome.
modern_enough=0
# All output is discarded, including the error printed when
# /etc/redhat-release does not exist; only the exit status matters.
if grep -iE 'fedora release [2-9][0-9]+\b' /etc/redhat-release \
     >/dev/null 2>&1; then
  modern_enough=1
fi

# Check each listed character: \s must match it and \S must not.
# $utf8_space_characters is deliberately left unquoted so that it is split
# into one word per character; each word is an indicator letter followed
# by the character's bytes written as \xHH escapes.
for i in $utf8_space_characters; do
  # Default severity: a mismatch is an ERROR and marks the test as failed.
  # fail() is (re)defined on every iteration because the arms below may
  # replace it with a no-op for characters whose failure is only a warning.
  eval 'fail() { fail=1; }'
  m=ERROR
  case $i in
    # Required everywhere: keep the fatal default.
    X*) ;;
    # Required only on "modern enough" systems; otherwise just warn.
    x*) test $modern_enough = 1 || { eval 'fail() { :; }'; m=warning; } ;;
    # Optional: never fatal.
    O*) m=warning; eval 'fail() { :; }' ;;
    *) warn_ "unexpected prefix: $i"; exit 1 ;;
  esac

  # Strip the prefix byte.
  i=${i#?}

  # hex_printf_, returns_ and warn_ come from init.sh; presumably
  # hex_printf_ emits the bytes named by the \xHH escapes and
  # "returns_ 1 CMD" succeeds only when CMD exits with status 1 -- confirm
  # against init.sh.
  hex_printf_ "$i" | grep -q '^\s$' \
    || { warn_ " $m: \\s failed to match $i in the $LC_ALL locale"; fail; }
  hex_printf_ "$i" | returns_ 1 grep -q '\S' \
    || { warn_ " $m: \\S mistakenly matched $i in the $LC_ALL locale"; fail; }
done


# This is a separate test, only nominally related to \s.
# It is solely to get coverage of a code path (exercising dfa.c's
# match_mb_charset function) that would have otherwise been untouched.
# However, as of the change-set adding this new test, match_mb_charset
# is unreachable via grep.
printf '\0' | returns_ 1 grep -aE '^\s?$' > out 2>&1 || fail=1
# grep must have printed nothing, on stdout or stderr.  Record a failure
# if it did; without the "|| fail=1" the comparison result was ignored.
compare /dev/null out || fail=1

Exit $fail