OILS / data_lang / quoting-survey.sh View on Github | oils.pub

216 lines, 92 significant
#!/usr/bin/env bash
#
# What do Unix tools do with "bad" filenames?
#
# - Those with invalid unicode
# - Those with terminal escape sequences
#
# Usage:
#   data_lang/quoting-survey.sh <function name>

# Strict mode: fail on use of an unset variable, on a failure in any stage of
# a pipeline, and on any unhandled non-zero exit status.
set -o nounset
set -o pipefail
set -o errexit

# We already know:
# - bash ${#len} operator is very broken

# in bash it could be %Q. Or maybe it's %Q everywhere?
# in mycpp, we can translate %r calling repr() to qsn::encode()


# Summary:
#
# These tools do UTF-8 error decoding, but they use a funny shell-like format:
#
# - GNU coreutils: ls, cp, stat
# - GNU tar
# - zsh in error message, and in printf %q
# - bash and mksh in printf %q only, but not in error message
# - GNU findutils: find, but NOT xargs
#
# Surprise: not grep
#
# TODO: automate this a bit?
# - You can validate their stdout and stderr?
# - Look for the literal escape sequence.


# TODO: What about the one that changes the title?

# Terminal escape sequences: BOLD switches bold text on, RESET clears all
# attributes again.
readonly BOLD=$'\x1b[1m'
readonly RESET=$'\x1b[0;0m'

# A mix of valid and invalid utf-8.  \xce\xbc is the UTF-8 encoding of U+03BC
# (Greek small letter mu); a lone \xce is a lead byte without its continuation
# byte, i.e. invalid on its own.  The values now match the variable names.
readonly char_then_byte=$'\xce\xbc\xce'  # mu, then a stray lead byte
readonly byte_then_char=$'\xce\xce\xbc'  # a stray lead byte, then mu

# Scratch directory, relative to the current directory, for the odd files.
readonly DIR=_tmp/qsn-demo

# Print a section heading: a blank line, a ===== rule, the title, and another
# blank line.
#
# Arguments: $@ - words of the title; they are joined with single spaces
# Outputs:   the heading, on stdout
banner() {
  # Join with a space regardless of the caller's IFS, as echo would.
  local IFS=' '
  # printf rather than echo, so that a title such as '-n' or '-e' is printed
  # literally instead of being consumed as an echo option.
  printf '\n=====\n%s\n\n' "$*"
}

# Create the scratch directory and populate it with awkwardly named, empty
# files:
#
# - Make a file with an invalid code point, and utf-8 char
# - Make a file with a terminal escape sequence
#
# Globals: DIR, BOLD, char_then_byte, byte_then_char (all read)
# Outputs: pushd prints the directory stack on stdout
#
# NOTE: the pushd is never undone here, so the caller keeps running inside
# $DIR after this function returns.
setup-bad-files() {
  mkdir -p "$DIR"
  pushd "$DIR"
  # Quoted, so that each name stays a single word whatever bytes it contains.
  touch -- "$BOLD" "$char_then_byte" "$byte_then_char"
}

# Survey how a number of tools display awkward strings: a terminal escape
# sequence and two mixes of valid and invalid utf-8, both as plain arguments
# and as the names of the files made by setup-bad-files.
#
# Globals: BOLD, RESET, char_then_byte, byte_then_char (all read)
# Outputs: whatever each tool prints, separated by banner headings
#
# NOTE: setup-bad-files leaves the shell inside the scratch directory, so the
# bare ls / stat * / find below operate on the awkward files.
test-programs() {
  echo "$BOLD Hello $RESET World"

  # does approximate decoding
  printf '%q\n' "$char_then_byte"
  printf '%q\n' "$byte_then_char"

  setup-bad-files
  # ls doesn't print these by default, that's good

  # Hm this also does approximate decoding
  banner 'ls'
  ls
  echo
  ls --escape
  echo
  # Test out error message
  # It's basically correct, but ugly.  There are too many segments, and
  # there's an unnecessary leading ''.
  # J8 is shorter and more consistent.

  # No file has this name, so ls is expected to fail; only the error message
  # matters.
  ls -- "$RESET" || true

  # same
  banner 'cp'
  cp -- "$RESET" /tmp || true

  # weird output but it ultimately understands it
  banner 'stat'
  stat *

  # Hm also understands utf-8
  banner 'find'
  find
  # This prints it raw
  #find -print0

  # xargs --verbose messes up!  Makes it bold.  It also understands less
  # unicode.
  # Deliberately disabled.
  if false; then
    banner 'xargs'
    echo * | xargs --verbose -n 1 -- true
  fi

  # prints bytes, no unicode
  banner 'strace'
  strace -- true "$BOLD" "$char_then_byte" "$byte_then_char"

  # it does understand mu
  banner 'ps'
  # The background shell sleeps for 2 seconds with the awkward strings in its
  # command line, so that they show up in the ps listing below.
  bash -c "true zzmagic $BOLD $char_then_byte $byte_then_char; sleep 2" &
  ps aux | grep zzmagic
}

# Run a command once for each invalid-utf-8 name, passing the name as the last
# argument.  Failure is expected and ignored: the point is to see how the tool
# prints the name in its error message.
#
# Globals:   byte_then_char, char_then_byte (read)
# Arguments: $@ - the command and its leading arguments
try-bad-names() {
  "$@" "$byte_then_char" || true
  "$@" "$char_then_byte" || true
}

# Survey how tools print awkward file names in their *error messages*.
#
# The files are created, archived with tar, and then removed again, so the
# tools that follow are handed names of files that no longer exist.  The
# commented-out "$BOLD" invocations are kept as a record of cases that are
# not run.
#
# Globals: BOLD, byte_then_char, char_then_byte (all read)
# Outputs: whatever each tool prints; writes out.tar in the scratch directory
test-errors() {
  # also prints it
  setup-bad-files

  # GOOD
  banner 'tar'
  tar -f "$BOLD" || true
  tar --create "$BOLD" "$byte_then_char" "$char_then_byte" > out.tar
  tar --list < out.tar

  banner 'rm'
  # works
  rm -f -v -- "$BOLD" "$byte_then_char" "$char_then_byte"

  banner 'grep'
  # BUG
  #grep z "$BOLD"
  try-bad-names grep z

  # python doesn't print it somehow?
  banner 'python'
  # BUG: Python prints terminal sequences
  #python "$BOLD" || true
  try-bad-names python

  # BUG: Lua prints terminal sequences
  # So coreutils does it right!
  banner 'lua'
  #lua "$BOLD" || true
  try-bad-names lua

  # BUG: prints it
  banner 'awk'
  #awk -F "$BOLD" || true
  try-bad-names awk -F

  # BUG
  banner 'ruby'
  #ruby "$BOLD" || true
  try-bad-names ruby

  # BUG
  banner 'perl'
  #perl "$BOLD" || true
  try-bad-names perl

  # BUG
  # But it's a little smarter about mu cases
  banner 'nodejs'
  #nodejs "$BOLD" || true
  try-bad-names nodejs

  # shells:

  # BUG
  banner 'bash'
  #bash "$BOLD" || true
  try-bad-names bash

  banner 'dash'
  #dash "$BOLD" || true

  # zsh actually escapes it!
  banner 'zsh'
  zsh "$BOLD" || true
  try-bad-names zsh

  # BUG
  banner 'mksh'
  #mksh "$BOLD" || true
}

# Survey how busybox's ls and find applets display the awkward file names
# created by setup-bad-files.
#
# Outputs: whatever busybox prints, separated by banner headings
test-busybox() {
  setup-bad-files

  # displays ?? -- doesn't understand unicode
  banner 'busybox ls'
  busybox ls

  # BUG: prints it literally
  banner 'busybox find'
  busybox find

  #reset
}

# Dispatch: run the function named by the first command-line argument and pass
# it the remaining arguments, e.g. `data_lang/quoting-survey.sh test-errors`.
"$@"