OILS / benchmarks / id.sh View on Github | oilshell.org

466 lines, 239 significant
1#!/usr/bin/env bash
2#
3# Keep track of benchmark data provenance.
4#
5# Usage:
6# benchmarks/id.sh <function name>
7
# Strict mode: error on unset variables, fail a pipeline when any stage
# fails, and exit on unhandled non-zero statuses.
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the parent of the directory that holds $0.
# "$0" is quoted (and guarded with --) so paths containing spaces or a leading
# dash resolve correctly; '&&' keeps a failed cd from silently yielding the
# caller's working directory instead.
REPO_ROOT=$(cd "$(dirname -- "$0")/.." && pwd)
13
14source build/common.sh # for $CLANG
15source benchmarks/common.sh
16source test/tsv-lib.sh # tsv-row
17
print-job-id() {
  ### Print the current local time as a job ID, e.g. 2024-01-31__23-59-59

  local -r stamp_format='+%Y-%m-%d__%H-%M-%S'
  date "$stamp_format"
}
21
22# TODO: add benchmark labels/hashes for osh and all other shells
23#
24# Need to archive labels too.
25#
26# TODO: How do I make sure the zsh label is current? Across different
27# machines?
28#
29# What happens when zsh is silently upgraded?
30# I guess before every benchmark, you have to run the ID collection. Man
31# that is a lot of code.
32#
33# Should I make symlinks to the published location?
34#
35# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
36# Every symlink is a shell runtime version, and it has an associated
37# toolchain?
38
39# Platform is ambient?
40# _tmp/
41# shell-id/
42# bash/
43# HASH.txt
44# version.txt
45# dash/
46# HASH.txt
47# version.txt
48# host-id/
49# lisa/
50# HASH.txt
51# cpuinfo.txt
52
53# ../benchmark-data/
54# shell-id/
55# bash-$HASH/
56# osh-$HASH/ # osh-cpython, osh-ovm? osh-opy-ovm? Too many dimensions.
57# # the other shells don't have this?
58# zsh-$HASH/
59# host-id/
60# lisa-$HASH/
61
_dump-if-exists() {
  ### Copy the contents of $path to $out; do nothing if $path is absent
  #
  # Args:
  #   $1 (path): file to read.  If it is not a regular file, the function
  #              returns successfully without touching $out.
  #   $2 (out):  file to create or truncate with the contents of $path.

  local path=$1
  local out=$2

  if ! test -f "$path"; then
    return
  fi

  # The redirect target must be quoted: an unquoted name that contains
  # whitespace is an "ambiguous redirect" error in bash.
  cat "$path" > "$out"
}
70
71#
72# Shell ID
73#
74
dump-shell-id() {
  ### Write files that identify the shell
  #
  # Args:
  #   $1 (sh_path): shell name or path; must be resolvable by 'command -v'.
  #   $2 (out_dir): directory to write into; created if missing.
  #
  # Always writes sh-path.txt.  Paths matching */osh* additionally get
  # git-commit-hash.txt, taken from $XSHAR_GIT_COMMIT when that is set and
  # otherwise from git (which also writes git-branch.txt).  One version file
  # is then written, chosen by the basename of $sh_path.  An unresolvable path
  # or an unrecognized basename is reported through 'die', which is defined
  # outside this file.

  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case "$sh_path" in
    */osh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        echo "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        echo "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename "$sh_path")

  case "$sh_name" in
    bash|zsh|yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case "$sh_path" in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #  ;;
    dash|mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # Captures stderr rather than stdout -- presumably because python2
      # prints its -V output there.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
138
_shell-id-hash() {
  ### Print the parts of a dumped shell ID that should feed its hash
  #
  # Args:
  #   $1 (src): directory containing the shell-ID text files.
  #
  # Each of the four known files is optional; missing ones are skipped.
  # Uses 'grep -E' rather than the obsolescent 'egrep', which prints a warning
  # on stderr with recent GNU grep.

  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if test -f "$file"; then
    grep -E '^Version' "$file"
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  if test -f "$file"; then
    grep -E '^Oil version|^Interpreter:' "$file"
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
165
publish-shell-id() {
  ### Copy temp directory to hashed location
  #
  # Args:
  #   $1 (src):       directory holding sh-path.txt and the other shell-ID
  #                   files, e.g. _tmp/prov-tmp/osh
  #   $2 (dest_base): parent of the published directory; defaults to
  #                   ../benchmark-data/shell-id (callers may pass e.g.
  #                   _tmp/shell-id)
  #
  # Stdout: the short ID, i.e. the first 8 characters of the md5sum of
  #         _shell-id-hash's output.
  # Side effects: creates $dest_base/<shell basename>-<id>/ holding a copy of
  #         $src plus HASH.txt, and reports the destination through 'log'.

  local src=$1
  local dest_base=${2:-../benchmark-data/shell-id}

  local sh_path sh_name
  # -r: keep any backslashes in the recorded path literal
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the two spaces md5sum
  # prints before '-', so HASH.txt keeps the single-space form this function
  # has always written.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
191
192#
193# Platform ID
194#
195
196# Events that will change the env for a given machine:
197# - kernel upgrade
198# - distro upgrade
199
200# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
201# How to calculate the hash though?
202
dump-host-id() {
  ### Write files that identify the host
  #
  # Args:
  #   $1 (out_dir): directory to write into; created if missing.
  #                 Defaults to _tmp/host-id/$(hostname).
  #
  # Writes hostname.txt, machine.txt and kernel.txt, plus lsb-release.txt,
  # cpuinfo.txt and meminfo.txt when their source files exist.

  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"

  {
    # Short flags work on OS X too
    uname -s  # --kernel-name
    uname -r  # --kernel-release
    uname -v  # --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  if test -e /proc/cpuinfo; then
    grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  fi

  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  if test -e /proc/meminfo; then
    grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"
  fi

  #head "$out_dir"/* 1>&2  # don't write to stdout
}
241
242# There is already a concept of a target triple?
243# http://wiki.osdev.org/Target_Triplet
244# It's not exactly the same as what we need here, but close.
245
_host-id-hash() {
  ### Print the parts of a dumped host ID that should feed its hash
  #
  # Args:
  #   $1 (src): directory containing machine.txt and kernel.txt, and
  #             optionally lsb-release.txt.

  local src=$1

  # Don't hash CPU or memory
  #cat "$src/cpuinfo.txt"
  #cat "$src/hostname.txt"  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
264
265# Writes a short ID to stdout.
publish-host-id() {
  ### Copy a dumped host ID to a directory named after its hash
  #
  # Args:
  #   $1 (src):       directory holding the host-ID files,
  #                   e.g. _tmp/host-id/lisa
  #   $2 (dest_base): parent of the published directory; defaults to
  #                   ../benchmark-data/host-id
  #
  # Stdout: the short ID, i.e. the first 8 characters of the md5sum of
  #         _host-id-hash's output.
  # Side effects: creates $dest_base/<basename of src>-<id>/ holding a copy
  #         of $src plus HASH.txt, and reports the destination through 'log'.

  local src=$1
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the two spaces md5sum
  # prints before '-', so HASH.txt keeps the single-space form this function
  # has always written.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
288
289#
290# Compilers
291#
292
dump-compiler-id() {
  ### Write files that identify the compiler
  #
  # Args:
  #   $1 (cc):      path to the compiler
  #   $2 (out_dir): directory to write into; created if missing.
  #                 Defaults to _tmp/compiler-id/<basename of cc>.
  #
  # Writes $out_dir/version.txt.  Only paths ending in /gcc or /clang are
  # queried; for any other path the file is created empty.

  local cc=$1
  local out_dir=${2:-_tmp/compiler-id/$(basename "$cc")}

  mkdir -p "$out_dir"

  case "$cc" in
    */gcc|*/clang)
      # --version is enough for both compilers: gcc's -v has more details
      # that might be overkill, and clang's -v has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
312
_compiler-id-hash() {
  ### Print the parts of a dumped compiler ID that should feed its hash
  #
  # Args:
  #   $1 (src): directory containing version.txt

  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
319
320# Writes a short ID to stdout.
publish-compiler-id() {
  ### Copy a dumped compiler ID to a directory named after its hash
  #
  # Args:
  #   $1 (src):       directory holding the compiler-ID files,
  #                   e.g. _tmp/compiler-id/clang
  #   $2 (dest_base): parent of the published directory; defaults to
  #                   ../benchmark-data/compiler-id
  #
  # Stdout: the short ID, i.e. the first 8 characters of the md5sum of
  #         _compiler-id-hash's output.
  # Side effects: creates $dest_base/<basename of src>-<id>/ holding a copy
  #         of $src plus HASH.txt, and reports the destination through 'log'.

  local src=$1
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so a basename failure is not
  # masked by the exit status of 'local'.
  local name
  name=$(basename "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the two spaces md5sum
  # prints before '-', so HASH.txt keeps the single-space form this function
  # has always written.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
341
342#
343# Table Output
344#
345
346# Writes a table of host and shells to stdout. Writes text files and
347# calculates IDs for them as a side effect.
348#
349# The table can be passed to other benchmarks to ensure that their provenance
350# is recorded.
351
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell-id,host-id}
  #
  # Args:
  #   $1 (maybe_host): host label; when empty, $(hostname) is used instead
  #   $2 (job_id):     job ID recorded in every row
  #   $3 (out_dir):    base directory for the published host-id/ and
  #                    shell-id/ directories
  #   $4...:           shell paths, one output row each
  #
  # Both output files are truncated first.  The .txt file has no header and
  # space-separated fields; the .tsv file is written with tsv-row and starts
  # with a header row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** shell-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Declared local so the loop variable does not leak into the caller's scope.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so temp dirs are numbered rather than
    # named after the shell.
    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
407
compiler-provenance-2() {
  ### Write to _tmp/compiler-provenance.{txt,tsv} and $out_dir/{compiler-id,host-id}
  #
  # Args:
  #   $1 (maybe_host): host label; when empty, $(hostname) is used instead
  #   $2 (job_id):     job ID recorded in every row
  #   $3 (out_dir):    base directory for the published host-id/ and
  #                    compiler-id/ directories
  #
  # Globals: reads $CLANG, which is expected to come from build/common.sh.
  #
  # One row is written for gcc (if found on $PATH) and one for $CLANG (if
  # non-empty).  Both output files are truncated first; the .tsv file starts
  # with a header row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** compiler-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local compiler_hash

  local out_txt=_tmp/compiler-provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/compiler-provenance.tsv
  tsv-row job_id host_name host_hash compiler_path compiler_hash > "$out_tsv"

  # 'command -v' is the builtin replacement for 'which'.  A missing gcc
  # yields an empty string, which the loop skips below.
  local gcc_path
  gcc_path=$(command -v gcc) || true

  # Locals so neither the loop variable nor 'name' leaks to the caller, and
  # so a basename failure is not masked by 'local'.
  local compiler_path name
  for compiler_path in "$gcc_path" "$CLANG"; do
    # Skip compilers that are not available, as the unquoted word list in
    # the original loop did implicitly.
    if test -z "$compiler_path"; then
      continue
    fi

    name=$(basename "$compiler_path")

    tmp_dir=_tmp/prov-tmp/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir" "$out_dir/compiler-id")

    echo "$job_id $host_name $host_hash $compiler_path $compiler_hash" \
      >> "$out_txt"

    tsv-row \
      "$job_id" "$host_name" "$host_hash" "$compiler_path" "$compiler_hash" \
      >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
456
out-param() {
  ### Assign the string 'returned' to the caller's variable named by $1
  #
  # Args:
  #   $1: NAME of the variable to assign (not its value)
  #
  # The nameref has a deliberately unusual name: a nameref called 'out' would
  # refer to itself when the caller's variable is also called 'out'.

  local -n __out_param_ref="$1"

  __out_param_ref=returned
}
462
# When this file is executed directly as id.sh, treat the command line as a
# function name followed by its arguments.  The guard keeps the dispatch from
# firing when $0 has a different basename.
# "$0" is quoted so a script path containing spaces does not make 'test' fail
# with "too many arguments"; '--' guards against a $0 starting with '-'.
if test "$(basename -- "$0")" = 'id.sh'; then
  "$@"
fi
466