OILS / benchmarks / id.sh View on Github | oilshell.org

468 lines, 240 significant
1#!/usr/bin/env bash
2#
3# Keep track of benchmark data provenance.
4#
5# Usage:
6# benchmarks/id.sh <function name>
7
set -o nounset
set -o pipefail
set -o errexit

# Absolute path of the parent of the directory that holds this script.
# '&&' keeps a failed cd from silently yielding the caller's working
# directory, and '--' protects against a $0 that starts with '-'.
REPO_ROOT=$(cd -- "$(dirname -- "$0")/.." && pwd)
13
14source build/common.sh # for $CLANG
15source benchmarks/common.sh
16source test/tsv-lib.sh # tsv-row
17
print-job-id() {
  ### Print a job ID derived from the current local date and time

  # Year-month-day, two underscores, then hour-minute-second.
  local fmt='%Y-%m-%d__%H-%M-%S'
  date "+$fmt"
}
21
22# TODO: add benchmark labels/hashes for osh and all other shells
23#
24# Need to archive labels too.
25#
26# TODO: How do I make sure the zsh label is current? Across different
27# machines?
28#
29# What happens when zsh is silently upgraded?
30# I guess before every benchmark, you have to run the ID collection. Man
31# that is a lot of code.
32#
33# Should I make symlinks to the published location?
34#
35# Maybe bash/dash/mksh/zsh should be invoked through a symlink?
36# Every symlink is a shell runtime version, and it has an associated
37# toolchain?
38
# Platform is ambient?
# _tmp/
#   shell-id/
#     bash/
#       HASH.txt
#       version.txt
#     dash/
#       HASH.txt
#       version.txt
#   host-id/
#     lisa/
#       HASH.txt
#       cpuinfo.txt
52
# ../benchmark-data/
#   shell-id/
#     bash-$HASH/
#     osh-$HASH/   # osh-cpython, osh-ovm? osh-opy-ovm? Too many dimensions.
#                  # the other shells don't have this?
#     zsh-$HASH/
#   host-id/
#     lisa-$HASH/
61
_dump-if-exists() {
  ### Copy file $1 to $2; do nothing if $1 is not a regular file
  #
  # Args:
  #   $1  path: file to read
  #   $2  out: file to write (overwritten)

  local path=$1
  local out=$2

  if test -f "$path"; then
    # Quoted so that an output path containing spaces stays a single word.
    cat "$path" > "$out"
  fi
}
70
71#
72# Shell ID
73#
74
dump-shell-id() {
  ### Write files that identify the shell
  #
  # Args:
  #   $1  sh_path: path (or name) of the shell, or python2, to identify
  #   $2  out_dir: directory to write into; created if missing
  #
  # Always writes sh-path.txt.  Depending on the shell's basename it also
  # writes version.txt, osh-version.txt or dpkg-version.txt.  For paths
  # matching */osh* it writes git-commit-hash.txt, and git-branch.txt when
  # XSHAR_GIT_COMMIT isn't set.  Calls die when the shell can't be found or
  # its basename isn't recognized.

  local sh_path=$1
  local out_dir=$2

  if ! command -v "$sh_path" >/dev/null; then
    die "dump-shell-id: Couldn't find $sh_path"
  fi

  mkdir -p "$out_dir"

  printf '%s\n' "$sh_path" > "$out_dir/sh-path.txt"

  # Add extra repository info for osh.
  case $sh_path in
    */osh*)
      local commit_hash=$out_dir/git-commit-hash.txt

      if test -n "${XSHAR_GIT_COMMIT:-}"; then
        # The commit is supplied by the environment; git isn't consulted.
        echo "$XSHAR_GIT_COMMIT" > "$commit_hash"
      else
        local branch
        branch=$(git rev-parse --abbrev-ref HEAD)
        echo "$branch" > "$out_dir/git-branch.txt"
        git rev-parse "$branch" > "$commit_hash"
      fi
      ;;
  esac

  local sh_name
  sh_name=$(basename -- "$sh_path")

  case $sh_name in
    bash | zsh | yash)
      "$sh_path" --version > "$out_dir/version.txt"
      ;;
    osh)
      case $sh_path in
        *_bin/*/osh)
          # Doesn't support --version yet
          ;;
        *)
          "$sh_path" --version > "$out_dir/osh-version.txt"
          ;;
      esac
      ;;
    # oils-for-unix|oils-for-unix.stripped)
    #  ;;
    dash | mksh)
      # These don't have version strings!
      dpkg -s "$sh_name" > "$out_dir/dpkg-version.txt"
      ;;

    # not a shell, but useful for benchmarks/compute
    python2)
      # Only stderr is captured here.
      "$sh_path" -V 2> "$out_dir/version.txt"
      ;;
    *)
      die "Invalid shell '$sh_name'"
      ;;
  esac
}
138
_shell-id-hash() {
  ### Print the text that identifies a shell, for the caller to hash
  #
  # Args:
  #   $1  src: directory holding the ID files (version.txt, etc.)
  #
  # ID files that don't exist are skipped.

  local src=$1

  local file

  # for shells and Python
  file=$src/version.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # Only hash the dimensions we want to keep
  file=$src/dpkg-version.txt
  if test -f "$file"; then
    grep -E '^Version' "$file"
  fi

  # Interpreter as CPython vs. OVM is what we care about, so
  # select 'Interpreter:' but not 'Interpreter version:'.
  # For example, the version is different on Ubuntu Bionic vs. Trusty, but we
  # ignore that.
  file=$src/osh-version.txt
  if test -f "$file"; then
    grep -E '^Oil version|^Interpreter:' "$file"
  fi

  # For OSH
  file=$src/git-commit-hash.txt
  if test -f "$file"; then
    cat "$file"
  fi

  # XXX: Include shell path to help distinguish between versions of OSH
  # NOTE(review): $src is the directory of ID files, not the path stored in
  # sh-path.txt -- confirm that this is the intended value.
  echo "$src"

  return 0
}
167
publish-shell-id() {
  ### Copy temp directory to hashed location
  #
  # Args:
  #   $1  src: directory of shell ID files; must contain sh-path.txt
  #   $2  dest_base: defaults to ../benchmark-data/shell-id
  #
  # The copy goes to $dest_base/$sh_name-$id, where $id is the first 8
  # characters of the md5sum of _shell-id-hash's output.  A HASH.txt file is
  # added to the copy.  Prints $id on stdout.

  local src=$1  # e.g. _tmp/prov-tmp/osh
  local dest_base=${2:-../benchmark-data/shell-id}  # or _tmp/shell-id

  local sh_path sh_name
  # -r keeps backslashes in the stored path literal.
  read -r sh_path < "$src/sh-path.txt"
  sh_name=$(basename -- "$sh_path")

  local hash
  hash=$(_shell-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$sh_name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in
  # md5sum's output, which keeps the format of HASH.txt stable.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published shell ID to $dest"

  echo "$id"
}
193
194#
195# Platform ID
196#
197
198# Events that will change the env for a given machine:
199# - kernel upgrade
200# - distro upgrade
201
202# How about ~/git/oilshell/benchmark-data/host-id/lisa-$HASH
203# How to calculate the hash though?
204
dump-host-id() {
  ### Write files that identify the host
  #
  # Args:
  #   $1  out_dir: defaults to _tmp/host-id/$(hostname); created if missing
  #
  # Writes hostname.txt, machine.txt and kernel.txt.  Also writes
  # lsb-release.txt, cpuinfo.txt and meminfo.txt when /etc/lsb-release,
  # /proc/cpuinfo and /proc/meminfo exist.

  local out_dir=${1:-_tmp/host-id/$(hostname)}

  mkdir -p "$out_dir"

  hostname > "$out_dir/hostname.txt"

  # does it make sense to do individual fields like -m?
  # avoid parsing?
  # We care about the kernel and the CPU architecture.
  # There is a lot of redundant information there.
  uname -m > "$out_dir/machine.txt"

  {
    # Short flags work on OS X too
    uname -s  # --kernel-name
    uname -r  # --kernel-release
    uname -v  # --kernel-version
  } > "$out_dir/kernel.txt"

  _dump-if-exists /etc/lsb-release "$out_dir/lsb-release.txt"

  # remove the cpu MHz field, which changes a lot
  if test -e /proc/cpuinfo; then
    grep -i -v 'cpu mhz' /proc/cpuinfo > "$out_dir/cpuinfo.txt"
  fi

  # mem info doesn't make a difference?  I guess it's just nice to check that
  # it's not swapping.  But shouldn't be part of the hash.

  if test -e /proc/meminfo; then
    grep '^MemTotal' /proc/meminfo > "$out_dir/meminfo.txt"
  fi

  #head $out_dir/* 1>&2  # don't write to stdout
}
243
# There is already the concept of a target triple?
# http://wiki.osdev.org/Target_Triplet
# It's not exactly the same as what we need here, but it's close.
247
_host-id-hash() {
  ### Print the text that identifies a host, for the caller to hash
  #
  # Args:
  #   $1  src: directory holding machine.txt, kernel.txt and, optionally,
  #       lsb-release.txt

  local src=$1

  # Don't hash CPU or memory
  #cat $src/cpuinfo.txt
  #cat $src/hostname.txt  # e.g. lisa

  cat "$src/machine.txt"  # e.g. x86_64
  cat "$src/kernel.txt"

  # OS
  local file=$src/lsb-release.txt
  if test -f "$file"; then
    cat "$file"
  fi

  return 0
}
266
# Writes a short ID to stdout.
publish-host-id() {
  ### Copy a directory of host ID files to a hashed location
  #
  # Args:
  #   $1  src: directory of host ID files
  #   $2  dest_base: defaults to ../benchmark-data/host-id
  #
  # The copy goes to $dest_base/$name-$id, where $name is the basename of
  # $src and $id is the first 8 characters of the md5sum of _host-id-hash's
  # output.  A HASH.txt file is added to the copy.

  local src=$1  # e.g. _tmp/host-id/lisa
  local dest_base=${2:-../benchmark-data/host-id}

  local name
  name=$(basename -- "$src")

  local hash
  hash=$(_host-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in
  # md5sum's output, which keeps the format of HASH.txt stable.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published host ID to $dest"

  echo "$id"
}
290
291#
292# Compilers
293#
294
dump-compiler-id() {
  ### Write files that identify the compiler
  #
  # Args:
  #   $1  cc: path to the compiler; only */gcc and */clang are recognized
  #   $2  out_dir: defaults to _tmp/compiler-id/ plus the basename of $cc;
  #       created if missing
  #
  # Writes $out_dir/version.txt.  For an unrecognized path the file is still
  # created, but it is left empty.

  local cc=$1  # path to the compiler
  local out_dir=${2:-_tmp/compiler-id/$(basename -- "$cc")}

  mkdir -p "$out_dir"

  case $cc in
    */gcc | */clang)
      # -v has more details, but they might be overkill for gcc, and for
      # clang it has stuff we don't want.
      "$cc" --version
      ;;
  esac > "$out_dir/version.txt"
}
314
_compiler-id-hash() {
  ### Print the text that identifies a compiler, for the caller to hash
  #
  # Args:
  #   $1  src: directory holding version.txt

  local src=$1

  # Remove some extraneous information from clang.
  grep -v InstalledDir "$src/version.txt"
}
321
# Writes a short ID to stdout.
publish-compiler-id() {
  ### Copy a directory of compiler ID files to a hashed location
  #
  # Args:
  #   $1  src: directory of compiler ID files
  #   $2  dest_base: defaults to ../benchmark-data/compiler-id
  #
  # The copy goes to $dest_base/$name-$id, where $name is the basename of
  # $src and $id is the first 8 characters of the md5sum of
  # _compiler-id-hash's output.  A HASH.txt file is added to the copy.

  local src=$1  # e.g. _tmp/compiler-id/clang
  local dest_base=${2:-../benchmark-data/compiler-id}

  # Declared separately from the assignment so that 'local' doesn't mask a
  # failure of basename.
  local name
  name=$(basename -- "$src")

  local hash
  hash=$(_compiler-id-hash "$src" | md5sum)  # not secure, an identifier

  local id="${hash:0:8}"
  local dest="$dest_base/$name-$id"

  mkdir -p "$dest"
  cp --no-target-directory --recursive "$src/" "$dest/"

  # Deliberately unquoted: word splitting collapses the whitespace in
  # md5sum's output, which keeps the format of HASH.txt stable.
  # shellcheck disable=SC2086
  echo $hash > "$dest/HASH.txt"

  log "Published compiler ID to $dest"

  echo "$id"
}
343
344#
345# Table Output
346#
347
348# Writes a table of host and shells to stdout. Writes text files and
349# calculates IDs for them as a side effect.
350#
351# The table can be passed to other benchmarks to ensure that their provenance
352# is recorded.
353
shell-provenance-2() {
  ### Write to _tmp/provenance.{txt,tsv} and $out_dir/{shell-id,host-id}
  #
  # Args:
  #   $1  maybe_host: if non-empty, used as the host name instead of
  #       $(hostname)
  #   $2  job_id: copied into every row
  #   $3  out_dir: base directory for the published host and shell IDs
  #   $4... shell paths, one output row each
  #
  # The .txt file has space-separated rows and no header; the .tsv file has
  # a header row followed by the same fields written with tsv-row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3
  shift 3

  # log "*** shell-provenance"

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** shell-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local shell_hash

  local out_txt=_tmp/provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/provenance.tsv
  tsv-row job_id host_name host_hash sh_path shell_hash > "$out_tsv"

  local i=0

  # Declared local so that the loop variable doesn't leak to the caller.
  local sh_path
  for sh_path in "$@"; do
    # There can be two different OSH, so the temp dir is numbered rather
    # than named after the shell.

    tmp_dir=_tmp/prov-tmp/shell-$i
    i=$((i + 1))

    dump-shell-id "$sh_path" "$tmp_dir"

    # writes to ../benchmark-data or _tmp/provenance
    shell_hash=$(publish-shell-id "$tmp_dir" "$out_dir/shell-id")

    # note: filter-provenance depends on $4 being $sh_path
    # APPEND to txt
    echo "$job_id $host_name $host_hash $sh_path $shell_hash" >> "$out_txt"

    tsv-row "$job_id" "$host_name" "$host_hash" "$sh_path" "$shell_hash" \
      >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
409
compiler-provenance-2() {
  ### Write to _tmp/compiler-provenance.{txt,tsv} and $out_dir/{compiler-id,host-id}
  #
  # Args:
  #   $1  maybe_host: if non-empty, used as the host name instead of
  #       $(hostname)
  #   $2  job_id: copied into every row
  #   $3  out_dir: base directory for the published host and compiler IDs
  #
  # Globals: reads $CLANG.
  #
  # One row is written for the gcc found on PATH and one for $CLANG.  The
  # .txt file has space-separated rows and no header; the .tsv file has a
  # header row followed by the same fields written with tsv-row.

  local maybe_host=$1  # if it exists, it overrides the host
  local job_id=$2
  local out_dir=$3

  local host_name
  if test -n "$maybe_host"; then  # label is often 'no-host'
    host_name=$maybe_host
  else
    host_name=$(hostname)
  fi

  log "*** compiler-provenance-2 $maybe_host $host_name $job_id $out_dir"

  local tmp_dir=_tmp/prov-tmp/$host_name
  dump-host-id "$tmp_dir"

  local host_hash
  host_hash=$(publish-host-id "$tmp_dir" "$out_dir/host-id")

  local compiler_hash

  local out_txt=_tmp/compiler-provenance.txt  # Legacy text file
  : > "$out_txt"  # truncated, no header

  local out_tsv=_tmp/compiler-provenance.tsv
  tsv-row job_id host_name host_hash compiler_path compiler_hash > "$out_tsv"

  # Declared local so that the loop variables don't leak to the caller.
  local compiler_path name
  for compiler_path in "$(command -v gcc)" "$CLANG"; do
    # An empty word (gcc not on PATH, or an empty $CLANG) is skipped rather
    # than treated as a compiler.
    if test -z "$compiler_path"; then
      continue
    fi

    name=$(basename -- "$compiler_path")

    tmp_dir=_tmp/prov-tmp/$name
    dump-compiler-id "$compiler_path" "$tmp_dir"

    compiler_hash=$(publish-compiler-id "$tmp_dir" "$out_dir/compiler-id")

    echo "$job_id $host_name $host_hash $compiler_path $compiler_hash" \
      >> "$out_txt"

    tsv-row \
      "$job_id" "$host_name" "$host_hash" "$compiler_path" "$compiler_hash" \
      >> "$out_tsv"
  done

  log "Wrote $out_txt and $out_tsv"
}
458
out-param() {
  ### Assign the string 'returned' to the variable named by $1
  #
  # The nameref has a deliberately unusual name.  If it shared its name with
  # the caller's variable, the nameref would refer to itself, which bash
  # reports as a circular name reference.

  local -n __out_param_ref=$1

  __out_param_ref=returned
}
464
# Run the command line as a function call, but only when $0's basename is
# id.sh.  The quoting and '--' keep the test well-formed for a $0 that
# contains spaces or starts with '-'.
if test "$(basename -- "$0")" = 'id.sh'; then
  "$@"
fi
468