OILS / benchmarks / autoconf.sh View on Github | oils.pub

603 lines, 246 significant
1#!/usr/bin/env bash
2#
3# Why is CPython configure slower under OSH?
4#
5# Usage:
6# benchmarks/autoconf.sh <function name>
7#
8# Examples:
9# $0 patch-pyconf # times builtin
10# $0 measure-times # time-tsv with gc stats
11# $0 report-times # times builtin
12#
13# $0 measure-syscalls
14# $0 report-syscalls
15# $0 report-processes
16# $0 report-external
17#
18# Simpler:
19# $0 measure-rusage # time-tsv
20
21set -o nounset
22set -o pipefail
23set -o errexit
24
25REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
26
27source benchmarks/cachegrind.sh # with-cachegrind
28source benchmarks/callgrind.sh # with-cachegrind
29source test/tsv-lib.sh # $TAB
30
31readonly BASE_DIR_RELATIVE=_tmp/autoconf
32readonly BASE_DIR=$REPO_ROOT/$BASE_DIR_RELATIVE
33readonly PY_CONF=$REPO_ROOT/Python-2.7.13/configure
34
35#
36# Trying to measure allocation/GC overhead
37#
38# This DOES NOT HELP because bumpleak/bumproot are **slower** on bigger heaps.
39# There's less cache locality!
40#
41
42cpython-configure-tasks() {
43 local -a variants=( opt+bumpleak opt+bumproot opt )
44 for v in ${variants[@]}; do
45 echo "${v}${TAB}_bin/cxx-$v/osh"
46 done
47 echo "opt${TAB}_bin/cxx-opt/mycpp-souffle/osh"
48}
49
50cpython-setup() {
51 cpython-configure-tasks | while read -r _ osh; do
52 ninja $osh
53 done
54}
55
56measure-alloc-overhead() {
57 local base_dir=$BASE_DIR/alloc-overhead
58 rm -r -f -v $base_dir
59
60 cpython-configure-tasks | while read -r variant osh; do
61 osh=$REPO_ROOT/$osh
62
63 local task_dir=$base_dir/$variant
64
65 mkdir -p $task_dir
66 pushd $task_dir > /dev/null
67
68 local -a flags=(
69 --output "$base_dir/$variant.tsv"
70 --rusage
71 )
72
73 local -a time_argv
74
75 time_argv=(
76 time-tsv --print-header
77 "${flags[@]}"
78 --field variant
79 )
80 "${time_argv[@]}"
81
82 time_argv=(
83 time-tsv --append
84 "${flags[@]}"
85 --field "$variant"
86 -- $osh $PY_CONF
87 )
88
89 #echo "${time_argv[@]}"
90 "${time_argv[@]}"
91
92 popd > /dev/null
93
94 done
95}
96
97#
98# Compare bash/dash/osh locally
99#
100
101measure-rusage() {
102 local base_dir=$BASE_DIR/rusage
103 rm -r -f -v $base_dir
104
105 shell-tasks | while read -r sh_label sh_path; do
106
107 local task_dir=$base_dir/$sh_label
108
109 mkdir -p $task_dir
110 pushd $task_dir > /dev/null
111
112 local -a flags=(
113 --output "$base_dir/$sh_label.tsv"
114 --rusage
115 )
116
117 local -a time_argv
118
119 time_argv=(
120 time-tsv --print-header
121 "${flags[@]}"
122 --field sh_label
123 )
124 "${time_argv[@]}"
125
126 time_argv=(
127 time-tsv --append
128 "${flags[@]}"
129 --field "$sh_label"
130 -- $sh_path $PY_CONF
131 )
132
133 #echo "${time_argv[@]}"
134 "${time_argv[@]}"
135
136 popd > /dev/null
137
138 done
139}
140
141#
142# Now try strace
143#
144
145shell-tasks() {
146 echo "bash${TAB}bash"
147 echo "dash${TAB}dash"
148 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
149 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/mycpp-souffle/osh"
150}
151
152measure-syscalls() {
153 local osh=_bin/cxx-opt/osh
154 #local osh=_bin/cxx-dbg/osh
155
156 ninja $osh
157
158 local base_dir=$BASE_DIR/syscalls
159
160 rm -r -f -v $base_dir
161
162 shell-tasks | while read -r sh_label sh_path; do
163 local dir=$base_dir/$sh_label
164 mkdir -p $dir
165
166 local counts_dir=$base_dir/$sh_label
167 mkdir -p $counts_dir
168
169 pushd $dir
170 #strace -o $counts -c $sh_path $PY_CONF
171 # See how many external processes are started?
172 #strace -o $counts -ff -e execve $sh_path $PY_CONF
173 strace -o $counts_dir/syscalls -ff $sh_path $PY_CONF
174 popd
175 done
176}
177
178# --- _tmp/autoconf/bash
179# 6047
180# 4621
181# --- _tmp/autoconf/dash
182# 6088
183# 4627
184# --- _tmp/autoconf/osh
185# 5691
186# 4631
187#
188# Woah we start fewer processes! But are not faster?
189
190grep-exec() {
191 # --max-count 1 - only report one execve per process
192 egrep --no-filename --max-count 1 -o 'execve\("[^"]+' "$@"
193}
194
195# andy@hoover:~/git/oilshell/oil$ benchmarks/autoconf.sh report-syscalls
196# --- _tmp/autoconf/syscalls/bash
197# 2592 _tmp/autoconf/syscalls/bash/syscalls.903220
198# 2608 _tmp/autoconf/syscalls/bash/syscalls.898727
199# 2632 _tmp/autoconf/syscalls/bash/syscalls.898387
200# 2679 _tmp/autoconf/syscalls/bash/syscalls.898292
201# 2853 _tmp/autoconf/syscalls/bash/syscalls.898927
202# 2873 _tmp/autoconf/syscalls/bash/syscalls.898334
203# 2920 _tmp/autoconf/syscalls/bash/syscalls.898895
204# 3204 _tmp/autoconf/syscalls/bash/syscalls.898664
205# 112549 _tmp/autoconf/syscalls/bash/syscalls.897471
206# 1360223 total
207#
208# --- _tmp/autoconf/syscalls/dash
209# 2592 _tmp/autoconf/syscalls/dash/syscalls.909344
210# 2607 _tmp/autoconf/syscalls/dash/syscalls.904921
211# 2630 _tmp/autoconf/syscalls/dash/syscalls.904581
212# 2683 _tmp/autoconf/syscalls/dash/syscalls.904486
213# 2851 _tmp/autoconf/syscalls/dash/syscalls.905109
214# 2873 _tmp/autoconf/syscalls/dash/syscalls.904528
215# 2920 _tmp/autoconf/syscalls/dash/syscalls.905088
216# 3204 _tmp/autoconf/syscalls/dash/syscalls.904858
217# 112922 _tmp/autoconf/syscalls/dash/syscalls.903626
218# 1372118 total
219#
220# --- _tmp/autoconf/syscalls/osh
221# 2592 _tmp/autoconf/syscalls/osh/syscalls.915226
222# 2607 _tmp/autoconf/syscalls/osh/syscalls.910993
223# 2630 _tmp/autoconf/syscalls/osh/syscalls.910647
224# 2679 _tmp/autoconf/syscalls/osh/syscalls.910561
225# 2851 _tmp/autoconf/syscalls/osh/syscalls.911162
226# 2873 _tmp/autoconf/syscalls/osh/syscalls.910599
227# 2920 _tmp/autoconf/syscalls/osh/syscalls.911143
228# 3204 _tmp/autoconf/syscalls/osh/syscalls.910936
229# 72921 _tmp/autoconf/syscalls/osh/syscalls.909769
230# 1211074 total
231
232report-processes() {
233 for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
234 echo "--- $sh_dir"
235 ls $sh_dir/* | wc -l
236 grep-exec $sh_dir/syscalls.* | wc -l
237 echo
238
239 done
240}
241
242report-external() {
243 local n=${1:-5}
244
245 for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
246 echo "--- $sh_dir"
247
248 grep-exec $sh_dir/syscalls.* | sort | uniq -c | sort -n | tail -n $n
249 echo
250 done
251}
252
253report-syscalls() {
254 # Hm this is instructive, the shell itself makes the most syscalls
255 # And fewer than other shells?
256
257 for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
258 echo "--- $sh_dir"
259 wc -l $sh_dir/syscalls.* | sort -n | tail
260 echo
261 done
262}
263
264#
265# Cachegrind
266#
267
268measure-valgrind() {
269 local tool=$1
270
271 # opt seems to give OK results, but I thought dbg was more accurate
272 #local osh=_bin/cxx-opt/osh
273 local osh=_bin/cxx-dbg/osh
274
275 ninja $osh
276
277 local osh=$REPO_ROOT/$osh
278
279 local base_dir=$REPO_ROOT/_tmp/$tool
280
281 local dir=$base_dir/cpython-configure
282 rm -r -f -v $dir
283
284 local out_file=$base_dir/cpython-configure.txt
285
286 mkdir -v -p $dir
287
288 pushd $dir
289 $tool $out_file $osh $PY_CONF
290 popd
291}
292
293measure-cachegrind() {
294 measure-valgrind with-cachegrind
295}
296
297measure-callgrind() {
298 # This takes ~5 minutes with opt binary, ~6:43 with dbg
299 # vs ~15 seconds uninstrumented
300 time measure-valgrind with-callgrind
301}
302
303# Note:
304# benchmarks/osh-runtime.sh compares our release, which does not have #ifdef
305# GC_TIMING, so we don't know total GC time.
306
307# TODO:
308#
309# - Run locally, reproduce GC_TIMING - this is not in the release build
310# - it seems to say only 143 ms total GC time, but we're seeing 1.5+ seconds
311# slowdown on Cpython configure vs. bash
312# - I want a local run that automates it, and returns PERCENTAGES for elapsed
313# time, sys time, user time
314# - We also might not want to amortize free() inside Allocate()
315# - #ifdef LAZY_FREE I think! That might show a big slowdown with free
316
317patch-pyconf() {
318 #sed -i $'s/ac_compile=\'$CC/ac_compile=\'times; $CC/g' $PY_CONF
319
320 # temporary
321 echo 'times > $SH_BENCHMARK_TIMES' >> $PY_CONF
322}
323
324measure-times() {
325 local osh=_bin/cxx-opt/osh
326 ninja $osh
327
328 local base_dir=$BASE_DIR/times
329 rm -r -f -v $base_dir
330
331 local trace_dir=$base_dir/oils-trace
332 mkdir -p $trace_dir
333
334 shell-tasks | while read -r sh_label sh_path; do
335 #case $sh_label in bash|dash) continue ;; esac
336
337 local dir=$base_dir/$sh_label
338 mkdir -p $dir
339
340 pushd $dir
341
342 local -a flags=(
343 --output "$base_dir/$sh_label.tsv"
344 --rusage
345 )
346
347 local -a time_argv
348
349 time_argv=(
350 time-tsv --print-header
351 "${flags[@]}"
352 --field sh_label
353 )
354 "${time_argv[@]}"
355
356 time_argv=(
357 time-tsv --append
358 "${flags[@]}"
359 --field "$sh_label"
360 -- $sh_path $PY_CONF
361 )
362
363 #echo "${time_argv[@]}"
364
365 # 1269 argv0.json files created
366 # we can miss some via NOLASTFORK optimization
367 #OILS_TRACE_DIR=$trace_dir \
368
369 _OILS_GC_VERBOSE=1 OILS_GC_STATS_FD=99 \
370 SH_BENCHMARK_TIMES=$base_dir/$sh_label.times.txt \
371 "${time_argv[@]}" \
372 99>$base_dir/$sh_label.gc-stats.txt
373
374 popd
375 done
376}
377
378inner-long-tsv() {
379 python2 -c '
380import os, re, sys
381
382def PrintRow(row):
383 print("\t".join(row))
384
385PrintRow(["shell", "who", "what", "seconds"])
386
387for path in sys.argv[1:]:
388 filename = os.path.basename(path)
389 shell = filename.split(".")[0]
390
391 f = open(path)
392 s = f.read()
393
394 secs = re.findall("0m([0-9.]+)s", s)
395 assert len(secs) == 4, secs
396
397 PrintRow([shell, "self", "user", secs[0]])
398 PrintRow([shell, "self", "sys", secs[1]])
399 PrintRow([shell, "child", "user", secs[2]])
400 PrintRow([shell, "child", "sys", secs[3]])
401
402 # Non-normalized, but OK
403 total_secs = sum(float(s) for s in secs)
404 PrintRow([shell, "both", "both", str(total_secs)])
405
406 ' $BASE_DIR/times/*.times.txt
407}
408
409compare-dim() {
410 # 8% more child system time
411 local who=${1:-child}
412 local what=${2:-user}
413
414 echo "=== $who $what ==="
415
416 # Annoying
417 # https://www.math.utah.edu/docs/info/gawk_8.html
418 # "If, for some reason, you need to force a number to be converted to a
419 # string, concatenate the empty string, "", with that number. To force a
420 # string to be converted to a number, add zero to that string."
421
422 cat $BASE_DIR/times-long.tsv | awk -v "who=$who" -v "what=$what" '
423 BEGIN {
424 TAB = "\t"
425
426 i = 0
427
428 printf "%s\t%s\t%s\t%s\n", "shell", "secs", "ratio", "diff secs"
429 }
430 $2 == who && $3 == what {
431 if (i == 0) {
432 first_secs = $4 + 0
433 }
434 i++
435
436 secs = $4 + 0
437 ratio = secs / first_secs
438 diff = secs - first_secs
439
440 # Need commas for OFMT to work correctly?
441 printf "%s\t%5.3f\t%5.3f\t%5.3f\n", $1, secs, ratio, diff
442 }
443 '
444
445 echo
446}
447
448compare-times() {
449 log "INNER"
450 log ''
451
452 compare-dim self user
453
454 compare-dim self sys
455
456 compare-dim child user
457
458 compare-dim child sys
459
460 compare-dim both both
461
462 # outer
463 log "OUTER"
464 log ''
465
466 compare-dim both elapsed
467
468 # These kinda match
469 return
470 compare-dim both user
471 compare-dim both sys
472}
473
474outer-long-tsv() {
475 log "=== outer times ==="
476 awk '
477 BEGIN {
478 i = 0
479
480 printf "%s\t%s\t%s\t%s\n", "shell", "who", "what", "seconds"
481 }
482 i == 0 {
483 #print "Skipping header"
484 i++
485 next
486 }
487 i >= 1 {
488 elapsed = $2 + 0
489 user = $3 + 0
490 sys = $4 + 0
491 sh_label = $6
492
493 printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "elapsed", elapsed
494 printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "user", user
495 printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "sys", sys
496
497 i++
498 }
499 ' $BASE_DIR/outer-wide.tsv
500}
501
502report-times() {
503 head $BASE_DIR/times/*.tsv
504 echo
505 head $BASE_DIR/times/*.times.txt
506 echo
507
508 inner-long-tsv | tee $BASE_DIR/inner-long.tsv
509 echo
510
511 tsv-concat $BASE_DIR/times/*.tsv | tee $BASE_DIR/outer-wide.tsv
512 outer-long-tsv | tee $BASE_DIR/outer-long.tsv
513 echo
514
515 tsv-concat $BASE_DIR/{inner,outer}-long.tsv | tee $BASE_DIR/times-long.tsv
516
517 compare-times
518}
519
520### Why is clone() taking longer according to strace?
521
522fork-tasks() {
523 echo "bash${TAB}bash"
524 echo "dash${TAB}dash"
525
526 # Hm this is noisy, but cxx-opt-sh does seem slower
527 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
528 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/osh"
529 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/mycpp-souffle/osh"
530 echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/mycpp-souffle/osh"
531}
532
533measure-fork() {
534 fork-tasks | while read -r sh_label sh_path; do
535 #case $sh_label in bash|dash) continue ;; esac
536
537 echo "=== $sh_path ==="
538
539 # Builtin is very fast
540 #time $sh_path -c 'for i in $(seq 100); do true; done'
541
542 # Hm this is very noisy
543 # TODO use hyperfine?
544 time $sh_path -c 'for i in $(seq 100); do /bin/true; done'
545
546 case $sh_label in
547 osh)
548 # Oops, we are not symlinking to the .stripped binary!
549 # This is explicitly done for symbols and benchmarking.
550 # Hm does that make it slower then?
551
552 ls -l -L $sh_path
553 ldd $sh_path
554 ;;
555 esac
556 done
557}
558
559# $ head _tmp/elapsed/*.times.txt
560# ==> _tmp/elapsed/bash.times.txt <==
561# 0m0.213s 0m0.477s
562# 0m8.233s 0m2.931s
563#
564# ==> _tmp/elapsed/dash.times.txt <==
565# 0m0.217s 0m0.463s
566# 0m8.281s 0m2.922s
567#
568# ==> _tmp/elapsed/osh.times.txt <==
569# 0m0.360s 0m0.720s
570# 0m8.790s 0m2.960s
571
572# shell user time - GC and allocs
573# shell system time - ???
574# child user time - ???
575# TODO: count how many processes this is.
576# It's more than 500 ms
577# Is that 500 processes, and 1 ms per process?
578
579fork-time() {
580 local osh=_bin/cxx-opt/osh
581
582 $osh -c 'time for i in {1..1000}; do true; done'
583 echo
584
585 $osh -c 'time for i in {1..1000}; do ( true ); done'
586 echo
587
588 # Does this increase fork time or no?
589 # Hm I can only get the forks up to 306ms for 1000, or 300 us
590 # But the HereDocWriter does dup() and so forth
591 $osh -c '
592echo ysh-parse
593time for i in {1..40}; do
594 . test/ysh-parse-errors.sh
595done
596times
597time for i in {1..1000}; do
598 ( true )
599done'
600 echo
601}
602
603"$@"