1 | #!/usr/bin/env bash
|
2 | #
|
3 | # Why is CPython configure slower under OSH?
|
4 | #
|
5 | # Usage:
|
6 | # benchmarks/autoconf.sh <function name>
|
7 | #
|
8 | # Examples:
|
9 | # $0 patch-pyconf # times builtin
|
10 | # $0 measure-times # time-tsv with gc stats
|
11 | # $0 report-times # times builtin
|
12 | #
|
13 | # $0 measure-syscalls
|
14 | # $0 report-syscalls
|
15 | # $0 report-processes
|
16 | # $0 report-external
|
17 | #
|
18 | # Simpler:
|
19 | # $0 measure-rusage # time-tsv
|
20 |
|
21 | set -o nounset
|
22 | set -o pipefail
|
23 | set -o errexit
|
24 |
|
25 | REPO_ROOT=$(cd "$(dirname $0)/.."; pwd)
|
26 |
|
27 | source benchmarks/cachegrind.sh # with-cachegrind
|
28 | source benchmarks/callgrind.sh # with-cachegrind
|
29 | source test/tsv-lib.sh # $TAB
|
30 |
|
31 | readonly BASE_DIR_RELATIVE=_tmp/autoconf
|
32 | readonly BASE_DIR=$REPO_ROOT/$BASE_DIR_RELATIVE
|
33 | readonly PY_CONF=$REPO_ROOT/Python-2.7.13/configure
|
34 |
|
35 | #
|
36 | # Trying to measure allocation/GC overhead
|
37 | #
|
38 | # This DOES NOT HELP because bumpleak/bumproot are **slower** on bigger heaps.
|
39 | # There's less cache locality!
|
40 | #
|
41 |
|
42 | cpython-configure-tasks() {
|
43 | local -a variants=( opt+bumpleak opt+bumproot opt )
|
44 | for v in ${variants[@]}; do
|
45 | echo "${v}${TAB}_bin/cxx-$v/osh"
|
46 | done
|
47 | echo "opt${TAB}_bin/cxx-opt/mycpp-souffle/osh"
|
48 | }
|
49 |
|
50 | cpython-setup() {
|
51 | cpython-configure-tasks | while read -r _ osh; do
|
52 | ninja $osh
|
53 | done
|
54 | }
|
55 |
|
56 | measure-alloc-overhead() {
|
57 | local base_dir=$BASE_DIR/alloc-overhead
|
58 | rm -r -f -v $base_dir
|
59 |
|
60 | cpython-configure-tasks | while read -r variant osh; do
|
61 | osh=$REPO_ROOT/$osh
|
62 |
|
63 | local task_dir=$base_dir/$variant
|
64 |
|
65 | mkdir -p $task_dir
|
66 | pushd $task_dir > /dev/null
|
67 |
|
68 | local -a flags=(
|
69 | --output "$base_dir/$variant.tsv"
|
70 | --rusage
|
71 | )
|
72 |
|
73 | local -a time_argv
|
74 |
|
75 | time_argv=(
|
76 | time-tsv --print-header
|
77 | "${flags[@]}"
|
78 | --field variant
|
79 | )
|
80 | "${time_argv[@]}"
|
81 |
|
82 | time_argv=(
|
83 | time-tsv --append
|
84 | "${flags[@]}"
|
85 | --field "$variant"
|
86 | -- $osh $PY_CONF
|
87 | )
|
88 |
|
89 | #echo "${time_argv[@]}"
|
90 | "${time_argv[@]}"
|
91 |
|
92 | popd > /dev/null
|
93 |
|
94 | done
|
95 | }
|
96 |
|
97 | #
|
98 | # Compare bash/dash/osh locally
|
99 | #
|
100 |
|
101 | measure-rusage() {
|
102 | local base_dir=$BASE_DIR/rusage
|
103 | rm -r -f -v $base_dir
|
104 |
|
105 | shell-tasks | while read -r sh_label sh_path; do
|
106 |
|
107 | local task_dir=$base_dir/$sh_label
|
108 |
|
109 | mkdir -p $task_dir
|
110 | pushd $task_dir > /dev/null
|
111 |
|
112 | local -a flags=(
|
113 | --output "$base_dir/$sh_label.tsv"
|
114 | --rusage
|
115 | )
|
116 |
|
117 | local -a time_argv
|
118 |
|
119 | time_argv=(
|
120 | time-tsv --print-header
|
121 | "${flags[@]}"
|
122 | --field sh_label
|
123 | )
|
124 | "${time_argv[@]}"
|
125 |
|
126 | time_argv=(
|
127 | time-tsv --append
|
128 | "${flags[@]}"
|
129 | --field "$sh_label"
|
130 | -- $sh_path $PY_CONF
|
131 | )
|
132 |
|
133 | #echo "${time_argv[@]}"
|
134 | "${time_argv[@]}"
|
135 |
|
136 | popd > /dev/null
|
137 |
|
138 | done
|
139 | }
|
140 |
|
141 | #
|
142 | # Now try strace
|
143 | #
|
144 |
|
145 | shell-tasks() {
|
146 | echo "bash${TAB}bash"
|
147 | echo "dash${TAB}dash"
|
148 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
149 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/mycpp-souffle/osh"
|
150 | }
|
151 |
|
152 | measure-syscalls() {
|
153 | local osh=_bin/cxx-opt/osh
|
154 | #local osh=_bin/cxx-dbg/osh
|
155 |
|
156 | ninja $osh
|
157 |
|
158 | local base_dir=$BASE_DIR/syscalls
|
159 |
|
160 | rm -r -f -v $base_dir
|
161 |
|
162 | shell-tasks | while read -r sh_label sh_path; do
|
163 | local dir=$base_dir/$sh_label
|
164 | mkdir -p $dir
|
165 |
|
166 | local counts_dir=$base_dir/$sh_label
|
167 | mkdir -p $counts_dir
|
168 |
|
169 | pushd $dir
|
170 | #strace -o $counts -c $sh_path $PY_CONF
|
171 | # See how many external processes are started?
|
172 | #strace -o $counts -ff -e execve $sh_path $PY_CONF
|
173 | strace -o $counts_dir/syscalls -ff $sh_path $PY_CONF
|
174 | popd
|
175 | done
|
176 | }
|
177 |
|
178 | # --- _tmp/autoconf/bash
|
179 | # 6047
|
180 | # 4621
|
181 | # --- _tmp/autoconf/dash
|
182 | # 6088
|
183 | # 4627
|
184 | # --- _tmp/autoconf/osh
|
185 | # 5691
|
186 | # 4631
|
187 | #
|
188 | # Woah we start fewer processes! But are not faster?
|
189 |
|
190 | grep-exec() {
|
191 | # --max-count 1 - only report one execve per process
|
192 | egrep --no-filename --max-count 1 -o 'execve\("[^"]+' "$@"
|
193 | }
|
194 |
|
195 | # andy@hoover:~/git/oilshell/oil$ benchmarks/autoconf.sh report-syscalls
|
196 | # --- _tmp/autoconf/syscalls/bash
|
197 | # 2592 _tmp/autoconf/syscalls/bash/syscalls.903220
|
198 | # 2608 _tmp/autoconf/syscalls/bash/syscalls.898727
|
199 | # 2632 _tmp/autoconf/syscalls/bash/syscalls.898387
|
200 | # 2679 _tmp/autoconf/syscalls/bash/syscalls.898292
|
201 | # 2853 _tmp/autoconf/syscalls/bash/syscalls.898927
|
202 | # 2873 _tmp/autoconf/syscalls/bash/syscalls.898334
|
203 | # 2920 _tmp/autoconf/syscalls/bash/syscalls.898895
|
204 | # 3204 _tmp/autoconf/syscalls/bash/syscalls.898664
|
205 | # 112549 _tmp/autoconf/syscalls/bash/syscalls.897471
|
206 | # 1360223 total
|
207 | #
|
208 | # --- _tmp/autoconf/syscalls/dash
|
209 | # 2592 _tmp/autoconf/syscalls/dash/syscalls.909344
|
210 | # 2607 _tmp/autoconf/syscalls/dash/syscalls.904921
|
211 | # 2630 _tmp/autoconf/syscalls/dash/syscalls.904581
|
212 | # 2683 _tmp/autoconf/syscalls/dash/syscalls.904486
|
213 | # 2851 _tmp/autoconf/syscalls/dash/syscalls.905109
|
214 | # 2873 _tmp/autoconf/syscalls/dash/syscalls.904528
|
215 | # 2920 _tmp/autoconf/syscalls/dash/syscalls.905088
|
216 | # 3204 _tmp/autoconf/syscalls/dash/syscalls.904858
|
217 | # 112922 _tmp/autoconf/syscalls/dash/syscalls.903626
|
218 | # 1372118 total
|
219 | #
|
220 | # --- _tmp/autoconf/syscalls/osh
|
221 | # 2592 _tmp/autoconf/syscalls/osh/syscalls.915226
|
222 | # 2607 _tmp/autoconf/syscalls/osh/syscalls.910993
|
223 | # 2630 _tmp/autoconf/syscalls/osh/syscalls.910647
|
224 | # 2679 _tmp/autoconf/syscalls/osh/syscalls.910561
|
225 | # 2851 _tmp/autoconf/syscalls/osh/syscalls.911162
|
226 | # 2873 _tmp/autoconf/syscalls/osh/syscalls.910599
|
227 | # 2920 _tmp/autoconf/syscalls/osh/syscalls.911143
|
228 | # 3204 _tmp/autoconf/syscalls/osh/syscalls.910936
|
229 | # 72921 _tmp/autoconf/syscalls/osh/syscalls.909769
|
230 | # 1211074 total
|
231 |
|
232 | report-processes() {
|
233 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
234 | echo "--- $sh_dir"
|
235 | ls $sh_dir/* | wc -l
|
236 | grep-exec $sh_dir/syscalls.* | wc -l
|
237 | echo
|
238 |
|
239 | done
|
240 | }
|
241 |
|
242 | report-external() {
|
243 | local n=${1:-5}
|
244 |
|
245 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
246 | echo "--- $sh_dir"
|
247 |
|
248 | grep-exec $sh_dir/syscalls.* | sort | uniq -c | sort -n | tail -n $n
|
249 | echo
|
250 | done
|
251 | }
|
252 |
|
253 | report-syscalls() {
|
254 | # Hm this is instructive, the shell itself makes the most syscalls
|
255 | # And fewer than other shells?
|
256 |
|
257 | for sh_dir in $BASE_DIR_RELATIVE/syscalls/*; do
|
258 | echo "--- $sh_dir"
|
259 | wc -l $sh_dir/syscalls.* | sort -n | tail
|
260 | echo
|
261 | done
|
262 | }
|
263 |
|
264 | #
|
265 | # Cachegrind
|
266 | #
|
267 |
|
268 | measure-valgrind() {
|
269 | local tool=$1
|
270 |
|
271 | # opt seems to give OK results, but I thought dbg was more accurate
|
272 | #local osh=_bin/cxx-opt/osh
|
273 | local osh=_bin/cxx-dbg/osh
|
274 |
|
275 | ninja $osh
|
276 |
|
277 | local osh=$REPO_ROOT/$osh
|
278 |
|
279 | local base_dir=$REPO_ROOT/_tmp/$tool
|
280 |
|
281 | local dir=$base_dir/cpython-configure
|
282 | rm -r -f -v $dir
|
283 |
|
284 | local out_file=$base_dir/cpython-configure.txt
|
285 |
|
286 | mkdir -v -p $dir
|
287 |
|
288 | pushd $dir
|
289 | $tool $out_file $osh $PY_CONF
|
290 | popd
|
291 | }
|
292 |
|
293 | measure-cachegrind() {
|
294 | measure-valgrind with-cachegrind
|
295 | }
|
296 |
|
297 | measure-callgrind() {
|
298 | # This takes ~5 minutes with opt binary, ~6:43 with dbg
|
299 | # vs ~15 seconds uninstrumented
|
300 | time measure-valgrind with-callgrind
|
301 | }
|
302 |
|
303 | # Note:
|
304 | # benchmarks/osh-runtime.sh compares our release, which does not have #ifdef
|
305 | # GC_TIMING, so we don't know total GC time.
|
306 |
|
307 | # TODO:
|
308 | #
|
309 | # - Run locally, reproduce GC_TIMING - this is not in the release build
|
310 | # - it seems to say only 143 ms total GC time, but we're seeing 1.5+ seconds
|
311 | # slowdown on Cpython configure vs. bash
|
312 | # - I want a local run that automates it, and returns PERCENTAGES for elapsed
|
313 | # time, sys time, user time
|
314 | # - We also might not want to amortize free() inside Allocate()
|
315 | # - #ifdef LAZY_FREE I think! That might show a big slowdown with free
|
316 |
|
317 | patch-pyconf() {
|
318 | #sed -i $'s/ac_compile=\'$CC/ac_compile=\'times; $CC/g' $PY_CONF
|
319 |
|
320 | # temporary
|
321 | echo 'times > $SH_BENCHMARK_TIMES' >> $PY_CONF
|
322 | }
|
323 |
|
324 | measure-times() {
|
325 | local osh=_bin/cxx-opt/osh
|
326 | ninja $osh
|
327 |
|
328 | local base_dir=$BASE_DIR/times
|
329 | rm -r -f -v $base_dir
|
330 |
|
331 | local trace_dir=$base_dir/oils-trace
|
332 | mkdir -p $trace_dir
|
333 |
|
334 | shell-tasks | while read -r sh_label sh_path; do
|
335 | #case $sh_label in bash|dash) continue ;; esac
|
336 |
|
337 | local dir=$base_dir/$sh_label
|
338 | mkdir -p $dir
|
339 |
|
340 | pushd $dir
|
341 |
|
342 | local -a flags=(
|
343 | --output "$base_dir/$sh_label.tsv"
|
344 | --rusage
|
345 | )
|
346 |
|
347 | local -a time_argv
|
348 |
|
349 | time_argv=(
|
350 | time-tsv --print-header
|
351 | "${flags[@]}"
|
352 | --field sh_label
|
353 | )
|
354 | "${time_argv[@]}"
|
355 |
|
356 | time_argv=(
|
357 | time-tsv --append
|
358 | "${flags[@]}"
|
359 | --field "$sh_label"
|
360 | -- $sh_path $PY_CONF
|
361 | )
|
362 |
|
363 | #echo "${time_argv[@]}"
|
364 |
|
365 | # 1269 argv0.json files created
|
366 | # we can miss some via NOLASTFORK optimization
|
367 | #OILS_TRACE_DIR=$trace_dir \
|
368 |
|
369 | _OILS_GC_VERBOSE=1 OILS_GC_STATS_FD=99 \
|
370 | SH_BENCHMARK_TIMES=$base_dir/$sh_label.times.txt \
|
371 | "${time_argv[@]}" \
|
372 | 99>$base_dir/$sh_label.gc-stats.txt
|
373 |
|
374 | popd
|
375 | done
|
376 | }
|
377 |
|
378 | inner-long-tsv() {
|
379 | python2 -c '
|
380 | import os, re, sys
|
381 |
|
382 | def PrintRow(row):
|
383 | print("\t".join(row))
|
384 |
|
385 | PrintRow(["shell", "who", "what", "seconds"])
|
386 |
|
387 | for path in sys.argv[1:]:
|
388 | filename = os.path.basename(path)
|
389 | shell = filename.split(".")[0]
|
390 |
|
391 | f = open(path)
|
392 | s = f.read()
|
393 |
|
394 | secs = re.findall("0m([0-9.]+)s", s)
|
395 | assert len(secs) == 4, secs
|
396 |
|
397 | PrintRow([shell, "self", "user", secs[0]])
|
398 | PrintRow([shell, "self", "sys", secs[1]])
|
399 | PrintRow([shell, "child", "user", secs[2]])
|
400 | PrintRow([shell, "child", "sys", secs[3]])
|
401 |
|
402 | # Non-normalized, but OK
|
403 | total_secs = sum(float(s) for s in secs)
|
404 | PrintRow([shell, "both", "both", str(total_secs)])
|
405 |
|
406 | ' $BASE_DIR/times/*.times.txt
|
407 | }
|
408 |
|
409 | compare-dim() {
|
410 | # 8% more child system time
|
411 | local who=${1:-child}
|
412 | local what=${2:-user}
|
413 |
|
414 | echo "=== $who $what ==="
|
415 |
|
416 | # Annoying
|
417 | # https://www.math.utah.edu/docs/info/gawk_8.html
|
418 | # "If, for some reason, you need to force a number to be converted to a
|
419 | # string, concatenate the empty string, "", with that number. To force a
|
420 | # string to be converted to a number, add zero to that string."
|
421 |
|
422 | cat $BASE_DIR/times-long.tsv | awk -v "who=$who" -v "what=$what" '
|
423 | BEGIN {
|
424 | TAB = "\t"
|
425 |
|
426 | i = 0
|
427 |
|
428 | printf "%s\t%s\t%s\t%s\n", "shell", "secs", "ratio", "diff secs"
|
429 | }
|
430 | $2 == who && $3 == what {
|
431 | if (i == 0) {
|
432 | first_secs = $4 + 0
|
433 | }
|
434 | i++
|
435 |
|
436 | secs = $4 + 0
|
437 | ratio = secs / first_secs
|
438 | diff = secs - first_secs
|
439 |
|
440 | # Need commas for OFMT to work correctly?
|
441 | printf "%s\t%5.3f\t%5.3f\t%5.3f\n", $1, secs, ratio, diff
|
442 | }
|
443 | '
|
444 |
|
445 | echo
|
446 | }
|
447 |
|
448 | compare-times() {
|
449 | log "INNER"
|
450 | log ''
|
451 |
|
452 | compare-dim self user
|
453 |
|
454 | compare-dim self sys
|
455 |
|
456 | compare-dim child user
|
457 |
|
458 | compare-dim child sys
|
459 |
|
460 | compare-dim both both
|
461 |
|
462 | # outer
|
463 | log "OUTER"
|
464 | log ''
|
465 |
|
466 | compare-dim both elapsed
|
467 |
|
468 | # These kinda match
|
469 | return
|
470 | compare-dim both user
|
471 | compare-dim both sys
|
472 | }
|
473 |
|
474 | outer-long-tsv() {
|
475 | log "=== outer times ==="
|
476 | awk '
|
477 | BEGIN {
|
478 | i = 0
|
479 |
|
480 | printf "%s\t%s\t%s\t%s\n", "shell", "who", "what", "seconds"
|
481 | }
|
482 | i == 0 {
|
483 | #print "Skipping header"
|
484 | i++
|
485 | next
|
486 | }
|
487 | i >= 1 {
|
488 | elapsed = $2 + 0
|
489 | user = $3 + 0
|
490 | sys = $4 + 0
|
491 | sh_label = $6
|
492 |
|
493 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "elapsed", elapsed
|
494 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "user", user
|
495 | printf "%s\t%s\t%s\t%5.3f\n", sh_label, "both", "sys", sys
|
496 |
|
497 | i++
|
498 | }
|
499 | ' $BASE_DIR/outer-wide.tsv
|
500 | }
|
501 |
|
502 | report-times() {
|
503 | head $BASE_DIR/times/*.tsv
|
504 | echo
|
505 | head $BASE_DIR/times/*.times.txt
|
506 | echo
|
507 |
|
508 | inner-long-tsv | tee $BASE_DIR/inner-long.tsv
|
509 | echo
|
510 |
|
511 | tsv-concat $BASE_DIR/times/*.tsv | tee $BASE_DIR/outer-wide.tsv
|
512 | outer-long-tsv | tee $BASE_DIR/outer-long.tsv
|
513 | echo
|
514 |
|
515 | tsv-concat $BASE_DIR/{inner,outer}-long.tsv | tee $BASE_DIR/times-long.tsv
|
516 |
|
517 | compare-times
|
518 | }
|
519 |
|
520 | ### Why is clone() taking longer according to strace?
|
521 |
|
522 | fork-tasks() {
|
523 | echo "bash${TAB}bash"
|
524 | echo "dash${TAB}dash"
|
525 |
|
526 | # Hm this is noisy, but cxx-opt-sh does seem slower
|
527 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/osh"
|
528 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/osh"
|
529 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt/mycpp-souffle/osh"
|
530 | echo "osh${TAB}$REPO_ROOT/_bin/cxx-opt-sh/mycpp-souffle/osh"
|
531 | }
|
532 |
|
533 | measure-fork() {
|
534 | fork-tasks | while read -r sh_label sh_path; do
|
535 | #case $sh_label in bash|dash) continue ;; esac
|
536 |
|
537 | echo "=== $sh_path ==="
|
538 |
|
539 | # Builtin is very fast
|
540 | #time $sh_path -c 'for i in $(seq 100); do true; done'
|
541 |
|
542 | # Hm this is very noisy
|
543 | # TODO use hyperfine?
|
544 | time $sh_path -c 'for i in $(seq 100); do /bin/true; done'
|
545 |
|
546 | case $sh_label in
|
547 | osh)
|
548 | # Oops, we are not symlinking to the .stripped binary!
|
549 | # This is explicitly done for symbols and benchmarking.
|
550 | # Hm does that make it slower then?
|
551 |
|
552 | ls -l -L $sh_path
|
553 | ldd $sh_path
|
554 | ;;
|
555 | esac
|
556 | done
|
557 | }
|
558 |
|
559 | # $ head _tmp/elapsed/*.times.txt
|
560 | # ==> _tmp/elapsed/bash.times.txt <==
|
561 | # 0m0.213s 0m0.477s
|
562 | # 0m8.233s 0m2.931s
|
563 | #
|
564 | # ==> _tmp/elapsed/dash.times.txt <==
|
565 | # 0m0.217s 0m0.463s
|
566 | # 0m8.281s 0m2.922s
|
567 | #
|
568 | # ==> _tmp/elapsed/osh.times.txt <==
|
569 | # 0m0.360s 0m0.720s
|
570 | # 0m8.790s 0m2.960s
|
571 |
|
572 | # shell user time - GC and allocs
|
573 | # shell system time - ???
|
574 | # child user time - ???
|
575 | # TODO: count how many processes this is.
|
576 | # It's more than 500 ms
|
577 | # Is that 500 processes, and 1 ms per process?
|
578 |
|
579 | fork-time() {
|
580 | local osh=_bin/cxx-opt/osh
|
581 |
|
582 | $osh -c 'time for i in {1..1000}; do true; done'
|
583 | echo
|
584 |
|
585 | $osh -c 'time for i in {1..1000}; do ( true ); done'
|
586 | echo
|
587 |
|
588 | # Does this increase fork time or no?
|
589 | # Hm I can only get the forks up to 306ms for 1000, or 300 us
|
590 | # But the HereDocWriter does dup() and so forth
|
591 | $osh -c '
|
592 | echo ysh-parse
|
593 | time for i in {1..40}; do
|
594 | . test/ysh-parse-errors.sh
|
595 | done
|
596 | times
|
597 | time for i in {1..1000}; do
|
598 | ( true )
|
599 | done'
|
600 | echo
|
601 | }
|
602 |
|
603 | "$@"
|