#!/usr/bin/env Rscript
#
# benchmarks/report.R -- Analyze data collected by shell scripts.
#
# Usage:
#   benchmarks/report.R ACTION IN_DIR OUT_DIR

# Suppress warnings about functions masked from 'package:stats' and 'package:base'
#   filter, lag
#   intersect, setdiff, setequal, union
library(dplyr, warn.conflicts = FALSE)
library(tidyr)  # spread()
library(stringr)

source('benchmarks/common.R')

options(stringsAsFactors = F)

# For pretty printing
commas = function(x) {
  format(x, big.mark=',')
}

sourceUrl = function(path) {
  sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
}

# Takes a filename, not a path.
sourceUrl2 = function(filename) {
  sprintf(
    'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
    filename)
}

mycppUrl = function(name) {
  sprintf('https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py', name)
}

genUrl = function(name) {
  sprintf('../../_gen/mycpp/examples/%s.mycpp.cc', name)
}


# TODO: Set up cgit because Github links are slow.
benchmarkDataLink = function(subdir, name, suffix) {
  #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
  sprintf('https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s',
          subdir, name, suffix)
}

provenanceLink = function(subdir, name, suffix) {
  sprintf('../%s/%s%s', subdir, name, suffix)
}


GetOshLabel = function(shell_hash, prov_dir) {
  ### Given a shell hash, return a readable label, by reading the
  ### recorded sh-path.txt under prov_dir.

  path = sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)

  if (file.exists(path)) {
    Log('Reading %s', path)
    lines = readLines(path)
    if (length(grep('_bin/osh', lines)) > 0) {
      label = 'osh-ovm'
    } else if (length(grep('bin/osh', lines)) > 0) {
      label = 'osh-cpython'
    } else if (length(grep('_bin/.*/mycpp-souffle/osh', lines)) > 0) {
      label = 'osh-native-souffle'
    } else if (length(grep('_bin/.*/osh', lines)) > 0) {
      label = 'osh-native'
    } else {
      stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
    }
  } else {
    stop(sprintf("%s doesn't exist", path))
  }
  return(label)
}

opt_suffix1 = '_bin/cxx-opt/osh'
opt_suffix2 = '_bin/cxx-opt-sh/osh'
opt_suffix3 = '_bin/cxx-opt/mycpp-souffle/osh'
opt_suffix4 = '_bin/cxx-opt-sh/mycpp-souffle/osh'

ShellLabels = function(shell_name, shell_hash, num_hosts) {
  ### Given 2 vectors, return a vector of readable labels.

  # TODO: Clean up callers.  Some metrics call this function with a
  # shell/runtime BASENAME, and others a PATH
  # - e.g. ComputeReport calls this with runtime_name, which is actually a PATH

  #Log('name %s', shell_name)
  #Log('hash %s', shell_hash)

  if (num_hosts == 1) {
    prov_dir = '_tmp'
  } else {
    prov_dir = '../benchmark-data/'
  }

  labels = c()
  for (i in 1:length(shell_name)) {
    sh = shell_name[i]
    if (sh == 'osh') {
      label = GetOshLabel(shell_hash[i], prov_dir)

    } else if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
      label = 'opt/osh'

    } else if (endsWith(sh, opt_suffix3) || endsWith(sh, opt_suffix4)) {
      label = 'opt/osh-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else {
      label = sh
    }

    Log('[%s] [%s]', shell_name[i], label)
    labels = c(labels, label)
  }

  return(labels)
}

# Simple version of the above, used by benchmarks/gc
ShellLabelFromPath = function(sh_path) {
  labels = c()
  for (i in 1:length(sh_path)) {
    sh = sh_path[i]

    if (endsWith(sh, opt_suffix1) || endsWith(sh, opt_suffix2)) {
      # the opt binary is osh-native
      label = 'osh-native'

    } else if (endsWith(sh, opt_suffix3) || endsWith(sh, opt_suffix4)) {
      # the opt binary built with mycpp-souffle
      label = 'osh-native-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else if (endsWith(sh, '_bin/osh')) {  # the app bundle
      label = 'osh-ovm'

    } else if (endsWith(sh, 'bin/osh')) {
      label = 'osh-cpython'

    } else {
      label = sh
    }
    labels = c(labels, label)
  }
  return(labels)
}
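
# A minimal sketch (not part of the report pipeline) of how the mapping above
# behaves.  The example paths are hypothetical and only exercise the
# endsWith() branches.
if (FALSE) {
  example_paths = c('/usr/bin/dash',
                    '_bin/cxx-opt/osh',
                    '_bin/cxx-opt-sh/mycpp-souffle/osh',
                    '/usr/local/bin/osh')
  print(ShellLabelFromPath(example_paths))
  # Expected labels: '/usr/bin/dash' (unchanged), 'osh-native',
  # 'osh-native-souffle', 'osh-cpython'
}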

DistinctHosts = function(t) {
  t %>% distinct(host_name, host_hash) -> distinct_hosts
  # The label is just the name
  distinct_hosts$host_label = distinct_hosts$host_name
  return(distinct_hosts)
}

DistinctShells = function(t, num_hosts = -1) {
  t %>% distinct(shell_name, shell_hash) -> distinct_shells

  Log('')
  Log('Labeling shells')

  # Calculate it if not passed
  if (num_hosts == -1) {
    num_hosts = nrow(DistinctHosts(t))
  }

  distinct_shells$shell_label = ShellLabels(distinct_shells$shell_name,
                                            distinct_shells$shell_hash,
                                            num_hosts)
  return(distinct_shells)
}
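
# Hypothetical illustration of the distinct/label steps: a toy data frame with
# duplicate (shell_name, shell_hash) rows collapses to one labeled row per
# shell.  Only a non-'osh' name is used, so no provenance files are read.
if (FALSE) {
  toy = tibble(host_name = c('lenny', 'lenny'), host_hash = c('aa', 'aa'),
               shell_name = c('bash', 'bash'), shell_hash = c('b1', 'b1'),
               elapsed_secs = c(1.0, 2.0))
  print(DistinctHosts(toy))   # 1 row; host_label == host_name
  print(DistinctShells(toy))  # 1 row; shell_label == 'bash'
}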

ParserReport = function(in_dir, out_dir) {
  times = read.csv(file.path(in_dir, 'times.csv'))
  lines = read.csv(file.path(in_dir, 'lines.csv'))
  raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))

  cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))

  # For joining by filename
  lines_by_filename = tibble(
    num_lines = lines$num_lines,
    filename = basename(lines$path)
  )

  # Remove failures
  times %>% filter(status == 0) %>% select(-c(status)) -> times
  cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind

  # Add the number of lines, joining on path, and compute lines/ms
  times %>%
    left_join(lines, by = c('path')) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path),
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           lines_per_ms = num_lines / elapsed_ms) %>%
    select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
    joined_times

  #print(head(times))
  #print(head(lines))
  #print(head(vm))
  #print(head(joined_times))

  print(summary(joined_times))

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  distinct_hosts = DistinctHosts(joined_times)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(joined_times)
  Log('')
  Log('Distinct shells')
  print(distinct_shells)

  # Replace name/hash combinations with labels.
  joined_times %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
    select(-c(host_name, host_hash, shell_name, shell_hash)) ->
    joined_times

  # Like 'times', but do shell_label as one step
  # Hack: we know benchmarks/auto.sh runs this on one machine
  distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
  cachegrind %>%
    left_join(lines, by = c('path')) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
    select(-c(shell_name, shell_hash)) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
    select(-c(path)) ->
    joined_cachegrind

  Log('summary(joined_times):')
  print(summary(joined_times))
  Log('head(joined_times):')
  print(head(joined_times))

  # Summarize rates by platform/shell
  joined_times %>%
    mutate(host_label = paste("host", host_label)) %>%
    group_by(host_label, shell_label) %>%
    summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
    mutate(lines_per_ms = total_lines / total_ms) %>%
    select(-c(total_ms)) %>%
    spread(key = host_label, value = lines_per_ms) ->
    times_summary

  # Sort by parsing rate on machine 1
  if ("host hoover" %in% colnames(times_summary)) {
    times_summary %>% arrange(desc(`host hoover`)) -> times_summary
  } else {
    times_summary %>% arrange(desc(`host no-host`)) -> times_summary
  }

  Log('times_summary:')
  print(times_summary)

  # Summarize cachegrind by platform/shell
  # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
  joined_cachegrind %>%
    group_by(shell_label) %>%
    summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
    mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
    select(-c(total_irefs)) ->
    cachegrind_summary

  if ("no-host" %in% distinct_hosts$host_label) {

    # We don't have all the shells
    elapsed = NULL
    rate = NULL
    max_rss = NULL
    instructions = NULL

    joined_times %>%
      select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
               num_lines, filename, filename_HREF)) %>%
      arrange(filename, elapsed_ms) ->
      times_flat

    joined_cachegrind %>%
      select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
      arrange(filename, irefs) ->
      cachegrind_flat

  } else {

    times_flat = NULL
    cachegrind_flat = NULL

    # Hack for release. TODO: unify with Soil
    if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
      souffle_col = c('osh-native-souffle')
    } else {
      souffle_col = c()
    }

    cols1 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'osh_to_bash_ratio', 'num_lines', 'filename', 'filename_HREF')

    # Elapsed seconds for each shell by platform and file
    joined_times %>%
      select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = elapsed_ms) %>%
      arrange(host_label, num_lines) %>%
      mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
      select(all_of(cols1)) ->
      elapsed

    Log('\n')
    Log('ELAPSED')
    print(elapsed)

    cols2 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')
    # Rates by file and shell
    joined_times %>%
      select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = lines_per_ms) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      rate

    Log('\n')
    Log('RATE')
    print(rate)

    # Memory usage by file
    joined_times %>%
      select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
      spread(key = shell_label, value = max_rss_MB) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      max_rss

    Log('\n')
    Log('MAX RSS')
    print(max_rss)

    Log('\n')
    Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
    print(joined_cachegrind)
    #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))

    cols3 = c('bash', 'dash', 'mksh', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')

    # Cachegrind instructions by file
    joined_cachegrind %>%
      mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
      select(-c(irefs)) %>%
      spread(key = shell_label, value = thousand_irefs_per_line) %>%
      arrange(num_lines) %>%
      select(all_of(cols3)) ->
      instructions

    Log('\n')
    Log('instructions has %d rows', nrow(instructions))
    print(instructions)
  }

  WriteProvenance(distinct_hosts, distinct_shells, out_dir)

  raw_data_table = tibble(
    filename = basename(as.character(raw_data$path)),
    filename_HREF = benchmarkDataLink('osh-parser', filename, '')
  )
  #print(raw_data_table)

  writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))

  precision = SamePrecision(0)  # lines per ms
  writeCsv(times_summary, file.path(out_dir, 'summary'), precision)

  precision = ColumnPrecision(list(), default = 1)
  writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)

  if (!is.null(times_flat)) {
    precision = SamePrecision(0)
    writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
  }

  if (!is.null(cachegrind_flat)) {
    precision = SamePrecision(0)
    writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
  }

  if (!is.null(elapsed)) {  # i.e. not the 'no-host' case above
    # Round to nearest millisecond, but the ratio has a decimal point.
    precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
    writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)

    precision = SamePrecision(0)
    writeCsv(rate, file.path(out_dir, 'rate'), precision)

    writeCsv(max_rss, file.path(out_dir, 'max_rss'))

    precision = SamePrecision(1)
    writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
  }

  Log('Wrote %s', out_dir)
}
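
# The wide tables above all follow the same tidyr::spread() pattern: one row
# per (host, file), one column per shell label.  A self-contained toy version
# with made-up numbers, for reference:
if (FALSE) {
  toy = tibble(host_label = 'host A',
               filename = c('x.sh', 'x.sh'),
               shell_label = c('bash', 'osh-native'),
               elapsed_ms = c(100, 200))
  toy %>%
    spread(key = shell_label, value = elapsed_ms) %>%
    mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
    print()
  # 1 row: bash = 100, osh-native = 200, osh_to_bash_ratio = 2
}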

WriteProvenance = function(distinct_hosts, distinct_shells, out_dir, tsv = F) {

  num_hosts = nrow(distinct_hosts)
  if (num_hosts == 1) {
    linkify = provenanceLink
  } else {
    linkify = benchmarkDataLink
  }

  Log('distinct_hosts')
  print(distinct_hosts)
  Log('')

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  # Should be:
  # host_id_url
  # And then csv_to_html will be smart enough? It should take --url flag?
  host_table = tibble(
    host_label = distinct_hosts$host_label,
    host_id = paste(distinct_hosts$host_name,
                    distinct_hosts$host_hash, sep='-'),
    host_id_HREF = linkify('host-id', host_id, '/')
  )
  Log('host_table')
  print(host_table)
  Log('')

  shell_table = tibble(
    shell_label = distinct_shells$shell_label,
    shell_id = paste(distinct_shells$shell_name,
                     distinct_shells$shell_hash, sep='-'),
    shell_id_HREF = linkify('shell-id', shell_id, '/')
  )

  Log('shell_table')
  print(shell_table)
  Log('')

  if (tsv) {
    writeTsv(host_table, file.path(out_dir, 'hosts'))
    writeTsv(shell_table, file.path(out_dir, 'shells'))
  } else {
    writeCsv(host_table, file.path(out_dir, 'hosts'))
    writeCsv(shell_table, file.path(out_dir, 'shells'))
  }
}

WriteSimpleProvenance = function(provenance, out_dir) {
  Log('provenance')
  print(provenance)
  Log('')

  # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
  # benchmarks/id.sh publish-shell-id uses
  provenance %>%
    mutate(shell_name = basename(sh_path)) %>%
    distinct(shell_label, shell_name, shell_hash) ->
    distinct_shells

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  provenance %>% distinct(host_label, host_name, host_hash) -> distinct_hosts

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
}

RuntimeReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'times.tsv'))

  gc_stats = readTsv(file.path(in_dir, 'gc_stats.tsv'))
  provenance = readTsv(file.path(in_dir, 'provenance.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some osh-runtime tasks failed')
  }

  # Joins:
  #   times <= sh_path => provenance
  #   times <= join_id, host_name => gc_stats

  # TODO: provenance may have rows from 2 machines.  Could validate them and
  # deduplicate.

  # It should have (host_label, host_name, host_hash)
  #                (shell_label, sh_path, shell_hash)
  provenance %>%
    mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path)) ->
    provenance

  provenance %>% distinct(sh_path, shell_label) -> label_lookup

  Log('label_lookup')
  print(label_lookup)

  # Join with provenance for host label and shell label
  times %>%
    select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
             host_name, sh_path, workload)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload) ->
    details

  times %>%
    select(c(task_id, host_name, sh_path, workload, minor_faults, major_faults,
             swaps, in_block, out_block, signals, voluntary_ctx, involuntary_ctx)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload) ->
    details_io

  Log('details')
  print(details)

  # Hack for release. TODO: unify with Soil
  if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
    souffle_col = c('osh-native-souffle')
  } else {
    souffle_col = c()
  }

  cols2 = c('workload', 'host_name',
            'bash', 'dash', 'osh-cpython', 'osh-native', souffle_col,
            'py_bash_ratio', 'native_bash_ratio')

  # Elapsed time comparison
  details %>%
    select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
    spread(key = shell_label, value = elapsed_ms) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    elapsed

  Log('elapsed')
  print(elapsed)

  # Minor Page Faults Comparison
  details_io %>%
    select(c(host_name, shell_label, workload, minor_faults)) %>%
    spread(key = shell_label, value = minor_faults) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    page_faults

  Log('page_faults')
  print(page_faults)

  # Max RSS comparison
  details %>%
    select(c(host_name, shell_label, workload, max_rss_MB)) %>%
    spread(key = shell_label, value = max_rss_MB) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    max_rss

  Log('max rss')
  print(max_rss)

  details %>%
    select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
    mutate(join_id = sprintf("gc-%d", task_id)) %>%
    select(-c(task_id)) ->
    gc_details

  Log('GC details')
  print(gc_details)
  Log('')

  Log('GC stats')
  print(gc_stats)
  Log('')

  gc_stats %>%
    left_join(gc_details, by = c('join_id', 'host_name')) %>%
    select(-c(join_id, roots_capacity, objs_capacity)) %>%
    # Do same transformations as GcReport()
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    select(-c(bytes_allocated)) %>%
    rename(num_gc_done = num_collections) %>%
    # Put these columns first
    relocate(workload, host_name,
             elapsed_ms, max_gc_millis, total_gc_millis,
             allocated_MB, max_rss_MB, num_allocated) ->
    gc_stats

  Log('After GC stats')
  print(gc_stats)
  Log('')

  WriteSimpleProvenance(provenance, out_dir)

  # milliseconds don't need a decimal digit
  precision = ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
                                   `osh-native` = 0, `osh-native-souffle` = 0,
                                   py_bash_ratio = 2, native_bash_ratio = 2))
  writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
  writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)

  precision2 = ColumnPrecision(list(py_bash_ratio = 2, native_bash_ratio = 2))
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  precision3 = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                               default = 0)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)

  writeTsv(details, file.path(out_dir, 'details'), precision3)
  writeTsv(details_io, file.path(out_dir, 'details_io'))

  Log('Wrote %s', out_dir)
}
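
# The gc_stats join above relies on the "gc-<task_id>" join_id convention.
# A hypothetical two-row example of the left_join by (join_id, host_name):
if (FALSE) {
  gc = tibble(join_id = c('gc-1', 'gc-2'), host_name = 'h1',
              num_collections = c(3, 5))
  d = tibble(join_id = c('gc-1', 'gc-2'), host_name = 'h1',
             workload = c('parse.abuild', 'ex.compute-fib'),
             elapsed_ms = c(10, 20))
  # Each gc row picks up the workload and elapsed_ms of its task
  gc %>% left_join(d, by = c('join_id', 'host_name')) %>% print()
}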

VmBaselineReport = function(in_dir, out_dir) {
  vm = readTsv(file.path(in_dir, 'vm-baseline.tsv'))
  #print(vm)

  # Not using DistinctHosts() because field host_hash isn't collected
  num_hosts = nrow(vm %>% distinct(host))

  vm %>%
    rename(kib = metric_value) %>%
    mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
           megabytes = kib * 1024 / 1e6) %>%
    select(-c(shell_name, kib)) %>%
    spread(key = c(metric_name), value = megabytes) %>%
    rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
    select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
    arrange(shell_label, shell_hash, host, VmPeak_MB) ->
    vm

  print(vm)

  writeTsv(vm, file.path(out_dir, 'vm-baseline'))
}

WriteOvmBuildDetails = function(distinct_hosts, distinct_compilers, out_dir) {
  host_table = tibble(
    host_label = distinct_hosts$host_label,
    host_id = paste(distinct_hosts$host_name,
                    distinct_hosts$host_hash, sep='-'),
    host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
  )
  print(host_table)

  dc = distinct_compilers
  compiler_table = tibble(
    compiler_label = dc$compiler_label,
    compiler_id = paste(dc$compiler_label, dc$compiler_hash, sep='-'),
    compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
  )
  print(compiler_table)

  writeTsv(host_table, file.path(out_dir, 'hosts'))
  writeTsv(compiler_table, file.path(out_dir, 'compilers'))
}

OvmBuildReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'times.tsv'))
  native_sizes = readTsv(file.path(in_dir, 'native-sizes.tsv'))
  #raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some ovm-build tasks failed')
  }

  times %>% distinct(host_name, host_hash) -> distinct_hosts
  distinct_hosts$host_label = distinct_hosts$host_name

  times %>% distinct(compiler_path, compiler_hash) -> distinct_compilers
  distinct_compilers$compiler_label = basename(distinct_compilers$compiler_path)

  #print(distinct_hosts)
  #print(distinct_compilers)

  WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)

  times %>%
    select(-c(status)) %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
    select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
    mutate(src_dir = basename(src_dir),
           host_label = paste("host ", host_label),
           is_conf = str_detect(action, 'configure'),
           is_ovm = str_detect(action, 'oil.ovm'),
           is_dbg = str_detect(action, 'dbg'),
           ) %>%
    select(host_label, src_dir, compiler_label, action, is_conf, is_ovm, is_dbg,
           elapsed_secs) %>%
    spread(key = c(host_label), value = elapsed_secs) %>%
    arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
    select(-c(is_conf, is_ovm, is_dbg)) ->
    times

  #print(times)

  # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
  native_sizes %>%
    select(c(host_label, path, num_bytes)) %>%
    mutate(host_label = paste("host ", host_label),
           binary = basename(path),
           compiler = basename(dirname(path)),
           ) %>%
    select(-c(path)) %>%
    spread(key = c(host_label), value = num_bytes) %>%
    arrange(compiler, binary) ->
    native_sizes

  # NOTE: These don't have the host and compiler.
  writeTsv(times, file.path(out_dir, 'times'))
  writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))

  # TODO: I want a size report too
  #writeCsv(sizes, file.path(out_dir, 'sizes'))
}

unique_stdout_md5sum = function(t, num_expected) {
  u = n_distinct(t$stdout_md5sum)
  if (u != num_expected) {
    t %>% select(c(host_name, task_name, arg1, arg2, runtime_name, stdout_md5sum)) %>%
      print()
    stop(sprintf('Expected %d unique md5sums, got %d', num_expected, u))
  }
}
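
# unique_stdout_md5sum() is a correctness check: every runtime should produce
# byte-identical stdout for a task.  A hypothetical passing and failing case:
if (FALSE) {
  ok = tibble(host_name = 'h1', task_name = 'hello', arg1 = NA, arg2 = NA,
              runtime_name = c('bash', 'dash'),
              stdout_md5sum = c('abc', 'abc'))
  unique_stdout_md5sum(ok, 1)     # silent: exactly 1 distinct md5sum

  bad = ok
  bad$stdout_md5sum = c('abc', 'xyz')
  # unique_stdout_md5sum(bad, 1)  # would print the rows and stop()
}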

ComputeReport = function(in_dir, out_dir) {
  # TSV file, not CSV
  times = read.table(file.path(in_dir, 'times.tsv'), header=T)
  print(times)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some compute tasks failed')
  }

  #
  # Check correctness
  #

  times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
  # 3 different inputs
  times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)

  times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)

  # TODO:
  # - oils_cpp doesn't implement unicode LANG=C
  # - bash behaves differently on your desktop vs. in the container
  # - might need layer-locales in the image?

  #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
  # Ditto here
  #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  # Runtimes are called shells, as a hack for code reuse
  times %>%
    mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
    select(c(host_name, host_hash, shell_name, shell_hash)) ->
    tmp

  distinct_hosts = DistinctHosts(tmp)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(tmp)
  Log('')
  Log('Distinct runtimes')
  print(distinct_shells)

  num_hosts = nrow(distinct_hosts)

  times %>%
    select(-c(status, stdout_md5sum, stdout_filename, host_hash, runtime_hash)) %>%
    mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(runtime_name, elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    arrange(host_name, task_name, arg1, arg2, user_ms) ->
    details

  times %>%
    mutate(
      runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
      stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
    select(c(host_name, task_name, arg1, arg2, runtime_label,
             stdout_md5sum, stdout_md5sum_HREF)) ->
    stdout_files

  details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
  details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
  details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
  # There's no arg2
  details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help

  details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
  details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>%
    select(-c(task_name)) -> palindrome

  precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
  writeTsv(details, file.path(out_dir, 'details'), precision)

  writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)

  writeTsv(hello, file.path(out_dir, 'hello'), precision)
  writeTsv(fib, file.path(out_dir, 'fib'), precision)
  writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
  writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)

  writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
  writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
}

WriteOneTask = function(times, out_dir, task_name, precision) {
  times %>%
    filter(task == task_name) %>%
    select(-c(task)) -> subset

  writeTsv(subset, file.path(out_dir, task_name), precision)
}

SHELL_ORDER = c('dash',
                'bash',
                'zsh',
                '_bin/cxx-opt+bumpleak/osh',
                '_bin/cxx-opt+bumproot/osh',
                '_bin/cxx-opt+bumpsmall/osh',
                '_bin/cxx-opt/osh',
                '_bin/cxx-opt+nopool/osh')
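
# GcReport() and GcCachegrindReport() below use factor(sh_path, levels =
# SHELL_ORDER) only to sort rows into this canonical shell order.  A toy
# illustration with a subset of the paths:
if (FALSE) {
  toy = tibble(sh_path = c('_bin/cxx-opt/osh', 'dash', 'bash'), secs = 1:3)
  toy %>% arrange(factor(sh_path, levels = SHELL_ORDER)) %>% print()
  # rows come out as: dash, bash, _bin/cxx-opt/osh
}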

GcReport = function(in_dir, out_dir) {
  times = read.table(file.path(in_dir, 'raw/times.tsv'), header=T)
  gc_stats = read.table(file.path(in_dir, 'stage1/gc_stats.tsv'), header=T)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  # Change units and order columns
  times %>%
    arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           shell_label = ShellLabelFromPath(sh_path)
           ) %>%
    select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB, shell_label,
             shell_runtime_opts)) ->
    times

  # Join and order columns
  gc_stats %>% left_join(times, by = c('join_id')) %>%
    arrange(desc(task)) %>%
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    # try to make the table skinnier
    rename(num_gc_done = num_collections) %>%
    select(task, elapsed_ms, max_gc_millis, total_gc_millis,
           allocated_MB, max_rss_MB, num_allocated,
           num_gc_points, num_gc_done, gc_threshold, num_growths, max_survived,
           shell_label) ->
    gc_stats

  times %>% select(-c(join_id)) -> times

  precision = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                              default = 0)

  writeTsv(times, file.path(out_dir, 'times'), precision)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)

  tasks = c('parse.configure-coreutils',
            'parse.configure-cpython',
            'parse.abuild',
            'ex.compute-fib',
            'ex.bashcomp-parse-help',
            'ex.abuild-print-help')
  # Write out separate rows
  for (task in tasks) {
    WriteOneTask(times, out_dir, task, precision)
  }
}

GcCachegrindReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'raw/times.tsv'))
  counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  print(times)
  print(counts)

  counts %>% left_join(times, by = c('join_id')) %>%
    mutate(million_irefs = irefs / 1e6) %>%
    select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
    arrange(factor(sh_path, levels = SHELL_ORDER)) ->
    counts

  precision = NULL
  tasks = c('parse.abuild', 'ex.compute-fib')
  for (task in tasks) {
    WriteOneTask(counts, out_dir, task, precision)
  }
}

MyCppReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
  print(times)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some mycpp tasks failed')
  }

  # Don't care about elapsed and system
  times %>% select(-c(status, elapsed_secs, bin, task_out)) %>%
    mutate(example_name_HREF = mycppUrl(example_name),
           gen = c('gen'),
           gen_HREF = genUrl(example_name),
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(user_secs, sys_secs, max_rss_KiB)) ->
    details

  details %>% select(-c(sys_ms, max_rss_MB)) %>%
    spread(key = impl, value = user_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    user_time

  details %>% select(-c(user_ms, max_rss_MB)) %>%
    spread(key = impl, value = sys_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    sys_time

  details %>% select(-c(user_ms, sys_ms)) %>%
    spread(key = impl, value = max_rss_MB) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    max_rss

  # Sometimes it speeds up by more than 10x
  precision1 = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
  writeTsv(user_time, file.path(out_dir, 'user_time'), precision1)
  writeTsv(sys_time, file.path(out_dir, 'sys_time'), precision1)

  precision2 = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  writeTsv(details, file.path(out_dir, 'details'))
}

UftraceTaskReport = function(env, task_name, summaries) {
  # Need this again after redirect
  MaybeDisableColor(stdout())

  task_env = env[[task_name]]

  untyped = task_env$untyped
  typed = task_env$typed
  strings = task_env$strings
  slabs = task_env$slabs
  reserve = task_env$reserve

  string_overhead = 17  # GC header (8) + len (4) + hash value (4) + NUL (1)
  strings %>% mutate(obj_len = str_len + string_overhead) -> strings

  # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
  # big/small
  #
  # And then zoom in on distributions as well

  num_allocs = nrow(untyped)
  total_bytes = sum(untyped$obj_len)

  untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
  #print(untyped_hist)

  untyped_hist %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_allocs) ->
    alloc_sizes

  a24 = untyped_hist %>% filter(obj_len <= 24)
  a48 = untyped_hist %>% filter(obj_len <= 48)
  a96 = untyped_hist %>% filter(obj_len <= 96)

  allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
  allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
  allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs

  Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)

  options(tibble.print_min=25)

  Log('')
  Log('All allocations')
  print(alloc_sizes %>% head(22))
  print(alloc_sizes %>% tail(5))

  Log('')
  Log('Common Sizes')
  print(untyped_hist %>% arrange(desc(n)) %>% head(8))

  Log('')
  Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
  Log('')

  Log('Typed allocations')

  num_typed = nrow(typed)

  typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
    mutate(percent = n * 100.0 / num_typed) %>%
    arrange(desc(n)) -> most_common_types

  print(most_common_types %>% head(20))
  print(most_common_types %>% tail(5))

  lists = typed %>% filter(str_starts(func_name, ('List<')))
  #print(lists)

  num_lists = nrow(lists)
  total_list_bytes = num_lists * 24  # sizeof List<T> head is hard-coded

  Log('')
  Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
  Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
  Log('')

  #
  # Strings
  #

  num_strings = nrow(strings)
  total_string_bytes = sum(strings$obj_len)

  strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_strings) ->
    string_lengths

  strs_6_bytes_or_less = string_lengths %>% filter(str_len == 6) %>% select(percent)
  strs_14_bytes_or_less = string_lengths %>% filter(str_len == 14) %>% select(percent)

  # Parse workload
  # 62% of strings <= 6 bytes
  # 84% of strings <= 14 bytes
  Log('Str - NewStr() and OverAllocatedStr()')
  print(string_lengths %>% head(16))
  print(string_lengths %>% tail(5))
  Log('')

  Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
      commas(sum(strings$str_len)), commas(total_string_bytes))
  Log('')
  Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
  Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
  Log('')

  #
  # Slabs
  #

  Log('NewSlab()')

  num_slabs = nrow(slabs)
  slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_slabs) ->
    slab_lengths

  slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
    arrange(desc(n)) -> slab_types

  Log(' Lengths')
  print(slab_lengths %>% head())
  print(slab_lengths %>% tail(5))
  Log('')

  Log(' Slab Types')
  print(slab_types %>% head())
  print(slab_types %>% tail(5))
  Log('')

  total_slab_items = sum(slabs$slab_len)

  Log('%s slabs, total items = %s', commas(num_slabs),
      commas(sum(slabs$slab_len)))
  Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
  Log('')

  #
  # reserve() calls
  #

  # There should be strictly more List::reserve() calls than NewSlab

  Log('::reserve(int n)')
  Log('')

  num_reserve = nrow(reserve)
  reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_reserve) ->
    reserve_args

  Log(' Num Items')
  print(reserve_args %>% head(15))
  print(reserve_args %>% tail(5))
  Log('')

  Log('%s reserve() calls, total items = %s', commas(num_reserve),
      commas(sum(reserve$num_items)))
  Log('')

  # Accounting for all allocations!
  Log('Untyped: %s', commas(num_allocs))
  Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
  Log('')

  num_other_typed = num_typed - num_lists

  # Summary table
  stats = tibble(task = task_name,
                 total_bytes_ = commas(total_bytes),
                 num_allocs_ = commas(num_allocs),
                 sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
                 num_reserve_calls = commas(num_reserve),

                 percent_list_allocs = Percent(num_lists, num_allocs),
                 percent_slab_allocs = Percent(num_slabs, num_allocs),
                 percent_string_allocs = Percent(num_strings, num_allocs),
                 percent_other_typed_allocs = Percent(num_other_typed, num_allocs),

                 percent_list_bytes = Percent(total_list_bytes, total_bytes),
                 percent_string_bytes = Percent(total_string_bytes, total_bytes),

                 allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
                 allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
                 allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),

                 strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
                 strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less),
                 )
  summaries$stats[[task_name]] = stats

  summaries$most_common_types[[task_name]] = most_common_types
}
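
# The histograms above (alloc_sizes, string_lengths, slab_lengths,
# reserve_args) all use the same cumsum() idiom: count per size, then a
# running "percent of events at or below this size".  Toy version with
# made-up sizes:
if (FALSE) {
  sizes = tibble(obj_len = c(16, 16, 24, 48))
  sizes %>% group_by(obj_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / nrow(sizes)) %>%
    print()
  # obj_len 16 -> 50%, 24 -> 75%, 48 -> 100%
}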

LoadUftraceTsv = function(in_dir, env) {
  for (task in list.files(in_dir)) {
    Log('Loading data for task %s', task)
    base_dir = file.path(in_dir, task)

    task_env = new.env()
    env[[task]] = task_env

    # TSV file, not CSV
    task_env$untyped = readTsv(file.path(base_dir, 'all-untyped.tsv'))
    task_env$typed = readTsv(file.path(base_dir, 'typed.tsv'))
    task_env$strings = readTsv(file.path(base_dir, 'strings.tsv'))
    task_env$slabs = readTsv(file.path(base_dir, 'slabs.tsv'))
    task_env$reserve = readTsv(file.path(base_dir, 'reserve.tsv'))

    # median string length is 4, mean is 9.5!
    Log('UNTYPED')
    print(summary(task_env$untyped))
    Log('')

    Log('TYPED')
    print(summary(task_env$typed))
    Log('')

    Log('STRINGS')
    print(summary(task_env$strings))
    Log('')

    Log('SLABS')
    print(summary(task_env$slabs))
    Log('')

    Log('RESERVE')
    print(summary(task_env$reserve))
    Log('')
  }
}

Percent = function(n, total) {
  sprintf('%.1f%%', n * 100.0 / total)
}

PrettyPrintLong = function(d) {
  tr = t(d)  # transpose

  row_names = rownames(tr)

  for (i in 1:nrow(tr)) {
    row_name = row_names[i]
    cat(sprintf('%26s', row_name))  # calculated min width manually
    cat(sprintf('%20s', tr[i,]))
    cat('\n')

    # Extra spacing
    if (row_name %in% c('num_reserve_calls',
                        'percent_string_bytes',
                        'percent_other_typed_allocs',
                        'allocs_96_bytes_or_less')) {
      cat('\n')
    }
  }
}
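
# PrettyPrintLong() prints a one-row-per-task summary sideways, one metric per
# line (name right-aligned, value right-aligned next to it).  A hypothetical
# example:
if (FALSE) {
  PrettyPrintLong(tibble(task = 'parse.abuild', num_allocs_ = '1,234'))
  # prints two lines: 'task  parse.abuild' and 'num_allocs_  1,234',
  # each padded to the fixed column widths above
}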


UftraceReport = function(env, out_dir) {
  # summaries$stats should be a list of 1-row data frames
  # summaries$most_common_types should be a list of types
  summaries = new.env()

  for (task_name in names(env)) {
    report_out = file.path(out_dir, paste0(task_name, '.txt'))

    Log('Making report for task %s -> %s', task_name, report_out)

    sink(file = report_out)
    UftraceTaskReport(env, task_name, summaries)
    sink()  # reset
  }
  Log('')

  # Concatenate all the data frames added to the summary
  stats = bind_rows(as.list(summaries$stats))

  sink(file = file.path(out_dir, 'summary.txt'))
  #print(stats)
  #Log('')

  PrettyPrintLong(stats)
  Log('')

  mct = summaries$most_common_types
  for (task_name in names(mct)) {
    Log('Common types in workload %s', task_name)
    Log('')

    print(mct[[task_name]] %>% head(5))
    Log('')
  }
  sink()

  # For the REPL
  return(list(stats = stats))
}

main = function(argv) {
  action = argv[[1]]
  in_dir = argv[[2]]
  out_dir = argv[[3]]

  if (action == 'osh-parser') {
    ParserReport(in_dir, out_dir)

  } else if (action == 'osh-runtime') {
    RuntimeReport(in_dir, out_dir)

  } else if (action == 'vm-baseline') {
    VmBaselineReport(in_dir, out_dir)

  } else if (action == 'ovm-build') {
    OvmBuildReport(in_dir, out_dir)

  } else if (action == 'compute') {
    ComputeReport(in_dir, out_dir)

  } else if (action == 'gc') {
    GcReport(in_dir, out_dir)

  } else if (action == 'gc-cachegrind') {
    GcCachegrindReport(in_dir, out_dir)

  } else if (action == 'mycpp') {
    MyCppReport(in_dir, out_dir)

  } else if (action == 'uftrace') {
    d = new.env()
    LoadUftraceTsv(in_dir, d)
    UftraceReport(d, out_dir)

  } else {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }
  Log('PID %d done', Sys.getpid())
}
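
# Example invocation (the IN_DIR and OUT_DIR names here are only illustrative;
# the benchmarks/*.sh drivers pass the real directories):
#
#   Rscript benchmarks/report.R osh-parser _tmp/osh-parser/stage1 _tmp/osh-parser/stage2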

if (length(sys.frames()) == 0) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))

  main(commandArgs(TRUE))
}