#!/usr/bin/env Rscript
#
# benchmarks/report.R -- Analyze data collected by shell scripts.
#
# Usage:
#   benchmarks/report.R ACTION IN_DIR OUT_DIR

# Suppress warnings about functions masked from 'package:stats' and 'package:base'
#   filter, lag
#   intersect, setdiff, setequal, union
library(dplyr, warn.conflicts = FALSE)
library(tidyr) # spread()
library(stringr)

source('benchmarks/common.R')

options(stringsAsFactors = F)

# For pretty printing
commas = function(x) {
  format(x, big.mark=',')
}
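
# Example (illustrative): commas(1234567) returns "1,234,567".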

sourceUrl = function(path) {
  sprintf('https://github.com/oilshell/oil/blob/master/%s', path)
}

# Takes a filename, not a path.
sourceUrl2 = function(filename) {
  sprintf(
    'https://github.com/oilshell/oil/blob/master/benchmarks/testdata/%s',
    filename)
}

mycppUrl = function(name) {
  sprintf('https://github.com/oilshell/oil/blob/master/mycpp/examples/%s.py', name)
}

genUrl = function(name) {
  sprintf('../../_gen/mycpp/examples/%s.mycpp.cc', name)
}


# TODO: Set up cgit because Github links are slow.
benchmarkDataLink = function(subdir, name, suffix) {
  #sprintf('../../../../benchmark-data/shell-id/%s', shell_id)
  sprintf('https://github.com/oilshell/benchmark-data/blob/master/%s/%s%s',
          subdir, name, suffix)
}

provenanceLink = function(subdir, name, suffix) {
  sprintf('../%s/%s%s', subdir, name, suffix)
}
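
# Illustrative examples (hypothetical arguments):
#   sourceUrl('benchmarks/testdata/abuild')
#     -> https://github.com/oilshell/oil/blob/master/benchmarks/testdata/abuild
#   benchmarkDataLink('shell-id', 'osh-abc123', '/')
#     -> https://github.com/oilshell/benchmark-data/blob/master/shell-id/osh-abc123/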


GetOshLabel = function(shell_hash, prov_dir) {
  ### Given a string, return another string.

  path = sprintf('%s/shell-id/osh-%s/sh-path.txt', prov_dir, shell_hash)

  if (file.exists(path)) {
    Log('Reading %s', path)
    lines = readLines(path)
    if (length(grep('_bin/osh', lines)) > 0) {
      label = 'osh-ovm'
    } else if (length(grep('bin/osh', lines)) > 0) {
      label = 'osh-cpython'
    } else if (length(grep('_bin/.*/mycpp-souffle/osh', lines)) > 0) {
      label = 'osh-native-souffle'
    } else if (length(grep('_bin/.*/osh', lines)) > 0) {
      label = 'osh-native'
    } else {
      stop("Expected _bin/osh, bin/osh, or _bin/.*/osh")
    }
  } else {
    stop(sprintf("%s doesn't exist", path))
  }
  return(label)
}
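
# Illustrative example (hypothetical hash): if _tmp/shell-id/osh-1a2b3c/sh-path.txt
# contains '_bin/cxx-opt/osh', the label is 'osh-native'; if it contains 'bin/osh',
# the label is 'osh-cpython'.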

osh_opt_suffix1 = '_bin/cxx-opt/osh'
osh_opt_suffix2 = '_bin/cxx-opt-sh/osh'

osh_souffle_suffix1 = '_bin/cxx-opt/mycpp-souffle/osh'
osh_souffle_suffix2 = '_bin/cxx-opt-sh/mycpp-souffle/osh'

ysh_opt_suffix1 = '_bin/cxx-opt/ysh'
ysh_opt_suffix2 = '_bin/cxx-opt-sh/ysh'

ShellLabels = function(shell_name, shell_hash, num_hosts) {
  ### Given 2 vectors, return a vector of readable labels.

  # TODO: Clean up callers. Some metrics call this function with a
  # shell/runtime BASENAME, and others with a PATH
  # - e.g. ComputeReport calls this with runtime_name, which is actually a PATH

  #Log('name %s', shell_name)
  #Log('hash %s', shell_hash)

  if (num_hosts == 1) {
    prov_dir = '_tmp'
  } else {
    prov_dir = '../benchmark-data/'
  }

  labels = c()
  for (i in 1:length(shell_name)) {
    sh = shell_name[i]
    if (sh == 'osh') {
      label = GetOshLabel(shell_hash[i], prov_dir)

    } else if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
      label = 'opt/osh'

    } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
      label = 'opt/ysh'

    } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
      label = 'opt/osh-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else {
      label = sh
    }

    Log('[%s] [%s]', shell_name[i], label)
    labels = c(labels, label)
  }

  return(labels)
}
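
# Illustrative example (hypothetical hashes): ShellLabels(c('bash', 'osh'),
# c('aa11', 'bb22'), num_hosts = 1) returns c('bash', GetOshLabel('bb22', '_tmp')),
# since names that aren't 'osh' or a known _bin/ suffix pass through unchanged.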

# Simple version of the above, used by benchmarks/{gc,osh-runtime}
ShellLabelFromPath = function(sh_path) {
  labels = c()
  for (i in 1:length(sh_path)) {
    sh = sh_path[i]

    if (endsWith(sh, osh_opt_suffix1) || endsWith(sh, osh_opt_suffix2)) {
      # the opt binary is called osh-native - the osh-runtime report relies on this
      label = 'osh-native'

    } else if (endsWith(sh, ysh_opt_suffix1) || endsWith(sh, ysh_opt_suffix2)) {
      label = 'opt/ysh'

    } else if (endsWith(sh, osh_souffle_suffix1) || endsWith(sh, osh_souffle_suffix2)) {
      label = 'osh-native-souffle'

    } else if (endsWith(sh, '_bin/cxx-opt+bumpleak/osh')) {
      label = 'bumpleak/osh'

    } else if (endsWith(sh, '_bin/osh')) {  # the app bundle
      label = 'osh-ovm'

    } else if (endsWith(sh, 'bin/osh')) {
      label = 'osh-cpython'

    } else {
      label = sh
    }
    labels = c(labels, label)
  }
  return(labels)
}
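
# Illustrative mapping (paths are examples):
#   _bin/cxx-opt/osh                  -> osh-native
#   _bin/cxx-opt-sh/mycpp-souffle/osh -> osh-native-souffle
#   _bin/osh                          -> osh-ovm
#   /usr/bin/dash                     -> /usr/bin/dash  (unchanged)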

DistinctHosts = function(t) {
  t %>% distinct(host_name, host_hash) -> distinct_hosts
  # The label is just the name
  distinct_hosts$host_label = distinct_hosts$host_name
  return(distinct_hosts)
}

DistinctShells = function(t, num_hosts = -1) {
  t %>% distinct(shell_name, shell_hash) -> distinct_shells

  Log('')
  Log('Labeling shells')

  # Calculate it if not passed
  if (num_hosts == -1) {
    num_hosts = nrow(DistinctHosts(t))
  }

  distinct_shells$shell_label = ShellLabels(distinct_shells$shell_name,
                                            distinct_shells$shell_hash,
                                            num_hosts)
  return(distinct_shells)
}

ParserReport = function(in_dir, out_dir) {
  times = read.csv(file.path(in_dir, 'times.csv'))
  lines = read.csv(file.path(in_dir, 'lines.csv'))
  raw_data = read.csv(file.path(in_dir, 'raw-data.csv'))

  cachegrind = readTsv(file.path(in_dir, 'cachegrind.tsv'))

  # For joining by filename
  lines_by_filename = tibble(
    num_lines = lines$num_lines,
    filename = basename(lines$path)
  )

  # Remove failures
  times %>% filter(status == 0) %>% select(-c(status)) -> times
  cachegrind %>% filter(status == 0) %>% select(-c(status)) -> cachegrind

  # Add the number of lines, joining on path, and compute lines/ms
  times %>%
    left_join(lines, by = c('path')) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path),
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           lines_per_ms = num_lines / elapsed_ms) %>%
    select(-c(path, max_rss_KiB, elapsed_secs, user_secs, sys_secs)) ->
    joined_times

  #print(head(times))
  #print(head(lines))
  #print(head(vm))
  #print(head(joined_times))

  print(summary(joined_times))

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  distinct_hosts = DistinctHosts(joined_times)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(joined_times)
  Log('')
  Log('Distinct shells')
  print(distinct_shells)

  # Replace name/hash combinations with labels.
  joined_times %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_shells, by = c('shell_name', 'shell_hash')) %>%
    select(-c(host_name, host_hash, shell_name, shell_hash)) ->
    joined_times

  # Like 'times', but do shell_label as one step
  # Hack: we know benchmarks/auto.sh runs this on one machine
  distinct_shells_2 = DistinctShells(cachegrind, num_hosts = nrow(distinct_hosts))
  cachegrind %>%
    left_join(lines, by = c('path')) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(distinct_shells_2, by = c('shell_name', 'shell_hash')) %>%
    select(-c(shell_name, shell_hash)) %>%
    mutate(filename = basename(path), filename_HREF = sourceUrl(path)) %>%
    select(-c(path)) ->
    joined_cachegrind

  Log('summary(joined_times):')
  print(summary(joined_times))
  Log('head(joined_times):')
  print(head(joined_times))

  # Summarize rates by platform/shell
  joined_times %>%
    mutate(host_label = paste("host", host_label)) %>%
    group_by(host_label, shell_label) %>%
    summarize(total_lines = sum(num_lines), total_ms = sum(elapsed_ms)) %>%
    mutate(lines_per_ms = total_lines / total_ms) %>%
    select(-c(total_ms)) %>%
    spread(key = host_label, value = lines_per_ms) ->
    times_summary

  # Sort by parsing rate on machine 1
  if ("host hoover" %in% colnames(times_summary)) {
    times_summary %>% arrange(desc(`host hoover`)) -> times_summary
  } else {
    times_summary %>% arrange(desc(`host no-host`)) -> times_summary
  }

  Log('times_summary:')
  print(times_summary)

  # Summarize cachegrind by platform/shell
  # Bug fix: as.numeric(irefs) avoids 32-bit integer overflow!
  joined_cachegrind %>%
    group_by(shell_label) %>%
    summarize(total_lines = sum(num_lines), total_irefs = sum(as.numeric(irefs))) %>%
    mutate(thousand_irefs_per_line = total_irefs / total_lines / 1000) %>%
    select(-c(total_irefs)) ->
    cachegrind_summary

  if ("no-host" %in% distinct_hosts$host_label) {

    # We don't have all the shells
    elapsed = NULL
    rate = NULL
    max_rss = NULL
    instructions = NULL

    joined_times %>%
      select(c(shell_label, elapsed_ms, user_ms, sys_ms, max_rss_MB,
               num_lines, filename, filename_HREF)) %>%
      arrange(filename, elapsed_ms) ->
      times_flat

    joined_cachegrind %>%
      select(c(shell_label, irefs, num_lines, filename, filename_HREF)) %>%
      arrange(filename, irefs) ->
      cachegrind_flat

  } else {

    times_flat = NULL
    cachegrind_flat = NULL

    # Hack for release. TODO: unify with Soil
    if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
      souffle_col = c('osh-native-souffle')
    } else {
      souffle_col = c()
    }

    cols1 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'osh_to_bash_ratio', 'num_lines', 'filename', 'filename_HREF')

    # Elapsed seconds for each shell by platform and file
    joined_times %>%
      select(-c(lines_per_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = elapsed_ms) %>%
      arrange(host_label, num_lines) %>%
      mutate(osh_to_bash_ratio = `osh-native` / bash) %>%
      select(all_of(cols1)) ->
      elapsed

    Log('\n')
    Log('ELAPSED')
    print(elapsed)

    cols2 = c('host_label', 'bash', 'dash', 'mksh', 'zsh',
              'osh-ovm', 'osh-cpython', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')
    # Rates by file and shell
    joined_times %>%
      select(-c(elapsed_ms, user_ms, sys_ms, max_rss_MB)) %>%
      spread(key = shell_label, value = lines_per_ms) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      rate

    Log('\n')
    Log('RATE')
    print(rate)

    # Memory usage by file
    joined_times %>%
      select(-c(elapsed_ms, lines_per_ms, user_ms, sys_ms)) %>%
      spread(key = shell_label, value = max_rss_MB) %>%
      arrange(host_label, num_lines) %>%
      select(all_of(cols2)) ->
      max_rss

    Log('\n')
    Log('MAX RSS')
    print(max_rss)

    Log('\n')
    Log('joined_cachegrind has %d rows', nrow(joined_cachegrind))
    print(joined_cachegrind)
    #print(joined_cachegrind %>% filter(path == 'benchmarks/testdata/configure-helper.sh'))

    cols3 = c('bash', 'dash', 'mksh', 'osh-native', souffle_col,
              'num_lines', 'filename', 'filename_HREF')

    # Cachegrind instructions by file
    joined_cachegrind %>%
      mutate(thousand_irefs_per_line = irefs / num_lines / 1000) %>%
      select(-c(irefs)) %>%
      spread(key = shell_label, value = thousand_irefs_per_line) %>%
      arrange(num_lines) %>%
      select(all_of(cols3)) ->
      instructions

    Log('\n')
    Log('instructions has %d rows', nrow(instructions))
    print(instructions)
  }

  WriteProvenance(distinct_hosts, distinct_shells, out_dir)

  raw_data_table = tibble(
    filename = basename(as.character(raw_data$path)),
    filename_HREF = benchmarkDataLink('osh-parser', filename, '')
  )
  #print(raw_data_table)

  writeCsv(raw_data_table, file.path(out_dir, 'raw-data'))

  precision = SamePrecision(0) # lines per ms
  writeCsv(times_summary, file.path(out_dir, 'summary'), precision)

  precision = ColumnPrecision(list(), default = 1)
  writeTsv(cachegrind_summary, file.path(out_dir, 'cachegrind_summary'), precision)

  if (!is.null(times_flat)) {
    precision = SamePrecision(0)
    writeTsv(times_flat, file.path(out_dir, 'times_flat'), precision)
  }

  if (!is.null(cachegrind_flat)) {
    precision = SamePrecision(0)
    writeTsv(cachegrind_flat, file.path(out_dir, 'cachegrind_flat'), precision)
  }

  if (!is.null(elapsed)) {  # i.e. not the 'no-host' case above
    # Round to nearest millisecond, but the ratio has a decimal point.
    precision = ColumnPrecision(list(osh_to_bash_ratio = 1), default = 0)
    writeCsv(elapsed, file.path(out_dir, 'elapsed'), precision)

    precision = SamePrecision(0)
    writeCsv(rate, file.path(out_dir, 'rate'), precision)

    writeCsv(max_rss, file.path(out_dir, 'max_rss'))

    precision = SamePrecision(1)
    writeTsv(instructions, file.path(out_dir, 'instructions'), precision)
  }

  Log('Wrote %s', out_dir)
}

WriteProvenance = function(distinct_hosts, distinct_shells, out_dir, tsv = F) {

  num_hosts = nrow(distinct_hosts)
  if (num_hosts == 1) {
    linkify = provenanceLink
  } else {
    linkify = benchmarkDataLink
  }

  Log('distinct_hosts')
  print(distinct_hosts)
  Log('')

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  # Should be:
  #   host_id_url
  # And then csv_to_html will be smart enough? It should take --url flag?
  host_table = tibble(
    host_label = distinct_hosts$host_label,
    host_id = paste(distinct_hosts$host_name,
                    distinct_hosts$host_hash, sep='-'),
    host_id_HREF = linkify('host-id', host_id, '/')
  )
  Log('host_table')
  print(host_table)
  Log('')

  shell_table = tibble(
    shell_label = distinct_shells$shell_label,
    shell_id = paste(distinct_shells$shell_name,
                     distinct_shells$shell_hash, sep='-'),
    shell_id_HREF = linkify('shell-id', shell_id, '/')
  )

  Log('shell_table')
  print(shell_table)
  Log('')

  if (tsv) {
    writeTsv(host_table, file.path(out_dir, 'hosts'))
    writeTsv(shell_table, file.path(out_dir, 'shells'))
  } else {
    writeCsv(host_table, file.path(out_dir, 'hosts'))
    writeCsv(shell_table, file.path(out_dir, 'shells'))
  }
}

WriteSimpleProvenance = function(provenance, out_dir) {
  Log('provenance')
  print(provenance)
  Log('')

  # Legacy: add $shell_name, because "$shell_basename-$shell_hash" is what
  # benchmarks/id.sh publish-shell-id uses
  provenance %>%
    mutate(shell_name = basename(sh_path)) %>%
    distinct(shell_label, shell_name, shell_hash) ->
    distinct_shells

  Log('distinct_shells')
  print(distinct_shells)
  Log('')

  provenance %>% distinct(host_label, host_name, host_hash) -> distinct_hosts

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
}

RuntimeReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'times.tsv'))

  gc_stats = readTsv(file.path(in_dir, 'gc_stats.tsv'))
  provenance = readTsv(file.path(in_dir, 'provenance.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some osh-runtime tasks failed')
  }

  # Joins:
  #   times <= sh_path => provenance
  #   times <= join_id, host_name => gc_stats

  # TODO: provenance may have rows from 2 machines. Could validate them and
  # deduplicate.

  # It should have (host_label, host_name, host_hash)
  #                (shell_label, sh_path, shell_hash)
  provenance %>%
    mutate(host_label = host_name, shell_label = ShellLabelFromPath(sh_path)) ->
    provenance

  provenance %>% distinct(sh_path, shell_label) -> label_lookup

  Log('label_lookup')
  print(label_lookup)

  # Join with provenance for host label and shell label
  times %>%
    select(c(elapsed_secs, user_secs, sys_secs, max_rss_KiB, task_id,
             host_name, sh_path, workload)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload) ->
    details

  times %>%
    select(c(task_id, host_name, sh_path, workload, minor_faults, major_faults, swaps, in_block, out_block, signals, voluntary_ctx, involuntary_ctx)) %>%
    left_join(label_lookup, by = c('sh_path')) %>%
    select(-c(sh_path)) %>%
    # we want to compare workloads on adjacent rows
    arrange(workload) ->
    details_io

  Log('details')
  print(details)

  # Hack for release. TODO: unify with Soil
  if (Sys.getenv("OILS_NO_SOUFFLE") == "") {
    souffle_col = c('osh-native-souffle')
  } else {
    souffle_col = c()
  }

  cols2 = c('workload', 'host_name',
            'bash', 'dash', 'osh-cpython', 'osh-native', souffle_col,
            'py_bash_ratio', 'native_bash_ratio')

  # Elapsed time comparison
  details %>%
    select(-c(task_id, user_ms, sys_ms, max_rss_MB)) %>%
    spread(key = shell_label, value = elapsed_ms) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    elapsed

  Log('elapsed')
  print(elapsed)

  # Minor Page Faults Comparison
  details_io %>%
    select(c(host_name, shell_label, workload, minor_faults)) %>%
    spread(key = shell_label, value = minor_faults) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    page_faults

  Log('page_faults')
  print(page_faults)

  # Max RSS comparison
  details %>%
    select(c(host_name, shell_label, workload, max_rss_MB)) %>%
    spread(key = shell_label, value = max_rss_MB) %>%
    mutate(py_bash_ratio = `osh-cpython` / bash) %>%
    mutate(native_bash_ratio = `osh-native` / bash) %>%
    arrange(workload, host_name) %>%
    select(all_of(cols2)) ->
    max_rss

  Log('max rss')
  print(max_rss)

  details %>%
    select(c(task_id, host_name, workload, elapsed_ms, max_rss_MB)) %>%
    mutate(join_id = sprintf("gc-%d", task_id)) %>%
    select(-c(task_id)) ->
    gc_details

  Log('GC details')
  print(gc_details)
  Log('')

  Log('GC stats')
  print(gc_stats)
  Log('')

  gc_stats %>%
    left_join(gc_details, by = c('join_id', 'host_name')) %>%
    select(-c(join_id, roots_capacity, objs_capacity)) %>%
    # Do the same transformations as GcReport()
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    select(-c(bytes_allocated)) %>%
    rename(num_gc_done = num_collections) %>%
    # Put these columns first
    relocate(workload, host_name,
             elapsed_ms, max_gc_millis, total_gc_millis,
             allocated_MB, max_rss_MB, num_allocated) ->
    gc_stats

  Log('After GC stats')
  print(gc_stats)
  Log('')

  WriteSimpleProvenance(provenance, out_dir)

  # milliseconds don't need a decimal digit
  precision = ColumnPrecision(list(bash = 0, dash = 0, `osh-cpython` = 0,
                                   `osh-native` = 0, `osh-native-souffle` = 0, py_bash_ratio = 2,
                                   native_bash_ratio = 2))
  writeTsv(elapsed, file.path(out_dir, 'elapsed'), precision)
  writeTsv(page_faults, file.path(out_dir, 'page_faults'), precision)

  precision2 = ColumnPrecision(list(py_bash_ratio = 2, native_bash_ratio = 2))
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  precision3 = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                               default = 0)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision3)

  writeTsv(details, file.path(out_dir, 'details'), precision3)
  writeTsv(details_io, file.path(out_dir, 'details_io'))

  Log('Wrote %s', out_dir)
}

VmBaselineReport = function(in_dir, out_dir) {
  vm = readTsv(file.path(in_dir, 'vm-baseline.tsv'))
  #print(vm)

  # Not using DistinctHosts() because field host_hash isn't collected
  num_hosts = nrow(vm %>% distinct(host))

  vm %>%
    rename(kib = metric_value) %>%
    mutate(shell_label = ShellLabels(shell_name, shell_hash, num_hosts),
           megabytes = kib * 1024 / 1e6) %>%
    select(-c(shell_name, kib)) %>%
    spread(key = c(metric_name), value = megabytes) %>%
    rename(VmPeak_MB = VmPeak, VmRSS_MB = VmRSS) %>%
    select(c(shell_label, shell_hash, host, VmRSS_MB, VmPeak_MB)) %>%
    arrange(shell_label, shell_hash, host, VmPeak_MB) ->
    vm

  print(vm)

  writeTsv(vm, file.path(out_dir, 'vm-baseline'))
}

WriteOvmBuildDetails = function(distinct_hosts, distinct_compilers, out_dir) {
  host_table = tibble(
    host_label = distinct_hosts$host_label,
    host_id = paste(distinct_hosts$host_name,
                    distinct_hosts$host_hash, sep='-'),
    host_id_HREF = benchmarkDataLink('host-id', host_id, '/')
  )
  print(host_table)

  dc = distinct_compilers
  compiler_table = tibble(
    compiler_label = dc$compiler_label,
    compiler_id = paste(dc$compiler_label, dc$compiler_hash, sep='-'),
    compiler_id_HREF = benchmarkDataLink('compiler-id', compiler_id, '/')
  )
  print(compiler_table)

  writeTsv(host_table, file.path(out_dir, 'hosts'))
  writeTsv(compiler_table, file.path(out_dir, 'compilers'))
}

OvmBuildReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'times.tsv'))
  native_sizes = readTsv(file.path(in_dir, 'native-sizes.tsv'))
  #raw_data = readTsv(file.path(in_dir, 'raw-data.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some ovm-build tasks failed')
  }

  times %>% distinct(host_name, host_hash) -> distinct_hosts
  distinct_hosts$host_label = distinct_hosts$host_name

  times %>% distinct(compiler_path, compiler_hash) -> distinct_compilers
  distinct_compilers$compiler_label = basename(distinct_compilers$compiler_path)

  #print(distinct_hosts)
  #print(distinct_compilers)

  WriteOvmBuildDetails(distinct_hosts, distinct_compilers, out_dir)

  times %>%
    select(-c(status)) %>%
    left_join(distinct_hosts, by = c('host_name', 'host_hash')) %>%
    left_join(distinct_compilers, by = c('compiler_path', 'compiler_hash')) %>%
    select(-c(host_name, host_hash, compiler_path, compiler_hash)) %>%
    mutate(src_dir = basename(src_dir),
           host_label = paste("host ", host_label),
           is_conf = str_detect(action, 'configure'),
           is_ovm = str_detect(action, 'oil.ovm'),
           is_dbg = str_detect(action, 'dbg'),
           ) %>%
    select(host_label, src_dir, compiler_label, action, is_conf, is_ovm, is_dbg,
           elapsed_secs) %>%
    spread(key = c(host_label), value = elapsed_secs) %>%
    arrange(src_dir, compiler_label, desc(is_conf), is_ovm, desc(is_dbg)) %>%
    select(-c(is_conf, is_ovm, is_dbg)) ->
    times

  #print(times)

  # paths look like _tmp/ovm-build/bin/clang/oils_cpp.stripped
  native_sizes %>%
    select(c(host_label, path, num_bytes)) %>%
    mutate(host_label = paste("host ", host_label),
           binary = basename(path),
           compiler = basename(dirname(path)),
           ) %>%
    select(-c(path)) %>%
    spread(key = c(host_label), value = num_bytes) %>%
    arrange(compiler, binary) ->
    native_sizes

  # NOTE: These don't have the host and compiler.
  writeTsv(times, file.path(out_dir, 'times'))
  writeTsv(native_sizes, file.path(out_dir, 'native-sizes'))

  # TODO: I want a size report too
  #writeCsv(sizes, file.path(out_dir, 'sizes'))
}

unique_stdout_md5sum = function(t, num_expected) {
  u = n_distinct(t$stdout_md5sum)
  if (u != num_expected) {
    t %>% select(c(host_name, task_name, arg1, arg2, runtime_name, stdout_md5sum)) %>% print()
    stop(sprintf('Expected %d unique md5sums, got %d', num_expected, u))
  }
}
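
# Usage sketch (mirrors the calls in ComputeReport below), e.g.:
#   times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
# stops with an error unless all rows for that task produced identical stdout.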

ComputeReport = function(in_dir, out_dir) {
  # TSV file, not CSV
  times = read.table(file.path(in_dir, 'times.tsv'), header=T)
  print(times)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some compute tasks failed')
  }

  #
  # Check correctness
  #

  times %>% filter(task_name == 'hello') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'fib') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'for_loop') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'control_flow') %>% unique_stdout_md5sum(1)
  times %>% filter(task_name == 'word_freq') %>% unique_stdout_md5sum(1)
  # 3 different inputs
  times %>% filter(task_name == 'parse_help') %>% unique_stdout_md5sum(3)

  times %>% filter(task_name == 'bubble_sort') %>% unique_stdout_md5sum(2)

  # TODO:
  # - oils_cpp doesn't implement unicode LANG=C
  # - bash behaves differently on your desktop vs. in the container
  # - might need layer-locales in the image?

  #times %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% unique_stdout_md5sum(1)
  # Ditto here
  #times %>% filter(task_name == 'palindrome' & arg1 == 'bytes') %>% unique_stdout_md5sum(1)

  #
  # Find distinct shells and hosts, and label them for readability.
  #

  # Runtimes are called shells, as a hack for code reuse
  times %>%
    mutate(shell_name = runtime_name, shell_hash = runtime_hash) %>%
    select(c(host_name, host_hash, shell_name, shell_hash)) ->
    tmp

  distinct_hosts = DistinctHosts(tmp)
  Log('')
  Log('Distinct hosts')
  print(distinct_hosts)

  distinct_shells = DistinctShells(tmp)
  Log('')
  Log('Distinct runtimes')
  print(distinct_shells)

  num_hosts = nrow(distinct_hosts)

  times %>%
    select(-c(status, stdout_md5sum, stdout_filename, host_hash)) %>%
    mutate(runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
           elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(runtime_name, runtime_hash, elapsed_secs, user_secs, sys_secs, max_rss_KiB)) %>%
    arrange(host_name, task_name, arg1, arg2, user_ms) ->
    details

  times %>%
    mutate(
      runtime_label = ShellLabels(runtime_name, runtime_hash, num_hosts),
      stdout_md5sum_HREF = file.path('tmp', task_name, stdout_filename)) %>%
    select(c(host_name, task_name, arg1, arg2, runtime_label,
             stdout_md5sum, stdout_md5sum_HREF)) ->
    stdout_files

  details %>% filter(task_name == 'hello') %>% select(-c(task_name)) -> hello
  details %>% filter(task_name == 'fib') %>% select(-c(task_name)) -> fib
  details %>% filter(task_name == 'for_loop') %>% select(-c(task_name)) -> for_loop
  details %>% filter(task_name == 'control_flow') %>% select(-c(task_name)) -> control_flow
  details %>% filter(task_name == 'word_freq') %>% select(-c(task_name)) -> word_freq
  # There's no arg2
  details %>% filter(task_name == 'parse_help') %>% select(-c(task_name, arg2)) -> parse_help

  details %>% filter(task_name == 'bubble_sort') %>% select(-c(task_name)) -> bubble_sort
  details %>% filter(task_name == 'palindrome' & arg1 == 'unicode') %>% select(-c(task_name)) -> palindrome

  precision = ColumnPrecision(list(max_rss_MB = 1), default = 0)
  writeTsv(details, file.path(out_dir, 'details'), precision)

  writeTsv(stdout_files, file.path(out_dir, 'stdout_files'), precision)

  writeTsv(hello, file.path(out_dir, 'hello'), precision)
  writeTsv(fib, file.path(out_dir, 'fib'), precision)
  writeTsv(word_freq, file.path(out_dir, 'word_freq'), precision)
  writeTsv(for_loop, file.path(out_dir, 'for_loop'), precision)
  writeTsv(control_flow, file.path(out_dir, 'control_flow'), precision)
  writeTsv(parse_help, file.path(out_dir, 'parse_help'), precision)

  writeTsv(bubble_sort, file.path(out_dir, 'bubble_sort'), precision)
  writeTsv(palindrome, file.path(out_dir, 'palindrome'), precision)

  WriteProvenance(distinct_hosts, distinct_shells, out_dir, tsv = T)
}

WriteOneTask = function(times, out_dir, task_name, precision) {
  times %>%
    filter(task == task_name) %>%
    select(-c(task)) -> subset

  writeTsv(subset, file.path(out_dir, task_name), precision)
}
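
# Usage sketch (as in GcReport and GcCachegrindReport below):
#   WriteOneTask(times, out_dir, 'parse.abuild', precision)
# filters 'times' down to task == 'parse.abuild' and writes it with writeTsv().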

SHELL_ORDER = c('dash',
                'bash',
                'zsh',
                '_bin/cxx-opt+bumpleak/osh',
                '_bin/cxx-opt+bumproot/osh',
                '_bin/cxx-opt+bumpsmall/osh',
                '_bin/cxx-opt/osh',
                '_bin/cxx-opt+nopool/osh')

GcReport = function(in_dir, out_dir) {
  times = read.table(file.path(in_dir, 'raw/times.tsv'), header=T)
  gc_stats = read.table(file.path(in_dir, 'stage1/gc_stats.tsv'), header=T)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  # Change units and order columns
  times %>%
    arrange(task, factor(sh_path, levels = SHELL_ORDER)) %>%
    mutate(elapsed_ms = elapsed_secs * 1000,
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6,
           shell_label = ShellLabelFromPath(sh_path)
           ) %>%
    select(c(join_id, task, elapsed_ms, user_ms, sys_ms, max_rss_MB, shell_label,
             shell_runtime_opts)) ->
    times

  # Join and order columns
  gc_stats %>% left_join(times, by = c('join_id')) %>%
    arrange(desc(task)) %>%
    mutate(allocated_MB = bytes_allocated / 1e6) %>%
    # try to make the table skinnier
    rename(num_gc_done = num_collections) %>%
    select(task, elapsed_ms, max_gc_millis, total_gc_millis,
           allocated_MB, max_rss_MB, num_allocated,
           num_gc_points, num_gc_done, gc_threshold, num_growths, max_survived,
           shell_label) ->
    gc_stats

  times %>% select(-c(join_id)) -> times


  precision = ColumnPrecision(list(max_rss_MB = 1, allocated_MB = 1),
                              default = 0)

  writeTsv(times, file.path(out_dir, 'times'), precision)
  writeTsv(gc_stats, file.path(out_dir, 'gc_stats'), precision)

  tasks = c('parse.configure-coreutils',
            'parse.configure-cpython',
            'parse.abuild',
            'ex.compute-fib',
            'ex.bashcomp-parse-help',
            'ex.abuild-print-help')
  # Write out a separate file for each task
  for (task in tasks) {
    WriteOneTask(times, out_dir, task, precision)
  }
}

GcCachegrindReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'raw/times.tsv'))
  counts = readTsv(file.path(in_dir, 'stage1/cachegrind.tsv'))

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some gc tasks failed')
  }

  print(times)
  print(counts)

  counts %>% left_join(times, by = c('join_id')) %>%
    mutate(million_irefs = irefs / 1e6) %>%
    select(c(million_irefs, task, sh_path, shell_runtime_opts)) %>%
    arrange(factor(sh_path, levels = SHELL_ORDER)) ->
    counts

  precision = NULL
  tasks = c('parse.abuild', 'ex.compute-fib')
  for (task in tasks) {
    WriteOneTask(counts, out_dir, task, precision)
  }
}

MyCppReport = function(in_dir, out_dir) {
  times = readTsv(file.path(in_dir, 'benchmark-table.tsv'))
  print(times)

  times %>% filter(status != 0) -> failed
  if (nrow(failed) != 0) {
    print(failed)
    stop('Some mycpp tasks failed')
  }

  # Drop columns we don't care about: status, elapsed time, bin, task_out
  times %>% select(-c(status, elapsed_secs, bin, task_out)) %>%
    mutate(example_name_HREF = mycppUrl(example_name),
           gen = c('gen'),
           gen_HREF = genUrl(example_name),
           user_ms = user_secs * 1000,
           sys_ms = sys_secs * 1000,
           max_rss_MB = max_rss_KiB * 1024 / 1e6) %>%
    select(-c(user_secs, sys_secs, max_rss_KiB)) ->
    details

  details %>% select(-c(sys_ms, max_rss_MB)) %>%
    spread(key = impl, value = user_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    user_time

  details %>% select(-c(user_ms, max_rss_MB)) %>%
    spread(key = impl, value = sys_ms) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    sys_time

  details %>% select(-c(user_ms, sys_ms)) %>%
    spread(key = impl, value = max_rss_MB) %>%
    mutate(`C++ : Python` = `C++` / Python) %>%
    arrange(`C++ : Python`) ->
    max_rss

  # Sometimes it speeds up by more than 10x
  precision1 = ColumnPrecision(list(`C++ : Python` = 3), default = 0)
  writeTsv(user_time, file.path(out_dir, 'user_time'), precision1)
  writeTsv(sys_time, file.path(out_dir, 'sys_time'), precision1)

  precision2 = ColumnPrecision(list(`C++ : Python` = 2), default = 1)
  writeTsv(max_rss, file.path(out_dir, 'max_rss'), precision2)

  writeTsv(details, file.path(out_dir, 'details'))
}

UftraceTaskReport = function(env, task_name, summaries) {
  # Need this again after redirect
  MaybeDisableColor(stdout())

  task_env = env[[task_name]]

  untyped = task_env$untyped
  typed = task_env$typed
  strings = task_env$strings
  slabs = task_env$slabs
  reserve = task_env$reserve

  string_overhead = 17 # GC header (8) + len (4) + hash value (4) + NUL (1)
  strings %>% mutate(obj_len = str_len + string_overhead) -> strings

  # TODO: Output these totals PER WORKLOAD, e.g. parsing big/small, executing
  # big/small
  #
  # And then zoom in on distributions as well

  num_allocs = nrow(untyped)
  total_bytes = sum(untyped$obj_len)

  untyped %>% group_by(obj_len) %>% count() %>% ungroup() -> untyped_hist
  #print(untyped_hist)

  untyped_hist %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_allocs) ->
    alloc_sizes

  a24 = untyped_hist %>% filter(obj_len <= 24)
  a48 = untyped_hist %>% filter(obj_len <= 48)
  a96 = untyped_hist %>% filter(obj_len <= 96)

  allocs_24_bytes_or_less = sum(a24$n) * 100.0 / num_allocs
  allocs_48_bytes_or_less = sum(a48$n) * 100.0 / num_allocs
  allocs_96_bytes_or_less = sum(a96$n) * 100.0 / num_allocs

  Log('Percentage of allocs less than 48 bytes: %.1f', allocs_48_bytes_or_less)

  options(tibble.print_min=25)

  Log('')
  Log('All allocations')
  print(alloc_sizes %>% head(22))
  print(alloc_sizes %>% tail(5))

  Log('')
  Log('Common Sizes')
  print(untyped_hist %>% arrange(desc(n)) %>% head(8))

  Log('')
  Log(' %s total allocations, total bytes = %s', commas(num_allocs), commas(total_bytes))
  Log('')

  Log('Typed allocations')

  num_typed = nrow(typed)

  typed %>% group_by(func_name) %>% count() %>% ungroup() %>%
    mutate(percent = n * 100.0 / num_typed) %>%
    arrange(desc(n)) -> most_common_types

  print(most_common_types %>% head(20))
  print(most_common_types %>% tail(5))

  lists = typed %>% filter(str_starts(func_name, ('List<')))
  #print(lists)

  num_lists = nrow(lists)
  total_list_bytes = num_lists * 24 # sizeof List<T> head is hard-coded

  Log('')
  Log('%s typed allocs, including %s List<T>', commas(num_typed), commas(num_lists))
  Log('%.2f%% of allocs are typed', num_typed * 100 / num_allocs)
  Log('')

  #
  # Strings
  #

  num_strings = nrow(strings)
  total_string_bytes = sum(strings$obj_len)

  strings %>% group_by(str_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_strings) ->
    string_lengths

  strs_6_bytes_or_less = string_lengths %>% filter(str_len == 6) %>% select(percent)
  strs_14_bytes_or_less = string_lengths %>% filter(str_len == 14) %>% select(percent)

  # Parse workload
  # 62% of strings <= 6 bytes
  # 84% of strings <= 14 bytes

  Log('Str - NewStr() and OverAllocatedStr()')
  print(string_lengths %>% head(16))
  print(string_lengths %>% tail(5))
  Log('')

  Log('%s string allocations, total length = %s, total bytes = %s', commas(num_strings),
      commas(sum(strings$str_len)), commas(total_string_bytes))
  Log('')
  Log('%.2f%% of allocs are strings', num_strings * 100 / num_allocs)
  Log('%.2f%% of bytes are strings', total_string_bytes * 100 / total_bytes)
  Log('')

  #
  # Slabs
  #

  Log('NewSlab()')

  num_slabs = nrow(slabs)
  slabs %>% group_by(slab_len) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_slabs) ->
    slab_lengths

  slabs %>% group_by(func_name) %>% count() %>% ungroup() %>%
    arrange(desc(n)) -> slab_types

  Log(' Lengths')
  print(slab_lengths %>% head())
  print(slab_lengths %>% tail(5))
  Log('')

  Log(' Slab Types')
  print(slab_types %>% head())
  print(slab_types %>% tail(5))
  Log('')

  total_slab_items = sum(slabs$slab_len)

  Log('%s slabs, total items = %s', commas(num_slabs),
      commas(sum(slabs$slab_len)))
  Log('%.2f%% of allocs are slabs', num_slabs * 100 / num_allocs)
  Log('')

  #
  # reserve() calls
  #

  # There should be strictly more List::reserve() calls than NewSlab

  Log('::reserve(int n)')
  Log('')

  num_reserve = nrow(reserve)
  reserve %>% group_by(num_items) %>% count() %>% ungroup() %>%
    mutate(n_less_than = cumsum(n),
           percent = n_less_than * 100.0 / num_reserve) ->
    reserve_args

  Log(' Num Items')
  print(reserve_args %>% head(15))
  print(reserve_args %>% tail(5))
  Log('')

  Log('%s reserve() calls, total items = %s', commas(num_reserve),
      commas(sum(reserve$num_items)))
  Log('')

  # Accounting for all allocations!
  Log('Untyped: %s', commas(num_allocs))
  Log('Typed + Str + Slab: %s', commas(num_typed + num_strings + num_slabs))
  Log('')

  num_other_typed = num_typed - num_lists

  # Summary table
  stats = tibble(task = task_name,
                 total_bytes_ = commas(total_bytes),
                 num_allocs_ = commas(num_allocs),
                 sum_typed_strs_slabs = commas(num_typed + num_strings + num_slabs),
                 num_reserve_calls = commas(num_reserve),

                 percent_list_allocs = Percent(num_lists, num_allocs),
                 percent_slab_allocs = Percent(num_slabs, num_allocs),
                 percent_string_allocs = Percent(num_strings, num_allocs),
                 percent_other_typed_allocs = Percent(num_other_typed, num_allocs),

                 percent_list_bytes = Percent(total_list_bytes, total_bytes),
                 percent_string_bytes = Percent(total_string_bytes, total_bytes),

                 allocs_24_bytes_or_less = sprintf('%.1f%%', allocs_24_bytes_or_less),
                 allocs_48_bytes_or_less = sprintf('%.1f%%', allocs_48_bytes_or_less),
                 allocs_96_bytes_or_less = sprintf('%.1f%%', allocs_96_bytes_or_less),

                 strs_6_bytes_or_less = sprintf('%.1f%%', strs_6_bytes_or_less),
                 strs_14_bytes_or_less = sprintf('%.1f%%', strs_14_bytes_or_less),
                 )
  summaries$stats[[task_name]] = stats

  summaries$most_common_types[[task_name]] = most_common_types
}

LoadUftraceTsv = function(in_dir, env) {
  for (task in list.files(in_dir)) {
    Log('Loading data for task %s', task)
    base_dir = file.path(in_dir, task)

    task_env = new.env()
    env[[task]] = task_env

    # TSV file, not CSV
    task_env$untyped = readTsv(file.path(base_dir, 'all-untyped.tsv'))
    task_env$typed = readTsv(file.path(base_dir, 'typed.tsv'))
    task_env$strings = readTsv(file.path(base_dir, 'strings.tsv'))
    task_env$slabs = readTsv(file.path(base_dir, 'slabs.tsv'))
    task_env$reserve = readTsv(file.path(base_dir, 'reserve.tsv'))

    # median string length is 4, mean is 9.5!
    Log('UNTYPED')
    print(summary(task_env$untyped))
    Log('')

    Log('TYPED')
    print(summary(task_env$typed))
    Log('')

    Log('STRINGS')
    print(summary(task_env$strings))
    Log('')

    Log('SLABS')
    print(summary(task_env$slabs))
    Log('')

    Log('RESERVE')
    print(summary(task_env$reserve))
    Log('')
  }
}

Percent = function(n, total) {
  sprintf('%.1f%%', n * 100.0 / total)
}
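
# For example, Percent(1, 3) returns "33.3%".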

PrettyPrintLong = function(d) {
  tr = t(d) # transpose

  row_names = rownames(tr)

  for (i in 1:nrow(tr)) {
    row_name = row_names[i]
    cat(sprintf('%26s', row_name)) # calculated min width manually
    cat(sprintf('%20s', tr[i,]))
    cat('\n')

    # Extra spacing
    if (row_name %in% c('num_reserve_calls',
                        'percent_string_bytes',
                        'percent_other_typed_allocs',
                        'allocs_96_bytes_or_less')) {
      cat('\n')
    }
  }
}


UftraceReport = function(env, out_dir) {
  # summaries$stats should be a list of 1-row data frames
  # summaries$most_common_types should be a list of data frames (type counts)
  summaries = new.env()

  for (task_name in names(env)) {
    report_out = file.path(out_dir, paste0(task_name, '.txt'))

    Log('Making report for task %s -> %s', task_name, report_out)

    sink(file = report_out)
    UftraceTaskReport(env, task_name, summaries)
    sink() # reset
  }
  Log('')

  # Concatenate all the data frames added to the summary
  stats = bind_rows(as.list(summaries$stats))

  sink(file = file.path(out_dir, 'summary.txt'))
  #print(stats)
  #Log('')

  PrettyPrintLong(stats)
  Log('')

  mct = summaries$most_common_types
  for (task_name in names(mct)) {
    Log('Common types in workload %s', task_name)
    Log('')

    print(mct[[task_name]] %>% head(5))
    Log('')
  }
  sink()

  # For the REPL
  return(list(stats = stats))
}

main = function(argv) {
  action = argv[[1]]
  in_dir = argv[[2]]
  out_dir = argv[[3]]

  if (action == 'osh-parser') {
    ParserReport(in_dir, out_dir)

  } else if (action == 'osh-runtime') {
    RuntimeReport(in_dir, out_dir)

  } else if (action == 'vm-baseline') {
    VmBaselineReport(in_dir, out_dir)

  } else if (action == 'ovm-build') {
    OvmBuildReport(in_dir, out_dir)

  } else if (action == 'compute') {
    ComputeReport(in_dir, out_dir)

  } else if (action == 'gc') {
    GcReport(in_dir, out_dir)

  } else if (action == 'gc-cachegrind') {
    GcCachegrindReport(in_dir, out_dir)

  } else if (action == 'mycpp') {
    MyCppReport(in_dir, out_dir)

  } else if (action == 'uftrace') {
    d = new.env()
    LoadUftraceTsv(in_dir, d)
    UftraceReport(d, out_dir)

  } else {
    Log("Invalid action '%s'", action)
    quit(status = 1)
  }
  Log('PID %d done', Sys.getpid())
}

if (length(sys.frames()) == 0) {
  # increase ggplot font size globally
  #theme_set(theme_grey(base_size = 20))

  main(commandArgs(TRUE))
}