diff --git a/Makefile b/Makefile index 3074d7a..fd8cdee 100644 --- a/Makefile +++ b/Makefile @@ -26,18 +26,20 @@ # ============================================================ -MAX_CORES ?= 250 +MAX_CORES ?= 120 # EasyBuild installation prefix (imallona; edit accordingly) ## <------------------------------------!!!! EASYBUILD_PREFIX ?= /data/imallona/.local/easybuild export EASYBUILD_PREFIX # omnibenchmark command template -OB_CMD = ob run benchmark --local-storage --cores ${MAX_CORES} +# OB_CMD = ob run benchmark --local-storage --cores ${MAX_CORES} +OB_CMD = ob run +OB_CMD_END = --cores ${MAX_CORES} -k --task-timeout 1min --yes # actual benchmark plan repository - to be pinned (the commit/tag) CLUSTERING_REPO = https://github.com/omnibenchmark/clustering_example -CLUSTERING_BRANCH = longer_yamls +CLUSTERING_BRANCH = full_yamls CLUSTERING_DIR = clustering_example # legacy reports in the wrong repository; to be moved to this one @@ -81,7 +83,8 @@ run_conda: clone_yamls for i in $(RUNS); do \ echo " Run $$i for seed $$seed and run $$i."; \ echo "DEST: results/out_conda_seed_$$seed\_run_$$i" ;\ - ${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_conda_tmp.yml --out-dir results/out_conda_seed_$$seed\_run_$$i; \ + mkdir -p results/out_conda_seed_$$seed\_run_$$i/ ;\ + ${OB_CMD} $(CLUSTERING_DIR)/Clustering_conda_tmp.yml --out-dir results/out_conda_seed_$$seed\_run_$$i ${OB_CMD_END}; \ cp $(CLUSTERING_DIR)/Clustering_conda_tmp.yml results/out_conda_seed_$$seed\_run_$$i/; \ done; \ done @@ -93,7 +96,8 @@ run_oras: clone_yamls sed -i "s/--seed\",[[:space:]]*[0-9]\+/--seed\", $$seed/" $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \ for i in $(RUNS); do \ echo " Run $$i for seed $$seed and run $$i."; \ - ${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_oras_tmp.yml --out-dir results/out_oras_seed_$$seed\_run_$$i/; \ + mkdir -p results/out_oras_seed_$$seed\_run_$$i/ ;\ + ${OB_CMD} $(CLUSTERING_DIR)/Clustering_oras_tmp.yml --out-dir results/out_oras_seed_$$seed\_run_$$i/ ${OB_CMD_END}; \ cp $(CLUSTERING_DIR)/Clustering_oras_tmp.yml results/out_oras_seed_$$seed\_run_$$i/; \ done; \ done @@ -111,7 +115,8 @@ run_envs: clone_yamls sed -i "s/--seed\",[[:space:]]*[0-9]\+/--seed\", $$seed/" $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \ for i in $(RUNS); do \ echo " Run $$i for seed $$seed and run $$i..."; \ - ${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml --out-dir results/out_envmodules_seed_$$seed\_run_$$i/; \ + mkdir -p results/out_envmodules_seed_$$seed\_run_$$i/ ;\ + ${OB_CMD} $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml --out-dir results/out_envmodules_seed_$$seed\_run_$$i/ ${OB_CMD_END}; \ cp $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml results/out_envmodules_seed_$$seed\_run_$$i/; \ done; \ done \ diff --git a/analyze_results_izaskun.Rmd b/analyze_results_izaskun.Rmd index 1f43736..89965cd 100644 --- a/analyze_results_izaskun.Rmd +++ b/analyze_results_izaskun.Rmd @@ -34,7 +34,7 @@ knitr::opts_chunk$set( fig.path = "plots/", dev = c("png", "svg"), cache.lazy = FALSE, - cache = TRUE) + cache = FALSE) ``` @@ -138,7 +138,7 @@ flatten_record <- function(rec) { ```{r} records <- fromJSON("aggregated_results.json", simplifyVector = FALSE) -##records <- fromJSON("aggregated_results_full.json", simplifyVector = FALSE) +## records <- fromJSON("aggregated_results_full.json", simplifyVector = FALSE) ## fd_list <- vector("list", length(records)) ## for (i in seq_along(records)) { @@ -223,8 +223,6 @@ print(ggplot(fd[fd$backend %in% c("conda","oras","envmodules"), ], Are there cputimes that are redudant/incosistent for repeated runs? ```{r} -str(fd) - keys <- c("dataset_name", "method", "method_full", "generator", "backend", "k", "seed", "run") @@ -277,9 +275,17 @@ fd_clean <- fd %>% (has_seed) | (!has_seed & seed == 2) # keep all seeds if encoded, else only seed==2 ) %>% mutate(seed = ifelse(has_seed, as.character(seed), "none")) +``` +Before +```{r} table(fd$seed, grepl('seed', fd$method_full), useNA = 'always') +``` + +After + +```{r} table(fd_clean$seed, grepl('seed', fd_clean$method_full), useNA = 'always') fd <- fd_clean @@ -299,7 +305,7 @@ write.csv(fd, file = 'aggregated_results.csv') We impute NA cpu_time as 0.05 s -```{r} +```{r, fig.width = 12, fig.height = 8} min(fd$cpu_time, na.rm = TRUE) fd$imputed_cpu_time <- ifelse(is.na(fd$cpu_time), no = fd$cpu_time, yes = 0.05) @@ -350,16 +356,15 @@ ggplot(fd_avg, aes(x = backend, y = imputed_cpu_time)) + # Pairwise correlations -```{r, fig.width = 9} +```{r, fig.width = 7, fig.height = 14} wide_run <- fd %>% filter(backend %in% c("conda","oras","envmodules")) %>% group_by(dataset_name, method, k, metric_name, run, seed, backend) %>% summarise(metric_value = mean(as.numeric(metric_value), na.rm = TRUE), .groups = "drop") %>% tidyr::pivot_wider(names_from = backend, values_from = metric_value, values_fill = NA) -head(wide_run) -# Compute correlations per metric_name AND seed and for pairwise complete obs +# compute correlations per metric_name AND seed and for pairwise complete obs cors_seed <- wide_run %>% group_by(metric_name, seed) %>% summarise( @@ -372,7 +377,7 @@ cors_seed <- wide_run %>% .groups = "drop") -print(cors_run) +print(cors_seed) cors_long <- cors_seed %>% pivot_longer( @@ -449,7 +454,7 @@ fd_avg <- fd_long %>% group_by(backend, seed, run, metric, dataset_name, method_full) %>% summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop") -head(fd_avg) +## head(fd_avg) # boxplots + jittered scatter, faceted by metric ggplot(fd_avg, aes(x = backend, y = mean_value, fill = backend)) + @@ -493,12 +498,9 @@ ggplot(fd_avg, aes(x = backend, y = mean_value, fill = backend)) + scale_fill_brewer(palette = "Set2") ``` -## Non-complete observations colored by method provider, with a bug +## Non-complete observations colored by method -Something wrong with method "1" etc here? - - -```{r, fig.width = 10, fig.height = 10} +```{r, fig.width = 15, fig.height = 10} fd_long <- fd %>% pivot_longer(cols = all_of(perf_metrics), names_to = "metric", @@ -508,27 +510,55 @@ fd_avg <- fd_long %>% group_by(backend, seed, run, metric, dataset_name, method_full) %>% summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop") -head(fd_avg) -table(fd_avg$seed, fd_avg$run) +## head(fd_avg) +## table(fd_avg$seed, fd_avg$run) +## unique(fd_avg$method_full) ggplot(fd_avg, aes(x = backend, y = mean_value)) + - # boxplots filled by backend - ## geom_boxplot(aes(fill = backend), outlier.alpha = 0.3) + - # points colored by method + # points colored by method, seed and other params geom_point(aes(color = method_full), alpha = 0.6, position = position_jitter(width = 0.15)) + # connect points across backends for same method/run/etc. - geom_line(aes(group = interaction(method_full, seed, run, dataset_name), - color = run), + geom_line(aes(group = interaction(method_full, seed, run, dataset_name)), alpha = 0.1) + facet_wrap(~metric, scales = "free_y") + theme_minimal(base_size = 14) + labs(title = "Performance metrics by backend", x = "Backend", - y = "Value") + - scale_fill_brewer(palette = "Set2") + - scale_color_brewer(palette = "Dark2") + y = "Value") + +``` + +## Non-complete observations colored by method provider + +```{r, fig.width = 15, fig.height = 10} +fd_long <- fd %>% + pivot_longer(cols = all_of(perf_metrics), + names_to = "metric", + values_to = "value") + +fd_avg <- fd_long %>% + group_by(backend, seed, run, metric, dataset_name, method_full, method) %>% + summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop") + +## head(fd_avg) +## table(fd_avg$seed, fd_avg$run) +## unique(fd_avg$method_full) + +ggplot(fd_avg, aes(x = backend, y = mean_value)) + + # points colored by method, seed and other params + geom_point(aes(color = method), + alpha = 0.6, + position = position_jitter(width = 0.15)) + + # connect points across backends for same method/run/etc. + geom_line(aes(group = interaction(method, seed, run, dataset_name)), + alpha = 0.1) + + facet_wrap(~metric, scales = "free_y") + + theme_minimal(base_size = 14) + + labs(title = "Performance metrics by backend", + x = "Backend", + y = "Value") ``` @@ -538,7 +568,7 @@ ggplot(fd_avg, aes(x = backend, y = mean_value)) + Mind there is no such a thing as true k, we use the first labelset. -```{r, fig.height = 7, fig.width = 10} +```{r, fig.height = 10, fig.width = 13} fd_dev_k <- fd %>% group_by(method_full, dataset_name, metric_name) %>% @@ -547,7 +577,7 @@ fd_dev_k <- fd %>% deviation_k = metric_value - true_value) %>% ungroup() -summary(fd_dev_k$deviation_k) +## summary(fd_dev_k$deviation_k) ## str(fd_dev_k) ggplot(fd_dev_k, aes(x = factor(k), y = deviation_k, color = method_full)) + @@ -557,14 +587,13 @@ ggplot(fd_dev_k, aes(x = factor(k), y = deviation_k, color = method_full)) + labs(title = "Deviation of metric across ks", subtitle = "Deviation vs value at true k", x = "k", - y = "Perf metric deviation from true k") + - scale_color_brewer(palette = "Set1") + y = "Perf metric deviation from true k") ``` ## By k offset -```{r, fig.width = 10, fig.height = 7} +```{r, fig.width = 12, fig.height = 9} ## str(fd) fd_dev_k <- fd %>% group_by(method_full, dataset_name, metric_name) %>% @@ -588,8 +617,7 @@ ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = method_full labs(title = "Performance impact of k offsets", subtitle = "Deviation vs value at true k", x = "Offset from true k", - y = "Perf metric deviation from true k") + - scale_color_brewer(palette = "Set1") + y = "Perf metric deviation from true k") ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = dataset_name)) + geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) + @@ -597,14 +625,13 @@ ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = dataset_nam theme_minimal(base_size = 14) + labs(title = "Performance impact of k offsets", x = "Offset from true k", - y = "Perf metric deviation from true k") + - scale_color_brewer(palette = "Set1") + y = "Perf metric deviation from true k") ``` adj Rand index only -```{r, fig.height = 6, fig.width = 6} +```{r, fig.height = 6, fig.width = 8} # ilter to adjusted_rand_score only fd_dev_k_ars <- fd_dev_k_avg %>% filter(metric_name == "adjusted_rand_score") @@ -616,8 +643,7 @@ ggplot(fd_dev_k_ars, aes(x = k_offset, y = mean_deviation_k, color = method_full labs(title = "Performance impact of k offsets", subtitle = "Deviation vs value at true k", x = "Offset from true k", - y = "Deviation in adjusted_rand_score") + - scale_color_brewer(palette = "Set1") + y = "Deviation in adjusted_rand_score") # by dataset_name ggplot(fd_dev_k_ars, aes(x = k_offset, y = mean_deviation_k, color = dataset_name)) + @@ -625,36 +651,16 @@ ggplot(fd_dev_k_ars, aes(x = k_offset, y = mean_deviation_k, color = dataset_nam theme_minimal(base_size = 14) + labs(title = "Performance impact of k offsets", x = "Offset from true k", - y = "Deviation in adjusted_rand_score") + - scale_color_brewer(palette = "Set1") + y = "Deviation in adjusted_rand_score") ``` # Comp vs clustering performance trade-offs +Caution only CPU's NAs are handled, not other perf metrics -```{r, fig.width = 12} - -## print(ggplot(fd, aes(x = cpu_time, y = metric_value, -## color = method_full, shape = backend)) + -## geom_point(alpha = 0.6) + -## facet_wrap(~metric_name, scales = "free_y") + -## theme_minimal(base_size = 14) + -## labs(title = "clustering metrics vs runtime trade‑offs", -## x = "CPU time (s)", -## y = "clustering metric value")) - -## print(ggplot(fd, aes(x = max_rss, y = metric_value, -## color = method_full, shape = backend)) + -## geom_point(alpha = 0.6) + -## facet_wrap(~metric_name, scales = "free_y") + -## theme_minimal(base_size = 14) + -## labs(title = "clustering metrics vs RSS trade‑offs", -## x = "max RSS (MB)", -## y = "clustering metric value")) - - +```{r, fig.width = 12, fig.height = 12} # aggregate across runs and ks again... fd_avg <- fd %>% group_by(method_full, backend, dataset_name, metric_name) %>% @@ -697,7 +703,7 @@ Again there is no such a thing as a true k ## Colored by method -```{r, fig.width = 15, fig.height = 15} +```{r, fig.width = 15, fig.height = 15, warning = FALSE} fd_dev_true <- fd %>% group_by(method_full, dataset_name, metric_name) %>% # get the metric value at true_k @@ -740,7 +746,7 @@ ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = method linetype = "dotted", color = "red") + facet_wrap(~metric_name, scales = "free") + theme_minimal(base_size = 14) + - labs(title = "Bland–Altman: any k vs true_k (averaged)", + labs(title = "Bland Altman: any k vs true_k (averaged)", x = "Mean of k and true_k (averaged)", y = "Difference (k - true_k, averaged)") @@ -749,7 +755,7 @@ ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = method ## Colored by dataset -```{r, fig.width = 15, fig.height = 15} +```{r, fig.width = 15, fig.height = 15, warning = FALSE} ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = dataset_name)) + geom_point(alpha = 0.6) + @@ -761,84 +767,9 @@ ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = datase linetype = "dotted", color = "red") + facet_wrap(~metric_name, scales = "free") + theme_minimal(base_size = 14) + - labs(title = "Bland–Altman: any k vs true_k (averaged)", + labs(title = "Bland Altman: any k vs true_k (averaged)", x = "Mean of k and true_k (averaged)", y = "Difference (k - true_k, averaged)") ``` - - - - - - - - - - - - - - - - - - - - - - - - - -## Effects of misspecifying `k`, unreadable QC plot - -```{r, fig.width = 15, fig.height = 15} -## true_vals <- fd %>% -## filter(k == true_k) %>% -## select(method_full, dataset_name, metric_name, run, seed, true_value = metric_value) - -## fd_dev_k <- fd %>% -## group_by(method_full, dataset_name, metric_name, run, seed) %>% -## mutate(true_value = first(metric_value[k == true_k]), -## deviation_k = metric_value - true_value) %>% -## ungroup() - -## head(fd_dev_k) - -## ggplot(fd_dev_k, aes(x = factor(k_offset), y = deviation_k, color = method_full)) + -## geom_point(alpha = 0.6, position = position_jitter(width = 0.15)) + -## facet_grid(dataset_name ~ metric_name, scales = "free_y") + -## theme_minimal(base_size = 14) + -## labs(title = "Deviation of metric across ks", -## subtitle = "Deviation vs value at true k", -## x = "k", -## y = "Perf metric deviation from true k") + -## scale_color_brewer(palette = "Set1") - -fd_dev_k <- fd %>% - group_by(method_full, dataset_name, metric_name, run, seed) %>% - mutate(true_value = first(metric_value[k == true_k]), - deviation_k = metric_value - true_value) %>% - ungroup() - -# deduplicate: average across runs , only ARI -fd_dev_k_avg <- fd_dev_k %>% - filter(metric_name == "adjusted_rand_score") %>% - group_by(method_full, dataset_name, metric_name, k, k_offset) %>% - summarise( - mean_deviation_k = mean(deviation_k, na.rm = TRUE), - .groups = "drop" - ) - -# plot with avg values -ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = method_full)) + - geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) + - facet_wrap(~dataset_name, scales = "free_y", ncol = 3) + - theme_minimal(base_size = 14) + - labs(title = "Deviation of metric across k offsets", - x = "Offset from true k", - y = "Mean ARI deviation from that of true `k`") + - scale_color_brewer(palette = "Set1") -```