diff --git a/.play_minio.json b/.play_minio.json
deleted file mode 100644
index 81a2b2b..0000000
--- a/.play_minio.json
+++ /dev/null
@@ -1 +0,0 @@
-{"access_key": "Q3AM3UQ867SPQQA43P2F", "secret_key": "zuf+tfteSlswRu7BJ86wekitnifILbZam1KYY3TG"}
\ No newline at end of file
diff --git a/Makefile b/Makefile
index abede70..3074d7a 100644
--- a/Makefile
+++ b/Makefile
@@ -16,19 +16,19 @@
 #   run_conda    run conda backend with seeds + repeats
 #   run_oras     run oras backend with seeds + repeats
 #   run_envs     run envmodules backend with seeds + repeats
-#   knit_report  generate RMarkdown reports - not fully tested
+#   knit_report  generate RMarkdown reports and an aggregated CSV - not fully tested
 #
 # Environment:
-#   - MAX_CORES controls parallelism (default: 50).
+#   - MAX_CORES controls the number of concurrent rules (default: 250)
 #   - EASYBUILD_PREFIX needs to be tuned to access the envmodules built extending EESSI <--------------!!!!
 #     see: https://github.com/omnibenchmark/clustering_example/pull/43
 #
 # ============================================================

-MAX_CORES ?= 50
+MAX_CORES ?= 250

-# EasyBuild installation prefix (imallona; edit accordingly)
+# EasyBuild installation prefix (imallona; edit accordingly) ## <------------------------------------!!!!
 EASYBUILD_PREFIX ?= /data/imallona/.local/easybuild
 export EASYBUILD_PREFIX

@@ -36,23 +36,30 @@ export EASYBUILD_PREFIX
 OB_CMD = ob run benchmark --local-storage --cores ${MAX_CORES}

 # actual benchmark plan repository - to be pinned (the commit/tag)
-CLUSTERING_REPO = https://github.com/omnibenchmark/clustering_example
-CLUSTERING_DIR = clustering_example
+CLUSTERING_REPO   = https://github.com/omnibenchmark/clustering_example
+CLUSTERING_BRANCH = longer_yamls
+CLUSTERING_DIR    = clustering_example

 # legacy reports in the wrong repository; to be moved to this one
 REPORTS_REPO = https://github.com/imallona/clustering_report
 REPORTS_DIR = clustering_report

+## seeds to explore
+SEEDS := 2 54 546 744 1443
+
+## repeated runs per seed
+RUNS := 1 2 3
+
 all: clone_yamls clone_reports run_conda run_oras run_envs knit_report

 # clone the clustering_example repo if not already present
 clone_yamls:
	@if [ ! -d "$(CLUSTERING_DIR)" ]; then \
		echo "Cloning clustering_example repo..."; \
-		git clone --branch easyconfigs_py3126 $(CLUSTERING_REPO); \
+		git clone --branch ${CLUSTERING_BRANCH} $(CLUSTERING_REPO); \
	else \
		echo "clustering_example repo already present, pulling latest..."; \
-		cd $(CLUSTERING_DIR) && git fetch && git checkout easyconfigs_py3126 && git pull; \
+		cd $(CLUSTERING_DIR) && git fetch && git checkout ${CLUSTERING_BRANCH} && git pull; \
	fi

 # clone the clustering_report repo (mark branch) if not already present
@@ -66,57 +73,52 @@ clone_reports:
	fi

 run_conda: clone_yamls
-	@for seed in 2 54 546 744 1443; do \
+	mkdir -p results
+	@for seed in $(SEEDS); do \
		echo "Running conda benchmark with seed $$seed..."; \
		cp $(CLUSTERING_DIR)/Clustering_conda.yml $(CLUSTERING_DIR)/Clustering_conda_tmp.yml; \
-		sed -i "s/--seed, [0-9]\+/--seed, $$seed/" $(CLUSTERING_DIR)/Clustering_conda_tmp.yml; \
-		for i in 1 2 3; do \
-			echo "  Run $$i for seed $$seed..."; \
-			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_conda_tmp.yml; \
-			cp $(CLUSTERING_DIR)/Clustering_conda_tmp.yml out; \
-			mv out out_conda_seed_$$seed\_run_$$i; \
+		sed -i "s/--seed\",[[:space:]]*[0-9]\+/--seed\", $$seed/" $(CLUSTERING_DIR)/Clustering_conda_tmp.yml; \
+		for i in $(RUNS); do \
+			echo "  Run $$i for seed $$seed..."; \
+			echo "DEST: results/out_conda_seed_$$seed\_run_$$i"; \
+			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_conda_tmp.yml --out-dir results/out_conda_seed_$$seed\_run_$$i; \
+			cp $(CLUSTERING_DIR)/Clustering_conda_tmp.yml results/out_conda_seed_$$seed\_run_$$i/; \
		done; \
-		rm $(CLUSTERING_DIR)/Clustering_conda_tmp.yml; \
	done

 run_oras: clone_yamls
-	@for seed in 2 54 546 744 1443; do \
+	@for seed in $(SEEDS); do \
		echo "Running oras benchmark with seed $$seed..."; \
		cp $(CLUSTERING_DIR)/Clustering_oras.yml $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \
-		sed -i "s/--seed, [0-9]\+/--seed, $$seed/" $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \
-		for i in 1 2 3; do \
-			echo "  Run $$i for seed $$seed..."; \
-			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \
-			cp $(CLUSTERING_DIR)/Clustering_oras_tmp.yml out; \
-			mv out out_oras_seed_$$seed\_run_$$i; \
+		sed -i "s/--seed\",[[:space:]]*[0-9]\+/--seed\", $$seed/" $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \
+		for i in $(RUNS); do \
+			echo "  Run $$i for seed $$seed..."; \
+			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_oras_tmp.yml --out-dir results/out_oras_seed_$$seed\_run_$$i/; \
+			cp $(CLUSTERING_DIR)/Clustering_oras_tmp.yml results/out_oras_seed_$$seed\_run_$$i/; \
		done; \
-		rm $(CLUSTERING_DIR)/Clustering_oras_tmp.yml; \
	done

 run_envs: clone_yamls
	@bash -c '\
-	source /cvmfs/software.eessi.io/versions/2025.06/init/lmod/bash && \
-	module load EESSI-extend/2025.06-easybuild && \
-	export MODULEPATH="$(EASYBUILD_PREFIX)/software/modules/all:$$MODULEPATH" && \
-	module use $$MODULEPATH && \
-	echo $$MODULEPATH && \
-	for seed in 2 54 546 744 1443; do \
-		echo "Running envmodules benchmark with seed $$seed..."; \
-		cp $(CLUSTERING_DIR)/Clustering_envmodules.yml $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
-		sed -i "s/--seed, [0-9]\+/--seed, $$seed/" $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
-		for i in 1 2 3; do \
-			echo "  Run $$i for seed $$seed..."; \
-			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
-			cp $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml out; \
-			mv out out_envmodules_seed_$$seed\_run_$$i; \
-		done; \
-		rm $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
-	done \
-	'
-
+	source /cvmfs/software.eessi.io/versions/2025.06/init/lmod/bash && \
+	module load EESSI-extend/2025.06-easybuild && \
+	export MODULEPATH="$(EASYBUILD_PREFIX)/software/modules/all:$$MODULEPATH" && \
+	module use $$MODULEPATH && \
+	echo $$MODULEPATH && \
+	for seed in $(SEEDS); do \
+		echo "Running envmodules benchmark with seed $$seed..."; \
+		cp $(CLUSTERING_DIR)/Clustering_envmodules.yml $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
+		sed -i "s/--seed\",[[:space:]]*[0-9]\+/--seed\", $$seed/" $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml; \
+		for i in $(RUNS); do \
+			echo "  Run $$i for seed $$seed..."; \
+			${OB_CMD} -b $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml --out-dir results/out_envmodules_seed_$$seed\_run_$$i/; \
+			cp $(CLUSTERING_DIR)/Clustering_envmodules_tmp.yml results/out_envmodules_seed_$$seed\_run_$$i/; \
+		done; \
+	done \
+	'
 knit_report: clone_reports
-	R -e 'rmarkdown::render("$(REPORTS_DIR)/07_metrics_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))'
-	R -e 'rmarkdown::render("$(REPORTS_DIR)/08_performances_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))'
-	python parse_results.py
-	R -e 'rmarkdown::render("analyze_results.Rmd")'
+	## R -e 'rmarkdown::render("$(REPORTS_DIR)/07_metrics_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))'
+	## R -e 'rmarkdown::render("$(REPORTS_DIR)/08_performances_across_backends.Rmd", params = list(performance_bn = "performance-results.rds", metrics_bn = "metrics-results.rds", clustering_dir = "."))'
+	python parse_results.py > aggregated_results.json
+	R -e 'rmarkdown::render("analyze_results_izaskun.Rmd")'
diff --git a/analyze_results_izaskun.Rmd b/analyze_results_izaskun.Rmd
new file mode 100644
index 0000000..700022f
--- /dev/null
+++ b/analyze_results_izaskun.Rmd
@@ -0,0 +1,811 @@
+---
+title: "clustbench exploratory / fig 2 making"
+author: "Izaskun Mallona"
+output:
+  html_document:
+    theme: readable
+    toc: true
+    toc_float: true
+    code_folding: hide
+    code_download: true
+    number_sections: true
+    df_print: default
+    highlight: tango
+    keep_md: true
+    self_contained: false
+date: "`r format(Sys.Date(), '%B %d, %Y')`"
+---
+
+
+```{r setup, message = FALSE}
+library(knitr)
+library(tidyverse)
+library(jsonlite)
+library(ggplot2)
+library(data.table)
+library(parallel)
+
+knitr::opts_chunk$set(
+    echo = TRUE,
+    warning = TRUE,
+    message = TRUE,
+    fig.width = 10,
+    fig.height = 10,
+    fig.path = "plots/",
+    dev = c("png", "svg"),
+    cache.lazy = FALSE,
+    cache = FALSE)
+
+```
+
+```{r}
+
+## ks are provided as (up to) five values with the true k in the middle;
+## shorter vectors were padded with 2s on the left
+get_true_k <- function(x) {
+  n <- length(x)
+  if (n == 5) {
+    return(x[3])
+  } else if (n == 4) {
+    return(x[3])
+  } else if (n == 3) {
+    return(x[2])
+  } else {
+    return(x[1])
+  }
+}
+
+
+`%||%` <- function(a, b) if (!is.null(a)) a else b
+
+flatten_parameters <- function(params, parameter_dir) {
+  if (is.null(params) || length(params) == 0) {
+    return(data.frame(parameter_dir = parameter_dir,
+                      param_name = NA_character_,
+                      param_value = NA_character_,
+                      stringsAsFactors = FALSE))
+  }
+  flat <- unlist(params, use.names = TRUE)
+  data.frame(
+    parameter_dir = parameter_dir,
+    param_name = names(flat),
+    param_value = as.character(flat),
+    stringsAsFactors = FALSE
+  )
+}
+
+flatten_record <- function(rec) {
+  mets <- rec$metrics
+  perf <- rec$performance
+  params_df <- flatten_parameters(rec$parameters, rec$parameter_dir)
+
+  rows <- list()
+  idx <- 0
+
+  for (family in names(mets)) {
+    fam_list <- mets[[family]]
+    if (is.null(fam_list)) next
+
+    for (metric_name in names(fam_list)) {
+      metric_vals <- fam_list[[metric_name]]
+      if (is.null(metric_vals)) next
+
+      ks <- names(metric_vals)
+      true_k <- get_true_k(ks)
+
+      for (k in ks) {
+        for (i in seq_len(nrow(params_df))) {
+          idx <- idx + 1
+          rows[[idx]] <- data.frame(
+            backend = rec$backend,
+            seed = rec$seed,
+            run = rec$run,
+            generator = rec$generator,
+            dataset_name = rec$dataset_name,
+            method = rec$method,            # keep "sklearn"
+            method_full = rec$method_full,  # "sklearn_method-birch" or "sklearn_method-kmeans"
+            parameter_dir = rec$parameter_dir,  # "method-birch" or "method-kmeans"
+            param_name = params_df$param_name[i],
+            param_value = params_df$param_value[i],
+            metric_family = family,
+            metric_name = metric_name,
+            k = as.integer(k),
+            true_k = as.integer(true_k),
+            metric_value = metric_vals[[k]],
+            s = perf$s,
+            h_m_s = perf[["h:m:s"]],
+            max_rss = perf$max_rss,
+            max_vms = perf$max_vms,
+            max_uss = perf$max_uss,
+            max_pss = perf$max_pss,
+            io_in = perf$io_in,
+            io_out = perf$io_out,
+            mean_load = perf$mean_load,
+            cpu_time = perf$cpu_time,
+            stringsAsFactors = FALSE
+          )
+        }
+      }
+    }
+  }
+
+  do.call(rbind, rows)
+}
+
+
+```
+
+
+```{r}
+
+records <- fromJSON("aggregated_results.json", simplifyVector = FALSE)
+##records <- fromJSON("aggregated_results_full.json", simplifyVector = FALSE)
+
+## fd_list <- vector("list", length(records))
+## for (i in seq_along(records)) {
+##   if (i == 1 || i == 2 || i == 100 || i %% 1000 == 0)
+##     cat("Processing record", i, "of", length(records), "\n")
+##   fd_list[[i]] <- flatten_record(records[[i]])
+## }
+## fd <- data.table::rbindlist(fd_list, use.names = TRUE, fill = TRUE)
+
+
+# parallel apply instead
+fd_list <- mclapply(seq_along(records), function(i) {
+  if (i == 1 || i == 2 || i == 100 || i %% 1000 == 0)
+    cat("Processing record", i, "of", length(records), "\n")
+  flatten_record(records[[i]])
+}, mc.cores = detectCores())
+
+fd <- rbindlist(fd_list, use.names = TRUE, fill = TRUE)
+
+
+## dim(fd)
+
+## str(fd)
+## table(is.na(fd$max_rss))
+## table(fd$method_full)
+## table(vapply(records, function(x) x$method_full, character(1)))
+
+fd <- as.data.frame(fd)
+
+fd$k <- as.integer(fd$k)
+fd$true_k <- as.integer(fd$true_k)
+
+cols_to_num <- c("max_rss","max_vms","max_uss","max_pss",
+                 "io_in","io_out","mean_load","cpu_time",
+                 'metric_value')
+
+fd[cols_to_num] <- lapply(fd[cols_to_num], function(x) {
+  x[x == "NA"] <- NA_character_
+  as.numeric(x)
+})
+
+fd <- fd[!is.na(fd$metric_value),]
+fd$k_offset <- fd$k - fd$true_k
+## write.csv(fd, file = 'aggregated_results.csv') ## later, this needs extra cleaning
+```
+
+# QC
+
+Is this the speed of computing the metric, or of running the method?
+
+```{r}
+print(ggplot(fd[fd$backend %in% c("conda","oras","envmodules"), ],
+             aes(x = backend, y = cpu_time, fill = backend)) +
+      geom_boxplot(outlier.alpha = 0.3) +
+      theme_minimal(base_size = 14) +
+      labs(title = "CPU time by backend",
+           x = "Backend",
+           y = "CPU time (s)") +
+      scale_fill_brewer(palette = "Set2"))
+```
+
+Clearly the method, so all good. Below, a quick per-backend summary, then the same boxplot with segments following dataset/method/params.
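+
+A per-backend numeric companion to the boxplot above (a minimal sketch; it only assumes the `fd` columns built earlier):
+
+```{r}
+## median CPU time and row counts per backend
+fd %>%
+  filter(backend %in% c("conda", "oras", "envmodules")) %>%
+  group_by(backend) %>%
+  summarise(median_cpu_time = median(cpu_time, na.rm = TRUE),
+            n = n(), .groups = "drop")
+```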
+
+```{r}
+print(ggplot(fd[fd$backend %in% c("conda","oras","envmodules"), ],
+             aes(x = backend, y = cpu_time, fill = backend)) +
+      geom_boxplot(outlier.alpha = 0.3) +
+      # add points for each method/params combination
+      geom_point(alpha = 0.6, position = position_jitter(width = 0.15)) +
+      # connect points across backends for the same module+params
+      geom_line(aes(group = interaction(method, method_full, seed, run, generator, dataset_name, k)),
+                alpha = 0.1, color = "grey40") +
+      facet_wrap(~metric_name, scales = "free_y") +
+      theme_minimal(base_size = 14) +
+      labs(title = "CPU time by backend",
+           x = "Backend",
+           y = "CPU time (s)") +
+      scale_color_brewer(palette = "Set1")
+)
+```
+
+# Consistency checks
+
+Are there CPU times that are redundant/inconsistent across repeated runs?
+
+```{r}
+keys <- c("dataset_name", "method", "method_full", "generator",
+          "backend", "k", "seed", "run")
+
+fd_inconsistencies <- fd %>%
+  group_by(across(all_of(keys))) %>%
+  summarise(
+    n_cpu_time = n_distinct(cpu_time, na.rm = TRUE),
+    cpu_times = list(unique(cpu_time)),
+    .groups = "drop"
+  ) %>%
+  filter(n_cpu_time > 1)
+
+stopifnot(nrow(fd_inconsistencies) == 0)
+```
+
+Seed consistency
+
+```{r}
+keys <- c("dataset_name", "method", "method_full", "generator",
+          "backend", "run", "k", "metric_name")
+
+fd_seed_diff <- fd %>%
+  group_by(across(all_of(keys))) %>%
+  summarise(
+    n_seeds = n_distinct(seed),
+    n_metric_values = n_distinct(metric_value, na.rm = TRUE),
+    metric_values = list(unique(metric_value)),
+    .groups = "drop"
+  ) %>%
+  filter(n_seeds > 1 & n_metric_values > 1)
+
+stopifnot(nrow(fd_seed_diff) == 0)
+```
+
+`k` consistency
+
+```{r}
+fd <- fd[!is.na(fd$k),]
+stopifnot(range(fd$k_offset) == c(-2, 2))
+```
+
+Some seed-unaware methods were nevertheless run several times with different seeds; for those, the different "seeds" are not really seeds but extra runs. We drop them here (keeping only seed 2), since repeated/controlled runs are tracked separately; see the listing below and the filtering chunk after it.
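+
+Which methods are seed-aware can be listed directly (a minimal sketch, keying on the same `seed-` token the filtering chunk below uses):
+
+```{r}
+## distinct method variants, split by whether they encode a seed
+fd %>%
+  distinct(method_full) %>%
+  mutate(has_seed = grepl("seed-", method_full)) %>%
+  arrange(has_seed, method_full)
+```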
+
+```{r}
+
+## seed 2 for seed-unaware methods, all seeds for seed-aware methods
+fd_clean <- fd %>%
+  mutate(has_seed = grepl("seed-", method_full)) %>%
+  filter(
+    (has_seed) | (!has_seed & seed == 2)  # keep all seeds if encoded, else only seed==2
+  ) %>%
+  mutate(seed = ifelse(has_seed, as.character(seed), "none"))
+```
+
+Before
+
+```{r}
+table(fd$seed, grepl('seed', fd$method_full), useNA = 'always')
+```
+
+After
+
+```{r}
+table(fd_clean$seed, grepl('seed', fd_clean$method_full), useNA = 'always')
+
+fd <- fd_clean
+
+rm(fd_clean)
+
+write.csv(fd, file = 'aggregated_results.csv')
+# write.csv(fd, file = 'aggregated_results_full.csv')
+
+```
+
+
+# CPU by backend
+
+## QC by backend scatterplots
+
+```{r, fig.width = 7, fig.height = 5}
+
+wide_cpu <- fd %>%
+  filter(backend %in% c("conda","envmodules","oras")) %>%
+  group_by(dataset_name, method, method_full, seed, run, backend) %>%
+  summarise(cpu_time = mean(as.numeric(cpu_time), na.rm = TRUE), .groups = "drop") %>%
+  pivot_wider(names_from = backend, values_from = cpu_time)
+
+ggplot(wide_cpu, aes(x = conda, y = oras, color = method)) +
+  geom_point(alpha = 0.6) +
+  labs(title = "CPU time: conda vs oras",
+       x = "conda", y = "oras") +
+  theme_minimal()
+
+ggplot(wide_cpu, aes(x = conda, y = envmodules, color = method)) +
+  geom_point(alpha = 0.6) +
+  labs(title = "CPU time: conda vs envmodules",
+       x = "conda", y = "envmodules") +
+  theme_minimal()
+
+ggplot(wide_cpu, aes(x = oras, y = envmodules, color = method)) +
+  geom_point(alpha = 0.6) +
+  labs(title = "CPU time: oras vs envmodules",
+       x = "oras", y = "envmodules") +
+  theme_minimal()
+
+```
+
+## Censoring-aware: cpu_time < 0.05 s plotted as 0.05 s
+
+We impute NA cpu_time values (runs too quick to be profiled) as 0.05 s.
+
+```{r, fig.width = 12, fig.height = 8}
+
+min(fd$cpu_time, na.rm = TRUE)
+fd$imputed_cpu_time <- ifelse(is.na(fd$cpu_time), yes = 0.05, no = fd$cpu_time)
+fd$censored_cpu_time <- is.na(fd$cpu_time)
+
+## also, we average for repeated runs, seeds and ks
+
+fd_avg <- fd %>%
+  filter(backend %in% c("conda","oras","envmodules")) %>%
+  group_by(backend, method, method_full, generator, dataset_name) %>%
+  summarise(imputed_cpu_time = mean(imputed_cpu_time, na.rm = TRUE),
+            censored_cpu_time = any(censored_cpu_time),
+            .groups = "drop")
+
+
+## str(fd_avg)
+## head(fd_avg)
+
+ggplot(fd_avg, aes(x = backend, y = imputed_cpu_time)) +
+  geom_boxplot(outlier.alpha = 0.3) +
+  geom_point(aes(color = method_full, shape = censored_cpu_time),
+             alpha = 0.6, position = position_jitter(width = 0.15)) +
+  geom_line(aes(group = interaction(method, method_full, generator, dataset_name)),
+            alpha = 0.2, color = "grey40") +
+  theme_minimal(base_size = 14) +
+  facet_wrap(~method, scales = "free_y") +
+  labs(title = "CPU time by backend - censored",
+       x = "Backend",
+       y = "CPU time (s)") +
+  scale_y_sqrt()
+
+
+ggplot(fd_avg, aes(x = backend, y = imputed_cpu_time)) +
+  geom_boxplot(outlier.alpha = 0.3) +
+  geom_point(aes(color = method_full, shape = censored_cpu_time),
+             alpha = 0.6, position = position_jitter(width = 0.15)) +
+  geom_line(aes(group = interaction(method, method_full, generator, dataset_name)),
+            alpha = 0.2, color = "grey40") +
+  theme_minimal(base_size = 14) +
+  facet_wrap(~method, scales = "free_y") +
+  labs(title = "CPU time by backend - censored",
+       x = "Backend",
+       y = "CPU time (s)")
+
+```
+
+# Results consistency across backends
+
+## Pairwise correlations
+
+```{r, fig.width = 7, fig.height = 14}
+wide_run <- fd %>%
+  filter(backend %in% c("conda","oras","envmodules")) %>%
+  group_by(dataset_name, method, k, metric_name, run, seed, backend) %>%
+  summarise(metric_value = mean(as.numeric(metric_value), na.rm = TRUE), .groups = "drop") %>%
+  tidyr::pivot_wider(names_from = backend, values_from = metric_value, values_fill = NA)
+
+
+# compute correlations per metric_name AND seed, on pairwise complete obs
+cors_seed <- wide_run %>%
+  group_by(metric_name, seed) %>%
+  summarise(
+    cor_conda_oras = if(sum(complete.cases(conda, oras)) > 1)
+      cor(conda, oras, use = "complete.obs") else NA_real_,
+    cor_conda_envmodules = if(sum(complete.cases(conda, envmodules)) > 1)
+      cor(conda, envmodules, use = "complete.obs") else NA_real_,
+    cor_oras_envmodules = if(sum(complete.cases(oras, envmodules)) > 1)
+      cor(oras, envmodules, use = "complete.obs") else NA_real_,
+    .groups = "drop")
+
+
+print(cors_seed)
+
+cors_long <- cors_seed %>%
+  pivot_longer(
+    cols = starts_with("cor_"),
+    names_to = "pair",
+    values_to = "correlation")
+
+
+ggplot(cors_long, aes(x = pair, y = interaction(metric_name, seed), fill = correlation)) +
+  geom_tile(color = "white") +
+  geom_text(aes(label = round(correlation, 2)), color = "black", size = 3) +
+  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
+                       limits = c(-1, 1), na.value = "grey90") +
+  theme_minimal(base_size = 12) +
+  labs(title = "Backend correlations per metric and seed",
+       x = "Backend pair",
+       y = "Metric and seed",
+       fill = "cor coef")
+
+```
+
+And the repeated runs?
+
+```{r}
+
+wide_run <- fd %>%
+  filter(backend %in% c("conda","oras","envmodules")) %>%
+  group_by(dataset_name, method, k, metric_name, run, seed, backend) %>%
+  summarise(metric_value = mean(as.numeric(metric_value), na.rm = TRUE), .groups = "drop") %>%
+  unite("backend_run", backend, run, sep = "_") %>%  # combine backend and run
+  pivot_wider(names_from = backend_run, values_from = metric_value, values_fill = NA)
+
+
+# only correlating aggregated metrics
+num_cols <- wide_run %>%
+  select(where(is.numeric), -k, -seed)
+
+
+# pairwise cors, so including repeated runs with the same backend
+cors_all <- cor(num_cols, use = "pairwise.complete.obs")
+
+# melt the correlation matrix into long form (Var1, Var2, value)
+cors_df <- melt(cors_all)
+
+# heatmap with correlation coefficients
+ggplot(cors_df, aes(x = Var1, y = Var2, fill = value)) +
+  geom_tile(color = "white") +
+  geom_text(aes(label = round(value, 2)), color = "black", size = 3) +
+  scale_fill_gradient2(low = "blue", mid = "white", high = "red", midpoint = 0,
+                       limits = c(-1, 1), na.value = "grey90") +
+  theme_minimal(base_size = 12) +
+  labs(title = "Metric correlation per backend and repeated run",
+       x = "backend and run", y = "backend and run", fill = "Correlation")
+
+
+```
+
+# Computational performance metrics
+
+## Non-complete obs
+
+```{r, fig.width = 10, fig.height = 10}
+
+perf_metrics <- c("cpu_time","max_rss","max_vms","max_uss",
+                  "max_pss","io_in","io_out","mean_load", "imputed_cpu_time")
+
+fd_long <- fd %>%
+  filter(backend %in% c("conda","oras","envmodules")) %>%
+  pivot_longer(cols = all_of(perf_metrics),
+               names_to = "metric",
+               values_to = "value")
+
+## again, averaging repeated runs with the same seed
+fd_avg <- fd_long %>%
+  group_by(backend, seed, run, metric, dataset_name, method_full) %>%
+  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
+
+## head(fd_avg)
+
+# boxplots + jittered scatter, faceted by metric
+ggplot(fd_avg, aes(x = backend, y = mean_value, fill = backend)) +
+  geom_boxplot(outlier.alpha = 0.3) +
+  geom_jitter(width = 0.2, alpha = 0.1, size = 1, color = "black") +
+  facet_wrap(~metric, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance metrics by backend",
+       x = "Backend",
+       y = "Value") +
+  scale_fill_brewer(palette = "Set2")
+```
+
+## Complete obs
+
+Now the same, but keeping only items measured across all three backends (i.e., no NAs from runs too quick to be profiled).
+
+```{r, fig.width = 10, fig.height = 10}
+
+fd_long_complete <- fd %>%
+  filter(backend %in% c("conda","oras","envmodules")) %>%
+  pivot_longer(cols = all_of(perf_metrics),
+               names_to = "metric",
+               values_to = "value") %>%
+  group_by(method_full, dataset_name, seed, run, generator, k, metric) %>%
+  filter(n_distinct(backend) == 3, !any(is.na(value))) %>%
+  ungroup()
+
+fd_avg <- fd_long_complete %>%
+  group_by(backend, seed, run, metric, dataset_name, method_full) %>%
+  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
+
+ggplot(fd_avg, aes(x = backend, y = mean_value, fill = backend)) +
+  geom_boxplot(outlier.alpha = 0.3) +
+  geom_jitter(width = 0.2, alpha = 0.1, size = 1, color = "black") +
+  facet_wrap(~metric, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Perf metrics by backend (only complete groups)",
+       x = "backend",
+       y = "perf value") +
+  scale_fill_brewer(palette = "Set2")
+```
+
+## Non-complete observations colored by method
+
+```{r, fig.width = 15, fig.height = 10}
+fd_long <- fd %>%
+  pivot_longer(cols = all_of(perf_metrics),
+               names_to = "metric",
+               values_to = "value")
+
+fd_avg <- fd_long %>%
+  group_by(backend, seed, run, metric, dataset_name, method_full) %>%
+  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
+
+## head(fd_avg)
+## table(fd_avg$seed, fd_avg$run)
+## unique(fd_avg$method_full)
+
+ggplot(fd_avg, aes(x = backend, y = mean_value)) +
+  # points colored by method, seed and other params
+  geom_point(aes(color = method_full),
+             alpha = 0.6,
+             position = position_jitter(width = 0.15)) +
+  # connect points across backends for same method/run/etc.
+  geom_line(aes(group = interaction(method_full, seed, run, dataset_name)),
+            alpha = 0.1) +
+  facet_wrap(~metric, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance metrics by backend",
+       x = "Backend",
+       y = "Value")
+
+```
+
+## Non-complete observations colored by method provider
+
+```{r, fig.width = 15, fig.height = 10}
+fd_long <- fd %>%
+  pivot_longer(cols = all_of(perf_metrics),
+               names_to = "metric",
+               values_to = "value")
+
+fd_avg <- fd_long %>%
+  group_by(backend, seed, run, metric, dataset_name, method_full, method) %>%
+  summarise(mean_value = mean(value, na.rm = TRUE), .groups = "drop")
+
+## head(fd_avg)
+## table(fd_avg$seed, fd_avg$run)
+## unique(fd_avg$method_full)
+
+ggplot(fd_avg, aes(x = backend, y = mean_value)) +
+  # points colored by method, seed and other params
+  geom_point(aes(color = method),
+             alpha = 0.6,
+             position = position_jitter(width = 0.15)) +
+  # connect points across backends for same method/run/etc.
+  geom_line(aes(group = interaction(method, seed, run, dataset_name)),
+            alpha = 0.1) +
+  facet_wrap(~metric, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance metrics by backend",
+       x = "Backend",
+       y = "Value")
+
+```
+
+# Choice of `k` - and its misspecification
+
+## Deviation of performance at false k vs true k
+
+Mind there is no such thing as a true k; we use the first labelset.
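+
+For intuition, a minimal sketch of the offset convention (assuming the padded five-k layout that `get_true_k` encodes):
+
+```{r}
+ks <- c("2", "3", "4", "5", "6")  # ks as stored: names of the per-metric score list
+true_k <- get_true_k(ks)          # middle element, here "4"
+data.frame(k = as.integer(ks),
+           k_offset = as.integer(ks) - as.integer(true_k))
+```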
+
+```{r, fig.height = 10, fig.width = 13}
+fd$metric_value <- as.numeric(fd$metric_value)
+
+fd_dev_k <- fd %>%
+  group_by(method_full, dataset_name, metric_name) %>%
+  # get the metric_value at the true_k for this group
+  mutate(true_value = metric_value[k == true_k][1],
+         deviation_k = metric_value - true_value) %>%
+  ungroup()
+
+## summary(fd_dev_k$deviation_k)
+
+## str(fd_dev_k)
+ggplot(fd_dev_k, aes(x = factor(k), y = deviation_k, color = method_full)) +
+  geom_point(alpha = 0.6, position = position_jitter(width = 0.15)) +
+  facet_wrap(~ metric_name, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Deviation of metric across ks",
+       subtitle = "Deviation vs value at true k",
+       x = "k",
+       y = "Metric deviation from value at true k")
+```
+
+## By k offset
+
+```{r, fig.width = 12, fig.height = 9}
+## str(fd)
+fd_dev_k <- fd %>%
+  group_by(method_full, dataset_name, metric_name) %>%
+  # get the metric_value at the true_k for this group
+  mutate(true_value = metric_value[k == true_k][1],
+         deviation_k = metric_value - true_value) %>%
+  ungroup()
+
+## again aggregate across repeated runs - not seeds, given that method_full has them in its values
+fd_dev_k_avg <- fd_dev_k %>%
+  group_by(method_full, dataset_name, metric_name, k, k_offset) %>%
+  summarise(
+    mean_deviation_k = mean(deviation_k, na.rm = TRUE),
+    .groups = "drop"
+  )
+
+ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = method_full)) +
+  geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) +
+  facet_wrap(~ metric_name, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance impact of k offsets",
+       subtitle = "Deviation vs value at true k",
+       x = "Offset from true k",
+       y = "Metric deviation from value at true k")
+
+ggplot(fd_dev_k_avg, aes(x = k_offset, y = mean_deviation_k, color = dataset_name)) +
+  geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) +
+  facet_wrap(~ metric_name, scales = "free_y") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance impact of k offsets",
+       x = "Offset from true k",
+       y = "Metric deviation from value at true k")
+
+```
+
+Adjusted Rand index only
+
+```{r, fig.height = 6, fig.width = 16}
+# filter to adjusted_rand_score only
+fd_dev_k_ars <- fd_dev_k_avg %>%
+  filter(metric_name == "adjusted_rand_score")
+
+# by method_full
+ggplot(fd_dev_k_ars, aes(x = k_offset, y = mean_deviation_k, color = method_full)) +
+  geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance impact of k offsets",
+       subtitle = "Deviation vs value at true k",
+       x = "Offset from true k",
+       y = "Deviation in adjusted_rand_score")
+
+# by dataset_name
+ggplot(fd_dev_k_ars, aes(x = k_offset, y = mean_deviation_k, color = dataset_name)) +
+  geom_point(alpha = 0.6, position = position_jitter(width = 0.15, height = 0)) +
+  theme_minimal(base_size = 14) +
+  labs(title = "Performance impact of k offsets",
+       x = "Offset from true k",
+       y = "Deviation in adjusted_rand_score")
+
+```
+
+
+# Computational vs clustering performance trade-offs
+
+Caution: only cpu_time NAs are handled (imputed); the other perf metrics are not.
+
+```{r, fig.width = 12, fig.height = 12}
+# aggregate across runs and ks again...
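+# (one row per method_full x backend x dataset_name x metric_name; cpu_time NAs
+#  were imputed as 0.05 s above, so mean_imputed_cpu_time is censoring-aware)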
+fd_avg <- fd %>%
+  group_by(method_full, backend, dataset_name, metric_name) %>%
+  summarise(
+    mean_imputed_cpu_time = mean(imputed_cpu_time, na.rm = TRUE),
+    mean_max_rss = mean(max_rss, na.rm = TRUE),
+    mean_metric = mean(metric_value, na.rm = TRUE),
+    .groups = "drop"
+  )
+
+# imputed cpu time vs metric
+print(
+  ggplot(fd_avg, aes(x = mean_imputed_cpu_time, y = mean_metric,
+                     color = method_full, shape = backend)) +
+    geom_point(alpha = 0.6) +
+    facet_wrap(~metric_name, scales = "free_y") +
+    theme_minimal(base_size = 14) +
+    labs(title = "Clustering metrics vs runtime trade-offs",
+         x = "Mean CPU time (s)",
+         y = "Mean clustering metric value")
+)
+
+# RSS vs metric; caution: no imputation here / unclear how to handle memory for censored cpu_time data
+print(
+  ggplot(fd_avg, aes(x = mean_max_rss, y = mean_metric,
+                     color = method_full, shape = backend)) +
+    geom_point(alpha = 0.6) +
+    facet_wrap(~metric_name, scales = "free_y") +
+    theme_minimal(base_size = 14) +
+    labs(title = "Clustering metrics vs RSS trade-offs",
+         x = "Mean max RSS (MB)",
+         y = "Mean clustering metric value")
+)
+
+```
+
+# Bland–Altman plots for ks vs true k
+
+Again, there is no such thing as a true k.
+
+## Colored by method
+
+```{r, fig.width = 15, fig.height = 15, warning = FALSE}
+fd_dev_true <- fd %>%
+  group_by(method_full, dataset_name, metric_name) %>%
+  # get the metric value at true_k
+  mutate(true_value = metric_value[k == true_k][1],
+         diff_val = metric_value - true_value,
+         mean_val = (metric_value + true_value)/2) %>%
+  ungroup()
+
+## # Bland–Altman style plot: any k vs true_k
+## ggplot(fd_dev_true, aes(x = mean_val, y = diff_val, color = method_full)) +
+##   geom_point(alpha = 0.6) +
+##   geom_hline(yintercept = mean(fd_dev_true$diff_val, na.rm = TRUE),
+##              linetype = "dashed", color = "blue") +
+##   geom_hline(yintercept = mean(fd_dev_true$diff_val, na.rm = TRUE) + 1.96*sd(fd_dev_true$diff_val, na.rm = TRUE),
+##              linetype = "dotted", color = "red") +
+##   geom_hline(yintercept = mean(fd_dev_true$diff_val, na.rm = TRUE) - 1.96*sd(fd_dev_true$diff_val, na.rm = TRUE),
+##              linetype = "dotted", color = "red") +
+##   facet_wrap(~metric_name, scales = "free") +
+##   theme_minimal(base_size = 14) +
+##   labs(title = "Bland–Altman: any k vs true_k",
+##        x = "Mean of k and true_k",
+##        y = "Difference (k - true_k)")
+
+# collapse across seeds and runs
+fd_dev_true_avg <- fd_dev_true %>%
+  group_by(method_full, dataset_name, metric_name, k, k_offset) %>%
+  summarise(
+    mean_diff_val = mean(diff_val, na.rm = TRUE),
+    mean_mean_val = mean(mean_val, na.rm = TRUE),
+    .groups = "drop"
+  )
+
+ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = method_full)) +
+  geom_point(alpha = 0.6) +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dashed", color = "blue") +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE) + 1.96*sd(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dotted", color = "red") +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE) - 1.96*sd(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dotted", color = "red") +
+  facet_wrap(~metric_name, scales = "free") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Bland–Altman: any k vs true_k (averaged)",
+       x = "Mean of metric at k and true_k (averaged)",
+       y = "Metric difference (at k vs true_k, averaged)")
+
+
+```
+
+## Colored by dataset
+
+```{r, fig.width = 15, fig.height = 15, warning = FALSE}
+
+ggplot(fd_dev_true_avg, aes(x = mean_mean_val, y = mean_diff_val, color = dataset_name)) +
+  geom_point(alpha = 0.6) +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dashed", color = "blue") +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE) + 1.96*sd(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dotted", color = "red") +
+  geom_hline(yintercept = mean(fd_dev_true_avg$mean_diff_val, na.rm = TRUE) - 1.96*sd(fd_dev_true_avg$mean_diff_val, na.rm = TRUE),
+             linetype = "dotted", color = "red") +
+  facet_wrap(~metric_name, scales = "free") +
+  theme_minimal(base_size = 14) +
+  labs(title = "Bland–Altman: any k vs true_k (averaged)",
+       x = "Mean of metric at k and true_k (averaged)",
+       y = "Metric difference (at k vs true_k, averaged)")
+
+
+```
diff --git a/parse_results.py b/parse_results.py
old mode 100755
new mode 100644
index a3e60a9..c07cebd
--- a/parse_results.py
+++ b/parse_results.py
@@ -1,8 +1,9 @@
 #!/usr/bin/env python3
 """
-Simple script to parse clustbench results with glob pattern matching.
+Parse clustbench results with glob pattern matching.
 
-Pattern: out-{backend}-{rep}/data/clustbench/dataset_generator-{generator}_dataset_name-{name}/clustering/{method}
+Pattern:
+results/out_{backend}_seed_{seed}_run_{run}/data/clustbench/dataset_generator-{generator}_dataset_name-{name}/clustering/{method}
 """
 
 import csv
@@ -11,66 +12,92 @@
 import re
 from pathlib import Path
 from typing import Dict, List, Optional
+import sys
 
-
-def parse_result_path(path: Path) -> Dict[str, str]:
+def parse_result_path(path: Path) -> List[Dict[str, str]]:
     """
-    Parse a result path and extract components.
+    Parse a result path and extract components:
+    - backend, seed, run (from out_* directories)
+    - dataset generator and dataset name
+    - method (immediate folder after clustering/)
+    - method_full (method + variant symlink/subdir)
 
-    Pattern: out-{backend}-{rep}/data/clustbench/dataset_generator-{generator}_dataset_name-{name}/clustering/{method}
+    Returns a list of dicts, one per available variant directory under {method}.
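+
+    Example (hypothetical names; the seed/run values follow the Makefile's
+    --out-dir layout): results/out_conda_seed_2_run_1/data/clustbench/
+        dataset_generator-{generator}_dataset_name-{name}/clustering/sklearn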
""" parts = path.parts - - result = {} - - # Parse out-{backend}-{rep} - out_match = re.match(r'out-([^-]+)-(\d+)', parts[0]) + base_result: Dict[str, str] = {} + + # print("DEBUG parts:", parts, file = sys.stderr) + # parse out_{backend}_seed_{seed}_run_{run} + out_match = re.match( + r"out_(?P[a-zA-Z0-9]+)_seed_(?P\d+)_run_(?P\d+)", + parts[1] + ) if out_match: - result['backend'] = out_match.group(1) - result['rep'] = out_match.group(2) + base_result["backend"] = out_match.group("backend") + base_result["seed"] = out_match.group("seed") + base_result["run"] = out_match.group("run") - # Find dataset_generator part + # find dataset_generator part for part in parts: - if part.startswith('dataset_generator-'): - # Parse dataset_generator-{generator}_dataset_name-{name} - dataset_match = re.match(r'dataset_generator-([^_]+)_dataset_name-(.+)', part) + if part.startswith("dataset_generator-"): + dataset_match = re.match( + r"dataset_generator-([^_]+)_dataset_name-(.+)", part + ) if dataset_match: - result['generator'] = dataset_match.group(1) - result['dataset_name'] = dataset_match.group(2) + base_result["generator"] = dataset_match.group(1) + base_result["dataset_name"] = dataset_match.group(2) break - # The method is the last part (after clustering/) - if 'clustering' in parts: - clustering_idx = parts.index('clustering') - if clustering_idx + 1 < len(parts): - result['method'] = parts[clustering_idx + 1] + results: List[Dict[str, str]] = [] - result['path'] = str(path) + # The method is the folder after clustering/ + if "clustering" in parts: + clustering_idx = parts.index("clustering") + if clustering_idx + 1 < len(parts): + method_dir = parts[clustering_idx + 1] + base_result["method"] = method_dir + + method_path = path + if method_path.is_dir(): + for child in method_path.iterdir(): + # skip hidden dirs, hashes, and metrics folder + if child.name.startswith("."): + continue + if re.fullmatch(r"[0-9a-f]{32,}", child.name): + continue + if re.fullmatch(r"[0-9a-f]{8,}", child.name): + continue + if child.name == "metrics": + continue + + if child.is_symlink() or child.is_dir(): + r = base_result.copy() + r["method_full"] = f"{method_dir}_{child.name}" + r["path"] = str(child) + results.append(r) + else: + r = base_result.copy() + r["method_full"] = "/".join(parts[clustering_idx + 1:]) + r["path"] = str(path) + results.append(r) - return result + return results def parse_performance_file(perf_file: Path) -> Optional[Dict]: - """ - Parse a clustbench_performance.txt file (TSV format). - - Returns: - Dictionary with performance metrics, or None if file doesn't exist - """ + """Parse a clustbench_performance.txt file (TSV format).""" if not perf_file.exists(): return None try: with open(perf_file, 'r') as f: reader = csv.DictReader(f, delimiter='\t') - # Get the first (and only) data row for row in reader: - # Convert values to appropriate types result = {} for key, value in row.items(): if value: value = value.strip() - # Keep h:m:s as string, convert others to float if key == 'h:m:s': result[key] = value else: @@ -88,208 +115,158 @@ def parse_performance_file(perf_file: Path) -> Optional[Dict]: def parse_metric_scores(scores_file: Path) -> Optional[Dict[str, float]]: - """ - Parse a clustbench.scores.gz file. 
- - Format: - k=2,k=2,k=2,k=3,k=4 - 1.0,1.0,1.0,0.7671742903354675,0.7289468426413069 - - Returns: - Dictionary mapping k values to scores, or None if file doesn't exist - """ + """Parse a clustbench.scores.gz file into {k: score} dict.""" if not scores_file.exists(): return None - + try: with gzip.open(scores_file, 'rt') as f: lines = f.readlines() - + if len(lines) != 2: return {'error': f'Expected 2 lines, got {len(lines)}'} - - # Parse header (k values) - extract integers from "k=2" format - k_strings = [k.strip() for k in lines[0].strip().split(',')] + + k_strings = [k.strip().strip('"') for k in lines[0].strip().split(',')] k_values = [] for k_str in k_strings: - match = re.match(r'k=(\d+)', k_str) - if match: - k_values.append(int(match.group(1))) + m = re.match(r'k=(\d+)', k_str) + if m: + k_values.append(int(m.group(1))) else: return {'error': f'Invalid k format: {k_str}'} - - # Parse scores - scores = [float(s.strip()) for s in lines[1].strip().split(',')] - + + score_strings = [s.strip().strip('"') for s in lines[1].strip().split(',')] + scores = [] + for s in score_strings: + try: + scores.append(float(s)) + except ValueError: + return {'error': f'Invalid score: {s}'} + if len(k_values) != len(scores): return {'error': f'Mismatch: {len(k_values)} k values, {len(scores)} scores'} - - # Build result dict, checking for duplicate k values with different scores + result = {} for k, score in zip(k_values, scores): - if k in result: - # Check if the score is different - if abs(result[k] - score) > 1e-10: - raise ValueError(f'Duplicate k value {k} with different scores: {result[k]} vs {score}') - else: - result[k] = score - + if k in result and abs(result[k] - score) > 1e-10: + return {'error': f'Duplicate k {k} with differing scores'} + result[k] = score + return result - + except Exception as e: return {'error': str(e)} -def parse_metrics(param_dir: Path) -> Dict[str, Dict[str, Dict[str, float]]]: - """ - Parse metrics from a parameter directory. - - Structure: {param_dir}/metrics/{metric_family}/metric-{metric_name}/clustbench.scores.gz - - Returns: - Nested dict: {metric_family: {metric_name: {k: score}}} - """ +def parse_metrics(config_dir: Path) -> Dict[str, Dict[str, Dict[str, float]]]: + """Parse metrics from a configuration directory.""" metrics = {} - metrics_dir = param_dir / 'metrics' - + metrics_dir = config_dir / 'metrics' if not metrics_dir.exists(): return metrics - - # Iterate over metric families + for family_dir in metrics_dir.iterdir(): if not family_dir.is_dir(): continue - family_name = family_dir.name metrics[family_name] = {} - - # Iterate over metrics in this family for metric_dir in family_dir.iterdir(): if not metric_dir.is_dir(): continue - - # Extract metric name from metric-{name} pattern metric_match = re.match(r'metric-(.+)', metric_dir.name) if not metric_match: continue - metric_name = metric_match.group(1) - - # Parse the scores file scores_file = metric_dir / 'clustbench.scores.gz' scores = parse_metric_scores(scores_file) - if scores: metrics[family_name][metric_name] = scores - return metrics -def find_results(base_dir: str = '.', pattern: str = 'out-*/data/clustbench/dataset_generator-*/clustering/*') -> List[Dict[str, str]]: +def find_results( + base_dir: str = ".", + pattern: str = "results/out_*/data/clustbench/dataset_generator-*/clustering/*" +) -> List[Dict[str, str]]: """ - Find all result directories matching the pattern. 
- - Args: - base_dir: Base directory to search from - pattern: Glob pattern to match - - Returns: - List of parsed result dictionaries + Return one record per configuration folder with parameters, performance, and metrics. """ base_path = Path(base_dir) - results = [] + results: List[Dict[str, str]] = [] for path in base_path.glob(pattern): - if path.is_dir(): - # Skip hidden directories (starting with .) - if not any(part.startswith('.') for part in path.parts): - parsed = parse_result_path(path) - - # Find all parameter directories (subdirectories with parameter patterns) - param_dirs = [d for d in path.iterdir() if d.is_dir() and not d.name.startswith('.')] - - if param_dirs: - # Parse configurations and their performance - parsed['configurations'] = [] - - # Assume first param_dir for method-level data - first_param_dir = param_dirs[0] - - # Parse performance file at method level - perf_file = first_param_dir / 'clustbench_performance.txt' - performance = parse_performance_file(perf_file) - if performance: - parsed['performance'] = performance - - # Parse metrics at method level - metrics = parse_metrics(first_param_dir) - if metrics: - parsed['metrics'] = metrics - - # Add method_params and method_full at method level - method_params = first_param_dir.name - - # Extract method from method-{method} pattern if present - method_match = re.match(r'method-([^_]+)', method_params) - if method_match: - extracted_method = method_match.group(1) - parsed['method'] = extracted_method - - method_full = f"{parsed.get('method', '')}_{method_params}" - parsed['method_params'] = method_params - parsed['method_full'] = method_full - - for param_dir in param_dirs: - # Load parameters.json if it exists - params_file = param_dir / 'parameters.json' - parameters = None - if params_file.exists(): - try: - with open(params_file, 'r') as f: - parameters = json.load(f) - except Exception as e: - parameters = {'error': str(e)} - - config = { - 'parameter_dir': param_dir.name, - 'parameters': parameters - } + if not path.is_dir(): + continue + if any(part.startswith(".") for part in path.parts): + continue - parsed['configurations'].append(config) + variants = parse_result_path(path) + for variant in variants: + config_dir = Path(variant["path"]) + if not config_dir.is_dir() or config_dir.name == "metrics": + continue - results.append(parsed) + record = variant.copy() + + # Parameters + params_file = config_dir / "parameters.json" + parameters = None + if params_file.exists(): + try: + with open(params_file, "r") as f: + parameters = json.load(f) + except Exception as e: + parameters = {"error": str(e)} + record["parameters"] = parameters + record["parameter_dir"] = config_dir.name + + # Performance + perf_file = config_dir / "clustbench_performance.txt" + performance = parse_performance_file(perf_file) + if performance: + record["performance"] = performance + + # Metrics + metrics = parse_metrics(config_dir) + if metrics: + record["metrics"] = metrics + + # Normalize method name + m = re.match(r"method-([^_]+)", config_dir.name) + if m: + record["method"] = m.group(1) + + # Ensure method_full includes config dir name once + variant_name = record["method_full"] + if config_dir.name not in variant_name: + record["method_full"] = f"{variant_name}_{config_dir.name}" + + record["path"] = str(config_dir) + results.append(record) return results def main(): - """Main function to run the parser.""" - # Find all matching results results = find_results() - - # Print as JSON print(json.dumps(results, indent=2)) - # Print 
summary - print(f"\n# Found {len(results)} result directories", file=__import__('sys').stderr) - - # Group by backend, generator, method - by_backend = {} - by_generator = {} - by_method = {} + # Summary to stderr + import sys + print(f"\n# Found {len(results)} result directories", file=sys.stderr) + by_backend, by_generator, by_method = {}, {}, {} for r in results: backend = r.get('backend', 'unknown') generator = r.get('generator', 'unknown') method = r.get('method', 'unknown') - by_backend[backend] = by_backend.get(backend, 0) + 1 by_generator[generator] = by_generator.get(generator, 0) + 1 by_method[method] = by_method.get(method, 0) + 1 - print(f"# By backend: {by_backend}", file=__import__('sys').stderr) - print(f"# By generator: {by_generator}", file=__import__('sys').stderr) - print(f"# By method: {by_method}", file=__import__('sys').stderr) + print(f"# By backend: {by_backend}", file=sys.stderr) + print(f"# By generator: {by_generator}", file=sys.stderr) + print(f"# By method: {by_method}", file=sys.stderr) if __name__ == '__main__':