From 20754df611ecbe65634bc8cf4aac43a72e81e5b3 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Wed, 29 Apr 2026 11:25:07 -0700
Subject: [PATCH 01/10] initial implementation of artifact handler

---
 R/InvalidationPolicy.R                   | 142 +++++++++++
 R/RunAnalyses.R                          |  72 +++++-
 R/SettingsHasher.R                       | 285 ++++++++++++++++++++++
 R/ValidateArtifact.R                     | 150 ++++++++++++
 man/InvalidationPolicy.Rd                |  94 ++++++++
 man/SettingsHasher.Rd                    | 238 +++++++++++++++++++
 man/ValidateArtifact.Rd                  | 144 ++++++++++++
 tests/testthat/test-invalidationPolicy.R | 288 +++++++++++++++++++++++
 8 files changed, 1405 insertions(+), 8 deletions(-)
 create mode 100644 R/InvalidationPolicy.R
 create mode 100644 R/SettingsHasher.R
 create mode 100644 R/ValidateArtifact.R
 create mode 100644 man/InvalidationPolicy.Rd
 create mode 100644 man/SettingsHasher.Rd
 create mode 100644 man/ValidateArtifact.Rd
 create mode 100644 tests/testthat/test-invalidationPolicy.R

diff --git a/R/InvalidationPolicy.R b/R/InvalidationPolicy.R
new file mode 100644
index 0000000..9934bea
--- /dev/null
+++ b/R/InvalidationPolicy.R
@@ -0,0 +1,142 @@
+# Copyright 2026 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortMethod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Invalidation Policy
+#'
+#' @description
+#' Determines which artifacts should be deleted when analysis settings change,
+#' based on dependency relationships between components.
+#'
+#' @details
+#' When settings change, not all artifacts need to be regenerated. This class
+#' implements the logic to determine minimal invalidation scope based on which
+#' settings components have changed.
+#'
+#' @export
+InvalidationPolicy <- R6::R6Class(
+  "InvalidationPolicy",
+  public = list(
+    #' @description
+    #' Compute which file patterns should be deleted based on changed components
+    #'
+    #' @param changedComponents A list (as returned by SettingsHasher$compareSettingsComponents)
+    #'   with logical values for each component
+    #'
+    #' @return
+    #' Character vector of file patterns/globs to delete. Empty vector if no deletion needed.
+    #'
+    #' @details
+    #' Deletion cascade logic:
+    #' - If loadArgsChanged: all downstream artifacts (everything)
+    #' - Else if studyPopArgsChanged: populations, PS models, strata, outcomes
+    #' - Else if psArgsChanged: PS models, strata, outcomes
+    #' - Else if strataArgsChanged: strata and outcomes
+    #' - Else if outcomeModelArgsChanged: outcome models only
+    #' - Else if only balanceArgsChanged: balance files only
+    #' - Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific)
+    computeInvalidationScope = function(changedComponents) {
+      checkmate::assertList(changedComponents)
+
+      filePatternsToDelete <- c()
+
+      # Cascade of invalidation based on dependency graph
+      if (isTRUE(changedComponents$loadArgsChanged)) {
+        # If data loading changed, everything depends on it, so delete all analysis artifacts
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "CmData_.*\\.zip$",      # All CohortMethodData
+          "StudyPop_.*\\.rds$",    # All study populations
+          "Ps_.*\\.rds$",          # All propensity scores
+          "StratPop_.*\\.rds$",    # All stratified populations
+          "Balance_.*\\.rds$",     # All balance files
+          "Analysis_[0-9]+$"       # All analysis folders (contains outcome models)
+        )
+      } else if (isTRUE(changedComponents$studyPopArgsChanged)) {
+        # Study population arguments changed: recompute populations and everything downstream
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "StudyPop_.*\\.rds$",    # All study populations
+          "Ps_.*\\.rds$",          # All PS (may depend on population)
+          "StratPop_.*\\.rds$",    # All stratified populations
+          "Balance_.*\\.rds$",     # All balance files
+          "Analysis_[0-9]+$"       # All outcome models
+        )
+      } else if (isTRUE(changedComponents$psArgsChanged)) {
+        # Propensity score arguments changed: recompute PS and downstream
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "Ps_.*\\.rds$",          # All PS models
+          "StratPop_.*\\.rds$",    # All stratified populations (depend on PS)
+          "Balance_.*\\.rds$",     # All balance files
+          "Analysis_[0-9]+$"       # All outcome models
+        )
+      } else if (isTRUE(changedComponents$strataArgsChanged)) {
+        # Stratification (trim/match/stratify) changed: recompute strata and downstream
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "StratPop_.*\\.rds$",    # All stratified populations
+          "Balance_.*\\.rds$",     # All balance files
+          "Analysis_[0-9]+$"       # All outcome models
+        )
+      } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) {
+        # Outcome model arguments changed: recompute outcome models only
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "Analysis_[0-9]+$"       # All outcome model folders
+        )
+      } else if (isTRUE(changedComponents$balanceArgsChanged)) {
+        # Balance computation arguments changed: delete balance files only
+        filePatternsToDelete <- c(
+          filePatternsToDelete,
+          "Balance_.*\\.rds$"      # All balance files
+        )
+      }
+      # If analyticsChanged but nothing else changed (e.g., only outcomes added):
+      # No deletion needed - outcome models are outcome-specific, new ones will be computed
+
+      return(filePatternsToDelete)
+    },
+
+    #' @description
+    #' Get a human-readable message describing what will be deleted
+    #'
+    #' @param changedComponents A list of changed components
+    #'
+    #' @return
+    #' Character string suitable for displaying to the user
+    getInvalidationMessage = function(changedComponents) {
+      checkmate::assertList(changedComponents)
+
+      if (isTRUE(changedComponents$loadArgsChanged)) {
+        return("Data loading arguments have changed. All analysis artifacts must be regenerated.")
+      } else if (isTRUE(changedComponents$studyPopArgsChanged)) {
+        return("Study population settings have changed. Study populations, propensity scores, and outcome models must be regenerated.")
+      } else if (isTRUE(changedComponents$psArgsChanged)) {
+        return("Propensity score settings have changed. Propensity scores and outcome models must be regenerated.")
+      } else if (isTRUE(changedComponents$strataArgsChanged)) {
+        return("Stratification settings (trimming/matching/stratifying) have changed. Stratified populations and outcome models must be regenerated.")
+      } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) {
+        return("Outcome model settings have changed. Outcome models must be regenerated.")
+      } else if (isTRUE(changedComponents$balanceArgsChanged)) {
+        return("Covariate balance settings have changed. Balance files must be recomputed.")
+      } else if (isTRUE(changedComponents$analyticsChanged)) {
+        return("Analyses have changed (new outcomes or studies added). New analyses will be computed.")
+      } else {
+        return("No artifacts need to be deleted.")
+      }
+    }
+  )
+)
diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
index 4cc0e6d..2f04330 100644
--- a/R/RunAnalyses.R
+++ b/R/RunAnalyses.R
@@ -209,14 +209,70 @@ runCmAnalyses <- function(connectionDetails,
   if (file.exists(cmAnalysesSpecificationsFile)) {
     oldCmAnalysesSpecifications <- readRDS(cmAnalysesSpecificationsFile)
     if (!isTRUE(all.equal(oldCmAnalysesSpecifications$toList(), cmAnalysesSpecifications$toList()))) {
-      rm(list = ls(envir = cache), envir = cache)
-      message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder))
-      response <- utils::askYesNo("Do you want to delete the old files before proceeding?")
-      if (is.na(response)) {
-        # Cancel:
-        return()
-      } else if (response == TRUE) {
-        unlink(outputFolder, recursive = TRUE)
+      # Use intelligent invalidation policy to determine what needs to be deleted
+      hasher <- SettingsHasher$new()
+      changedComponents <- hasher$compareSettingsComponents(
+        oldCmAnalysesSpecifications,
+        cmAnalysesSpecifications
+      )
+
+      if (any(as.logical(changedComponents))) {
+        # Determine minimal set of files to delete based on what changed
+        policy <- InvalidationPolicy$new()
+        filePatternsToDelete <- policy$computeInvalidationScope(changedComponents)
+
+        if (length(filePatternsToDelete) > 0) {
+          # Get human-readable message about what will be deleted
+          message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder))
+          message(policy$getInvalidationMessage(changedComponents))
+
+          # Show some affected files to user
+          affectedFiles <- c()
+          for (pattern in filePatternsToDelete) {
+            matchedFiles <- list.files(outputFolder, pattern = pattern, full.names = FALSE)
+            affectedFiles <- c(affectedFiles, matchedFiles)
+          }
+
+          if (length(affectedFiles) > 0) {
+            message(sprintf(
+              "Files to be deleted: %d file(s) matching %d pattern(s)",
+              length(affectedFiles),
+              length(filePatternsToDelete)
+            ))
+            if (length(affectedFiles) <= 10) {
+              for (f in affectedFiles) {
+                message(sprintf("  - %s", f))
+              }
+            } else {
+              for (f in head(affectedFiles, 5)) {
+                message(sprintf("  - %s", f))
+              }
+              message(sprintf("  ... and %d more files", length(affectedFiles) - 5))
+            }
+          }
+
+          response <- utils::askYesNo("Do you want to delete these artifacts before proceeding?")
+          if (is.na(response)) {
+            # Cancel:
+            return()
+          } else if (response == TRUE) {
+            # Delete files matching patterns
+            for (pattern in filePatternsToDelete) {
+              files <- list.files(outputFolder, pattern = pattern, full.names = TRUE, recursive = TRUE)
+              if (length(files) > 0) {
+                unlink(files)
+              }
+              # Also try to delete directories matching pattern
+              dirs <- list.dirs(outputFolder, full.names = TRUE, recursive = FALSE)
+              matchingDirs <- dirs[grep(pattern, basename(dirs))]
+              if (length(matchingDirs) > 0) {
+                unlink(matchingDirs, recursive = TRUE)
+              }
+            }
+          }
+        }
+        # Clear cache regardless (old cache may reference deleted files)
+        rm(list = ls(envir = cache), envir = cache)
       }
     }
   }
diff --git a/R/SettingsHasher.R b/R/SettingsHasher.R
new file mode 100644
index 0000000..70efcac
--- /dev/null
+++ b/R/SettingsHasher.R
@@ -0,0 +1,285 @@
+# Copyright 2026 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortMethod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Settings Hasher
+#'
+#' @description
+#' Compute deterministic hashes for individual settings components to detect
+#' which parts of the analysis specification have changed.
+#'
+#' @details
+#' This class hashes each settings component independently using JSON serialization
+#' and MD5 digests. This allows fine-grained detection of which artifact types need
+#' to be invalidated when specifications change.
+#'
+#' @export
+SettingsHasher <- R6::R6Class(
+  "SettingsHasher",
+  public = list(
+    #' @description
+    #' Hash the data loading arguments
+    #'
+    #' @param getDbCohortMethodDataArgs An object of type `GetDbCohortMethodDataArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashLoadArgs = function(getDbCohortMethodDataArgs) {
+      if (is.null(getDbCohortMethodDataArgs)) {
+        return("")
+      }
+      json <- getDbCohortMethodDataArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash the study population arguments
+    #'
+    #' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashStudyPopArgs = function(createStudyPopulationArgs) {
+      if (is.null(createStudyPopulationArgs)) {
+        return("")
+      }
+      json <- createStudyPopulationArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash the propensity score arguments
+    #'
+    #' @param createPsArgs An object of type `CreatePsArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashPsArgs = function(createPsArgs) {
+      if (is.null(createPsArgs)) {
+        return("")
+      }
+      json <- createPsArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash arguments for propensity score trimming
+    #'
+    #' @param trimByPsArgs An object of type `TrimByPsArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashTrimByPsArgs = function(trimByPsArgs) {
+      if (is.null(trimByPsArgs)) {
+        return("")
+      }
+      json <- trimByPsArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash arguments for propensity score matching
+    #'
+    #' @param matchOnPsArgs An object of type `MatchOnPsArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashMatchOnPsArgs = function(matchOnPsArgs) {
+      if (is.null(matchOnPsArgs)) {
+        return("")
+      }
+      json <- matchOnPsArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash arguments for propensity score stratification
+    #'
+    #' @param stratifyByPsArgs An object of type `StratifyByPsArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashStratifyByPsArgs = function(stratifyByPsArgs) {
+      if (is.null(stratifyByPsArgs)) {
+        return("")
+      }
+      json <- stratifyByPsArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash arguments for outcome model fitting
+    #'
+    #' @param fitOutcomeModelArgs An object of type `FitOutcomeModelArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashOutcomeModelArgs = function(fitOutcomeModelArgs) {
+      if (is.null(fitOutcomeModelArgs)) {
+        return("")
+      }
+      json <- fitOutcomeModelArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Hash arguments for covariate balance computation
+    #'
+    #' @param computeCovariateBalanceArgs An object of type `ComputeCovariateBalanceArgs`
+    #'
+    #' @return
+    #' Character string containing the MD5 hash (32 hex characters)
+    hashBalanceArgs = function(computeCovariateBalanceArgs) {
+      if (is.null(computeCovariateBalanceArgs)) {
+        return("")
+      }
+      json <- computeCovariateBalanceArgs$toJson()
+      return(digest::digest(json, algo = "md5"))
+    },
+
+    #' @description
+    #' Compare old and new specifications to identify which components changed
+    #'
+    #' @param oldSpecs An object of type `CmAnalysesSpecifications` (old version)
+    #' @param newSpecs An object of type `CmAnalysesSpecifications` (new version)
+    #'
+    #' @return
+    #' A list of logical values indicating which components changed:
+    #' - loadArgsChanged: whether data loading arguments differ
+    #' - studyPopArgsChanged: whether study population arguments differ
+    #' - psArgsChanged: whether propensity score arguments differ
+    #' - strataArgsChanged: whether stratification arguments (trim/match/stratify) differ
+    #' - outcomeModelArgsChanged: whether outcome model arguments differ
+    #' - balanceArgsChanged: whether balance computation arguments differ
+    #' - analyticsChanged: whether outcome of interest or analysis IDs changed
+    compareSettingsComponents = function(oldSpecs, newSpecs) {
+      checkmate::assertClass(oldSpecs, "CmAnalysesSpecifications")
+      checkmate::assertClass(newSpecs, "CmAnalysesSpecifications")
+
+      # Compare each analysis component's settings
+      changedComponents <- list(
+        loadArgsChanged = FALSE,
+        studyPopArgsChanged = FALSE,
+        psArgsChanged = FALSE,
+        strataArgsChanged = FALSE,
+        outcomeModelArgsChanged = FALSE,
+        balanceArgsChanged = FALSE,
+        analyticsChanged = FALSE
+      )
+
+      # Get the old and new analysis lists
+      oldAnalyses <- oldSpecs$cmAnalysisList
+      newAnalyses <- newSpecs$cmAnalysisList
+
+      # If the number of analyses differs significantly or can't be matched, mark as changed
+      if (length(oldAnalyses) != length(newAnalyses)) {
+        # For now, mark as all changed if list lengths differ
+        # A more sophisticated approach would match analyses by ID
+        changedComponents$analyticsChanged <- TRUE
+      } else {
+        # Compare each analysis by ID (assuming they maintain order/ID)
+        for (i in seq_along(newAnalyses)) {
+          oldAna <- oldAnalyses[[i]]
+          newAna <- newAnalyses[[i]]
+
+          # Check data loading
+          oldLoadHash <- self$hashLoadArgs(oldAna$getDbCohortMethodDataArgs)
+          newLoadHash <- self$hashLoadArgs(newAna$getDbCohortMethodDataArgs)
+          if (oldLoadHash != newLoadHash) {
+            changedComponents$loadArgsChanged <- TRUE
+          }
+
+          # Check study population
+          oldStudyPopHash <- self$hashStudyPopArgs(oldAna$createStudyPopulationArgs)
+          newStudyPopHash <- self$hashStudyPopArgs(newAna$createStudyPopulationArgs)
+          if (oldStudyPopHash != newStudyPopHash) {
+            changedComponents$studyPopArgsChanged <- TRUE
+          }
+
+          # Check propensity score
+          oldPsHash <- self$hashPsArgs(oldAna$createPsArgs)
+          newPsHash <- self$hashPsArgs(newAna$createPsArgs)
+          if (oldPsHash != newPsHash) {
+            changedComponents$psArgsChanged <- TRUE
+          }
+
+          # Check stratification (trim/match/stratify all affect strata)
+          oldTrimHash <- self$hashTrimByPsArgs(oldAna$trimByPsArgs)
+          newTrimHash <- self$hashTrimByPsArgs(newAna$trimByPsArgs)
+          oldMatchHash <- self$hashMatchOnPsArgs(oldAna$matchOnPsArgs)
+          newMatchHash <- self$hashMatchOnPsArgs(newAna$matchOnPsArgs)
+          oldStratifyHash <- self$hashStratifyByPsArgs(oldAna$stratifyByPsArgs)
+          newStratifyHash <- self$hashStratifyByPsArgs(newAna$stratifyByPsArgs)
+
+          if (oldTrimHash != newTrimHash || oldMatchHash != newMatchHash || oldStratifyHash != newStratifyHash) {
+            changedComponents$strataArgsChanged <- TRUE
+          }
+
+          # Check outcome model
+          oldOutcomeHash <- self$hashOutcomeModelArgs(oldAna$fitOutcomeModelArgs)
+          newOutcomeHash <- self$hashOutcomeModelArgs(newAna$fitOutcomeModelArgs)
+          if (oldOutcomeHash != newOutcomeHash) {
+            changedComponents$outcomeModelArgsChanged <- TRUE
+          }
+
+          # Check balance args
+          oldBalanceHash <- self$hashBalanceArgs(oldAna$computeSharedCovariateBalanceArgs)
+          newBalanceHash <- self$hashBalanceArgs(newAna$computeSharedCovariateBalanceArgs)
+          if (oldBalanceHash != newBalanceHash) {
+            changedComponents$balanceArgsChanged <- TRUE
+          }
+        }
+      }
+
+      # Check if outcomes of interest or excluded analyses changed
+      oldOutcomes <- oldSpecs$targetComparatorOutcomesList
+      newOutcomes <- newSpecs$targetComparatorOutcomesList
+
+      if (length(oldOutcomes) != length(newOutcomes)) {
+        changedComponents$analyticsChanged <- TRUE
+      } else {
+        # Check if outcomes within TCOs differ
+        for (i in seq_along(newOutcomes)) {
+          oldTco <- oldOutcomes[[i]]
+          newTco <- newOutcomes[[i]]
+
+          # Simple check: compare target, comparator, nesting cohort
+          if (oldTco$targetId != newTco$targetId ||
+            oldTco$comparatorId != newTco$comparatorId ||
+            !identical(oldTco$nestingCohortId, newTco$nestingCohortId)) {
+            changedComponents$analyticsChanged <- TRUE
+            break
+          }
+
+          # Check outcome changes (new outcomes added or old ones removed)
+          oldOutcomeIds <- sort(sapply(oldTco$outcomes, function(x) x$outcomeId))
+          newOutcomeIds <- sort(sapply(newTco$outcomes, function(x) x$outcomeId))
+          if (!identical(oldOutcomeIds, newOutcomeIds)) {
+            # Just outcomes added: no need to delete
+            # Outcomes removed: potentially need to delete
+            if (any(!(newOutcomeIds %in% oldOutcomeIds))) {
+              # New outcomes added - this is OK, don't mark as changed
+            } else {
+              # Outcomes removed - this means regeneration may be needed
+              changedComponents$analyticsChanged <- TRUE
+            }
+          }
+        }
+      }
+
+      return(changedComponents)
+    }
+  )
+)
diff --git a/R/ValidateArtifact.R b/R/ValidateArtifact.R
new file mode 100644
index 0000000..708822d
--- /dev/null
+++ b/R/ValidateArtifact.R
@@ -0,0 +1,150 @@
+# Copyright 2026 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortMethod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' Validate Artifact
+#'
+#' @description
+#' Utility functions to validate that cached artifacts match expected parameter hashes.
+#'
+#' @details
+#' These functions help determine whether an existing artifact can be reused by
+#' validating that its parameters match the hash encoded in the filename or metadata.
+#'
+#' @keywords internal
+#'
+#' @export
+ValidateArtifact <- R6::R6Class(
+  "ValidateArtifact",
+  public = list(
+    #' @description
+    #' Extract hash from a filename
+    #'
+    #' @param filename Character string with the artifact filename
+    #'
+    #' @return
+    #' Character string containing the extracted hash, or NA if not found
+    #'
+    #' @details
+    #' Filenames typically have format like:
+    #' - CmData_l{loadId}_{tcn}.zip
+    #' - StudyPop_l{loadId}_s{studyPopId}_{tcn}_o{outcomeId}.rds
+    #'
+    #' This function extracts numeric IDs which are used as hashes.
+    extractHashFromFilename = function(filename) {
+      # This is a placeholder - actual hash extraction depends on filename format
+      # In CohortMethod, the "ID" (like loadId, studyPopId) is the hash
+      # This would be enhanced based on actual filename patterns used
+
+      # For now, return NA - the actual validation happens at file existence level
+      return(NA_character_)
+    },
+
+    #' @description
+    #' Validate that an artifact file exists and matches expected settings
+    #'
+    #' @param filepath Character string with the full path to the artifact file
+    #' @param settingsHash Character string with the expected settings hash
+    #'
+    #' @return
+    #' Logical TRUE if file exists and hash matches, FALSE otherwise
+    #'
+    #' @details
+    #' This function performs basic validation. In the current implementation,
+    #' we rely on filename encoding to ensure the hash is embedded in the filename.
+    #' If the file exists and has the right naming pattern, we consider it valid.
+    validateArtifactFile = function(filepath, settingsHash = NULL) {
+      # Check if file exists
+      if (!file.exists(filepath)) {
+        return(FALSE)
+      }
+
+      # If no hash provided, just check existence
+      if (is.null(settingsHash)) {
+        return(TRUE)
+      }
+
+      # Extract filename
+      filename <- basename(filepath)
+
+      # In current CohortMethod implementation, the hash is encoded in the filename
+      # through the ID system (loadId, studyPopId, etc.)
+      # Validation consists of checking that the file exists with correct naming
+
+      # For now, assume existence + correct naming = valid artifact
+      # This could be enhanced with actual hash extraction and comparison
+
+      return(TRUE)
+    },
+
+    #' @description
+    #' Check if a cohort method data file is valid and reusable
+    #'
+    #' @param filepath Character string with the path to the CmData zip file
+    #' @param expectedLoadId Numeric ID expected for this data loading configuration
+    #'
+    #' @return
+    #' Logical TRUE if file exists and corresponds to the expected load ID
+    validateCohortMethodData = function(filepath, expectedLoadId) {
+      if (!file.exists(filepath)) {
+        return(FALSE)
+      }
+
+      # Check that file is readable as a zip
+      tryCatch({
+        # Try to get zip file list to verify integrity
+        zip::zip_list(filepath)
+        return(TRUE)
+      }, error = function(e) {
+        return(FALSE)
+      })
+    },
+
+    #' @description
+    #' List all analysis artifacts in a directory by type
+    #'
+    #' @param outputFolder Character string with the output folder path
+    #' @param artifactType Character string indicating artifact type:
+    #'   "cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"
+    #'
+    #' @return
+    #' Character vector of full paths to matching artifact files
+    listArtifactsByType = function(outputFolder, artifactType) {
+      checkmate::assertCharacter(outputFolder, len = 1)
+      checkmate::assertCharacter(artifactType, len = 1)
+
+      if (!dir.exists(outputFolder)) {
+        return(character())
+      }
+
+      patterns <- list(
+        cmdata = "^CmData_.*\\.zip$",
+        studypop = "^StudyPop_.*\\.rds$",
+        ps = "^Ps_.*\\.rds$",
+        stratpop = "^StratPop_.*\\.rds$",
+        balance = "^Balance_.*\\.rds$",
+        outcome_model = "^Analysis_[0-9]+/om.*\\.rds$"
+      )
+
+      pattern <- patterns[[tolower(artifactType)]]
+      if (is.null(pattern)) {
+        stop("Unknown artifact type: ", artifactType)
+      }
+
+      files <- list.files(outputFolder, pattern = pattern, full.names = TRUE)
+      return(files)
+    }
+  )
+)
diff --git a/man/InvalidationPolicy.Rd b/man/InvalidationPolicy.Rd
new file mode 100644
index 0000000..45834eb
--- /dev/null
+++ b/man/InvalidationPolicy.Rd
@@ -0,0 +1,94 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/InvalidationPolicy.R
+\name{InvalidationPolicy}
+\alias{InvalidationPolicy}
+\title{Invalidation Policy}
+\description{
+Determines which artifacts should be deleted when analysis settings change,
+based on dependency relationships between components.
+}
+\details{
+When settings change, not all artifacts need to be regenerated. This class
+implements the logic to determine minimal invalidation scope based on which
+settings components have changed.
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-InvalidationPolicy-computeInvalidationScope}{\code{InvalidationPolicy$computeInvalidationScope()}}
+\item \href{#method-InvalidationPolicy-getInvalidationMessage}{\code{InvalidationPolicy$getInvalidationMessage()}}
+\item \href{#method-InvalidationPolicy-clone}{\code{InvalidationPolicy$clone()}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-InvalidationPolicy-computeInvalidationScope"></a>}}
+\if{latex}{\out{\hypertarget{method-InvalidationPolicy-computeInvalidationScope}{}}}
+\subsection{Method \code{computeInvalidationScope()}}{
+Compute which file patterns should be deleted based on changed components
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$computeInvalidationScope(changedComponents)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{changedComponents}}{A list (as returned by SettingsHasher$compareSettingsComponents)
+with logical values for each component}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Details}{
+Deletion cascade logic:
+\itemize{
+\item If loadArgsChanged: all downstream artifacts (everything)
+\item Else if studyPopArgsChanged: populations, PS models, strata, outcomes
+\item Else if psArgsChanged: PS models, strata, outcomes
+\item Else if strataArgsChanged: strata and outcomes
+\item Else if outcomeModelArgsChanged: outcome models only
+\item Else if only balanceArgsChanged: balance files only
+\item Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific)
+}
+}
+
+\subsection{Returns}{
+Character vector of file patterns/globs to delete. Empty vector if no deletion needed.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-InvalidationPolicy-getInvalidationMessage"></a>}}
+\if{latex}{\out{\hypertarget{method-InvalidationPolicy-getInvalidationMessage}{}}}
+\subsection{Method \code{getInvalidationMessage()}}{
+Get a human-readable message describing what will be deleted
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$getInvalidationMessage(changedComponents)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{changedComponents}}{A list of changed components}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string suitable for displaying to the user
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-InvalidationPolicy-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-InvalidationPolicy-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$clone(deep = FALSE)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{</div>}}
+}
+}
+}
diff --git a/man/SettingsHasher.Rd b/man/SettingsHasher.Rd
new file mode 100644
index 0000000..4e1df70
--- /dev/null
+++ b/man/SettingsHasher.Rd
@@ -0,0 +1,238 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/SettingsHasher.R
+\name{SettingsHasher}
+\alias{SettingsHasher}
+\title{Settings Hasher}
+\description{
+Compute deterministic hashes for individual settings components to detect
+which parts of the analysis specification have changed.
+}
+\details{
+This class hashes each settings component independently using JSON serialization
+and MD5 digests. This allows fine-grained detection of which artifact types need
+to be invalidated when specifications change.
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-SettingsHasher-hashLoadArgs}{\code{SettingsHasher$hashLoadArgs()}}
+\item \href{#method-SettingsHasher-hashStudyPopArgs}{\code{SettingsHasher$hashStudyPopArgs()}}
+\item \href{#method-SettingsHasher-hashPsArgs}{\code{SettingsHasher$hashPsArgs()}}
+\item \href{#method-SettingsHasher-hashTrimByPsArgs}{\code{SettingsHasher$hashTrimByPsArgs()}}
+\item \href{#method-SettingsHasher-hashMatchOnPsArgs}{\code{SettingsHasher$hashMatchOnPsArgs()}}
+\item \href{#method-SettingsHasher-hashStratifyByPsArgs}{\code{SettingsHasher$hashStratifyByPsArgs()}}
+\item \href{#method-SettingsHasher-hashOutcomeModelArgs}{\code{SettingsHasher$hashOutcomeModelArgs()}}
+\item \href{#method-SettingsHasher-hashBalanceArgs}{\code{SettingsHasher$hashBalanceArgs()}}
+\item \href{#method-SettingsHasher-compareSettingsComponents}{\code{SettingsHasher$compareSettingsComponents()}}
+\item \href{#method-SettingsHasher-clone}{\code{SettingsHasher$clone()}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashLoadArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashLoadArgs}{}}}
+\subsection{Method \code{hashLoadArgs()}}{
+Hash the data loading arguments
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashLoadArgs(getDbCohortMethodDataArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{getDbCohortMethodDataArgs}}{An object of type \code{GetDbCohortMethodDataArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashStudyPopArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStudyPopArgs}{}}}
+\subsection{Method \code{hashStudyPopArgs()}}{
+Hash the study population arguments
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashStudyPopArgs(createStudyPopulationArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{createStudyPopulationArgs}}{An object of type \code{CreateStudyPopulationArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashPsArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashPsArgs}{}}}
+\subsection{Method \code{hashPsArgs()}}{
+Hash the propensity score arguments
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashPsArgs(createPsArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{createPsArgs}}{An object of type \code{CreatePsArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashTrimByPsArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashTrimByPsArgs}{}}}
+\subsection{Method \code{hashTrimByPsArgs()}}{
+Hash arguments for propensity score trimming
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashTrimByPsArgs(trimByPsArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{trimByPsArgs}}{An object of type \code{TrimByPsArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashMatchOnPsArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashMatchOnPsArgs}{}}}
+\subsection{Method \code{hashMatchOnPsArgs()}}{
+Hash arguments for propensity score matching
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashMatchOnPsArgs(matchOnPsArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{matchOnPsArgs}}{An object of type \code{MatchOnPsArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashStratifyByPsArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStratifyByPsArgs}{}}}
+\subsection{Method \code{hashStratifyByPsArgs()}}{
+Hash arguments for propensity score stratification
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashStratifyByPsArgs(stratifyByPsArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{stratifyByPsArgs}}{An object of type \code{StratifyByPsArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashOutcomeModelArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashOutcomeModelArgs}{}}}
+\subsection{Method \code{hashOutcomeModelArgs()}}{
+Hash arguments for outcome model fitting
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashOutcomeModelArgs(fitOutcomeModelArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{fitOutcomeModelArgs}}{An object of type \code{FitOutcomeModelArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-hashBalanceArgs"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-hashBalanceArgs}{}}}
+\subsection{Method \code{hashBalanceArgs()}}{
+Hash arguments for covariate balance computation
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashBalanceArgs(computeCovariateBalanceArgs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{computeCovariateBalanceArgs}}{An object of type \code{ComputeCovariateBalanceArgs}}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character string containing the MD5 hash (32 hex characters)
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-compareSettingsComponents"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-compareSettingsComponents}{}}}
+\subsection{Method \code{compareSettingsComponents()}}{
+Compare old and new specifications to identify which components changed
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$compareSettingsComponents(oldSpecs, newSpecs)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{oldSpecs}}{An object of type \code{CmAnalysesSpecifications} (old version)}
+
+\item{\code{newSpecs}}{An object of type \code{CmAnalysesSpecifications} (new version)}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+A list of logical values indicating which components changed:
+\itemize{
+\item loadArgsChanged: whether data loading arguments differ
+\item studyPopArgsChanged: whether study population arguments differ
+\item psArgsChanged: whether propensity score arguments differ
+\item strataArgsChanged: whether stratification arguments (trim/match/stratify) differ
+\item outcomeModelArgsChanged: whether outcome model arguments differ
+\item balanceArgsChanged: whether balance computation arguments differ
+\item analyticsChanged: whether outcome of interest or analysis IDs changed
+}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-SettingsHasher-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-SettingsHasher-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$clone(deep = FALSE)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{</div>}}
+}
+}
+}
diff --git a/man/ValidateArtifact.Rd b/man/ValidateArtifact.Rd
new file mode 100644
index 0000000..1d1c516
--- /dev/null
+++ b/man/ValidateArtifact.Rd
@@ -0,0 +1,144 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ValidateArtifact.R
+\name{ValidateArtifact}
+\alias{ValidateArtifact}
+\title{Validate Artifact}
+\description{
+Utility functions to validate that cached artifacts match expected parameter hashes.
+}
+\details{
+These functions help determine whether an existing artifact can be reused by
+validating that its parameters match the hash encoded in the filename or metadata.
+}
+\keyword{internal}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-ValidateArtifact-extractHashFromFilename}{\code{ValidateArtifact$extractHashFromFilename()}}
+\item \href{#method-ValidateArtifact-validateArtifactFile}{\code{ValidateArtifact$validateArtifactFile()}}
+\item \href{#method-ValidateArtifact-validateCohortMethodData}{\code{ValidateArtifact$validateCohortMethodData()}}
+\item \href{#method-ValidateArtifact-listArtifactsByType}{\code{ValidateArtifact$listArtifactsByType()}}
+\item \href{#method-ValidateArtifact-clone}{\code{ValidateArtifact$clone()}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ValidateArtifact-extractHashFromFilename"></a>}}
+\if{latex}{\out{\hypertarget{method-ValidateArtifact-extractHashFromFilename}{}}}
+\subsection{Method \code{extractHashFromFilename()}}{
+Extract hash from a filename
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$extractHashFromFilename(filename)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{filename}}{Character string with the artifact filename}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Details}{
+Filenames typically have format like:
+\itemize{
+\item CmData_l{loadId}_{tcn}.zip
+\item StudyPop_l{loadId}\emph{s{studyPopId}}{tcn}_o{outcomeId}.rds
+}
+
+This function extracts numeric IDs which are used as hashes.
+}
+
+\subsection{Returns}{
+Character string containing the extracted hash, or NA if not found
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ValidateArtifact-validateArtifactFile"></a>}}
+\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateArtifactFile}{}}}
+\subsection{Method \code{validateArtifactFile()}}{
+Validate that an artifact file exists and matches expected settings
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$validateArtifactFile(filepath, settingsHash = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{filepath}}{Character string with the full path to the artifact file}
+
+\item{\code{settingsHash}}{Character string with the expected settings hash}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Details}{
+This function performs basic validation. In the current implementation,
+we rely on filename encoding to ensure the hash is embedded in the filename.
+If the file exists and has the right naming pattern, we consider it valid.
+}
+
+\subsection{Returns}{
+Logical TRUE if file exists and hash matches, FALSE otherwise
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ValidateArtifact-validateCohortMethodData"></a>}}
+\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateCohortMethodData}{}}}
+\subsection{Method \code{validateCohortMethodData()}}{
+Check if a cohort method data file is valid and reusable
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$validateCohortMethodData(filepath, expectedLoadId)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{filepath}}{Character string with the path to the CmData zip file}
+
+\item{\code{expectedLoadId}}{Numeric ID expected for this data loading configuration}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Logical TRUE if file exists and corresponds to the expected load ID
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ValidateArtifact-listArtifactsByType"></a>}}
+\if{latex}{\out{\hypertarget{method-ValidateArtifact-listArtifactsByType}{}}}
+\subsection{Method \code{listArtifactsByType()}}{
+List all analysis artifacts in a directory by type
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$listArtifactsByType(outputFolder, artifactType)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{outputFolder}}{Character string with the output folder path}
+
+\item{\code{artifactType}}{Character string indicating artifact type:
+"cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character vector of full paths to matching artifact files
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ValidateArtifact-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-ValidateArtifact-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$clone(deep = FALSE)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{</div>}}
+}
+}
+}
diff --git a/tests/testthat/test-invalidationPolicy.R b/tests/testthat/test-invalidationPolicy.R
new file mode 100644
index 0000000..3674508
--- /dev/null
+++ b/tests/testthat/test-invalidationPolicy.R
@@ -0,0 +1,288 @@
+context("Artifact caching system")
+
+# Helper to create covariate settings
+getDefaultCovariateSettings <- function() {
+  return(FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE,
+    useDemographicsAge = TRUE
+  ))
+}
+
+test_that("SettingsHasher hashes settings consistently", {
+  # Create two identical settings objects
+  args1 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = getDefaultCovariateSettings()
+  )
+  args2 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = getDefaultCovariateSettings()
+  )
+
+  hasher <- SettingsHasher$new()
+
+  # Hash the same settings twice
+  hash1 <- hasher$hashLoadArgs(args1)
+  hash2 <- hasher$hashLoadArgs(args2)
+
+  # Should produce identical hashes
+  expect_equal(hash1, hash2)
+  expect_true(is.character(hash1))
+  expect_equal(nchar(hash1), 32)  # MD5 is 32 hex characters
+})
+
+test_that("SettingsHasher detects changes in load args", {
+  hasher <- SettingsHasher$new()
+
+  args1 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = getDefaultCovariateSettings()
+  )
+  args2 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = getDefaultCovariateSettings(),
+    maxCohortSize = 50000  # Different
+  )
+
+  hash1 <- hasher$hashLoadArgs(args1)
+  hash2 <- hasher$hashLoadArgs(args2)
+
+  expect_true(hash1 != hash2)
+})
+
+test_that("SettingsHasher detects changes in study population args", {
+  hasher <- SettingsHasher$new()
+
+  args1 <- createCreateStudyPopulationArgs()
+  args2 <- createCreateStudyPopulationArgs(minDaysAtRisk = 30)  # Different
+
+  hash1 <- hasher$hashStudyPopArgs(args1)
+  hash2 <- hasher$hashStudyPopArgs(args2)
+
+  expect_true(hash1 != hash2)
+})
+
+test_that("SettingsHasher handles NULL arguments", {
+  hasher <- SettingsHasher$new()
+
+  hash <- hasher$hashLoadArgs(NULL)
+  expect_equal(hash, "")
+
+  hash <- hasher$hashPsArgs(NULL)
+  expect_equal(hash, "")
+})
+
+test_that("SettingsHasher compareSettingsComponents works with identical specs", {
+  hasher <- SettingsHasher$new()
+
+  # Create identical specifications
+  analysis <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = getDefaultCovariateSettings()
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs()
+  )
+
+  specs1 <- createCmAnalysesSpecifications(
+    cmAnalysisList = list(analysis),
+    targetComparatorOutcomesList = list(
+      createTargetComparatorOutcomes(
+        targetId = 1,
+        comparatorId = 2,
+        outcomes = list(createOutcome(outcomeId = 3))
+      )
+    )
+  )
+
+  specs2 <- createCmAnalysesSpecifications(
+    cmAnalysisList = list(analysis),
+    targetComparatorOutcomesList = list(
+      createTargetComparatorOutcomes(
+        targetId = 1,
+        comparatorId = 2,
+        outcomes = list(createOutcome(outcomeId = 3))
+      )
+    )
+  )
+
+  changes <- hasher$compareSettingsComponents(specs1, specs2)
+
+  expect_false(any(as.logical(changes)))
+})
+
+test_that("SettingsHasher compareSettingsComponents detects load args change", {
+  hasher <- SettingsHasher$new()
+
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = getDefaultCovariateSettings()
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs()
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = getDefaultCovariateSettings(),
+      maxCohortSize = 50000
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs()
+  )
+
+  specs1 <- createCmAnalysesSpecifications(
+    cmAnalysisList = list(analysis1),
+    targetComparatorOutcomesList = list(
+      createTargetComparatorOutcomes(
+        targetId = 1,
+        comparatorId = 2,
+        outcomes = list(createOutcome(outcomeId = 3))
+      )
+    )
+  )
+
+  specs2 <- createCmAnalysesSpecifications(
+    cmAnalysisList = list(analysis2),
+    targetComparatorOutcomesList = list(
+      createTargetComparatorOutcomes(
+        targetId = 1,
+        comparatorId = 2,
+        outcomes = list(createOutcome(outcomeId = 3))
+      )
+    )
+  )
+
+  changes <- hasher$compareSettingsComponents(specs1, specs2)
+
+  expect_true(changes$loadArgsChanged)
+})
+
+test_that("InvalidationPolicy computes correct deletion scope for load args change", {
+  policy <- InvalidationPolicy$new()
+
+  changedComponents <- list(
+    loadArgsChanged = TRUE,
+    studyPopArgsChanged = FALSE,
+    psArgsChanged = FALSE,
+    strataArgsChanged = FALSE,
+    outcomeModelArgsChanged = FALSE,
+    balanceArgsChanged = FALSE,
+    analyticsChanged = FALSE
+  )
+
+  patterns <- policy$computeInvalidationScope(changedComponents)
+
+  # Should include all major artifact patterns
+  expect_true(any(grepl("CmData", patterns)))
+  expect_true(any(grepl("StudyPop", patterns)))
+  expect_true(any(grepl("Ps", patterns)))
+  expect_true(any(grepl("Analysis", patterns)))
+})
+
+test_that("InvalidationPolicy computes correct deletion scope for study pop args change", {
+  policy <- InvalidationPolicy$new()
+
+  changedComponents <- list(
+    loadArgsChanged = FALSE,
+    studyPopArgsChanged = TRUE,
+    psArgsChanged = FALSE,
+    strataArgsChanged = FALSE,
+    outcomeModelArgsChanged = FALSE,
+    balanceArgsChanged = FALSE,
+    analyticsChanged = FALSE
+  )
+
+  patterns <- policy$computeInvalidationScope(changedComponents)
+
+  # Should include study pop and downstream
+  expect_true(any(grepl("StudyPop", patterns)))
+  expect_true(any(grepl("Ps", patterns)))
+  expect_true(any(grepl("Analysis", patterns)))
+
+  # But not CmData
+  expect_false(any(grepl("CmData_", patterns)))
+})
+
+test_that("InvalidationPolicy computes empty scope for outcome addition only", {
+  policy <- InvalidationPolicy$new()
+
+  changedComponents <- list(
+    loadArgsChanged = FALSE,
+    studyPopArgsChanged = FALSE,
+    psArgsChanged = FALSE,
+    strataArgsChanged = FALSE,
+    outcomeModelArgsChanged = FALSE,
+    balanceArgsChanged = FALSE,
+    analyticsChanged = TRUE  # But only because new outcomes added
+  )
+
+  patterns <- policy$computeInvalidationScope(changedComponents)
+
+  # Should be empty - new outcomes don't require deletion
+  expect_equal(length(patterns), 0)
+})
+
+test_that("InvalidationPolicy provides meaningful messages", {
+  policy <- InvalidationPolicy$new()
+
+  changedComponents <- list(
+    loadArgsChanged = TRUE,
+    studyPopArgsChanged = FALSE,
+    psArgsChanged = FALSE,
+    strataArgsChanged = FALSE,
+    outcomeModelArgsChanged = FALSE,
+    balanceArgsChanged = FALSE,
+    analyticsChanged = FALSE
+  )
+
+  message <- policy$getInvalidationMessage(changedComponents)
+  expect_true(grepl("Data loading", message, ignore.case = TRUE))
+  expect_true(is.character(message))
+})
+
+test_that("ValidateArtifact can list artifacts by type", {
+  outputFolder <- tempfile(pattern = "cmArtifacts")
+  dir.create(outputFolder)
+
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  # Create some test files
+  file.create(file.path(outputFolder, "CmData_l1_t1_c2.zip"))
+  file.create(file.path(outputFolder, "StudyPop_l1_s1_t1_c2_o3.rds"))
+  file.create(file.path(outputFolder, "Ps_l1_p1_t1_c2.rds"))
+
+  validator <- ValidateArtifact$new()
+
+  cmdata <- validator$listArtifactsByType(outputFolder, "cmdata")
+  expect_equal(length(cmdata), 1)
+  expect_true(grepl("CmData", cmdata))
+
+  studypop <- validator$listArtifactsByType(outputFolder, "studypop")
+  expect_equal(length(studypop), 1)
+  expect_true(grepl("StudyPop", studypop))
+
+  ps <- validator$listArtifactsByType(outputFolder, "ps")
+  expect_equal(length(ps), 1)
+  expect_true(grepl("Ps", ps))
+})
+
+test_that("ValidateArtifact can validate artifact file existence", {
+  outputFolder <- tempfile(pattern = "cmArtifacts")
+  dir.create(outputFolder)
+
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  validator <- ValidateArtifact$new()
+
+  # Non-existent file
+  result <- validator$validateArtifactFile(file.path(outputFolder, "nonexistent.zip"))
+  expect_false(result)
+
+  # Create a file
+  testFile <- file.path(outputFolder, "test.zip")
+  file.create(testFile)
+
+  # Existing file
+  result <- validator$validateArtifactFile(testFile)
+  expect_true(result)
+})

From a8ade8fadaedd0c5944d9409ddfcfafe426dca0c Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Thu, 30 Apr 2026 07:22:25 -0700
Subject: [PATCH 02/10] Simpliofied approach to be more similar to SCCS and
 include database ids within checksums

---
 NAMESPACE                                |   2 +
 NEWS.md                                  |  12 +
 R/ArtifactStore.R                        | 174 ++++++++
 R/InvalidationPolicy.R                   | 142 -------
 R/RunAnalyses.R                          | 515 +++++++++--------------
 R/SettingsHasher.R                       | 285 -------------
 R/ValidateArtifact.R                     | 150 -------
 man/InvalidationPolicy.Rd                |  94 -----
 man/SettingsHasher.Rd                    | 238 -----------
 man/ValidateArtifact.Rd                  | 144 -------
 tests/testthat/test-artifactStore.R      | 155 +++++++
 tests/testthat/test-eunomia.R            |  18 +-
 tests/testthat/test-invalidationPolicy.R | 288 -------------
 13 files changed, 565 insertions(+), 1652 deletions(-)
 create mode 100644 R/ArtifactStore.R
 delete mode 100644 R/InvalidationPolicy.R
 delete mode 100644 R/SettingsHasher.R
 delete mode 100644 R/ValidateArtifact.R
 delete mode 100644 man/InvalidationPolicy.Rd
 delete mode 100644 man/SettingsHasher.Rd
 delete mode 100644 man/ValidateArtifact.Rd
 create mode 100644 tests/testthat/test-artifactStore.R
 delete mode 100644 tests/testthat/test-invalidationPolicy.R

diff --git a/NAMESPACE b/NAMESPACE
index 09da7dc..6056ad0 100644
--- a/NAMESPACE
+++ b/NAMESPACE
@@ -4,6 +4,8 @@ S3method(coef,OutcomeModel)
 S3method(confint,OutcomeModel)
 S3method(print,OutcomeModel)
 S3method(print,summary.CohortMethodData)
+export(ArtifactStore)
+export(LocalArtifactStore)
 export(adjustedKm)
 export(checkCmInstallation)
 export(computeCovariateBalance)
diff --git a/NEWS.md b/NEWS.md
index ef86558..62ed9a6 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,15 @@
+CohortMethod 6.0.3 (Development)
+================================
+
+Features:
+
+- Added content-addressable artifact caching system that preserves intermediate computation results when adding new outcomes or changing non-critical settings. Filenames are derived from SHA-256 hashes of all parameters that determine their content (including `databaseId`), so changing upstream settings naturally produces new filenames and unchanged settings reuse existing files.
+
+- New `databaseId` parameter (required) in `runCmAnalyses()` prevents accidental reuse of cached results from a different database.
+
+- Pluggable `ArtifactStore` interface with `LocalArtifactStore` default implementation, enabling future custom storage backends (S3, HTTP, etc.).
+
+
 CohortMethod 6.0.2
 ==================
 
diff --git a/R/ArtifactStore.R b/R/ArtifactStore.R
new file mode 100644
index 0000000..542e499
--- /dev/null
+++ b/R/ArtifactStore.R
@@ -0,0 +1,174 @@
+# Copyright 2026 Observational Health Data Sciences and Informatics
+#
+# This file is part of CohortMethod
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+#' @title Abstract Artifact Store
+#'
+#' @description
+#' Abstract R6 interface for storing and retrieving analysis artifacts.
+#' Subclass this to implement custom storage backends (e.g., S3, database).
+#'
+#' @export
+ArtifactStore <- R6::R6Class(
+  "ArtifactStore",
+  public = list(
+    #' @description Check if an artifact exists.
+    #' @param key Character. The artifact key (relative path).
+    #' @return Logical.
+    exists = function(key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Read an RDS artifact.
+    #' @param key Character. The artifact key.
+    #' @return The deserialized R object.
+    readRDS = function(key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Save an RDS artifact.
+    #' @param object The R object to save.
+    #' @param key Character. The artifact key.
+    saveRDS = function(object, key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Read an Andromeda (zip) artifact.
+    #' @param key Character. The artifact key.
+    #' @return An Andromeda object.
+    readAndromeda = function(key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Save an Andromeda (zip) artifact.
+    #' @param object The Andromeda object to save.
+    #' @param key Character. The artifact key.
+    saveAndromeda = function(object, key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description List artifacts matching a prefix.
+    #' @param prefix Character or NULL. Filter to keys starting with this prefix.
+    #' @return Character vector of keys.
+    listArtifacts = function(prefix = NULL) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Delete an artifact.
+    #' @param key Character. The artifact key.
+    delete = function(key) {
+      stop("Abstract method: must be implemented by subclass")
+    },
+
+    #' @description Ensure a directory exists for a given key.
+    #' @param key Character. The artifact key.
+    ensureDir = function(key) {
+      stop("Abstract method: must be implemented by subclass")
+    }
+  )
+)
+
+#' @title Local Filesystem Artifact Store
+#'
+#' @description
+#' Implementation of [ArtifactStore] that reads and writes artifacts to a local
+#' filesystem directory. This is the default storage backend.
+#'
+#' @export
+LocalArtifactStore <- R6::R6Class(
+  "LocalArtifactStore",
+  inherit = ArtifactStore,
+  public = list(
+    #' @description Create a new local artifact store.
+    #' @param basePath Character. The root directory for artifact storage.
+    initialize = function(basePath) {
+      private$basePath <- basePath
+      if (!dir.exists(basePath)) {
+        dir.create(basePath, recursive = TRUE)
+      }
+    },
+
+    #' @description Check if an artifact exists.
+    #' @param key Character. The artifact key (relative path).
+    #' @return Logical.
+    exists = function(key) {
+      file.exists(file.path(private$basePath, key))
+    },
+
+    #' @description Read an RDS artifact.
+    #' @param key Character. The artifact key.
+    #' @return The deserialized R object.
+    readRDS = function(key) {
+      base::readRDS(file.path(private$basePath, key))
+    },
+
+    #' @description Save an RDS artifact.
+    #' @param object The R object to save.
+    #' @param key Character. The artifact key.
+    saveRDS = function(object, key) {
+      base::saveRDS(object, file.path(private$basePath, key))
+    },
+
+    #' @description Read an Andromeda (zip) artifact.
+    #' @param key Character. The artifact key.
+    #' @return An Andromeda object.
+    readAndromeda = function(key) {
+      loadCohortMethodData(file.path(private$basePath, key))
+    },
+
+    #' @description Save an Andromeda (zip) artifact.
+    #' @param object The Andromeda object to save.
+    #' @param key Character. The artifact key.
+    saveAndromeda = function(object, key) {
+      saveCohortMethodData(object, file.path(private$basePath, key))
+    },
+
+    #' @description List artifacts matching a prefix.
+    #' @param prefix Character or NULL. Filter to keys starting with this prefix.
+    #' @return Character vector of keys.
+    listArtifacts = function(prefix = NULL) {
+      files <- list.files(private$basePath, recursive = TRUE)
+      if (!is.null(prefix)) {
+        files <- files[startsWith(files, prefix)]
+      }
+      files
+    },
+
+    #' @description Delete an artifact.
+    #' @param key Character. The artifact key.
+    delete = function(key) {
+      unlink(file.path(private$basePath, key))
+    },
+
+    #' @description Ensure a directory exists for a given key.
+    #' @param key Character. The artifact key.
+    ensureDir = function(key) {
+      dir <- dirname(file.path(private$basePath, key))
+      if (!dir.exists(dir)) {
+        dir.create(dir, recursive = TRUE)
+      }
+    },
+
+    #' @description Get the full filesystem path for a key.
+    #' @param key Character. The artifact key.
+    #' @return Character. The full path.
+    getFullPath = function(key) {
+      file.path(private$basePath, key)
+    }
+  ),
+  private = list(
+    basePath = NULL
+  )
+)
diff --git a/R/InvalidationPolicy.R b/R/InvalidationPolicy.R
deleted file mode 100644
index 9934bea..0000000
--- a/R/InvalidationPolicy.R
+++ /dev/null
@@ -1,142 +0,0 @@
-# Copyright 2026 Observational Health Data Sciences and Informatics
-#
-# This file is part of CohortMethod
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#' Invalidation Policy
-#'
-#' @description
-#' Determines which artifacts should be deleted when analysis settings change,
-#' based on dependency relationships between components.
-#'
-#' @details
-#' When settings change, not all artifacts need to be regenerated. This class
-#' implements the logic to determine minimal invalidation scope based on which
-#' settings components have changed.
-#'
-#' @export
-InvalidationPolicy <- R6::R6Class(
-  "InvalidationPolicy",
-  public = list(
-    #' @description
-    #' Compute which file patterns should be deleted based on changed components
-    #'
-    #' @param changedComponents A list (as returned by SettingsHasher$compareSettingsComponents)
-    #'   with logical values for each component
-    #'
-    #' @return
-    #' Character vector of file patterns/globs to delete. Empty vector if no deletion needed.
-    #'
-    #' @details
-    #' Deletion cascade logic:
-    #' - If loadArgsChanged: all downstream artifacts (everything)
-    #' - Else if studyPopArgsChanged: populations, PS models, strata, outcomes
-    #' - Else if psArgsChanged: PS models, strata, outcomes
-    #' - Else if strataArgsChanged: strata and outcomes
-    #' - Else if outcomeModelArgsChanged: outcome models only
-    #' - Else if only balanceArgsChanged: balance files only
-    #' - Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific)
-    computeInvalidationScope = function(changedComponents) {
-      checkmate::assertList(changedComponents)
-
-      filePatternsToDelete <- c()
-
-      # Cascade of invalidation based on dependency graph
-      if (isTRUE(changedComponents$loadArgsChanged)) {
-        # If data loading changed, everything depends on it, so delete all analysis artifacts
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "CmData_.*\\.zip$",      # All CohortMethodData
-          "StudyPop_.*\\.rds$",    # All study populations
-          "Ps_.*\\.rds$",          # All propensity scores
-          "StratPop_.*\\.rds$",    # All stratified populations
-          "Balance_.*\\.rds$",     # All balance files
-          "Analysis_[0-9]+$"       # All analysis folders (contains outcome models)
-        )
-      } else if (isTRUE(changedComponents$studyPopArgsChanged)) {
-        # Study population arguments changed: recompute populations and everything downstream
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "StudyPop_.*\\.rds$",    # All study populations
-          "Ps_.*\\.rds$",          # All PS (may depend on population)
-          "StratPop_.*\\.rds$",    # All stratified populations
-          "Balance_.*\\.rds$",     # All balance files
-          "Analysis_[0-9]+$"       # All outcome models
-        )
-      } else if (isTRUE(changedComponents$psArgsChanged)) {
-        # Propensity score arguments changed: recompute PS and downstream
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "Ps_.*\\.rds$",          # All PS models
-          "StratPop_.*\\.rds$",    # All stratified populations (depend on PS)
-          "Balance_.*\\.rds$",     # All balance files
-          "Analysis_[0-9]+$"       # All outcome models
-        )
-      } else if (isTRUE(changedComponents$strataArgsChanged)) {
-        # Stratification (trim/match/stratify) changed: recompute strata and downstream
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "StratPop_.*\\.rds$",    # All stratified populations
-          "Balance_.*\\.rds$",     # All balance files
-          "Analysis_[0-9]+$"       # All outcome models
-        )
-      } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) {
-        # Outcome model arguments changed: recompute outcome models only
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "Analysis_[0-9]+$"       # All outcome model folders
-        )
-      } else if (isTRUE(changedComponents$balanceArgsChanged)) {
-        # Balance computation arguments changed: delete balance files only
-        filePatternsToDelete <- c(
-          filePatternsToDelete,
-          "Balance_.*\\.rds$"      # All balance files
-        )
-      }
-      # If analyticsChanged but nothing else changed (e.g., only outcomes added):
-      # No deletion needed - outcome models are outcome-specific, new ones will be computed
-
-      return(filePatternsToDelete)
-    },
-
-    #' @description
-    #' Get a human-readable message describing what will be deleted
-    #'
-    #' @param changedComponents A list of changed components
-    #'
-    #' @return
-    #' Character string suitable for displaying to the user
-    getInvalidationMessage = function(changedComponents) {
-      checkmate::assertList(changedComponents)
-
-      if (isTRUE(changedComponents$loadArgsChanged)) {
-        return("Data loading arguments have changed. All analysis artifacts must be regenerated.")
-      } else if (isTRUE(changedComponents$studyPopArgsChanged)) {
-        return("Study population settings have changed. Study populations, propensity scores, and outcome models must be regenerated.")
-      } else if (isTRUE(changedComponents$psArgsChanged)) {
-        return("Propensity score settings have changed. Propensity scores and outcome models must be regenerated.")
-      } else if (isTRUE(changedComponents$strataArgsChanged)) {
-        return("Stratification settings (trimming/matching/stratifying) have changed. Stratified populations and outcome models must be regenerated.")
-      } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) {
-        return("Outcome model settings have changed. Outcome models must be regenerated.")
-      } else if (isTRUE(changedComponents$balanceArgsChanged)) {
-        return("Covariate balance settings have changed. Balance files must be recomputed.")
-      } else if (isTRUE(changedComponents$analyticsChanged)) {
-        return("Analyses have changed (new outcomes or studies added). New analyses will be computed.")
-      } else {
-        return("No artifacts need to be deleted.")
-      }
-    }
-  )
-)
diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
index 2f04330..42507af 100644
--- a/R/RunAnalyses.R
+++ b/R/RunAnalyses.R
@@ -171,6 +171,12 @@ createDefaultMultiThreadingSettings <- function(maxCores) {
 #'                                       [createDefaultMultiThreadingSettings()] functions.
 #' @param cmAnalysesSpecifications       An object of type `CmAnalysesSpecifications` as created using
 #'                                       the `createCmAnalysesSpecifications()`.
+#' @param databaseId                     A unique identifier for the database being used. This is
+#'                                       baked into artifact hashes to prevent accidental reuse of
+#'                                       cached results from a different database. Required.
+#' @param artifactStore                  An object inheriting from [ArtifactStore] used to read and
+#'                                       write cached artifacts. If NULL (default), a
+#'                                       [LocalArtifactStore] backed by `outputFolder` is used.
 #'
 #' @return
 #' A tibble describing for each target-comparator-outcome-analysisId combination where the intermediary and
@@ -188,7 +194,9 @@ runCmAnalyses <- function(connectionDetails,
                           nestingCohortTable = "cohort",
                           outputFolder = "./CohortMethodOutput",
                           multiThreadingSettings = createMultiThreadingSettings(),
-                          cmAnalysesSpecifications) {
+                          cmAnalysesSpecifications,
+                          databaseId,
+                          artifactStore = NULL) {
   errorMessages <- checkmate::makeAssertCollection()
   checkmate::assertClass(connectionDetails, "ConnectionDetails", add = errorMessages)
   checkmate::assertCharacter(cdmDatabaseSchema, len = 1, add = errorMessages)
@@ -202,91 +210,47 @@ runCmAnalyses <- function(connectionDetails,
   checkmate::assertCharacter(outputFolder, len = 1, add = errorMessages)
   checkmate::assertClass(multiThreadingSettings, "CmMultiThreadingSettings", add = errorMessages)
   checkmate::assertR6(cmAnalysesSpecifications, "CmAnalysesSpecifications", add = errorMessages)
+  checkmate::assertCharacter(databaseId, len = 1, min.chars = 1, add = errorMessages)
   checkmate::reportAssertions(collection = errorMessages)
 
   outputFolder <- normalizePath(outputFolder, mustWork = FALSE)
-  cmAnalysesSpecificationsFile <- file.path(outputFolder, "cmAnalysesSpecifications.rds")
-  if (file.exists(cmAnalysesSpecificationsFile)) {
-    oldCmAnalysesSpecifications <- readRDS(cmAnalysesSpecificationsFile)
-    if (!isTRUE(all.equal(oldCmAnalysesSpecifications$toList(), cmAnalysesSpecifications$toList()))) {
-      # Use intelligent invalidation policy to determine what needs to be deleted
-      hasher <- SettingsHasher$new()
-      changedComponents <- hasher$compareSettingsComponents(
-        oldCmAnalysesSpecifications,
-        cmAnalysesSpecifications
-      )
-
-      if (any(as.logical(changedComponents))) {
-        # Determine minimal set of files to delete based on what changed
-        policy <- InvalidationPolicy$new()
-        filePatternsToDelete <- policy$computeInvalidationScope(changedComponents)
-
-        if (length(filePatternsToDelete) > 0) {
-          # Get human-readable message about what will be deleted
-          message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder))
-          message(policy$getInvalidationMessage(changedComponents))
-
-          # Show some affected files to user
-          affectedFiles <- c()
-          for (pattern in filePatternsToDelete) {
-            matchedFiles <- list.files(outputFolder, pattern = pattern, full.names = FALSE)
-            affectedFiles <- c(affectedFiles, matchedFiles)
-          }
+  if (!dir.exists(outputFolder)) {
+    dir.create(outputFolder, recursive = TRUE)
+  }
 
-          if (length(affectedFiles) > 0) {
-            message(sprintf(
-              "Files to be deleted: %d file(s) matching %d pattern(s)",
-              length(affectedFiles),
-              length(filePatternsToDelete)
-            ))
-            if (length(affectedFiles) <= 10) {
-              for (f in affectedFiles) {
-                message(sprintf("  - %s", f))
-              }
-            } else {
-              for (f in head(affectedFiles, 5)) {
-                message(sprintf("  - %s", f))
-              }
-              message(sprintf("  ... and %d more files", length(affectedFiles) - 5))
-            }
-          }
+  # Initialize artifact store
+  if (is.null(artifactStore)) {
+    artifactStore <- LocalArtifactStore$new(outputFolder)
+  }
 
-          response <- utils::askYesNo("Do you want to delete these artifacts before proceeding?")
-          if (is.na(response)) {
-            # Cancel:
-            return()
-          } else if (response == TRUE) {
-            # Delete files matching patterns
-            for (pattern in filePatternsToDelete) {
-              files <- list.files(outputFolder, pattern = pattern, full.names = TRUE, recursive = TRUE)
-              if (length(files) > 0) {
-                unlink(files)
-              }
-              # Also try to delete directories matching pattern
-              dirs <- list.dirs(outputFolder, full.names = TRUE, recursive = FALSE)
-              matchingDirs <- dirs[grep(pattern, basename(dirs))]
-              if (length(matchingDirs) > 0) {
-                unlink(matchingDirs, recursive = TRUE)
-              }
-            }
-          }
-        }
-        # Clear cache regardless (old cache may reference deleted files)
-        rm(list = ls(envir = cache), envir = cache)
-      }
+  # Check for database ID mismatch against previously cached results
+  databaseIdFile <- "databaseId.rds"
+  if (artifactStore$exists(databaseIdFile)) {
+    previousDatabaseId <- artifactStore$readRDS(databaseIdFile)
+    if (!identical(previousDatabaseId, databaseId)) {
+      stop(sprintf(
+        paste("Database ID mismatch: output folder was previously used with databaseId '%s',",
+              "but now '%s' was provided. To reuse this folder with a different database,",
+              "delete the existing output folder first."),
+        previousDatabaseId,
+        databaseId
+      ))
     }
+  } else {
+    artifactStore$saveRDS(databaseId, databaseIdFile)
   }
-  if (!dir.exists(outputFolder)) {
-    dir.create(outputFolder, recursive = TRUE)
-  }
-  saveRDS(cmAnalysesSpecifications, cmAnalysesSpecificationsFile)
+
+  # Save specifications for reference
+  artifactStore$saveRDS(cmAnalysesSpecifications, "cmAnalysesSpecifications.rds")
+
   referenceTable <- createReferenceTable(
     cmAnalysisList = cmAnalysesSpecifications$cmAnalysisList,
     targetComparatorOutcomesList = cmAnalysesSpecifications$targetComparatorOutcomesList,
     analysesToExclude = cmAnalysesSpecifications$analysesToExclude,
     outputFolder = outputFolder,
     refitPsForEveryOutcome = cmAnalysesSpecifications$refitPsForEveryOutcome,
-    refitPsForEveryStudyPopulation = cmAnalysesSpecifications$refitPsForEveryStudyPopulation
+    refitPsForEveryStudyPopulation = cmAnalysesSpecifications$refitPsForEveryStudyPopulation,
+    databaseId = databaseId
   )
   referenceTable |>
     select(-"includedCovariateConceptIds", "excludedCovariateConceptIds") |>
@@ -1106,7 +1070,8 @@ createReferenceTable <- function(cmAnalysisList,
                                  analysesToExclude,
                                  outputFolder,
                                  refitPsForEveryOutcome,
-                                 refitPsForEveryStudyPopulation) {
+                                 refitPsForEveryStudyPopulation,
+                                 databaseId = "") {
   # Create all rows per target-comparator-outcome-analysis combination:
   convertAnalysisToTable <- function(analysis) {
     tibble(
@@ -1161,11 +1126,20 @@ createReferenceTable <- function(cmAnalysisList,
     loadArgsId = match(loadArgsJsons, uniqueLoadArgsJsons)
   )
   referenceTable <- inner_join(referenceTable, analysisIdToLoadArgsId, by = "analysisId")
+
+  # Content-addressable hash includes databaseId to prevent cross-database cache reuse
+  referenceTable$loadHash <- vapply(seq_len(nrow(referenceTable)), function(i) {
+    .contentHash(
+      databaseId,
+      loadArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
+      referenceTable$targetId[i],
+      referenceTable$comparatorId[i],
+      referenceTable$nestingCohortId[i]
+    )
+  }, character(1))
+
   referenceTable$cohortMethodDataFile <- .createCohortMethodDataFileName(
-    loadId = referenceTable$loadArgsId,
-    targetId = referenceTable$targetId,
-    comparatorId = referenceTable$comparatorId,
-    nestingCohortId = referenceTable$nestingCohortId
+    loadHash = referenceTable$loadHash
   )
 
   # Add studypop filenames
@@ -1177,14 +1151,17 @@ createReferenceTable <- function(cmAnalysisList,
     studyPopArgsId = match(studyPopArgsJsons, uniqueStudyPopArgsJsons)
   )
   referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsId, by = "analysisId")
-  referenceTable$studyPopFile <- .createStudyPopulationFileName(
-    loadId = referenceTable$loadArgsId,
-    studyPopId = referenceTable$studyPopArgsId,
-    targetId = referenceTable$targetId,
-    comparatorId = referenceTable$comparatorId,
-    nestingCohortId = referenceTable$nestingCohortId,
-    outcomeId = referenceTable$outcomeId
-  )
+
+  # Content-addressable hash for study population files
+  referenceTable$studyPopFile <- vapply(seq_len(nrow(referenceTable)), function(i) {
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
+      referenceTable$outcomeId[i]
+    )
+    .createStudyPopulationFileName(hash)
+  }, character(1))
 
   # Add PS filenames
   createPsArgsJsons <- lapply(
@@ -1200,15 +1177,17 @@ createReferenceTable <- function(cmAnalysisList,
   noPsIds <- which(uniqueCreatePsArgsJsons == "")
   idxWithPs <- !(referenceTable$psArgsId %in% noPsIds)
   referenceTable$psFile <- ""
-  referenceTable$psFile[idxWithPs] <- .createPsOutcomeFileName(
-    loadId = referenceTable$loadArgsId[idxWithPs],
-    studyPopId = referenceTable$studyPopArgsId[idxWithPs],
-    psId = referenceTable$psArgsId[idxWithPs],
-    targetId = referenceTable$targetId[idxWithPs],
-    comparatorId = referenceTable$comparatorId[idxWithPs],
-    nestingCohortId = referenceTable$nestingCohortId[idxWithPs],
-    outcomeId = referenceTable$outcomeId[idxWithPs]
-  )
+  referenceTable$psFile[idxWithPs] <- vapply(which(idxWithPs), function(i) {
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
+      createPsArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
+      referenceTable$outcomeId[i]
+    )
+    .createPsFileName(hash)
+  }, character(1))
+
   referenceTable$sharedPsFile <- ""
   if (!refitPsForEveryOutcome) {
     if (refitPsForEveryStudyPopulation) {
@@ -1227,39 +1206,39 @@ createReferenceTable <- function(cmAnalysisList,
           return(TRUE)
         }
       }
-      studyPopArgsEquivalentId <- seq_along(cmAnalysisList)
+      # Compute the canonical (first equivalent) studyPop JSON for hashing
+      studyPopArgsEquivalentIdx <- seq_along(cmAnalysisList)
       for (i in seq_along(cmAnalysisList)) {
         for (j in seq_len(i - 1)) {
           if (equivalent(cmAnalysisList[[i]]$createStudyPopulationArgs,
                          cmAnalysisList[[j]]$createStudyPopulationArgs)) {
-            studyPopArgsEquivalentId[i] <- j
+            studyPopArgsEquivalentIdx[i] <- j
             break
           }
         }
       }
-      analysisIdToStudyPopArgsEquivalentId <- tibble(
-        analysisId = analyses$analysisId,
-        studyPopArgsEquivalentId = studyPopArgsEquivalentId
-      )
-      referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsEquivalentId, by = "analysisId")
-      referenceTable$sharedPsFile[idxWithPs] <- .createPsFileName(
-        loadId = referenceTable$loadArgsId[idxWithPs],
-        studyPopId = referenceTable$studyPopArgsEquivalentId[idxWithPs],
-        psId = referenceTable$psArgsId[idxWithPs],
-        targetId = referenceTable$targetId[idxWithPs],
-        comparatorId = referenceTable$comparatorId[idxWithPs],
-        nestingCohortId = referenceTable$nestingCohortId[idxWithPs]
-      )
+      referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) {
+        aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+        equivIdx <- studyPopArgsEquivalentIdx[aIdx]
+        hash <- .contentHash(
+          databaseId,
+          referenceTable$loadHash[i],
+          studyPopArgsJsons[[equivIdx]],
+          createPsArgsJsons[[aIdx]]
+        )
+        .createPsFileName(hash)
+      }, character(1))
     } else {
       # One propensity model across all study population settings:
-      referenceTable$sharedPsFile[idxWithPs] <- .createPsFileName(
-        loadId = referenceTable$loadArgsId[idxWithPs],
-        studyPopId = NULL,
-        psId = referenceTable$psArgsId[idxWithPs],
-        targetId = referenceTable$targetId[idxWithPs],
-        comparatorId = referenceTable$comparatorId[idxWithPs],
-        nestingCohortId = referenceTable$nestingCohortId[idxWithPs]
-      )
+      referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) {
+        aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+        hash <- .contentHash(
+          databaseId,
+          referenceTable$loadHash[i],
+          createPsArgsJsons[[aIdx]]
+        )
+        .createPsFileName(hash)
+      }, character(1))
     }
   }
 
@@ -1281,16 +1260,18 @@ createReferenceTable <- function(cmAnalysisList,
   noStrataIds <- which(uniqueStrataArgsJsons == "")
   idxWithStrata <- !(referenceTable$strataArgsId %in% noStrataIds)
   referenceTable$strataFile <- ""
-  referenceTable$strataFile[idxWithStrata] <- .createStratifiedPopFileName(
-    loadId = referenceTable$loadArgsId[idxWithStrata],
-    studyPopId = referenceTable$studyPopArgsId[idxWithStrata],
-    psId = referenceTable$psArgsId[idxWithStrata],
-    strataId = referenceTable$strataArgsId[idxWithStrata],
-    targetId = referenceTable$targetId[idxWithStrata],
-    comparatorId = referenceTable$comparatorId[idxWithStrata],
-    nestingCohortId = referenceTable$nestingCohortId[idxWithStrata],
-    outcomeId = referenceTable$outcomeId[idxWithStrata]
-  )
+  referenceTable$strataFile[idxWithStrata] <- vapply(which(idxWithStrata), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      studyPopArgsJsons[[aIdx]],
+      createPsArgsJsons[[aIdx]],
+      strataArgsJsons[[aIdx]],
+      referenceTable$outcomeId[i]
+    )
+    .createStratifiedPopFileName(hash)
+  }, character(1))
 
   # Add shared covariate balance files (per target-comparator-analysis)
   if (refitPsForEveryOutcome) {
@@ -1301,24 +1282,26 @@ createReferenceTable <- function(cmAnalysisList,
       function(x) if (is.null(x$computeSharedCovariateBalanceArgs)) "" else x$computeSharedCovariateBalanceArgs$toJson()
     )
     uniqueSharedBalanceArgsJsons <- unique(sharedBalanceArgsJsons)
+    noSharedBalanceIds <- which(uniqueSharedBalanceArgsJsons == "")
     analysisIdToSharedBalanceArgsId <- tibble(
       analysisId = analyses$analysisId,
       sharedBalanceId = match(sharedBalanceArgsJsons, uniqueSharedBalanceArgsJsons)
     )
     referenceTable <- inner_join(referenceTable, analysisIdToSharedBalanceArgsId, by = "analysisId")
-    noSharedBalanceIds <- which(uniqueSharedBalanceArgsJsons == "")
     idxWithSharedBalance <- !(referenceTable$sharedBalanceId %in% noSharedBalanceIds)
     referenceTable$sharedBalanceFile <- ""
-    referenceTable$sharedBalanceFile[idxWithSharedBalance] <- .createsharedBalanceFileName(
-      loadId = referenceTable$loadArgsId[idxWithSharedBalance],
-      studyPopId = referenceTable$studyPopArgsId[idxWithSharedBalance],
-      psId = referenceTable$psArgsId[idxWithSharedBalance],
-      strataId = referenceTable$strataArgsId[idxWithSharedBalance],
-      sharedBalanceId = referenceTable$sharedBalanceId[idxWithSharedBalance],
-      targetId = referenceTable$targetId[idxWithSharedBalance],
-      comparatorId = referenceTable$comparatorId[idxWithSharedBalance],
-      nestingCohortId = referenceTable$nestingCohortId[idxWithSharedBalance]
-    )
+    referenceTable$sharedBalanceFile[idxWithSharedBalance] <- vapply(which(idxWithSharedBalance), function(i) {
+      aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+      hash <- .contentHash(
+        databaseId,
+        referenceTable$loadHash[i],
+        studyPopArgsJsons[[aIdx]],
+        createPsArgsJsons[[aIdx]],
+        strataArgsJsons[[aIdx]],
+        sharedBalanceArgsJsons[[aIdx]]
+      )
+      .createSharedBalanceFileName(hash)
+    }, character(1))
   }
 
   # Add covariate balance files (per target-comparator-analysis-outcome)
@@ -1342,27 +1325,32 @@ createReferenceTable <- function(cmAnalysisList,
   }
   idxFilter <- referenceTable$balanceId %in% balanceIdsRequiringFiltering
   referenceTable$filteredForbalanceFile <- ""
-  referenceTable$filteredForbalanceFile[idxFilter] <- .createFilterForBalanceFileName(
-    loadId = referenceTable$loadArgsId[idxFilter],
-    balanceId = referenceTable$balanceId[idxFilter],
-    targetId = referenceTable$targetId[idxFilter],
-    comparatorId = referenceTable$comparatorId[idxFilter],
-    nestingCohortId = referenceTable$nestingCohortId[idxFilter]
-  )
+  referenceTable$filteredForbalanceFile[idxFilter] <- vapply(which(idxFilter), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      balanceArgsJsons[[aIdx]]
+    )
+    .createFilterForBalanceFileName(hash)
+  }, character(1))
+
   noBalanceIds <- which(uniqueBalanceArgsJsons == "")
   idxWithBalance <- !(referenceTable$balanceId %in% noBalanceIds)
   referenceTable$balanceFile <- ""
-  referenceTable$balanceFile[idxWithBalance] <- .createBalanceFileName(
-    loadId = referenceTable$loadArgsId[idxWithBalance],
-    studyPopId = referenceTable$studyPopArgsId[idxWithBalance],
-    psId = referenceTable$psArgsId[idxWithBalance],
-    strataId = referenceTable$strataArgsId[idxWithBalance],
-    balanceId = referenceTable$balanceId[idxWithBalance],
-    targetId = referenceTable$targetId[idxWithBalance],
-    comparatorId = referenceTable$comparatorId[idxWithBalance],
-    nestingCohortId = referenceTable$nestingCohortId[idxWithBalance],
-    outcomeId = referenceTable$outcomeId[idxWithBalance]
-  )
+  referenceTable$balanceFile[idxWithBalance] <- vapply(which(idxWithBalance), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      studyPopArgsJsons[[aIdx]],
+      createPsArgsJsons[[aIdx]],
+      strataArgsJsons[[aIdx]],
+      balanceArgsJsons[[aIdx]],
+      referenceTable$outcomeId[i]
+    )
+    .createBalanceFileName(hash)
+  }, character(1))
 
   # Add prefiltered covariate files
   preFilterArgJsons <- lapply(
@@ -1394,27 +1382,36 @@ createReferenceTable <- function(cmAnalysisList,
   idxWithPreFilter <- !(referenceTable$prefilterId %in% noPreFilterIds)
 
   referenceTable$prefilteredCovariatesFile <- ""
-  referenceTable$prefilteredCovariatesFile[idxWithPreFilter] <- .createPrefilteredCovariatesFileName(
-    loadId = referenceTable$loadArgsId[idxWithPreFilter],
-    targetId = referenceTable$targetId[idxWithPreFilter],
-    comparatorId = referenceTable$comparatorId[idxWithPreFilter],
-    nestingCohortId = referenceTable$nestingCohortId[idxWithPreFilter],
-    prefilterId = referenceTable$prefilterId[idxWithPreFilter]
-  )
+  referenceTable$prefilteredCovariatesFile[idxWithPreFilter] <- vapply(which(idxWithPreFilter), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      preFilterArgJsons[[aIdx]]
+    )
+    .createPrefilteredCovariatesFileName(hash)
+  }, character(1))
 
 
   # Add outcome model file names
-  referenceTable <- referenceTable |>
-    mutate(outcomeModelFile = ifelse(.data$fitOutcomeModel,
-                                     .createOutcomeModelFileName(
-                                       folder = .data$analysisFolder,
-                                       targetId = .data$targetId,
-                                       comparatorId = .data$comparatorId,
-                                       nestingCohortId = .data$nestingCohortId,
-                                       outcomeId = .data$outcomeId
-                                     ),
-                                     ""
-    ))
+  outcomeModelArgsJsons <- lapply(cmAnalysisList, function(x) {
+    if (is.null(x$fitOutcomeModelArgs)) "" else x$fitOutcomeModelArgs$toJson()
+  })
+  referenceTable$outcomeModelFile <- ""
+  idxWithOm <- referenceTable$fitOutcomeModel
+  referenceTable$outcomeModelFile[idxWithOm] <- vapply(which(idxWithOm), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    hash <- .contentHash(
+      databaseId,
+      referenceTable$loadHash[i],
+      studyPopArgsJsons[[aIdx]],
+      createPsArgsJsons[[aIdx]],
+      strataArgsJsons[[aIdx]],
+      outcomeModelArgsJsons[[aIdx]],
+      referenceTable$outcomeId[i]
+    )
+    .createOutcomeModelFileName(referenceTable$analysisFolder[i], hash)
+  }, character(1))
 
   # Some cleanup:
   referenceTable <- referenceTable[, c(
@@ -1479,154 +1476,62 @@ createReferenceTable <- function(cmAnalysisList,
   return(referenceTable)
 }
 
-.f <- function(x) {
-  return(format(x, scientific = FALSE, trim = TRUE))
-}
-
-.createTcnSubstring <- function(targetId, comparatorId, nestingCohortId) {
-  subString <- if_else(is.na(nestingCohortId),
-                       sprintf("_t%s_c%s", .f(targetId), .f(comparatorId)),
-                       sprintf("_t%s_c%s_n%s", .f(targetId), .f(comparatorId), .f(nestingCohortId)))
-
-  return(subString)
-}
-
-.createCohortMethodDataFileName <- function(loadId, targetId, comparatorId, nestingCohortId) {
-  name <- sprintf("CmData_l%s%s.zip",
-                  loadId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId))
-  return(name)
+# Compute a deterministic content hash from arbitrary inputs.
+# Used to generate stable file names that depend on settings, not position.
+# @param ... Inputs to hash. R6 settings objects are serialized via toList()
+#   with sorted keys for deterministic ordering.
+# @param length Number of hex characters to keep (default 12).
+# @return A character string of `length` hex characters.
+.contentHash <- function(..., length = 12) {
+  parts <- list(...)
+  canonical <- paste(vapply(parts, function(x) {
+    if (is.null(x) || (is.atomic(x) && length(x) == 1 && is.na(x))) {
+      "NULL"
+    } else if (inherits(x, "AbstractSerializableSettings")) {
+      lst <- x$toList()
+      as.character(jsonlite::toJSON(lst[order(names(lst))],
+                                    auto_unbox = TRUE, digits = NA, null = "null"))
+    } else {
+      as.character(jsonlite::toJSON(x, auto_unbox = TRUE, digits = NA, null = "null"))
+    }
+  }, character(1)), collapse = "|")
+  substr(digest::digest(canonical, algo = "sha256", serialize = FALSE), 1, length)
 }
 
-
-.createPrefilteredCovariatesFileName <- function(loadId, targetId, comparatorId, nestingCohortId, prefilterId) {
-  name <- sprintf("Prefilter_l%s%s_p%s.zip",
-                  loadId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  prefilterId)
-  name[prefilterId == -1] <- rep("", sum(prefilterId == -1))
-  return(name)
+.createCohortMethodDataFileName <- function(loadHash) {
+  sprintf("CmData_%s.zip", loadHash)
 }
 
-.createStudyPopulationFileName <- function(loadId,
-                                           studyPopId,
-                                           targetId,
-                                           comparatorId,
-                                           nestingCohortId,
-                                           outcomeId) {
-  name <- sprintf("StudyPop_l%s_s%s%s_o%s.rds",
-                  loadId,
-                  studyPopId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  .f(outcomeId))
-  return(name)
+.createPrefilteredCovariatesFileName <- function(hash) {
+  sprintf("Prefilter_%s.zip", hash)
 }
 
-.createPsFileName <- function(loadId, studyPopId, psId, targetId, comparatorId, nestingCohortId) {
-  if (is.null(studyPopId)) {
-    name <- sprintf("Ps_l%s_p%s%s.rds",
-                    loadId,
-                    psId,
-                    .createTcnSubstring(targetId, comparatorId, nestingCohortId))
-  } else {
-    name <- sprintf("Ps_l%s_s%s_p%s%s.rds",
-                    loadId,
-                    studyPopId,
-                    psId,
-                    .createTcnSubstring(targetId, comparatorId, nestingCohortId))
-  }
-  return(name)
+.createStudyPopulationFileName <- function(hash) {
+  sprintf("StudyPop_%s.rds", hash)
 }
 
-.createPsOutcomeFileName <- function(loadId,
-                                     studyPopId,
-                                     psId,
-                                     targetId,
-                                     comparatorId,
-                                     nestingCohortId,
-                                     outcomeId) {
-  name <- sprintf("Ps_l%s_s%s_p%s%s_o%s.rds",
-                  loadId,
-                  studyPopId,
-                  psId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  .f(outcomeId))
-  return(name)
+.createPsFileName <- function(hash) {
+  sprintf("Ps_%s.rds", hash)
 }
 
-.createStratifiedPopFileName <- function(loadId,
-                                         studyPopId,
-                                         psId,
-                                         strataId,
-                                         targetId,
-                                         comparatorId,
-                                         nestingCohortId,
-                                         outcomeId) {
-  name <- sprintf("StratPop_l%s_s%s_p%s%s_s%s_o%s.rds",
-                  loadId,
-                  studyPopId,
-                  psId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  strataId,
-                  .f(outcomeId))
-  return(name)
+.createStratifiedPopFileName <- function(hash) {
+  sprintf("StratPop_%s.rds", hash)
 }
 
-.createsharedBalanceFileName <- function(loadId,
-                                         studyPopId,
-                                         psId,
-                                         strataId,
-                                         sharedBalanceId,
-                                         targetId,
-                                         comparatorId,
-                                         nestingCohortId) {
-  name <- sprintf("Balance_l%s_s%s_p%s%s_s%s_b%s.rds",
-                  loadId,
-                  studyPopId,
-                  psId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  strataId,
-                  sharedBalanceId)
-  return(name)
+.createSharedBalanceFileName <- function(hash) {
+  sprintf("SharedBalance_%s.rds", hash)
 }
 
-.createFilterForBalanceFileName <- function(loadId,
-                                            balanceId,
-                                            targetId,
-                                            comparatorId,
-                                            nestingCohortId) {
-  name <- sprintf("FilterForBalance_l%s%s_b%s.zip",
-                  loadId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  balanceId)
-  return(name)
+.createFilterForBalanceFileName <- function(hash) {
+  sprintf("FilterForBalance_%s.zip", hash)
 }
 
-.createBalanceFileName <- function(loadId,
-                                   studyPopId,
-                                   psId,
-                                   strataId,
-                                   balanceId,
-                                   targetId,
-                                   comparatorId,
-                                   nestingCohortId,
-                                   outcomeId) {
-  name <- sprintf("Balance_l%s_s%s_p%s%s_s%s_b%s_o%s.rds",
-                  loadId,
-                  studyPopId,
-                  psId,
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  strataId,
-                  balanceId,
-                  .f(outcomeId))
-  return(name)
+.createBalanceFileName <- function(hash) {
+  sprintf("Balance_%s.rds", hash)
 }
 
-.createOutcomeModelFileName <- function(folder, targetId, comparatorId, nestingCohortId, outcomeId) {
-  name <- sprintf("om%s_o%s.rds",
-                  .createTcnSubstring(targetId, comparatorId, nestingCohortId),
-                  .f(outcomeId))
-  return(file.path(folder, name))
+.createOutcomeModelFileName <- function(folder, hash) {
+  file.path(folder, sprintf("om_%s.rds", hash))
 }
 
 #' Get file reference
diff --git a/R/SettingsHasher.R b/R/SettingsHasher.R
deleted file mode 100644
index 70efcac..0000000
--- a/R/SettingsHasher.R
+++ /dev/null
@@ -1,285 +0,0 @@
-# Copyright 2026 Observational Health Data Sciences and Informatics
-#
-# This file is part of CohortMethod
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#' Settings Hasher
-#'
-#' @description
-#' Compute deterministic hashes for individual settings components to detect
-#' which parts of the analysis specification have changed.
-#'
-#' @details
-#' This class hashes each settings component independently using JSON serialization
-#' and MD5 digests. This allows fine-grained detection of which artifact types need
-#' to be invalidated when specifications change.
-#'
-#' @export
-SettingsHasher <- R6::R6Class(
-  "SettingsHasher",
-  public = list(
-    #' @description
-    #' Hash the data loading arguments
-    #'
-    #' @param getDbCohortMethodDataArgs An object of type `GetDbCohortMethodDataArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashLoadArgs = function(getDbCohortMethodDataArgs) {
-      if (is.null(getDbCohortMethodDataArgs)) {
-        return("")
-      }
-      json <- getDbCohortMethodDataArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash the study population arguments
-    #'
-    #' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashStudyPopArgs = function(createStudyPopulationArgs) {
-      if (is.null(createStudyPopulationArgs)) {
-        return("")
-      }
-      json <- createStudyPopulationArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash the propensity score arguments
-    #'
-    #' @param createPsArgs An object of type `CreatePsArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashPsArgs = function(createPsArgs) {
-      if (is.null(createPsArgs)) {
-        return("")
-      }
-      json <- createPsArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash arguments for propensity score trimming
-    #'
-    #' @param trimByPsArgs An object of type `TrimByPsArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashTrimByPsArgs = function(trimByPsArgs) {
-      if (is.null(trimByPsArgs)) {
-        return("")
-      }
-      json <- trimByPsArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash arguments for propensity score matching
-    #'
-    #' @param matchOnPsArgs An object of type `MatchOnPsArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashMatchOnPsArgs = function(matchOnPsArgs) {
-      if (is.null(matchOnPsArgs)) {
-        return("")
-      }
-      json <- matchOnPsArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash arguments for propensity score stratification
-    #'
-    #' @param stratifyByPsArgs An object of type `StratifyByPsArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashStratifyByPsArgs = function(stratifyByPsArgs) {
-      if (is.null(stratifyByPsArgs)) {
-        return("")
-      }
-      json <- stratifyByPsArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash arguments for outcome model fitting
-    #'
-    #' @param fitOutcomeModelArgs An object of type `FitOutcomeModelArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashOutcomeModelArgs = function(fitOutcomeModelArgs) {
-      if (is.null(fitOutcomeModelArgs)) {
-        return("")
-      }
-      json <- fitOutcomeModelArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Hash arguments for covariate balance computation
-    #'
-    #' @param computeCovariateBalanceArgs An object of type `ComputeCovariateBalanceArgs`
-    #'
-    #' @return
-    #' Character string containing the MD5 hash (32 hex characters)
-    hashBalanceArgs = function(computeCovariateBalanceArgs) {
-      if (is.null(computeCovariateBalanceArgs)) {
-        return("")
-      }
-      json <- computeCovariateBalanceArgs$toJson()
-      return(digest::digest(json, algo = "md5"))
-    },
-
-    #' @description
-    #' Compare old and new specifications to identify which components changed
-    #'
-    #' @param oldSpecs An object of type `CmAnalysesSpecifications` (old version)
-    #' @param newSpecs An object of type `CmAnalysesSpecifications` (new version)
-    #'
-    #' @return
-    #' A list of logical values indicating which components changed:
-    #' - loadArgsChanged: whether data loading arguments differ
-    #' - studyPopArgsChanged: whether study population arguments differ
-    #' - psArgsChanged: whether propensity score arguments differ
-    #' - strataArgsChanged: whether stratification arguments (trim/match/stratify) differ
-    #' - outcomeModelArgsChanged: whether outcome model arguments differ
-    #' - balanceArgsChanged: whether balance computation arguments differ
-    #' - analyticsChanged: whether outcome of interest or analysis IDs changed
-    compareSettingsComponents = function(oldSpecs, newSpecs) {
-      checkmate::assertClass(oldSpecs, "CmAnalysesSpecifications")
-      checkmate::assertClass(newSpecs, "CmAnalysesSpecifications")
-
-      # Compare each analysis component's settings
-      changedComponents <- list(
-        loadArgsChanged = FALSE,
-        studyPopArgsChanged = FALSE,
-        psArgsChanged = FALSE,
-        strataArgsChanged = FALSE,
-        outcomeModelArgsChanged = FALSE,
-        balanceArgsChanged = FALSE,
-        analyticsChanged = FALSE
-      )
-
-      # Get the old and new analysis lists
-      oldAnalyses <- oldSpecs$cmAnalysisList
-      newAnalyses <- newSpecs$cmAnalysisList
-
-      # If the number of analyses differs significantly or can't be matched, mark as changed
-      if (length(oldAnalyses) != length(newAnalyses)) {
-        # For now, mark as all changed if list lengths differ
-        # A more sophisticated approach would match analyses by ID
-        changedComponents$analyticsChanged <- TRUE
-      } else {
-        # Compare each analysis by ID (assuming they maintain order/ID)
-        for (i in seq_along(newAnalyses)) {
-          oldAna <- oldAnalyses[[i]]
-          newAna <- newAnalyses[[i]]
-
-          # Check data loading
-          oldLoadHash <- self$hashLoadArgs(oldAna$getDbCohortMethodDataArgs)
-          newLoadHash <- self$hashLoadArgs(newAna$getDbCohortMethodDataArgs)
-          if (oldLoadHash != newLoadHash) {
-            changedComponents$loadArgsChanged <- TRUE
-          }
-
-          # Check study population
-          oldStudyPopHash <- self$hashStudyPopArgs(oldAna$createStudyPopulationArgs)
-          newStudyPopHash <- self$hashStudyPopArgs(newAna$createStudyPopulationArgs)
-          if (oldStudyPopHash != newStudyPopHash) {
-            changedComponents$studyPopArgsChanged <- TRUE
-          }
-
-          # Check propensity score
-          oldPsHash <- self$hashPsArgs(oldAna$createPsArgs)
-          newPsHash <- self$hashPsArgs(newAna$createPsArgs)
-          if (oldPsHash != newPsHash) {
-            changedComponents$psArgsChanged <- TRUE
-          }
-
-          # Check stratification (trim/match/stratify all affect strata)
-          oldTrimHash <- self$hashTrimByPsArgs(oldAna$trimByPsArgs)
-          newTrimHash <- self$hashTrimByPsArgs(newAna$trimByPsArgs)
-          oldMatchHash <- self$hashMatchOnPsArgs(oldAna$matchOnPsArgs)
-          newMatchHash <- self$hashMatchOnPsArgs(newAna$matchOnPsArgs)
-          oldStratifyHash <- self$hashStratifyByPsArgs(oldAna$stratifyByPsArgs)
-          newStratifyHash <- self$hashStratifyByPsArgs(newAna$stratifyByPsArgs)
-
-          if (oldTrimHash != newTrimHash || oldMatchHash != newMatchHash || oldStratifyHash != newStratifyHash) {
-            changedComponents$strataArgsChanged <- TRUE
-          }
-
-          # Check outcome model
-          oldOutcomeHash <- self$hashOutcomeModelArgs(oldAna$fitOutcomeModelArgs)
-          newOutcomeHash <- self$hashOutcomeModelArgs(newAna$fitOutcomeModelArgs)
-          if (oldOutcomeHash != newOutcomeHash) {
-            changedComponents$outcomeModelArgsChanged <- TRUE
-          }
-
-          # Check balance args
-          oldBalanceHash <- self$hashBalanceArgs(oldAna$computeSharedCovariateBalanceArgs)
-          newBalanceHash <- self$hashBalanceArgs(newAna$computeSharedCovariateBalanceArgs)
-          if (oldBalanceHash != newBalanceHash) {
-            changedComponents$balanceArgsChanged <- TRUE
-          }
-        }
-      }
-
-      # Check if outcomes of interest or excluded analyses changed
-      oldOutcomes <- oldSpecs$targetComparatorOutcomesList
-      newOutcomes <- newSpecs$targetComparatorOutcomesList
-
-      if (length(oldOutcomes) != length(newOutcomes)) {
-        changedComponents$analyticsChanged <- TRUE
-      } else {
-        # Check if outcomes within TCOs differ
-        for (i in seq_along(newOutcomes)) {
-          oldTco <- oldOutcomes[[i]]
-          newTco <- newOutcomes[[i]]
-
-          # Simple check: compare target, comparator, nesting cohort
-          if (oldTco$targetId != newTco$targetId ||
-            oldTco$comparatorId != newTco$comparatorId ||
-            !identical(oldTco$nestingCohortId, newTco$nestingCohortId)) {
-            changedComponents$analyticsChanged <- TRUE
-            break
-          }
-
-          # Check outcome changes (new outcomes added or old ones removed)
-          oldOutcomeIds <- sort(sapply(oldTco$outcomes, function(x) x$outcomeId))
-          newOutcomeIds <- sort(sapply(newTco$outcomes, function(x) x$outcomeId))
-          if (!identical(oldOutcomeIds, newOutcomeIds)) {
-            # Just outcomes added: no need to delete
-            # Outcomes removed: potentially need to delete
-            if (any(!(newOutcomeIds %in% oldOutcomeIds))) {
-              # New outcomes added - this is OK, don't mark as changed
-            } else {
-              # Outcomes removed - this means regeneration may be needed
-              changedComponents$analyticsChanged <- TRUE
-            }
-          }
-        }
-      }
-
-      return(changedComponents)
-    }
-  )
-)
diff --git a/R/ValidateArtifact.R b/R/ValidateArtifact.R
deleted file mode 100644
index 708822d..0000000
--- a/R/ValidateArtifact.R
+++ /dev/null
@@ -1,150 +0,0 @@
-# Copyright 2026 Observational Health Data Sciences and Informatics
-#
-# This file is part of CohortMethod
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-#' Validate Artifact
-#'
-#' @description
-#' Utility functions to validate that cached artifacts match expected parameter hashes.
-#'
-#' @details
-#' These functions help determine whether an existing artifact can be reused by
-#' validating that its parameters match the hash encoded in the filename or metadata.
-#'
-#' @keywords internal
-#'
-#' @export
-ValidateArtifact <- R6::R6Class(
-  "ValidateArtifact",
-  public = list(
-    #' @description
-    #' Extract hash from a filename
-    #'
-    #' @param filename Character string with the artifact filename
-    #'
-    #' @return
-    #' Character string containing the extracted hash, or NA if not found
-    #'
-    #' @details
-    #' Filenames typically have format like:
-    #' - CmData_l{loadId}_{tcn}.zip
-    #' - StudyPop_l{loadId}_s{studyPopId}_{tcn}_o{outcomeId}.rds
-    #'
-    #' This function extracts numeric IDs which are used as hashes.
-    extractHashFromFilename = function(filename) {
-      # This is a placeholder - actual hash extraction depends on filename format
-      # In CohortMethod, the "ID" (like loadId, studyPopId) is the hash
-      # This would be enhanced based on actual filename patterns used
-
-      # For now, return NA - the actual validation happens at file existence level
-      return(NA_character_)
-    },
-
-    #' @description
-    #' Validate that an artifact file exists and matches expected settings
-    #'
-    #' @param filepath Character string with the full path to the artifact file
-    #' @param settingsHash Character string with the expected settings hash
-    #'
-    #' @return
-    #' Logical TRUE if file exists and hash matches, FALSE otherwise
-    #'
-    #' @details
-    #' This function performs basic validation. In the current implementation,
-    #' we rely on filename encoding to ensure the hash is embedded in the filename.
-    #' If the file exists and has the right naming pattern, we consider it valid.
-    validateArtifactFile = function(filepath, settingsHash = NULL) {
-      # Check if file exists
-      if (!file.exists(filepath)) {
-        return(FALSE)
-      }
-
-      # If no hash provided, just check existence
-      if (is.null(settingsHash)) {
-        return(TRUE)
-      }
-
-      # Extract filename
-      filename <- basename(filepath)
-
-      # In current CohortMethod implementation, the hash is encoded in the filename
-      # through the ID system (loadId, studyPopId, etc.)
-      # Validation consists of checking that the file exists with correct naming
-
-      # For now, assume existence + correct naming = valid artifact
-      # This could be enhanced with actual hash extraction and comparison
-
-      return(TRUE)
-    },
-
-    #' @description
-    #' Check if a cohort method data file is valid and reusable
-    #'
-    #' @param filepath Character string with the path to the CmData zip file
-    #' @param expectedLoadId Numeric ID expected for this data loading configuration
-    #'
-    #' @return
-    #' Logical TRUE if file exists and corresponds to the expected load ID
-    validateCohortMethodData = function(filepath, expectedLoadId) {
-      if (!file.exists(filepath)) {
-        return(FALSE)
-      }
-
-      # Check that file is readable as a zip
-      tryCatch({
-        # Try to get zip file list to verify integrity
-        zip::zip_list(filepath)
-        return(TRUE)
-      }, error = function(e) {
-        return(FALSE)
-      })
-    },
-
-    #' @description
-    #' List all analysis artifacts in a directory by type
-    #'
-    #' @param outputFolder Character string with the output folder path
-    #' @param artifactType Character string indicating artifact type:
-    #'   "cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"
-    #'
-    #' @return
-    #' Character vector of full paths to matching artifact files
-    listArtifactsByType = function(outputFolder, artifactType) {
-      checkmate::assertCharacter(outputFolder, len = 1)
-      checkmate::assertCharacter(artifactType, len = 1)
-
-      if (!dir.exists(outputFolder)) {
-        return(character())
-      }
-
-      patterns <- list(
-        cmdata = "^CmData_.*\\.zip$",
-        studypop = "^StudyPop_.*\\.rds$",
-        ps = "^Ps_.*\\.rds$",
-        stratpop = "^StratPop_.*\\.rds$",
-        balance = "^Balance_.*\\.rds$",
-        outcome_model = "^Analysis_[0-9]+/om.*\\.rds$"
-      )
-
-      pattern <- patterns[[tolower(artifactType)]]
-      if (is.null(pattern)) {
-        stop("Unknown artifact type: ", artifactType)
-      }
-
-      files <- list.files(outputFolder, pattern = pattern, full.names = TRUE)
-      return(files)
-    }
-  )
-)
diff --git a/man/InvalidationPolicy.Rd b/man/InvalidationPolicy.Rd
deleted file mode 100644
index 45834eb..0000000
--- a/man/InvalidationPolicy.Rd
+++ /dev/null
@@ -1,94 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/InvalidationPolicy.R
-\name{InvalidationPolicy}
-\alias{InvalidationPolicy}
-\title{Invalidation Policy}
-\description{
-Determines which artifacts should be deleted when analysis settings change,
-based on dependency relationships between components.
-}
-\details{
-When settings change, not all artifacts need to be regenerated. This class
-implements the logic to determine minimal invalidation scope based on which
-settings components have changed.
-}
-\section{Methods}{
-\subsection{Public methods}{
-\itemize{
-\item \href{#method-InvalidationPolicy-computeInvalidationScope}{\code{InvalidationPolicy$computeInvalidationScope()}}
-\item \href{#method-InvalidationPolicy-getInvalidationMessage}{\code{InvalidationPolicy$getInvalidationMessage()}}
-\item \href{#method-InvalidationPolicy-clone}{\code{InvalidationPolicy$clone()}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-InvalidationPolicy-computeInvalidationScope"></a>}}
-\if{latex}{\out{\hypertarget{method-InvalidationPolicy-computeInvalidationScope}{}}}
-\subsection{Method \code{computeInvalidationScope()}}{
-Compute which file patterns should be deleted based on changed components
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$computeInvalidationScope(changedComponents)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{changedComponents}}{A list (as returned by SettingsHasher$compareSettingsComponents)
-with logical values for each component}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Details}{
-Deletion cascade logic:
-\itemize{
-\item If loadArgsChanged: all downstream artifacts (everything)
-\item Else if studyPopArgsChanged: populations, PS models, strata, outcomes
-\item Else if psArgsChanged: PS models, strata, outcomes
-\item Else if strataArgsChanged: strata and outcomes
-\item Else if outcomeModelArgsChanged: outcome models only
-\item Else if only balanceArgsChanged: balance files only
-\item Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific)
-}
-}
-
-\subsection{Returns}{
-Character vector of file patterns/globs to delete. Empty vector if no deletion needed.
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-InvalidationPolicy-getInvalidationMessage"></a>}}
-\if{latex}{\out{\hypertarget{method-InvalidationPolicy-getInvalidationMessage}{}}}
-\subsection{Method \code{getInvalidationMessage()}}{
-Get a human-readable message describing what will be deleted
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$getInvalidationMessage(changedComponents)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{changedComponents}}{A list of changed components}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string suitable for displaying to the user
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-InvalidationPolicy-clone"></a>}}
-\if{latex}{\out{\hypertarget{method-InvalidationPolicy-clone}{}}}
-\subsection{Method \code{clone()}}{
-The objects of this class are cloneable with this method.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{InvalidationPolicy$clone(deep = FALSE)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{deep}}{Whether to make a deep clone.}
-}
-\if{html}{\out{</div>}}
-}
-}
-}
diff --git a/man/SettingsHasher.Rd b/man/SettingsHasher.Rd
deleted file mode 100644
index 4e1df70..0000000
--- a/man/SettingsHasher.Rd
+++ /dev/null
@@ -1,238 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/SettingsHasher.R
-\name{SettingsHasher}
-\alias{SettingsHasher}
-\title{Settings Hasher}
-\description{
-Compute deterministic hashes for individual settings components to detect
-which parts of the analysis specification have changed.
-}
-\details{
-This class hashes each settings component independently using JSON serialization
-and MD5 digests. This allows fine-grained detection of which artifact types need
-to be invalidated when specifications change.
-}
-\section{Methods}{
-\subsection{Public methods}{
-\itemize{
-\item \href{#method-SettingsHasher-hashLoadArgs}{\code{SettingsHasher$hashLoadArgs()}}
-\item \href{#method-SettingsHasher-hashStudyPopArgs}{\code{SettingsHasher$hashStudyPopArgs()}}
-\item \href{#method-SettingsHasher-hashPsArgs}{\code{SettingsHasher$hashPsArgs()}}
-\item \href{#method-SettingsHasher-hashTrimByPsArgs}{\code{SettingsHasher$hashTrimByPsArgs()}}
-\item \href{#method-SettingsHasher-hashMatchOnPsArgs}{\code{SettingsHasher$hashMatchOnPsArgs()}}
-\item \href{#method-SettingsHasher-hashStratifyByPsArgs}{\code{SettingsHasher$hashStratifyByPsArgs()}}
-\item \href{#method-SettingsHasher-hashOutcomeModelArgs}{\code{SettingsHasher$hashOutcomeModelArgs()}}
-\item \href{#method-SettingsHasher-hashBalanceArgs}{\code{SettingsHasher$hashBalanceArgs()}}
-\item \href{#method-SettingsHasher-compareSettingsComponents}{\code{SettingsHasher$compareSettingsComponents()}}
-\item \href{#method-SettingsHasher-clone}{\code{SettingsHasher$clone()}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashLoadArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashLoadArgs}{}}}
-\subsection{Method \code{hashLoadArgs()}}{
-Hash the data loading arguments
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashLoadArgs(getDbCohortMethodDataArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{getDbCohortMethodDataArgs}}{An object of type \code{GetDbCohortMethodDataArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashStudyPopArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStudyPopArgs}{}}}
-\subsection{Method \code{hashStudyPopArgs()}}{
-Hash the study population arguments
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashStudyPopArgs(createStudyPopulationArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{createStudyPopulationArgs}}{An object of type \code{CreateStudyPopulationArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashPsArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashPsArgs}{}}}
-\subsection{Method \code{hashPsArgs()}}{
-Hash the propensity score arguments
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashPsArgs(createPsArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{createPsArgs}}{An object of type \code{CreatePsArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashTrimByPsArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashTrimByPsArgs}{}}}
-\subsection{Method \code{hashTrimByPsArgs()}}{
-Hash arguments for propensity score trimming
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashTrimByPsArgs(trimByPsArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{trimByPsArgs}}{An object of type \code{TrimByPsArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashMatchOnPsArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashMatchOnPsArgs}{}}}
-\subsection{Method \code{hashMatchOnPsArgs()}}{
-Hash arguments for propensity score matching
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashMatchOnPsArgs(matchOnPsArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{matchOnPsArgs}}{An object of type \code{MatchOnPsArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashStratifyByPsArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStratifyByPsArgs}{}}}
-\subsection{Method \code{hashStratifyByPsArgs()}}{
-Hash arguments for propensity score stratification
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashStratifyByPsArgs(stratifyByPsArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{stratifyByPsArgs}}{An object of type \code{StratifyByPsArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashOutcomeModelArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashOutcomeModelArgs}{}}}
-\subsection{Method \code{hashOutcomeModelArgs()}}{
-Hash arguments for outcome model fitting
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashOutcomeModelArgs(fitOutcomeModelArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{fitOutcomeModelArgs}}{An object of type \code{FitOutcomeModelArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-hashBalanceArgs"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-hashBalanceArgs}{}}}
-\subsection{Method \code{hashBalanceArgs()}}{
-Hash arguments for covariate balance computation
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$hashBalanceArgs(computeCovariateBalanceArgs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{computeCovariateBalanceArgs}}{An object of type \code{ComputeCovariateBalanceArgs}}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character string containing the MD5 hash (32 hex characters)
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-compareSettingsComponents"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-compareSettingsComponents}{}}}
-\subsection{Method \code{compareSettingsComponents()}}{
-Compare old and new specifications to identify which components changed
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$compareSettingsComponents(oldSpecs, newSpecs)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{oldSpecs}}{An object of type \code{CmAnalysesSpecifications} (old version)}
-
-\item{\code{newSpecs}}{An object of type \code{CmAnalysesSpecifications} (new version)}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-A list of logical values indicating which components changed:
-\itemize{
-\item loadArgsChanged: whether data loading arguments differ
-\item studyPopArgsChanged: whether study population arguments differ
-\item psArgsChanged: whether propensity score arguments differ
-\item strataArgsChanged: whether stratification arguments (trim/match/stratify) differ
-\item outcomeModelArgsChanged: whether outcome model arguments differ
-\item balanceArgsChanged: whether balance computation arguments differ
-\item analyticsChanged: whether outcome of interest or analysis IDs changed
-}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-SettingsHasher-clone"></a>}}
-\if{latex}{\out{\hypertarget{method-SettingsHasher-clone}{}}}
-\subsection{Method \code{clone()}}{
-The objects of this class are cloneable with this method.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{SettingsHasher$clone(deep = FALSE)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{deep}}{Whether to make a deep clone.}
-}
-\if{html}{\out{</div>}}
-}
-}
-}
diff --git a/man/ValidateArtifact.Rd b/man/ValidateArtifact.Rd
deleted file mode 100644
index 1d1c516..0000000
--- a/man/ValidateArtifact.Rd
+++ /dev/null
@@ -1,144 +0,0 @@
-% Generated by roxygen2: do not edit by hand
-% Please edit documentation in R/ValidateArtifact.R
-\name{ValidateArtifact}
-\alias{ValidateArtifact}
-\title{Validate Artifact}
-\description{
-Utility functions to validate that cached artifacts match expected parameter hashes.
-}
-\details{
-These functions help determine whether an existing artifact can be reused by
-validating that its parameters match the hash encoded in the filename or metadata.
-}
-\keyword{internal}
-\section{Methods}{
-\subsection{Public methods}{
-\itemize{
-\item \href{#method-ValidateArtifact-extractHashFromFilename}{\code{ValidateArtifact$extractHashFromFilename()}}
-\item \href{#method-ValidateArtifact-validateArtifactFile}{\code{ValidateArtifact$validateArtifactFile()}}
-\item \href{#method-ValidateArtifact-validateCohortMethodData}{\code{ValidateArtifact$validateCohortMethodData()}}
-\item \href{#method-ValidateArtifact-listArtifactsByType}{\code{ValidateArtifact$listArtifactsByType()}}
-\item \href{#method-ValidateArtifact-clone}{\code{ValidateArtifact$clone()}}
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-ValidateArtifact-extractHashFromFilename"></a>}}
-\if{latex}{\out{\hypertarget{method-ValidateArtifact-extractHashFromFilename}{}}}
-\subsection{Method \code{extractHashFromFilename()}}{
-Extract hash from a filename
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$extractHashFromFilename(filename)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{filename}}{Character string with the artifact filename}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Details}{
-Filenames typically have format like:
-\itemize{
-\item CmData_l{loadId}_{tcn}.zip
-\item StudyPop_l{loadId}\emph{s{studyPopId}}{tcn}_o{outcomeId}.rds
-}
-
-This function extracts numeric IDs which are used as hashes.
-}
-
-\subsection{Returns}{
-Character string containing the extracted hash, or NA if not found
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-ValidateArtifact-validateArtifactFile"></a>}}
-\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateArtifactFile}{}}}
-\subsection{Method \code{validateArtifactFile()}}{
-Validate that an artifact file exists and matches expected settings
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$validateArtifactFile(filepath, settingsHash = NULL)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{filepath}}{Character string with the full path to the artifact file}
-
-\item{\code{settingsHash}}{Character string with the expected settings hash}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Details}{
-This function performs basic validation. In the current implementation,
-we rely on filename encoding to ensure the hash is embedded in the filename.
-If the file exists and has the right naming pattern, we consider it valid.
-}
-
-\subsection{Returns}{
-Logical TRUE if file exists and hash matches, FALSE otherwise
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-ValidateArtifact-validateCohortMethodData"></a>}}
-\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateCohortMethodData}{}}}
-\subsection{Method \code{validateCohortMethodData()}}{
-Check if a cohort method data file is valid and reusable
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$validateCohortMethodData(filepath, expectedLoadId)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{filepath}}{Character string with the path to the CmData zip file}
-
-\item{\code{expectedLoadId}}{Numeric ID expected for this data loading configuration}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Logical TRUE if file exists and corresponds to the expected load ID
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-ValidateArtifact-listArtifactsByType"></a>}}
-\if{latex}{\out{\hypertarget{method-ValidateArtifact-listArtifactsByType}{}}}
-\subsection{Method \code{listArtifactsByType()}}{
-List all analysis artifacts in a directory by type
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$listArtifactsByType(outputFolder, artifactType)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{outputFolder}}{Character string with the output folder path}
-
-\item{\code{artifactType}}{Character string indicating artifact type:
-"cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"}
-}
-\if{html}{\out{</div>}}
-}
-\subsection{Returns}{
-Character vector of full paths to matching artifact files
-}
-}
-\if{html}{\out{<hr>}}
-\if{html}{\out{<a id="method-ValidateArtifact-clone"></a>}}
-\if{latex}{\out{\hypertarget{method-ValidateArtifact-clone}{}}}
-\subsection{Method \code{clone()}}{
-The objects of this class are cloneable with this method.
-\subsection{Usage}{
-\if{html}{\out{<div class="r">}}\preformatted{ValidateArtifact$clone(deep = FALSE)}\if{html}{\out{</div>}}
-}
-
-\subsection{Arguments}{
-\if{html}{\out{<div class="arguments">}}
-\describe{
-\item{\code{deep}}{Whether to make a deep clone.}
-}
-\if{html}{\out{</div>}}
-}
-}
-}
diff --git a/tests/testthat/test-artifactStore.R b/tests/testthat/test-artifactStore.R
new file mode 100644
index 0000000..42e9a67
--- /dev/null
+++ b/tests/testthat/test-artifactStore.R
@@ -0,0 +1,155 @@
+context("Artifact Store and Content Hashing")
+
+test_that("LocalArtifactStore can be initialized", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+  expect_s3_class(store, "R6")
+  expect_true(inherits(store, "ArtifactStore"))
+  expect_true(dir.exists(outputFolder))
+})
+
+test_that("LocalArtifactStore exists/readRDS/saveRDS work", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+
+  expect_false(store$exists("test.rds"))
+
+  store$saveRDS(list(a = 1, b = "hello"), "test.rds")
+  expect_true(store$exists("test.rds"))
+
+  obj <- store$readRDS("test.rds")
+  expect_equal(obj$a, 1)
+  expect_equal(obj$b, "hello")
+})
+
+test_that("LocalArtifactStore listArtifacts works", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+
+  store$saveRDS(1, "CmData_abc123.rds")
+  store$saveRDS(2, "StudyPop_def456.rds")
+  store$saveRDS(3, "Ps_ghi789.rds")
+
+  all <- store$listArtifacts()
+  expect_equal(length(all), 3)
+
+  cm <- store$listArtifacts(prefix = "CmData")
+  expect_equal(length(cm), 1)
+  expect_true(grepl("CmData", cm))
+})
+
+test_that("LocalArtifactStore delete works", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+
+  store$saveRDS(1, "test.rds")
+  expect_true(store$exists("test.rds"))
+
+  store$delete("test.rds")
+  expect_false(store$exists("test.rds"))
+})
+
+test_that("LocalArtifactStore ensureDir creates nested directories", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+
+  store$ensureDir("Analysis_1/om_abc123.rds")
+  expect_true(dir.exists(file.path(outputFolder, "Analysis_1")))
+})
+
+test_that("LocalArtifactStore getFullPath works", {
+  outputFolder <- tempfile(pattern = "artifactStore")
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  store <- LocalArtifactStore$new(outputFolder)
+
+  path <- store$getFullPath("CmData_abc.zip")
+  expect_equal(path, file.path(outputFolder, "CmData_abc.zip"))
+})
+
+test_that(".contentHash produces consistent results", {
+  hash1 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2)
+  hash2 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2)
+  expect_equal(hash1, hash2)
+  expect_equal(nchar(hash1), 12)
+})
+
+test_that(".contentHash differs with different inputs", {
+  hash1 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2)
+  hash2 <- CohortMethod:::.contentHash("db2", "loadArgs", 1, 2)
+  expect_true(hash1 != hash2)
+})
+
+test_that(".contentHash handles NULL inputs", {
+  hash1 <- CohortMethod:::.contentHash("db1", NULL, 1)
+  hash2 <- CohortMethod:::.contentHash("db1", NULL, 1)
+  expect_equal(hash1, hash2)
+})
+
+test_that(".contentHash includes databaseId in differentiation", {
+  hash1 <- CohortMethod:::.contentHash("database_A", "settings1")
+  hash2 <- CohortMethod:::.contentHash("database_B", "settings1")
+  expect_true(hash1 != hash2)
+})
+
+test_that("Content-addressed filenames change when settings change", {
+  args1 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = FeatureExtraction::createCovariateSettings(
+      useDemographicsGender = TRUE
+    )
+  )
+  args2 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = FeatureExtraction::createCovariateSettings(
+      useDemographicsGender = TRUE
+    ),
+    maxCohortSize = 50000
+  )
+
+  hash1 <- CohortMethod:::.contentHash("testDb", args1$toJson(), 1, 2, NA)
+  hash2 <- CohortMethod:::.contentHash("testDb", args2$toJson(), 1, 2, NA)
+
+  expect_true(hash1 != hash2)
+})
+
+test_that("Content-addressed filenames stable when settings identical", {
+  args1 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = FeatureExtraction::createCovariateSettings(
+      useDemographicsGender = TRUE
+    )
+  )
+  args2 <- createGetDbCohortMethodDataArgs(
+    covariateSettings = FeatureExtraction::createCovariateSettings(
+      useDemographicsGender = TRUE
+    )
+  )
+
+  hash1 <- CohortMethod:::.contentHash("testDb", args1$toJson(), 1, 2, NA)
+  hash2 <- CohortMethod:::.contentHash("testDb", args2$toJson(), 1, 2, NA)
+
+  expect_equal(hash1, hash2)
+})
+
+test_that("Cascading hashes propagate upstream changes", {
+  loadHash1 <- CohortMethod:::.contentHash("db1", "loadArgs_v1", 1, 2, NA)
+  loadHash2 <- CohortMethod:::.contentHash("db1", "loadArgs_v2", 1, 2, NA)
+
+  # Study pop hash includes loadHash, so changing load args changes studyPop filename
+  studyPopHash1 <- CohortMethod:::.contentHash("db1", loadHash1, "studyPopArgs", 3)
+  studyPopHash2 <- CohortMethod:::.contentHash("db1", loadHash2, "studyPopArgs", 3)
+
+  expect_true(studyPopHash1 != studyPopHash2)
+
+  # But same load args + same studyPop args = same hash
+  studyPopHash3 <- CohortMethod:::.contentHash("db1", loadHash1, "studyPopArgs", 3)
+  expect_equal(studyPopHash1, studyPopHash3)
+})
diff --git a/tests/testthat/test-eunomia.R b/tests/testthat/test-eunomia.R
index b71e883..ffb0131 100644
--- a/tests/testthat/test-eunomia.R
+++ b/tests/testthat/test-eunomia.R
@@ -238,6 +238,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
           exposureTable = "cohort",
           outcomeTable = "cohort",
           outputFolder = outputFolder1,
+          databaseId = "Eunomia",
           cmAnalysesSpecifications = createCmAnalysesSpecifications(
             cmAnalysisList = cmAnalysisList,
             targetComparatorOutcomesList = targetComparatorOutcomesList,
@@ -374,6 +375,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
           exposureTable = "cohort",
           outcomeTable = "cohort",
           outputFolder = outputFolder2,
+          databaseId = "Eunomia",
           cmAnalysesSpecifications = createCmAnalysesSpecifications(
             cmAnalysisList = cmAnalysisList,
             targetComparatorOutcomesList = targetComparatorOutcomesList,
@@ -384,7 +386,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
     )
 
     expect_equal(result$sharedPsFile, c("", ""))
-    expect_equal(result$psFile, c("Ps_l1_s1_p1_t1_c2_o3.rds", "Ps_l1_s1_p1_t1_c2_o4.rds"))
+    expect_true(all(grepl("^Ps_", result$psFile)))
   })
 
   test_that("High correlation covariates", {
@@ -458,6 +460,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
       exposureTable = "cohort",
       outcomeTable = "cohort",
       outputFolder = outputFolder3,
+      databaseId = "Eunomia",
       cmAnalysesSpecifications = createCmAnalysesSpecifications(
         cmAnalysisList = cmAnalysisList,
         targetComparatorOutcomesList = targetComparatorOutcomesList
@@ -564,6 +567,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
           exposureTable = "cohort",
           outcomeTable = "cohort",
           outputFolder = outputFolder4,
+          databaseId = "Eunomia",
           cmAnalysesSpecifications = createCmAnalysesSpecifications(
             cmAnalysisList = cmAnalysisList,
             targetComparatorOutcomesList = targetComparatorOutcomesList
@@ -685,6 +689,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
           exposureTable = "cohort",
           outcomeTable = "cohort",
           outputFolder = outputFolder5,
+          databaseId = "Eunomia",
           cmAnalysesSpecifications = createCmAnalysesSpecifications(
             cmAnalysisList = cmAnalysisList,
             targetComparatorOutcomesList = targetComparatorOutcomesList
@@ -695,11 +700,11 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
     expect_equal(nrow(result), 4)
     expect_equal(sum(result$nestingCohortId == 4, na.rm = TRUE), 2)
 
-    expect_equal(length(grep("_t1_c2_n4", result$cohortMethodDataFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$studyPopFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$sharedPsFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$psFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$strataFile)), 2)
+    # With content-addressable hashes, nesting cohort produces different hashes
+    # Verify that nested rows have distinct filenames from non-nested rows
+    nestedFiles <- result$cohortMethodDataFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)]
+    nonNestedFiles <- result$cohortMethodDataFile[is.na(result$nestingCohortId)]
+    expect_true(length(intersect(nestedFiles, nonNestedFiles)) == 0)
     expect_equal(length(grep("_t1_c2_n4", result$sharedBalanceFile)), 2)
     expect_equal(length(grep("_t1_c2_n4", result$filteredForbalanceFile)), 2)
     expect_equal(length(grep("_t1_c2_n4", result$balanceFile)), 2)
@@ -819,6 +824,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
           exposureTable = "cohort",
           outcomeTable = "cohort",
           outputFolder = outputFolder6,
+          databaseId = "Eunomia",
           cmAnalysesSpecifications = createCmAnalysesSpecifications(
             cmAnalysisList = cmAnalysisList,
             targetComparatorOutcomesList = targetComparatorOutcomesList,
diff --git a/tests/testthat/test-invalidationPolicy.R b/tests/testthat/test-invalidationPolicy.R
deleted file mode 100644
index 3674508..0000000
--- a/tests/testthat/test-invalidationPolicy.R
+++ /dev/null
@@ -1,288 +0,0 @@
-context("Artifact caching system")
-
-# Helper to create covariate settings
-getDefaultCovariateSettings <- function() {
-  return(FeatureExtraction::createCovariateSettings(
-    useDemographicsGender = TRUE,
-    useDemographicsAge = TRUE
-  ))
-}
-
-test_that("SettingsHasher hashes settings consistently", {
-  # Create two identical settings objects
-  args1 <- createGetDbCohortMethodDataArgs(
-    covariateSettings = getDefaultCovariateSettings()
-  )
-  args2 <- createGetDbCohortMethodDataArgs(
-    covariateSettings = getDefaultCovariateSettings()
-  )
-
-  hasher <- SettingsHasher$new()
-
-  # Hash the same settings twice
-  hash1 <- hasher$hashLoadArgs(args1)
-  hash2 <- hasher$hashLoadArgs(args2)
-
-  # Should produce identical hashes
-  expect_equal(hash1, hash2)
-  expect_true(is.character(hash1))
-  expect_equal(nchar(hash1), 32)  # MD5 is 32 hex characters
-})
-
-test_that("SettingsHasher detects changes in load args", {
-  hasher <- SettingsHasher$new()
-
-  args1 <- createGetDbCohortMethodDataArgs(
-    covariateSettings = getDefaultCovariateSettings()
-  )
-  args2 <- createGetDbCohortMethodDataArgs(
-    covariateSettings = getDefaultCovariateSettings(),
-    maxCohortSize = 50000  # Different
-  )
-
-  hash1 <- hasher$hashLoadArgs(args1)
-  hash2 <- hasher$hashLoadArgs(args2)
-
-  expect_true(hash1 != hash2)
-})
-
-test_that("SettingsHasher detects changes in study population args", {
-  hasher <- SettingsHasher$new()
-
-  args1 <- createCreateStudyPopulationArgs()
-  args2 <- createCreateStudyPopulationArgs(minDaysAtRisk = 30)  # Different
-
-  hash1 <- hasher$hashStudyPopArgs(args1)
-  hash2 <- hasher$hashStudyPopArgs(args2)
-
-  expect_true(hash1 != hash2)
-})
-
-test_that("SettingsHasher handles NULL arguments", {
-  hasher <- SettingsHasher$new()
-
-  hash <- hasher$hashLoadArgs(NULL)
-  expect_equal(hash, "")
-
-  hash <- hasher$hashPsArgs(NULL)
-  expect_equal(hash, "")
-})
-
-test_that("SettingsHasher compareSettingsComponents works with identical specs", {
-  hasher <- SettingsHasher$new()
-
-  # Create identical specifications
-  analysis <- createCmAnalysis(
-    analysisId = 1,
-    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
-      covariateSettings = getDefaultCovariateSettings()
-    ),
-    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
-    createPsArgs = createCreatePsArgs()
-  )
-
-  specs1 <- createCmAnalysesSpecifications(
-    cmAnalysisList = list(analysis),
-    targetComparatorOutcomesList = list(
-      createTargetComparatorOutcomes(
-        targetId = 1,
-        comparatorId = 2,
-        outcomes = list(createOutcome(outcomeId = 3))
-      )
-    )
-  )
-
-  specs2 <- createCmAnalysesSpecifications(
-    cmAnalysisList = list(analysis),
-    targetComparatorOutcomesList = list(
-      createTargetComparatorOutcomes(
-        targetId = 1,
-        comparatorId = 2,
-        outcomes = list(createOutcome(outcomeId = 3))
-      )
-    )
-  )
-
-  changes <- hasher$compareSettingsComponents(specs1, specs2)
-
-  expect_false(any(as.logical(changes)))
-})
-
-test_that("SettingsHasher compareSettingsComponents detects load args change", {
-  hasher <- SettingsHasher$new()
-
-  analysis1 <- createCmAnalysis(
-    analysisId = 1,
-    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
-      covariateSettings = getDefaultCovariateSettings()
-    ),
-    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
-    createPsArgs = createCreatePsArgs()
-  )
-
-  analysis2 <- createCmAnalysis(
-    analysisId = 1,
-    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
-      covariateSettings = getDefaultCovariateSettings(),
-      maxCohortSize = 50000
-    ),
-    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
-    createPsArgs = createCreatePsArgs()
-  )
-
-  specs1 <- createCmAnalysesSpecifications(
-    cmAnalysisList = list(analysis1),
-    targetComparatorOutcomesList = list(
-      createTargetComparatorOutcomes(
-        targetId = 1,
-        comparatorId = 2,
-        outcomes = list(createOutcome(outcomeId = 3))
-      )
-    )
-  )
-
-  specs2 <- createCmAnalysesSpecifications(
-    cmAnalysisList = list(analysis2),
-    targetComparatorOutcomesList = list(
-      createTargetComparatorOutcomes(
-        targetId = 1,
-        comparatorId = 2,
-        outcomes = list(createOutcome(outcomeId = 3))
-      )
-    )
-  )
-
-  changes <- hasher$compareSettingsComponents(specs1, specs2)
-
-  expect_true(changes$loadArgsChanged)
-})
-
-test_that("InvalidationPolicy computes correct deletion scope for load args change", {
-  policy <- InvalidationPolicy$new()
-
-  changedComponents <- list(
-    loadArgsChanged = TRUE,
-    studyPopArgsChanged = FALSE,
-    psArgsChanged = FALSE,
-    strataArgsChanged = FALSE,
-    outcomeModelArgsChanged = FALSE,
-    balanceArgsChanged = FALSE,
-    analyticsChanged = FALSE
-  )
-
-  patterns <- policy$computeInvalidationScope(changedComponents)
-
-  # Should include all major artifact patterns
-  expect_true(any(grepl("CmData", patterns)))
-  expect_true(any(grepl("StudyPop", patterns)))
-  expect_true(any(grepl("Ps", patterns)))
-  expect_true(any(grepl("Analysis", patterns)))
-})
-
-test_that("InvalidationPolicy computes correct deletion scope for study pop args change", {
-  policy <- InvalidationPolicy$new()
-
-  changedComponents <- list(
-    loadArgsChanged = FALSE,
-    studyPopArgsChanged = TRUE,
-    psArgsChanged = FALSE,
-    strataArgsChanged = FALSE,
-    outcomeModelArgsChanged = FALSE,
-    balanceArgsChanged = FALSE,
-    analyticsChanged = FALSE
-  )
-
-  patterns <- policy$computeInvalidationScope(changedComponents)
-
-  # Should include study pop and downstream
-  expect_true(any(grepl("StudyPop", patterns)))
-  expect_true(any(grepl("Ps", patterns)))
-  expect_true(any(grepl("Analysis", patterns)))
-
-  # But not CmData
-  expect_false(any(grepl("CmData_", patterns)))
-})
-
-test_that("InvalidationPolicy computes empty scope for outcome addition only", {
-  policy <- InvalidationPolicy$new()
-
-  changedComponents <- list(
-    loadArgsChanged = FALSE,
-    studyPopArgsChanged = FALSE,
-    psArgsChanged = FALSE,
-    strataArgsChanged = FALSE,
-    outcomeModelArgsChanged = FALSE,
-    balanceArgsChanged = FALSE,
-    analyticsChanged = TRUE  # But only because new outcomes added
-  )
-
-  patterns <- policy$computeInvalidationScope(changedComponents)
-
-  # Should be empty - new outcomes don't require deletion
-  expect_equal(length(patterns), 0)
-})
-
-test_that("InvalidationPolicy provides meaningful messages", {
-  policy <- InvalidationPolicy$new()
-
-  changedComponents <- list(
-    loadArgsChanged = TRUE,
-    studyPopArgsChanged = FALSE,
-    psArgsChanged = FALSE,
-    strataArgsChanged = FALSE,
-    outcomeModelArgsChanged = FALSE,
-    balanceArgsChanged = FALSE,
-    analyticsChanged = FALSE
-  )
-
-  message <- policy$getInvalidationMessage(changedComponents)
-  expect_true(grepl("Data loading", message, ignore.case = TRUE))
-  expect_true(is.character(message))
-})
-
-test_that("ValidateArtifact can list artifacts by type", {
-  outputFolder <- tempfile(pattern = "cmArtifacts")
-  dir.create(outputFolder)
-
-  on.exit(unlink(outputFolder, recursive = TRUE))
-
-  # Create some test files
-  file.create(file.path(outputFolder, "CmData_l1_t1_c2.zip"))
-  file.create(file.path(outputFolder, "StudyPop_l1_s1_t1_c2_o3.rds"))
-  file.create(file.path(outputFolder, "Ps_l1_p1_t1_c2.rds"))
-
-  validator <- ValidateArtifact$new()
-
-  cmdata <- validator$listArtifactsByType(outputFolder, "cmdata")
-  expect_equal(length(cmdata), 1)
-  expect_true(grepl("CmData", cmdata))
-
-  studypop <- validator$listArtifactsByType(outputFolder, "studypop")
-  expect_equal(length(studypop), 1)
-  expect_true(grepl("StudyPop", studypop))
-
-  ps <- validator$listArtifactsByType(outputFolder, "ps")
-  expect_equal(length(ps), 1)
-  expect_true(grepl("Ps", ps))
-})
-
-test_that("ValidateArtifact can validate artifact file existence", {
-  outputFolder <- tempfile(pattern = "cmArtifacts")
-  dir.create(outputFolder)
-
-  on.exit(unlink(outputFolder, recursive = TRUE))
-
-  validator <- ValidateArtifact$new()
-
-  # Non-existent file
-  result <- validator$validateArtifactFile(file.path(outputFolder, "nonexistent.zip"))
-  expect_false(result)
-
-  # Create a file
-  testFile <- file.path(outputFolder, "test.zip")
-  file.create(testFile)
-
-  # Existing file
-  result <- validator$validateArtifactFile(testFile)
-  expect_true(result)
-})

From a2f62be6a20f350a19c751a25e62d550a7d464c0 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Thu, 30 Apr 2026 07:48:55 -0700
Subject: [PATCH 03/10] Adding additional tests for cache schenarios

---
 tests/testthat/test-cachingScenarios.R | 632 +++++++++++++++++++++++++
 1 file changed, 632 insertions(+)
 create mode 100644 tests/testthat/test-cachingScenarios.R

diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R
new file mode 100644
index 0000000..1136e17
--- /dev/null
+++ b/tests/testthat/test-cachingScenarios.R
@@ -0,0 +1,632 @@
+context("Content-addressable caching scenarios")
+
+# These tests verify that content-addressable filenames correctly handle
+# real-world scenarios: reusing artifacts when possible, generating new
+# filenames when settings change, and cascading hash changes appropriately.
+
+# Helper to create a basic analysis for testing
+makeBasicAnalysis <- function(analysisId = 1, withPs = TRUE, withStrata = TRUE) {
+  args <- list(
+    analysisId = analysisId,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = FeatureExtraction::createCovariateSettings(
+        useDemographicsGender = TRUE,
+        useDemographicsAge = TRUE
+      )
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(
+      minDaysAtRisk = 1,
+      riskWindowStart = 0,
+      startAnchor = "cohort start",
+      riskWindowEnd = 30,
+      endAnchor = "cohort start"
+    )
+  )
+  if (withPs) {
+    args$createPsArgs <- createCreatePsArgs()
+  }
+  if (withStrata && withPs) {
+    args$matchOnPsArgs <- createMatchOnPsArgs()
+  }
+  args$fitOutcomeModelArgs <- createFitOutcomeModelArgs(modelType = "cox")
+  do.call(createCmAnalysis, args)
+}
+
+# Helper to build a reference table without needing a database
+buildRef <- function(cmAnalysisList,
+                     targetComparatorOutcomesList,
+                     databaseId = "testDb",
+                     refitPsForEveryOutcome = FALSE,
+                     refitPsForEveryStudyPopulation = TRUE) {
+  outputFolder <- tempfile(pattern = "cmRef")
+  dir.create(outputFolder)
+  on.exit(unlink(outputFolder, recursive = TRUE))
+
+  CohortMethod:::createReferenceTable(
+    cmAnalysisList = cmAnalysisList,
+    targetComparatorOutcomesList = targetComparatorOutcomesList,
+    analysesToExclude = NULL,
+    outputFolder = outputFolder,
+    refitPsForEveryOutcome = refitPsForEveryOutcome,
+    refitPsForEveryStudyPopulation = refitPsForEveryStudyPopulation,
+    databaseId = databaseId
+  )
+}
+
+
+# ===========================================================================
+# Scenario 1: Adding a new negative control outcome
+# Negative controls need new outcome models but should NOT change PS models
+# or CohortMethodData files
+# ===========================================================================
+
+test_that("Adding a negative control does NOT change CmData or shared PS filenames", {
+  analysis <- makeBasicAnalysis()
+
+  # Original: one outcome of interest, one negative control
+  tcos1 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 200, outcomeOfInterest = FALSE)  # negative control
+      )
+    )
+  )
+
+  # New: add a second negative control
+  tcos2 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 200, outcomeOfInterest = FALSE),
+        createOutcome(outcomeId = 300, outcomeOfInterest = FALSE)  # NEW negative control
+      )
+    )
+  )
+
+  ref1 <- buildRef(list(analysis), tcos1)
+  ref2 <- buildRef(list(analysis), tcos2)
+
+  # CohortMethodData files should be identical (same load args, same target/comparator)
+  expect_equal(
+    unique(ref1$cohortMethodDataFile),
+    unique(ref2$cohortMethodDataFile)
+  )
+
+  # Shared PS files should be identical (PS doesn't depend on outcome)
+  sharedPs1 <- unique(ref1$sharedPsFile[ref1$sharedPsFile != ""])
+  sharedPs2 <- unique(ref2$sharedPsFile[ref2$sharedPsFile != ""])
+  expect_equal(sharedPs1, sharedPs2)
+
+  # Original outcome (100) should have the same outcome model file
+  om1_100 <- ref1$outcomeModelFile[ref1$outcomeId == 100]
+  om2_100 <- ref2$outcomeModelFile[ref2$outcomeId == 100]
+  expect_equal(om1_100, om2_100)
+
+  # New negative control (300) should have a NEW outcome model file
+  om2_300 <- ref2$outcomeModelFile[ref2$outcomeId == 300]
+  expect_true(om2_300 != "")
+  expect_true(!(om2_300 %in% ref1$outcomeModelFile))
+})
+
+test_that("Adding a negative control produces distinct outcome model per outcome", {
+  analysis <- makeBasicAnalysis()
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 200, outcomeOfInterest = FALSE),
+        createOutcome(outcomeId = 300, outcomeOfInterest = FALSE)
+      )
+    )
+  )
+
+  ref <- buildRef(list(analysis), tcos)
+
+  # Each outcome should have a unique outcome model filename
+  omFiles <- ref$outcomeModelFile[ref$outcomeModelFile != ""]
+  expect_equal(length(omFiles), length(unique(omFiles)))
+})
+
+
+# ===========================================================================
+# Scenario 2: Adding an outcome of interest
+# Should produce new StudyPop + StratPop + OutcomeModel for the new outcome
+# while reusing CmData and shared PS
+# ===========================================================================
+
+test_that("Adding an outcome of interest reuses CmData and shared PS", {
+  analysis <- makeBasicAnalysis()
+
+  tcos1 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+
+  tcos2 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 101, outcomeOfInterest = TRUE)  # NEW
+      )
+    )
+  )
+
+  ref1 <- buildRef(list(analysis), tcos1)
+  ref2 <- buildRef(list(analysis), tcos2)
+
+  # CohortMethodData should be reused
+  expect_equal(unique(ref1$cohortMethodDataFile), unique(ref2$cohortMethodDataFile))
+
+  # Shared PS should be reused (same model, not outcome-specific)
+  sharedPs1 <- unique(ref1$sharedPsFile[ref1$sharedPsFile != ""])
+  sharedPs2 <- unique(ref2$sharedPsFile[ref2$sharedPsFile != ""])
+  expect_equal(sharedPs1, sharedPs2)
+
+  # Original outcome 100 filenames should be identical
+  expect_equal(
+    ref1$studyPopFile[ref1$outcomeId == 100],
+    ref2$studyPopFile[ref2$outcomeId == 100]
+  )
+  expect_equal(
+    ref1$strataFile[ref1$outcomeId == 100],
+    ref2$strataFile[ref2$outcomeId == 100]
+  )
+  expect_equal(
+    ref1$outcomeModelFile[ref1$outcomeId == 100],
+    ref2$outcomeModelFile[ref2$outcomeId == 100]
+  )
+
+  # New outcome 101 should have new study pop and outcome model
+  newStudyPop <- ref2$studyPopFile[ref2$outcomeId == 101]
+  expect_true(newStudyPop != "")
+  expect_true(!(newStudyPop %in% ref1$studyPopFile))
+
+  newOm <- ref2$outcomeModelFile[ref2$outcomeId == 101]
+  expect_true(newOm != "")
+  expect_true(!(newOm %in% ref1$outcomeModelFile))
+})
+
+test_that("Adding an outcome does NOT affect strata file of existing outcomes", {
+  analysis <- makeBasicAnalysis()
+
+  tcos1 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+
+  tcos2 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 101, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+
+  ref1 <- buildRef(list(analysis), tcos1)
+  ref2 <- buildRef(list(analysis), tcos2)
+
+  # Strata file for outcome 100 should be unchanged
+  expect_equal(
+    ref1$strataFile[ref1$outcomeId == 100],
+    ref2$strataFile[ref2$outcomeId == 100]
+  )
+})
+
+
+# ===========================================================================
+# Scenario 3: Changing PS covariates cascades to all downstream artifacts
+# CmData changes → new loadHash → new StudyPop, PS, Strata, OutcomeModel
+# ===========================================================================
+
+test_that("Changing covariates cascades: all filenames change", {
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = FeatureExtraction::createCovariateSettings(
+        useDemographicsGender = TRUE,
+        useDemographicsAge = TRUE
+      )
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  # Same analysis but with additional covariate
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = FeatureExtraction::createCovariateSettings(
+        useDemographicsGender = TRUE,
+        useDemographicsAge = TRUE,
+        useDemographicsRace = TRUE  # CHANGED: added covariate
+      )
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref1 <- buildRef(list(analysis1), tcos)
+  ref2 <- buildRef(list(analysis2), tcos)
+
+  # CohortMethodData files should differ (covariates changed)
+  expect_true(ref1$cohortMethodDataFile[1] != ref2$cohortMethodDataFile[1])
+
+  # Study population files should differ (cascaded via loadHash)
+  expect_true(ref1$studyPopFile[1] != ref2$studyPopFile[1])
+
+  # Shared PS files should differ (cascaded via loadHash)
+  expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1])
+
+  # Strata files should differ (cascaded)
+  expect_true(ref1$strataFile[1] != ref2$strataFile[1])
+
+  # Outcome model files should differ (cascaded)
+  expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1])
+})
+
+test_that("Changing PS args cascades to PS and downstream but NOT CmData", {
+  covSettings <- FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE,
+    useDemographicsAge = TRUE
+  )
+
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 100000),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 50000),  # CHANGED
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref1 <- buildRef(list(analysis1), tcos)
+  ref2 <- buildRef(list(analysis2), tcos)
+
+  # CohortMethodData should be REUSED (load args unchanged)
+  expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1])
+
+  # StudyPop should be REUSED (studyPop args unchanged, same loadHash)
+  expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1])
+
+  # Shared PS should DIFFER (PS args changed)
+  expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1])
+
+  # Strata and outcome model should DIFFER (cascaded from PS change)
+  expect_true(ref1$strataFile[1] != ref2$strataFile[1])
+  expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1])
+})
+
+test_that("Changing study pop args cascades to StudyPop and downstream but NOT CmData", {
+  covSettings <- FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE
+  )
+
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(minDaysAtRisk = 1),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(minDaysAtRisk = 30),  # CHANGED
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref1 <- buildRef(list(analysis1), tcos)
+  ref2 <- buildRef(list(analysis2), tcos)
+
+  # CohortMethodData should be REUSED
+  expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1])
+
+  # StudyPop should DIFFER
+  expect_true(ref1$studyPopFile[1] != ref2$studyPopFile[1])
+
+  # PS should DIFFER (depends on studyPop args in hash)
+  expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1])
+
+  # Strata should DIFFER
+  expect_true(ref1$strataFile[1] != ref2$strataFile[1])
+
+  # Outcome model should DIFFER
+  expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1])
+})
+
+
+# ===========================================================================
+# Scenario 4: databaseId differentiation
+# Same settings on different databases MUST produce different filenames
+# ===========================================================================
+
+test_that("Different databaseId produces entirely different filenames", {
+  analysis <- makeBasicAnalysis()
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  refA <- buildRef(list(analysis), tcos, databaseId = "database_A")
+  refB <- buildRef(list(analysis), tcos, databaseId = "database_B")
+
+  # ALL filenames should differ
+
+  expect_true(refA$cohortMethodDataFile[1] != refB$cohortMethodDataFile[1])
+  expect_true(refA$studyPopFile[1] != refB$studyPopFile[1])
+  expect_true(refA$sharedPsFile[1] != refB$sharedPsFile[1])
+  expect_true(refA$strataFile[1] != refB$strataFile[1])
+  expect_true(refA$outcomeModelFile[1] != refB$outcomeModelFile[1])
+})
+
+
+# ===========================================================================
+# Scenario 5: Changing matching/stratification args
+# Should change strata + outcome models but NOT CmData, StudyPop, or PS
+# ===========================================================================
+
+test_that("Changing matching args changes strata but not CmData/StudyPop/PS", {
+  covSettings <- FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE
+  )
+
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(caliper = 0.2),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(caliper = 0.1),  # CHANGED caliper
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref1 <- buildRef(list(analysis1), tcos)
+  ref2 <- buildRef(list(analysis2), tcos)
+
+  # CmData should be REUSED
+  expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1])
+
+  # StudyPop should be REUSED
+  expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1])
+
+  # Shared PS should be REUSED (matching doesn't affect PS fitting)
+  expect_equal(ref1$sharedPsFile[1], ref2$sharedPsFile[1])
+
+  # Strata should DIFFER (matching caliper changed)
+  expect_true(ref1$strataFile[1] != ref2$strataFile[1])
+
+  # Outcome model should DIFFER (depends on strata)
+  expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1])
+})
+
+
+# ===========================================================================
+# Scenario 6: Changing outcome model args only
+# Should change ONLY outcome models
+# ===========================================================================
+
+test_that("Changing outcome model args changes only outcome model files", {
+  covSettings <- FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE
+  )
+
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "logistic")  # CHANGED
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref1 <- buildRef(list(analysis1), tcos)
+  ref2 <- buildRef(list(analysis2), tcos)
+
+  # CmData, StudyPop, PS, Strata should all be REUSED
+  expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1])
+  expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1])
+  expect_equal(ref1$sharedPsFile[1], ref2$sharedPsFile[1])
+  expect_equal(ref1$strataFile[1], ref2$strataFile[1])
+
+  # Only outcome model should DIFFER
+  expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1])
+})
+
+
+# ===========================================================================
+# Scenario 7: Multiple analyses sharing CmData
+# ===========================================================================
+
+test_that("Analyses with same load args share CmData file", {
+  covSettings <- FeatureExtraction::createCovariateSettings(
+    useDemographicsGender = TRUE
+  )
+
+  # Two analyses with same getDbCohortMethodDataArgs but different PS args
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 100000),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  analysis2 <- createCmAnalysis(
+    analysisId = 2,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = covSettings
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(),
+    createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 50000),
+    matchOnPsArgs = createMatchOnPsArgs(),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE))
+    )
+  )
+
+  ref <- buildRef(list(analysis1, analysis2), tcos)
+
+  # Both rows should share the same CmData file
+  cmFiles <- unique(ref$cohortMethodDataFile)
+  expect_equal(length(cmFiles), 1)
+
+  # But have different PS files
+  psFiles <- unique(ref$sharedPsFile[ref$sharedPsFile != ""])
+  expect_equal(length(psFiles), 2)
+})
+
+
+# ===========================================================================
+# Scenario 8: Hash stability across invocations
+# ===========================================================================
+
+test_that("Reference table is deterministic across repeated calls", {
+  analysis <- makeBasicAnalysis()
+
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 100, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 200, outcomeOfInterest = FALSE)
+      )
+    )
+  )
+
+  ref1 <- buildRef(list(analysis), tcos, databaseId = "stable_test")
+  ref2 <- buildRef(list(analysis), tcos, databaseId = "stable_test")
+
+  # All filenames should be identical
+  expect_equal(ref1$cohortMethodDataFile, ref2$cohortMethodDataFile)
+  expect_equal(ref1$studyPopFile, ref2$studyPopFile)
+  expect_equal(ref1$sharedPsFile, ref2$sharedPsFile)
+  expect_equal(ref1$strataFile, ref2$strataFile)
+  expect_equal(ref1$outcomeModelFile, ref2$outcomeModelFile)
+})

From c1e8d37cd312bed9a816fc5d7a9920ba628bdb1c Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Thu, 30 Apr 2026 10:15:00 -0700
Subject: [PATCH 04/10] Changes to eunomia tests to match new filenames

---
 tests/testthat/test-eunomia.R | 13 +++++++++----
 1 file changed, 9 insertions(+), 4 deletions(-)

diff --git a/tests/testthat/test-eunomia.R b/tests/testthat/test-eunomia.R
index ffb0131..e9d8e32 100644
--- a/tests/testthat/test-eunomia.R
+++ b/tests/testthat/test-eunomia.R
@@ -705,10 +705,15 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
     nestedFiles <- result$cohortMethodDataFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)]
     nonNestedFiles <- result$cohortMethodDataFile[is.na(result$nestingCohortId)]
     expect_true(length(intersect(nestedFiles, nonNestedFiles)) == 0)
-    expect_equal(length(grep("_t1_c2_n4", result$sharedBalanceFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$filteredForbalanceFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$balanceFile)), 2)
-    expect_equal(length(grep("_t1_c2_n4", result$outcomeModelFile)), 2)
+
+    # Verify nested rows have distinct balance/outcome filenames from non-nested rows
+    nestedBalance <- result$sharedBalanceFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)]
+    nonNestedBalance <- result$sharedBalanceFile[is.na(result$nestingCohortId)]
+    expect_true(length(intersect(nestedBalance[nestedBalance != ""], nonNestedBalance[nonNestedBalance != ""])) == 0)
+
+    nestedOm <- result$outcomeModelFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)]
+    nonNestedOm <- result$outcomeModelFile[is.na(result$nestingCohortId)]
+    expect_true(length(intersect(nestedOm[nestedOm != ""], nonNestedOm[nonNestedOm != ""])) == 0)
 
     uniqueCmdFiles <- result[!duplicated(result$cohortMethodDataFile), ]
     for (i in seq_len(nrow(uniqueCmdFiles))) {

From 9e80057552052b62e4c697fb4cea343f51986052 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Thu, 30 Apr 2026 11:45:33 -0700
Subject: [PATCH 05/10] missing doc files

---
 man/ArtifactStore.Rd      | 193 +++++++++++++++++++++++++++++++
 man/LocalArtifactStore.Rd | 235 ++++++++++++++++++++++++++++++++++++++
 man/runCmAnalyses.Rd      |  12 +-
 3 files changed, 439 insertions(+), 1 deletion(-)
 create mode 100644 man/ArtifactStore.Rd
 create mode 100644 man/LocalArtifactStore.Rd

diff --git a/man/ArtifactStore.Rd b/man/ArtifactStore.Rd
new file mode 100644
index 0000000..517dab3
--- /dev/null
+++ b/man/ArtifactStore.Rd
@@ -0,0 +1,193 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ArtifactStore.R
+\name{ArtifactStore}
+\alias{ArtifactStore}
+\title{Abstract Artifact Store}
+\description{
+Abstract R6 interface for storing and retrieving analysis artifacts.
+Subclass this to implement custom storage backends (e.g., S3, database).
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-ArtifactStore-exists}{\code{ArtifactStore$exists()}}
+\item \href{#method-ArtifactStore-readRDS}{\code{ArtifactStore$readRDS()}}
+\item \href{#method-ArtifactStore-saveRDS}{\code{ArtifactStore$saveRDS()}}
+\item \href{#method-ArtifactStore-readAndromeda}{\code{ArtifactStore$readAndromeda()}}
+\item \href{#method-ArtifactStore-saveAndromeda}{\code{ArtifactStore$saveAndromeda()}}
+\item \href{#method-ArtifactStore-listArtifacts}{\code{ArtifactStore$listArtifacts()}}
+\item \href{#method-ArtifactStore-delete}{\code{ArtifactStore$delete()}}
+\item \href{#method-ArtifactStore-ensureDir}{\code{ArtifactStore$ensureDir()}}
+\item \href{#method-ArtifactStore-clone}{\code{ArtifactStore$clone()}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-exists"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-exists}{}}}
+\subsection{Method \code{exists()}}{
+Check if an artifact exists.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$exists(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key (relative path).}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Logical.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-readRDS"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-readRDS}{}}}
+\subsection{Method \code{readRDS()}}{
+Read an RDS artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$readRDS(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+The deserialized R object.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-saveRDS"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-saveRDS}{}}}
+\subsection{Method \code{saveRDS()}}{
+Save an RDS artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$saveRDS(object, key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{object}}{The R object to save.}
+
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-readAndromeda"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-readAndromeda}{}}}
+\subsection{Method \code{readAndromeda()}}{
+Read an Andromeda (zip) artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$readAndromeda(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+An Andromeda object.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-saveAndromeda"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-saveAndromeda}{}}}
+\subsection{Method \code{saveAndromeda()}}{
+Save an Andromeda (zip) artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$saveAndromeda(object, key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{object}}{The Andromeda object to save.}
+
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-listArtifacts"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-listArtifacts}{}}}
+\subsection{Method \code{listArtifacts()}}{
+List artifacts matching a prefix.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$listArtifacts(prefix = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{prefix}}{Character or NULL. Filter to keys starting with this prefix.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character vector of keys.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-delete"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-delete}{}}}
+\subsection{Method \code{delete()}}{
+Delete an artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$delete(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-ensureDir"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-ensureDir}{}}}
+\subsection{Method \code{ensureDir()}}{
+Ensure a directory exists for a given key.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$ensureDir(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-ArtifactStore-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-ArtifactStore-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{ArtifactStore$clone(deep = FALSE)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{</div>}}
+}
+}
+}
diff --git a/man/LocalArtifactStore.Rd b/man/LocalArtifactStore.Rd
new file mode 100644
index 0000000..4ae5c65
--- /dev/null
+++ b/man/LocalArtifactStore.Rd
@@ -0,0 +1,235 @@
+% Generated by roxygen2: do not edit by hand
+% Please edit documentation in R/ArtifactStore.R
+\name{LocalArtifactStore}
+\alias{LocalArtifactStore}
+\title{Local Filesystem Artifact Store}
+\description{
+Implementation of \link{ArtifactStore} that reads and writes artifacts to a local
+filesystem directory. This is the default storage backend.
+}
+\section{Super class}{
+\code{\link[CohortMethod:ArtifactStore]{CohortMethod::ArtifactStore}} -> \code{LocalArtifactStore}
+}
+\section{Methods}{
+\subsection{Public methods}{
+\itemize{
+\item \href{#method-LocalArtifactStore-new}{\code{LocalArtifactStore$new()}}
+\item \href{#method-LocalArtifactStore-exists}{\code{LocalArtifactStore$exists()}}
+\item \href{#method-LocalArtifactStore-readRDS}{\code{LocalArtifactStore$readRDS()}}
+\item \href{#method-LocalArtifactStore-saveRDS}{\code{LocalArtifactStore$saveRDS()}}
+\item \href{#method-LocalArtifactStore-readAndromeda}{\code{LocalArtifactStore$readAndromeda()}}
+\item \href{#method-LocalArtifactStore-saveAndromeda}{\code{LocalArtifactStore$saveAndromeda()}}
+\item \href{#method-LocalArtifactStore-listArtifacts}{\code{LocalArtifactStore$listArtifacts()}}
+\item \href{#method-LocalArtifactStore-delete}{\code{LocalArtifactStore$delete()}}
+\item \href{#method-LocalArtifactStore-ensureDir}{\code{LocalArtifactStore$ensureDir()}}
+\item \href{#method-LocalArtifactStore-getFullPath}{\code{LocalArtifactStore$getFullPath()}}
+\item \href{#method-LocalArtifactStore-clone}{\code{LocalArtifactStore$clone()}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-new"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-new}{}}}
+\subsection{Method \code{new()}}{
+Create a new local artifact store.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$new(basePath)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{basePath}}{Character. The root directory for artifact storage.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-exists"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-exists}{}}}
+\subsection{Method \code{exists()}}{
+Check if an artifact exists.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$exists(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key (relative path).}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Logical.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-readRDS"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-readRDS}{}}}
+\subsection{Method \code{readRDS()}}{
+Read an RDS artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$readRDS(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+The deserialized R object.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-saveRDS"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-saveRDS}{}}}
+\subsection{Method \code{saveRDS()}}{
+Save an RDS artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$saveRDS(object, key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{object}}{The R object to save.}
+
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-readAndromeda"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-readAndromeda}{}}}
+\subsection{Method \code{readAndromeda()}}{
+Read an Andromeda (zip) artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$readAndromeda(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+An Andromeda object.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-saveAndromeda"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-saveAndromeda}{}}}
+\subsection{Method \code{saveAndromeda()}}{
+Save an Andromeda (zip) artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$saveAndromeda(object, key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{object}}{The Andromeda object to save.}
+
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-listArtifacts"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-listArtifacts}{}}}
+\subsection{Method \code{listArtifacts()}}{
+List artifacts matching a prefix.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$listArtifacts(prefix = NULL)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{prefix}}{Character or NULL. Filter to keys starting with this prefix.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character vector of keys.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-delete"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-delete}{}}}
+\subsection{Method \code{delete()}}{
+Delete an artifact.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$delete(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-ensureDir"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-ensureDir}{}}}
+\subsection{Method \code{ensureDir()}}{
+Ensure a directory exists for a given key.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$ensureDir(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-getFullPath"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-getFullPath}{}}}
+\subsection{Method \code{getFullPath()}}{
+Get the full filesystem path for a key.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$getFullPath(key)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{key}}{Character. The artifact key.}
+}
+\if{html}{\out{</div>}}
+}
+\subsection{Returns}{
+Character. The full path.
+}
+}
+\if{html}{\out{<hr>}}
+\if{html}{\out{<a id="method-LocalArtifactStore-clone"></a>}}
+\if{latex}{\out{\hypertarget{method-LocalArtifactStore-clone}{}}}
+\subsection{Method \code{clone()}}{
+The objects of this class are cloneable with this method.
+\subsection{Usage}{
+\if{html}{\out{<div class="r">}}\preformatted{LocalArtifactStore$clone(deep = FALSE)}\if{html}{\out{</div>}}
+}
+
+\subsection{Arguments}{
+\if{html}{\out{<div class="arguments">}}
+\describe{
+\item{\code{deep}}{Whether to make a deep clone.}
+}
+\if{html}{\out{</div>}}
+}
+}
+}
diff --git a/man/runCmAnalyses.Rd b/man/runCmAnalyses.Rd
index 2044f01..e5268ca 100644
--- a/man/runCmAnalyses.Rd
+++ b/man/runCmAnalyses.Rd
@@ -16,7 +16,9 @@ runCmAnalyses(
   nestingCohortTable = "cohort",
   outputFolder = "./CohortMethodOutput",
   multiThreadingSettings = createMultiThreadingSettings(),
-  cmAnalysesSpecifications
+  cmAnalysesSpecifications,
+  databaseId,
+  artifactStore = NULL
 )
 }
 \arguments{
@@ -69,6 +71,14 @@ the \code{\link[=createMultiThreadingSettings]{createMultiThreadingSettings()}}
 
 \item{cmAnalysesSpecifications}{An object of type \code{CmAnalysesSpecifications} as created using
 the \code{createCmAnalysesSpecifications()}.}
+
+\item{databaseId}{A unique identifier for the database being used. This is
+baked into artifact hashes to prevent accidental reuse of
+cached results from a different database. Required.}
+
+\item{artifactStore}{An object inheriting from \link{ArtifactStore} used to read and
+write cached artifacts. If NULL (default), a
+\link{LocalArtifactStore} backed by \code{outputFolder} is used.}
 }
 \value{
 A tibble describing for each target-comparator-outcome-analysisId combination where the intermediary and

From d1eafdd6e7cbfeeb71c000f04c7bb7b5b0e5e332 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Mon, 4 May 2026 14:36:12 -0700
Subject: [PATCH 06/10] Added more tests to check results cache ar written to
 disk

---
 tests/testthat/test-cachingScenarios.R | 279 +++++++++++++++++++++++++
 1 file changed, 279 insertions(+)

diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R
index 1136e17..e2bba951 100644
--- a/tests/testthat/test-cachingScenarios.R
+++ b/tests/testthat/test-cachingScenarios.R
@@ -630,3 +630,282 @@ test_that("Reference table is deterministic across repeated calls", {
   expect_equal(ref1$strataFile, ref2$strataFile)
   expect_equal(ref1$outcomeModelFile, ref2$outcomeModelFile)
 })
+
+
+# ===========================================================================
+# End-to-end disk reuse tests (require Eunomia)
+# These tests verify that artifact files are actually written to and reused
+# from disk, not just that filenames are computed correctly.
+# ===========================================================================
+
+if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
+
+  # Minimal analysis used across all E2E tests: no PS, no matching, simple cox
+  makeEunomiaAnalysis <- function(analysisId = 1) {
+    createCmAnalysis(
+      analysisId = analysisId,
+      getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+        firstExposureOnly = TRUE,
+        washoutPeriod = 183,
+        covariateSettings = FeatureExtraction::createCovariateSettings(
+          useDemographicsGender = TRUE,
+          useDemographicsAge = TRUE
+        )
+      ),
+      createStudyPopulationArgs = createCreateStudyPopulationArgs(
+        removeSubjectsWithPriorOutcome = TRUE,
+        minDaysAtRisk = 1,
+        riskWindowStart = 0,
+        startAnchor = "cohort start",
+        riskWindowEnd = 30,
+        endAnchor = "cohort end"
+      ),
+      fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+    )
+  }
+
+  test_that("E2E: files are actually written to disk", {
+    outputFolder <- tempfile(pattern = "cmCache")
+    on.exit(unlink(outputFolder, recursive = TRUE))
+
+    result <- runCmAnalyses(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      exposureTable = "cohort",
+      outcomeTable = "cohort",
+      outputFolder = outputFolder,
+      databaseId = "Eunomia",
+      cmAnalysesSpecifications = createCmAnalysesSpecifications(
+        cmAnalysisList = list(makeEunomiaAnalysis()),
+        targetComparatorOutcomesList = list(
+          createTargetComparatorOutcomes(
+            targetId = 1,
+            comparatorId = 2,
+            outcomes = list(createOutcome(outcomeId = 3, outcomeOfInterest = TRUE))
+          )
+        )
+      )
+    )
+
+    # Every non-empty filename in the reference table must exist on disk
+    checkFiles <- function(col) {
+      paths <- result[[col]]
+      paths <- paths[paths != ""]
+      paths <- unique(paths)
+      missing <- paths[!file.exists(file.path(outputFolder, paths))]
+      expect_equal(length(missing), 0,
+                   info = sprintf("Missing %s files: %s", col, paste(missing, collapse = ", ")))
+    }
+
+    checkFiles("cohortMethodDataFile")
+    checkFiles("studyPopFile")
+    checkFiles("outcomeModelFile")
+  })
+
+  test_that("E2E: adding a new outcome reuses CmData and computes only new outcome model", {
+    outputFolder <- tempfile(pattern = "cmCacheReuse")
+    on.exit(unlink(outputFolder, recursive = TRUE))
+
+    analysis <- makeEunomiaAnalysis()
+    tcos_base <- createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 3, outcomeOfInterest = TRUE)
+      )
+    )
+    tcos_extended <- createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 3, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 4, outcomeOfInterest = TRUE)  # NEW outcome
+      )
+    )
+
+    # First run: outcomes [3]
+    result1 <- runCmAnalyses(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      exposureTable = "cohort",
+      outcomeTable = "cohort",
+      outputFolder = outputFolder,
+      databaseId = "Eunomia",
+      cmAnalysesSpecifications = createCmAnalysesSpecifications(
+        cmAnalysisList = list(analysis),
+        targetComparatorOutcomesList = list(tcos_base)
+      )
+    )
+
+    # Record modification times of existing artifacts after first run
+    cmDataPath <- file.path(outputFolder, result1$cohortMethodDataFile[1])
+    studyPopPath <- file.path(outputFolder, result1$studyPopFile[1])
+    om3Path <- file.path(outputFolder, result1$outcomeModelFile[1])
+
+    expect_true(file.exists(cmDataPath))
+    expect_true(file.exists(studyPopPath))
+    expect_true(file.exists(om3Path))
+
+    mtime_cmData <- file.info(cmDataPath)$mtime
+    mtime_studyPop <- file.info(studyPopPath)$mtime
+    mtime_om3 <- file.info(om3Path)$mtime
+
+    # Brief sleep to ensure mtime would differ if files were rewritten
+    Sys.sleep(1)
+
+    # Second run: add outcome 4
+    result2 <- runCmAnalyses(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      exposureTable = "cohort",
+      outcomeTable = "cohort",
+      outputFolder = outputFolder,
+      databaseId = "Eunomia",
+      cmAnalysesSpecifications = createCmAnalysesSpecifications(
+        cmAnalysisList = list(analysis),
+        targetComparatorOutcomesList = list(tcos_extended)
+      )
+    )
+
+    # CmData file: same filename, NOT re-written (mtime unchanged)
+    expect_equal(
+      result1$cohortMethodDataFile[1],
+      result2$cohortMethodDataFile[result2$outcomeId == 3]
+    )
+    expect_equal(file.info(cmDataPath)$mtime, mtime_cmData)
+
+    # StudyPop for outcome 3: same filename, NOT re-written
+    expect_equal(
+      result1$studyPopFile[1],
+      result2$studyPopFile[result2$outcomeId == 3]
+    )
+    expect_equal(file.info(studyPopPath)$mtime, mtime_studyPop)
+
+    # Outcome model for outcome 3: same filename, NOT re-written
+    expect_equal(
+      result1$outcomeModelFile[1],
+      result2$outcomeModelFile[result2$outcomeId == 3]
+    )
+    expect_equal(file.info(om3Path)$mtime, mtime_om3)
+
+    # Outcome 4: new outcome model created
+    om4Path <- file.path(outputFolder, result2$outcomeModelFile[result2$outcomeId == 4])
+    expect_true(file.exists(om4Path))
+
+    # Outcome 4's study pop: newly created
+    sp4Path <- file.path(outputFolder, result2$studyPopFile[result2$outcomeId == 4])
+    expect_true(file.exists(sp4Path))
+
+    # New outcome 4 has a DIFFERENT filename than outcome 3
+    expect_false(
+      result2$outcomeModelFile[result2$outcomeId == 3] ==
+        result2$outcomeModelFile[result2$outcomeId == 4]
+    )
+  })
+
+  test_that("E2E: changing settings forces recomputation with new filename", {
+    outputFolder <- tempfile(pattern = "cmCacheInvalidate")
+    on.exit(unlink(outputFolder, recursive = TRUE))
+
+    tcos <- createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 3, outcomeOfInterest = TRUE))
+    )
+
+    analysis_v1 <- createCmAnalysis(
+      analysisId = 1,
+      getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+        firstExposureOnly = TRUE,
+        washoutPeriod = 183,
+        covariateSettings = FeatureExtraction::createCovariateSettings(
+          useDemographicsGender = TRUE,
+          useDemographicsAge = TRUE
+        )
+      ),
+      createStudyPopulationArgs = createCreateStudyPopulationArgs(
+        minDaysAtRisk = 1,
+        riskWindowStart = 0,
+        startAnchor = "cohort start",
+        riskWindowEnd = 30,
+        endAnchor = "cohort end"
+      ),
+      fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+    )
+
+    analysis_v2 <- createCmAnalysis(
+      analysisId = 1,
+      getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+        firstExposureOnly = TRUE,
+        washoutPeriod = 183,
+        covariateSettings = FeatureExtraction::createCovariateSettings(
+          useDemographicsGender = TRUE,
+          useDemographicsAge = TRUE
+        )
+      ),
+      createStudyPopulationArgs = createCreateStudyPopulationArgs(
+        minDaysAtRisk = 30,  # CHANGED: was 1, now 30
+        riskWindowStart = 0,
+        startAnchor = "cohort start",
+        riskWindowEnd = 30,
+        endAnchor = "cohort end"
+      ),
+      fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+    )
+
+    # First run
+    result_v1 <- runCmAnalyses(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      exposureTable = "cohort",
+      outcomeTable = "cohort",
+      outputFolder = outputFolder,
+      databaseId = "Eunomia",
+      cmAnalysesSpecifications = createCmAnalysesSpecifications(
+        cmAnalysisList = list(analysis_v1),
+        targetComparatorOutcomesList = list(tcos)
+      )
+    )
+
+    # Record what was created
+    cmDataFile_v1 <- result_v1$cohortMethodDataFile[1]
+    studyPopFile_v1 <- result_v1$studyPopFile[1]
+    omFile_v1 <- result_v1$outcomeModelFile[1]
+
+    expect_true(file.exists(file.path(outputFolder, cmDataFile_v1)))
+    expect_true(file.exists(file.path(outputFolder, studyPopFile_v1)))
+    expect_true(file.exists(file.path(outputFolder, omFile_v1)))
+
+    # Second run with changed study pop args
+    result_v2 <- runCmAnalyses(
+      connectionDetails = connectionDetails,
+      cdmDatabaseSchema = "main",
+      exposureTable = "cohort",
+      outcomeTable = "cohort",
+      outputFolder = outputFolder,
+      databaseId = "Eunomia",
+      cmAnalysesSpecifications = createCmAnalysesSpecifications(
+        cmAnalysisList = list(analysis_v2),
+        targetComparatorOutcomesList = list(tcos)
+      )
+    )
+
+    cmDataFile_v2 <- result_v2$cohortMethodDataFile[1]
+    studyPopFile_v2 <- result_v2$studyPopFile[1]
+    omFile_v2 <- result_v2$outcomeModelFile[1]
+
+    # CmData: same (load args unchanged), study pop and outcome model: new files
+    expect_equal(cmDataFile_v1, cmDataFile_v2)
+    expect_false(studyPopFile_v1 == studyPopFile_v2)
+    expect_false(omFile_v1 == omFile_v2)
+
+    # New files actually exist on disk
+    expect_true(file.exists(file.path(outputFolder, studyPopFile_v2)))
+    expect_true(file.exists(file.path(outputFolder, omFile_v2)))
+
+    # OLD study pop and outcome model files still on disk (not deleted by new run)
+    expect_true(file.exists(file.path(outputFolder, studyPopFile_v1)))
+    expect_true(file.exists(file.path(outputFolder, omFile_v1)))
+  })
+
+}

From be90862951ac72b8d5378d7a192b00e589db4b14 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Mon, 4 May 2026 14:39:07 -0700
Subject: [PATCH 07/10] fix for broken equipoise data load

---
 R/RunAnalyses.R | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
index 42507af..4932d46 100644
--- a/R/RunAnalyses.R
+++ b/R/RunAnalyses.R
@@ -1961,9 +1961,14 @@ addEquipoise <- function(referenceTable, outputFolder) {
       filter(.data$sharedPsFile == "" & .data$psFile != "") |>
       distinct(.data$psFile) |>
       pull()
-    equipoise <- bind_rows(lapply(psFiles, getEquipoise))
-    referenceTable <- referenceTable |>
-      left_join(equipoise, by = join_by("psFile"))
+    if (length(psFiles) > 0) {
+      equipoise <- bind_rows(lapply(psFiles, getEquipoise))
+      referenceTable <- referenceTable |>
+        left_join(equipoise, by = join_by("psFile"))
+    } else {
+      referenceTable <- referenceTable |>
+        mutate(equipoise = NA_real_)
+    }
   }
   return(referenceTable)
 }

From e47fb0d575a4d3364b2cbc02fe8a8cd584d529ed Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Wed, 6 May 2026 10:28:02 -0700
Subject: [PATCH 08/10] shared base population that is based on only the
 target/nesting/database/time at risk settings and not outcome based

---
 R/RunAnalyses.R                        | 257 ++++++++++++++++++-------
 R/StudyPopulation.R                    | 242 +++++++++++++----------
 tests/testthat/test-cachingScenarios.R | 175 ++++++++++++++++-
 3 files changed, 506 insertions(+), 168 deletions(-)

diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
index 4932d46..0d91937 100644
--- a/R/RunAnalyses.R
+++ b/R/RunAnalyses.R
@@ -323,7 +323,60 @@ runCmAnalyses <- function(connectionDetails,
     ParallelLogger::stopCluster(cluster)
   }
 
-  # Create study populations --------------------------------------
+  # Create base populations (shared across outcomes with same risk windows) ---
+  subset <- referenceTable[!duplicated(referenceTable$basePopFile), ]
+  subset <- subset[subset$basePopFile != "", ]
+  subset <- subset[!file.exists(file.path(outputFolder, subset$basePopFile)), ]
+  if (nrow(subset) != 0) {
+    message("*** Creating base populations ***")
+    createBasePopTask <- function(i) {
+      refRow <- subset[i, ]
+      analysisRow <- ParallelLogger::matchInList(
+        cmAnalysesSpecifications$cmAnalysisList,
+        list(analysisId = refRow$analysisId)
+      )[[1]]
+      createStudyPopulationArgs <- analysisRow$createStudyPopulationArgs
+
+      # Apply per-outcome risk window overrides (these affect the base pop)
+      tco <- ParallelLogger::matchInList(
+        cmAnalysesSpecifications$targetComparatorOutcomesList,
+        list(
+          nestingCohortId = if (is.na(refRow$nestingCohortId)) {NULL} else {refRow$nestingCohortId},
+          comparatorId = refRow$comparatorId,
+          targetId = refRow$targetId
+        )
+      )[[1]]
+      outcome <- ParallelLogger::matchInList(
+        tco$outcomes,
+        list(outcomeId = as.numeric(refRow$outcomeId))
+      )
+      if (!is.null(outcome$riskWindowStart)) {
+        createStudyPopulationArgs$riskWindowStart <- outcome$riskWindowStart
+      }
+      if (!is.null(outcome$startAnchor)) {
+        createStudyPopulationArgs$startAnchor <- outcome$startAnchor
+      }
+      if (!is.null(outcome$riskWindowEnd)) {
+        createStudyPopulationArgs$riskWindowEnd <- outcome$riskWindowEnd
+      }
+      if (!is.null(outcome$endAnchor)) {
+        createStudyPopulationArgs$endAnchor <- outcome$endAnchor
+      }
+      task <- list(
+        cohortMethodDataFile = file.path(outputFolder, refRow$cohortMethodDataFile),
+        createStudyPopulationArgs = createStudyPopulationArgs,
+        basePopFile = file.path(outputFolder, refRow$basePopFile)
+      )
+      return(task)
+    }
+    objectsToCreate <- lapply(1:nrow(subset), createBasePopTask)
+    cluster <- ParallelLogger::makeCluster(min(length(objectsToCreate), multiThreadingSettings$createStudyPopThreads))
+    ParallelLogger::clusterRequire(cluster, "CohortMethod")
+    dummy <- ParallelLogger::clusterApply(cluster, objectsToCreate, doCreateBasePopObject)
+    ParallelLogger::stopCluster(cluster)
+  }
+
+  # Create study populations (per-outcome, from base populations) -----------
   subset <- referenceTable[!duplicated(referenceTable$studyPopFile), ]
   subset <- subset[subset$studyPopFile != "", ]
   subset <- subset[!file.exists(file.path(outputFolder, subset$studyPopFile)), ]
@@ -366,10 +419,8 @@ runCmAnalyses <- function(connectionDetails,
         createStudyPopulationArgs$endAnchor <- outcome$endAnchor
       }
       task <- list(
-        cohortMethodDataFile = file.path(
-          outputFolder,
-          refRow$cohortMethodDataFile
-        ),
+        cohortMethodDataFile = file.path(outputFolder, refRow$cohortMethodDataFile),
+        basePopFile = file.path(outputFolder, refRow$basePopFile),
         outcomeId = refRow$outcomeId,
         createStudyPopulationArgs = createStudyPopulationArgs,
         minimizeFileSizes = getOption("minimizeFileSizes"),
@@ -437,7 +488,7 @@ runCmAnalyses <- function(connectionDetails,
             refRow$cohortMethodDataFile
           ),
           args = createPsArgs,
-          createStudyPopulationArgs = analysisRow$createStudyPopulationArgs,
+          basePopFile = file.path(outputFolder, refRow$basePopFile),
           sharedPsFile = file.path(outputFolder, refRow$sharedPsFile)
         )
         return(task)
@@ -750,17 +801,31 @@ doCreateCmDataObject <- function(params) {
   return(NULL)
 }
 
+doCreateBasePopObject <- function(params) {
+  cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile)
+  ParallelLogger::logDebug(sprintf("Calling createBasePopulation() using %s",
+                                   params$cohortMethodDataFile))
+  basePop <- createBasePopulation(cohortMethodData,
+                                  createStudyPopulationArgs = params$createStudyPopulationArgs)
+  saveRDS(basePop, params$basePopFile)
+  return(NULL)
+}
+
 doCreateStudyPopObject <- function(params) {
+  basePop <- readRDS(params$basePopFile)
   cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile)
-  args <- list(
+  ParallelLogger::logDebug(sprintf("Calling addOutcomeToPopulation() using %s for outcomeId %s",
+                                   params$basePopFile,
+                                   params$outcomeId))
+  studyPop <- addOutcomeToPopulation(
+    basePopulation = basePop,
     cohortMethodData = cohortMethodData,
     outcomeId = params$outcomeId,
-    createStudyPopulationArgs = params$createStudyPopulationArgs
+    removeSubjectsWithPriorOutcome = params$createStudyPopulationArgs$removeSubjectsWithPriorOutcome,
+    priorOutcomeLookback = params$createStudyPopulationArgs$priorOutcomeLookback,
+    startAnchor = params$createStudyPopulationArgs$startAnchor,
+    riskWindowStart = params$createStudyPopulationArgs$riskWindowStart
   )
-  ParallelLogger::logDebug(sprintf("Calling createStudyPopulation() using %s for outcomeId %s",
-                                   params$cohortMethodDataFile,
-                                   args$outcomeId))
-  studyPop <- do.call("createStudyPopulation", args)
   if (!is.null(params$minimizeFileSizes) && params$minimizeFileSizes) {
     metaData <- attr(studyPop, "metaData")
     studyPop <- studyPop[, c("rowId", "treatment", "personSeqId", "outcomeCount", "timeAtRisk", "survivalTime")]
@@ -789,20 +854,16 @@ doFitPsModel <- function(params) {
 doFitSharedPsModel <- function(params, refitPsForEveryStudyPopulation) {
   cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile)
   if (refitPsForEveryStudyPopulation) {
-    args <- list(
-      cohortMethodData = cohortMethodData,
-      createStudyPopulationArgs = params$createStudyPopulationArgs
-    )
-    message("Fitting propensity model across all outcomes (ignore messages about 'no outcome specified')")
-    ParallelLogger::logDebug(sprintf("Calling createPs() for shared PS using %s",
-                                     params$cohortMethodDataFile))
-    studyPop <- do.call("createStudyPopulation", args)
+    # Use base population directly (outcome-independent by construction)
+    basePop <- readRDS(params$basePopFile)
+    ParallelLogger::logDebug(sprintf("Calling createPs() for shared PS using base population %s",
+                                     params$basePopFile))
   } else {
-    studyPop <- NULL
+    basePop <- NULL
   }
   args <- list(
     cohortMethodData = cohortMethodData,
-    population = studyPop,
+    population = basePop,
     createPsArgs = params$args
   )
   ps <- do.call("createPs", args)
@@ -1152,13 +1213,85 @@ createReferenceTable <- function(cmAnalysisList,
   )
   referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsId, by = "analysisId")
 
-  # Content-addressable hash for study population files
+  # Compute effective base population args per row (applying per-outcome risk window overrides)
+  # Base pop args include only outcome-independent fields: risk windows, censoring, minDaysAtRisk
+  .getEffectiveBasePopArgsJson <- function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    args <- cmAnalysisList[[aIdx]]$createStudyPopulationArgs
+
+    # Look up per-outcome overrides from targetComparatorOutcomesList
+    tcoMatch <- Filter(function(tco) {
+      tco$targetId == referenceTable$targetId[i] &&
+        tco$comparatorId == referenceTable$comparatorId[i] &&
+        (is.null(tco$nestingCohortId) && is.na(referenceTable$nestingCohortId[i]) ||
+           identical(tco$nestingCohortId, referenceTable$nestingCohortId[i]))
+    }, targetComparatorOutcomesList)
+    if (length(tcoMatch) > 0) {
+      outcomeMatch <- Filter(function(o) o$outcomeId == referenceTable$outcomeId[i],
+                             tcoMatch[[1]]$outcomes)
+      if (length(outcomeMatch) > 0) {
+        outcome <- outcomeMatch[[1]]
+        # Apply risk window overrides that affect the base population
+        baseFields <- list(
+          minDaysAtRisk = args$minDaysAtRisk,
+          maxDaysAtRisk = args$maxDaysAtRisk,
+          riskWindowStart = if (!is.null(outcome$riskWindowStart)) outcome$riskWindowStart else args$riskWindowStart,
+          startAnchor = if (!is.null(outcome$startAnchor)) outcome$startAnchor else args$startAnchor,
+          riskWindowEnd = if (!is.null(outcome$riskWindowEnd)) outcome$riskWindowEnd else args$riskWindowEnd,
+          endAnchor = if (!is.null(outcome$endAnchor)) outcome$endAnchor else args$endAnchor,
+          censorAtNewRiskWindow = args$censorAtNewRiskWindow
+        )
+        return(as.character(jsonlite::toJSON(baseFields[order(names(baseFields))],
+                                            auto_unbox = TRUE, digits = NA, null = "null")))
+      }
+    }
+    # No overrides — use analysis-level args
+    .serializeBasePopArgs(args)
+  }
+
+  # Compute outcome-specific args JSON (removeSubjectsWithPriorOutcome + priorOutcomeLookback)
+  .getOutcomeSpecificArgsJson <- function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
+    args <- cmAnalysisList[[aIdx]]$createStudyPopulationArgs
+    priorOutcomeLookback <- args$priorOutcomeLookback
+    removeSubjectsWithPriorOutcome <- args$removeSubjectsWithPriorOutcome
+
+    # Check for per-outcome priorOutcomeLookback override
+    tcoMatch <- Filter(function(tco) {
+      tco$targetId == referenceTable$targetId[i] &&
+        tco$comparatorId == referenceTable$comparatorId[i] &&
+        (is.null(tco$nestingCohortId) && is.na(referenceTable$nestingCohortId[i]) ||
+           identical(tco$nestingCohortId, referenceTable$nestingCohortId[i]))
+    }, targetComparatorOutcomesList)
+    if (length(tcoMatch) > 0) {
+      outcomeMatch <- Filter(function(o) o$outcomeId == referenceTable$outcomeId[i],
+                             tcoMatch[[1]]$outcomes)
+      if (length(outcomeMatch) > 0 && !is.null(outcomeMatch[[1]]$priorOutcomeLookback)) {
+        priorOutcomeLookback <- outcomeMatch[[1]]$priorOutcomeLookback
+      }
+    }
+    as.character(jsonlite::toJSON(list(
+      removeSubjectsWithPriorOutcome = removeSubjectsWithPriorOutcome,
+      priorOutcomeLookback = priorOutcomeLookback
+    ), auto_unbox = TRUE, digits = NA, null = "null"))
+  }
+
+  basePopArgsJsons <- vapply(seq_len(nrow(referenceTable)), .getEffectiveBasePopArgsJson, character(1))
+  outcomeSpecificArgsJsons <- vapply(seq_len(nrow(referenceTable)), .getOutcomeSpecificArgsJson, character(1))
+
+  # Content-addressable hash for base population files (outcome-independent)
+  referenceTable$basePopHash <- vapply(seq_len(nrow(referenceTable)), function(i) {
+    .contentHash(databaseId, referenceTable$loadHash[i], basePopArgsJsons[i])
+  }, character(1))
+  referenceTable$basePopFile <- .createBasePopulationFileName(referenceTable$basePopHash)
+
+  # Content-addressable hash for study population files (outcome-specific)
   referenceTable$studyPopFile <- vapply(seq_len(nrow(referenceTable)), function(i) {
     hash <- .contentHash(
       databaseId,
-      referenceTable$loadHash[i],
-      studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
-      referenceTable$outcomeId[i]
+      referenceTable$basePopHash[i],
+      referenceTable$outcomeId[i],
+      outcomeSpecificArgsJsons[i]
     )
     .createStudyPopulationFileName(hash)
   }, character(1))
@@ -1178,52 +1311,26 @@ createReferenceTable <- function(cmAnalysisList,
   idxWithPs <- !(referenceTable$psArgsId %in% noPsIds)
   referenceTable$psFile <- ""
   referenceTable$psFile[idxWithPs] <- vapply(which(idxWithPs), function(i) {
+    aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
     hash <- .contentHash(
       databaseId,
-      referenceTable$loadHash[i],
-      studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
-      createPsArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]],
+      referenceTable$basePopHash[i],
+      createPsArgsJsons[[aIdx]],
       referenceTable$outcomeId[i]
     )
     .createPsFileName(hash)
   }, character(1))
 
+  # Shared PS: uses basePopHash directly (outcome-independent, replaces equivalent() logic)
   referenceTable$sharedPsFile <- ""
   if (!refitPsForEveryOutcome) {
     if (refitPsForEveryStudyPopulation) {
-      # Find equivalent studyPopArgs, so we can reuse PS over those as well:
-      equivalent <- function(studyPopArgs1, studyPopArgs2) {
-        if (studyPopArgs1$minDaysAtRisk == 0 && studyPopArgs2$minDaysAtRisk == 0) {
-          return(TRUE)
-        } else if (studyPopArgs1$minDaysAtRisk != studyPopArgs2$minDaysAtRisk ||
-                   studyPopArgs1$riskWindowStart != studyPopArgs2$riskWindowStart ||
-                   studyPopArgs1$startAnchor != studyPopArgs2$startAnchor ||
-                   studyPopArgs1$riskWindowEnd != studyPopArgs2$riskWindowEnd ||
-                   studyPopArgs1$endAnchor != studyPopArgs2$endAnchor ||
-                   studyPopArgs1$censorAtNewRiskWindow != studyPopArgs2$censorAtNewRiskWindow) {
-          return(FALSE)
-        } else {
-          return(TRUE)
-        }
-      }
-      # Compute the canonical (first equivalent) studyPop JSON for hashing
-      studyPopArgsEquivalentIdx <- seq_along(cmAnalysisList)
-      for (i in seq_along(cmAnalysisList)) {
-        for (j in seq_len(i - 1)) {
-          if (equivalent(cmAnalysisList[[i]]$createStudyPopulationArgs,
-                         cmAnalysisList[[j]]$createStudyPopulationArgs)) {
-            studyPopArgsEquivalentIdx[i] <- j
-            break
-          }
-        }
-      }
+      # basePopHash already captures equivalence for risk window parameters
       referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) {
         aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
-        equivIdx <- studyPopArgsEquivalentIdx[aIdx]
         hash <- .contentHash(
           databaseId,
-          referenceTable$loadHash[i],
-          studyPopArgsJsons[[equivIdx]],
+          referenceTable$basePopHash[i],
           createPsArgsJsons[[aIdx]]
         )
         .createPsFileName(hash)
@@ -1264,8 +1371,7 @@ createReferenceTable <- function(cmAnalysisList,
     aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
     hash <- .contentHash(
       databaseId,
-      referenceTable$loadHash[i],
-      studyPopArgsJsons[[aIdx]],
+      referenceTable$basePopHash[i],
       createPsArgsJsons[[aIdx]],
       strataArgsJsons[[aIdx]],
       referenceTable$outcomeId[i]
@@ -1294,8 +1400,7 @@ createReferenceTable <- function(cmAnalysisList,
       aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
       hash <- .contentHash(
         databaseId,
-        referenceTable$loadHash[i],
-        studyPopArgsJsons[[aIdx]],
+        referenceTable$basePopHash[i],
         createPsArgsJsons[[aIdx]],
         strataArgsJsons[[aIdx]],
         sharedBalanceArgsJsons[[aIdx]]
@@ -1342,8 +1447,7 @@ createReferenceTable <- function(cmAnalysisList,
     aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
     hash <- .contentHash(
       databaseId,
-      referenceTable$loadHash[i],
-      studyPopArgsJsons[[aIdx]],
+      referenceTable$basePopHash[i],
       createPsArgsJsons[[aIdx]],
       strataArgsJsons[[aIdx]],
       balanceArgsJsons[[aIdx]],
@@ -1403,8 +1507,7 @@ createReferenceTable <- function(cmAnalysisList,
     aIdx <- match(referenceTable$analysisId[i], analyses$analysisId)
     hash <- .contentHash(
       databaseId,
-      referenceTable$loadHash[i],
-      studyPopArgsJsons[[aIdx]],
+      referenceTable$basePopHash[i],
       createPsArgsJsons[[aIdx]],
       strataArgsJsons[[aIdx]],
       outcomeModelArgsJsons[[aIdx]],
@@ -1425,6 +1528,8 @@ createReferenceTable <- function(cmAnalysisList,
     "outcomeOfInterest",
     "trueEffectSize",
     "cohortMethodDataFile",
+    "basePopFile",
+    "basePopHash",
     "studyPopFile",
     "sharedPsFile",
     "psFile",
@@ -1498,10 +1603,30 @@ createReferenceTable <- function(cmAnalysisList,
   substr(digest::digest(canonical, algo = "sha256", serialize = FALSE), 1, length)
 }
 
+# Serialize only the base-population-relevant fields of a CreateStudyPopulationArgs object.
+# Excludes outcome-specific fields (removeSubjectsWithPriorOutcome, priorOutcomeLookback).
+.serializeBasePopArgs <- function(args) {
+  baseFields <- list(
+    minDaysAtRisk = args$minDaysAtRisk,
+    maxDaysAtRisk = args$maxDaysAtRisk,
+    riskWindowStart = args$riskWindowStart,
+    startAnchor = args$startAnchor,
+    riskWindowEnd = args$riskWindowEnd,
+    endAnchor = args$endAnchor,
+    censorAtNewRiskWindow = args$censorAtNewRiskWindow
+  )
+  as.character(jsonlite::toJSON(baseFields[order(names(baseFields))],
+                                auto_unbox = TRUE, digits = NA, null = "null"))
+}
+
 .createCohortMethodDataFileName <- function(loadHash) {
   sprintf("CmData_%s.zip", loadHash)
 }
 
+.createBasePopulationFileName <- function(hash) {
+  sprintf("BasePop_%s.rds", hash)
+}
+
 .createPrefilteredCovariatesFileName <- function(hash) {
   sprintf("Prefilter_%s.zip", hash)
 }
diff --git a/R/StudyPopulation.R b/R/StudyPopulation.R
index 7d5b739..be192fd 100644
--- a/R/StudyPopulation.R
+++ b/R/StudyPopulation.R
@@ -31,43 +31,14 @@ fastDuplicated <- function(data, columns) {
   }
 }
 
-#' Create a study population
-#'
-#' @details
-#' Create a study population by enforcing certain inclusion and exclusion criteria, defining a risk
-#' window, and determining which outcomes fall inside the risk window.
-#'
-#' @template CohortMethodData
-#'
-#' @param population                  If specified, this population will be used as the starting
-#'                                    point instead of the cohorts in the `cohortMethodData` object.
-#' @param outcomeId                   The ID of the outcome. If NULL, no outcome-specific
-#'                                    transformations will be performed.
-#' @param createStudyPopulationArgs   An object of type `CreateStudyPopulationArgs` as created by
-#'                                    the [createCreateStudyPopulationArgs()] function.
-#' @return
-#' A `tibble` specifying the study population. This `tibble` will have the following columns:
-#'
-#' - `rowId`: A unique identifier for an exposure.
-#' - `personSeqId`: The person sequence ID of the subject.
-#' - `cohortStartdate`: The index date.
-#' - `outcomeCount` The number of outcomes observed during the risk window.
-#' - `timeAtRisk`: The number of days in the risk window.
-#' - `survivalTime`: The number of days until either the outcome or the end of the risk window.
-#'
-#' @export
-createStudyPopulation <- function(cohortMethodData,
-                                  population = NULL,
-                                  outcomeId = NULL,
-                                  createStudyPopulationArgs = createCreateStudyPopulationArgs()) {
-  errorMessages <- checkmate::makeAssertCollection()
-  checkmate::assertClass(cohortMethodData, "CohortMethodData", add = errorMessages)
-  checkmate::assertDataFrame(population, null.ok = TRUE, add = errorMessages)
-  checkmate::assertNumeric(outcomeId, null.ok = TRUE, add = errorMessages)
-  if (!is.null(outcomeId)) checkmate::assertTRUE(all(outcomeId %% 1 == 0), add = errorMessages)
-  checkmate::assertR6(createStudyPopulationArgs, "CreateStudyPopulationArgs", add = errorMessages)
-  checkmate::reportAssertions(collection = errorMessages)
-
+# Internal: Create a base population (outcome-independent)
+#
+# Performs all outcome-independent filtering: risk window creation, censoring,
+# and minimum days-at-risk filtering. This base population can be shared across
+# multiple outcomes that use the same risk window parameters.
+createBasePopulation <- function(cohortMethodData,
+                                 population = NULL,
+                                 createStudyPopulationArgs = createCreateStudyPopulationArgs()) {
   isEnd <- function(anchor) {
     return(grepl("end$", anchor, ignore.case = TRUE))
   }
@@ -82,38 +53,6 @@ createStudyPopulation <- function(cohortMethodData,
   }
   metaData$targetEstimator <- "ate"
 
-  if (createStudyPopulationArgs$removeSubjectsWithPriorOutcome) {
-    if (is.null(outcomeId)) {
-      message("No outcome specified so skipping removing people with prior outcomes")
-    } else {
-      message("Removing subjects with prior outcomes (if any)")
-      outcomes <- cohortMethodData$outcomes |>
-        filter(.data$outcomeId == !!outcomeId) |>
-        collect()
-      if (isEnd(createStudyPopulationArgs$startAnchor)) {
-        outcomes <- merge(outcomes, population[, c("rowId", "daysToCohortEnd")])
-        priorOutcomeRowIds <- outcomes |>
-          filter(
-            .data$daysToEvent > -createStudyPopulationArgs$priorOutcomeLookback &
-              outcomes$daysToEvent < outcomes$daysToCohortEnd + createStudyPopulationArgs$riskWindowStart
-          ) |>
-          pull("rowId")
-      } else {
-        priorOutcomeRowIds <- outcomes |>
-          filter(
-            .data$daysToEvent > -createStudyPopulationArgs$priorOutcomeLookback &
-              .data$daysToEvent < createStudyPopulationArgs$riskWindowStart
-          ) |>
-          pull("rowId")
-      }
-      population <- population |>
-        filter(!(.data$rowId %in% priorOutcomeRowIds))
-      metaData$attrition <- rbind(
-        metaData$attrition,
-        getCounts(population, paste("No prior outcome"))
-      )
-    }
-  }
   # Create risk windows:
   population$riskStart <- rep(createStudyPopulationArgs$riskWindowStart, nrow(population))
   if (isEnd(createStudyPopulationArgs$startAnchor)) {
@@ -168,49 +107,154 @@ createStudyPopulation <- function(cohortMethodData,
       "days at risk"
     )))
   }
-  if (is.null(outcomeId)) {
-    message("No outcome specified so not creating outcome and time variables")
-  } else {
-    # Select outcomes during time at risk
+  attr(population, "metaData") <- metaData
+  ParallelLogger::logDebug("Base population has ", nrow(population), " rows")
+  return(population)
+}
+
+# Internal: Add outcome-specific data to a base population
+#
+# Applies outcome-specific filtering (removeSubjectsWithPriorOutcome) and
+# computes outcome columns (outcomeCount, timeAtRisk, survivalTime, daysToEvent).
+addOutcomeToPopulation <- function(basePopulation,
+                                   cohortMethodData,
+                                   outcomeId,
+                                   removeSubjectsWithPriorOutcome = TRUE,
+                                   priorOutcomeLookback = 99999,
+                                   startAnchor = "cohort start",
+                                   riskWindowStart = 0) {
+  isEnd <- function(anchor) {
+    return(grepl("end$", anchor, ignore.case = TRUE))
+  }
+
+  population <- basePopulation
+  metaData <- attr(population, "metaData")
+
+  if (removeSubjectsWithPriorOutcome) {
+    message("Removing subjects with prior outcomes (if any)")
     outcomes <- cohortMethodData$outcomes |>
       filter(.data$outcomeId == !!outcomeId) |>
       collect()
-    outcomes <- merge(outcomes, population[, c("rowId", "riskStart", "riskEnd")])
-    outcomes <- outcomes |>
-      filter(
-        .data$daysToEvent >= .data$riskStart &
-          .data$daysToEvent <= .data$riskEnd
-      )
-
-    # Create outcome count column
-    if (nrow(outcomes) == 0) {
-      population$outcomeCount <- rep(0, nrow(population))
+    if (isEnd(startAnchor)) {
+      outcomes <- merge(outcomes, population[, c("rowId", "daysToCohortEnd")])
+      priorOutcomeRowIds <- outcomes |>
+        filter(
+          .data$daysToEvent > -priorOutcomeLookback &
+            outcomes$daysToEvent < outcomes$daysToCohortEnd + riskWindowStart
+        ) |>
+        pull("rowId")
     } else {
-      outcomeCount <- outcomes |>
-        group_by(.data$rowId) |>
-        summarise(outcomeCount = length(.data$outcomeId))
-      population$outcomeCount <- 0
-      population$outcomeCount[match(outcomeCount$rowId, population$rowId)] <- outcomeCount$outcomeCount
+      priorOutcomeRowIds <- outcomes |>
+        filter(
+          .data$daysToEvent > -priorOutcomeLookback &
+            .data$daysToEvent < riskWindowStart
+        ) |>
+        pull("rowId")
     }
+    population <- population |>
+      filter(!(.data$rowId %in% priorOutcomeRowIds))
+    metaData$attrition <- rbind(
+      metaData$attrition,
+      getCounts(population, paste("No prior outcome"))
+    )
+  }
 
-    # Create time at risk column
-    population$timeAtRisk <- population$riskEnd - population$riskStart + 1
+  # Select outcomes during time at risk
+  outcomes <- cohortMethodData$outcomes |>
+    filter(.data$outcomeId == !!outcomeId) |>
+    collect()
+  outcomes <- merge(outcomes, population[, c("rowId", "riskStart", "riskEnd")])
+  outcomes <- outcomes |>
+    filter(
+      .data$daysToEvent >= .data$riskStart &
+        .data$daysToEvent <= .data$riskEnd
+    )
 
-    # Create survival time column
-    firstOutcomes <- outcomes |>
-      arrange(.data$rowId, .data$daysToEvent) |>
-      filter(!duplicated(.data$rowId))
-    population$daysToEvent <- rep(NA, nrow(population))
-    population$daysToEvent[match(firstOutcomes$rowId, population$rowId)] <- firstOutcomes$daysToEvent
-    population$survivalTime <- population$timeAtRisk
-    population$survivalTime[population$outcomeCount != 0] <- population$daysToEvent[population$outcomeCount !=
-                                                                                      0] - population$riskStart[population$outcomeCount != 0] + 1
+  # Create outcome count column
+  if (nrow(outcomes) == 0) {
+    population$outcomeCount <- rep(0, nrow(population))
+  } else {
+    outcomeCount <- outcomes |>
+      group_by(.data$rowId) |>
+      summarise(outcomeCount = length(.data$outcomeId))
+    population$outcomeCount <- 0
+    population$outcomeCount[match(outcomeCount$rowId, population$rowId)] <- outcomeCount$outcomeCount
   }
+
+  # Create time at risk column
+  population$timeAtRisk <- population$riskEnd - population$riskStart + 1
+
+  # Create survival time column
+  firstOutcomes <- outcomes |>
+    arrange(.data$rowId, .data$daysToEvent) |>
+    filter(!duplicated(.data$rowId))
+  population$daysToEvent <- rep(NA, nrow(population))
+  population$daysToEvent[match(firstOutcomes$rowId, population$rowId)] <- firstOutcomes$daysToEvent
+  population$survivalTime <- population$timeAtRisk
+  population$survivalTime[population$outcomeCount != 0] <- population$daysToEvent[population$outcomeCount !=
+                                                                                    0] - population$riskStart[population$outcomeCount != 0] + 1
+
   attr(population, "metaData") <- metaData
   ParallelLogger::logDebug("Study population has ", nrow(population), " rows")
   return(population)
 }
 
+#' Create a study population
+#'
+#' @details
+#' Create a study population by enforcing certain inclusion and exclusion criteria, defining a risk
+#' window, and determining which outcomes fall inside the risk window.
+#'
+#' @template CohortMethodData
+#'
+#' @param population                  If specified, this population will be used as the starting
+#'                                    point instead of the cohorts in the `cohortMethodData` object.
+#' @param outcomeId                   The ID of the outcome. If NULL, no outcome-specific
+#'                                    transformations will be performed.
+#' @param createStudyPopulationArgs   An object of type `CreateStudyPopulationArgs` as created by
+#'                                    the [createCreateStudyPopulationArgs()] function.
+#' @return
+#' A `tibble` specifying the study population. This `tibble` will have the following columns:
+#'
+#' - `rowId`: A unique identifier for an exposure.
+#' - `personSeqId`: The person sequence ID of the subject.
+#' - `cohortStartdate`: The index date.
+#' - `outcomeCount` The number of outcomes observed during the risk window.
+#' - `timeAtRisk`: The number of days in the risk window.
+#' - `survivalTime`: The number of days until either the outcome or the end of the risk window.
+#'
+#' @export
+createStudyPopulation <- function(cohortMethodData,
+                                  population = NULL,
+                                  outcomeId = NULL,
+                                  createStudyPopulationArgs = createCreateStudyPopulationArgs()) {
+  errorMessages <- checkmate::makeAssertCollection()
+  checkmate::assertClass(cohortMethodData, "CohortMethodData", add = errorMessages)
+  checkmate::assertDataFrame(population, null.ok = TRUE, add = errorMessages)
+  checkmate::assertNumeric(outcomeId, null.ok = TRUE, add = errorMessages)
+  if (!is.null(outcomeId)) checkmate::assertTRUE(all(outcomeId %% 1 == 0), add = errorMessages)
+  checkmate::assertR6(createStudyPopulationArgs, "CreateStudyPopulationArgs", add = errorMessages)
+  checkmate::reportAssertions(collection = errorMessages)
+
+  basePop <- createBasePopulation(cohortMethodData, population, createStudyPopulationArgs)
+
+  if (is.null(outcomeId)) {
+    message("No outcome specified so not creating outcome and time variables")
+    return(basePop)
+  }
+
+  studyPop <- addOutcomeToPopulation(
+    basePopulation = basePop,
+    cohortMethodData = cohortMethodData,
+    outcomeId = outcomeId,
+    removeSubjectsWithPriorOutcome = createStudyPopulationArgs$removeSubjectsWithPriorOutcome,
+    priorOutcomeLookback = createStudyPopulationArgs$priorOutcomeLookback,
+    startAnchor = createStudyPopulationArgs$startAnchor,
+    riskWindowStart = createStudyPopulationArgs$riskWindowStart
+  )
+  return(studyPop)
+}
+
 #' Get the attrition table for a population
 #'
 #' @param object   Either an object of type [CohortMethodData], a population object generated by
diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R
index e2bba951..c146175 100644
--- a/tests/testthat/test-cachingScenarios.R
+++ b/tests/testthat/test-cachingScenarios.R
@@ -633,11 +633,165 @@ test_that("Reference table is deterministic across repeated calls", {
 
 
 # ===========================================================================
-# End-to-end disk reuse tests (require Eunomia)
-# These tests verify that artifact files are actually written to and reused
-# from disk, not just that filenames are computed correctly.
+# Base Population Sharing Tests
+# Verify that the two-phase population architecture correctly shares
+# base populations across outcomes while keeping study populations distinct.
 # ===========================================================================
 
+test_that("Two outcomes with same risk windows share the same basePopFile", {
+  analysis <- makeBasicAnalysis()
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 10, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 20, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+  ref <- buildRef(list(analysis), tcos)
+
+  # Same base population (same risk windows for both outcomes)
+  expect_equal(ref$basePopFile[ref$outcomeId == 10],
+               ref$basePopFile[ref$outcomeId == 20])
+  # Different study populations (different outcomeId)
+  expect_true(ref$studyPopFile[ref$outcomeId == 10] !=
+                ref$studyPopFile[ref$outcomeId == 20])
+})
+
+test_that("Per-outcome risk window override creates different basePopFile", {
+  analysis <- makeBasicAnalysis()
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 10, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 20, outcomeOfInterest = TRUE,
+                      riskWindowEnd = 60, endAnchor = "cohort start")  # Override default of 30
+      )
+    )
+  )
+  ref <- buildRef(list(analysis), tcos)
+
+  # Different base populations (different risk window end)
+  expect_true(ref$basePopFile[ref$outcomeId == 10] !=
+                ref$basePopFile[ref$outcomeId == 20])
+  # Different study populations too
+  expect_true(ref$studyPopFile[ref$outcomeId == 10] !=
+                ref$studyPopFile[ref$outcomeId == 20])
+})
+
+test_that("Changing priorOutcomeLookback changes studyPopFile but NOT basePopFile", {
+  analysis1 <- createCmAnalysis(
+    analysisId = 1,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = FeatureExtraction::createCovariateSettings(
+        useDemographicsGender = TRUE
+      )
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(
+      removeSubjectsWithPriorOutcome = TRUE,
+      priorOutcomeLookback = 99999,
+      minDaysAtRisk = 1,
+      riskWindowStart = 0,
+      startAnchor = "cohort start",
+      riskWindowEnd = 30,
+      endAnchor = "cohort start"
+    ),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+  analysis2 <- createCmAnalysis(
+    analysisId = 2,
+    getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs(
+      covariateSettings = FeatureExtraction::createCovariateSettings(
+        useDemographicsGender = TRUE
+      )
+    ),
+    createStudyPopulationArgs = createCreateStudyPopulationArgs(
+      removeSubjectsWithPriorOutcome = TRUE,
+      priorOutcomeLookback = 365,
+      minDaysAtRisk = 1,
+      riskWindowStart = 0,
+      startAnchor = "cohort start",
+      riskWindowEnd = 30,
+      endAnchor = "cohort start"
+    ),
+    fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox")
+  )
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 10, outcomeOfInterest = TRUE))
+    )
+  )
+  ref <- buildRef(list(analysis1, analysis2), tcos)
+
+  row1 <- ref[ref$analysisId == 1, ]
+  row2 <- ref[ref$analysisId == 2, ]
+
+  # Same base population (same risk windows, minDaysAtRisk, etc.)
+  expect_equal(row1$basePopFile, row2$basePopFile)
+  # Different study populations (different priorOutcomeLookback)
+  expect_true(row1$studyPopFile != row2$studyPopFile)
+})
+
+test_that("Shared PS file uses basePopHash (outcome-independent)", {
+  analysis <- makeBasicAnalysis()
+  tcos <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 10, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 20, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+  ref <- buildRef(list(analysis), tcos)
+
+  # Shared PS should be the same for both outcomes (based on basePopHash)
+  expect_equal(ref$sharedPsFile[ref$outcomeId == 10],
+               ref$sharedPsFile[ref$outcomeId == 20])
+  # And both should be non-empty
+  expect_true(ref$sharedPsFile[1] != "")
+})
+
+test_that("Adding an outcome preserves all existing filenames", {
+  analysis <- makeBasicAnalysis()
+  tcos1 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(createOutcome(outcomeId = 10, outcomeOfInterest = TRUE))
+    )
+  )
+  tcos2 <- list(
+    createTargetComparatorOutcomes(
+      targetId = 1,
+      comparatorId = 2,
+      outcomes = list(
+        createOutcome(outcomeId = 10, outcomeOfInterest = TRUE),
+        createOutcome(outcomeId = 20, outcomeOfInterest = TRUE)
+      )
+    )
+  )
+
+  ref1 <- buildRef(list(analysis), tcos1)
+  ref2 <- buildRef(list(analysis), tcos2)
+  ref2_o10 <- ref2[ref2$outcomeId == 10, ]
+
+  # All filenames for outcome 10 should be identical
+  expect_equal(ref1$basePopFile, ref2_o10$basePopFile)
+  expect_equal(ref1$cohortMethodDataFile, ref2_o10$cohortMethodDataFile)
+  expect_equal(ref1$studyPopFile, ref2_o10$studyPopFile)
+  expect_equal(ref1$sharedPsFile, ref2_o10$sharedPsFile)
+  expect_equal(ref1$strataFile, ref2_o10$strataFile)
+  expect_equal(ref1$outcomeModelFile, ref2_o10$outcomeModelFile)
+})
+
 if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
 
   # Minimal analysis used across all E2E tests: no PS, no matching, simple cox
@@ -698,6 +852,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
     }
 
     checkFiles("cohortMethodDataFile")
+    checkFiles("basePopFile")
     checkFiles("studyPopFile")
     checkFiles("outcomeModelFile")
   })
@@ -739,14 +894,17 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
 
     # Record modification times of existing artifacts after first run
     cmDataPath <- file.path(outputFolder, result1$cohortMethodDataFile[1])
+    basePopPath <- file.path(outputFolder, result1$basePopFile[1])
     studyPopPath <- file.path(outputFolder, result1$studyPopFile[1])
     om3Path <- file.path(outputFolder, result1$outcomeModelFile[1])
 
     expect_true(file.exists(cmDataPath))
+    expect_true(file.exists(basePopPath))
     expect_true(file.exists(studyPopPath))
     expect_true(file.exists(om3Path))
 
     mtime_cmData <- file.info(cmDataPath)$mtime
+    mtime_basePop <- file.info(basePopPath)$mtime
     mtime_studyPop <- file.info(studyPopPath)$mtime
     mtime_om3 <- file.info(om3Path)$mtime
 
@@ -774,6 +932,17 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) {
     )
     expect_equal(file.info(cmDataPath)$mtime, mtime_cmData)
 
+    # Base population: same filename, NOT re-written (shared across outcomes)
+    expect_equal(
+      result1$basePopFile[1],
+      result2$basePopFile[result2$outcomeId == 3]
+    )
+    expect_equal(
+      result1$basePopFile[1],
+      result2$basePopFile[result2$outcomeId == 4]
+    )
+    expect_equal(file.info(basePopPath)$mtime, mtime_basePop)
+
     # StudyPop for outcome 3: same filename, NOT re-written
     expect_equal(
       result1$studyPopFile[1],

From 6fa101a628ce96311475157023250ba628d891a2 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Wed, 6 May 2026 10:53:17 -0700
Subject: [PATCH 09/10] Regression testing script to compare current code
 against main

---
 extras/TestResultsRegression.R | 372 +++++++++++++++++++++++++++++++++
 1 file changed, 372 insertions(+)
 create mode 100644 extras/TestResultsRegression.R

diff --git a/extras/TestResultsRegression.R b/extras/TestResultsRegression.R
new file mode 100644
index 0000000..0df4d03
--- /dev/null
+++ b/extras/TestResultsRegression.R
@@ -0,0 +1,372 @@
+# TestResultsRegression.R
+#
+# Verifies that the current code produces identical results to the released version.
+# Compares exported CSV results against the reference zip committed on main.
+#
+# Usage (from package root):
+#   RENV_CONFIG_SANDBOX_ENABLED=FALSE Rscript --vanilla extras/TestResultsRegression.R
+
+library(CohortMethod)
+
+# ===========================================================================
+# 1. Extract reference results from git main branch
+# ===========================================================================
+referenceDir <- tempfile(pattern = "cmReference")
+dir.create(referenceDir)
+
+refZipPath <- file.path(referenceDir, "Results_Eunomia.zip")
+# Use the script's directory to find the package root (extras/ is one level below)
+packageRoot <- normalizePath(file.path(getwd()))
+if (!file.exists(file.path(packageRoot, ".git"))) {
+  # Fallback: try installed package location
+
+  packageRoot <- normalizePath(system.file("..", package = "CohortMethod"))
+}
+# Try local main first, then origin/main
+exitCode <- system2("git", c("-C", packageRoot, "show", "main:inst/Results_Eunomia.zip"),
+                    stdout = refZipPath, stderr = FALSE)
+if (exitCode != 0) {
+  exitCode <- system2("git", c("-C", packageRoot, "show", "origin/main:inst/Results_Eunomia.zip"),
+                      stdout = refZipPath, stderr = FALSE)
+}
+if (exitCode != 0) {
+  stop("Failed to extract inst/Results_Eunomia.zip from main branch. ",
+       "Make sure you are in the CohortMethod git repo and 'main' branch exists.")
+}
+unzip(refZipPath, exdir = referenceDir)
+message("Reference results extracted from main branch")
+
+# ===========================================================================
+# 2. Run full analysis with current code (mirrors CreateResultsSchemaForTesting.R)
+# ===========================================================================
+connectionDetails <- Eunomia::getEunomiaConnectionDetails()
+Eunomia::createCohorts(connectionDetails)
+
+outputFolder <- tempfile(pattern = "cmRegression")
+
+tcos1 <- createTargetComparatorOutcomes(
+  targetId = 1,
+  comparatorId = 2,
+  outcomes = list(
+    createOutcome(
+      outcomeId = 3,
+      priorOutcomeLookback = 30
+    ),
+    createOutcome(
+      outcomeId = 4,
+      outcomeOfInterest = FALSE,
+      trueEffectSize = 1
+    )
+  ),
+  excludedCovariateConceptIds = c(1118084, 1124300)
+)
+# Empty cohorts:
+tcos2 <- createTargetComparatorOutcomes(
+  targetId = 998,
+  comparatorId = 999,
+  outcomes = list(
+    createOutcome(
+      outcomeId = 3,
+      priorOutcomeLookback = 30
+    ),
+    createOutcome(
+      outcomeId = 4,
+      outcomeOfInterest = FALSE,
+      trueEffectSize = 1
+    )
+  )
+)
+
+targetComparatorOutcomesList <- list(tcos1, tcos2)
+
+covarSettings <- createDefaultCovariateSettings(addDescendantsToExclude = TRUE)
+
+getDbCmDataArgs <- createGetDbCohortMethodDataArgs(
+  washoutPeriod = 183,
+  firstExposureOnly = TRUE,
+  removeDuplicateSubjects = "remove all",
+  covariateSettings = covarSettings
+)
+
+createStudyPopArgs1 <- createCreateStudyPopulationArgs(
+  removeSubjectsWithPriorOutcome = TRUE,
+  censorAtNewRiskWindow = TRUE,
+  minDaysAtRisk = 1,
+  riskWindowStart = 0,
+  startAnchor = "cohort start",
+  riskWindowEnd = 30,
+  endAnchor = "cohort end"
+)
+
+createStudyPopArgs2 <- createCreateStudyPopulationArgs(
+  removeSubjectsWithPriorOutcome = TRUE,
+  censorAtNewRiskWindow = TRUE,
+  minDaysAtRisk = 1,
+  riskWindowStart = 0,
+  startAnchor = "cohort start",
+  riskWindowEnd = 30,
+  endAnchor = "cohort end"
+)
+
+fitOutcomeModelArgs1 <- createFitOutcomeModelArgs(modelType = "cox")
+
+cmAnalysis1 <- createCmAnalysis(
+  analysisId = 1,
+  description = "No matching, simple outcome model",
+  getDbCohortMethodDataArgs = getDbCmDataArgs,
+  createStudyPopulationArgs = createStudyPopArgs1,
+  fitOutcomeModelArgs = fitOutcomeModelArgs1
+)
+
+createPsArgs <- createCreatePsArgs(
+  prior = createPrior("laplace", variance = 0.01),
+  estimator = "att"
+)
+
+matchOnPsArgs <- createMatchOnPsArgs(maxRatio = 100)
+
+computeSharedCovBalArgs <- createComputeCovariateBalanceArgs()
+
+computeCovBalArgs <- createComputeCovariateBalanceArgs(
+  covariateFilter = FeatureExtraction::getDefaultTable1Specifications()
+)
+
+fitOutcomeModelArgs2 <- createFitOutcomeModelArgs(
+  modelType = "cox",
+  stratified = TRUE
+)
+
+cmAnalysis2 <- createCmAnalysis(
+  analysisId = 2,
+  description = "Matching",
+  getDbCohortMethodDataArgs = getDbCmDataArgs,
+  createStudyPopulationArgs = createStudyPopArgs2,
+  createPsArgs = createPsArgs,
+  matchOnPsArgs = matchOnPsArgs,
+  computeSharedCovariateBalanceArgs = computeSharedCovBalArgs,
+  computeCovariateBalanceArgs = computeCovBalArgs,
+  fitOutcomeModelArgs = fitOutcomeModelArgs2
+)
+
+truncateIptwArgs <- createTruncateIptwArgs(maxWeight = 10)
+
+fitOutcomeModelArgs3 <- createFitOutcomeModelArgs(
+  modelType = "cox",
+  inversePtWeighting = TRUE
+)
+cmAnalysis3 <- createCmAnalysis(
+  analysisId = 3,
+  description = "IPTW",
+  getDbCohortMethodDataArgs = getDbCmDataArgs,
+  createStudyPopulationArgs = createStudyPopArgs2,
+  createPsArgs = createPsArgs,
+  truncateIptwArgs = truncateIptwArgs,
+  computeSharedCovariateBalanceArgs = computeSharedCovBalArgs,
+  fitOutcomeModelArgs = fitOutcomeModelArgs3
+)
+
+fitOutcomeModelArgs4 <- createFitOutcomeModelArgs(
+  modelType = "cox",
+  stratified = TRUE,
+  interactionCovariateIds = 8532001
+)
+
+cmAnalysis4 <- createCmAnalysis(
+  analysisId = 4,
+  description = "Matching with gender interaction",
+  getDbCohortMethodDataArgs = getDbCmDataArgs,
+  createStudyPopulationArgs = createStudyPopArgs2,
+  createPsArgs = createPsArgs,
+  matchOnPsArgs = matchOnPsArgs,
+  fitOutcomeModelArgs = fitOutcomeModelArgs4
+)
+
+cmAnalysisList <- list(cmAnalysis1, cmAnalysis2, cmAnalysis3, cmAnalysis4)
+
+analysesToExclude <- data.frame(
+  targetId = c(998, 998),
+  analysisId = c(3, 4)
+)
+
+message("Running analyses with current code...")
+result <- runCmAnalyses(
+  connectionDetails = connectionDetails,
+  cdmDatabaseSchema = "main",
+  exposureTable = "cohort",
+  outcomeTable = "cohort",
+  outputFolder = outputFolder,
+  databaseId = "Eunomia",
+  cmAnalysesSpecifications = createCmAnalysesSpecifications(
+    cmAnalysisList = cmAnalysisList,
+    targetComparatorOutcomesList = targetComparatorOutcomesList,
+    analysesToExclude = analysesToExclude
+  )
+)
+
+exportToCsv(outputFolder, databaseId = "Eunomia")
+message("Analysis complete, results exported")
+
+# ===========================================================================
+# 3. Compare key CSV files
+# ===========================================================================
+newDir <- file.path(outputFolder, "export")
+
+compareNumericColumns <- function(file, columns, tolerance = 1e-6) {
+  refPath <- file.path(referenceDir, file)
+  newPath <- file.path(newDir, file)
+
+  if (!file.exists(refPath)) {
+    message(sprintf("  SKIP: %s (not in reference)", file))
+    return(invisible(NULL))
+  }
+  if (!file.exists(newPath)) {
+    stop(sprintf("Missing output file: %s", file))
+  }
+
+  ref <- readr::read_csv(refPath, show_col_types = FALSE)
+  new <- readr::read_csv(newPath, show_col_types = FALSE)
+
+  # Sort both by common key columns for stable comparison
+  keyColumns <- intersect(c("analysis_id", "target_id", "comparator_id",
+                            "outcome_id", "interaction_covariate_id"), names(ref))
+  ref <- ref[do.call(order, ref[keyColumns]), ]
+  new <- new[do.call(order, new[keyColumns]), ]
+
+  if (nrow(ref) != nrow(new)) {
+    stop(sprintf("Row count mismatch in %s: reference=%d, new=%d",
+                 file, nrow(ref), nrow(new)))
+  }
+
+  columnsChecked <- 0
+  for (col in columns) {
+    if (col %in% names(ref) && col %in% names(new)) {
+      refVals <- as.numeric(ref[[col]])
+      newVals <- as.numeric(new[[col]])
+      diffs <- abs(refVals - newVals)
+      diffs <- diffs[!is.na(diffs)]
+      if (length(diffs) > 0 && any(diffs > tolerance)) {
+        stop(sprintf("Column '%s' in %s differs: max diff = %g (tolerance = %g)",
+                     col, file, max(diffs), tolerance))
+      }
+      columnsChecked <- columnsChecked + 1
+    }
+  }
+  message(sprintf("  PASS: %s (%d numeric columns within tolerance %g)",
+                  file, columnsChecked, tolerance))
+}
+
+compareExactColumns <- function(file, columns) {
+  refPath <- file.path(referenceDir, file)
+  newPath <- file.path(newDir, file)
+
+  if (!file.exists(refPath)) {
+    message(sprintf("  SKIP: %s (not in reference)", file))
+    return(invisible(NULL))
+  }
+  if (!file.exists(newPath)) {
+    stop(sprintf("Missing output file: %s", file))
+  }
+
+  ref <- readr::read_csv(refPath, show_col_types = FALSE)
+  new <- readr::read_csv(newPath, show_col_types = FALSE)
+
+  keyColumns <- intersect(c("analysis_id", "target_id", "comparator_id",
+                            "outcome_id"), names(ref))
+  ref <- ref[do.call(order, ref[keyColumns]), ]
+  new <- new[do.call(order, new[keyColumns]), ]
+
+  if (nrow(ref) != nrow(new)) {
+    stop(sprintf("Row count mismatch in %s: reference=%d, new=%d",
+                 file, nrow(ref), nrow(new)))
+  }
+
+  columnsChecked <- 0
+  for (col in columns) {
+    if (col %in% names(ref) && col %in% names(new)) {
+      if (!identical(ref[[col]], new[[col]])) {
+        # Show first difference for debugging
+        idx <- which(ref[[col]] != new[[col]])[1]
+        stop(sprintf("Column '%s' in %s differs at row %d: reference='%s', new='%s'",
+                     col, file, idx, ref[[col]][idx], new[[col]][idx]))
+      }
+      columnsChecked <- columnsChecked + 1
+    }
+  }
+  message(sprintf("  PASS: %s (%d columns match exactly)", file, columnsChecked))
+}
+
+message("")
+message("=== Results Regression Test ===")
+message("")
+
+compareNumericColumns("cm_result.csv",
+  c("log_rr", "se_log_rr", "ci_95_lb", "ci_95_ub", "p",
+    "calibrated_log_rr", "calibrated_se_log_rr"))
+
+# Attrition: compare final population sizes per analysis/target/comparator/outcome.
+# Intermediate attrition steps may differ in order due to two-phase population creation,
+# but the final population must match.
+compareFinalAttrition <- function() {
+  file <- "cm_attrition.csv"
+  refPath <- file.path(referenceDir, file)
+  newPath <- file.path(newDir, file)
+
+  if (!file.exists(refPath)) {
+    message(sprintf("  SKIP: %s (not in reference)", file))
+    return(invisible(NULL))
+  }
+
+  ref <- readr::read_csv(refPath, show_col_types = FALSE)
+  new <- readr::read_csv(newPath, show_col_types = FALSE)
+
+  # Get last (final) attrition row per group
+  keyColumns <- intersect(c("target_comparator_id", "analysis_id", "outcome_id",
+                            "exposure_id", "database_id"), names(ref))
+  refFinal <- ref |>
+    dplyr::group_by(dplyr::across(dplyr::all_of(keyColumns))) |>
+    dplyr::slice_tail(n = 1) |>
+    dplyr::ungroup() |>
+    dplyr::arrange(dplyr::across(dplyr::all_of(keyColumns)))
+  newFinal <- new |>
+    dplyr::group_by(dplyr::across(dplyr::all_of(keyColumns))) |>
+    dplyr::slice_tail(n = 1) |>
+    dplyr::ungroup() |>
+    dplyr::arrange(dplyr::across(dplyr::all_of(keyColumns)))
+
+  if (nrow(refFinal) != nrow(newFinal)) {
+    stop(sprintf("Final attrition group count mismatch: reference=%d, new=%d",
+                 nrow(refFinal), nrow(newFinal)))
+  }
+  for (col in c("subjects", "exposures", "outcomes")) {
+    if (!identical(refFinal[[col]], newFinal[[col]])) {
+      stop(sprintf("Final attrition column '%s' differs", col))
+    }
+  }
+  message(sprintf("  PASS: %s (final population sizes match across %d groups)",
+                  file, nrow(refFinal)))
+}
+compareFinalAttrition()
+
+compareExactColumns("cm_diagnostics_summary.csv",
+  c("balance_diagnostic", "shared_balance_diagnostic",
+    "equipoise_diagnostic", "mdrr_diagnostic"))
+
+compareNumericColumns("cm_diagnostics_summary.csv",
+  c("max_sdm", "equipoise", "mdrr"), tolerance = 1e-6)
+
+compareNumericColumns("cm_covariate_balance.csv",
+  c("std_diff_before", "std_diff_after"), tolerance = 1e-4)
+
+compareNumericColumns("cm_interaction_result.csv",
+  c("log_rr", "se_log_rr"), tolerance = 1e-6)
+
+message("")
+message("=== ALL CHECKS PASSED ===")
+message("")
+
+# ===========================================================================
+# Cleanup
+# ===========================================================================
+unlink(referenceDir, recursive = TRUE)
+unlink(connectionDetails$server())
+unlink(outputFolder, recursive = TRUE)

From 9a37a879e7d5b2902beb58c1da147f2e7b65b40d Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Mon, 11 May 2026 10:22:57 -0700
Subject: [PATCH 10/10] Reduced memory footprint of ps model and added logging
 to diagnose freezing on linux

---
 R/RunAnalyses.R | 25 ++++++++++++++++++++++++-
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R
index 0d91937..79ecdf9 100644
--- a/R/RunAnalyses.R
+++ b/R/RunAnalyses.R
@@ -872,16 +872,39 @@ doFitSharedPsModel <- function(params, refitPsForEveryStudyPopulation) {
 }
 
 addPsToStudyPopForSubset <- function(subset, outputFolder) {
-  ps <- readRDS(file.path(outputFolder, subset$sharedPsFile[1]))
+  pid <- Sys.getpid()
+  sharedPsFile <- subset$sharedPsFile[1]
+  ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: loading sharedPsFile '%s' (%d rows to process)",
+                                  pid, sharedPsFile, nrow(subset)))
+  t0 <- proc.time()[["elapsed"]]
+  ps <- readRDS(file.path(outputFolder, sharedPsFile))
+  ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: sharedPsFile loaded in %.1f s (nrow=%d, ncol=%d)",
+                                  pid, proc.time()[["elapsed"]] - t0, nrow(ps), ncol(ps)))
+  columnsToKeep <- c("rowId", "treatment", "personSeqId", "cohortStartDate", "propensityScore", "preferenceScore", "iptw")
+  ps <- ps[, intersect(columnsToKeep, colnames(ps))]
+  ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: ps trimmed to %d columns",
+                                  pid, ncol(ps)))
 
   addToStudyPop <- function(i) {
     refRow <- subset[i, ]
+    ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: reading studyPopFile '%s'",
+                                    pid, i, nrow(subset), refRow$studyPopFile))
+    t1 <- proc.time()[["elapsed"]]
     studyPop <- readRDS(file.path(outputFolder, refRow$studyPopFile))
+    ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: studyPopFile read in %.1f s (nrow=%d), merging PS",
+                                    pid, i, nrow(subset), proc.time()[["elapsed"]] - t1, nrow(studyPop)))
     studyPop <- addPsToStudyPopulation(studyPop, ps)
+    ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: writing psFile '%s'",
+                                    pid, i, nrow(subset), refRow$psFile))
+    t2 <- proc.time()[["elapsed"]]
     saveRDS(studyPop, file.path(outputFolder, refRow$psFile))
+    ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: psFile written in %.1f s",
+                                    pid, i, nrow(subset), proc.time()[["elapsed"]] - t2))
     return(NULL)
   }
   plyr::l_ply(1:nrow(subset), addToStudyPop)
+  ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: completed all %d rows for sharedPsFile '%s'",
+                                  pid, nrow(subset), sharedPsFile))
 }
 
 addPsToStudyPopulation <- function(studyPopulation, ps) {