From 20754df611ecbe65634bc8cf4aac43a72e81e5b3 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 29 Apr 2026 11:25:07 -0700 Subject: [PATCH 01/10] initial implementation of artifact handler --- R/InvalidationPolicy.R | 142 +++++++++++ R/RunAnalyses.R | 72 +++++- R/SettingsHasher.R | 285 ++++++++++++++++++++++ R/ValidateArtifact.R | 150 ++++++++++++ man/InvalidationPolicy.Rd | 94 ++++++++ man/SettingsHasher.Rd | 238 +++++++++++++++++++ man/ValidateArtifact.Rd | 144 ++++++++++++ tests/testthat/test-invalidationPolicy.R | 288 +++++++++++++++++++++++ 8 files changed, 1405 insertions(+), 8 deletions(-) create mode 100644 R/InvalidationPolicy.R create mode 100644 R/SettingsHasher.R create mode 100644 R/ValidateArtifact.R create mode 100644 man/InvalidationPolicy.Rd create mode 100644 man/SettingsHasher.Rd create mode 100644 man/ValidateArtifact.Rd create mode 100644 tests/testthat/test-invalidationPolicy.R diff --git a/R/InvalidationPolicy.R b/R/InvalidationPolicy.R new file mode 100644 index 0000000..9934bea --- /dev/null +++ b/R/InvalidationPolicy.R @@ -0,0 +1,142 @@ +# Copyright 2026 Observational Health Data Sciences and Informatics +# +# This file is part of CohortMethod +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Invalidation Policy +#' +#' @description +#' Determines which artifacts should be deleted when analysis settings change, +#' based on dependency relationships between components. +#' +#' @details +#' When settings change, not all artifacts need to be regenerated. This class +#' implements the logic to determine minimal invalidation scope based on which +#' settings components have changed. +#' +#' @export +InvalidationPolicy <- R6::R6Class( + "InvalidationPolicy", + public = list( + #' @description + #' Compute which file patterns should be deleted based on changed components + #' + #' @param changedComponents A list (as returned by SettingsHasher$compareSettingsComponents) + #' with logical values for each component + #' + #' @return + #' Character vector of file patterns/globs to delete. Empty vector if no deletion needed. + #' + #' @details + #' Deletion cascade logic: + #' - If loadArgsChanged: all downstream artifacts (everything) + #' - Else if studyPopArgsChanged: populations, PS models, strata, outcomes + #' - Else if psArgsChanged: PS models, strata, outcomes + #' - Else if strataArgsChanged: strata and outcomes + #' - Else if outcomeModelArgsChanged: outcome models only + #' - Else if only balanceArgsChanged: balance files only + #' - Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific) + computeInvalidationScope = function(changedComponents) { + checkmate::assertList(changedComponents) + + filePatternsToDelete <- c() + + # Cascade of invalidation based on dependency graph + if (isTRUE(changedComponents$loadArgsChanged)) { + # If data loading changed, everything depends on it, so delete all analysis artifacts + filePatternsToDelete <- c( + filePatternsToDelete, + "CmData_.*\\.zip$", # All CohortMethodData + "StudyPop_.*\\.rds$", # All study populations + "Ps_.*\\.rds$", # All propensity scores + "StratPop_.*\\.rds$", # All stratified populations + "Balance_.*\\.rds$", # All balance files + "Analysis_[0-9]+$" # All analysis folders (contains outcome models) + ) + } else if (isTRUE(changedComponents$studyPopArgsChanged)) { + # Study population arguments changed: recompute populations and everything downstream + filePatternsToDelete <- c( + filePatternsToDelete, + "StudyPop_.*\\.rds$", # All study populations + "Ps_.*\\.rds$", # All PS (may depend on population) + "StratPop_.*\\.rds$", # All stratified populations + "Balance_.*\\.rds$", # All balance files + "Analysis_[0-9]+$" # All outcome models + ) + } else if (isTRUE(changedComponents$psArgsChanged)) { + # Propensity score arguments changed: recompute PS and downstream + filePatternsToDelete <- c( + filePatternsToDelete, + "Ps_.*\\.rds$", # All PS models + "StratPop_.*\\.rds$", # All stratified populations (depend on PS) + "Balance_.*\\.rds$", # All balance files + "Analysis_[0-9]+$" # All outcome models + ) + } else if (isTRUE(changedComponents$strataArgsChanged)) { + # Stratification (trim/match/stratify) changed: recompute strata and downstream + filePatternsToDelete <- c( + filePatternsToDelete, + "StratPop_.*\\.rds$", # All stratified populations + "Balance_.*\\.rds$", # All balance files + "Analysis_[0-9]+$" # All outcome models + ) + } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) { + # Outcome model arguments changed: recompute outcome models only + filePatternsToDelete <- c( + filePatternsToDelete, + "Analysis_[0-9]+$" # All outcome model folders + ) + } else if (isTRUE(changedComponents$balanceArgsChanged)) { + # Balance computation arguments changed: delete balance files only + filePatternsToDelete <- c( + filePatternsToDelete, + "Balance_.*\\.rds$" # All balance files + ) + } + # If analyticsChanged but nothing else changed (e.g., only outcomes added): + # No deletion needed - outcome models are outcome-specific, new ones will be computed + + return(filePatternsToDelete) + }, + + #' @description + #' Get a human-readable message describing what will be deleted + #' + #' @param changedComponents A list of changed components + #' + #' @return + #' Character string suitable for displaying to the user + getInvalidationMessage = function(changedComponents) { + checkmate::assertList(changedComponents) + + if (isTRUE(changedComponents$loadArgsChanged)) { + return("Data loading arguments have changed. All analysis artifacts must be regenerated.") + } else if (isTRUE(changedComponents$studyPopArgsChanged)) { + return("Study population settings have changed. Study populations, propensity scores, and outcome models must be regenerated.") + } else if (isTRUE(changedComponents$psArgsChanged)) { + return("Propensity score settings have changed. Propensity scores and outcome models must be regenerated.") + } else if (isTRUE(changedComponents$strataArgsChanged)) { + return("Stratification settings (trimming/matching/stratifying) have changed. Stratified populations and outcome models must be regenerated.") + } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) { + return("Outcome model settings have changed. Outcome models must be regenerated.") + } else if (isTRUE(changedComponents$balanceArgsChanged)) { + return("Covariate balance settings have changed. Balance files must be recomputed.") + } else if (isTRUE(changedComponents$analyticsChanged)) { + return("Analyses have changed (new outcomes or studies added). New analyses will be computed.") + } else { + return("No artifacts need to be deleted.") + } + } + ) +) diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R index 4cc0e6d..2f04330 100644 --- a/R/RunAnalyses.R +++ b/R/RunAnalyses.R @@ -209,14 +209,70 @@ runCmAnalyses <- function(connectionDetails, if (file.exists(cmAnalysesSpecificationsFile)) { oldCmAnalysesSpecifications <- readRDS(cmAnalysesSpecificationsFile) if (!isTRUE(all.equal(oldCmAnalysesSpecifications$toList(), cmAnalysesSpecifications$toList()))) { - rm(list = ls(envir = cache), envir = cache) - message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder)) - response <- utils::askYesNo("Do you want to delete the old files before proceeding?") - if (is.na(response)) { - # Cancel: - return() - } else if (response == TRUE) { - unlink(outputFolder, recursive = TRUE) + # Use intelligent invalidation policy to determine what needs to be deleted + hasher <- SettingsHasher$new() + changedComponents <- hasher$compareSettingsComponents( + oldCmAnalysesSpecifications, + cmAnalysesSpecifications + ) + + if (any(as.logical(changedComponents))) { + # Determine minimal set of files to delete based on what changed + policy <- InvalidationPolicy$new() + filePatternsToDelete <- policy$computeInvalidationScope(changedComponents) + + if (length(filePatternsToDelete) > 0) { + # Get human-readable message about what will be deleted + message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder)) + message(policy$getInvalidationMessage(changedComponents)) + + # Show some affected files to user + affectedFiles <- c() + for (pattern in filePatternsToDelete) { + matchedFiles <- list.files(outputFolder, pattern = pattern, full.names = FALSE) + affectedFiles <- c(affectedFiles, matchedFiles) + } + + if (length(affectedFiles) > 0) { + message(sprintf( + "Files to be deleted: %d file(s) matching %d pattern(s)", + length(affectedFiles), + length(filePatternsToDelete) + )) + if (length(affectedFiles) <= 10) { + for (f in affectedFiles) { + message(sprintf(" - %s", f)) + } + } else { + for (f in head(affectedFiles, 5)) { + message(sprintf(" - %s", f)) + } + message(sprintf(" ... and %d more files", length(affectedFiles) - 5)) + } + } + + response <- utils::askYesNo("Do you want to delete these artifacts before proceeding?") + if (is.na(response)) { + # Cancel: + return() + } else if (response == TRUE) { + # Delete files matching patterns + for (pattern in filePatternsToDelete) { + files <- list.files(outputFolder, pattern = pattern, full.names = TRUE, recursive = TRUE) + if (length(files) > 0) { + unlink(files) + } + # Also try to delete directories matching pattern + dirs <- list.dirs(outputFolder, full.names = TRUE, recursive = FALSE) + matchingDirs <- dirs[grep(pattern, basename(dirs))] + if (length(matchingDirs) > 0) { + unlink(matchingDirs, recursive = TRUE) + } + } + } + } + # Clear cache regardless (old cache may reference deleted files) + rm(list = ls(envir = cache), envir = cache) } } } diff --git a/R/SettingsHasher.R b/R/SettingsHasher.R new file mode 100644 index 0000000..70efcac --- /dev/null +++ b/R/SettingsHasher.R @@ -0,0 +1,285 @@ +# Copyright 2026 Observational Health Data Sciences and Informatics +# +# This file is part of CohortMethod +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Settings Hasher +#' +#' @description +#' Compute deterministic hashes for individual settings components to detect +#' which parts of the analysis specification have changed. +#' +#' @details +#' This class hashes each settings component independently using JSON serialization +#' and MD5 digests. This allows fine-grained detection of which artifact types need +#' to be invalidated when specifications change. +#' +#' @export +SettingsHasher <- R6::R6Class( + "SettingsHasher", + public = list( + #' @description + #' Hash the data loading arguments + #' + #' @param getDbCohortMethodDataArgs An object of type `GetDbCohortMethodDataArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashLoadArgs = function(getDbCohortMethodDataArgs) { + if (is.null(getDbCohortMethodDataArgs)) { + return("") + } + json <- getDbCohortMethodDataArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash the study population arguments + #' + #' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashStudyPopArgs = function(createStudyPopulationArgs) { + if (is.null(createStudyPopulationArgs)) { + return("") + } + json <- createStudyPopulationArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash the propensity score arguments + #' + #' @param createPsArgs An object of type `CreatePsArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashPsArgs = function(createPsArgs) { + if (is.null(createPsArgs)) { + return("") + } + json <- createPsArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash arguments for propensity score trimming + #' + #' @param trimByPsArgs An object of type `TrimByPsArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashTrimByPsArgs = function(trimByPsArgs) { + if (is.null(trimByPsArgs)) { + return("") + } + json <- trimByPsArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash arguments for propensity score matching + #' + #' @param matchOnPsArgs An object of type `MatchOnPsArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashMatchOnPsArgs = function(matchOnPsArgs) { + if (is.null(matchOnPsArgs)) { + return("") + } + json <- matchOnPsArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash arguments for propensity score stratification + #' + #' @param stratifyByPsArgs An object of type `StratifyByPsArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashStratifyByPsArgs = function(stratifyByPsArgs) { + if (is.null(stratifyByPsArgs)) { + return("") + } + json <- stratifyByPsArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash arguments for outcome model fitting + #' + #' @param fitOutcomeModelArgs An object of type `FitOutcomeModelArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashOutcomeModelArgs = function(fitOutcomeModelArgs) { + if (is.null(fitOutcomeModelArgs)) { + return("") + } + json <- fitOutcomeModelArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Hash arguments for covariate balance computation + #' + #' @param computeCovariateBalanceArgs An object of type `ComputeCovariateBalanceArgs` + #' + #' @return + #' Character string containing the MD5 hash (32 hex characters) + hashBalanceArgs = function(computeCovariateBalanceArgs) { + if (is.null(computeCovariateBalanceArgs)) { + return("") + } + json <- computeCovariateBalanceArgs$toJson() + return(digest::digest(json, algo = "md5")) + }, + + #' @description + #' Compare old and new specifications to identify which components changed + #' + #' @param oldSpecs An object of type `CmAnalysesSpecifications` (old version) + #' @param newSpecs An object of type `CmAnalysesSpecifications` (new version) + #' + #' @return + #' A list of logical values indicating which components changed: + #' - loadArgsChanged: whether data loading arguments differ + #' - studyPopArgsChanged: whether study population arguments differ + #' - psArgsChanged: whether propensity score arguments differ + #' - strataArgsChanged: whether stratification arguments (trim/match/stratify) differ + #' - outcomeModelArgsChanged: whether outcome model arguments differ + #' - balanceArgsChanged: whether balance computation arguments differ + #' - analyticsChanged: whether outcome of interest or analysis IDs changed + compareSettingsComponents = function(oldSpecs, newSpecs) { + checkmate::assertClass(oldSpecs, "CmAnalysesSpecifications") + checkmate::assertClass(newSpecs, "CmAnalysesSpecifications") + + # Compare each analysis component's settings + changedComponents <- list( + loadArgsChanged = FALSE, + studyPopArgsChanged = FALSE, + psArgsChanged = FALSE, + strataArgsChanged = FALSE, + outcomeModelArgsChanged = FALSE, + balanceArgsChanged = FALSE, + analyticsChanged = FALSE + ) + + # Get the old and new analysis lists + oldAnalyses <- oldSpecs$cmAnalysisList + newAnalyses <- newSpecs$cmAnalysisList + + # If the number of analyses differs significantly or can't be matched, mark as changed + if (length(oldAnalyses) != length(newAnalyses)) { + # For now, mark as all changed if list lengths differ + # A more sophisticated approach would match analyses by ID + changedComponents$analyticsChanged <- TRUE + } else { + # Compare each analysis by ID (assuming they maintain order/ID) + for (i in seq_along(newAnalyses)) { + oldAna <- oldAnalyses[[i]] + newAna <- newAnalyses[[i]] + + # Check data loading + oldLoadHash <- self$hashLoadArgs(oldAna$getDbCohortMethodDataArgs) + newLoadHash <- self$hashLoadArgs(newAna$getDbCohortMethodDataArgs) + if (oldLoadHash != newLoadHash) { + changedComponents$loadArgsChanged <- TRUE + } + + # Check study population + oldStudyPopHash <- self$hashStudyPopArgs(oldAna$createStudyPopulationArgs) + newStudyPopHash <- self$hashStudyPopArgs(newAna$createStudyPopulationArgs) + if (oldStudyPopHash != newStudyPopHash) { + changedComponents$studyPopArgsChanged <- TRUE + } + + # Check propensity score + oldPsHash <- self$hashPsArgs(oldAna$createPsArgs) + newPsHash <- self$hashPsArgs(newAna$createPsArgs) + if (oldPsHash != newPsHash) { + changedComponents$psArgsChanged <- TRUE + } + + # Check stratification (trim/match/stratify all affect strata) + oldTrimHash <- self$hashTrimByPsArgs(oldAna$trimByPsArgs) + newTrimHash <- self$hashTrimByPsArgs(newAna$trimByPsArgs) + oldMatchHash <- self$hashMatchOnPsArgs(oldAna$matchOnPsArgs) + newMatchHash <- self$hashMatchOnPsArgs(newAna$matchOnPsArgs) + oldStratifyHash <- self$hashStratifyByPsArgs(oldAna$stratifyByPsArgs) + newStratifyHash <- self$hashStratifyByPsArgs(newAna$stratifyByPsArgs) + + if (oldTrimHash != newTrimHash || oldMatchHash != newMatchHash || oldStratifyHash != newStratifyHash) { + changedComponents$strataArgsChanged <- TRUE + } + + # Check outcome model + oldOutcomeHash <- self$hashOutcomeModelArgs(oldAna$fitOutcomeModelArgs) + newOutcomeHash <- self$hashOutcomeModelArgs(newAna$fitOutcomeModelArgs) + if (oldOutcomeHash != newOutcomeHash) { + changedComponents$outcomeModelArgsChanged <- TRUE + } + + # Check balance args + oldBalanceHash <- self$hashBalanceArgs(oldAna$computeSharedCovariateBalanceArgs) + newBalanceHash <- self$hashBalanceArgs(newAna$computeSharedCovariateBalanceArgs) + if (oldBalanceHash != newBalanceHash) { + changedComponents$balanceArgsChanged <- TRUE + } + } + } + + # Check if outcomes of interest or excluded analyses changed + oldOutcomes <- oldSpecs$targetComparatorOutcomesList + newOutcomes <- newSpecs$targetComparatorOutcomesList + + if (length(oldOutcomes) != length(newOutcomes)) { + changedComponents$analyticsChanged <- TRUE + } else { + # Check if outcomes within TCOs differ + for (i in seq_along(newOutcomes)) { + oldTco <- oldOutcomes[[i]] + newTco <- newOutcomes[[i]] + + # Simple check: compare target, comparator, nesting cohort + if (oldTco$targetId != newTco$targetId || + oldTco$comparatorId != newTco$comparatorId || + !identical(oldTco$nestingCohortId, newTco$nestingCohortId)) { + changedComponents$analyticsChanged <- TRUE + break + } + + # Check outcome changes (new outcomes added or old ones removed) + oldOutcomeIds <- sort(sapply(oldTco$outcomes, function(x) x$outcomeId)) + newOutcomeIds <- sort(sapply(newTco$outcomes, function(x) x$outcomeId)) + if (!identical(oldOutcomeIds, newOutcomeIds)) { + # Just outcomes added: no need to delete + # Outcomes removed: potentially need to delete + if (any(!(newOutcomeIds %in% oldOutcomeIds))) { + # New outcomes added - this is OK, don't mark as changed + } else { + # Outcomes removed - this means regeneration may be needed + changedComponents$analyticsChanged <- TRUE + } + } + } + } + + return(changedComponents) + } + ) +) diff --git a/R/ValidateArtifact.R b/R/ValidateArtifact.R new file mode 100644 index 0000000..708822d --- /dev/null +++ b/R/ValidateArtifact.R @@ -0,0 +1,150 @@ +# Copyright 2026 Observational Health Data Sciences and Informatics +# +# This file is part of CohortMethod +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' Validate Artifact +#' +#' @description +#' Utility functions to validate that cached artifacts match expected parameter hashes. +#' +#' @details +#' These functions help determine whether an existing artifact can be reused by +#' validating that its parameters match the hash encoded in the filename or metadata. +#' +#' @keywords internal +#' +#' @export +ValidateArtifact <- R6::R6Class( + "ValidateArtifact", + public = list( + #' @description + #' Extract hash from a filename + #' + #' @param filename Character string with the artifact filename + #' + #' @return + #' Character string containing the extracted hash, or NA if not found + #' + #' @details + #' Filenames typically have format like: + #' - CmData_l{loadId}_{tcn}.zip + #' - StudyPop_l{loadId}_s{studyPopId}_{tcn}_o{outcomeId}.rds + #' + #' This function extracts numeric IDs which are used as hashes. + extractHashFromFilename = function(filename) { + # This is a placeholder - actual hash extraction depends on filename format + # In CohortMethod, the "ID" (like loadId, studyPopId) is the hash + # This would be enhanced based on actual filename patterns used + + # For now, return NA - the actual validation happens at file existence level + return(NA_character_) + }, + + #' @description + #' Validate that an artifact file exists and matches expected settings + #' + #' @param filepath Character string with the full path to the artifact file + #' @param settingsHash Character string with the expected settings hash + #' + #' @return + #' Logical TRUE if file exists and hash matches, FALSE otherwise + #' + #' @details + #' This function performs basic validation. In the current implementation, + #' we rely on filename encoding to ensure the hash is embedded in the filename. + #' If the file exists and has the right naming pattern, we consider it valid. + validateArtifactFile = function(filepath, settingsHash = NULL) { + # Check if file exists + if (!file.exists(filepath)) { + return(FALSE) + } + + # If no hash provided, just check existence + if (is.null(settingsHash)) { + return(TRUE) + } + + # Extract filename + filename <- basename(filepath) + + # In current CohortMethod implementation, the hash is encoded in the filename + # through the ID system (loadId, studyPopId, etc.) + # Validation consists of checking that the file exists with correct naming + + # For now, assume existence + correct naming = valid artifact + # This could be enhanced with actual hash extraction and comparison + + return(TRUE) + }, + + #' @description + #' Check if a cohort method data file is valid and reusable + #' + #' @param filepath Character string with the path to the CmData zip file + #' @param expectedLoadId Numeric ID expected for this data loading configuration + #' + #' @return + #' Logical TRUE if file exists and corresponds to the expected load ID + validateCohortMethodData = function(filepath, expectedLoadId) { + if (!file.exists(filepath)) { + return(FALSE) + } + + # Check that file is readable as a zip + tryCatch({ + # Try to get zip file list to verify integrity + zip::zip_list(filepath) + return(TRUE) + }, error = function(e) { + return(FALSE) + }) + }, + + #' @description + #' List all analysis artifacts in a directory by type + #' + #' @param outputFolder Character string with the output folder path + #' @param artifactType Character string indicating artifact type: + #' "cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model" + #' + #' @return + #' Character vector of full paths to matching artifact files + listArtifactsByType = function(outputFolder, artifactType) { + checkmate::assertCharacter(outputFolder, len = 1) + checkmate::assertCharacter(artifactType, len = 1) + + if (!dir.exists(outputFolder)) { + return(character()) + } + + patterns <- list( + cmdata = "^CmData_.*\\.zip$", + studypop = "^StudyPop_.*\\.rds$", + ps = "^Ps_.*\\.rds$", + stratpop = "^StratPop_.*\\.rds$", + balance = "^Balance_.*\\.rds$", + outcome_model = "^Analysis_[0-9]+/om.*\\.rds$" + ) + + pattern <- patterns[[tolower(artifactType)]] + if (is.null(pattern)) { + stop("Unknown artifact type: ", artifactType) + } + + files <- list.files(outputFolder, pattern = pattern, full.names = TRUE) + return(files) + } + ) +) diff --git a/man/InvalidationPolicy.Rd b/man/InvalidationPolicy.Rd new file mode 100644 index 0000000..45834eb --- /dev/null +++ b/man/InvalidationPolicy.Rd @@ -0,0 +1,94 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/InvalidationPolicy.R +\name{InvalidationPolicy} +\alias{InvalidationPolicy} +\title{Invalidation Policy} +\description{ +Determines which artifacts should be deleted when analysis settings change, +based on dependency relationships between components. +} +\details{ +When settings change, not all artifacts need to be regenerated. This class +implements the logic to determine minimal invalidation scope based on which +settings components have changed. +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-InvalidationPolicy-computeInvalidationScope}{\code{InvalidationPolicy$computeInvalidationScope()}} +\item \href{#method-InvalidationPolicy-getInvalidationMessage}{\code{InvalidationPolicy$getInvalidationMessage()}} +\item \href{#method-InvalidationPolicy-clone}{\code{InvalidationPolicy$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-InvalidationPolicy-computeInvalidationScope}{}}} +\subsection{Method \code{computeInvalidationScope()}}{ +Compute which file patterns should be deleted based on changed components +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{InvalidationPolicy$computeInvalidationScope(changedComponents)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{changedComponents}}{A list (as returned by SettingsHasher$compareSettingsComponents) +with logical values for each component} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +Deletion cascade logic: +\itemize{ +\item If loadArgsChanged: all downstream artifacts (everything) +\item Else if studyPopArgsChanged: populations, PS models, strata, outcomes +\item Else if psArgsChanged: PS models, strata, outcomes +\item Else if strataArgsChanged: strata and outcomes +\item Else if outcomeModelArgsChanged: outcome models only +\item Else if only balanceArgsChanged: balance files only +\item Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific) +} +} + +\subsection{Returns}{ +Character vector of file patterns/globs to delete. Empty vector if no deletion needed. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-InvalidationPolicy-getInvalidationMessage}{}}} +\subsection{Method \code{getInvalidationMessage()}}{ +Get a human-readable message describing what will be deleted +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{InvalidationPolicy$getInvalidationMessage(changedComponents)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{changedComponents}}{A list of changed components} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string suitable for displaying to the user +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-InvalidationPolicy-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{InvalidationPolicy$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/SettingsHasher.Rd b/man/SettingsHasher.Rd new file mode 100644 index 0000000..4e1df70 --- /dev/null +++ b/man/SettingsHasher.Rd @@ -0,0 +1,238 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/SettingsHasher.R +\name{SettingsHasher} +\alias{SettingsHasher} +\title{Settings Hasher} +\description{ +Compute deterministic hashes for individual settings components to detect +which parts of the analysis specification have changed. +} +\details{ +This class hashes each settings component independently using JSON serialization +and MD5 digests. This allows fine-grained detection of which artifact types need +to be invalidated when specifications change. +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-SettingsHasher-hashLoadArgs}{\code{SettingsHasher$hashLoadArgs()}} +\item \href{#method-SettingsHasher-hashStudyPopArgs}{\code{SettingsHasher$hashStudyPopArgs()}} +\item \href{#method-SettingsHasher-hashPsArgs}{\code{SettingsHasher$hashPsArgs()}} +\item \href{#method-SettingsHasher-hashTrimByPsArgs}{\code{SettingsHasher$hashTrimByPsArgs()}} +\item \href{#method-SettingsHasher-hashMatchOnPsArgs}{\code{SettingsHasher$hashMatchOnPsArgs()}} +\item \href{#method-SettingsHasher-hashStratifyByPsArgs}{\code{SettingsHasher$hashStratifyByPsArgs()}} +\item \href{#method-SettingsHasher-hashOutcomeModelArgs}{\code{SettingsHasher$hashOutcomeModelArgs()}} +\item \href{#method-SettingsHasher-hashBalanceArgs}{\code{SettingsHasher$hashBalanceArgs()}} +\item \href{#method-SettingsHasher-compareSettingsComponents}{\code{SettingsHasher$compareSettingsComponents()}} +\item \href{#method-SettingsHasher-clone}{\code{SettingsHasher$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashLoadArgs}{}}} +\subsection{Method \code{hashLoadArgs()}}{ +Hash the data loading arguments +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashLoadArgs(getDbCohortMethodDataArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{getDbCohortMethodDataArgs}}{An object of type \code{GetDbCohortMethodDataArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStudyPopArgs}{}}} +\subsection{Method \code{hashStudyPopArgs()}}{ +Hash the study population arguments +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashStudyPopArgs(createStudyPopulationArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{createStudyPopulationArgs}}{An object of type \code{CreateStudyPopulationArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashPsArgs}{}}} +\subsection{Method \code{hashPsArgs()}}{ +Hash the propensity score arguments +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashPsArgs(createPsArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{createPsArgs}}{An object of type \code{CreatePsArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashTrimByPsArgs}{}}} +\subsection{Method \code{hashTrimByPsArgs()}}{ +Hash arguments for propensity score trimming +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashTrimByPsArgs(trimByPsArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{trimByPsArgs}}{An object of type \code{TrimByPsArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashMatchOnPsArgs}{}}} +\subsection{Method \code{hashMatchOnPsArgs()}}{ +Hash arguments for propensity score matching +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashMatchOnPsArgs(matchOnPsArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{matchOnPsArgs}}{An object of type \code{MatchOnPsArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStratifyByPsArgs}{}}} +\subsection{Method \code{hashStratifyByPsArgs()}}{ +Hash arguments for propensity score stratification +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashStratifyByPsArgs(stratifyByPsArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{stratifyByPsArgs}}{An object of type \code{StratifyByPsArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashOutcomeModelArgs}{}}} +\subsection{Method \code{hashOutcomeModelArgs()}}{ +Hash arguments for outcome model fitting +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashOutcomeModelArgs(fitOutcomeModelArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{fitOutcomeModelArgs}}{An object of type \code{FitOutcomeModelArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-hashBalanceArgs}{}}} +\subsection{Method \code{hashBalanceArgs()}}{ +Hash arguments for covariate balance computation +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$hashBalanceArgs(computeCovariateBalanceArgs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{computeCovariateBalanceArgs}}{An object of type \code{ComputeCovariateBalanceArgs}} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character string containing the MD5 hash (32 hex characters) +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-compareSettingsComponents}{}}} +\subsection{Method \code{compareSettingsComponents()}}{ +Compare old and new specifications to identify which components changed +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$compareSettingsComponents(oldSpecs, newSpecs)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{oldSpecs}}{An object of type \code{CmAnalysesSpecifications} (old version)} + +\item{\code{newSpecs}}{An object of type \code{CmAnalysesSpecifications} (new version)} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +A list of logical values indicating which components changed: +\itemize{ +\item loadArgsChanged: whether data loading arguments differ +\item studyPopArgsChanged: whether study population arguments differ +\item psArgsChanged: whether propensity score arguments differ +\item strataArgsChanged: whether stratification arguments (trim/match/stratify) differ +\item outcomeModelArgsChanged: whether outcome model arguments differ +\item balanceArgsChanged: whether balance computation arguments differ +\item analyticsChanged: whether outcome of interest or analysis IDs changed +} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-SettingsHasher-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{SettingsHasher$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/ValidateArtifact.Rd b/man/ValidateArtifact.Rd new file mode 100644 index 0000000..1d1c516 --- /dev/null +++ b/man/ValidateArtifact.Rd @@ -0,0 +1,144 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ValidateArtifact.R +\name{ValidateArtifact} +\alias{ValidateArtifact} +\title{Validate Artifact} +\description{ +Utility functions to validate that cached artifacts match expected parameter hashes. +} +\details{ +These functions help determine whether an existing artifact can be reused by +validating that its parameters match the hash encoded in the filename or metadata. +} +\keyword{internal} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-ValidateArtifact-extractHashFromFilename}{\code{ValidateArtifact$extractHashFromFilename()}} +\item \href{#method-ValidateArtifact-validateArtifactFile}{\code{ValidateArtifact$validateArtifactFile()}} +\item \href{#method-ValidateArtifact-validateCohortMethodData}{\code{ValidateArtifact$validateCohortMethodData()}} +\item \href{#method-ValidateArtifact-listArtifactsByType}{\code{ValidateArtifact$listArtifactsByType()}} +\item \href{#method-ValidateArtifact-clone}{\code{ValidateArtifact$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ValidateArtifact-extractHashFromFilename}{}}} +\subsection{Method \code{extractHashFromFilename()}}{ +Extract hash from a filename +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ValidateArtifact$extractHashFromFilename(filename)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{filename}}{Character string with the artifact filename} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +Filenames typically have format like: +\itemize{ +\item CmData_l{loadId}_{tcn}.zip +\item StudyPop_l{loadId}\emph{s{studyPopId}}{tcn}_o{outcomeId}.rds +} + +This function extracts numeric IDs which are used as hashes. +} + +\subsection{Returns}{ +Character string containing the extracted hash, or NA if not found +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateArtifactFile}{}}} +\subsection{Method \code{validateArtifactFile()}}{ +Validate that an artifact file exists and matches expected settings +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ValidateArtifact$validateArtifactFile(filepath, settingsHash = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{filepath}}{Character string with the full path to the artifact file} + +\item{\code{settingsHash}}{Character string with the expected settings hash} +} +\if{html}{\out{
}} +} +\subsection{Details}{ +This function performs basic validation. In the current implementation, +we rely on filename encoding to ensure the hash is embedded in the filename. +If the file exists and has the right naming pattern, we consider it valid. +} + +\subsection{Returns}{ +Logical TRUE if file exists and hash matches, FALSE otherwise +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateCohortMethodData}{}}} +\subsection{Method \code{validateCohortMethodData()}}{ +Check if a cohort method data file is valid and reusable +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ValidateArtifact$validateCohortMethodData(filepath, expectedLoadId)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{filepath}}{Character string with the path to the CmData zip file} + +\item{\code{expectedLoadId}}{Numeric ID expected for this data loading configuration} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Logical TRUE if file exists and corresponds to the expected load ID +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ValidateArtifact-listArtifactsByType}{}}} +\subsection{Method \code{listArtifactsByType()}}{ +List all analysis artifacts in a directory by type +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ValidateArtifact$listArtifactsByType(outputFolder, artifactType)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{outputFolder}}{Character string with the output folder path} + +\item{\code{artifactType}}{Character string indicating artifact type: +"cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character vector of full paths to matching artifact files +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ValidateArtifact-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ValidateArtifact$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/tests/testthat/test-invalidationPolicy.R b/tests/testthat/test-invalidationPolicy.R new file mode 100644 index 0000000..3674508 --- /dev/null +++ b/tests/testthat/test-invalidationPolicy.R @@ -0,0 +1,288 @@ +context("Artifact caching system") + +# Helper to create covariate settings +getDefaultCovariateSettings <- function() { + return(FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + )) +} + +test_that("SettingsHasher hashes settings consistently", { + # Create two identical settings objects + args1 <- createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings() + ) + args2 <- createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings() + ) + + hasher <- SettingsHasher$new() + + # Hash the same settings twice + hash1 <- hasher$hashLoadArgs(args1) + hash2 <- hasher$hashLoadArgs(args2) + + # Should produce identical hashes + expect_equal(hash1, hash2) + expect_true(is.character(hash1)) + expect_equal(nchar(hash1), 32) # MD5 is 32 hex characters +}) + +test_that("SettingsHasher detects changes in load args", { + hasher <- SettingsHasher$new() + + args1 <- createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings() + ) + args2 <- createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings(), + maxCohortSize = 50000 # Different + ) + + hash1 <- hasher$hashLoadArgs(args1) + hash2 <- hasher$hashLoadArgs(args2) + + expect_true(hash1 != hash2) +}) + +test_that("SettingsHasher detects changes in study population args", { + hasher <- SettingsHasher$new() + + args1 <- createCreateStudyPopulationArgs() + args2 <- createCreateStudyPopulationArgs(minDaysAtRisk = 30) # Different + + hash1 <- hasher$hashStudyPopArgs(args1) + hash2 <- hasher$hashStudyPopArgs(args2) + + expect_true(hash1 != hash2) +}) + +test_that("SettingsHasher handles NULL arguments", { + hasher <- SettingsHasher$new() + + hash <- hasher$hashLoadArgs(NULL) + expect_equal(hash, "") + + hash <- hasher$hashPsArgs(NULL) + expect_equal(hash, "") +}) + +test_that("SettingsHasher compareSettingsComponents works with identical specs", { + hasher <- SettingsHasher$new() + + # Create identical specifications + analysis <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings() + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs() + ) + + specs1 <- createCmAnalysesSpecifications( + cmAnalysisList = list(analysis), + targetComparatorOutcomesList = list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3)) + ) + ) + ) + + specs2 <- createCmAnalysesSpecifications( + cmAnalysisList = list(analysis), + targetComparatorOutcomesList = list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3)) + ) + ) + ) + + changes <- hasher$compareSettingsComponents(specs1, specs2) + + expect_false(any(as.logical(changes))) +}) + +test_that("SettingsHasher compareSettingsComponents detects load args change", { + hasher <- SettingsHasher$new() + + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings() + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs() + ) + + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = getDefaultCovariateSettings(), + maxCohortSize = 50000 + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs() + ) + + specs1 <- createCmAnalysesSpecifications( + cmAnalysisList = list(analysis1), + targetComparatorOutcomesList = list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3)) + ) + ) + ) + + specs2 <- createCmAnalysesSpecifications( + cmAnalysisList = list(analysis2), + targetComparatorOutcomesList = list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3)) + ) + ) + ) + + changes <- hasher$compareSettingsComponents(specs1, specs2) + + expect_true(changes$loadArgsChanged) +}) + +test_that("InvalidationPolicy computes correct deletion scope for load args change", { + policy <- InvalidationPolicy$new() + + changedComponents <- list( + loadArgsChanged = TRUE, + studyPopArgsChanged = FALSE, + psArgsChanged = FALSE, + strataArgsChanged = FALSE, + outcomeModelArgsChanged = FALSE, + balanceArgsChanged = FALSE, + analyticsChanged = FALSE + ) + + patterns <- policy$computeInvalidationScope(changedComponents) + + # Should include all major artifact patterns + expect_true(any(grepl("CmData", patterns))) + expect_true(any(grepl("StudyPop", patterns))) + expect_true(any(grepl("Ps", patterns))) + expect_true(any(grepl("Analysis", patterns))) +}) + +test_that("InvalidationPolicy computes correct deletion scope for study pop args change", { + policy <- InvalidationPolicy$new() + + changedComponents <- list( + loadArgsChanged = FALSE, + studyPopArgsChanged = TRUE, + psArgsChanged = FALSE, + strataArgsChanged = FALSE, + outcomeModelArgsChanged = FALSE, + balanceArgsChanged = FALSE, + analyticsChanged = FALSE + ) + + patterns <- policy$computeInvalidationScope(changedComponents) + + # Should include study pop and downstream + expect_true(any(grepl("StudyPop", patterns))) + expect_true(any(grepl("Ps", patterns))) + expect_true(any(grepl("Analysis", patterns))) + + # But not CmData + expect_false(any(grepl("CmData_", patterns))) +}) + +test_that("InvalidationPolicy computes empty scope for outcome addition only", { + policy <- InvalidationPolicy$new() + + changedComponents <- list( + loadArgsChanged = FALSE, + studyPopArgsChanged = FALSE, + psArgsChanged = FALSE, + strataArgsChanged = FALSE, + outcomeModelArgsChanged = FALSE, + balanceArgsChanged = FALSE, + analyticsChanged = TRUE # But only because new outcomes added + ) + + patterns <- policy$computeInvalidationScope(changedComponents) + + # Should be empty - new outcomes don't require deletion + expect_equal(length(patterns), 0) +}) + +test_that("InvalidationPolicy provides meaningful messages", { + policy <- InvalidationPolicy$new() + + changedComponents <- list( + loadArgsChanged = TRUE, + studyPopArgsChanged = FALSE, + psArgsChanged = FALSE, + strataArgsChanged = FALSE, + outcomeModelArgsChanged = FALSE, + balanceArgsChanged = FALSE, + analyticsChanged = FALSE + ) + + message <- policy$getInvalidationMessage(changedComponents) + expect_true(grepl("Data loading", message, ignore.case = TRUE)) + expect_true(is.character(message)) +}) + +test_that("ValidateArtifact can list artifacts by type", { + outputFolder <- tempfile(pattern = "cmArtifacts") + dir.create(outputFolder) + + on.exit(unlink(outputFolder, recursive = TRUE)) + + # Create some test files + file.create(file.path(outputFolder, "CmData_l1_t1_c2.zip")) + file.create(file.path(outputFolder, "StudyPop_l1_s1_t1_c2_o3.rds")) + file.create(file.path(outputFolder, "Ps_l1_p1_t1_c2.rds")) + + validator <- ValidateArtifact$new() + + cmdata <- validator$listArtifactsByType(outputFolder, "cmdata") + expect_equal(length(cmdata), 1) + expect_true(grepl("CmData", cmdata)) + + studypop <- validator$listArtifactsByType(outputFolder, "studypop") + expect_equal(length(studypop), 1) + expect_true(grepl("StudyPop", studypop)) + + ps <- validator$listArtifactsByType(outputFolder, "ps") + expect_equal(length(ps), 1) + expect_true(grepl("Ps", ps)) +}) + +test_that("ValidateArtifact can validate artifact file existence", { + outputFolder <- tempfile(pattern = "cmArtifacts") + dir.create(outputFolder) + + on.exit(unlink(outputFolder, recursive = TRUE)) + + validator <- ValidateArtifact$new() + + # Non-existent file + result <- validator$validateArtifactFile(file.path(outputFolder, "nonexistent.zip")) + expect_false(result) + + # Create a file + testFile <- file.path(outputFolder, "test.zip") + file.create(testFile) + + # Existing file + result <- validator$validateArtifactFile(testFile) + expect_true(result) +}) From a8ade8fadaedd0c5944d9409ddfcfafe426dca0c Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Thu, 30 Apr 2026 07:22:25 -0700 Subject: [PATCH 02/10] Simpliofied approach to be more similar to SCCS and include database ids within checksums --- NAMESPACE | 2 + NEWS.md | 12 + R/ArtifactStore.R | 174 ++++++++ R/InvalidationPolicy.R | 142 ------- R/RunAnalyses.R | 515 +++++++++-------------- R/SettingsHasher.R | 285 ------------- R/ValidateArtifact.R | 150 ------- man/InvalidationPolicy.Rd | 94 ----- man/SettingsHasher.Rd | 238 ----------- man/ValidateArtifact.Rd | 144 ------- tests/testthat/test-artifactStore.R | 155 +++++++ tests/testthat/test-eunomia.R | 18 +- tests/testthat/test-invalidationPolicy.R | 288 ------------- 13 files changed, 565 insertions(+), 1652 deletions(-) create mode 100644 R/ArtifactStore.R delete mode 100644 R/InvalidationPolicy.R delete mode 100644 R/SettingsHasher.R delete mode 100644 R/ValidateArtifact.R delete mode 100644 man/InvalidationPolicy.Rd delete mode 100644 man/SettingsHasher.Rd delete mode 100644 man/ValidateArtifact.Rd create mode 100644 tests/testthat/test-artifactStore.R delete mode 100644 tests/testthat/test-invalidationPolicy.R diff --git a/NAMESPACE b/NAMESPACE index 09da7dc..6056ad0 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -4,6 +4,8 @@ S3method(coef,OutcomeModel) S3method(confint,OutcomeModel) S3method(print,OutcomeModel) S3method(print,summary.CohortMethodData) +export(ArtifactStore) +export(LocalArtifactStore) export(adjustedKm) export(checkCmInstallation) export(computeCovariateBalance) diff --git a/NEWS.md b/NEWS.md index ef86558..62ed9a6 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,15 @@ +CohortMethod 6.0.3 (Development) +================================ + +Features: + +- Added content-addressable artifact caching system that preserves intermediate computation results when adding new outcomes or changing non-critical settings. Filenames are derived from SHA-256 hashes of all parameters that determine their content (including `databaseId`), so changing upstream settings naturally produces new filenames and unchanged settings reuse existing files. + +- New `databaseId` parameter (required) in `runCmAnalyses()` prevents accidental reuse of cached results from a different database. + +- Pluggable `ArtifactStore` interface with `LocalArtifactStore` default implementation, enabling future custom storage backends (S3, HTTP, etc.). + + CohortMethod 6.0.2 ================== diff --git a/R/ArtifactStore.R b/R/ArtifactStore.R new file mode 100644 index 0000000..542e499 --- /dev/null +++ b/R/ArtifactStore.R @@ -0,0 +1,174 @@ +# Copyright 2026 Observational Health Data Sciences and Informatics +# +# This file is part of CohortMethod +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#' @title Abstract Artifact Store +#' +#' @description +#' Abstract R6 interface for storing and retrieving analysis artifacts. +#' Subclass this to implement custom storage backends (e.g., S3, database). +#' +#' @export +ArtifactStore <- R6::R6Class( + "ArtifactStore", + public = list( + #' @description Check if an artifact exists. + #' @param key Character. The artifact key (relative path). + #' @return Logical. + exists = function(key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Read an RDS artifact. + #' @param key Character. The artifact key. + #' @return The deserialized R object. + readRDS = function(key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Save an RDS artifact. + #' @param object The R object to save. + #' @param key Character. The artifact key. + saveRDS = function(object, key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Read an Andromeda (zip) artifact. + #' @param key Character. The artifact key. + #' @return An Andromeda object. + readAndromeda = function(key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Save an Andromeda (zip) artifact. + #' @param object The Andromeda object to save. + #' @param key Character. The artifact key. + saveAndromeda = function(object, key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description List artifacts matching a prefix. + #' @param prefix Character or NULL. Filter to keys starting with this prefix. + #' @return Character vector of keys. + listArtifacts = function(prefix = NULL) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Delete an artifact. + #' @param key Character. The artifact key. + delete = function(key) { + stop("Abstract method: must be implemented by subclass") + }, + + #' @description Ensure a directory exists for a given key. + #' @param key Character. The artifact key. + ensureDir = function(key) { + stop("Abstract method: must be implemented by subclass") + } + ) +) + +#' @title Local Filesystem Artifact Store +#' +#' @description +#' Implementation of [ArtifactStore] that reads and writes artifacts to a local +#' filesystem directory. This is the default storage backend. +#' +#' @export +LocalArtifactStore <- R6::R6Class( + "LocalArtifactStore", + inherit = ArtifactStore, + public = list( + #' @description Create a new local artifact store. + #' @param basePath Character. The root directory for artifact storage. + initialize = function(basePath) { + private$basePath <- basePath + if (!dir.exists(basePath)) { + dir.create(basePath, recursive = TRUE) + } + }, + + #' @description Check if an artifact exists. + #' @param key Character. The artifact key (relative path). + #' @return Logical. + exists = function(key) { + file.exists(file.path(private$basePath, key)) + }, + + #' @description Read an RDS artifact. + #' @param key Character. The artifact key. + #' @return The deserialized R object. + readRDS = function(key) { + base::readRDS(file.path(private$basePath, key)) + }, + + #' @description Save an RDS artifact. + #' @param object The R object to save. + #' @param key Character. The artifact key. + saveRDS = function(object, key) { + base::saveRDS(object, file.path(private$basePath, key)) + }, + + #' @description Read an Andromeda (zip) artifact. + #' @param key Character. The artifact key. + #' @return An Andromeda object. + readAndromeda = function(key) { + loadCohortMethodData(file.path(private$basePath, key)) + }, + + #' @description Save an Andromeda (zip) artifact. + #' @param object The Andromeda object to save. + #' @param key Character. The artifact key. + saveAndromeda = function(object, key) { + saveCohortMethodData(object, file.path(private$basePath, key)) + }, + + #' @description List artifacts matching a prefix. + #' @param prefix Character or NULL. Filter to keys starting with this prefix. + #' @return Character vector of keys. + listArtifacts = function(prefix = NULL) { + files <- list.files(private$basePath, recursive = TRUE) + if (!is.null(prefix)) { + files <- files[startsWith(files, prefix)] + } + files + }, + + #' @description Delete an artifact. + #' @param key Character. The artifact key. + delete = function(key) { + unlink(file.path(private$basePath, key)) + }, + + #' @description Ensure a directory exists for a given key. + #' @param key Character. The artifact key. + ensureDir = function(key) { + dir <- dirname(file.path(private$basePath, key)) + if (!dir.exists(dir)) { + dir.create(dir, recursive = TRUE) + } + }, + + #' @description Get the full filesystem path for a key. + #' @param key Character. The artifact key. + #' @return Character. The full path. + getFullPath = function(key) { + file.path(private$basePath, key) + } + ), + private = list( + basePath = NULL + ) +) diff --git a/R/InvalidationPolicy.R b/R/InvalidationPolicy.R deleted file mode 100644 index 9934bea..0000000 --- a/R/InvalidationPolicy.R +++ /dev/null @@ -1,142 +0,0 @@ -# Copyright 2026 Observational Health Data Sciences and Informatics -# -# This file is part of CohortMethod -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Invalidation Policy -#' -#' @description -#' Determines which artifacts should be deleted when analysis settings change, -#' based on dependency relationships between components. -#' -#' @details -#' When settings change, not all artifacts need to be regenerated. This class -#' implements the logic to determine minimal invalidation scope based on which -#' settings components have changed. -#' -#' @export -InvalidationPolicy <- R6::R6Class( - "InvalidationPolicy", - public = list( - #' @description - #' Compute which file patterns should be deleted based on changed components - #' - #' @param changedComponents A list (as returned by SettingsHasher$compareSettingsComponents) - #' with logical values for each component - #' - #' @return - #' Character vector of file patterns/globs to delete. Empty vector if no deletion needed. - #' - #' @details - #' Deletion cascade logic: - #' - If loadArgsChanged: all downstream artifacts (everything) - #' - Else if studyPopArgsChanged: populations, PS models, strata, outcomes - #' - Else if psArgsChanged: PS models, strata, outcomes - #' - Else if strataArgsChanged: strata and outcomes - #' - Else if outcomeModelArgsChanged: outcome models only - #' - Else if only balanceArgsChanged: balance files only - #' - Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific) - computeInvalidationScope = function(changedComponents) { - checkmate::assertList(changedComponents) - - filePatternsToDelete <- c() - - # Cascade of invalidation based on dependency graph - if (isTRUE(changedComponents$loadArgsChanged)) { - # If data loading changed, everything depends on it, so delete all analysis artifacts - filePatternsToDelete <- c( - filePatternsToDelete, - "CmData_.*\\.zip$", # All CohortMethodData - "StudyPop_.*\\.rds$", # All study populations - "Ps_.*\\.rds$", # All propensity scores - "StratPop_.*\\.rds$", # All stratified populations - "Balance_.*\\.rds$", # All balance files - "Analysis_[0-9]+$" # All analysis folders (contains outcome models) - ) - } else if (isTRUE(changedComponents$studyPopArgsChanged)) { - # Study population arguments changed: recompute populations and everything downstream - filePatternsToDelete <- c( - filePatternsToDelete, - "StudyPop_.*\\.rds$", # All study populations - "Ps_.*\\.rds$", # All PS (may depend on population) - "StratPop_.*\\.rds$", # All stratified populations - "Balance_.*\\.rds$", # All balance files - "Analysis_[0-9]+$" # All outcome models - ) - } else if (isTRUE(changedComponents$psArgsChanged)) { - # Propensity score arguments changed: recompute PS and downstream - filePatternsToDelete <- c( - filePatternsToDelete, - "Ps_.*\\.rds$", # All PS models - "StratPop_.*\\.rds$", # All stratified populations (depend on PS) - "Balance_.*\\.rds$", # All balance files - "Analysis_[0-9]+$" # All outcome models - ) - } else if (isTRUE(changedComponents$strataArgsChanged)) { - # Stratification (trim/match/stratify) changed: recompute strata and downstream - filePatternsToDelete <- c( - filePatternsToDelete, - "StratPop_.*\\.rds$", # All stratified populations - "Balance_.*\\.rds$", # All balance files - "Analysis_[0-9]+$" # All outcome models - ) - } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) { - # Outcome model arguments changed: recompute outcome models only - filePatternsToDelete <- c( - filePatternsToDelete, - "Analysis_[0-9]+$" # All outcome model folders - ) - } else if (isTRUE(changedComponents$balanceArgsChanged)) { - # Balance computation arguments changed: delete balance files only - filePatternsToDelete <- c( - filePatternsToDelete, - "Balance_.*\\.rds$" # All balance files - ) - } - # If analyticsChanged but nothing else changed (e.g., only outcomes added): - # No deletion needed - outcome models are outcome-specific, new ones will be computed - - return(filePatternsToDelete) - }, - - #' @description - #' Get a human-readable message describing what will be deleted - #' - #' @param changedComponents A list of changed components - #' - #' @return - #' Character string suitable for displaying to the user - getInvalidationMessage = function(changedComponents) { - checkmate::assertList(changedComponents) - - if (isTRUE(changedComponents$loadArgsChanged)) { - return("Data loading arguments have changed. All analysis artifacts must be regenerated.") - } else if (isTRUE(changedComponents$studyPopArgsChanged)) { - return("Study population settings have changed. Study populations, propensity scores, and outcome models must be regenerated.") - } else if (isTRUE(changedComponents$psArgsChanged)) { - return("Propensity score settings have changed. Propensity scores and outcome models must be regenerated.") - } else if (isTRUE(changedComponents$strataArgsChanged)) { - return("Stratification settings (trimming/matching/stratifying) have changed. Stratified populations and outcome models must be regenerated.") - } else if (isTRUE(changedComponents$outcomeModelArgsChanged)) { - return("Outcome model settings have changed. Outcome models must be regenerated.") - } else if (isTRUE(changedComponents$balanceArgsChanged)) { - return("Covariate balance settings have changed. Balance files must be recomputed.") - } else if (isTRUE(changedComponents$analyticsChanged)) { - return("Analyses have changed (new outcomes or studies added). New analyses will be computed.") - } else { - return("No artifacts need to be deleted.") - } - } - ) -) diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R index 2f04330..42507af 100644 --- a/R/RunAnalyses.R +++ b/R/RunAnalyses.R @@ -171,6 +171,12 @@ createDefaultMultiThreadingSettings <- function(maxCores) { #' [createDefaultMultiThreadingSettings()] functions. #' @param cmAnalysesSpecifications An object of type `CmAnalysesSpecifications` as created using #' the `createCmAnalysesSpecifications()`. +#' @param databaseId A unique identifier for the database being used. This is +#' baked into artifact hashes to prevent accidental reuse of +#' cached results from a different database. Required. +#' @param artifactStore An object inheriting from [ArtifactStore] used to read and +#' write cached artifacts. If NULL (default), a +#' [LocalArtifactStore] backed by `outputFolder` is used. #' #' @return #' A tibble describing for each target-comparator-outcome-analysisId combination where the intermediary and @@ -188,7 +194,9 @@ runCmAnalyses <- function(connectionDetails, nestingCohortTable = "cohort", outputFolder = "./CohortMethodOutput", multiThreadingSettings = createMultiThreadingSettings(), - cmAnalysesSpecifications) { + cmAnalysesSpecifications, + databaseId, + artifactStore = NULL) { errorMessages <- checkmate::makeAssertCollection() checkmate::assertClass(connectionDetails, "ConnectionDetails", add = errorMessages) checkmate::assertCharacter(cdmDatabaseSchema, len = 1, add = errorMessages) @@ -202,91 +210,47 @@ runCmAnalyses <- function(connectionDetails, checkmate::assertCharacter(outputFolder, len = 1, add = errorMessages) checkmate::assertClass(multiThreadingSettings, "CmMultiThreadingSettings", add = errorMessages) checkmate::assertR6(cmAnalysesSpecifications, "CmAnalysesSpecifications", add = errorMessages) + checkmate::assertCharacter(databaseId, len = 1, min.chars = 1, add = errorMessages) checkmate::reportAssertions(collection = errorMessages) outputFolder <- normalizePath(outputFolder, mustWork = FALSE) - cmAnalysesSpecificationsFile <- file.path(outputFolder, "cmAnalysesSpecifications.rds") - if (file.exists(cmAnalysesSpecificationsFile)) { - oldCmAnalysesSpecifications <- readRDS(cmAnalysesSpecificationsFile) - if (!isTRUE(all.equal(oldCmAnalysesSpecifications$toList(), cmAnalysesSpecifications$toList()))) { - # Use intelligent invalidation policy to determine what needs to be deleted - hasher <- SettingsHasher$new() - changedComponents <- hasher$compareSettingsComponents( - oldCmAnalysesSpecifications, - cmAnalysesSpecifications - ) - - if (any(as.logical(changedComponents))) { - # Determine minimal set of files to delete based on what changed - policy <- InvalidationPolicy$new() - filePatternsToDelete <- policy$computeInvalidationScope(changedComponents) - - if (length(filePatternsToDelete) > 0) { - # Get human-readable message about what will be deleted - message(sprintf("Output files already exist in '%s', but the analysis settings have changed.", outputFolder)) - message(policy$getInvalidationMessage(changedComponents)) - - # Show some affected files to user - affectedFiles <- c() - for (pattern in filePatternsToDelete) { - matchedFiles <- list.files(outputFolder, pattern = pattern, full.names = FALSE) - affectedFiles <- c(affectedFiles, matchedFiles) - } + if (!dir.exists(outputFolder)) { + dir.create(outputFolder, recursive = TRUE) + } - if (length(affectedFiles) > 0) { - message(sprintf( - "Files to be deleted: %d file(s) matching %d pattern(s)", - length(affectedFiles), - length(filePatternsToDelete) - )) - if (length(affectedFiles) <= 10) { - for (f in affectedFiles) { - message(sprintf(" - %s", f)) - } - } else { - for (f in head(affectedFiles, 5)) { - message(sprintf(" - %s", f)) - } - message(sprintf(" ... and %d more files", length(affectedFiles) - 5)) - } - } + # Initialize artifact store + if (is.null(artifactStore)) { + artifactStore <- LocalArtifactStore$new(outputFolder) + } - response <- utils::askYesNo("Do you want to delete these artifacts before proceeding?") - if (is.na(response)) { - # Cancel: - return() - } else if (response == TRUE) { - # Delete files matching patterns - for (pattern in filePatternsToDelete) { - files <- list.files(outputFolder, pattern = pattern, full.names = TRUE, recursive = TRUE) - if (length(files) > 0) { - unlink(files) - } - # Also try to delete directories matching pattern - dirs <- list.dirs(outputFolder, full.names = TRUE, recursive = FALSE) - matchingDirs <- dirs[grep(pattern, basename(dirs))] - if (length(matchingDirs) > 0) { - unlink(matchingDirs, recursive = TRUE) - } - } - } - } - # Clear cache regardless (old cache may reference deleted files) - rm(list = ls(envir = cache), envir = cache) - } + # Check for database ID mismatch against previously cached results + databaseIdFile <- "databaseId.rds" + if (artifactStore$exists(databaseIdFile)) { + previousDatabaseId <- artifactStore$readRDS(databaseIdFile) + if (!identical(previousDatabaseId, databaseId)) { + stop(sprintf( + paste("Database ID mismatch: output folder was previously used with databaseId '%s',", + "but now '%s' was provided. To reuse this folder with a different database,", + "delete the existing output folder first."), + previousDatabaseId, + databaseId + )) } + } else { + artifactStore$saveRDS(databaseId, databaseIdFile) } - if (!dir.exists(outputFolder)) { - dir.create(outputFolder, recursive = TRUE) - } - saveRDS(cmAnalysesSpecifications, cmAnalysesSpecificationsFile) + + # Save specifications for reference + artifactStore$saveRDS(cmAnalysesSpecifications, "cmAnalysesSpecifications.rds") + referenceTable <- createReferenceTable( cmAnalysisList = cmAnalysesSpecifications$cmAnalysisList, targetComparatorOutcomesList = cmAnalysesSpecifications$targetComparatorOutcomesList, analysesToExclude = cmAnalysesSpecifications$analysesToExclude, outputFolder = outputFolder, refitPsForEveryOutcome = cmAnalysesSpecifications$refitPsForEveryOutcome, - refitPsForEveryStudyPopulation = cmAnalysesSpecifications$refitPsForEveryStudyPopulation + refitPsForEveryStudyPopulation = cmAnalysesSpecifications$refitPsForEveryStudyPopulation, + databaseId = databaseId ) referenceTable |> select(-"includedCovariateConceptIds", "excludedCovariateConceptIds") |> @@ -1106,7 +1070,8 @@ createReferenceTable <- function(cmAnalysisList, analysesToExclude, outputFolder, refitPsForEveryOutcome, - refitPsForEveryStudyPopulation) { + refitPsForEveryStudyPopulation, + databaseId = "") { # Create all rows per target-comparator-outcome-analysis combination: convertAnalysisToTable <- function(analysis) { tibble( @@ -1161,11 +1126,20 @@ createReferenceTable <- function(cmAnalysisList, loadArgsId = match(loadArgsJsons, uniqueLoadArgsJsons) ) referenceTable <- inner_join(referenceTable, analysisIdToLoadArgsId, by = "analysisId") + + # Content-addressable hash includes databaseId to prevent cross-database cache reuse + referenceTable$loadHash <- vapply(seq_len(nrow(referenceTable)), function(i) { + .contentHash( + databaseId, + loadArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], + referenceTable$targetId[i], + referenceTable$comparatorId[i], + referenceTable$nestingCohortId[i] + ) + }, character(1)) + referenceTable$cohortMethodDataFile <- .createCohortMethodDataFileName( - loadId = referenceTable$loadArgsId, - targetId = referenceTable$targetId, - comparatorId = referenceTable$comparatorId, - nestingCohortId = referenceTable$nestingCohortId + loadHash = referenceTable$loadHash ) # Add studypop filenames @@ -1177,14 +1151,17 @@ createReferenceTable <- function(cmAnalysisList, studyPopArgsId = match(studyPopArgsJsons, uniqueStudyPopArgsJsons) ) referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsId, by = "analysisId") - referenceTable$studyPopFile <- .createStudyPopulationFileName( - loadId = referenceTable$loadArgsId, - studyPopId = referenceTable$studyPopArgsId, - targetId = referenceTable$targetId, - comparatorId = referenceTable$comparatorId, - nestingCohortId = referenceTable$nestingCohortId, - outcomeId = referenceTable$outcomeId - ) + + # Content-addressable hash for study population files + referenceTable$studyPopFile <- vapply(seq_len(nrow(referenceTable)), function(i) { + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], + referenceTable$outcomeId[i] + ) + .createStudyPopulationFileName(hash) + }, character(1)) # Add PS filenames createPsArgsJsons <- lapply( @@ -1200,15 +1177,17 @@ createReferenceTable <- function(cmAnalysisList, noPsIds <- which(uniqueCreatePsArgsJsons == "") idxWithPs <- !(referenceTable$psArgsId %in% noPsIds) referenceTable$psFile <- "" - referenceTable$psFile[idxWithPs] <- .createPsOutcomeFileName( - loadId = referenceTable$loadArgsId[idxWithPs], - studyPopId = referenceTable$studyPopArgsId[idxWithPs], - psId = referenceTable$psArgsId[idxWithPs], - targetId = referenceTable$targetId[idxWithPs], - comparatorId = referenceTable$comparatorId[idxWithPs], - nestingCohortId = referenceTable$nestingCohortId[idxWithPs], - outcomeId = referenceTable$outcomeId[idxWithPs] - ) + referenceTable$psFile[idxWithPs] <- vapply(which(idxWithPs), function(i) { + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], + createPsArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], + referenceTable$outcomeId[i] + ) + .createPsFileName(hash) + }, character(1)) + referenceTable$sharedPsFile <- "" if (!refitPsForEveryOutcome) { if (refitPsForEveryStudyPopulation) { @@ -1227,39 +1206,39 @@ createReferenceTable <- function(cmAnalysisList, return(TRUE) } } - studyPopArgsEquivalentId <- seq_along(cmAnalysisList) + # Compute the canonical (first equivalent) studyPop JSON for hashing + studyPopArgsEquivalentIdx <- seq_along(cmAnalysisList) for (i in seq_along(cmAnalysisList)) { for (j in seq_len(i - 1)) { if (equivalent(cmAnalysisList[[i]]$createStudyPopulationArgs, cmAnalysisList[[j]]$createStudyPopulationArgs)) { - studyPopArgsEquivalentId[i] <- j + studyPopArgsEquivalentIdx[i] <- j break } } } - analysisIdToStudyPopArgsEquivalentId <- tibble( - analysisId = analyses$analysisId, - studyPopArgsEquivalentId = studyPopArgsEquivalentId - ) - referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsEquivalentId, by = "analysisId") - referenceTable$sharedPsFile[idxWithPs] <- .createPsFileName( - loadId = referenceTable$loadArgsId[idxWithPs], - studyPopId = referenceTable$studyPopArgsEquivalentId[idxWithPs], - psId = referenceTable$psArgsId[idxWithPs], - targetId = referenceTable$targetId[idxWithPs], - comparatorId = referenceTable$comparatorId[idxWithPs], - nestingCohortId = referenceTable$nestingCohortId[idxWithPs] - ) + referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + equivIdx <- studyPopArgsEquivalentIdx[aIdx] + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[equivIdx]], + createPsArgsJsons[[aIdx]] + ) + .createPsFileName(hash) + }, character(1)) } else { # One propensity model across all study population settings: - referenceTable$sharedPsFile[idxWithPs] <- .createPsFileName( - loadId = referenceTable$loadArgsId[idxWithPs], - studyPopId = NULL, - psId = referenceTable$psArgsId[idxWithPs], - targetId = referenceTable$targetId[idxWithPs], - comparatorId = referenceTable$comparatorId[idxWithPs], - nestingCohortId = referenceTable$nestingCohortId[idxWithPs] - ) + referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + createPsArgsJsons[[aIdx]] + ) + .createPsFileName(hash) + }, character(1)) } } @@ -1281,16 +1260,18 @@ createReferenceTable <- function(cmAnalysisList, noStrataIds <- which(uniqueStrataArgsJsons == "") idxWithStrata <- !(referenceTable$strataArgsId %in% noStrataIds) referenceTable$strataFile <- "" - referenceTable$strataFile[idxWithStrata] <- .createStratifiedPopFileName( - loadId = referenceTable$loadArgsId[idxWithStrata], - studyPopId = referenceTable$studyPopArgsId[idxWithStrata], - psId = referenceTable$psArgsId[idxWithStrata], - strataId = referenceTable$strataArgsId[idxWithStrata], - targetId = referenceTable$targetId[idxWithStrata], - comparatorId = referenceTable$comparatorId[idxWithStrata], - nestingCohortId = referenceTable$nestingCohortId[idxWithStrata], - outcomeId = referenceTable$outcomeId[idxWithStrata] - ) + referenceTable$strataFile[idxWithStrata] <- vapply(which(idxWithStrata), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[aIdx]], + createPsArgsJsons[[aIdx]], + strataArgsJsons[[aIdx]], + referenceTable$outcomeId[i] + ) + .createStratifiedPopFileName(hash) + }, character(1)) # Add shared covariate balance files (per target-comparator-analysis) if (refitPsForEveryOutcome) { @@ -1301,24 +1282,26 @@ createReferenceTable <- function(cmAnalysisList, function(x) if (is.null(x$computeSharedCovariateBalanceArgs)) "" else x$computeSharedCovariateBalanceArgs$toJson() ) uniqueSharedBalanceArgsJsons <- unique(sharedBalanceArgsJsons) + noSharedBalanceIds <- which(uniqueSharedBalanceArgsJsons == "") analysisIdToSharedBalanceArgsId <- tibble( analysisId = analyses$analysisId, sharedBalanceId = match(sharedBalanceArgsJsons, uniqueSharedBalanceArgsJsons) ) referenceTable <- inner_join(referenceTable, analysisIdToSharedBalanceArgsId, by = "analysisId") - noSharedBalanceIds <- which(uniqueSharedBalanceArgsJsons == "") idxWithSharedBalance <- !(referenceTable$sharedBalanceId %in% noSharedBalanceIds) referenceTable$sharedBalanceFile <- "" - referenceTable$sharedBalanceFile[idxWithSharedBalance] <- .createsharedBalanceFileName( - loadId = referenceTable$loadArgsId[idxWithSharedBalance], - studyPopId = referenceTable$studyPopArgsId[idxWithSharedBalance], - psId = referenceTable$psArgsId[idxWithSharedBalance], - strataId = referenceTable$strataArgsId[idxWithSharedBalance], - sharedBalanceId = referenceTable$sharedBalanceId[idxWithSharedBalance], - targetId = referenceTable$targetId[idxWithSharedBalance], - comparatorId = referenceTable$comparatorId[idxWithSharedBalance], - nestingCohortId = referenceTable$nestingCohortId[idxWithSharedBalance] - ) + referenceTable$sharedBalanceFile[idxWithSharedBalance] <- vapply(which(idxWithSharedBalance), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[aIdx]], + createPsArgsJsons[[aIdx]], + strataArgsJsons[[aIdx]], + sharedBalanceArgsJsons[[aIdx]] + ) + .createSharedBalanceFileName(hash) + }, character(1)) } # Add covariate balance files (per target-comparator-analysis-outcome) @@ -1342,27 +1325,32 @@ createReferenceTable <- function(cmAnalysisList, } idxFilter <- referenceTable$balanceId %in% balanceIdsRequiringFiltering referenceTable$filteredForbalanceFile <- "" - referenceTable$filteredForbalanceFile[idxFilter] <- .createFilterForBalanceFileName( - loadId = referenceTable$loadArgsId[idxFilter], - balanceId = referenceTable$balanceId[idxFilter], - targetId = referenceTable$targetId[idxFilter], - comparatorId = referenceTable$comparatorId[idxFilter], - nestingCohortId = referenceTable$nestingCohortId[idxFilter] - ) + referenceTable$filteredForbalanceFile[idxFilter] <- vapply(which(idxFilter), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + balanceArgsJsons[[aIdx]] + ) + .createFilterForBalanceFileName(hash) + }, character(1)) + noBalanceIds <- which(uniqueBalanceArgsJsons == "") idxWithBalance <- !(referenceTable$balanceId %in% noBalanceIds) referenceTable$balanceFile <- "" - referenceTable$balanceFile[idxWithBalance] <- .createBalanceFileName( - loadId = referenceTable$loadArgsId[idxWithBalance], - studyPopId = referenceTable$studyPopArgsId[idxWithBalance], - psId = referenceTable$psArgsId[idxWithBalance], - strataId = referenceTable$strataArgsId[idxWithBalance], - balanceId = referenceTable$balanceId[idxWithBalance], - targetId = referenceTable$targetId[idxWithBalance], - comparatorId = referenceTable$comparatorId[idxWithBalance], - nestingCohortId = referenceTable$nestingCohortId[idxWithBalance], - outcomeId = referenceTable$outcomeId[idxWithBalance] - ) + referenceTable$balanceFile[idxWithBalance] <- vapply(which(idxWithBalance), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[aIdx]], + createPsArgsJsons[[aIdx]], + strataArgsJsons[[aIdx]], + balanceArgsJsons[[aIdx]], + referenceTable$outcomeId[i] + ) + .createBalanceFileName(hash) + }, character(1)) # Add prefiltered covariate files preFilterArgJsons <- lapply( @@ -1394,27 +1382,36 @@ createReferenceTable <- function(cmAnalysisList, idxWithPreFilter <- !(referenceTable$prefilterId %in% noPreFilterIds) referenceTable$prefilteredCovariatesFile <- "" - referenceTable$prefilteredCovariatesFile[idxWithPreFilter] <- .createPrefilteredCovariatesFileName( - loadId = referenceTable$loadArgsId[idxWithPreFilter], - targetId = referenceTable$targetId[idxWithPreFilter], - comparatorId = referenceTable$comparatorId[idxWithPreFilter], - nestingCohortId = referenceTable$nestingCohortId[idxWithPreFilter], - prefilterId = referenceTable$prefilterId[idxWithPreFilter] - ) + referenceTable$prefilteredCovariatesFile[idxWithPreFilter] <- vapply(which(idxWithPreFilter), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + preFilterArgJsons[[aIdx]] + ) + .createPrefilteredCovariatesFileName(hash) + }, character(1)) # Add outcome model file names - referenceTable <- referenceTable |> - mutate(outcomeModelFile = ifelse(.data$fitOutcomeModel, - .createOutcomeModelFileName( - folder = .data$analysisFolder, - targetId = .data$targetId, - comparatorId = .data$comparatorId, - nestingCohortId = .data$nestingCohortId, - outcomeId = .data$outcomeId - ), - "" - )) + outcomeModelArgsJsons <- lapply(cmAnalysisList, function(x) { + if (is.null(x$fitOutcomeModelArgs)) "" else x$fitOutcomeModelArgs$toJson() + }) + referenceTable$outcomeModelFile <- "" + idxWithOm <- referenceTable$fitOutcomeModel + referenceTable$outcomeModelFile[idxWithOm] <- vapply(which(idxWithOm), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + hash <- .contentHash( + databaseId, + referenceTable$loadHash[i], + studyPopArgsJsons[[aIdx]], + createPsArgsJsons[[aIdx]], + strataArgsJsons[[aIdx]], + outcomeModelArgsJsons[[aIdx]], + referenceTable$outcomeId[i] + ) + .createOutcomeModelFileName(referenceTable$analysisFolder[i], hash) + }, character(1)) # Some cleanup: referenceTable <- referenceTable[, c( @@ -1479,154 +1476,62 @@ createReferenceTable <- function(cmAnalysisList, return(referenceTable) } -.f <- function(x) { - return(format(x, scientific = FALSE, trim = TRUE)) -} - -.createTcnSubstring <- function(targetId, comparatorId, nestingCohortId) { - subString <- if_else(is.na(nestingCohortId), - sprintf("_t%s_c%s", .f(targetId), .f(comparatorId)), - sprintf("_t%s_c%s_n%s", .f(targetId), .f(comparatorId), .f(nestingCohortId))) - - return(subString) -} - -.createCohortMethodDataFileName <- function(loadId, targetId, comparatorId, nestingCohortId) { - name <- sprintf("CmData_l%s%s.zip", - loadId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId)) - return(name) +# Compute a deterministic content hash from arbitrary inputs. +# Used to generate stable file names that depend on settings, not position. +# @param ... Inputs to hash. R6 settings objects are serialized via toList() +# with sorted keys for deterministic ordering. +# @param length Number of hex characters to keep (default 12). +# @return A character string of `length` hex characters. +.contentHash <- function(..., length = 12) { + parts <- list(...) + canonical <- paste(vapply(parts, function(x) { + if (is.null(x) || (is.atomic(x) && length(x) == 1 && is.na(x))) { + "NULL" + } else if (inherits(x, "AbstractSerializableSettings")) { + lst <- x$toList() + as.character(jsonlite::toJSON(lst[order(names(lst))], + auto_unbox = TRUE, digits = NA, null = "null")) + } else { + as.character(jsonlite::toJSON(x, auto_unbox = TRUE, digits = NA, null = "null")) + } + }, character(1)), collapse = "|") + substr(digest::digest(canonical, algo = "sha256", serialize = FALSE), 1, length) } - -.createPrefilteredCovariatesFileName <- function(loadId, targetId, comparatorId, nestingCohortId, prefilterId) { - name <- sprintf("Prefilter_l%s%s_p%s.zip", - loadId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - prefilterId) - name[prefilterId == -1] <- rep("", sum(prefilterId == -1)) - return(name) +.createCohortMethodDataFileName <- function(loadHash) { + sprintf("CmData_%s.zip", loadHash) } -.createStudyPopulationFileName <- function(loadId, - studyPopId, - targetId, - comparatorId, - nestingCohortId, - outcomeId) { - name <- sprintf("StudyPop_l%s_s%s%s_o%s.rds", - loadId, - studyPopId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - .f(outcomeId)) - return(name) +.createPrefilteredCovariatesFileName <- function(hash) { + sprintf("Prefilter_%s.zip", hash) } -.createPsFileName <- function(loadId, studyPopId, psId, targetId, comparatorId, nestingCohortId) { - if (is.null(studyPopId)) { - name <- sprintf("Ps_l%s_p%s%s.rds", - loadId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId)) - } else { - name <- sprintf("Ps_l%s_s%s_p%s%s.rds", - loadId, - studyPopId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId)) - } - return(name) +.createStudyPopulationFileName <- function(hash) { + sprintf("StudyPop_%s.rds", hash) } -.createPsOutcomeFileName <- function(loadId, - studyPopId, - psId, - targetId, - comparatorId, - nestingCohortId, - outcomeId) { - name <- sprintf("Ps_l%s_s%s_p%s%s_o%s.rds", - loadId, - studyPopId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - .f(outcomeId)) - return(name) +.createPsFileName <- function(hash) { + sprintf("Ps_%s.rds", hash) } -.createStratifiedPopFileName <- function(loadId, - studyPopId, - psId, - strataId, - targetId, - comparatorId, - nestingCohortId, - outcomeId) { - name <- sprintf("StratPop_l%s_s%s_p%s%s_s%s_o%s.rds", - loadId, - studyPopId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - strataId, - .f(outcomeId)) - return(name) +.createStratifiedPopFileName <- function(hash) { + sprintf("StratPop_%s.rds", hash) } -.createsharedBalanceFileName <- function(loadId, - studyPopId, - psId, - strataId, - sharedBalanceId, - targetId, - comparatorId, - nestingCohortId) { - name <- sprintf("Balance_l%s_s%s_p%s%s_s%s_b%s.rds", - loadId, - studyPopId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - strataId, - sharedBalanceId) - return(name) +.createSharedBalanceFileName <- function(hash) { + sprintf("SharedBalance_%s.rds", hash) } -.createFilterForBalanceFileName <- function(loadId, - balanceId, - targetId, - comparatorId, - nestingCohortId) { - name <- sprintf("FilterForBalance_l%s%s_b%s.zip", - loadId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - balanceId) - return(name) +.createFilterForBalanceFileName <- function(hash) { + sprintf("FilterForBalance_%s.zip", hash) } -.createBalanceFileName <- function(loadId, - studyPopId, - psId, - strataId, - balanceId, - targetId, - comparatorId, - nestingCohortId, - outcomeId) { - name <- sprintf("Balance_l%s_s%s_p%s%s_s%s_b%s_o%s.rds", - loadId, - studyPopId, - psId, - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - strataId, - balanceId, - .f(outcomeId)) - return(name) +.createBalanceFileName <- function(hash) { + sprintf("Balance_%s.rds", hash) } -.createOutcomeModelFileName <- function(folder, targetId, comparatorId, nestingCohortId, outcomeId) { - name <- sprintf("om%s_o%s.rds", - .createTcnSubstring(targetId, comparatorId, nestingCohortId), - .f(outcomeId)) - return(file.path(folder, name)) +.createOutcomeModelFileName <- function(folder, hash) { + file.path(folder, sprintf("om_%s.rds", hash)) } #' Get file reference diff --git a/R/SettingsHasher.R b/R/SettingsHasher.R deleted file mode 100644 index 70efcac..0000000 --- a/R/SettingsHasher.R +++ /dev/null @@ -1,285 +0,0 @@ -# Copyright 2026 Observational Health Data Sciences and Informatics -# -# This file is part of CohortMethod -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Settings Hasher -#' -#' @description -#' Compute deterministic hashes for individual settings components to detect -#' which parts of the analysis specification have changed. -#' -#' @details -#' This class hashes each settings component independently using JSON serialization -#' and MD5 digests. This allows fine-grained detection of which artifact types need -#' to be invalidated when specifications change. -#' -#' @export -SettingsHasher <- R6::R6Class( - "SettingsHasher", - public = list( - #' @description - #' Hash the data loading arguments - #' - #' @param getDbCohortMethodDataArgs An object of type `GetDbCohortMethodDataArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashLoadArgs = function(getDbCohortMethodDataArgs) { - if (is.null(getDbCohortMethodDataArgs)) { - return("") - } - json <- getDbCohortMethodDataArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash the study population arguments - #' - #' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashStudyPopArgs = function(createStudyPopulationArgs) { - if (is.null(createStudyPopulationArgs)) { - return("") - } - json <- createStudyPopulationArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash the propensity score arguments - #' - #' @param createPsArgs An object of type `CreatePsArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashPsArgs = function(createPsArgs) { - if (is.null(createPsArgs)) { - return("") - } - json <- createPsArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash arguments for propensity score trimming - #' - #' @param trimByPsArgs An object of type `TrimByPsArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashTrimByPsArgs = function(trimByPsArgs) { - if (is.null(trimByPsArgs)) { - return("") - } - json <- trimByPsArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash arguments for propensity score matching - #' - #' @param matchOnPsArgs An object of type `MatchOnPsArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashMatchOnPsArgs = function(matchOnPsArgs) { - if (is.null(matchOnPsArgs)) { - return("") - } - json <- matchOnPsArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash arguments for propensity score stratification - #' - #' @param stratifyByPsArgs An object of type `StratifyByPsArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashStratifyByPsArgs = function(stratifyByPsArgs) { - if (is.null(stratifyByPsArgs)) { - return("") - } - json <- stratifyByPsArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash arguments for outcome model fitting - #' - #' @param fitOutcomeModelArgs An object of type `FitOutcomeModelArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashOutcomeModelArgs = function(fitOutcomeModelArgs) { - if (is.null(fitOutcomeModelArgs)) { - return("") - } - json <- fitOutcomeModelArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Hash arguments for covariate balance computation - #' - #' @param computeCovariateBalanceArgs An object of type `ComputeCovariateBalanceArgs` - #' - #' @return - #' Character string containing the MD5 hash (32 hex characters) - hashBalanceArgs = function(computeCovariateBalanceArgs) { - if (is.null(computeCovariateBalanceArgs)) { - return("") - } - json <- computeCovariateBalanceArgs$toJson() - return(digest::digest(json, algo = "md5")) - }, - - #' @description - #' Compare old and new specifications to identify which components changed - #' - #' @param oldSpecs An object of type `CmAnalysesSpecifications` (old version) - #' @param newSpecs An object of type `CmAnalysesSpecifications` (new version) - #' - #' @return - #' A list of logical values indicating which components changed: - #' - loadArgsChanged: whether data loading arguments differ - #' - studyPopArgsChanged: whether study population arguments differ - #' - psArgsChanged: whether propensity score arguments differ - #' - strataArgsChanged: whether stratification arguments (trim/match/stratify) differ - #' - outcomeModelArgsChanged: whether outcome model arguments differ - #' - balanceArgsChanged: whether balance computation arguments differ - #' - analyticsChanged: whether outcome of interest or analysis IDs changed - compareSettingsComponents = function(oldSpecs, newSpecs) { - checkmate::assertClass(oldSpecs, "CmAnalysesSpecifications") - checkmate::assertClass(newSpecs, "CmAnalysesSpecifications") - - # Compare each analysis component's settings - changedComponents <- list( - loadArgsChanged = FALSE, - studyPopArgsChanged = FALSE, - psArgsChanged = FALSE, - strataArgsChanged = FALSE, - outcomeModelArgsChanged = FALSE, - balanceArgsChanged = FALSE, - analyticsChanged = FALSE - ) - - # Get the old and new analysis lists - oldAnalyses <- oldSpecs$cmAnalysisList - newAnalyses <- newSpecs$cmAnalysisList - - # If the number of analyses differs significantly or can't be matched, mark as changed - if (length(oldAnalyses) != length(newAnalyses)) { - # For now, mark as all changed if list lengths differ - # A more sophisticated approach would match analyses by ID - changedComponents$analyticsChanged <- TRUE - } else { - # Compare each analysis by ID (assuming they maintain order/ID) - for (i in seq_along(newAnalyses)) { - oldAna <- oldAnalyses[[i]] - newAna <- newAnalyses[[i]] - - # Check data loading - oldLoadHash <- self$hashLoadArgs(oldAna$getDbCohortMethodDataArgs) - newLoadHash <- self$hashLoadArgs(newAna$getDbCohortMethodDataArgs) - if (oldLoadHash != newLoadHash) { - changedComponents$loadArgsChanged <- TRUE - } - - # Check study population - oldStudyPopHash <- self$hashStudyPopArgs(oldAna$createStudyPopulationArgs) - newStudyPopHash <- self$hashStudyPopArgs(newAna$createStudyPopulationArgs) - if (oldStudyPopHash != newStudyPopHash) { - changedComponents$studyPopArgsChanged <- TRUE - } - - # Check propensity score - oldPsHash <- self$hashPsArgs(oldAna$createPsArgs) - newPsHash <- self$hashPsArgs(newAna$createPsArgs) - if (oldPsHash != newPsHash) { - changedComponents$psArgsChanged <- TRUE - } - - # Check stratification (trim/match/stratify all affect strata) - oldTrimHash <- self$hashTrimByPsArgs(oldAna$trimByPsArgs) - newTrimHash <- self$hashTrimByPsArgs(newAna$trimByPsArgs) - oldMatchHash <- self$hashMatchOnPsArgs(oldAna$matchOnPsArgs) - newMatchHash <- self$hashMatchOnPsArgs(newAna$matchOnPsArgs) - oldStratifyHash <- self$hashStratifyByPsArgs(oldAna$stratifyByPsArgs) - newStratifyHash <- self$hashStratifyByPsArgs(newAna$stratifyByPsArgs) - - if (oldTrimHash != newTrimHash || oldMatchHash != newMatchHash || oldStratifyHash != newStratifyHash) { - changedComponents$strataArgsChanged <- TRUE - } - - # Check outcome model - oldOutcomeHash <- self$hashOutcomeModelArgs(oldAna$fitOutcomeModelArgs) - newOutcomeHash <- self$hashOutcomeModelArgs(newAna$fitOutcomeModelArgs) - if (oldOutcomeHash != newOutcomeHash) { - changedComponents$outcomeModelArgsChanged <- TRUE - } - - # Check balance args - oldBalanceHash <- self$hashBalanceArgs(oldAna$computeSharedCovariateBalanceArgs) - newBalanceHash <- self$hashBalanceArgs(newAna$computeSharedCovariateBalanceArgs) - if (oldBalanceHash != newBalanceHash) { - changedComponents$balanceArgsChanged <- TRUE - } - } - } - - # Check if outcomes of interest or excluded analyses changed - oldOutcomes <- oldSpecs$targetComparatorOutcomesList - newOutcomes <- newSpecs$targetComparatorOutcomesList - - if (length(oldOutcomes) != length(newOutcomes)) { - changedComponents$analyticsChanged <- TRUE - } else { - # Check if outcomes within TCOs differ - for (i in seq_along(newOutcomes)) { - oldTco <- oldOutcomes[[i]] - newTco <- newOutcomes[[i]] - - # Simple check: compare target, comparator, nesting cohort - if (oldTco$targetId != newTco$targetId || - oldTco$comparatorId != newTco$comparatorId || - !identical(oldTco$nestingCohortId, newTco$nestingCohortId)) { - changedComponents$analyticsChanged <- TRUE - break - } - - # Check outcome changes (new outcomes added or old ones removed) - oldOutcomeIds <- sort(sapply(oldTco$outcomes, function(x) x$outcomeId)) - newOutcomeIds <- sort(sapply(newTco$outcomes, function(x) x$outcomeId)) - if (!identical(oldOutcomeIds, newOutcomeIds)) { - # Just outcomes added: no need to delete - # Outcomes removed: potentially need to delete - if (any(!(newOutcomeIds %in% oldOutcomeIds))) { - # New outcomes added - this is OK, don't mark as changed - } else { - # Outcomes removed - this means regeneration may be needed - changedComponents$analyticsChanged <- TRUE - } - } - } - } - - return(changedComponents) - } - ) -) diff --git a/R/ValidateArtifact.R b/R/ValidateArtifact.R deleted file mode 100644 index 708822d..0000000 --- a/R/ValidateArtifact.R +++ /dev/null @@ -1,150 +0,0 @@ -# Copyright 2026 Observational Health Data Sciences and Informatics -# -# This file is part of CohortMethod -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -#' Validate Artifact -#' -#' @description -#' Utility functions to validate that cached artifacts match expected parameter hashes. -#' -#' @details -#' These functions help determine whether an existing artifact can be reused by -#' validating that its parameters match the hash encoded in the filename or metadata. -#' -#' @keywords internal -#' -#' @export -ValidateArtifact <- R6::R6Class( - "ValidateArtifact", - public = list( - #' @description - #' Extract hash from a filename - #' - #' @param filename Character string with the artifact filename - #' - #' @return - #' Character string containing the extracted hash, or NA if not found - #' - #' @details - #' Filenames typically have format like: - #' - CmData_l{loadId}_{tcn}.zip - #' - StudyPop_l{loadId}_s{studyPopId}_{tcn}_o{outcomeId}.rds - #' - #' This function extracts numeric IDs which are used as hashes. - extractHashFromFilename = function(filename) { - # This is a placeholder - actual hash extraction depends on filename format - # In CohortMethod, the "ID" (like loadId, studyPopId) is the hash - # This would be enhanced based on actual filename patterns used - - # For now, return NA - the actual validation happens at file existence level - return(NA_character_) - }, - - #' @description - #' Validate that an artifact file exists and matches expected settings - #' - #' @param filepath Character string with the full path to the artifact file - #' @param settingsHash Character string with the expected settings hash - #' - #' @return - #' Logical TRUE if file exists and hash matches, FALSE otherwise - #' - #' @details - #' This function performs basic validation. In the current implementation, - #' we rely on filename encoding to ensure the hash is embedded in the filename. - #' If the file exists and has the right naming pattern, we consider it valid. - validateArtifactFile = function(filepath, settingsHash = NULL) { - # Check if file exists - if (!file.exists(filepath)) { - return(FALSE) - } - - # If no hash provided, just check existence - if (is.null(settingsHash)) { - return(TRUE) - } - - # Extract filename - filename <- basename(filepath) - - # In current CohortMethod implementation, the hash is encoded in the filename - # through the ID system (loadId, studyPopId, etc.) - # Validation consists of checking that the file exists with correct naming - - # For now, assume existence + correct naming = valid artifact - # This could be enhanced with actual hash extraction and comparison - - return(TRUE) - }, - - #' @description - #' Check if a cohort method data file is valid and reusable - #' - #' @param filepath Character string with the path to the CmData zip file - #' @param expectedLoadId Numeric ID expected for this data loading configuration - #' - #' @return - #' Logical TRUE if file exists and corresponds to the expected load ID - validateCohortMethodData = function(filepath, expectedLoadId) { - if (!file.exists(filepath)) { - return(FALSE) - } - - # Check that file is readable as a zip - tryCatch({ - # Try to get zip file list to verify integrity - zip::zip_list(filepath) - return(TRUE) - }, error = function(e) { - return(FALSE) - }) - }, - - #' @description - #' List all analysis artifacts in a directory by type - #' - #' @param outputFolder Character string with the output folder path - #' @param artifactType Character string indicating artifact type: - #' "cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model" - #' - #' @return - #' Character vector of full paths to matching artifact files - listArtifactsByType = function(outputFolder, artifactType) { - checkmate::assertCharacter(outputFolder, len = 1) - checkmate::assertCharacter(artifactType, len = 1) - - if (!dir.exists(outputFolder)) { - return(character()) - } - - patterns <- list( - cmdata = "^CmData_.*\\.zip$", - studypop = "^StudyPop_.*\\.rds$", - ps = "^Ps_.*\\.rds$", - stratpop = "^StratPop_.*\\.rds$", - balance = "^Balance_.*\\.rds$", - outcome_model = "^Analysis_[0-9]+/om.*\\.rds$" - ) - - pattern <- patterns[[tolower(artifactType)]] - if (is.null(pattern)) { - stop("Unknown artifact type: ", artifactType) - } - - files <- list.files(outputFolder, pattern = pattern, full.names = TRUE) - return(files) - } - ) -) diff --git a/man/InvalidationPolicy.Rd b/man/InvalidationPolicy.Rd deleted file mode 100644 index 45834eb..0000000 --- a/man/InvalidationPolicy.Rd +++ /dev/null @@ -1,94 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/InvalidationPolicy.R -\name{InvalidationPolicy} -\alias{InvalidationPolicy} -\title{Invalidation Policy} -\description{ -Determines which artifacts should be deleted when analysis settings change, -based on dependency relationships between components. -} -\details{ -When settings change, not all artifacts need to be regenerated. This class -implements the logic to determine minimal invalidation scope based on which -settings components have changed. -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-InvalidationPolicy-computeInvalidationScope}{\code{InvalidationPolicy$computeInvalidationScope()}} -\item \href{#method-InvalidationPolicy-getInvalidationMessage}{\code{InvalidationPolicy$getInvalidationMessage()}} -\item \href{#method-InvalidationPolicy-clone}{\code{InvalidationPolicy$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-InvalidationPolicy-computeInvalidationScope}{}}} -\subsection{Method \code{computeInvalidationScope()}}{ -Compute which file patterns should be deleted based on changed components -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{InvalidationPolicy$computeInvalidationScope(changedComponents)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{changedComponents}}{A list (as returned by SettingsHasher$compareSettingsComponents) -with logical values for each component} -} -\if{html}{\out{
}} -} -\subsection{Details}{ -Deletion cascade logic: -\itemize{ -\item If loadArgsChanged: all downstream artifacts (everything) -\item Else if studyPopArgsChanged: populations, PS models, strata, outcomes -\item Else if psArgsChanged: PS models, strata, outcomes -\item Else if strataArgsChanged: strata and outcomes -\item Else if outcomeModelArgsChanged: outcome models only -\item Else if only balanceArgsChanged: balance files only -\item Else if analyticsChanged (only new outcomes): nothing (outcomes are outcome-specific) -} -} - -\subsection{Returns}{ -Character vector of file patterns/globs to delete. Empty vector if no deletion needed. -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-InvalidationPolicy-getInvalidationMessage}{}}} -\subsection{Method \code{getInvalidationMessage()}}{ -Get a human-readable message describing what will be deleted -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{InvalidationPolicy$getInvalidationMessage(changedComponents)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{changedComponents}}{A list of changed components} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string suitable for displaying to the user -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-InvalidationPolicy-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{InvalidationPolicy$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/SettingsHasher.Rd b/man/SettingsHasher.Rd deleted file mode 100644 index 4e1df70..0000000 --- a/man/SettingsHasher.Rd +++ /dev/null @@ -1,238 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/SettingsHasher.R -\name{SettingsHasher} -\alias{SettingsHasher} -\title{Settings Hasher} -\description{ -Compute deterministic hashes for individual settings components to detect -which parts of the analysis specification have changed. -} -\details{ -This class hashes each settings component independently using JSON serialization -and MD5 digests. This allows fine-grained detection of which artifact types need -to be invalidated when specifications change. -} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-SettingsHasher-hashLoadArgs}{\code{SettingsHasher$hashLoadArgs()}} -\item \href{#method-SettingsHasher-hashStudyPopArgs}{\code{SettingsHasher$hashStudyPopArgs()}} -\item \href{#method-SettingsHasher-hashPsArgs}{\code{SettingsHasher$hashPsArgs()}} -\item \href{#method-SettingsHasher-hashTrimByPsArgs}{\code{SettingsHasher$hashTrimByPsArgs()}} -\item \href{#method-SettingsHasher-hashMatchOnPsArgs}{\code{SettingsHasher$hashMatchOnPsArgs()}} -\item \href{#method-SettingsHasher-hashStratifyByPsArgs}{\code{SettingsHasher$hashStratifyByPsArgs()}} -\item \href{#method-SettingsHasher-hashOutcomeModelArgs}{\code{SettingsHasher$hashOutcomeModelArgs()}} -\item \href{#method-SettingsHasher-hashBalanceArgs}{\code{SettingsHasher$hashBalanceArgs()}} -\item \href{#method-SettingsHasher-compareSettingsComponents}{\code{SettingsHasher$compareSettingsComponents()}} -\item \href{#method-SettingsHasher-clone}{\code{SettingsHasher$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashLoadArgs}{}}} -\subsection{Method \code{hashLoadArgs()}}{ -Hash the data loading arguments -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashLoadArgs(getDbCohortMethodDataArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{getDbCohortMethodDataArgs}}{An object of type \code{GetDbCohortMethodDataArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStudyPopArgs}{}}} -\subsection{Method \code{hashStudyPopArgs()}}{ -Hash the study population arguments -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashStudyPopArgs(createStudyPopulationArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{createStudyPopulationArgs}}{An object of type \code{CreateStudyPopulationArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashPsArgs}{}}} -\subsection{Method \code{hashPsArgs()}}{ -Hash the propensity score arguments -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashPsArgs(createPsArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{createPsArgs}}{An object of type \code{CreatePsArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashTrimByPsArgs}{}}} -\subsection{Method \code{hashTrimByPsArgs()}}{ -Hash arguments for propensity score trimming -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashTrimByPsArgs(trimByPsArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{trimByPsArgs}}{An object of type \code{TrimByPsArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashMatchOnPsArgs}{}}} -\subsection{Method \code{hashMatchOnPsArgs()}}{ -Hash arguments for propensity score matching -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashMatchOnPsArgs(matchOnPsArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{matchOnPsArgs}}{An object of type \code{MatchOnPsArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashStratifyByPsArgs}{}}} -\subsection{Method \code{hashStratifyByPsArgs()}}{ -Hash arguments for propensity score stratification -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashStratifyByPsArgs(stratifyByPsArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{stratifyByPsArgs}}{An object of type \code{StratifyByPsArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashOutcomeModelArgs}{}}} -\subsection{Method \code{hashOutcomeModelArgs()}}{ -Hash arguments for outcome model fitting -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashOutcomeModelArgs(fitOutcomeModelArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{fitOutcomeModelArgs}}{An object of type \code{FitOutcomeModelArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-hashBalanceArgs}{}}} -\subsection{Method \code{hashBalanceArgs()}}{ -Hash arguments for covariate balance computation -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$hashBalanceArgs(computeCovariateBalanceArgs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{computeCovariateBalanceArgs}}{An object of type \code{ComputeCovariateBalanceArgs}} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character string containing the MD5 hash (32 hex characters) -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-compareSettingsComponents}{}}} -\subsection{Method \code{compareSettingsComponents()}}{ -Compare old and new specifications to identify which components changed -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$compareSettingsComponents(oldSpecs, newSpecs)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{oldSpecs}}{An object of type \code{CmAnalysesSpecifications} (old version)} - -\item{\code{newSpecs}}{An object of type \code{CmAnalysesSpecifications} (new version)} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -A list of logical values indicating which components changed: -\itemize{ -\item loadArgsChanged: whether data loading arguments differ -\item studyPopArgsChanged: whether study population arguments differ -\item psArgsChanged: whether propensity score arguments differ -\item strataArgsChanged: whether stratification arguments (trim/match/stratify) differ -\item outcomeModelArgsChanged: whether outcome model arguments differ -\item balanceArgsChanged: whether balance computation arguments differ -\item analyticsChanged: whether outcome of interest or analysis IDs changed -} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-SettingsHasher-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{SettingsHasher$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/man/ValidateArtifact.Rd b/man/ValidateArtifact.Rd deleted file mode 100644 index 1d1c516..0000000 --- a/man/ValidateArtifact.Rd +++ /dev/null @@ -1,144 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/ValidateArtifact.R -\name{ValidateArtifact} -\alias{ValidateArtifact} -\title{Validate Artifact} -\description{ -Utility functions to validate that cached artifacts match expected parameter hashes. -} -\details{ -These functions help determine whether an existing artifact can be reused by -validating that its parameters match the hash encoded in the filename or metadata. -} -\keyword{internal} -\section{Methods}{ -\subsection{Public methods}{ -\itemize{ -\item \href{#method-ValidateArtifact-extractHashFromFilename}{\code{ValidateArtifact$extractHashFromFilename()}} -\item \href{#method-ValidateArtifact-validateArtifactFile}{\code{ValidateArtifact$validateArtifactFile()}} -\item \href{#method-ValidateArtifact-validateCohortMethodData}{\code{ValidateArtifact$validateCohortMethodData()}} -\item \href{#method-ValidateArtifact-listArtifactsByType}{\code{ValidateArtifact$listArtifactsByType()}} -\item \href{#method-ValidateArtifact-clone}{\code{ValidateArtifact$clone()}} -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ValidateArtifact-extractHashFromFilename}{}}} -\subsection{Method \code{extractHashFromFilename()}}{ -Extract hash from a filename -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ValidateArtifact$extractHashFromFilename(filename)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{filename}}{Character string with the artifact filename} -} -\if{html}{\out{
}} -} -\subsection{Details}{ -Filenames typically have format like: -\itemize{ -\item CmData_l{loadId}_{tcn}.zip -\item StudyPop_l{loadId}\emph{s{studyPopId}}{tcn}_o{outcomeId}.rds -} - -This function extracts numeric IDs which are used as hashes. -} - -\subsection{Returns}{ -Character string containing the extracted hash, or NA if not found -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateArtifactFile}{}}} -\subsection{Method \code{validateArtifactFile()}}{ -Validate that an artifact file exists and matches expected settings -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ValidateArtifact$validateArtifactFile(filepath, settingsHash = NULL)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{filepath}}{Character string with the full path to the artifact file} - -\item{\code{settingsHash}}{Character string with the expected settings hash} -} -\if{html}{\out{
}} -} -\subsection{Details}{ -This function performs basic validation. In the current implementation, -we rely on filename encoding to ensure the hash is embedded in the filename. -If the file exists and has the right naming pattern, we consider it valid. -} - -\subsection{Returns}{ -Logical TRUE if file exists and hash matches, FALSE otherwise -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ValidateArtifact-validateCohortMethodData}{}}} -\subsection{Method \code{validateCohortMethodData()}}{ -Check if a cohort method data file is valid and reusable -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ValidateArtifact$validateCohortMethodData(filepath, expectedLoadId)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{filepath}}{Character string with the path to the CmData zip file} - -\item{\code{expectedLoadId}}{Numeric ID expected for this data loading configuration} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Logical TRUE if file exists and corresponds to the expected load ID -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ValidateArtifact-listArtifactsByType}{}}} -\subsection{Method \code{listArtifactsByType()}}{ -List all analysis artifacts in a directory by type -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ValidateArtifact$listArtifactsByType(outputFolder, artifactType)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{outputFolder}}{Character string with the output folder path} - -\item{\code{artifactType}}{Character string indicating artifact type: -"cmdata", "studypop", "ps", "stratpop", "balance", "outcome_model"} -} -\if{html}{\out{
}} -} -\subsection{Returns}{ -Character vector of full paths to matching artifact files -} -} -\if{html}{\out{
}} -\if{html}{\out{}} -\if{latex}{\out{\hypertarget{method-ValidateArtifact-clone}{}}} -\subsection{Method \code{clone()}}{ -The objects of this class are cloneable with this method. -\subsection{Usage}{ -\if{html}{\out{
}}\preformatted{ValidateArtifact$clone(deep = FALSE)}\if{html}{\out{
}} -} - -\subsection{Arguments}{ -\if{html}{\out{
}} -\describe{ -\item{\code{deep}}{Whether to make a deep clone.} -} -\if{html}{\out{
}} -} -} -} diff --git a/tests/testthat/test-artifactStore.R b/tests/testthat/test-artifactStore.R new file mode 100644 index 0000000..42e9a67 --- /dev/null +++ b/tests/testthat/test-artifactStore.R @@ -0,0 +1,155 @@ +context("Artifact Store and Content Hashing") + +test_that("LocalArtifactStore can be initialized", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + expect_s3_class(store, "R6") + expect_true(inherits(store, "ArtifactStore")) + expect_true(dir.exists(outputFolder)) +}) + +test_that("LocalArtifactStore exists/readRDS/saveRDS work", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + + expect_false(store$exists("test.rds")) + + store$saveRDS(list(a = 1, b = "hello"), "test.rds") + expect_true(store$exists("test.rds")) + + obj <- store$readRDS("test.rds") + expect_equal(obj$a, 1) + expect_equal(obj$b, "hello") +}) + +test_that("LocalArtifactStore listArtifacts works", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + + store$saveRDS(1, "CmData_abc123.rds") + store$saveRDS(2, "StudyPop_def456.rds") + store$saveRDS(3, "Ps_ghi789.rds") + + all <- store$listArtifacts() + expect_equal(length(all), 3) + + cm <- store$listArtifacts(prefix = "CmData") + expect_equal(length(cm), 1) + expect_true(grepl("CmData", cm)) +}) + +test_that("LocalArtifactStore delete works", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + + store$saveRDS(1, "test.rds") + expect_true(store$exists("test.rds")) + + store$delete("test.rds") + expect_false(store$exists("test.rds")) +}) + +test_that("LocalArtifactStore ensureDir creates nested directories", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + + store$ensureDir("Analysis_1/om_abc123.rds") + expect_true(dir.exists(file.path(outputFolder, "Analysis_1"))) +}) + +test_that("LocalArtifactStore getFullPath works", { + outputFolder <- tempfile(pattern = "artifactStore") + on.exit(unlink(outputFolder, recursive = TRUE)) + + store <- LocalArtifactStore$new(outputFolder) + + path <- store$getFullPath("CmData_abc.zip") + expect_equal(path, file.path(outputFolder, "CmData_abc.zip")) +}) + +test_that(".contentHash produces consistent results", { + hash1 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2) + hash2 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2) + expect_equal(hash1, hash2) + expect_equal(nchar(hash1), 12) +}) + +test_that(".contentHash differs with different inputs", { + hash1 <- CohortMethod:::.contentHash("db1", "loadArgs", 1, 2) + hash2 <- CohortMethod:::.contentHash("db2", "loadArgs", 1, 2) + expect_true(hash1 != hash2) +}) + +test_that(".contentHash handles NULL inputs", { + hash1 <- CohortMethod:::.contentHash("db1", NULL, 1) + hash2 <- CohortMethod:::.contentHash("db1", NULL, 1) + expect_equal(hash1, hash2) +}) + +test_that(".contentHash includes databaseId in differentiation", { + hash1 <- CohortMethod:::.contentHash("database_A", "settings1") + hash2 <- CohortMethod:::.contentHash("database_B", "settings1") + expect_true(hash1 != hash2) +}) + +test_that("Content-addressed filenames change when settings change", { + args1 <- createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + ) + args2 <- createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ), + maxCohortSize = 50000 + ) + + hash1 <- CohortMethod:::.contentHash("testDb", args1$toJson(), 1, 2, NA) + hash2 <- CohortMethod:::.contentHash("testDb", args2$toJson(), 1, 2, NA) + + expect_true(hash1 != hash2) +}) + +test_that("Content-addressed filenames stable when settings identical", { + args1 <- createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + ) + args2 <- createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + ) + + hash1 <- CohortMethod:::.contentHash("testDb", args1$toJson(), 1, 2, NA) + hash2 <- CohortMethod:::.contentHash("testDb", args2$toJson(), 1, 2, NA) + + expect_equal(hash1, hash2) +}) + +test_that("Cascading hashes propagate upstream changes", { + loadHash1 <- CohortMethod:::.contentHash("db1", "loadArgs_v1", 1, 2, NA) + loadHash2 <- CohortMethod:::.contentHash("db1", "loadArgs_v2", 1, 2, NA) + + # Study pop hash includes loadHash, so changing load args changes studyPop filename + studyPopHash1 <- CohortMethod:::.contentHash("db1", loadHash1, "studyPopArgs", 3) + studyPopHash2 <- CohortMethod:::.contentHash("db1", loadHash2, "studyPopArgs", 3) + + expect_true(studyPopHash1 != studyPopHash2) + + # But same load args + same studyPop args = same hash + studyPopHash3 <- CohortMethod:::.contentHash("db1", loadHash1, "studyPopArgs", 3) + expect_equal(studyPopHash1, studyPopHash3) +}) diff --git a/tests/testthat/test-eunomia.R b/tests/testthat/test-eunomia.R index b71e883..ffb0131 100644 --- a/tests/testthat/test-eunomia.R +++ b/tests/testthat/test-eunomia.R @@ -238,6 +238,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder1, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList, @@ -374,6 +375,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder2, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList, @@ -384,7 +386,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { ) expect_equal(result$sharedPsFile, c("", "")) - expect_equal(result$psFile, c("Ps_l1_s1_p1_t1_c2_o3.rds", "Ps_l1_s1_p1_t1_c2_o4.rds")) + expect_true(all(grepl("^Ps_", result$psFile))) }) test_that("High correlation covariates", { @@ -458,6 +460,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder3, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList @@ -564,6 +567,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder4, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList @@ -685,6 +689,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder5, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList @@ -695,11 +700,11 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { expect_equal(nrow(result), 4) expect_equal(sum(result$nestingCohortId == 4, na.rm = TRUE), 2) - expect_equal(length(grep("_t1_c2_n4", result$cohortMethodDataFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$studyPopFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$sharedPsFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$psFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$strataFile)), 2) + # With content-addressable hashes, nesting cohort produces different hashes + # Verify that nested rows have distinct filenames from non-nested rows + nestedFiles <- result$cohortMethodDataFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)] + nonNestedFiles <- result$cohortMethodDataFile[is.na(result$nestingCohortId)] + expect_true(length(intersect(nestedFiles, nonNestedFiles)) == 0) expect_equal(length(grep("_t1_c2_n4", result$sharedBalanceFile)), 2) expect_equal(length(grep("_t1_c2_n4", result$filteredForbalanceFile)), 2) expect_equal(length(grep("_t1_c2_n4", result$balanceFile)), 2) @@ -819,6 +824,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { exposureTable = "cohort", outcomeTable = "cohort", outputFolder = outputFolder6, + databaseId = "Eunomia", cmAnalysesSpecifications = createCmAnalysesSpecifications( cmAnalysisList = cmAnalysisList, targetComparatorOutcomesList = targetComparatorOutcomesList, diff --git a/tests/testthat/test-invalidationPolicy.R b/tests/testthat/test-invalidationPolicy.R deleted file mode 100644 index 3674508..0000000 --- a/tests/testthat/test-invalidationPolicy.R +++ /dev/null @@ -1,288 +0,0 @@ -context("Artifact caching system") - -# Helper to create covariate settings -getDefaultCovariateSettings <- function() { - return(FeatureExtraction::createCovariateSettings( - useDemographicsGender = TRUE, - useDemographicsAge = TRUE - )) -} - -test_that("SettingsHasher hashes settings consistently", { - # Create two identical settings objects - args1 <- createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings() - ) - args2 <- createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings() - ) - - hasher <- SettingsHasher$new() - - # Hash the same settings twice - hash1 <- hasher$hashLoadArgs(args1) - hash2 <- hasher$hashLoadArgs(args2) - - # Should produce identical hashes - expect_equal(hash1, hash2) - expect_true(is.character(hash1)) - expect_equal(nchar(hash1), 32) # MD5 is 32 hex characters -}) - -test_that("SettingsHasher detects changes in load args", { - hasher <- SettingsHasher$new() - - args1 <- createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings() - ) - args2 <- createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings(), - maxCohortSize = 50000 # Different - ) - - hash1 <- hasher$hashLoadArgs(args1) - hash2 <- hasher$hashLoadArgs(args2) - - expect_true(hash1 != hash2) -}) - -test_that("SettingsHasher detects changes in study population args", { - hasher <- SettingsHasher$new() - - args1 <- createCreateStudyPopulationArgs() - args2 <- createCreateStudyPopulationArgs(minDaysAtRisk = 30) # Different - - hash1 <- hasher$hashStudyPopArgs(args1) - hash2 <- hasher$hashStudyPopArgs(args2) - - expect_true(hash1 != hash2) -}) - -test_that("SettingsHasher handles NULL arguments", { - hasher <- SettingsHasher$new() - - hash <- hasher$hashLoadArgs(NULL) - expect_equal(hash, "") - - hash <- hasher$hashPsArgs(NULL) - expect_equal(hash, "") -}) - -test_that("SettingsHasher compareSettingsComponents works with identical specs", { - hasher <- SettingsHasher$new() - - # Create identical specifications - analysis <- createCmAnalysis( - analysisId = 1, - getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings() - ), - createStudyPopulationArgs = createCreateStudyPopulationArgs(), - createPsArgs = createCreatePsArgs() - ) - - specs1 <- createCmAnalysesSpecifications( - cmAnalysisList = list(analysis), - targetComparatorOutcomesList = list( - createTargetComparatorOutcomes( - targetId = 1, - comparatorId = 2, - outcomes = list(createOutcome(outcomeId = 3)) - ) - ) - ) - - specs2 <- createCmAnalysesSpecifications( - cmAnalysisList = list(analysis), - targetComparatorOutcomesList = list( - createTargetComparatorOutcomes( - targetId = 1, - comparatorId = 2, - outcomes = list(createOutcome(outcomeId = 3)) - ) - ) - ) - - changes <- hasher$compareSettingsComponents(specs1, specs2) - - expect_false(any(as.logical(changes))) -}) - -test_that("SettingsHasher compareSettingsComponents detects load args change", { - hasher <- SettingsHasher$new() - - analysis1 <- createCmAnalysis( - analysisId = 1, - getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings() - ), - createStudyPopulationArgs = createCreateStudyPopulationArgs(), - createPsArgs = createCreatePsArgs() - ) - - analysis2 <- createCmAnalysis( - analysisId = 1, - getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( - covariateSettings = getDefaultCovariateSettings(), - maxCohortSize = 50000 - ), - createStudyPopulationArgs = createCreateStudyPopulationArgs(), - createPsArgs = createCreatePsArgs() - ) - - specs1 <- createCmAnalysesSpecifications( - cmAnalysisList = list(analysis1), - targetComparatorOutcomesList = list( - createTargetComparatorOutcomes( - targetId = 1, - comparatorId = 2, - outcomes = list(createOutcome(outcomeId = 3)) - ) - ) - ) - - specs2 <- createCmAnalysesSpecifications( - cmAnalysisList = list(analysis2), - targetComparatorOutcomesList = list( - createTargetComparatorOutcomes( - targetId = 1, - comparatorId = 2, - outcomes = list(createOutcome(outcomeId = 3)) - ) - ) - ) - - changes <- hasher$compareSettingsComponents(specs1, specs2) - - expect_true(changes$loadArgsChanged) -}) - -test_that("InvalidationPolicy computes correct deletion scope for load args change", { - policy <- InvalidationPolicy$new() - - changedComponents <- list( - loadArgsChanged = TRUE, - studyPopArgsChanged = FALSE, - psArgsChanged = FALSE, - strataArgsChanged = FALSE, - outcomeModelArgsChanged = FALSE, - balanceArgsChanged = FALSE, - analyticsChanged = FALSE - ) - - patterns <- policy$computeInvalidationScope(changedComponents) - - # Should include all major artifact patterns - expect_true(any(grepl("CmData", patterns))) - expect_true(any(grepl("StudyPop", patterns))) - expect_true(any(grepl("Ps", patterns))) - expect_true(any(grepl("Analysis", patterns))) -}) - -test_that("InvalidationPolicy computes correct deletion scope for study pop args change", { - policy <- InvalidationPolicy$new() - - changedComponents <- list( - loadArgsChanged = FALSE, - studyPopArgsChanged = TRUE, - psArgsChanged = FALSE, - strataArgsChanged = FALSE, - outcomeModelArgsChanged = FALSE, - balanceArgsChanged = FALSE, - analyticsChanged = FALSE - ) - - patterns <- policy$computeInvalidationScope(changedComponents) - - # Should include study pop and downstream - expect_true(any(grepl("StudyPop", patterns))) - expect_true(any(grepl("Ps", patterns))) - expect_true(any(grepl("Analysis", patterns))) - - # But not CmData - expect_false(any(grepl("CmData_", patterns))) -}) - -test_that("InvalidationPolicy computes empty scope for outcome addition only", { - policy <- InvalidationPolicy$new() - - changedComponents <- list( - loadArgsChanged = FALSE, - studyPopArgsChanged = FALSE, - psArgsChanged = FALSE, - strataArgsChanged = FALSE, - outcomeModelArgsChanged = FALSE, - balanceArgsChanged = FALSE, - analyticsChanged = TRUE # But only because new outcomes added - ) - - patterns <- policy$computeInvalidationScope(changedComponents) - - # Should be empty - new outcomes don't require deletion - expect_equal(length(patterns), 0) -}) - -test_that("InvalidationPolicy provides meaningful messages", { - policy <- InvalidationPolicy$new() - - changedComponents <- list( - loadArgsChanged = TRUE, - studyPopArgsChanged = FALSE, - psArgsChanged = FALSE, - strataArgsChanged = FALSE, - outcomeModelArgsChanged = FALSE, - balanceArgsChanged = FALSE, - analyticsChanged = FALSE - ) - - message <- policy$getInvalidationMessage(changedComponents) - expect_true(grepl("Data loading", message, ignore.case = TRUE)) - expect_true(is.character(message)) -}) - -test_that("ValidateArtifact can list artifacts by type", { - outputFolder <- tempfile(pattern = "cmArtifacts") - dir.create(outputFolder) - - on.exit(unlink(outputFolder, recursive = TRUE)) - - # Create some test files - file.create(file.path(outputFolder, "CmData_l1_t1_c2.zip")) - file.create(file.path(outputFolder, "StudyPop_l1_s1_t1_c2_o3.rds")) - file.create(file.path(outputFolder, "Ps_l1_p1_t1_c2.rds")) - - validator <- ValidateArtifact$new() - - cmdata <- validator$listArtifactsByType(outputFolder, "cmdata") - expect_equal(length(cmdata), 1) - expect_true(grepl("CmData", cmdata)) - - studypop <- validator$listArtifactsByType(outputFolder, "studypop") - expect_equal(length(studypop), 1) - expect_true(grepl("StudyPop", studypop)) - - ps <- validator$listArtifactsByType(outputFolder, "ps") - expect_equal(length(ps), 1) - expect_true(grepl("Ps", ps)) -}) - -test_that("ValidateArtifact can validate artifact file existence", { - outputFolder <- tempfile(pattern = "cmArtifacts") - dir.create(outputFolder) - - on.exit(unlink(outputFolder, recursive = TRUE)) - - validator <- ValidateArtifact$new() - - # Non-existent file - result <- validator$validateArtifactFile(file.path(outputFolder, "nonexistent.zip")) - expect_false(result) - - # Create a file - testFile <- file.path(outputFolder, "test.zip") - file.create(testFile) - - # Existing file - result <- validator$validateArtifactFile(testFile) - expect_true(result) -}) From a2f62be6a20f350a19c751a25e62d550a7d464c0 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Thu, 30 Apr 2026 07:48:55 -0700 Subject: [PATCH 03/10] Adding additional tests for cache schenarios --- tests/testthat/test-cachingScenarios.R | 632 +++++++++++++++++++++++++ 1 file changed, 632 insertions(+) create mode 100644 tests/testthat/test-cachingScenarios.R diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R new file mode 100644 index 0000000..1136e17 --- /dev/null +++ b/tests/testthat/test-cachingScenarios.R @@ -0,0 +1,632 @@ +context("Content-addressable caching scenarios") + +# These tests verify that content-addressable filenames correctly handle +# real-world scenarios: reusing artifacts when possible, generating new +# filenames when settings change, and cascading hash changes appropriately. + +# Helper to create a basic analysis for testing +makeBasicAnalysis <- function(analysisId = 1, withPs = TRUE, withStrata = TRUE) { + args <- list( + analysisId = analysisId, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort start" + ) + ) + if (withPs) { + args$createPsArgs <- createCreatePsArgs() + } + if (withStrata && withPs) { + args$matchOnPsArgs <- createMatchOnPsArgs() + } + args$fitOutcomeModelArgs <- createFitOutcomeModelArgs(modelType = "cox") + do.call(createCmAnalysis, args) +} + +# Helper to build a reference table without needing a database +buildRef <- function(cmAnalysisList, + targetComparatorOutcomesList, + databaseId = "testDb", + refitPsForEveryOutcome = FALSE, + refitPsForEveryStudyPopulation = TRUE) { + outputFolder <- tempfile(pattern = "cmRef") + dir.create(outputFolder) + on.exit(unlink(outputFolder, recursive = TRUE)) + + CohortMethod:::createReferenceTable( + cmAnalysisList = cmAnalysisList, + targetComparatorOutcomesList = targetComparatorOutcomesList, + analysesToExclude = NULL, + outputFolder = outputFolder, + refitPsForEveryOutcome = refitPsForEveryOutcome, + refitPsForEveryStudyPopulation = refitPsForEveryStudyPopulation, + databaseId = databaseId + ) +} + + +# =========================================================================== +# Scenario 1: Adding a new negative control outcome +# Negative controls need new outcome models but should NOT change PS models +# or CohortMethodData files +# =========================================================================== + +test_that("Adding a negative control does NOT change CmData or shared PS filenames", { + analysis <- makeBasicAnalysis() + + # Original: one outcome of interest, one negative control + tcos1 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 200, outcomeOfInterest = FALSE) # negative control + ) + ) + ) + + # New: add a second negative control + tcos2 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 200, outcomeOfInterest = FALSE), + createOutcome(outcomeId = 300, outcomeOfInterest = FALSE) # NEW negative control + ) + ) + ) + + ref1 <- buildRef(list(analysis), tcos1) + ref2 <- buildRef(list(analysis), tcos2) + + # CohortMethodData files should be identical (same load args, same target/comparator) + expect_equal( + unique(ref1$cohortMethodDataFile), + unique(ref2$cohortMethodDataFile) + ) + + # Shared PS files should be identical (PS doesn't depend on outcome) + sharedPs1 <- unique(ref1$sharedPsFile[ref1$sharedPsFile != ""]) + sharedPs2 <- unique(ref2$sharedPsFile[ref2$sharedPsFile != ""]) + expect_equal(sharedPs1, sharedPs2) + + # Original outcome (100) should have the same outcome model file + om1_100 <- ref1$outcomeModelFile[ref1$outcomeId == 100] + om2_100 <- ref2$outcomeModelFile[ref2$outcomeId == 100] + expect_equal(om1_100, om2_100) + + # New negative control (300) should have a NEW outcome model file + om2_300 <- ref2$outcomeModelFile[ref2$outcomeId == 300] + expect_true(om2_300 != "") + expect_true(!(om2_300 %in% ref1$outcomeModelFile)) +}) + +test_that("Adding a negative control produces distinct outcome model per outcome", { + analysis <- makeBasicAnalysis() + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 200, outcomeOfInterest = FALSE), + createOutcome(outcomeId = 300, outcomeOfInterest = FALSE) + ) + ) + ) + + ref <- buildRef(list(analysis), tcos) + + # Each outcome should have a unique outcome model filename + omFiles <- ref$outcomeModelFile[ref$outcomeModelFile != ""] + expect_equal(length(omFiles), length(unique(omFiles))) +}) + + +# =========================================================================== +# Scenario 2: Adding an outcome of interest +# Should produce new StudyPop + StratPop + OutcomeModel for the new outcome +# while reusing CmData and shared PS +# =========================================================================== + +test_that("Adding an outcome of interest reuses CmData and shared PS", { + analysis <- makeBasicAnalysis() + + tcos1 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE) + ) + ) + ) + + tcos2 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 101, outcomeOfInterest = TRUE) # NEW + ) + ) + ) + + ref1 <- buildRef(list(analysis), tcos1) + ref2 <- buildRef(list(analysis), tcos2) + + # CohortMethodData should be reused + expect_equal(unique(ref1$cohortMethodDataFile), unique(ref2$cohortMethodDataFile)) + + # Shared PS should be reused (same model, not outcome-specific) + sharedPs1 <- unique(ref1$sharedPsFile[ref1$sharedPsFile != ""]) + sharedPs2 <- unique(ref2$sharedPsFile[ref2$sharedPsFile != ""]) + expect_equal(sharedPs1, sharedPs2) + + # Original outcome 100 filenames should be identical + expect_equal( + ref1$studyPopFile[ref1$outcomeId == 100], + ref2$studyPopFile[ref2$outcomeId == 100] + ) + expect_equal( + ref1$strataFile[ref1$outcomeId == 100], + ref2$strataFile[ref2$outcomeId == 100] + ) + expect_equal( + ref1$outcomeModelFile[ref1$outcomeId == 100], + ref2$outcomeModelFile[ref2$outcomeId == 100] + ) + + # New outcome 101 should have new study pop and outcome model + newStudyPop <- ref2$studyPopFile[ref2$outcomeId == 101] + expect_true(newStudyPop != "") + expect_true(!(newStudyPop %in% ref1$studyPopFile)) + + newOm <- ref2$outcomeModelFile[ref2$outcomeId == 101] + expect_true(newOm != "") + expect_true(!(newOm %in% ref1$outcomeModelFile)) +}) + +test_that("Adding an outcome does NOT affect strata file of existing outcomes", { + analysis <- makeBasicAnalysis() + + tcos1 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE) + ) + ) + ) + + tcos2 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 101, outcomeOfInterest = TRUE) + ) + ) + ) + + ref1 <- buildRef(list(analysis), tcos1) + ref2 <- buildRef(list(analysis), tcos2) + + # Strata file for outcome 100 should be unchanged + expect_equal( + ref1$strataFile[ref1$outcomeId == 100], + ref2$strataFile[ref2$outcomeId == 100] + ) +}) + + +# =========================================================================== +# Scenario 3: Changing PS covariates cascades to all downstream artifacts +# CmData changes → new loadHash → new StudyPop, PS, Strata, OutcomeModel +# =========================================================================== + +test_that("Changing covariates cascades: all filenames change", { + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + # Same analysis but with additional covariate + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE, + useDemographicsRace = TRUE # CHANGED: added covariate + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref1 <- buildRef(list(analysis1), tcos) + ref2 <- buildRef(list(analysis2), tcos) + + # CohortMethodData files should differ (covariates changed) + expect_true(ref1$cohortMethodDataFile[1] != ref2$cohortMethodDataFile[1]) + + # Study population files should differ (cascaded via loadHash) + expect_true(ref1$studyPopFile[1] != ref2$studyPopFile[1]) + + # Shared PS files should differ (cascaded via loadHash) + expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1]) + + # Strata files should differ (cascaded) + expect_true(ref1$strataFile[1] != ref2$strataFile[1]) + + # Outcome model files should differ (cascaded) + expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1]) +}) + +test_that("Changing PS args cascades to PS and downstream but NOT CmData", { + covSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 100000), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 50000), # CHANGED + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref1 <- buildRef(list(analysis1), tcos) + ref2 <- buildRef(list(analysis2), tcos) + + # CohortMethodData should be REUSED (load args unchanged) + expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1]) + + # StudyPop should be REUSED (studyPop args unchanged, same loadHash) + expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1]) + + # Shared PS should DIFFER (PS args changed) + expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1]) + + # Strata and outcome model should DIFFER (cascaded from PS change) + expect_true(ref1$strataFile[1] != ref2$strataFile[1]) + expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1]) +}) + +test_that("Changing study pop args cascades to StudyPop and downstream but NOT CmData", { + covSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(minDaysAtRisk = 1), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(minDaysAtRisk = 30), # CHANGED + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref1 <- buildRef(list(analysis1), tcos) + ref2 <- buildRef(list(analysis2), tcos) + + # CohortMethodData should be REUSED + expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1]) + + # StudyPop should DIFFER + expect_true(ref1$studyPopFile[1] != ref2$studyPopFile[1]) + + # PS should DIFFER (depends on studyPop args in hash) + expect_true(ref1$sharedPsFile[1] != ref2$sharedPsFile[1]) + + # Strata should DIFFER + expect_true(ref1$strataFile[1] != ref2$strataFile[1]) + + # Outcome model should DIFFER + expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1]) +}) + + +# =========================================================================== +# Scenario 4: databaseId differentiation +# Same settings on different databases MUST produce different filenames +# =========================================================================== + +test_that("Different databaseId produces entirely different filenames", { + analysis <- makeBasicAnalysis() + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + refA <- buildRef(list(analysis), tcos, databaseId = "database_A") + refB <- buildRef(list(analysis), tcos, databaseId = "database_B") + + # ALL filenames should differ + + expect_true(refA$cohortMethodDataFile[1] != refB$cohortMethodDataFile[1]) + expect_true(refA$studyPopFile[1] != refB$studyPopFile[1]) + expect_true(refA$sharedPsFile[1] != refB$sharedPsFile[1]) + expect_true(refA$strataFile[1] != refB$strataFile[1]) + expect_true(refA$outcomeModelFile[1] != refB$outcomeModelFile[1]) +}) + + +# =========================================================================== +# Scenario 5: Changing matching/stratification args +# Should change strata + outcome models but NOT CmData, StudyPop, or PS +# =========================================================================== + +test_that("Changing matching args changes strata but not CmData/StudyPop/PS", { + covSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(caliper = 0.2), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(caliper = 0.1), # CHANGED caliper + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref1 <- buildRef(list(analysis1), tcos) + ref2 <- buildRef(list(analysis2), tcos) + + # CmData should be REUSED + expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1]) + + # StudyPop should be REUSED + expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1]) + + # Shared PS should be REUSED (matching doesn't affect PS fitting) + expect_equal(ref1$sharedPsFile[1], ref2$sharedPsFile[1]) + + # Strata should DIFFER (matching caliper changed) + expect_true(ref1$strataFile[1] != ref2$strataFile[1]) + + # Outcome model should DIFFER (depends on strata) + expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1]) +}) + + +# =========================================================================== +# Scenario 6: Changing outcome model args only +# Should change ONLY outcome models +# =========================================================================== + +test_that("Changing outcome model args changes only outcome model files", { + covSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "logistic") # CHANGED + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref1 <- buildRef(list(analysis1), tcos) + ref2 <- buildRef(list(analysis2), tcos) + + # CmData, StudyPop, PS, Strata should all be REUSED + expect_equal(ref1$cohortMethodDataFile[1], ref2$cohortMethodDataFile[1]) + expect_equal(ref1$studyPopFile[1], ref2$studyPopFile[1]) + expect_equal(ref1$sharedPsFile[1], ref2$sharedPsFile[1]) + expect_equal(ref1$strataFile[1], ref2$strataFile[1]) + + # Only outcome model should DIFFER + expect_true(ref1$outcomeModelFile[1] != ref2$outcomeModelFile[1]) +}) + + +# =========================================================================== +# Scenario 7: Multiple analyses sharing CmData +# =========================================================================== + +test_that("Analyses with same load args share CmData file", { + covSettings <- FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + + # Two analyses with same getDbCohortMethodDataArgs but different PS args + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 100000), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis2 <- createCmAnalysis( + analysisId = 2, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = covSettings + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs(), + createPsArgs = createCreatePsArgs(maxCohortSizeForFitting = 50000), + matchOnPsArgs = createMatchOnPsArgs(), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 100, outcomeOfInterest = TRUE)) + ) + ) + + ref <- buildRef(list(analysis1, analysis2), tcos) + + # Both rows should share the same CmData file + cmFiles <- unique(ref$cohortMethodDataFile) + expect_equal(length(cmFiles), 1) + + # But have different PS files + psFiles <- unique(ref$sharedPsFile[ref$sharedPsFile != ""]) + expect_equal(length(psFiles), 2) +}) + + +# =========================================================================== +# Scenario 8: Hash stability across invocations +# =========================================================================== + +test_that("Reference table is deterministic across repeated calls", { + analysis <- makeBasicAnalysis() + + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 100, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 200, outcomeOfInterest = FALSE) + ) + ) + ) + + ref1 <- buildRef(list(analysis), tcos, databaseId = "stable_test") + ref2 <- buildRef(list(analysis), tcos, databaseId = "stable_test") + + # All filenames should be identical + expect_equal(ref1$cohortMethodDataFile, ref2$cohortMethodDataFile) + expect_equal(ref1$studyPopFile, ref2$studyPopFile) + expect_equal(ref1$sharedPsFile, ref2$sharedPsFile) + expect_equal(ref1$strataFile, ref2$strataFile) + expect_equal(ref1$outcomeModelFile, ref2$outcomeModelFile) +}) From c1e8d37cd312bed9a816fc5d7a9920ba628bdb1c Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Thu, 30 Apr 2026 10:15:00 -0700 Subject: [PATCH 04/10] Changes to eunomia tests to match new filenames --- tests/testthat/test-eunomia.R | 13 +++++++++---- 1 file changed, 9 insertions(+), 4 deletions(-) diff --git a/tests/testthat/test-eunomia.R b/tests/testthat/test-eunomia.R index ffb0131..e9d8e32 100644 --- a/tests/testthat/test-eunomia.R +++ b/tests/testthat/test-eunomia.R @@ -705,10 +705,15 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { nestedFiles <- result$cohortMethodDataFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)] nonNestedFiles <- result$cohortMethodDataFile[is.na(result$nestingCohortId)] expect_true(length(intersect(nestedFiles, nonNestedFiles)) == 0) - expect_equal(length(grep("_t1_c2_n4", result$sharedBalanceFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$filteredForbalanceFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$balanceFile)), 2) - expect_equal(length(grep("_t1_c2_n4", result$outcomeModelFile)), 2) + + # Verify nested rows have distinct balance/outcome filenames from non-nested rows + nestedBalance <- result$sharedBalanceFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)] + nonNestedBalance <- result$sharedBalanceFile[is.na(result$nestingCohortId)] + expect_true(length(intersect(nestedBalance[nestedBalance != ""], nonNestedBalance[nonNestedBalance != ""])) == 0) + + nestedOm <- result$outcomeModelFile[result$nestingCohortId == 4 & !is.na(result$nestingCohortId)] + nonNestedOm <- result$outcomeModelFile[is.na(result$nestingCohortId)] + expect_true(length(intersect(nestedOm[nestedOm != ""], nonNestedOm[nonNestedOm != ""])) == 0) uniqueCmdFiles <- result[!duplicated(result$cohortMethodDataFile), ] for (i in seq_len(nrow(uniqueCmdFiles))) { From 9e80057552052b62e4c697fb4cea343f51986052 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Thu, 30 Apr 2026 11:45:33 -0700 Subject: [PATCH 05/10] missing doc files --- man/ArtifactStore.Rd | 193 +++++++++++++++++++++++++++++++ man/LocalArtifactStore.Rd | 235 ++++++++++++++++++++++++++++++++++++++ man/runCmAnalyses.Rd | 12 +- 3 files changed, 439 insertions(+), 1 deletion(-) create mode 100644 man/ArtifactStore.Rd create mode 100644 man/LocalArtifactStore.Rd diff --git a/man/ArtifactStore.Rd b/man/ArtifactStore.Rd new file mode 100644 index 0000000..517dab3 --- /dev/null +++ b/man/ArtifactStore.Rd @@ -0,0 +1,193 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArtifactStore.R +\name{ArtifactStore} +\alias{ArtifactStore} +\title{Abstract Artifact Store} +\description{ +Abstract R6 interface for storing and retrieving analysis artifacts. +Subclass this to implement custom storage backends (e.g., S3, database). +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-ArtifactStore-exists}{\code{ArtifactStore$exists()}} +\item \href{#method-ArtifactStore-readRDS}{\code{ArtifactStore$readRDS()}} +\item \href{#method-ArtifactStore-saveRDS}{\code{ArtifactStore$saveRDS()}} +\item \href{#method-ArtifactStore-readAndromeda}{\code{ArtifactStore$readAndromeda()}} +\item \href{#method-ArtifactStore-saveAndromeda}{\code{ArtifactStore$saveAndromeda()}} +\item \href{#method-ArtifactStore-listArtifacts}{\code{ArtifactStore$listArtifacts()}} +\item \href{#method-ArtifactStore-delete}{\code{ArtifactStore$delete()}} +\item \href{#method-ArtifactStore-ensureDir}{\code{ArtifactStore$ensureDir()}} +\item \href{#method-ArtifactStore-clone}{\code{ArtifactStore$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-exists}{}}} +\subsection{Method \code{exists()}}{ +Check if an artifact exists. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$exists(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key (relative path).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Logical. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-readRDS}{}}} +\subsection{Method \code{readRDS()}}{ +Read an RDS artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$readRDS(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +The deserialized R object. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-saveRDS}{}}} +\subsection{Method \code{saveRDS()}}{ +Save an RDS artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$saveRDS(object, key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{object}}{The R object to save.} + +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-readAndromeda}{}}} +\subsection{Method \code{readAndromeda()}}{ +Read an Andromeda (zip) artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$readAndromeda(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +An Andromeda object. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-saveAndromeda}{}}} +\subsection{Method \code{saveAndromeda()}}{ +Save an Andromeda (zip) artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$saveAndromeda(object, key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{object}}{The Andromeda object to save.} + +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-listArtifacts}{}}} +\subsection{Method \code{listArtifacts()}}{ +List artifacts matching a prefix. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$listArtifacts(prefix = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{prefix}}{Character or NULL. Filter to keys starting with this prefix.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character vector of keys. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-delete}{}}} +\subsection{Method \code{delete()}}{ +Delete an artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$delete(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-ensureDir}{}}} +\subsection{Method \code{ensureDir()}}{ +Ensure a directory exists for a given key. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$ensureDir(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-ArtifactStore-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{ArtifactStore$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/LocalArtifactStore.Rd b/man/LocalArtifactStore.Rd new file mode 100644 index 0000000..4ae5c65 --- /dev/null +++ b/man/LocalArtifactStore.Rd @@ -0,0 +1,235 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/ArtifactStore.R +\name{LocalArtifactStore} +\alias{LocalArtifactStore} +\title{Local Filesystem Artifact Store} +\description{ +Implementation of \link{ArtifactStore} that reads and writes artifacts to a local +filesystem directory. This is the default storage backend. +} +\section{Super class}{ +\code{\link[CohortMethod:ArtifactStore]{CohortMethod::ArtifactStore}} -> \code{LocalArtifactStore} +} +\section{Methods}{ +\subsection{Public methods}{ +\itemize{ +\item \href{#method-LocalArtifactStore-new}{\code{LocalArtifactStore$new()}} +\item \href{#method-LocalArtifactStore-exists}{\code{LocalArtifactStore$exists()}} +\item \href{#method-LocalArtifactStore-readRDS}{\code{LocalArtifactStore$readRDS()}} +\item \href{#method-LocalArtifactStore-saveRDS}{\code{LocalArtifactStore$saveRDS()}} +\item \href{#method-LocalArtifactStore-readAndromeda}{\code{LocalArtifactStore$readAndromeda()}} +\item \href{#method-LocalArtifactStore-saveAndromeda}{\code{LocalArtifactStore$saveAndromeda()}} +\item \href{#method-LocalArtifactStore-listArtifacts}{\code{LocalArtifactStore$listArtifacts()}} +\item \href{#method-LocalArtifactStore-delete}{\code{LocalArtifactStore$delete()}} +\item \href{#method-LocalArtifactStore-ensureDir}{\code{LocalArtifactStore$ensureDir()}} +\item \href{#method-LocalArtifactStore-getFullPath}{\code{LocalArtifactStore$getFullPath()}} +\item \href{#method-LocalArtifactStore-clone}{\code{LocalArtifactStore$clone()}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-new}{}}} +\subsection{Method \code{new()}}{ +Create a new local artifact store. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$new(basePath)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{basePath}}{Character. The root directory for artifact storage.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-exists}{}}} +\subsection{Method \code{exists()}}{ +Check if an artifact exists. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$exists(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key (relative path).} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Logical. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-readRDS}{}}} +\subsection{Method \code{readRDS()}}{ +Read an RDS artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$readRDS(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +The deserialized R object. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-saveRDS}{}}} +\subsection{Method \code{saveRDS()}}{ +Save an RDS artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$saveRDS(object, key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{object}}{The R object to save.} + +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-readAndromeda}{}}} +\subsection{Method \code{readAndromeda()}}{ +Read an Andromeda (zip) artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$readAndromeda(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +An Andromeda object. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-saveAndromeda}{}}} +\subsection{Method \code{saveAndromeda()}}{ +Save an Andromeda (zip) artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$saveAndromeda(object, key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{object}}{The Andromeda object to save.} + +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-listArtifacts}{}}} +\subsection{Method \code{listArtifacts()}}{ +List artifacts matching a prefix. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$listArtifacts(prefix = NULL)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{prefix}}{Character or NULL. Filter to keys starting with this prefix.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character vector of keys. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-delete}{}}} +\subsection{Method \code{delete()}}{ +Delete an artifact. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$delete(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-ensureDir}{}}} +\subsection{Method \code{ensureDir()}}{ +Ensure a directory exists for a given key. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$ensureDir(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-getFullPath}{}}} +\subsection{Method \code{getFullPath()}}{ +Get the full filesystem path for a key. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$getFullPath(key)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{key}}{Character. The artifact key.} +} +\if{html}{\out{
}} +} +\subsection{Returns}{ +Character. The full path. +} +} +\if{html}{\out{
}} +\if{html}{\out{}} +\if{latex}{\out{\hypertarget{method-LocalArtifactStore-clone}{}}} +\subsection{Method \code{clone()}}{ +The objects of this class are cloneable with this method. +\subsection{Usage}{ +\if{html}{\out{
}}\preformatted{LocalArtifactStore$clone(deep = FALSE)}\if{html}{\out{
}} +} + +\subsection{Arguments}{ +\if{html}{\out{
}} +\describe{ +\item{\code{deep}}{Whether to make a deep clone.} +} +\if{html}{\out{
}} +} +} +} diff --git a/man/runCmAnalyses.Rd b/man/runCmAnalyses.Rd index 2044f01..e5268ca 100644 --- a/man/runCmAnalyses.Rd +++ b/man/runCmAnalyses.Rd @@ -16,7 +16,9 @@ runCmAnalyses( nestingCohortTable = "cohort", outputFolder = "./CohortMethodOutput", multiThreadingSettings = createMultiThreadingSettings(), - cmAnalysesSpecifications + cmAnalysesSpecifications, + databaseId, + artifactStore = NULL ) } \arguments{ @@ -69,6 +71,14 @@ the \code{\link[=createMultiThreadingSettings]{createMultiThreadingSettings()}} \item{cmAnalysesSpecifications}{An object of type \code{CmAnalysesSpecifications} as created using the \code{createCmAnalysesSpecifications()}.} + +\item{databaseId}{A unique identifier for the database being used. This is +baked into artifact hashes to prevent accidental reuse of +cached results from a different database. Required.} + +\item{artifactStore}{An object inheriting from \link{ArtifactStore} used to read and +write cached artifacts. If NULL (default), a +\link{LocalArtifactStore} backed by \code{outputFolder} is used.} } \value{ A tibble describing for each target-comparator-outcome-analysisId combination where the intermediary and From d1eafdd6e7cbfeeb71c000f04c7bb7b5b0e5e332 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 4 May 2026 14:36:12 -0700 Subject: [PATCH 06/10] Added more tests to check results cache ar written to disk --- tests/testthat/test-cachingScenarios.R | 279 +++++++++++++++++++++++++ 1 file changed, 279 insertions(+) diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R index 1136e17..e2bba951 100644 --- a/tests/testthat/test-cachingScenarios.R +++ b/tests/testthat/test-cachingScenarios.R @@ -630,3 +630,282 @@ test_that("Reference table is deterministic across repeated calls", { expect_equal(ref1$strataFile, ref2$strataFile) expect_equal(ref1$outcomeModelFile, ref2$outcomeModelFile) }) + + +# =========================================================================== +# End-to-end disk reuse tests (require Eunomia) +# These tests verify that artifact files are actually written to and reused +# from disk, not just that filenames are computed correctly. +# =========================================================================== + +if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { + + # Minimal analysis used across all E2E tests: no PS, no matching, simple cox + makeEunomiaAnalysis <- function(analysisId = 1) { + createCmAnalysis( + analysisId = analysisId, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + firstExposureOnly = TRUE, + washoutPeriod = 183, + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + removeSubjectsWithPriorOutcome = TRUE, + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort end" + ), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + } + + test_that("E2E: files are actually written to disk", { + outputFolder <- tempfile(pattern = "cmCache") + on.exit(unlink(outputFolder, recursive = TRUE)) + + result <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = list(makeEunomiaAnalysis()), + targetComparatorOutcomesList = list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3, outcomeOfInterest = TRUE)) + ) + ) + ) + ) + + # Every non-empty filename in the reference table must exist on disk + checkFiles <- function(col) { + paths <- result[[col]] + paths <- paths[paths != ""] + paths <- unique(paths) + missing <- paths[!file.exists(file.path(outputFolder, paths))] + expect_equal(length(missing), 0, + info = sprintf("Missing %s files: %s", col, paste(missing, collapse = ", "))) + } + + checkFiles("cohortMethodDataFile") + checkFiles("studyPopFile") + checkFiles("outcomeModelFile") + }) + + test_that("E2E: adding a new outcome reuses CmData and computes only new outcome model", { + outputFolder <- tempfile(pattern = "cmCacheReuse") + on.exit(unlink(outputFolder, recursive = TRUE)) + + analysis <- makeEunomiaAnalysis() + tcos_base <- createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 3, outcomeOfInterest = TRUE) + ) + ) + tcos_extended <- createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 3, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 4, outcomeOfInterest = TRUE) # NEW outcome + ) + ) + + # First run: outcomes [3] + result1 <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = list(analysis), + targetComparatorOutcomesList = list(tcos_base) + ) + ) + + # Record modification times of existing artifacts after first run + cmDataPath <- file.path(outputFolder, result1$cohortMethodDataFile[1]) + studyPopPath <- file.path(outputFolder, result1$studyPopFile[1]) + om3Path <- file.path(outputFolder, result1$outcomeModelFile[1]) + + expect_true(file.exists(cmDataPath)) + expect_true(file.exists(studyPopPath)) + expect_true(file.exists(om3Path)) + + mtime_cmData <- file.info(cmDataPath)$mtime + mtime_studyPop <- file.info(studyPopPath)$mtime + mtime_om3 <- file.info(om3Path)$mtime + + # Brief sleep to ensure mtime would differ if files were rewritten + Sys.sleep(1) + + # Second run: add outcome 4 + result2 <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = list(analysis), + targetComparatorOutcomesList = list(tcos_extended) + ) + ) + + # CmData file: same filename, NOT re-written (mtime unchanged) + expect_equal( + result1$cohortMethodDataFile[1], + result2$cohortMethodDataFile[result2$outcomeId == 3] + ) + expect_equal(file.info(cmDataPath)$mtime, mtime_cmData) + + # StudyPop for outcome 3: same filename, NOT re-written + expect_equal( + result1$studyPopFile[1], + result2$studyPopFile[result2$outcomeId == 3] + ) + expect_equal(file.info(studyPopPath)$mtime, mtime_studyPop) + + # Outcome model for outcome 3: same filename, NOT re-written + expect_equal( + result1$outcomeModelFile[1], + result2$outcomeModelFile[result2$outcomeId == 3] + ) + expect_equal(file.info(om3Path)$mtime, mtime_om3) + + # Outcome 4: new outcome model created + om4Path <- file.path(outputFolder, result2$outcomeModelFile[result2$outcomeId == 4]) + expect_true(file.exists(om4Path)) + + # Outcome 4's study pop: newly created + sp4Path <- file.path(outputFolder, result2$studyPopFile[result2$outcomeId == 4]) + expect_true(file.exists(sp4Path)) + + # New outcome 4 has a DIFFERENT filename than outcome 3 + expect_false( + result2$outcomeModelFile[result2$outcomeId == 3] == + result2$outcomeModelFile[result2$outcomeId == 4] + ) + }) + + test_that("E2E: changing settings forces recomputation with new filename", { + outputFolder <- tempfile(pattern = "cmCacheInvalidate") + on.exit(unlink(outputFolder, recursive = TRUE)) + + tcos <- createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 3, outcomeOfInterest = TRUE)) + ) + + analysis_v1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + firstExposureOnly = TRUE, + washoutPeriod = 183, + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort end" + ), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + analysis_v2 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + firstExposureOnly = TRUE, + washoutPeriod = 183, + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE, + useDemographicsAge = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + minDaysAtRisk = 30, # CHANGED: was 1, now 30 + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort end" + ), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + + # First run + result_v1 <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = list(analysis_v1), + targetComparatorOutcomesList = list(tcos) + ) + ) + + # Record what was created + cmDataFile_v1 <- result_v1$cohortMethodDataFile[1] + studyPopFile_v1 <- result_v1$studyPopFile[1] + omFile_v1 <- result_v1$outcomeModelFile[1] + + expect_true(file.exists(file.path(outputFolder, cmDataFile_v1))) + expect_true(file.exists(file.path(outputFolder, studyPopFile_v1))) + expect_true(file.exists(file.path(outputFolder, omFile_v1))) + + # Second run with changed study pop args + result_v2 <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = list(analysis_v2), + targetComparatorOutcomesList = list(tcos) + ) + ) + + cmDataFile_v2 <- result_v2$cohortMethodDataFile[1] + studyPopFile_v2 <- result_v2$studyPopFile[1] + omFile_v2 <- result_v2$outcomeModelFile[1] + + # CmData: same (load args unchanged), study pop and outcome model: new files + expect_equal(cmDataFile_v1, cmDataFile_v2) + expect_false(studyPopFile_v1 == studyPopFile_v2) + expect_false(omFile_v1 == omFile_v2) + + # New files actually exist on disk + expect_true(file.exists(file.path(outputFolder, studyPopFile_v2))) + expect_true(file.exists(file.path(outputFolder, omFile_v2))) + + # OLD study pop and outcome model files still on disk (not deleted by new run) + expect_true(file.exists(file.path(outputFolder, studyPopFile_v1))) + expect_true(file.exists(file.path(outputFolder, omFile_v1))) + }) + +} From be90862951ac72b8d5378d7a192b00e589db4b14 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 4 May 2026 14:39:07 -0700 Subject: [PATCH 07/10] fix for broken equipoise data load --- R/RunAnalyses.R | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R index 42507af..4932d46 100644 --- a/R/RunAnalyses.R +++ b/R/RunAnalyses.R @@ -1961,9 +1961,14 @@ addEquipoise <- function(referenceTable, outputFolder) { filter(.data$sharedPsFile == "" & .data$psFile != "") |> distinct(.data$psFile) |> pull() - equipoise <- bind_rows(lapply(psFiles, getEquipoise)) - referenceTable <- referenceTable |> - left_join(equipoise, by = join_by("psFile")) + if (length(psFiles) > 0) { + equipoise <- bind_rows(lapply(psFiles, getEquipoise)) + referenceTable <- referenceTable |> + left_join(equipoise, by = join_by("psFile")) + } else { + referenceTable <- referenceTable |> + mutate(equipoise = NA_real_) + } } return(referenceTable) } From e47fb0d575a4d3364b2cbc02fe8a8cd584d529ed Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 6 May 2026 10:28:02 -0700 Subject: [PATCH 08/10] shared base population that is based on only the target/nesting/database/time at risk settings and not outcome based --- R/RunAnalyses.R | 257 ++++++++++++++++++------- R/StudyPopulation.R | 242 +++++++++++++---------- tests/testthat/test-cachingScenarios.R | 175 ++++++++++++++++- 3 files changed, 506 insertions(+), 168 deletions(-) diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R index 4932d46..0d91937 100644 --- a/R/RunAnalyses.R +++ b/R/RunAnalyses.R @@ -323,7 +323,60 @@ runCmAnalyses <- function(connectionDetails, ParallelLogger::stopCluster(cluster) } - # Create study populations -------------------------------------- + # Create base populations (shared across outcomes with same risk windows) --- + subset <- referenceTable[!duplicated(referenceTable$basePopFile), ] + subset <- subset[subset$basePopFile != "", ] + subset <- subset[!file.exists(file.path(outputFolder, subset$basePopFile)), ] + if (nrow(subset) != 0) { + message("*** Creating base populations ***") + createBasePopTask <- function(i) { + refRow <- subset[i, ] + analysisRow <- ParallelLogger::matchInList( + cmAnalysesSpecifications$cmAnalysisList, + list(analysisId = refRow$analysisId) + )[[1]] + createStudyPopulationArgs <- analysisRow$createStudyPopulationArgs + + # Apply per-outcome risk window overrides (these affect the base pop) + tco <- ParallelLogger::matchInList( + cmAnalysesSpecifications$targetComparatorOutcomesList, + list( + nestingCohortId = if (is.na(refRow$nestingCohortId)) {NULL} else {refRow$nestingCohortId}, + comparatorId = refRow$comparatorId, + targetId = refRow$targetId + ) + )[[1]] + outcome <- ParallelLogger::matchInList( + tco$outcomes, + list(outcomeId = as.numeric(refRow$outcomeId)) + ) + if (!is.null(outcome$riskWindowStart)) { + createStudyPopulationArgs$riskWindowStart <- outcome$riskWindowStart + } + if (!is.null(outcome$startAnchor)) { + createStudyPopulationArgs$startAnchor <- outcome$startAnchor + } + if (!is.null(outcome$riskWindowEnd)) { + createStudyPopulationArgs$riskWindowEnd <- outcome$riskWindowEnd + } + if (!is.null(outcome$endAnchor)) { + createStudyPopulationArgs$endAnchor <- outcome$endAnchor + } + task <- list( + cohortMethodDataFile = file.path(outputFolder, refRow$cohortMethodDataFile), + createStudyPopulationArgs = createStudyPopulationArgs, + basePopFile = file.path(outputFolder, refRow$basePopFile) + ) + return(task) + } + objectsToCreate <- lapply(1:nrow(subset), createBasePopTask) + cluster <- ParallelLogger::makeCluster(min(length(objectsToCreate), multiThreadingSettings$createStudyPopThreads)) + ParallelLogger::clusterRequire(cluster, "CohortMethod") + dummy <- ParallelLogger::clusterApply(cluster, objectsToCreate, doCreateBasePopObject) + ParallelLogger::stopCluster(cluster) + } + + # Create study populations (per-outcome, from base populations) ----------- subset <- referenceTable[!duplicated(referenceTable$studyPopFile), ] subset <- subset[subset$studyPopFile != "", ] subset <- subset[!file.exists(file.path(outputFolder, subset$studyPopFile)), ] @@ -366,10 +419,8 @@ runCmAnalyses <- function(connectionDetails, createStudyPopulationArgs$endAnchor <- outcome$endAnchor } task <- list( - cohortMethodDataFile = file.path( - outputFolder, - refRow$cohortMethodDataFile - ), + cohortMethodDataFile = file.path(outputFolder, refRow$cohortMethodDataFile), + basePopFile = file.path(outputFolder, refRow$basePopFile), outcomeId = refRow$outcomeId, createStudyPopulationArgs = createStudyPopulationArgs, minimizeFileSizes = getOption("minimizeFileSizes"), @@ -437,7 +488,7 @@ runCmAnalyses <- function(connectionDetails, refRow$cohortMethodDataFile ), args = createPsArgs, - createStudyPopulationArgs = analysisRow$createStudyPopulationArgs, + basePopFile = file.path(outputFolder, refRow$basePopFile), sharedPsFile = file.path(outputFolder, refRow$sharedPsFile) ) return(task) @@ -750,17 +801,31 @@ doCreateCmDataObject <- function(params) { return(NULL) } +doCreateBasePopObject <- function(params) { + cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile) + ParallelLogger::logDebug(sprintf("Calling createBasePopulation() using %s", + params$cohortMethodDataFile)) + basePop <- createBasePopulation(cohortMethodData, + createStudyPopulationArgs = params$createStudyPopulationArgs) + saveRDS(basePop, params$basePopFile) + return(NULL) +} + doCreateStudyPopObject <- function(params) { + basePop <- readRDS(params$basePopFile) cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile) - args <- list( + ParallelLogger::logDebug(sprintf("Calling addOutcomeToPopulation() using %s for outcomeId %s", + params$basePopFile, + params$outcomeId)) + studyPop <- addOutcomeToPopulation( + basePopulation = basePop, cohortMethodData = cohortMethodData, outcomeId = params$outcomeId, - createStudyPopulationArgs = params$createStudyPopulationArgs + removeSubjectsWithPriorOutcome = params$createStudyPopulationArgs$removeSubjectsWithPriorOutcome, + priorOutcomeLookback = params$createStudyPopulationArgs$priorOutcomeLookback, + startAnchor = params$createStudyPopulationArgs$startAnchor, + riskWindowStart = params$createStudyPopulationArgs$riskWindowStart ) - ParallelLogger::logDebug(sprintf("Calling createStudyPopulation() using %s for outcomeId %s", - params$cohortMethodDataFile, - args$outcomeId)) - studyPop <- do.call("createStudyPopulation", args) if (!is.null(params$minimizeFileSizes) && params$minimizeFileSizes) { metaData <- attr(studyPop, "metaData") studyPop <- studyPop[, c("rowId", "treatment", "personSeqId", "outcomeCount", "timeAtRisk", "survivalTime")] @@ -789,20 +854,16 @@ doFitPsModel <- function(params) { doFitSharedPsModel <- function(params, refitPsForEveryStudyPopulation) { cohortMethodData <- getCohortMethodData(params$cohortMethodDataFile) if (refitPsForEveryStudyPopulation) { - args <- list( - cohortMethodData = cohortMethodData, - createStudyPopulationArgs = params$createStudyPopulationArgs - ) - message("Fitting propensity model across all outcomes (ignore messages about 'no outcome specified')") - ParallelLogger::logDebug(sprintf("Calling createPs() for shared PS using %s", - params$cohortMethodDataFile)) - studyPop <- do.call("createStudyPopulation", args) + # Use base population directly (outcome-independent by construction) + basePop <- readRDS(params$basePopFile) + ParallelLogger::logDebug(sprintf("Calling createPs() for shared PS using base population %s", + params$basePopFile)) } else { - studyPop <- NULL + basePop <- NULL } args <- list( cohortMethodData = cohortMethodData, - population = studyPop, + population = basePop, createPsArgs = params$args ) ps <- do.call("createPs", args) @@ -1152,13 +1213,85 @@ createReferenceTable <- function(cmAnalysisList, ) referenceTable <- inner_join(referenceTable, analysisIdToStudyPopArgsId, by = "analysisId") - # Content-addressable hash for study population files + # Compute effective base population args per row (applying per-outcome risk window overrides) + # Base pop args include only outcome-independent fields: risk windows, censoring, minDaysAtRisk + .getEffectiveBasePopArgsJson <- function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + args <- cmAnalysisList[[aIdx]]$createStudyPopulationArgs + + # Look up per-outcome overrides from targetComparatorOutcomesList + tcoMatch <- Filter(function(tco) { + tco$targetId == referenceTable$targetId[i] && + tco$comparatorId == referenceTable$comparatorId[i] && + (is.null(tco$nestingCohortId) && is.na(referenceTable$nestingCohortId[i]) || + identical(tco$nestingCohortId, referenceTable$nestingCohortId[i])) + }, targetComparatorOutcomesList) + if (length(tcoMatch) > 0) { + outcomeMatch <- Filter(function(o) o$outcomeId == referenceTable$outcomeId[i], + tcoMatch[[1]]$outcomes) + if (length(outcomeMatch) > 0) { + outcome <- outcomeMatch[[1]] + # Apply risk window overrides that affect the base population + baseFields <- list( + minDaysAtRisk = args$minDaysAtRisk, + maxDaysAtRisk = args$maxDaysAtRisk, + riskWindowStart = if (!is.null(outcome$riskWindowStart)) outcome$riskWindowStart else args$riskWindowStart, + startAnchor = if (!is.null(outcome$startAnchor)) outcome$startAnchor else args$startAnchor, + riskWindowEnd = if (!is.null(outcome$riskWindowEnd)) outcome$riskWindowEnd else args$riskWindowEnd, + endAnchor = if (!is.null(outcome$endAnchor)) outcome$endAnchor else args$endAnchor, + censorAtNewRiskWindow = args$censorAtNewRiskWindow + ) + return(as.character(jsonlite::toJSON(baseFields[order(names(baseFields))], + auto_unbox = TRUE, digits = NA, null = "null"))) + } + } + # No overrides — use analysis-level args + .serializeBasePopArgs(args) + } + + # Compute outcome-specific args JSON (removeSubjectsWithPriorOutcome + priorOutcomeLookback) + .getOutcomeSpecificArgsJson <- function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) + args <- cmAnalysisList[[aIdx]]$createStudyPopulationArgs + priorOutcomeLookback <- args$priorOutcomeLookback + removeSubjectsWithPriorOutcome <- args$removeSubjectsWithPriorOutcome + + # Check for per-outcome priorOutcomeLookback override + tcoMatch <- Filter(function(tco) { + tco$targetId == referenceTable$targetId[i] && + tco$comparatorId == referenceTable$comparatorId[i] && + (is.null(tco$nestingCohortId) && is.na(referenceTable$nestingCohortId[i]) || + identical(tco$nestingCohortId, referenceTable$nestingCohortId[i])) + }, targetComparatorOutcomesList) + if (length(tcoMatch) > 0) { + outcomeMatch <- Filter(function(o) o$outcomeId == referenceTable$outcomeId[i], + tcoMatch[[1]]$outcomes) + if (length(outcomeMatch) > 0 && !is.null(outcomeMatch[[1]]$priorOutcomeLookback)) { + priorOutcomeLookback <- outcomeMatch[[1]]$priorOutcomeLookback + } + } + as.character(jsonlite::toJSON(list( + removeSubjectsWithPriorOutcome = removeSubjectsWithPriorOutcome, + priorOutcomeLookback = priorOutcomeLookback + ), auto_unbox = TRUE, digits = NA, null = "null")) + } + + basePopArgsJsons <- vapply(seq_len(nrow(referenceTable)), .getEffectiveBasePopArgsJson, character(1)) + outcomeSpecificArgsJsons <- vapply(seq_len(nrow(referenceTable)), .getOutcomeSpecificArgsJson, character(1)) + + # Content-addressable hash for base population files (outcome-independent) + referenceTable$basePopHash <- vapply(seq_len(nrow(referenceTable)), function(i) { + .contentHash(databaseId, referenceTable$loadHash[i], basePopArgsJsons[i]) + }, character(1)) + referenceTable$basePopFile <- .createBasePopulationFileName(referenceTable$basePopHash) + + # Content-addressable hash for study population files (outcome-specific) referenceTable$studyPopFile <- vapply(seq_len(nrow(referenceTable)), function(i) { hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], - referenceTable$outcomeId[i] + referenceTable$basePopHash[i], + referenceTable$outcomeId[i], + outcomeSpecificArgsJsons[i] ) .createStudyPopulationFileName(hash) }, character(1)) @@ -1178,52 +1311,26 @@ createReferenceTable <- function(cmAnalysisList, idxWithPs <- !(referenceTable$psArgsId %in% noPsIds) referenceTable$psFile <- "" referenceTable$psFile[idxWithPs] <- vapply(which(idxWithPs), function(i) { + aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], - createPsArgsJsons[[match(referenceTable$analysisId[i], analyses$analysisId)]], + referenceTable$basePopHash[i], + createPsArgsJsons[[aIdx]], referenceTable$outcomeId[i] ) .createPsFileName(hash) }, character(1)) + # Shared PS: uses basePopHash directly (outcome-independent, replaces equivalent() logic) referenceTable$sharedPsFile <- "" if (!refitPsForEveryOutcome) { if (refitPsForEveryStudyPopulation) { - # Find equivalent studyPopArgs, so we can reuse PS over those as well: - equivalent <- function(studyPopArgs1, studyPopArgs2) { - if (studyPopArgs1$minDaysAtRisk == 0 && studyPopArgs2$minDaysAtRisk == 0) { - return(TRUE) - } else if (studyPopArgs1$minDaysAtRisk != studyPopArgs2$minDaysAtRisk || - studyPopArgs1$riskWindowStart != studyPopArgs2$riskWindowStart || - studyPopArgs1$startAnchor != studyPopArgs2$startAnchor || - studyPopArgs1$riskWindowEnd != studyPopArgs2$riskWindowEnd || - studyPopArgs1$endAnchor != studyPopArgs2$endAnchor || - studyPopArgs1$censorAtNewRiskWindow != studyPopArgs2$censorAtNewRiskWindow) { - return(FALSE) - } else { - return(TRUE) - } - } - # Compute the canonical (first equivalent) studyPop JSON for hashing - studyPopArgsEquivalentIdx <- seq_along(cmAnalysisList) - for (i in seq_along(cmAnalysisList)) { - for (j in seq_len(i - 1)) { - if (equivalent(cmAnalysisList[[i]]$createStudyPopulationArgs, - cmAnalysisList[[j]]$createStudyPopulationArgs)) { - studyPopArgsEquivalentIdx[i] <- j - break - } - } - } + # basePopHash already captures equivalence for risk window parameters referenceTable$sharedPsFile[idxWithPs] <- vapply(which(idxWithPs), function(i) { aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) - equivIdx <- studyPopArgsEquivalentIdx[aIdx] hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[equivIdx]], + referenceTable$basePopHash[i], createPsArgsJsons[[aIdx]] ) .createPsFileName(hash) @@ -1264,8 +1371,7 @@ createReferenceTable <- function(cmAnalysisList, aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[aIdx]], + referenceTable$basePopHash[i], createPsArgsJsons[[aIdx]], strataArgsJsons[[aIdx]], referenceTable$outcomeId[i] @@ -1294,8 +1400,7 @@ createReferenceTable <- function(cmAnalysisList, aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[aIdx]], + referenceTable$basePopHash[i], createPsArgsJsons[[aIdx]], strataArgsJsons[[aIdx]], sharedBalanceArgsJsons[[aIdx]] @@ -1342,8 +1447,7 @@ createReferenceTable <- function(cmAnalysisList, aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[aIdx]], + referenceTable$basePopHash[i], createPsArgsJsons[[aIdx]], strataArgsJsons[[aIdx]], balanceArgsJsons[[aIdx]], @@ -1403,8 +1507,7 @@ createReferenceTable <- function(cmAnalysisList, aIdx <- match(referenceTable$analysisId[i], analyses$analysisId) hash <- .contentHash( databaseId, - referenceTable$loadHash[i], - studyPopArgsJsons[[aIdx]], + referenceTable$basePopHash[i], createPsArgsJsons[[aIdx]], strataArgsJsons[[aIdx]], outcomeModelArgsJsons[[aIdx]], @@ -1425,6 +1528,8 @@ createReferenceTable <- function(cmAnalysisList, "outcomeOfInterest", "trueEffectSize", "cohortMethodDataFile", + "basePopFile", + "basePopHash", "studyPopFile", "sharedPsFile", "psFile", @@ -1498,10 +1603,30 @@ createReferenceTable <- function(cmAnalysisList, substr(digest::digest(canonical, algo = "sha256", serialize = FALSE), 1, length) } +# Serialize only the base-population-relevant fields of a CreateStudyPopulationArgs object. +# Excludes outcome-specific fields (removeSubjectsWithPriorOutcome, priorOutcomeLookback). +.serializeBasePopArgs <- function(args) { + baseFields <- list( + minDaysAtRisk = args$minDaysAtRisk, + maxDaysAtRisk = args$maxDaysAtRisk, + riskWindowStart = args$riskWindowStart, + startAnchor = args$startAnchor, + riskWindowEnd = args$riskWindowEnd, + endAnchor = args$endAnchor, + censorAtNewRiskWindow = args$censorAtNewRiskWindow + ) + as.character(jsonlite::toJSON(baseFields[order(names(baseFields))], + auto_unbox = TRUE, digits = NA, null = "null")) +} + .createCohortMethodDataFileName <- function(loadHash) { sprintf("CmData_%s.zip", loadHash) } +.createBasePopulationFileName <- function(hash) { + sprintf("BasePop_%s.rds", hash) +} + .createPrefilteredCovariatesFileName <- function(hash) { sprintf("Prefilter_%s.zip", hash) } diff --git a/R/StudyPopulation.R b/R/StudyPopulation.R index 7d5b739..be192fd 100644 --- a/R/StudyPopulation.R +++ b/R/StudyPopulation.R @@ -31,43 +31,14 @@ fastDuplicated <- function(data, columns) { } } -#' Create a study population -#' -#' @details -#' Create a study population by enforcing certain inclusion and exclusion criteria, defining a risk -#' window, and determining which outcomes fall inside the risk window. -#' -#' @template CohortMethodData -#' -#' @param population If specified, this population will be used as the starting -#' point instead of the cohorts in the `cohortMethodData` object. -#' @param outcomeId The ID of the outcome. If NULL, no outcome-specific -#' transformations will be performed. -#' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs` as created by -#' the [createCreateStudyPopulationArgs()] function. -#' @return -#' A `tibble` specifying the study population. This `tibble` will have the following columns: -#' -#' - `rowId`: A unique identifier for an exposure. -#' - `personSeqId`: The person sequence ID of the subject. -#' - `cohortStartdate`: The index date. -#' - `outcomeCount` The number of outcomes observed during the risk window. -#' - `timeAtRisk`: The number of days in the risk window. -#' - `survivalTime`: The number of days until either the outcome or the end of the risk window. -#' -#' @export -createStudyPopulation <- function(cohortMethodData, - population = NULL, - outcomeId = NULL, - createStudyPopulationArgs = createCreateStudyPopulationArgs()) { - errorMessages <- checkmate::makeAssertCollection() - checkmate::assertClass(cohortMethodData, "CohortMethodData", add = errorMessages) - checkmate::assertDataFrame(population, null.ok = TRUE, add = errorMessages) - checkmate::assertNumeric(outcomeId, null.ok = TRUE, add = errorMessages) - if (!is.null(outcomeId)) checkmate::assertTRUE(all(outcomeId %% 1 == 0), add = errorMessages) - checkmate::assertR6(createStudyPopulationArgs, "CreateStudyPopulationArgs", add = errorMessages) - checkmate::reportAssertions(collection = errorMessages) - +# Internal: Create a base population (outcome-independent) +# +# Performs all outcome-independent filtering: risk window creation, censoring, +# and minimum days-at-risk filtering. This base population can be shared across +# multiple outcomes that use the same risk window parameters. +createBasePopulation <- function(cohortMethodData, + population = NULL, + createStudyPopulationArgs = createCreateStudyPopulationArgs()) { isEnd <- function(anchor) { return(grepl("end$", anchor, ignore.case = TRUE)) } @@ -82,38 +53,6 @@ createStudyPopulation <- function(cohortMethodData, } metaData$targetEstimator <- "ate" - if (createStudyPopulationArgs$removeSubjectsWithPriorOutcome) { - if (is.null(outcomeId)) { - message("No outcome specified so skipping removing people with prior outcomes") - } else { - message("Removing subjects with prior outcomes (if any)") - outcomes <- cohortMethodData$outcomes |> - filter(.data$outcomeId == !!outcomeId) |> - collect() - if (isEnd(createStudyPopulationArgs$startAnchor)) { - outcomes <- merge(outcomes, population[, c("rowId", "daysToCohortEnd")]) - priorOutcomeRowIds <- outcomes |> - filter( - .data$daysToEvent > -createStudyPopulationArgs$priorOutcomeLookback & - outcomes$daysToEvent < outcomes$daysToCohortEnd + createStudyPopulationArgs$riskWindowStart - ) |> - pull("rowId") - } else { - priorOutcomeRowIds <- outcomes |> - filter( - .data$daysToEvent > -createStudyPopulationArgs$priorOutcomeLookback & - .data$daysToEvent < createStudyPopulationArgs$riskWindowStart - ) |> - pull("rowId") - } - population <- population |> - filter(!(.data$rowId %in% priorOutcomeRowIds)) - metaData$attrition <- rbind( - metaData$attrition, - getCounts(population, paste("No prior outcome")) - ) - } - } # Create risk windows: population$riskStart <- rep(createStudyPopulationArgs$riskWindowStart, nrow(population)) if (isEnd(createStudyPopulationArgs$startAnchor)) { @@ -168,49 +107,154 @@ createStudyPopulation <- function(cohortMethodData, "days at risk" ))) } - if (is.null(outcomeId)) { - message("No outcome specified so not creating outcome and time variables") - } else { - # Select outcomes during time at risk + attr(population, "metaData") <- metaData + ParallelLogger::logDebug("Base population has ", nrow(population), " rows") + return(population) +} + +# Internal: Add outcome-specific data to a base population +# +# Applies outcome-specific filtering (removeSubjectsWithPriorOutcome) and +# computes outcome columns (outcomeCount, timeAtRisk, survivalTime, daysToEvent). +addOutcomeToPopulation <- function(basePopulation, + cohortMethodData, + outcomeId, + removeSubjectsWithPriorOutcome = TRUE, + priorOutcomeLookback = 99999, + startAnchor = "cohort start", + riskWindowStart = 0) { + isEnd <- function(anchor) { + return(grepl("end$", anchor, ignore.case = TRUE)) + } + + population <- basePopulation + metaData <- attr(population, "metaData") + + if (removeSubjectsWithPriorOutcome) { + message("Removing subjects with prior outcomes (if any)") outcomes <- cohortMethodData$outcomes |> filter(.data$outcomeId == !!outcomeId) |> collect() - outcomes <- merge(outcomes, population[, c("rowId", "riskStart", "riskEnd")]) - outcomes <- outcomes |> - filter( - .data$daysToEvent >= .data$riskStart & - .data$daysToEvent <= .data$riskEnd - ) - - # Create outcome count column - if (nrow(outcomes) == 0) { - population$outcomeCount <- rep(0, nrow(population)) + if (isEnd(startAnchor)) { + outcomes <- merge(outcomes, population[, c("rowId", "daysToCohortEnd")]) + priorOutcomeRowIds <- outcomes |> + filter( + .data$daysToEvent > -priorOutcomeLookback & + outcomes$daysToEvent < outcomes$daysToCohortEnd + riskWindowStart + ) |> + pull("rowId") } else { - outcomeCount <- outcomes |> - group_by(.data$rowId) |> - summarise(outcomeCount = length(.data$outcomeId)) - population$outcomeCount <- 0 - population$outcomeCount[match(outcomeCount$rowId, population$rowId)] <- outcomeCount$outcomeCount + priorOutcomeRowIds <- outcomes |> + filter( + .data$daysToEvent > -priorOutcomeLookback & + .data$daysToEvent < riskWindowStart + ) |> + pull("rowId") } + population <- population |> + filter(!(.data$rowId %in% priorOutcomeRowIds)) + metaData$attrition <- rbind( + metaData$attrition, + getCounts(population, paste("No prior outcome")) + ) + } - # Create time at risk column - population$timeAtRisk <- population$riskEnd - population$riskStart + 1 + # Select outcomes during time at risk + outcomes <- cohortMethodData$outcomes |> + filter(.data$outcomeId == !!outcomeId) |> + collect() + outcomes <- merge(outcomes, population[, c("rowId", "riskStart", "riskEnd")]) + outcomes <- outcomes |> + filter( + .data$daysToEvent >= .data$riskStart & + .data$daysToEvent <= .data$riskEnd + ) - # Create survival time column - firstOutcomes <- outcomes |> - arrange(.data$rowId, .data$daysToEvent) |> - filter(!duplicated(.data$rowId)) - population$daysToEvent <- rep(NA, nrow(population)) - population$daysToEvent[match(firstOutcomes$rowId, population$rowId)] <- firstOutcomes$daysToEvent - population$survivalTime <- population$timeAtRisk - population$survivalTime[population$outcomeCount != 0] <- population$daysToEvent[population$outcomeCount != - 0] - population$riskStart[population$outcomeCount != 0] + 1 + # Create outcome count column + if (nrow(outcomes) == 0) { + population$outcomeCount <- rep(0, nrow(population)) + } else { + outcomeCount <- outcomes |> + group_by(.data$rowId) |> + summarise(outcomeCount = length(.data$outcomeId)) + population$outcomeCount <- 0 + population$outcomeCount[match(outcomeCount$rowId, population$rowId)] <- outcomeCount$outcomeCount } + + # Create time at risk column + population$timeAtRisk <- population$riskEnd - population$riskStart + 1 + + # Create survival time column + firstOutcomes <- outcomes |> + arrange(.data$rowId, .data$daysToEvent) |> + filter(!duplicated(.data$rowId)) + population$daysToEvent <- rep(NA, nrow(population)) + population$daysToEvent[match(firstOutcomes$rowId, population$rowId)] <- firstOutcomes$daysToEvent + population$survivalTime <- population$timeAtRisk + population$survivalTime[population$outcomeCount != 0] <- population$daysToEvent[population$outcomeCount != + 0] - population$riskStart[population$outcomeCount != 0] + 1 + attr(population, "metaData") <- metaData ParallelLogger::logDebug("Study population has ", nrow(population), " rows") return(population) } +#' Create a study population +#' +#' @details +#' Create a study population by enforcing certain inclusion and exclusion criteria, defining a risk +#' window, and determining which outcomes fall inside the risk window. +#' +#' @template CohortMethodData +#' +#' @param population If specified, this population will be used as the starting +#' point instead of the cohorts in the `cohortMethodData` object. +#' @param outcomeId The ID of the outcome. If NULL, no outcome-specific +#' transformations will be performed. +#' @param createStudyPopulationArgs An object of type `CreateStudyPopulationArgs` as created by +#' the [createCreateStudyPopulationArgs()] function. +#' @return +#' A `tibble` specifying the study population. This `tibble` will have the following columns: +#' +#' - `rowId`: A unique identifier for an exposure. +#' - `personSeqId`: The person sequence ID of the subject. +#' - `cohortStartdate`: The index date. +#' - `outcomeCount` The number of outcomes observed during the risk window. +#' - `timeAtRisk`: The number of days in the risk window. +#' - `survivalTime`: The number of days until either the outcome or the end of the risk window. +#' +#' @export +createStudyPopulation <- function(cohortMethodData, + population = NULL, + outcomeId = NULL, + createStudyPopulationArgs = createCreateStudyPopulationArgs()) { + errorMessages <- checkmate::makeAssertCollection() + checkmate::assertClass(cohortMethodData, "CohortMethodData", add = errorMessages) + checkmate::assertDataFrame(population, null.ok = TRUE, add = errorMessages) + checkmate::assertNumeric(outcomeId, null.ok = TRUE, add = errorMessages) + if (!is.null(outcomeId)) checkmate::assertTRUE(all(outcomeId %% 1 == 0), add = errorMessages) + checkmate::assertR6(createStudyPopulationArgs, "CreateStudyPopulationArgs", add = errorMessages) + checkmate::reportAssertions(collection = errorMessages) + + basePop <- createBasePopulation(cohortMethodData, population, createStudyPopulationArgs) + + if (is.null(outcomeId)) { + message("No outcome specified so not creating outcome and time variables") + return(basePop) + } + + studyPop <- addOutcomeToPopulation( + basePopulation = basePop, + cohortMethodData = cohortMethodData, + outcomeId = outcomeId, + removeSubjectsWithPriorOutcome = createStudyPopulationArgs$removeSubjectsWithPriorOutcome, + priorOutcomeLookback = createStudyPopulationArgs$priorOutcomeLookback, + startAnchor = createStudyPopulationArgs$startAnchor, + riskWindowStart = createStudyPopulationArgs$riskWindowStart + ) + return(studyPop) +} + #' Get the attrition table for a population #' #' @param object Either an object of type [CohortMethodData], a population object generated by diff --git a/tests/testthat/test-cachingScenarios.R b/tests/testthat/test-cachingScenarios.R index e2bba951..c146175 100644 --- a/tests/testthat/test-cachingScenarios.R +++ b/tests/testthat/test-cachingScenarios.R @@ -633,11 +633,165 @@ test_that("Reference table is deterministic across repeated calls", { # =========================================================================== -# End-to-end disk reuse tests (require Eunomia) -# These tests verify that artifact files are actually written to and reused -# from disk, not just that filenames are computed correctly. +# Base Population Sharing Tests +# Verify that the two-phase population architecture correctly shares +# base populations across outcomes while keeping study populations distinct. # =========================================================================== +test_that("Two outcomes with same risk windows share the same basePopFile", { + analysis <- makeBasicAnalysis() + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 10, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 20, outcomeOfInterest = TRUE) + ) + ) + ) + ref <- buildRef(list(analysis), tcos) + + # Same base population (same risk windows for both outcomes) + expect_equal(ref$basePopFile[ref$outcomeId == 10], + ref$basePopFile[ref$outcomeId == 20]) + # Different study populations (different outcomeId) + expect_true(ref$studyPopFile[ref$outcomeId == 10] != + ref$studyPopFile[ref$outcomeId == 20]) +}) + +test_that("Per-outcome risk window override creates different basePopFile", { + analysis <- makeBasicAnalysis() + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 10, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 20, outcomeOfInterest = TRUE, + riskWindowEnd = 60, endAnchor = "cohort start") # Override default of 30 + ) + ) + ) + ref <- buildRef(list(analysis), tcos) + + # Different base populations (different risk window end) + expect_true(ref$basePopFile[ref$outcomeId == 10] != + ref$basePopFile[ref$outcomeId == 20]) + # Different study populations too + expect_true(ref$studyPopFile[ref$outcomeId == 10] != + ref$studyPopFile[ref$outcomeId == 20]) +}) + +test_that("Changing priorOutcomeLookback changes studyPopFile but NOT basePopFile", { + analysis1 <- createCmAnalysis( + analysisId = 1, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + removeSubjectsWithPriorOutcome = TRUE, + priorOutcomeLookback = 99999, + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort start" + ), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + analysis2 <- createCmAnalysis( + analysisId = 2, + getDbCohortMethodDataArgs = createGetDbCohortMethodDataArgs( + covariateSettings = FeatureExtraction::createCovariateSettings( + useDemographicsGender = TRUE + ) + ), + createStudyPopulationArgs = createCreateStudyPopulationArgs( + removeSubjectsWithPriorOutcome = TRUE, + priorOutcomeLookback = 365, + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort start" + ), + fitOutcomeModelArgs = createFitOutcomeModelArgs(modelType = "cox") + ) + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 10, outcomeOfInterest = TRUE)) + ) + ) + ref <- buildRef(list(analysis1, analysis2), tcos) + + row1 <- ref[ref$analysisId == 1, ] + row2 <- ref[ref$analysisId == 2, ] + + # Same base population (same risk windows, minDaysAtRisk, etc.) + expect_equal(row1$basePopFile, row2$basePopFile) + # Different study populations (different priorOutcomeLookback) + expect_true(row1$studyPopFile != row2$studyPopFile) +}) + +test_that("Shared PS file uses basePopHash (outcome-independent)", { + analysis <- makeBasicAnalysis() + tcos <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 10, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 20, outcomeOfInterest = TRUE) + ) + ) + ) + ref <- buildRef(list(analysis), tcos) + + # Shared PS should be the same for both outcomes (based on basePopHash) + expect_equal(ref$sharedPsFile[ref$outcomeId == 10], + ref$sharedPsFile[ref$outcomeId == 20]) + # And both should be non-empty + expect_true(ref$sharedPsFile[1] != "") +}) + +test_that("Adding an outcome preserves all existing filenames", { + analysis <- makeBasicAnalysis() + tcos1 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list(createOutcome(outcomeId = 10, outcomeOfInterest = TRUE)) + ) + ) + tcos2 <- list( + createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome(outcomeId = 10, outcomeOfInterest = TRUE), + createOutcome(outcomeId = 20, outcomeOfInterest = TRUE) + ) + ) + ) + + ref1 <- buildRef(list(analysis), tcos1) + ref2 <- buildRef(list(analysis), tcos2) + ref2_o10 <- ref2[ref2$outcomeId == 10, ] + + # All filenames for outcome 10 should be identical + expect_equal(ref1$basePopFile, ref2_o10$basePopFile) + expect_equal(ref1$cohortMethodDataFile, ref2_o10$cohortMethodDataFile) + expect_equal(ref1$studyPopFile, ref2_o10$studyPopFile) + expect_equal(ref1$sharedPsFile, ref2_o10$sharedPsFile) + expect_equal(ref1$strataFile, ref2_o10$strataFile) + expect_equal(ref1$outcomeModelFile, ref2_o10$outcomeModelFile) +}) + if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { # Minimal analysis used across all E2E tests: no PS, no matching, simple cox @@ -698,6 +852,7 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { } checkFiles("cohortMethodDataFile") + checkFiles("basePopFile") checkFiles("studyPopFile") checkFiles("outcomeModelFile") }) @@ -739,14 +894,17 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { # Record modification times of existing artifacts after first run cmDataPath <- file.path(outputFolder, result1$cohortMethodDataFile[1]) + basePopPath <- file.path(outputFolder, result1$basePopFile[1]) studyPopPath <- file.path(outputFolder, result1$studyPopFile[1]) om3Path <- file.path(outputFolder, result1$outcomeModelFile[1]) expect_true(file.exists(cmDataPath)) + expect_true(file.exists(basePopPath)) expect_true(file.exists(studyPopPath)) expect_true(file.exists(om3Path)) mtime_cmData <- file.info(cmDataPath)$mtime + mtime_basePop <- file.info(basePopPath)$mtime mtime_studyPop <- file.info(studyPopPath)$mtime mtime_om3 <- file.info(om3Path)$mtime @@ -774,6 +932,17 @@ if (!isFALSE(tryCatch(find.package("Eunomia"), error = function(e) FALSE))) { ) expect_equal(file.info(cmDataPath)$mtime, mtime_cmData) + # Base population: same filename, NOT re-written (shared across outcomes) + expect_equal( + result1$basePopFile[1], + result2$basePopFile[result2$outcomeId == 3] + ) + expect_equal( + result1$basePopFile[1], + result2$basePopFile[result2$outcomeId == 4] + ) + expect_equal(file.info(basePopPath)$mtime, mtime_basePop) + # StudyPop for outcome 3: same filename, NOT re-written expect_equal( result1$studyPopFile[1], From 6fa101a628ce96311475157023250ba628d891a2 Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Wed, 6 May 2026 10:53:17 -0700 Subject: [PATCH 09/10] Regression testing script to compare current code against main --- extras/TestResultsRegression.R | 372 +++++++++++++++++++++++++++++++++ 1 file changed, 372 insertions(+) create mode 100644 extras/TestResultsRegression.R diff --git a/extras/TestResultsRegression.R b/extras/TestResultsRegression.R new file mode 100644 index 0000000..0df4d03 --- /dev/null +++ b/extras/TestResultsRegression.R @@ -0,0 +1,372 @@ +# TestResultsRegression.R +# +# Verifies that the current code produces identical results to the released version. +# Compares exported CSV results against the reference zip committed on main. +# +# Usage (from package root): +# RENV_CONFIG_SANDBOX_ENABLED=FALSE Rscript --vanilla extras/TestResultsRegression.R + +library(CohortMethod) + +# =========================================================================== +# 1. Extract reference results from git main branch +# =========================================================================== +referenceDir <- tempfile(pattern = "cmReference") +dir.create(referenceDir) + +refZipPath <- file.path(referenceDir, "Results_Eunomia.zip") +# Use the script's directory to find the package root (extras/ is one level below) +packageRoot <- normalizePath(file.path(getwd())) +if (!file.exists(file.path(packageRoot, ".git"))) { + # Fallback: try installed package location + + packageRoot <- normalizePath(system.file("..", package = "CohortMethod")) +} +# Try local main first, then origin/main +exitCode <- system2("git", c("-C", packageRoot, "show", "main:inst/Results_Eunomia.zip"), + stdout = refZipPath, stderr = FALSE) +if (exitCode != 0) { + exitCode <- system2("git", c("-C", packageRoot, "show", "origin/main:inst/Results_Eunomia.zip"), + stdout = refZipPath, stderr = FALSE) +} +if (exitCode != 0) { + stop("Failed to extract inst/Results_Eunomia.zip from main branch. ", + "Make sure you are in the CohortMethod git repo and 'main' branch exists.") +} +unzip(refZipPath, exdir = referenceDir) +message("Reference results extracted from main branch") + +# =========================================================================== +# 2. Run full analysis with current code (mirrors CreateResultsSchemaForTesting.R) +# =========================================================================== +connectionDetails <- Eunomia::getEunomiaConnectionDetails() +Eunomia::createCohorts(connectionDetails) + +outputFolder <- tempfile(pattern = "cmRegression") + +tcos1 <- createTargetComparatorOutcomes( + targetId = 1, + comparatorId = 2, + outcomes = list( + createOutcome( + outcomeId = 3, + priorOutcomeLookback = 30 + ), + createOutcome( + outcomeId = 4, + outcomeOfInterest = FALSE, + trueEffectSize = 1 + ) + ), + excludedCovariateConceptIds = c(1118084, 1124300) +) +# Empty cohorts: +tcos2 <- createTargetComparatorOutcomes( + targetId = 998, + comparatorId = 999, + outcomes = list( + createOutcome( + outcomeId = 3, + priorOutcomeLookback = 30 + ), + createOutcome( + outcomeId = 4, + outcomeOfInterest = FALSE, + trueEffectSize = 1 + ) + ) +) + +targetComparatorOutcomesList <- list(tcos1, tcos2) + +covarSettings <- createDefaultCovariateSettings(addDescendantsToExclude = TRUE) + +getDbCmDataArgs <- createGetDbCohortMethodDataArgs( + washoutPeriod = 183, + firstExposureOnly = TRUE, + removeDuplicateSubjects = "remove all", + covariateSettings = covarSettings +) + +createStudyPopArgs1 <- createCreateStudyPopulationArgs( + removeSubjectsWithPriorOutcome = TRUE, + censorAtNewRiskWindow = TRUE, + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort end" +) + +createStudyPopArgs2 <- createCreateStudyPopulationArgs( + removeSubjectsWithPriorOutcome = TRUE, + censorAtNewRiskWindow = TRUE, + minDaysAtRisk = 1, + riskWindowStart = 0, + startAnchor = "cohort start", + riskWindowEnd = 30, + endAnchor = "cohort end" +) + +fitOutcomeModelArgs1 <- createFitOutcomeModelArgs(modelType = "cox") + +cmAnalysis1 <- createCmAnalysis( + analysisId = 1, + description = "No matching, simple outcome model", + getDbCohortMethodDataArgs = getDbCmDataArgs, + createStudyPopulationArgs = createStudyPopArgs1, + fitOutcomeModelArgs = fitOutcomeModelArgs1 +) + +createPsArgs <- createCreatePsArgs( + prior = createPrior("laplace", variance = 0.01), + estimator = "att" +) + +matchOnPsArgs <- createMatchOnPsArgs(maxRatio = 100) + +computeSharedCovBalArgs <- createComputeCovariateBalanceArgs() + +computeCovBalArgs <- createComputeCovariateBalanceArgs( + covariateFilter = FeatureExtraction::getDefaultTable1Specifications() +) + +fitOutcomeModelArgs2 <- createFitOutcomeModelArgs( + modelType = "cox", + stratified = TRUE +) + +cmAnalysis2 <- createCmAnalysis( + analysisId = 2, + description = "Matching", + getDbCohortMethodDataArgs = getDbCmDataArgs, + createStudyPopulationArgs = createStudyPopArgs2, + createPsArgs = createPsArgs, + matchOnPsArgs = matchOnPsArgs, + computeSharedCovariateBalanceArgs = computeSharedCovBalArgs, + computeCovariateBalanceArgs = computeCovBalArgs, + fitOutcomeModelArgs = fitOutcomeModelArgs2 +) + +truncateIptwArgs <- createTruncateIptwArgs(maxWeight = 10) + +fitOutcomeModelArgs3 <- createFitOutcomeModelArgs( + modelType = "cox", + inversePtWeighting = TRUE +) +cmAnalysis3 <- createCmAnalysis( + analysisId = 3, + description = "IPTW", + getDbCohortMethodDataArgs = getDbCmDataArgs, + createStudyPopulationArgs = createStudyPopArgs2, + createPsArgs = createPsArgs, + truncateIptwArgs = truncateIptwArgs, + computeSharedCovariateBalanceArgs = computeSharedCovBalArgs, + fitOutcomeModelArgs = fitOutcomeModelArgs3 +) + +fitOutcomeModelArgs4 <- createFitOutcomeModelArgs( + modelType = "cox", + stratified = TRUE, + interactionCovariateIds = 8532001 +) + +cmAnalysis4 <- createCmAnalysis( + analysisId = 4, + description = "Matching with gender interaction", + getDbCohortMethodDataArgs = getDbCmDataArgs, + createStudyPopulationArgs = createStudyPopArgs2, + createPsArgs = createPsArgs, + matchOnPsArgs = matchOnPsArgs, + fitOutcomeModelArgs = fitOutcomeModelArgs4 +) + +cmAnalysisList <- list(cmAnalysis1, cmAnalysis2, cmAnalysis3, cmAnalysis4) + +analysesToExclude <- data.frame( + targetId = c(998, 998), + analysisId = c(3, 4) +) + +message("Running analyses with current code...") +result <- runCmAnalyses( + connectionDetails = connectionDetails, + cdmDatabaseSchema = "main", + exposureTable = "cohort", + outcomeTable = "cohort", + outputFolder = outputFolder, + databaseId = "Eunomia", + cmAnalysesSpecifications = createCmAnalysesSpecifications( + cmAnalysisList = cmAnalysisList, + targetComparatorOutcomesList = targetComparatorOutcomesList, + analysesToExclude = analysesToExclude + ) +) + +exportToCsv(outputFolder, databaseId = "Eunomia") +message("Analysis complete, results exported") + +# =========================================================================== +# 3. Compare key CSV files +# =========================================================================== +newDir <- file.path(outputFolder, "export") + +compareNumericColumns <- function(file, columns, tolerance = 1e-6) { + refPath <- file.path(referenceDir, file) + newPath <- file.path(newDir, file) + + if (!file.exists(refPath)) { + message(sprintf(" SKIP: %s (not in reference)", file)) + return(invisible(NULL)) + } + if (!file.exists(newPath)) { + stop(sprintf("Missing output file: %s", file)) + } + + ref <- readr::read_csv(refPath, show_col_types = FALSE) + new <- readr::read_csv(newPath, show_col_types = FALSE) + + # Sort both by common key columns for stable comparison + keyColumns <- intersect(c("analysis_id", "target_id", "comparator_id", + "outcome_id", "interaction_covariate_id"), names(ref)) + ref <- ref[do.call(order, ref[keyColumns]), ] + new <- new[do.call(order, new[keyColumns]), ] + + if (nrow(ref) != nrow(new)) { + stop(sprintf("Row count mismatch in %s: reference=%d, new=%d", + file, nrow(ref), nrow(new))) + } + + columnsChecked <- 0 + for (col in columns) { + if (col %in% names(ref) && col %in% names(new)) { + refVals <- as.numeric(ref[[col]]) + newVals <- as.numeric(new[[col]]) + diffs <- abs(refVals - newVals) + diffs <- diffs[!is.na(diffs)] + if (length(diffs) > 0 && any(diffs > tolerance)) { + stop(sprintf("Column '%s' in %s differs: max diff = %g (tolerance = %g)", + col, file, max(diffs), tolerance)) + } + columnsChecked <- columnsChecked + 1 + } + } + message(sprintf(" PASS: %s (%d numeric columns within tolerance %g)", + file, columnsChecked, tolerance)) +} + +compareExactColumns <- function(file, columns) { + refPath <- file.path(referenceDir, file) + newPath <- file.path(newDir, file) + + if (!file.exists(refPath)) { + message(sprintf(" SKIP: %s (not in reference)", file)) + return(invisible(NULL)) + } + if (!file.exists(newPath)) { + stop(sprintf("Missing output file: %s", file)) + } + + ref <- readr::read_csv(refPath, show_col_types = FALSE) + new <- readr::read_csv(newPath, show_col_types = FALSE) + + keyColumns <- intersect(c("analysis_id", "target_id", "comparator_id", + "outcome_id"), names(ref)) + ref <- ref[do.call(order, ref[keyColumns]), ] + new <- new[do.call(order, new[keyColumns]), ] + + if (nrow(ref) != nrow(new)) { + stop(sprintf("Row count mismatch in %s: reference=%d, new=%d", + file, nrow(ref), nrow(new))) + } + + columnsChecked <- 0 + for (col in columns) { + if (col %in% names(ref) && col %in% names(new)) { + if (!identical(ref[[col]], new[[col]])) { + # Show first difference for debugging + idx <- which(ref[[col]] != new[[col]])[1] + stop(sprintf("Column '%s' in %s differs at row %d: reference='%s', new='%s'", + col, file, idx, ref[[col]][idx], new[[col]][idx])) + } + columnsChecked <- columnsChecked + 1 + } + } + message(sprintf(" PASS: %s (%d columns match exactly)", file, columnsChecked)) +} + +message("") +message("=== Results Regression Test ===") +message("") + +compareNumericColumns("cm_result.csv", + c("log_rr", "se_log_rr", "ci_95_lb", "ci_95_ub", "p", + "calibrated_log_rr", "calibrated_se_log_rr")) + +# Attrition: compare final population sizes per analysis/target/comparator/outcome. +# Intermediate attrition steps may differ in order due to two-phase population creation, +# but the final population must match. +compareFinalAttrition <- function() { + file <- "cm_attrition.csv" + refPath <- file.path(referenceDir, file) + newPath <- file.path(newDir, file) + + if (!file.exists(refPath)) { + message(sprintf(" SKIP: %s (not in reference)", file)) + return(invisible(NULL)) + } + + ref <- readr::read_csv(refPath, show_col_types = FALSE) + new <- readr::read_csv(newPath, show_col_types = FALSE) + + # Get last (final) attrition row per group + keyColumns <- intersect(c("target_comparator_id", "analysis_id", "outcome_id", + "exposure_id", "database_id"), names(ref)) + refFinal <- ref |> + dplyr::group_by(dplyr::across(dplyr::all_of(keyColumns))) |> + dplyr::slice_tail(n = 1) |> + dplyr::ungroup() |> + dplyr::arrange(dplyr::across(dplyr::all_of(keyColumns))) + newFinal <- new |> + dplyr::group_by(dplyr::across(dplyr::all_of(keyColumns))) |> + dplyr::slice_tail(n = 1) |> + dplyr::ungroup() |> + dplyr::arrange(dplyr::across(dplyr::all_of(keyColumns))) + + if (nrow(refFinal) != nrow(newFinal)) { + stop(sprintf("Final attrition group count mismatch: reference=%d, new=%d", + nrow(refFinal), nrow(newFinal))) + } + for (col in c("subjects", "exposures", "outcomes")) { + if (!identical(refFinal[[col]], newFinal[[col]])) { + stop(sprintf("Final attrition column '%s' differs", col)) + } + } + message(sprintf(" PASS: %s (final population sizes match across %d groups)", + file, nrow(refFinal))) +} +compareFinalAttrition() + +compareExactColumns("cm_diagnostics_summary.csv", + c("balance_diagnostic", "shared_balance_diagnostic", + "equipoise_diagnostic", "mdrr_diagnostic")) + +compareNumericColumns("cm_diagnostics_summary.csv", + c("max_sdm", "equipoise", "mdrr"), tolerance = 1e-6) + +compareNumericColumns("cm_covariate_balance.csv", + c("std_diff_before", "std_diff_after"), tolerance = 1e-4) + +compareNumericColumns("cm_interaction_result.csv", + c("log_rr", "se_log_rr"), tolerance = 1e-6) + +message("") +message("=== ALL CHECKS PASSED ===") +message("") + +# =========================================================================== +# Cleanup +# =========================================================================== +unlink(referenceDir, recursive = TRUE) +unlink(connectionDetails$server()) +unlink(outputFolder, recursive = TRUE) From 9a37a879e7d5b2902beb58c1da147f2e7b65b40d Mon Sep 17 00:00:00 2001 From: jgilber2 Date: Mon, 11 May 2026 10:22:57 -0700 Subject: [PATCH 10/10] Reduced memory footprint of ps model and added logging to diagnose freezing on linux --- R/RunAnalyses.R | 25 ++++++++++++++++++++++++- 1 file changed, 24 insertions(+), 1 deletion(-) diff --git a/R/RunAnalyses.R b/R/RunAnalyses.R index 0d91937..79ecdf9 100644 --- a/R/RunAnalyses.R +++ b/R/RunAnalyses.R @@ -872,16 +872,39 @@ doFitSharedPsModel <- function(params, refitPsForEveryStudyPopulation) { } addPsToStudyPopForSubset <- function(subset, outputFolder) { - ps <- readRDS(file.path(outputFolder, subset$sharedPsFile[1])) + pid <- Sys.getpid() + sharedPsFile <- subset$sharedPsFile[1] + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: loading sharedPsFile '%s' (%d rows to process)", + pid, sharedPsFile, nrow(subset))) + t0 <- proc.time()[["elapsed"]] + ps <- readRDS(file.path(outputFolder, sharedPsFile)) + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: sharedPsFile loaded in %.1f s (nrow=%d, ncol=%d)", + pid, proc.time()[["elapsed"]] - t0, nrow(ps), ncol(ps))) + columnsToKeep <- c("rowId", "treatment", "personSeqId", "cohortStartDate", "propensityScore", "preferenceScore", "iptw") + ps <- ps[, intersect(columnsToKeep, colnames(ps))] + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: ps trimmed to %d columns", + pid, ncol(ps))) addToStudyPop <- function(i) { refRow <- subset[i, ] + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: reading studyPopFile '%s'", + pid, i, nrow(subset), refRow$studyPopFile)) + t1 <- proc.time()[["elapsed"]] studyPop <- readRDS(file.path(outputFolder, refRow$studyPopFile)) + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: studyPopFile read in %.1f s (nrow=%d), merging PS", + pid, i, nrow(subset), proc.time()[["elapsed"]] - t1, nrow(studyPop))) studyPop <- addPsToStudyPopulation(studyPop, ps) + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: writing psFile '%s'", + pid, i, nrow(subset), refRow$psFile)) + t2 <- proc.time()[["elapsed"]] saveRDS(studyPop, file.path(outputFolder, refRow$psFile)) + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset [%d/%d]: psFile written in %.1f s", + pid, i, nrow(subset), proc.time()[["elapsed"]] - t2)) return(NULL) } plyr::l_ply(1:nrow(subset), addToStudyPop) + ParallelLogger::logInfo(sprintf("[pid %d] addPsToStudyPopForSubset: completed all %d rows for sharedPsFile '%s'", + pid, nrow(subset), sharedPsFile)) } addPsToStudyPopulation <- function(studyPopulation, ps) {