From 58481995f3237fede4d5fb6fe578c7776bea8f4b Mon Sep 17 00:00:00 2001 From: Bjorn Kallerud Date: Fri, 19 Sep 2025 11:32:09 -0700 Subject: [PATCH 1/8] Deprecate lag argument in initial_time_split and update NEWS --- NEWS.md | 2 ++ R/initial_split.R | 32 ++++++++++---------------------- 2 files changed, 12 insertions(+), 22 deletions(-) diff --git a/NEWS.md b/NEWS.md index 5adedd6f..46c15dfb 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # rsample (development version) +* The lag argument for initial_time_split() has been deprecated (#447). Supporting lags required overlapping rows between training and testing, which introduced data leakage (as discussed in #168). Users should instead pre-compute lagged variables before splitting, so that the test set remains strictly out-of-sample. + # rsample 1.3.1 * The new `internal_calibration_split()` function and its methods for various resamples is for usage in tune to create a internal split of the analysis set to fit the preprocessor and model on one part and the post-processor on the other part (#483, #488, #489, #569, #575, #577, #582). diff --git a/R/initial_split.R b/R/initial_split.R index d8626b4f..db493df8 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -26,13 +26,7 @@ #' drinks_split <- initial_time_split(drinks) #' train_data <- training(drinks_split) #' test_data <- testing(drinks_split) -#' c(max(train_data$date), min(test_data$date)) # no lag -#' -#' # With 12 period lag -#' drinks_lag_split <- initial_time_split(drinks, lag = 12) -#' train_data <- training(drinks_lag_split) -#' test_data <- testing(drinks_lag_split) -#' c(max(train_data$date), min(test_data$date)) # 12 period lag +#' c(max(train_data$date), min(test_data$date)) #' #' set.seed(1353) #' car_split <- group_initial_split(mtcars, cyl) @@ -76,27 +70,22 @@ initial_split <- function( } #' @rdname initial_split -#' @param lag A value to include a lag between the assessment -#' and analysis set. This is useful if lagged predictors will be used -#' during training and testing. +#' @param lag has been deprecated. #' @export -initial_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { +initial_time_split <- function(data, prop = 3 / 4, lag = lifecycle::deprecated(), ...) { check_dots_empty() check_prop(prop) - if (!is.numeric(lag) | !(lag %% 1 == 0)) { - cli_abort("{.arg lag} must be a whole number.") + if (lifecycle::is_present(lag)) { + lifecycle::deprecate_stop( + when = "1.4.0", + what = "initial_time_split(lag)" + ) } n_train <- floor(nrow(data) * prop) - if (lag > n_train) { - cli_abort( - "{.arg lag} must be less than or equal to the number of training observations." - ) - } - - split <- rsplit(data, 1:n_train, (n_train + 1 - lag):nrow(data)) + split <- rsplit(data, 1:n_train, (n_train + 1):nrow(data)) splits <- list(split) ids <- "Resample1" rset <- new_rset(splits, ids) @@ -104,8 +93,7 @@ initial_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { res <- rset$splits[[1]] attrib <- list( - prop = prop, - lag = lag + prop = prop ) for (i in names(attrib)) { attr(res, i) <- attrib[[i]] From 5ad2826aa0ac9b720bf8cee4acf4ffe649703a94 Mon Sep 17 00:00:00 2001 From: Bjorn Kallerud <35743670+bjornkallerud@users.noreply.github.com> Date: Fri, 19 Sep 2025 11:49:24 -0700 Subject: [PATCH 2/8] Update R/initial_split.R Accepted GitHub format suggestion. Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- R/initial_split.R | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/R/initial_split.R b/R/initial_split.R index db493df8..264f6e98 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -72,7 +72,12 @@ initial_split <- function( #' @rdname initial_split #' @param lag has been deprecated. #' @export -initial_time_split <- function(data, prop = 3 / 4, lag = lifecycle::deprecated(), ...) { +initial_time_split <- function( + data, + prop = 3 / 4, + lag = lifecycle::deprecated(), + ... +) { check_dots_empty() check_prop(prop) From 62233359198995af16e8c4db1a0b1817e6dc66a4 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 11:45:33 +0000 Subject: [PATCH 3/8] start deprecation with "soft" and restore functionality for now --- R/initial_split.R | 24 +++++++++++++++++++----- 1 file changed, 19 insertions(+), 5 deletions(-) diff --git a/R/initial_split.R b/R/initial_split.R index 264f6e98..9e806070 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -82,15 +82,28 @@ initial_time_split <- function( check_prop(prop) if (lifecycle::is_present(lag)) { - lifecycle::deprecate_stop( - when = "1.4.0", - what = "initial_time_split(lag)" + lifecycle::deprecate_soft( + when = "1.3.1.9000", + what = "initial_time_split(lag)", + details = "Please lag your predictors prior to splitting the dataset." ) + } else { + lag <- 0 + } + + if (!is.numeric(lag) | !(lag %% 1 == 0)) { + cli_abort("{.arg lag} must be a whole number.") } n_train <- floor(nrow(data) * prop) - split <- rsplit(data, 1:n_train, (n_train + 1):nrow(data)) + if (lag > n_train) { + cli_abort( + "{.arg lag} must be less than or equal to the number of training observations." + ) + } + + split <- rsplit(data, 1:n_train, (n_train + 1 - lag):nrow(data)) splits <- list(split) ids <- "Resample1" rset <- new_rset(splits, ids) @@ -98,7 +111,8 @@ initial_time_split <- function( res <- rset$splits[[1]] attrib <- list( - prop = prop + prop = prop, + lag = lag ) for (i in names(attrib)) { attr(res, i) <- attrib[[i]] From 335e245ec7a18c5f503dad5a1dbed62b43829337 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 11:50:55 +0000 Subject: [PATCH 4/8] update tests test deprecation warning separately, suppress otherwise --- tests/testthat/_snaps/initial_split.md | 12 ++++++++++++ tests/testthat/test-initial_split.R | 10 ++++++++++ 2 files changed, 22 insertions(+) diff --git a/tests/testthat/_snaps/initial_split.md b/tests/testthat/_snaps/initial_split.md index 37bb892a..94b26ca6 100644 --- a/tests/testthat/_snaps/initial_split.md +++ b/tests/testthat/_snaps/initial_split.md @@ -1,3 +1,15 @@ +# `lag` arg to `initial_time_split()` is deprecated + + Code + initial_time_split(mtcars, lag = 2) + Condition + Warning: + The `lag` argument of `initial_time_split()` is deprecated as of rsample 1.3.1.9000. + i Please lag your predictors prior to splitting the dataset. + Output + + <24/10/32> + # `initial_time_split()` error messages Code diff --git a/tests/testthat/test-initial_split.R b/tests/testthat/test-initial_split.R index 476be1ab..49bbe269 100644 --- a/tests/testthat/test-initial_split.R +++ b/tests/testthat/test-initial_split.R @@ -18,7 +18,15 @@ test_that("default time param", { expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3 / 4))) }) +test_that("`lag` arg to `initial_time_split()` is deprecated", { + expect_snapshot({ + initial_time_split(mtcars, lag = 2) + }) +}) + test_that("default time param with lag", { + withr::local_options(lifecycle_verbosity = "quiet") + rs1 <- initial_time_split(dat1, lag = 5) expect_equal(class(rs1), c("initial_time_split", "initial_split", "rsplit")) tr1 <- training(rs1) @@ -41,6 +49,8 @@ test_that("`initial_time_split()` error messages", { initial_time_split(drinks, prop = 2) }) + withr::local_options(lifecycle_verbosity = "quiet") + expect_snapshot(error = TRUE, { initial_time_split(drinks, lag = 12.5) }) From 84ee5011a2892256a7ab530faed18ff300bff060 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 12:03:00 +0000 Subject: [PATCH 5/8] expand docs --- R/initial_split.R | 6 +++++- man/initial_split.Rd | 18 +++++++----------- 2 files changed, 12 insertions(+), 12 deletions(-) diff --git a/R/initial_split.R b/R/initial_split.R index 9e806070..fe8af48d 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -9,6 +9,9 @@ #' #' @details `training()` and `testing()` are used to extract the resulting data. #' +#' To avoid data leakage when using lagged variables, lag the predictors before +#' the initial split. +#' #' @template strata_details #' @inheritParams vfold_cv #' @inheritParams make_strata @@ -70,7 +73,8 @@ initial_split <- function( } #' @rdname initial_split -#' @param lag has been deprecated. +#' @param lag `r lifecycle::badge("deprecated")` This is deprecated, please lag +#' your predictors prior to splitting the dataset. #' @export initial_time_split <- function( data, diff --git a/man/initial_split.Rd b/man/initial_split.Rd index 07fc96b6..d339e305 100644 --- a/man/initial_split.Rd +++ b/man/initial_split.Rd @@ -14,7 +14,7 @@ \usage{ initial_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) -initial_time_split(data, prop = 3/4, lag = 0, ...) +initial_time_split(data, prop = 3/4, lag = lifecycle::deprecated(), ...) training(x, ...) @@ -49,9 +49,8 @@ of stratifying groups that are too small.} \item{...}{These dots are for future extensions and must be empty.} -\item{lag}{A value to include a lag between the assessment -and analysis set. This is useful if lagged predictors will be used -during training and testing.} +\item{lag}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This is deprecated, please lag +your predictors prior to splitting the dataset.} \item{x}{An \code{rsplit} object produced by \code{initial_split()} or \code{initial_time_split()}.} @@ -75,6 +74,9 @@ the same split. \details{ \code{training()} and \code{testing()} are used to extract the resulting data. +To avoid data leakage when using lagged variables, lag the predictors before +the initial split. + With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For @@ -94,13 +96,7 @@ data(drinks, package = "modeldata") drinks_split <- initial_time_split(drinks) train_data <- training(drinks_split) test_data <- testing(drinks_split) -c(max(train_data$date), min(test_data$date)) # no lag - -# With 12 period lag -drinks_lag_split <- initial_time_split(drinks, lag = 12) -train_data <- training(drinks_lag_split) -test_data <- testing(drinks_lag_split) -c(max(train_data$date), min(test_data$date)) # 12 period lag +c(max(train_data$date), min(test_data$date)) set.seed(1353) car_split <- group_initial_split(mtcars, cyl) From 21890aff843af3ab47f78349e4eb3c6e5989d855 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 12:03:40 +0000 Subject: [PATCH 6/8] don't pass deprecation badge to already deprecated function plus a little linting by air --- R/validation_split.R | 11 +++++++++-- man/validation_split.Rd | 6 +++--- 2 files changed, 12 insertions(+), 5 deletions(-) diff --git a/R/validation_split.R b/R/validation_split.R index 3b5af308..6c0b7f32 100644 --- a/R/validation_split.R +++ b/R/validation_split.R @@ -92,7 +92,9 @@ validation_split <- function( split_objs$splits <- map(split_objs$splits, rm_out) class(split_objs$splits[[1]]) <- c("val_split", "rsplit") - if (!is.null(strata)) names(strata) <- NULL + if (!is.null(strata)) { + names(strata) <- NULL + } val_att <- list( prop = prop, strata = strata, @@ -111,6 +113,9 @@ validation_split <- function( #' @rdname validation_split #' @inheritParams vfold_cv #' @inheritParams initial_time_split +#' @param lag A value to include a lag between the assessment and analysis set. +#' This is useful if lagged predictors will be used during training and +#' testing. #' @export validation_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { lifecycle::deprecate_warn( @@ -195,7 +200,9 @@ group_validation_split <- function( class(split_objs$splits[[1]]) <- c("group_val_split", "val_split", "rsplit") # This is needed for printing -- strata cannot be missing - if (is.null(strata)) strata <- FALSE + if (is.null(strata)) { + strata <- FALSE + } val_att <- list( prop = prop, group = group, diff --git a/man/validation_split.Rd b/man/validation_split.Rd index a8577041..d72118db 100644 --- a/man/validation_split.Rd +++ b/man/validation_split.Rd @@ -31,9 +31,9 @@ of stratifying groups that are too small.} \item{...}{These dots are for future extensions and must be empty.} -\item{lag}{A value to include a lag between the assessment -and analysis set. This is useful if lagged predictors will be used -during training and testing.} +\item{lag}{A value to include a lag between the assessment and analysis set. +This is useful if lagged predictors will be used during training and +testing.} \item{group}{A variable in \code{data} (single character or name) used for grouping observations with the same value to either the analysis or From b034aa6a1a1e1d0d7786facce73fcf6e169fa632 Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 12:05:59 +0000 Subject: [PATCH 7/8] Update NEWS with contributor credit --- NEWS.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 46c15dfb..feb2636c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,6 +1,6 @@ # rsample (development version) -* The lag argument for initial_time_split() has been deprecated (#447). Supporting lags required overlapping rows between training and testing, which introduced data leakage (as discussed in #168). Users should instead pre-compute lagged variables before splitting, so that the test set remains strictly out-of-sample. +* The lag argument for `initial_time_split()` has been soft deprecated (@bjornkallerud, #592). # rsample 1.3.1 From 2ebf4172c9444d134b3a8288208c6ec191eb54ba Mon Sep 17 00:00:00 2001 From: Hannah Frick Date: Tue, 28 Oct 2025 17:59:36 +0000 Subject: [PATCH 8/8] avoid deprecation warning --- R/internal_calibration_split.R | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/R/internal_calibration_split.R b/R/internal_calibration_split.R index 75cf6014..aee9ee20 100644 --- a/R/internal_calibration_split.R +++ b/R/internal_calibration_split.R @@ -669,6 +669,11 @@ internal_calibration_split.initial_time_split <- function(x, split_args, ...) { training_set <- training(x) + # to avoid deprecation warning if lag is just the default of 0 + if (-1 < split_args$lag && split_args$lag < 1) { + split_args$lag <- NULL + } + split_cal <- internal_calibration_split_core( training_set, split_function = initial_time_split,