diff --git a/NEWS.md b/NEWS.md index 5adedd6f..feb2636c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,7 @@ # rsample (development version) +* The lag argument for `initial_time_split()` has been soft deprecated (@bjornkallerud, #592). + # rsample 1.3.1 * The new `internal_calibration_split()` function and its methods for various resamples is for usage in tune to create a internal split of the analysis set to fit the preprocessor and model on one part and the post-processor on the other part (#483, #488, #489, #569, #575, #577, #582). diff --git a/R/initial_split.R b/R/initial_split.R index d8626b4f..fe8af48d 100644 --- a/R/initial_split.R +++ b/R/initial_split.R @@ -9,6 +9,9 @@ #' #' @details `training()` and `testing()` are used to extract the resulting data. #' +#' To avoid data leakage when using lagged variables, lag the predictors before +#' the initial split. +#' #' @template strata_details #' @inheritParams vfold_cv #' @inheritParams make_strata @@ -26,13 +29,7 @@ #' drinks_split <- initial_time_split(drinks) #' train_data <- training(drinks_split) #' test_data <- testing(drinks_split) -#' c(max(train_data$date), min(test_data$date)) # no lag -#' -#' # With 12 period lag -#' drinks_lag_split <- initial_time_split(drinks, lag = 12) -#' train_data <- training(drinks_lag_split) -#' test_data <- testing(drinks_lag_split) -#' c(max(train_data$date), min(test_data$date)) # 12 period lag +#' c(max(train_data$date), min(test_data$date)) #' #' set.seed(1353) #' car_split <- group_initial_split(mtcars, cyl) @@ -76,14 +73,28 @@ initial_split <- function( } #' @rdname initial_split -#' @param lag A value to include a lag between the assessment -#' and analysis set. This is useful if lagged predictors will be used -#' during training and testing. +#' @param lag `r lifecycle::badge("deprecated")` This is deprecated, please lag +#' your predictors prior to splitting the dataset. #' @export -initial_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { +initial_time_split <- function( + data, + prop = 3 / 4, + lag = lifecycle::deprecated(), + ... +) { check_dots_empty() check_prop(prop) + if (lifecycle::is_present(lag)) { + lifecycle::deprecate_soft( + when = "1.3.1.9000", + what = "initial_time_split(lag)", + details = "Please lag your predictors prior to splitting the dataset." + ) + } else { + lag <- 0 + } + if (!is.numeric(lag) | !(lag %% 1 == 0)) { cli_abort("{.arg lag} must be a whole number.") } diff --git a/R/internal_calibration_split.R b/R/internal_calibration_split.R index 75cf6014..aee9ee20 100644 --- a/R/internal_calibration_split.R +++ b/R/internal_calibration_split.R @@ -669,6 +669,11 @@ internal_calibration_split.initial_time_split <- function(x, split_args, ...) { training_set <- training(x) + # to avoid deprecation warning if lag is just the default of 0 + if (-1 < split_args$lag && split_args$lag < 1) { + split_args$lag <- NULL + } + split_cal <- internal_calibration_split_core( training_set, split_function = initial_time_split, diff --git a/R/validation_split.R b/R/validation_split.R index 3b5af308..6c0b7f32 100644 --- a/R/validation_split.R +++ b/R/validation_split.R @@ -92,7 +92,9 @@ validation_split <- function( split_objs$splits <- map(split_objs$splits, rm_out) class(split_objs$splits[[1]]) <- c("val_split", "rsplit") - if (!is.null(strata)) names(strata) <- NULL + if (!is.null(strata)) { + names(strata) <- NULL + } val_att <- list( prop = prop, strata = strata, @@ -111,6 +113,9 @@ validation_split <- function( #' @rdname validation_split #' @inheritParams vfold_cv #' @inheritParams initial_time_split +#' @param lag A value to include a lag between the assessment and analysis set. +#' This is useful if lagged predictors will be used during training and +#' testing. #' @export validation_time_split <- function(data, prop = 3 / 4, lag = 0, ...) { lifecycle::deprecate_warn( @@ -195,7 +200,9 @@ group_validation_split <- function( class(split_objs$splits[[1]]) <- c("group_val_split", "val_split", "rsplit") # This is needed for printing -- strata cannot be missing - if (is.null(strata)) strata <- FALSE + if (is.null(strata)) { + strata <- FALSE + } val_att <- list( prop = prop, group = group, diff --git a/man/initial_split.Rd b/man/initial_split.Rd index 07fc96b6..d339e305 100644 --- a/man/initial_split.Rd +++ b/man/initial_split.Rd @@ -14,7 +14,7 @@ \usage{ initial_split(data, prop = 3/4, strata = NULL, breaks = 4, pool = 0.1, ...) -initial_time_split(data, prop = 3/4, lag = 0, ...) +initial_time_split(data, prop = 3/4, lag = lifecycle::deprecated(), ...) training(x, ...) @@ -49,9 +49,8 @@ of stratifying groups that are too small.} \item{...}{These dots are for future extensions and must be empty.} -\item{lag}{A value to include a lag between the assessment -and analysis set. This is useful if lagged predictors will be used -during training and testing.} +\item{lag}{\ifelse{html}{\href{https://lifecycle.r-lib.org/articles/stages.html#deprecated}{\figure{lifecycle-deprecated.svg}{options: alt='[Deprecated]'}}}{\strong{[Deprecated]}} This is deprecated, please lag +your predictors prior to splitting the dataset.} \item{x}{An \code{rsplit} object produced by \code{initial_split()} or \code{initial_time_split()}.} @@ -75,6 +74,9 @@ the same split. \details{ \code{training()} and \code{testing()} are used to extract the resulting data. +To avoid data leakage when using lagged variables, lag the predictors before +the initial split. + With a \code{strata} argument, the random sampling is conducted \emph{within the stratification variable}. This can help ensure that the resamples have equivalent proportions as the original data set. For @@ -94,13 +96,7 @@ data(drinks, package = "modeldata") drinks_split <- initial_time_split(drinks) train_data <- training(drinks_split) test_data <- testing(drinks_split) -c(max(train_data$date), min(test_data$date)) # no lag - -# With 12 period lag -drinks_lag_split <- initial_time_split(drinks, lag = 12) -train_data <- training(drinks_lag_split) -test_data <- testing(drinks_lag_split) -c(max(train_data$date), min(test_data$date)) # 12 period lag +c(max(train_data$date), min(test_data$date)) set.seed(1353) car_split <- group_initial_split(mtcars, cyl) diff --git a/man/validation_split.Rd b/man/validation_split.Rd index a8577041..d72118db 100644 --- a/man/validation_split.Rd +++ b/man/validation_split.Rd @@ -31,9 +31,9 @@ of stratifying groups that are too small.} \item{...}{These dots are for future extensions and must be empty.} -\item{lag}{A value to include a lag between the assessment -and analysis set. This is useful if lagged predictors will be used -during training and testing.} +\item{lag}{A value to include a lag between the assessment and analysis set. +This is useful if lagged predictors will be used during training and +testing.} \item{group}{A variable in \code{data} (single character or name) used for grouping observations with the same value to either the analysis or diff --git a/tests/testthat/_snaps/initial_split.md b/tests/testthat/_snaps/initial_split.md index 37bb892a..94b26ca6 100644 --- a/tests/testthat/_snaps/initial_split.md +++ b/tests/testthat/_snaps/initial_split.md @@ -1,3 +1,15 @@ +# `lag` arg to `initial_time_split()` is deprecated + + Code + initial_time_split(mtcars, lag = 2) + Condition + Warning: + The `lag` argument of `initial_time_split()` is deprecated as of rsample 1.3.1.9000. + i Please lag your predictors prior to splitting the dataset. + Output + + <24/10/32> + # `initial_time_split()` error messages Code diff --git a/tests/testthat/test-initial_split.R b/tests/testthat/test-initial_split.R index 476be1ab..49bbe269 100644 --- a/tests/testthat/test-initial_split.R +++ b/tests/testthat/test-initial_split.R @@ -18,7 +18,15 @@ test_that("default time param", { expect_equal(tr1, dplyr::slice(dat1, 1:floor(nrow(dat1) * 3 / 4))) }) +test_that("`lag` arg to `initial_time_split()` is deprecated", { + expect_snapshot({ + initial_time_split(mtcars, lag = 2) + }) +}) + test_that("default time param with lag", { + withr::local_options(lifecycle_verbosity = "quiet") + rs1 <- initial_time_split(dat1, lag = 5) expect_equal(class(rs1), c("initial_time_split", "initial_split", "rsplit")) tr1 <- training(rs1) @@ -41,6 +49,8 @@ test_that("`initial_time_split()` error messages", { initial_time_split(drinks, prop = 2) }) + withr::local_options(lifecycle_verbosity = "quiet") + expect_snapshot(error = TRUE, { initial_time_split(drinks, lag = 12.5) })