@@ -98,6 +98,32 @@ validate_predictions <- function(predictions, n_obs = NULL) {
9898}
9999
100100
101+ # ' Validate PIT
102+ # '
103+ # ' Checks that `pit` is numeric, doesn't have any NAs, and is either a vector,
104+ # ' or a 1-D array with values in [0,1].
105+ # '
106+ # ' @param pit The 'pit' object from the user.
107+ # ' @return Either throws an error or returns a numeric vector.
108+ # ' @noRd
109+ validate_pit <- function (pit ) {
110+ stopifnot(is.numeric(pit ))
111+
112+ if (! is_vector_or_1Darray(pit )) {
113+ abort(" 'pit' must be a vector or 1D array." )
114+ }
115+
116+ if (any(pit > 1 ) || any(pit < 0 )) {
117+ abort(" 'pit' must only contain values between 0 and 1." )
118+ }
119+
120+ if (anyNA(pit )) {
121+ abort(" NAs not allowed in 'pit'." )
122+ }
123+
124+ unname(pit )
125+ }
126+
101127# ' Validate group
102128# '
103129# ' Checks that grouping variable has correct number of observations and is
@@ -267,6 +293,298 @@ melt_and_stack <- function(y, yrep) {
267293}
268294
269295
296+ # ' Obtain the coverage parameter for simultaneous confidence bands for ECDFs
297+ # '
298+ # ' @param N Length of sample.
299+ # ' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
300+ # ' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
301+ # ' @param prob Desired simultaneous coverage (0,1).
302+ # ' @param M number of simulations to run, if simulation method is used.
303+ # ' @param interpolate_adj Boolean defining whether to interpolate the confidence
304+ # ' bands from precomputed values. Interpolation provides a faster plot with the
305+ # ' trade-off of possible loss of accuracy.
306+ # ' @return The adjusted coverage parameter yielding the desired simultaneous
307+ # ' coverage of the ECDF traces.
308+ # ' @noRd
309+ adjust_gamma <- function (N ,
310+ L = 1 ,
311+ K = N ,
312+ prob = 0.99 ,
313+ M = 1000 ,
314+ interpolate_adj = FALSE ) {
315+ if (! all_counts(c(K , N , L ))) {
316+ abort(" Parameters 'N', 'L' and 'K' must be positive integers." )
317+ }
318+ if (prob > = 1 || prob < = 0 ) {
319+ abort(" Value of 'prob' must be in (0,1)." )
320+ }
321+ if (interpolate_adj == TRUE ) {
322+ gamma <- interpolate_gamma(N = N , K = K , prob = prob , L = L )
323+ } else if (L == 1 ) {
324+ gamma <- adjust_gamma_optimize(N = N , K = K , prob = prob )
325+ } else {
326+ gamma <- adjust_gamma_simulate(N = N , L = L , K = K , prob = prob , M = M )
327+ }
328+ gamma
329+ }
330+
331+ # ' Adjust coverage parameter for single sample using the optimization method.
332+ # ' @param N Length of sample.
333+ # ' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
334+ # ' @param prob Desired simultaneous coverage (0,1).
335+ # ' @return The adjusted coverage parameter yielding the desired simultaneous
336+ # ' coverage of the ECDF traces.
337+ # ' @noRd
338+ adjust_gamma_optimize <- function (N , K , prob ) {
339+ target <- function (gamma , prob , N , K ) {
340+ z <- 1 : (K - 1 ) / K
341+ z1 <- c(0 , z )
342+ z2 <- c(z , 1 )
343+
344+ # pre-compute quantiles and use symmetry for increased efficiency.
345+ x2_lower <- qbinom(gamma / 2 , N , z2 )
346+ x2_upper <- c(N - rev(x2_lower )[2 : K ], 1 )
347+
348+ # Compute the total probability of trajectories inside the confidence
349+ # intervals. Initialize the set and corresponding probabilities known
350+ # to be 0 and 1 for the starting value z1 = 0.
351+ x1 <- 0
352+ p_int <- 1
353+ for (i in seq_along(z1 )) {
354+ p_int <- p_interior(
355+ p_int = p_int ,
356+ x1 = x1 ,
357+ x2 = x2_lower [i ]: x2_upper [i ],
358+ z1 = z1 [i ],
359+ z2 = z2 [i ],
360+ N = N
361+ )
362+ x1 <- x2_lower [i ]: x2_upper [i ]
363+ }
364+ return (abs(prob - sum(p_int )))
365+ }
366+ optimize(target , c(0 , 1 - prob ), prob = prob , N = N , K = K )$ minimum
367+ }
368+
369+ # ' Adjust coverage parameter for multiple chains using the simulation method.
370+ # ' In short, 'M' simulations of 'L' standard uniform chains are run and the
371+ # ' confidence bands are set to cover 100 * 'prob' % of these simulations.
372+ # ' @param N Length of sample.
373+ # ' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
374+ # ' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
375+ # ' @param prob Desired simultaneous coverage (0,1).
376+ # ' @param M number of simulations to run.
377+ # ' @return The adjusted coverage parameter yielding the desired simultaneous
378+ # ' coverage of the ECDF traces.
379+ # ' @noRd
380+ adjust_gamma_simulate <- function (N , L , K , prob , M ) {
381+ gamma <- numeric (M )
382+ z <- (1 : (K - 1 )) / K # Rank ECDF evaluation points.
383+ n <- N * (L - 1 )
384+ k <- floor(z * N * L )
385+ for (m in seq_len(M )) {
386+ u <- u_scale(replicate(L , runif(N ))) # Fractional ranks of sample chains
387+ scaled_ecdfs <- apply(outer(u , z , " <=" ), c(2 , 3 ), sum )
388+ # Find the smalles marginal probability of the simulation run
389+ gamma [m ] <- 2 * min(
390+ apply(
391+ scaled_ecdfs , 1 , phyper , m = N , n = n , k = k
392+ ),
393+ apply(
394+ scaled_ecdfs - 1 , 1 , phyper , m = N , n = n , k = k , lower.tail = FALSE
395+ )
396+ )
397+ }
398+ alpha_quantile(gamma , 1 - prob )
399+ }
400+
401+ # ' Approximate the required adjustement to obtain simultaneous confidence bands
402+ # ' of an ECDF plot with interpolation with regards to N and K from precomputed
403+ # ' values for a fixed set of prob and L values.
404+ # ' @param N Length of sample.
405+ # ' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
406+ # ' @param prob Desired simultaneous coverage (0,1).
407+ # ' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
408+ # ' @return The approximated adjusted coverage parameter yielding the desired
409+ # ' simultaneous coverage of the ECDF traces.
410+ # ' @noRd
411+ interpolate_gamma <- function (N , K , prob , L ) {
412+ # Find the precomputed values ueful for the interpolation task.
413+ vals <- get_interpolation_values(N , K , L , prob )
414+ # Largest lower bound and smalles upper bound for N among precomputed values.
415+ N_lb <- max(vals [vals $ N < = N , ]$ N )
416+ N_ub <- min(vals [vals $ N > = N , ]$ N )
417+ # Approximate largest lower bound and smalles upper bound for gamma.
418+ log_gamma_lb <- approx(
419+ x = log(vals [vals $ N == N_lb , ]$ K ),
420+ y = log(vals [vals $ N == N_lb , ]$ val ),
421+ xout = log(K )
422+ )$ y
423+ log_gamma_ub <- approx(
424+ x = log(vals [vals $ N == N_ub , ]$ K ),
425+ y = log(vals [vals $ N == N_ub , ]$ val ),
426+ xout = log(K )
427+ )$ y
428+ if (N_ub == N_lb ) {
429+ log_gamma_approx <- log_gamma_lb
430+ } else {
431+ # Approximate log_gamma for the desired value of N.
432+ log_gamma_approx <- approx(
433+ x = log(c(N_lb , N_ub )),
434+ y = c(log_gamma_lb , log_gamma_ub ),
435+ xout = log(N )
436+ )$ y
437+ }
438+ exp(log_gamma_approx )
439+ }
440+
441+ # ' Filter the precomputed values useful for the interpolation task given to
442+ # ' interpolate_gamma. Check, if the task is possible with the availabel data.
443+ # ' @param N Length of sample.
444+ # ' @param K Number of equally spaced evaluation points (1:K / K). Defaults to N.
445+ # ' @param L Number of chains. Used for MCMC, defaults to 1 for ppc.
446+ # ' @param prob Desired simultaneous coverage (0,1).
447+ # ' @return A data.frame containing the relevant precomputed values.
448+ # ' @noRd
449+ get_interpolation_values <- function (N , K , L , prob ) {
450+ for (dim in c(" L" , " prob" )) {
451+ if (all(get(dim ) != bayesplot ::: gamma_adj [, dim ])) {
452+ stop(paste(
453+ " No precomputed values to interpolate from for '" , dim , " ' = " ,
454+ get(dim ),
455+ " .\n " ,
456+ " Values of '" , dim , " ' available for interpolation: " ,
457+ paste(unique(bayesplot ::: gamma_adj [, dim ]), collapse = " , " ),
458+ " ." ,
459+ sep = " "
460+ ))
461+ }
462+ }
463+ vals <- bayesplot ::: gamma_adj [
464+ bayesplot ::: gamma_adj $ L == L & bayesplot ::: gamma_adj $ prob == prob ,
465+ ]
466+ if (N > max(vals $ N )) {
467+ stop(paste(
468+ " No precomputed values to interpolate from for sample length of " ,
469+ N ,
470+ " .\n " ,
471+ " Please use a subsample of length " ,
472+ max(vals $ N ),
473+ " or smaller, or consider setting 'interpolate_adj' = FALSE." ,
474+ sep = " "
475+ ))
476+ }
477+ if (N < min(vals $ N )) {
478+ stop(paste(
479+ " No precomputed values to interpolate from for sample length of " ,
480+ N ,
481+ " .\n " ,
482+ " Please use a subsample of length " ,
483+ min(vals $ N ),
484+ " or larger, or consider setting 'interpolate_adj' = FALSE." ,
485+ sep = " "
486+ ))
487+ }
488+ if (K > max(vals [vals $ N < = N , ]$ K )) {
489+ stop(paste(
490+ " No precomputed values available for interpolation for 'K' = " ,
491+ K ,
492+ " .\n " ,
493+ " Try either setting a value of 'K' <= " ,
494+ max(vals [vals $ N < = N , ]$ K ),
495+ " or 'interpolate_adj' = FALSE." ,
496+ sep = " "
497+ ))
498+ }
499+ if (K < min(vals [vals $ N < = N , ]$ K )) {
500+ stop(paste(
501+ " No precomputed values available for interpolation for 'K' = " ,
502+ K ,
503+ " .\n " ,
504+ " Try either setting a value of 'K' >= " ,
505+ min(vals [vals $ N < = N , ]$ K ),
506+ " or 'interpolate_adj' = FALSE." ,
507+ sep = " "
508+ ))
509+ }
510+ vals
511+ }
512+
513+ # ' A helper function for 'adjust_gamma_optimize' defining the probability that
514+ # ' a scaled ECDF stays within the supplied bounds between two evaluation points.
515+ # ' @param p_int For each value in x1, the probability that the ECDF has stayed
516+ # ' within the bounds until z1 and takes the value in x1 at z1.
517+ # ' @param x1 Vector of scaled ECDF values at the left end of the interval, z1.
518+ # ' @param x2 Vector of scaled ECDF values at the right end of the interval, z2.
519+ # ' @param z1 Left evaluation point in [0,1]
520+ # ' @param z2 Right evaluation point in [0,1] with z2 > z1.
521+ # ' @param N Total number of values in the sample.
522+ # ' @return A vector containing the probability to transitioning from the values
523+ # ' in x1 to each of the values in x2 weighted by the probabilities in p_int.
524+ # ' @noRd
525+ p_interior <- function (p_int , x1 , x2 , z1 , z2 , N ) {
526+ # Ratio between the length of the evaluation interval and the total length of
527+ # the interval left to cover by ECDF.
528+ z_tilde <- (z2 - z1 ) / (1 - z1 )
529+ # Number of samples left to cover by ECDF.
530+ N_tilde <- rep(N - x1 , each = length(x2 ))
531+
532+ p_int <- rep(p_int , each = length(x2 ))
533+ x_diff <- outer(x2 , x1 , " -" )
534+ # Pobability of each transition from a value in x1 to a value in x2.
535+ p_x2_int <- p_int * dbinom(x_diff , N_tilde , z_tilde )
536+ rowSums(p_x2_int )
537+ }
538+
539+ # ' A helper function for 'adjust_alpha_simulate'
540+ # ' 100 * `alpha` percent of the trials in 'gamma' are allowed to be rejected.
541+ # ' In case of ties, return the largest value dominating at most
542+ # ' 100 * (alpha + tol) percent of the values.
543+ # ' @noRd
544+ alpha_quantile <- function (gamma , alpha , tol = 0.001 ) {
545+ a <- unname(quantile(gamma , probs = alpha ))
546+ a_tol <- unname(quantile(gamma , probs = alpha + tol ))
547+ if (a == a_tol ) {
548+ if (min(gamma ) < a ) {
549+ # take the largest value that doesn't exceed the tolerance.
550+ a <- max(gamma [gamma < a ])
551+ }
552+ }
553+ a
554+ }
555+
556+ # ' Compute simultaneous confidence intervals with the given adjusted coverage
557+ # ' parameter gamma.
558+ # ' @param gamma Adjusted coverage parameter for the marginal distribution
559+ # ' (binomial for PIT values and hypergeometric for rank transformed chains).
560+ # ' @param N Sample length.
561+ # ' @param K Number of uniformly spaced evaluation points.
562+ # ' @param L Number of MCMC-chains. (1 for ppc)
563+ # ' @return A list with upper and lower confidence interval values at the
564+ # ' evaluation points.
565+ # ' @noRd
566+ ecdf_intervals <- function (gamma , N , K , L = 1 ) {
567+ lims <- list ()
568+ z <- seq(0 , 1 , length.out = K + 1 )
569+ if (L == 1 ) {
570+ lims $ lower <- qbinom(gamma / 2 , N , z )
571+ lims $ upper <- qbinom(1 - gamma / 2 , N , z )
572+ } else {
573+ n <- N * (L - 1 )
574+ k <- floor(z * L * N )
575+ lims $ lower <- qhyper(gamma / 2 , N , n , k )
576+ lims $ upper <- qhyper(1 - gamma / 2 , N , n , k )
577+ }
578+ lims
579+ }
580+
581+ # ' Helper for 'adjust_gamma_simulate`
582+ # ' Transforms observations in 'x' into their corresponding fractional ranks.
583+ # ' @noRd
584+ u_scale <- function (x ) {
585+ array (rank(x ) / length(x ), dim = dim(x ), dimnames = dimnames(x ))
586+ }
587+
270588# labels ----------------------------------------------------------------
271589create_rep_ids <- function (ids ) paste(' italic(y)[rep] (' , ids , " )" )
272590y_label <- function () expression(italic(y ))
0 commit comments