From b0a778a71d6693122692b88a543870a6d4c54919 Mon Sep 17 00:00:00 2001 From: Matthew Kay Date: Thu, 18 Jan 2024 20:27:00 -0600 Subject: [PATCH] density, cdf, quantiles for weighted rvar --- R/rvar-dist.R | 25 ++++--- R/weighted.R | 118 ++++++++++++++++++++++++++++++++ tests/testthat/test-rvar-dist.R | 52 ++++++++++++-- 3 files changed, 176 insertions(+), 19 deletions(-) create mode 100644 R/weighted.R diff --git a/R/rvar-dist.R b/R/rvar-dist.R index ee57c162..39bf2f76 100755 --- a/R/rvar-dist.R +++ b/R/rvar-dist.R @@ -40,8 +40,9 @@ #' @name rvar-dist #' @export density.rvar <- function(x, at, ...) { + weights <- weights(x) summarise_rvar_by_element(x, function(draws) { - d <- density(draws, cut = 0, ...) + d <- density(draws, weights = weights, cut = 0, ...) f <- approxfun(d$x, d$y, yleft = 0, yright = 0) f(at) }) @@ -50,11 +51,12 @@ density.rvar <- function(x, at, ...) { #' @rdname rvar-dist #' @export density.rvar_factor <- function(x, at, ...) { + weights <- weights(x) at <- as.numeric(factor(at, levels = levels(x))) - nbins <- nlevels(x) summarise_rvar_by_element(x, function(draws) { - props <- prop.table(tabulate(draws, nbins = nbins))[at] + tab <- weighted_simple_table(draws, weights) + props <- prop.table(tab$count)[at] props }) } @@ -66,8 +68,9 @@ distributional::cdf #' @rdname rvar-dist #' @export cdf.rvar <- function(x, q, ...) { + weights <- weights(x) summarise_rvar_by_element(x, function(draws) { - ecdf(draws)(q) + weighted_ecdf(draws, weights)(q) }) } @@ -76,7 +79,7 @@ cdf.rvar <- function(x, q, ...) { cdf.rvar_factor <- function(x, q, ...) { # CDF is not defined for unordered distributions # generate an all-NA array of the appropriate shape - out <- rep_len(NA, length(x) * length(q)) + out <- rep_len(NA_real_, length(x) * length(q)) if (length(x) > 1) dim(out) <- c(length(q), dim(x)) out } @@ -91,14 +94,10 @@ cdf.rvar_ordered <- function(x, q, ...) { #' @rdname rvar-dist #' @export quantile.rvar <- function(x, probs, ...) { - summarise_rvar_by_element_via_matrix(x, - "quantile", - function(draws) { - t(matrixStats::colQuantiles(draws, probs = probs, useNames = TRUE, ...)) - }, - .extra_dim = length(probs), - .extra_dimnames = list(NULL) - ) + weights <- weights(x) + summarise_rvar_by_element(x, function(draws) { + weighted_quantile(draws, probs = probs, weights = weights, ...) + }) } #' @rdname rvar-dist diff --git a/R/weighted.R b/R/weighted.R new file mode 100644 index 00000000..ec16ade5 --- /dev/null +++ b/R/weighted.R @@ -0,0 +1,118 @@ +# weighted distribution functions -------------------------------------------- + +#' Weighted version of [stats::ecdf()]. +#' Based on ggdist::weighted_ecdf(). +#' @noRd +weighted_ecdf = function(x, weights = NULL) { + n = length(x) + if (n < 1) stop("Need at least 1 or more values to calculate an ECDF") + + weights = if (is.null(weights)) rep(1, n) else weights + + #sort only if necessary + if (is.unsorted(x)) { + sort_order = order(x) + x = x[sort_order] + weights = weights[sort_order] + } + + # calculate weighted cumulative probabilities + p = cumsum(weights) + p = p/p[n] + + approxfun(x, p, yleft = 0, yright = 1, ties = "ordered", method = "constant") +} + +#' Weighted version of [stats::quantile()]. +#' Based on ggdist::weighted_quantile(). +#' @noRd +weighted_quantile = function(x, + probs = seq(0, 1, 0.25), + weights = NULL, + na.rm = FALSE, + type = 7 +) { + weighted_quantile_fun( + x, + weights = weights, + na.rm = na.rm, + type = type + )(probs) +} + +#' @rdname weighted_quantile +#' @export +weighted_quantile_fun = function(x, weights = NULL, na.rm = FALSE, type = 7) { + na.rm <- as_one_logical(na.rm) + if (!isTRUE(type %in% 1:9)) { + stop0("Quantile type `", deparse0(type), "` is invalid. It must be in 1:9.") + } + + if (na.rm) { + keep = !is.na(x) & !is.na(weights) + x = x[keep] + weights = weights[keep] + } + + # determine weights + weights = weights %||% rep(1, length(x)) + non_zero = weights != 0 + x = x[non_zero] + weights = weights[non_zero] + weights = weights / sum(weights) + + # if there is only 0 or 1 x values, we don't need the weighted version (and + # we couldn't calculate it anyway as we need > 2 points for the interpolation) + if (length(x) <= 1) { + return(function(p) quantile(x, p, names = FALSE)) + } + + # sort values if necessary + if (is.unsorted(x)) { + x_order = order(x) + x = x[x_order] + weights = weights[x_order] + } + + # calculate the weighted CDF + F_k = cumsum(weights) + + # generate the function for the approximate inverse CDF + if (1 <= type && type <= 3) { + # discontinuous quantiles + switch(type, + # type 1 + stepfun(F_k, c(x, x[length(x)]), right = TRUE), + # type 2 + { + x_over_2 = c(x, x[length(x)])/2 + inverse_cdf_type2_left = stepfun(F_k, x_over_2, right = FALSE) + inverse_cdf_type2_right = stepfun(F_k, x_over_2, right = TRUE) + function(x) inverse_cdf_type2_left(x) + inverse_cdf_type2_right(x) + }, + # type 3 + stepfun(F_k - weights/2, c(x[[1]], x), right = TRUE) + ) + } else { + # Continuous quantiles. These are based on the definition of p_k as described + # in the documentation of `quantile()`. The trick to re-writing those formulas + # (which use `n` and `k`) for the weighted case is that `k` = `F_k * n` and + # `1/n` = `weight_k`. Using these two facts, we can express the formulas for + # `p_k` without using `n` or `k`, which don't really apply in the weighted case. + p_k = switch(type - 3, + # type 4 + F_k, + # type 5 + F_k - weights/2, + # type 6 + F_k / (1 + weights), + # type 7 + (F_k - weights) / (1 - weights), + # type 8 + (F_k - weights/3) / (1 + weights/3), + # type 9 + (F_k - weights*3/8) / (1 + weights/4) + ) + approxfun(p_k, x, rule = 2, ties = "ordered") + } +} diff --git a/tests/testthat/test-rvar-dist.R b/tests/testthat/test-rvar-dist.R index b4d7a658..e3d44b06 100755 --- a/tests/testthat/test-rvar-dist.R +++ b/tests/testthat/test-rvar-dist.R @@ -33,7 +33,7 @@ test_that("distributional functions work on an rvar array", { q21 <- quantile(4:6, p) q12 <- quantile(7:9, p) q22 <- quantile(10:12, p) - x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2), dimnames = list(NULL)) + x_quantiles <- array(c(q11, q21, q12, q22), dim = c(9, 2, 2)) expect_equal(quantile(x, p), x_quantiles) }) @@ -43,14 +43,14 @@ test_that("distributional functions work on an rvar_factor", { x <- rvar_factor(x_letters, levels = letters[1:5]) x2 <- c(rvar_factor(letters), rvar_factor(letters)) - expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA)) + expect_equal(density(x, letters[1:6]), c(0, .3, .2, .4, .1, NA_real_)) expect_equal(density(x2, letters[1:3]), array(rep(1/26, 6), dim = c(3,2))) - expect_equal(cdf(x, letters[1:5]), c(NA, NA, NA, NA, NA)) - expect_equal(cdf(x2, letters[1:3]), array(rep(NA, 6), dim = c(3,2))) + expect_equal(cdf(x, letters[1:5]), c(NA_real_, NA_real_, NA_real_, NA_real_, NA_real_)) + expect_equal(cdf(x2, letters[1:3]), array(rep(NA_real_, 6), dim = c(3,2))) - expect_equal(quantile(x, 1:4/4), c(NA, NA, NA, NA)) - expect_equal(quantile(x2, 1:3/3), array(rep(NA, 6), dim = c(3,2))) + expect_equal(quantile(x, 1:4/4), c(NA_real_, NA_real_, NA_real_, NA_real_)) + expect_equal(quantile(x2, 1:3/3), array(rep(NA_real_, 6), dim = c(3,2))) }) test_that("distributional functions work on an rvar_ordered", { @@ -64,3 +64,43 @@ test_that("distributional functions work on an rvar_ordered", { expect_equal(quantile(x, c(.3, .5, .9, 1)), letters[2:5]) }) + +# weighted rvar ----------------------------------------------------------- + +test_that("weighted rvar works", { + x1_draws = qnorm(ppoints(10)) + x2_draws = qnorm(ppoints(10), 5) + w1 = rep(1, 10) + w2 = rep(2, 10) + w3 = rep(0, 10) + x = rvar(c(x1_draws, x2_draws, rep(10, 10)), log_weights = log(c(w1, w2, w3))) + + expect_equal( + density(x, 0:9, bw = 2.25), + density(draws_of(x), weights = weights(x), bw = 2.25, from = 0, to = 9, n = 10)$y, + tolerance = 1e-4 + ) + expect_equal(cdf(x, 0:9), ecdf(x1_draws)(0:9)/3 + ecdf(x2_draws)(0:9)*2/3) + expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 1), c(x1_draws, x2_draws)) + expect_equal(quantile(x, cdf(x, c(x1_draws, x2_draws)), type = 4), c(x1_draws, x2_draws)) +}) + +test_that("weighted rvar_factor works", { + x = rvar_factor(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6))) + + expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19))) + expect_equal(cdf(x, letters), rep(NA_real_, 26)) + expect_equal(quantile(x, c(0.2, 0.8)), rep(NA_real_, 2)) +}) + +test_that("weighted rvar_ordered works", { + x = rvar_ordered(c("b", "g", "f", "g"), levels = letters, log_weights = log(c(1/2, 1/6, 1/6, 1/6))) + + expect_equal(density(x, letters), c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19))) + expect_equal(cdf(x, letters), cumsum(c(0, 1/2, 0, 0, 0, 1/6, 1/3, rep(0, 19)))) + expect_equal(quantile(x, c(0.2, 0.6, 0.8)), c("b", "f", "g")) + + xl = weight_draws(rvar_ordered(letters), 1:26) + expect_equal(quantile(xl, cdf(xl, letters) - .Machine$double.eps), letters) +}) +