Skip to content

Commit

Permalink
add nomatch argument
Browse files Browse the repository at this point in the history
  • Loading branch information
mnwright committed Jun 12, 2024
1 parent 6b8aafd commit 9dc2fc5
Show file tree
Hide file tree
Showing 7 changed files with 128 additions and 14 deletions.
7 changes: 6 additions & 1 deletion R/expct.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
#' each row in \code{evidence} is a separate conditioning event for which \code{n_synth} synthetic samples
#' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples.
#' @param round Round continuous variables to their respective maximum precision in the real data set?
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of evidence rows handled in one for each step.
#' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
Expand Down Expand Up @@ -85,6 +89,7 @@ expct <- function(
evidence = NULL,
evidence_row_mode = c("separate", "or"),
round = FALSE,
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

Expand Down Expand Up @@ -149,7 +154,7 @@ expct <- function(
index_start <- (step_-1)*stepsize + 1
index_end <- min(step_*stepsize, nrow(evidence))
evidence_part <- evidence[index_start:index_end,]
cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde)
cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde)
}

# omega contains the weight (wt) for each leaf (f_idx) for each condition (c_idx)
Expand Down
7 changes: 6 additions & 1 deletion R/forge.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples.
#' @param round Round continuous variables to their respective maximum precision in the real data set?
#' @param sample_NAs Sample NAs respecting the probability for missing values in the original data.
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of evidence rows handled in one for each step.
#' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
Expand Down Expand Up @@ -102,6 +106,7 @@ forge <- function(
evidence_row_mode = c("separate", "or"),
round = TRUE,
sample_NAs = FALSE,
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

Expand Down Expand Up @@ -153,7 +158,7 @@ forge <- function(
index_start <- (step_-1)*stepsize + 1
index_end <- min(step_*stepsize, nrow(evidence))
evidence_part <- evidence[index_start:index_end,]
cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde)
cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde)
if (is.null(cparams)) {
n_synth <- n_synth * nrow(evidence_part)
}
Expand Down
50 changes: 38 additions & 12 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ post_x <- function(x, params, round = TRUE) {
#' @param params Circuit parameters learned via \code{\link{forde}}.
#' @param evidence Data frame of conditioning event(s).
#' @param row_mode Interpretation of rows in multi-row conditions.
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of condition rows handled in one for each step.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
#' via \code{doParallel}.
Expand All @@ -214,7 +218,12 @@ post_x <- function(x, params, round = TRUE) {
#' @importFrom stats dunif punif
#' @keywords internal

cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = 0, parallel = TRUE) {
cforde <- function(params,
evidence,
row_mode = c("separate", "or"),
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

row_mode <- match.arg(row_mode)

Expand Down Expand Up @@ -416,9 +425,11 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize =
# Check for conditions with no matching leaves and handle this according to row_mode
if (relevant_leaves[,uniqueN(c_idx)] < nconds_conditioned) {
if (relevant_leaves[,uniqueN(c_idx)] == 0 & row_mode == "or") {
stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()")
stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().")
} else {
warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()")
if (grepl("warning$", nomatch)) {
warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().")
}
conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])]
relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_)))
}
Expand All @@ -442,23 +453,38 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize =
# Re-calculate weights and transform back from log scale, handle (numerically) impossible cases
if (row_mode == "or") {
if (cvg_new[,all(cvg == -Inf)]) {
warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination.")
cvg_new[, cvg := 1/.N]
if (grepl("^force", nomatch)) {
cvg_new[, cvg := 1/.N]
} else {
cvg_new[, cvg := NA]
}
if (grepl("warning$", nomatch)) {
warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.")
}
} else {
cvg_new[, cvg := exp(cvg - max(cvg))]
cvg_new <- cvg_new[, cvg := cvg / sum(cvg)]
}
} else {
cvg_new[, leaf_zero_lik := all(cvg == -Inf), by = c_idx]
if (any(cvg_new[, leaf_zero_lik])) {
warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination.")
cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx]
if (grepl("^force", nomatch)) {
cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx]
} else {
cvg_new <- cvg_new[leaf_zero_lik == FALSE, ]
}
if (grepl("warning$", nomatch)) {
warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.")
}
}
if (any(cvg_new[, !leaf_zero_lik])) {
cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)]
cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale]
cvg_new[, scale := NULL]
}
cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)]
cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale]
cvg_new[, `:=` (leaf_zero_lik = NULL, scale = NULL)]
cvg_new[, leaf_zero_lik := NULL]
}
}

Expand Down
6 changes: 6 additions & 0 deletions man/cforde.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions man/expct.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions man/forge.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 60 additions & 0 deletions tests/testthat/test-conditions.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,63 @@ test_that("categorical conditions work with not", {
expect_in(as.character(synth$Species), c("versicolor", "virginica"))
})

test_that("if nomatch='force_warning', run through with a warning", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "force_warning", n_synth = 10, parallel = FALSE),
"All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.")
expect_true(all(!is.na(x_synth)))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "force_warning", n_synth = 10, parallel = FALSE),
"For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.")
expect_true(all(!is.na(x_synth)))
})

test_that("if nomatch='force', run through without a warning", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "force", n_synth = 10, parallel = FALSE))
expect_true(all(!is.na(x_synth)))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "force", n_synth = 10, parallel = FALSE))
expect_true(all(!is.na(x_synth)))
})

test_that("if nomatch='na_warning', run through with a warning and return NA", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "na_warning", n_synth = 10, parallel = FALSE),
"All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.")
expect_true(all(is.na(x_synth[, -1])))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "na_warning", n_synth = 10, parallel = FALSE),
"For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.")
expect_true(all(is.na(x_synth[, -1])))
})

test_that("if nomatch='na', run through without a warning and return NA", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "na", n_synth = 10, parallel = FALSE))
expect_true(all(is.na(x_synth[, -1])))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "na", n_synth = 10, parallel = FALSE))
expect_true(all(is.na(x_synth[, -1])))
})

0 comments on commit 9dc2fc5

Please sign in to comment.