Skip to content

Commit

Permalink
Merge pull request #33 from bips-hb/nomatch
Browse files Browse the repository at this point in the history
Add nomatch argument
  • Loading branch information
mnwright authored Jun 14, 2024
2 parents ea341a5 + 27449eb commit c61d849
Show file tree
Hide file tree
Showing 7 changed files with 152 additions and 28 deletions.
8 changes: 7 additions & 1 deletion R/expct.R
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,10 @@
#' each row in \code{evidence} is a separate conditioning event for which \code{n_synth} synthetic samples
#' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples.
#' @param round Round continuous variables to their respective maximum precision in the real data set?
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of evidence rows handled in one for each step.
#' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
Expand Down Expand Up @@ -94,10 +98,12 @@ expct <- function(
evidence = NULL,
evidence_row_mode = c("separate", "or"),
round = FALSE,
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

evidence_row_mode <- match.arg(evidence_row_mode)
nomatch <- match.arg(nomatch)

# To avoid data.table check issues
variable <- tree <- f_idx <- cvg <- wt <- V1 <- value <- val <- family <-
Expand Down Expand Up @@ -158,7 +164,7 @@ expct <- function(
index_start <- (step_-1)*stepsize + 1
index_end <- min(step_*stepsize, nrow(evidence))
evidence_part <- evidence[index_start:index_end,]
cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde)
cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde)
}

# omega contains the weight (wt) for each leaf (f_idx) for each condition (c_idx)
Expand Down
17 changes: 7 additions & 10 deletions R/forge.R
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,10 @@
#' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples.
#' @param round Round continuous variables to their respective maximum precision in the real data set?
#' @param sample_NAs Sample NAs respecting the probability for missing values in the original data.
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of evidence rows handled in one for each step.
#' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
Expand Down Expand Up @@ -111,10 +115,12 @@ forge <- function(
evidence_row_mode = c("separate", "or"),
round = TRUE,
sample_NAs = FALSE,
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

evidence_row_mode <- match.arg(evidence_row_mode)
nomatch <- match.arg(nomatch)

# To avoid data.table check issues
tree <- cvg <- leaf <- idx <- family <- mu <- sigma <- prob <- dat <-
Expand Down Expand Up @@ -162,7 +168,7 @@ forge <- function(
index_start <- (step_-1)*stepsize + 1
index_end <- min(step_*stepsize, nrow(evidence))
evidence_part <- evidence[index_start:index_end,]
cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde)
cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde)
if (is.null(cparams)) {
n_synth <- n_synth * nrow(evidence_part)
}
Expand All @@ -185,15 +191,6 @@ forge <- function(
}
omega <- omega[wt > 0, ]

# Use random leaves if NA (no matching leaves found)
if (omega[, any(is.na(f_idx))] & omega[, any(!is.na(f_idx))]) {
row_idx <- sample(nrow(omega[!is.na(f_idx), ]), omega[, sum(is.na(f_idx))], replace = TRUE)
temp <- omega[!is.na(f_idx), ][row_idx, .(f_idx, f_idx_uncond)]
omega[is.na(f_idx), f_idx_uncond := temp[, f_idx_uncond]]
omega[is.na(f_idx), f_idx := temp[, f_idx]]
}


# For each synthetic sample and condition, draw a leaf according to the leaf weights
if (nrow(omega) == 1) {
omega <- omega[rep(1, n_synth),][, idx := .I]
Expand Down
77 changes: 60 additions & 17 deletions R/utils.R
Original file line number Diff line number Diff line change
Expand Up @@ -200,6 +200,10 @@ post_x <- function(x, params, round = TRUE) {
#' @param params Circuit parameters learned via \code{\link{forde}}.
#' @param evidence Data frame of conditioning event(s).
#' @param row_mode Interpretation of rows in multi-row conditions.
#' @param nomatch What to do if no leaf matches a condition in \code{evidence}?
#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"})
#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning
#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.
#' @param stepsize Stepsize defining number of condition rows handled in one for each step.
#' @param parallel Compute in parallel? Must register backend beforehand, e.g.
#' via \code{doParallel} or \code{doFuture}; see examples.
Expand All @@ -214,9 +218,15 @@ post_x <- function(x, params, round = TRUE) {
#' @importFrom stats dunif punif
#' @keywords internal

cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = 0, parallel = TRUE) {
cforde <- function(params,
evidence,
row_mode = c("separate", "or"),
nomatch = c("force_warning", "force", "na_warning", "na"),
stepsize = 0,
parallel = TRUE) {

row_mode <- match.arg(row_mode)
nomatch <- match.arg(nomatch)

# To avoid data.table check issues
. <- c_idx <- cvg <- cvg_arf <- cvg_factor <- f_idx <- f_idx_uncond <- i.max <-
Expand Down Expand Up @@ -414,13 +424,21 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize =
cat_new <- setcolorder(merge(relevant_leaves, updates_relevant_leaves$cat_new, by.x = c("c_idx", "f_idx_uncond"), by.y = c("c_idx", "f_idx"), sort = F), c("f_idx","c_idx","variable","val","prob","cvg_factor"))[]

# Check for conditions with no matching leaves and handle this according to row_mode
conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])]
if (relevant_leaves[,uniqueN(c_idx)] < nconds_conditioned) {
if (relevant_leaves[,uniqueN(c_idx)] == 0 & row_mode == "or") {
stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()")
stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().")
} else {
warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()")
conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])]
relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_)))
if (grepl("warning$", nomatch)) {
wrn <- "For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()."
if (grepl("^force", nomatch)) {
warning(paste(wrn, "Sampling from all leaves with equal probability (can be changed with 'nomatch' argument)."))
} else {
warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument)."))
}
}
impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_)
relevant_leaves <- setorder(rbind(relevant_leaves, impossible_leaves))
}
}

Expand All @@ -442,23 +460,48 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize =
# Re-calculate weights and transform back from log scale, handle (numerically) impossible cases
if (row_mode == "or") {
if (cvg_new[,all(cvg == -Inf)]) {
warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination.")
cvg_new[, cvg := 1/.N]
if (grepl("^force", nomatch)) {
cvg_new[, cvg := 1/.N]
} else {
cvg_new[, cvg := NA]
}
if (grepl("warning$", nomatch)) {
wrn <- "All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination."
if (grepl("^force", nomatch)) {
warning(paste(wrn, "Sampling from all possible leaves with equal probability."))
} else {
warning(paste(wrn, "Returning NA."))
}
}
} else {
cvg_new[, cvg := exp(cvg - max(cvg))]
cvg_new <- cvg_new[, cvg := cvg / sum(cvg)]
}
} else {
cvg_new[, leaf_zero_lik := all(cvg == -Inf), by = c_idx]
if (any(cvg_new[, leaf_zero_lik])) {
warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination.")
cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx]
if (grepl("^force", nomatch)) {
cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx]
} else {
cvg_new <- cvg_new[leaf_zero_lik == FALSE, ]
}
if (grepl("warning$", nomatch)) {
wrn <- "All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination."
if (grepl("^force", nomatch)) {
warning(paste(wrn, "Sampling from all possible leaves with equal probability (can be changed with 'nomatch' argument)."))
} else {
warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument)."))
}
}
}
if (any(cvg_new[, !leaf_zero_lik])) {
cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)]
cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale]
cvg_new[, scale := NULL]
}
cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)]
cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx]
cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale]
cvg_new[, `:=` (leaf_zero_lik = NULL, scale = NULL)]
cvg_new[, leaf_zero_lik := NULL]
}
}

Expand All @@ -478,12 +521,12 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize =
}

# Add all leaves for all-NA conditions to forest
if (row_mode == "separate" & (nconds != nconds_conditioned)) {
conds_unconditioned <- (1:nconds)[!(1:nconds) %in% conds_conditioned]
if ((grepl("^force", nomatch) & length(conds_impossible) > 0) | (row_mode == "separate" & nconds != nconds_conditioned)) {
conds_unconditioned <- c(conds_impossible, (1:nconds)[!(1:nconds) %in% conds_conditioned])
forest_new_unconditioned <- copy(forest)
forest_new_unconditioned <- rbindlist(replicate(length(conds_unconditioned), forest, simplify = F))
forest_new_unconditioned[, `:=` (c_idx = rep(conds_unconditioned,each = nrow(forest)), f_idx_uncond = f_idx, cvg_arf = cvg)]
forest_new <- rbind(forest_new, forest_new_unconditioned)
forest_new <- rbind(forest_new, forest_new_unconditioned)[!is.na(f_idx), ]
}

setorder(setcolorder(forest_new,c("f_idx","c_idx","f_idx_uncond","tree","leaf","cvg_arf","cvg")), c_idx, f_idx, f_idx_uncond, tree, leaf)
Expand Down
6 changes: 6 additions & 0 deletions man/cforde.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions man/expct.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

6 changes: 6 additions & 0 deletions man/forge.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

60 changes: 60 additions & 0 deletions tests/testthat/test-conditions.R
Original file line number Diff line number Diff line change
Expand Up @@ -25,3 +25,63 @@ test_that("categorical conditions work with not", {
expect_in(as.character(synth$Species), c("versicolor", "virginica"))
})

test_that("if nomatch='force_warning', run through with a warning", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "force_warning", n_synth = 10, parallel = FALSE),
"All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all possible leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.")
expect_true(all(!is.na(x_synth)))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "force_warning", n_synth = 10, parallel = FALSE),
"For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Sampling from all leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.")
expect_true(all(!is.na(x_synth)))
})

test_that("if nomatch='force', run through without a warning", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "force", n_synth = 10, parallel = FALSE))
expect_true(all(!is.na(x_synth)))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "force", n_synth = 10, parallel = FALSE))
expect_true(all(!is.na(x_synth)))
})

test_that("if nomatch='na_warning', run through with a warning and return NA", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "na_warning", n_synth = 10, parallel = FALSE),
"All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.")
expect_true(all(is.na(x_synth[, -1])))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "na_warning", n_synth = 10, parallel = FALSE),
"For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.")
expect_true(all(is.na(x_synth[, -1])))
})

test_that("if nomatch='na', run through without a warning and return NA", {
# Zero likelihood case (no finite bounds)
psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE)
expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100),
nomatch = "na", n_synth = 10, parallel = FALSE))
expect_true(all(is.na(x_synth[, -1])))

# No matching leaf case (finite bounds)
psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE)
expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100),
nomatch = "na", n_synth = 10, parallel = FALSE))
expect_true(all(is.na(x_synth[, -1])))
})

0 comments on commit c61d849

Please sign in to comment.