diff --git a/R/expct.R b/R/expct.R index 98183cdc..cede8076 100644 --- a/R/expct.R +++ b/R/expct.R @@ -17,6 +17,10 @@ #' each row in \code{evidence} is a separate conditioning event for which \code{n_synth} synthetic samples #' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples. #' @param round Round continuous variables to their respective maximum precision in the real data set? +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of evidence rows handled in one for each step. #' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. @@ -94,10 +98,12 @@ expct <- function( evidence = NULL, evidence_row_mode = c("separate", "or"), round = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE) { evidence_row_mode <- match.arg(evidence_row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues variable <- tree <- f_idx <- cvg <- wt <- V1 <- value <- val <- family <- @@ -158,7 +164,7 @@ expct <- function( index_start <- (step_-1)*stepsize + 1 index_end <- min(step_*stepsize, nrow(evidence)) evidence_part <- evidence[index_start:index_end,] - cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde) + cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde) } # omega contains the weight (wt) for each leaf (f_idx) for each condition (c_idx) diff --git a/R/forge.R b/R/forge.R index fd8e095b..3bf0c88c 100644 --- a/R/forge.R +++ b/R/forge.R @@ -13,6 +13,10 @@ #' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples. #' @param round Round continuous variables to their respective maximum precision in the real data set? #' @param sample_NAs Sample NAs respecting the probability for missing values in the original data. +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of evidence rows handled in one for each step. #' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. @@ -111,10 +115,12 @@ forge <- function( evidence_row_mode = c("separate", "or"), round = TRUE, sample_NAs = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE) { evidence_row_mode <- match.arg(evidence_row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues tree <- cvg <- leaf <- idx <- family <- mu <- sigma <- prob <- dat <- @@ -162,7 +168,7 @@ forge <- function( index_start <- (step_-1)*stepsize + 1 index_end <- min(step_*stepsize, nrow(evidence)) evidence_part <- evidence[index_start:index_end,] - cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde) + cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde) if (is.null(cparams)) { n_synth <- n_synth * nrow(evidence_part) } @@ -185,15 +191,6 @@ forge <- function( } omega <- omega[wt > 0, ] - # Use random leaves if NA (no matching leaves found) - if (omega[, any(is.na(f_idx))] & omega[, any(!is.na(f_idx))]) { - row_idx <- sample(nrow(omega[!is.na(f_idx), ]), omega[, sum(is.na(f_idx))], replace = TRUE) - temp <- omega[!is.na(f_idx), ][row_idx, .(f_idx, f_idx_uncond)] - omega[is.na(f_idx), f_idx_uncond := temp[, f_idx_uncond]] - omega[is.na(f_idx), f_idx := temp[, f_idx]] - } - - # For each synthetic sample and condition, draw a leaf according to the leaf weights if (nrow(omega) == 1) { omega <- omega[rep(1, n_synth),][, idx := .I] diff --git a/R/utils.R b/R/utils.R index 546ac9de..e2ca1368 100644 --- a/R/utils.R +++ b/R/utils.R @@ -200,6 +200,10 @@ post_x <- function(x, params, round = TRUE) { #' @param params Circuit parameters learned via \code{\link{forde}}. #' @param evidence Data frame of conditioning event(s). #' @param row_mode Interpretation of rows in multi-row conditions. +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of condition rows handled in one for each step. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. #' via \code{doParallel} or \code{doFuture}; see examples. @@ -214,9 +218,15 @@ post_x <- function(x, params, round = TRUE) { #' @importFrom stats dunif punif #' @keywords internal -cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = 0, parallel = TRUE) { +cforde <- function(params, + evidence, + row_mode = c("separate", "or"), + nomatch = c("force_warning", "force", "na_warning", "na"), + stepsize = 0, + parallel = TRUE) { row_mode <- match.arg(row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues . <- c_idx <- cvg <- cvg_arf <- cvg_factor <- f_idx <- f_idx_uncond <- i.max <- @@ -414,13 +424,21 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = cat_new <- setcolorder(merge(relevant_leaves, updates_relevant_leaves$cat_new, by.x = c("c_idx", "f_idx_uncond"), by.y = c("c_idx", "f_idx"), sort = F), c("f_idx","c_idx","variable","val","prob","cvg_factor"))[] # Check for conditions with no matching leaves and handle this according to row_mode + conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] if (relevant_leaves[,uniqueN(c_idx)] < nconds_conditioned) { if (relevant_leaves[,uniqueN(c_idx)] == 0 & row_mode == "or") { - stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()") + stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") } else { - warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()") - conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] - relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_))) + if (grepl("warning$", nomatch)) { + wrn <- "For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all leaves with equal probability (can be changed with 'nomatch' argument).")) + } else { + warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument).")) + } + } + impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_) + relevant_leaves <- setorder(rbind(relevant_leaves, impossible_leaves)) } } @@ -442,8 +460,19 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = # Re-calculate weights and transform back from log scale, handle (numerically) impossible cases if (row_mode == "or") { if (cvg_new[,all(cvg == -Inf)]) { - warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination.") - cvg_new[, cvg := 1/.N] + if (grepl("^force", nomatch)) { + cvg_new[, cvg := 1/.N] + } else { + cvg_new[, cvg := NA] + } + if (grepl("warning$", nomatch)) { + wrn <- "All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all possible leaves with equal probability.")) + } else { + warning(paste(wrn, "Returning NA.")) + } + } } else { cvg_new[, cvg := exp(cvg - max(cvg))] cvg_new <- cvg_new[, cvg := cvg / sum(cvg)] @@ -451,14 +480,28 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = } else { cvg_new[, leaf_zero_lik := all(cvg == -Inf), by = c_idx] if (any(cvg_new[, leaf_zero_lik])) { - warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination.") - cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx] + if (grepl("^force", nomatch)) { + cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx] + } else { + cvg_new <- cvg_new[leaf_zero_lik == FALSE, ] + } + if (grepl("warning$", nomatch)) { + wrn <- "All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all possible leaves with equal probability (can be changed with 'nomatch' argument).")) + } else { + warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument).")) + } + } + } + if (any(cvg_new[, !leaf_zero_lik])) { + cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx] + cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)] + cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx] + cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale] + cvg_new[, scale := NULL] } - cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx] - cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)] - cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx] - cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale] - cvg_new[, `:=` (leaf_zero_lik = NULL, scale = NULL)] + cvg_new[, leaf_zero_lik := NULL] } } @@ -478,12 +521,12 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = } # Add all leaves for all-NA conditions to forest - if (row_mode == "separate" & (nconds != nconds_conditioned)) { - conds_unconditioned <- (1:nconds)[!(1:nconds) %in% conds_conditioned] + if ((grepl("^force", nomatch) & length(conds_impossible) > 0) | (row_mode == "separate" & nconds != nconds_conditioned)) { + conds_unconditioned <- c(conds_impossible, (1:nconds)[!(1:nconds) %in% conds_conditioned]) forest_new_unconditioned <- copy(forest) forest_new_unconditioned <- rbindlist(replicate(length(conds_unconditioned), forest, simplify = F)) forest_new_unconditioned[, `:=` (c_idx = rep(conds_unconditioned,each = nrow(forest)), f_idx_uncond = f_idx, cvg_arf = cvg)] - forest_new <- rbind(forest_new, forest_new_unconditioned) + forest_new <- rbind(forest_new, forest_new_unconditioned)[!is.na(f_idx), ] } setorder(setcolorder(forest_new,c("f_idx","c_idx","f_idx_uncond","tree","leaf","cvg_arf","cvg")), c_idx, f_idx, f_idx_uncond, tree, leaf) diff --git a/man/cforde.Rd b/man/cforde.Rd index 6cc71e58..9853a557 100644 --- a/man/cforde.Rd +++ b/man/cforde.Rd @@ -8,6 +8,7 @@ cforde( params, evidence, row_mode = c("separate", "or"), + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -19,6 +20,11 @@ cforde( \item{row_mode}{Interpretation of rows in multi-row conditions.} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of condition rows handled in one for each step.} \item{parallel}{Compute in parallel? Must register backend beforehand, e.g. diff --git a/man/expct.Rd b/man/expct.Rd index 13674810..87448865 100644 --- a/man/expct.Rd +++ b/man/expct.Rd @@ -10,6 +10,7 @@ expct( evidence = NULL, evidence_row_mode = c("separate", "or"), round = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -34,6 +35,11 @@ are generated. If \code{'or'}, the rows are combined with a logical or; see Exam \item{round}{Round continuous variables to their respective maximum precision in the real data set?} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of evidence rows handled in one for each step. Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.} diff --git a/man/forge.Rd b/man/forge.Rd index 4dcb92f1..e8df54cc 100644 --- a/man/forge.Rd +++ b/man/forge.Rd @@ -11,6 +11,7 @@ forge( evidence_row_mode = c("separate", "or"), round = TRUE, sample_NAs = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -34,6 +35,11 @@ are generated. If \code{'or'}, the rows are combined with a logical or; see Exam \item{sample_NAs}{Sample NAs respecting the probability for missing values in the original data.} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of evidence rows handled in one for each step. Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.} diff --git a/tests/testthat/test-conditions.R b/tests/testthat/test-conditions.R index eeda5106..fe60fd14 100644 --- a/tests/testthat/test-conditions.R +++ b/tests/testthat/test-conditions.R @@ -25,3 +25,63 @@ test_that("categorical conditions work with not", { expect_in(as.character(synth$Species), c("versicolor", "virginica")) }) +test_that("if nomatch='force_warning', run through with a warning", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "force_warning", n_synth = 10, parallel = FALSE), + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all possible leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.") + expect_true(all(!is.na(x_synth))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "force_warning", n_synth = 10, parallel = FALSE), + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Sampling from all leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.") + expect_true(all(!is.na(x_synth))) +}) + +test_that("if nomatch='force', run through without a warning", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "force", n_synth = 10, parallel = FALSE)) + expect_true(all(!is.na(x_synth))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "force", n_synth = 10, parallel = FALSE)) + expect_true(all(!is.na(x_synth))) +}) + +test_that("if nomatch='na_warning', run through with a warning and return NA", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "na_warning", n_synth = 10, parallel = FALSE), + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.") + expect_true(all(is.na(x_synth[, -1]))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "na_warning", n_synth = 10, parallel = FALSE), + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.") + expect_true(all(is.na(x_synth[, -1]))) +}) + +test_that("if nomatch='na', run through without a warning and return NA", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "na", n_synth = 10, parallel = FALSE)) + expect_true(all(is.na(x_synth[, -1]))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "na", n_synth = 10, parallel = FALSE)) + expect_true(all(is.na(x_synth[, -1]))) +}) +