From 9dc2fc56ca8f42e7d9e2da1702279405a52dc646 Mon Sep 17 00:00:00 2001 From: Marvin Wright Date: Wed, 12 Jun 2024 21:48:47 +0200 Subject: [PATCH 1/5] add nomatch argument --- R/expct.R | 7 +++- R/forge.R | 7 +++- R/utils.R | 50 +++++++++++++++++++------- man/cforde.Rd | 6 ++++ man/expct.Rd | 6 ++++ man/forge.Rd | 6 ++++ tests/testthat/test-conditions.R | 60 ++++++++++++++++++++++++++++++++ 7 files changed, 128 insertions(+), 14 deletions(-) diff --git a/R/expct.R b/R/expct.R index 61593ea8..6a11c101 100644 --- a/R/expct.R +++ b/R/expct.R @@ -17,6 +17,10 @@ #' each row in \code{evidence} is a separate conditioning event for which \code{n_synth} synthetic samples #' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples. #' @param round Round continuous variables to their respective maximum precision in the real data set? +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of evidence rows handled in one for each step. #' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. @@ -85,6 +89,7 @@ expct <- function( evidence = NULL, evidence_row_mode = c("separate", "or"), round = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE) { @@ -149,7 +154,7 @@ expct <- function( index_start <- (step_-1)*stepsize + 1 index_end <- min(step_*stepsize, nrow(evidence)) evidence_part <- evidence[index_start:index_end,] - cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde) + cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde) } # omega contains the weight (wt) for each leaf (f_idx) for each condition (c_idx) diff --git a/R/forge.R b/R/forge.R index 4082bcdc..1ffd1cdb 100644 --- a/R/forge.R +++ b/R/forge.R @@ -13,6 +13,10 @@ #' are generated. If \code{'or'}, the rows are combined with a logical or; see Examples. #' @param round Round continuous variables to their respective maximum precision in the real data set? #' @param sample_NAs Sample NAs respecting the probability for missing values in the original data. +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of evidence rows handled in one for each step. #' Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. @@ -102,6 +106,7 @@ forge <- function( evidence_row_mode = c("separate", "or"), round = TRUE, sample_NAs = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE) { @@ -153,7 +158,7 @@ forge <- function( index_start <- (step_-1)*stepsize + 1 index_end <- min(step_*stepsize, nrow(evidence)) evidence_part <- evidence[index_start:index_end,] - cparams <- cforde(params, evidence_part, evidence_row_mode, stepsize_cforde, parallel_cforde) + cparams <- cforde(params, evidence_part, evidence_row_mode, nomatch, stepsize_cforde, parallel_cforde) if (is.null(cparams)) { n_synth <- n_synth * nrow(evidence_part) } diff --git a/R/utils.R b/R/utils.R index 07bb68bb..c50e8db6 100644 --- a/R/utils.R +++ b/R/utils.R @@ -200,6 +200,10 @@ post_x <- function(x, params, round = TRUE) { #' @param params Circuit parameters learned via \code{\link{forde}}. #' @param evidence Data frame of conditioning event(s). #' @param row_mode Interpretation of rows in multi-row conditions. +#' @param nomatch What to do if no leaf matches a condition in \code{evidence}? +#' Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +#' or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +#' (\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}. #' @param stepsize Stepsize defining number of condition rows handled in one for each step. #' @param parallel Compute in parallel? Must register backend beforehand, e.g. #' via \code{doParallel}. @@ -214,7 +218,12 @@ post_x <- function(x, params, round = TRUE) { #' @importFrom stats dunif punif #' @keywords internal -cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = 0, parallel = TRUE) { +cforde <- function(params, + evidence, + row_mode = c("separate", "or"), + nomatch = c("force_warning", "force", "na_warning", "na"), + stepsize = 0, + parallel = TRUE) { row_mode <- match.arg(row_mode) @@ -416,9 +425,11 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = # Check for conditions with no matching leaves and handle this according to row_mode if (relevant_leaves[,uniqueN(c_idx)] < nconds_conditioned) { if (relevant_leaves[,uniqueN(c_idx)] == 0 & row_mode == "or") { - stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()") + stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") } else { - warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()") + if (grepl("warning$", nomatch)) { + warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") + } conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_))) } @@ -442,8 +453,14 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = # Re-calculate weights and transform back from log scale, handle (numerically) impossible cases if (row_mode == "or") { if (cvg_new[,all(cvg == -Inf)]) { - warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination.") - cvg_new[, cvg := 1/.N] + if (grepl("^force", nomatch)) { + cvg_new[, cvg := 1/.N] + } else { + cvg_new[, cvg := NA] + } + if (grepl("warning$", nomatch)) { + warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.") + } } else { cvg_new[, cvg := exp(cvg - max(cvg))] cvg_new <- cvg_new[, cvg := cvg / sum(cvg)] @@ -451,14 +468,23 @@ cforde <- function(params, evidence, row_mode = c("separate", "or"), stepsize = } else { cvg_new[, leaf_zero_lik := all(cvg == -Inf), by = c_idx] if (any(cvg_new[, leaf_zero_lik])) { - warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination.") - cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx] + if (grepl("^force", nomatch)) { + cvg_new[leaf_zero_lik == TRUE, cvg := 1/.N, by = c_idx] + } else { + cvg_new <- cvg_new[leaf_zero_lik == FALSE, ] + } + if (grepl("warning$", nomatch)) { + warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.") + } + } + if (any(cvg_new[, !leaf_zero_lik])) { + cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx] + cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)] + cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx] + cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale] + cvg_new[, scale := NULL] } - cvg_new[leaf_zero_lik == FALSE, scale := max(cvg), by = c_idx] - cvg_new[leaf_zero_lik == FALSE, cvg := exp(cvg - scale)] - cvg_new[leaf_zero_lik == FALSE, scale := sum(cvg), by = c_idx] - cvg_new[leaf_zero_lik == FALSE, cvg := cvg / scale] - cvg_new[, `:=` (leaf_zero_lik = NULL, scale = NULL)] + cvg_new[, leaf_zero_lik := NULL] } } diff --git a/man/cforde.Rd b/man/cforde.Rd index 3972d193..85aedd19 100644 --- a/man/cforde.Rd +++ b/man/cforde.Rd @@ -8,6 +8,7 @@ cforde( params, evidence, row_mode = c("separate", "or"), + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -19,6 +20,11 @@ cforde( \item{row_mode}{Interpretation of rows in multi-row conditions.} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of condition rows handled in one for each step.} \item{parallel}{Compute in parallel? Must register backend beforehand, e.g. diff --git a/man/expct.Rd b/man/expct.Rd index 75f91a2c..48f19571 100644 --- a/man/expct.Rd +++ b/man/expct.Rd @@ -10,6 +10,7 @@ expct( evidence = NULL, evidence_row_mode = c("separate", "or"), round = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -34,6 +35,11 @@ are generated. If \code{'or'}, the rows are combined with a logical or; see Exam \item{round}{Round continuous variables to their respective maximum precision in the real data set?} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of evidence rows handled in one for each step. Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.} diff --git a/man/forge.Rd b/man/forge.Rd index 05a9159f..b5d4e5fe 100644 --- a/man/forge.Rd +++ b/man/forge.Rd @@ -11,6 +11,7 @@ forge( evidence_row_mode = c("separate", "or"), round = TRUE, sample_NAs = FALSE, + nomatch = c("force_warning", "force", "na_warning", "na"), stepsize = 0, parallel = TRUE ) @@ -34,6 +35,11 @@ are generated. If \code{'or'}, the rows are combined with a logical or; see Exam \item{sample_NAs}{Sample NAs respecting the probability for missing values in the original data.} +\item{nomatch}{What to do if no leaf matches a condition in \code{evidence}? +Options are to force sampling from a random leaf, either with a warning (\code{"force_warning"}) +or without a warning (\code{"force"}), or to return \code{NA}, also with a warning +(\code{"na_warning"}) or without a warning (\code{"na"}). The default is \code{"force_warning"}.} + \item{stepsize}{Stepsize defining number of evidence rows handled in one for each step. Defaults to nrow(evidence)/num_registered_workers for \code{parallel == TRUE}.} diff --git a/tests/testthat/test-conditions.R b/tests/testthat/test-conditions.R index eeda5106..43c9ce6a 100644 --- a/tests/testthat/test-conditions.R +++ b/tests/testthat/test-conditions.R @@ -25,3 +25,63 @@ test_that("categorical conditions work with not", { expect_in(as.character(synth$Species), c("versicolor", "virginica")) }) +test_that("if nomatch='force_warning', run through with a warning", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "force_warning", n_synth = 10, parallel = FALSE), + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.") + expect_true(all(!is.na(x_synth))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "force_warning", n_synth = 10, parallel = FALSE), + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.") + expect_true(all(!is.na(x_synth))) +}) + +test_that("if nomatch='force', run through without a warning", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "force", n_synth = 10, parallel = FALSE)) + expect_true(all(!is.na(x_synth))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "force", n_synth = 10, parallel = FALSE)) + expect_true(all(!is.na(x_synth))) +}) + +test_that("if nomatch='na_warning', run through with a warning and return NA", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "na_warning", n_synth = 10, parallel = FALSE), + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.") + expect_true(all(is.na(x_synth[, -1]))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "na_warning", n_synth = 10, parallel = FALSE), + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.") + expect_true(all(is.na(x_synth[, -1]))) +}) + +test_that("if nomatch='na', run through without a warning and return NA", { + # Zero likelihood case (no finite bounds) + psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) + expect_silent(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), + nomatch = "na", n_synth = 10, parallel = FALSE)) + expect_true(all(is.na(x_synth[, -1]))) + + # No matching leaf case (finite bounds) + psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) + expect_silent(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), + nomatch = "na", n_synth = 10, parallel = FALSE)) + expect_true(all(is.na(x_synth[, -1]))) +}) + From 6b287471691e141e2c069e71c16814b2a3e2ac65 Mon Sep 17 00:00:00 2001 From: "Marvin N. Wright" Date: Thu, 13 Jun 2024 07:41:15 +0200 Subject: [PATCH 2/5] better warning messages --- R/utils.R | 21 ++++++++++++++++++--- tests/testthat/test-conditions.R | 8 ++++---- 2 files changed, 22 insertions(+), 7 deletions(-) diff --git a/R/utils.R b/R/utils.R index c50e8db6..a4e00b5f 100644 --- a/R/utils.R +++ b/R/utils.R @@ -428,7 +428,12 @@ cforde <- function(params, stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") } else { if (grepl("warning$", nomatch)) { - warning("For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") + wrn <- "For some entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde()." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all leaves with equal probability (can be changed with 'nomatch' argument).")) + } else { + warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument).")) + } } conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_))) @@ -459,7 +464,12 @@ cforde <- function(params, cvg_new[, cvg := NA] } if (grepl("warning$", nomatch)) { - warning("All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.") + wrn <- "All leaves have zero likelihood. This is probably because evidence contains an (almost) impossible combination." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all possible leaves with equal probability.")) + } else { + warning(paste(wrn, "Returning NA.")) + } } } else { cvg_new[, cvg := exp(cvg - max(cvg))] @@ -474,7 +484,12 @@ cforde <- function(params, cvg_new <- cvg_new[leaf_zero_lik == FALSE, ] } if (grepl("warning$", nomatch)) { - warning("All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination. Sampling from all leaves with equal probability.") + wrn <- "All leaves have zero likelihood for some entered evidence rows. This is probably because evidence contains an (almost) impossible combination." + if (grepl("^force", nomatch)) { + warning(paste(wrn, "Sampling from all possible leaves with equal probability (can be changed with 'nomatch' argument).")) + } else { + warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument).")) + } } } if (any(cvg_new[, !leaf_zero_lik])) { diff --git a/tests/testthat/test-conditions.R b/tests/testthat/test-conditions.R index 43c9ce6a..fe60fd14 100644 --- a/tests/testthat/test-conditions.R +++ b/tests/testthat/test-conditions.R @@ -30,14 +30,14 @@ test_that("if nomatch='force_warning', run through with a warning", { psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), nomatch = "force_warning", n_synth = 10, parallel = FALSE), - "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.") + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all possible leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.") expect_true(all(!is.na(x_synth))) # No matching leaf case (finite bounds) psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), nomatch = "force_warning", n_synth = 10, parallel = FALSE), - "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.") + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Sampling from all leaves with equal probability \\(can be changed with 'nomatch' argument\\)\\.") expect_true(all(!is.na(x_synth))) }) @@ -60,14 +60,14 @@ test_that("if nomatch='na_warning', run through with a warning and return NA", { psi_no <- forde(arf, iris, finite_bounds = "no", parallel = FALSE) expect_warning(x_synth <- forge(psi_no, evidence = data.frame(Sepal.Length = 100), nomatch = "na_warning", n_synth = 10, parallel = FALSE), - "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Sampling from all leaves with equal probability\\.") + "All leaves have zero likelihood for some entered evidence rows\\. This is probably because evidence contains an \\(almost\\) impossible combination\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.") expect_true(all(is.na(x_synth[, -1]))) # No matching leaf case (finite bounds) psi_global <- forde(arf, iris, finite_bounds = "global", parallel = FALSE) expect_warning(x_synth <- forge(psi_global, evidence = data.frame(Sepal.Length = 100), nomatch = "na_warning", n_synth = 10, parallel = FALSE), - "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\.") + "For some entered evidence rows, no matching leaves could be found\\. This is probably because evidence lies outside of the distribution calculated by FORDE\\. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde\\(\\)\\. For categorical data, consider setting alpha>0 in forde\\(\\)\\. Returning NA for those rows \\(can be changed with 'nomatch' argument\\)\\.") expect_true(all(is.na(x_synth[, -1]))) }) From 8d294f2c22c79f57a90d2132c597ffe68b4f6c63 Mon Sep 17 00:00:00 2001 From: "Marvin N. Wright" Date: Thu, 13 Jun 2024 08:11:16 +0200 Subject: [PATCH 3/5] nomatch force for the no macthing leaf case --- R/utils.R | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/R/utils.R b/R/utils.R index a4e00b5f..be14f298 100644 --- a/R/utils.R +++ b/R/utils.R @@ -436,7 +436,15 @@ cforde <- function(params, } } conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] - relevant_leaves <- setorder(rbind(relevant_leaves, data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_))) + + if (grepl("^force", nomatch)) { + # All leaves + impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = forest$f_idx, f_idx_uncond = forest$f_idx) + } else { + # Set to NA -> no leaves -> Sample NA + impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_) + } + relevant_leaves <- setorder(rbind(relevant_leaves, impossible_leaves)) } } From 43e9f93610307fbad75422a2373d6023741f0d76 Mon Sep 17 00:00:00 2001 From: "Marvin N. Wright" Date: Thu, 13 Jun 2024 16:38:06 +0200 Subject: [PATCH 4/5] match.args and fix no matching leaf case --- R/expct.R | 1 + R/forge.R | 10 +--------- R/utils.R | 17 +++++------------ 3 files changed, 7 insertions(+), 21 deletions(-) diff --git a/R/expct.R b/R/expct.R index 6a11c101..26b569a1 100644 --- a/R/expct.R +++ b/R/expct.R @@ -94,6 +94,7 @@ expct <- function( parallel = TRUE) { evidence_row_mode <- match.arg(evidence_row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues variable <- tree <- f_idx <- cvg <- wt <- V1 <- value <- val <- family <- diff --git a/R/forge.R b/R/forge.R index 1ffd1cdb..dd62da06 100644 --- a/R/forge.R +++ b/R/forge.R @@ -111,6 +111,7 @@ forge <- function( parallel = TRUE) { evidence_row_mode <- match.arg(evidence_row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues tree <- cvg <- leaf <- idx <- family <- mu <- sigma <- prob <- dat <- @@ -181,15 +182,6 @@ forge <- function( } omega <- omega[wt > 0, ] - # Use random leaves if NA (no matching leaves found) - if (omega[, any(is.na(f_idx))] & omega[, any(!is.na(f_idx))]) { - row_idx <- sample(nrow(omega[!is.na(f_idx), ]), omega[, sum(is.na(f_idx))], replace = TRUE) - temp <- omega[!is.na(f_idx), ][row_idx, .(f_idx, f_idx_uncond)] - omega[is.na(f_idx), f_idx_uncond := temp[, f_idx_uncond]] - omega[is.na(f_idx), f_idx := temp[, f_idx]] - } - - # For each synthetic sample and condition, draw a leaf according to the leaf weights if (nrow(omega) == 1) { omega <- omega[rep(1, n_synth),][, idx := .I] diff --git a/R/utils.R b/R/utils.R index be14f298..3a32bfcb 100644 --- a/R/utils.R +++ b/R/utils.R @@ -423,6 +423,7 @@ cforde <- function(params, cat_new <- setcolorder(merge(relevant_leaves, updates_relevant_leaves$cat_new, by.x = c("c_idx", "f_idx_uncond"), by.y = c("c_idx", "f_idx"), sort = F), c("f_idx","c_idx","variable","val","prob","cvg_factor"))[] # Check for conditions with no matching leaves and handle this according to row_mode + conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] if (relevant_leaves[,uniqueN(c_idx)] < nconds_conditioned) { if (relevant_leaves[,uniqueN(c_idx)] == 0 & row_mode == "or") { stop("For all entered evidence rows, no matching leaves could be found. This is probably because evidence lies outside of the distribution calculated by FORDE. For continuous data, consider setting epsilon>0 or finite_bounds='no' in forde(). For categorical data, consider setting alpha>0 in forde().") @@ -435,15 +436,7 @@ cforde <- function(params, warning(paste(wrn, "Returning NA for those rows (can be changed with 'nomatch' argument).")) } } - conds_impossible <- conds_conditioned[!(conds_conditioned %in% relevant_leaves[,unique(c_idx)])] - - if (grepl("^force", nomatch)) { - # All leaves - impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = forest$f_idx, f_idx_uncond = forest$f_idx) - } else { - # Set to NA -> no leaves -> Sample NA - impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_) - } + impossible_leaves <- data.table(c_idx = conds_impossible, f_idx = NA_integer_, f_idx_uncond = NA_integer_) relevant_leaves <- setorder(rbind(relevant_leaves, impossible_leaves)) } } @@ -527,12 +520,12 @@ cforde <- function(params, } # Add all leaves for all-NA conditions to forest - if (row_mode == "separate" & (nconds != nconds_conditioned)) { - conds_unconditioned <- (1:nconds)[!(1:nconds) %in% conds_conditioned] + if ((grepl("^force", nomatch) & length(conds_impossible) > 0) | (row_mode == "separate" & nconds != nconds_conditioned)) { + conds_unconditioned <- c(conds_impossible, (1:nconds)[!(1:nconds) %in% conds_conditioned]) forest_new_unconditioned <- copy(forest) forest_new_unconditioned <- rbindlist(replicate(length(conds_unconditioned), forest, simplify = F)) forest_new_unconditioned[, `:=` (c_idx = rep(conds_unconditioned,each = nrow(forest)), f_idx_uncond = f_idx, cvg_arf = cvg)] - forest_new <- rbind(forest_new, forest_new_unconditioned) + forest_new <- rbind(forest_new, forest_new_unconditioned)[!is.na(f_idx), ] } setorder(setcolorder(forest_new,c("f_idx","c_idx","f_idx_uncond","tree","leaf","cvg_arf","cvg")), c_idx, f_idx, f_idx_uncond, tree, leaf) From 75e5e57100022d80f216a703dfdaa2cf1a8c7bb3 Mon Sep 17 00:00:00 2001 From: "Marvin N. Wright" Date: Thu, 13 Jun 2024 17:01:18 +0200 Subject: [PATCH 5/5] match.arg in cforde --- R/utils.R | 1 + 1 file changed, 1 insertion(+) diff --git a/R/utils.R b/R/utils.R index 3a32bfcb..2f788ff5 100644 --- a/R/utils.R +++ b/R/utils.R @@ -226,6 +226,7 @@ cforde <- function(params, parallel = TRUE) { row_mode <- match.arg(row_mode) + nomatch <- match.arg(nomatch) # To avoid data.table check issues . <- c_idx <- cvg <- cvg_arf <- cvg_factor <- f_idx <- f_idx_uncond <- i.max <-