Debug random number generator, and condense documentation of population

segmentation constants
joshkyh · May 15, 2017 · 53b067b · 53b067b
1 parent fd940c4
commit 53b067b
Show file tree

Hide file tree

Showing 71 changed files with 1,441 additions and 681 deletions.
diff --git a/ChangeLog b/ChangeLog
@@ -1,3 +1,3 @@
-2017-05-08: Google <[email protected]>
-	* Version 1.0.0
-	* Initial commit.
+2017-05-15: Google <[email protected]>
+    * Version 1.0.0
+    * Initial commit.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: amss
 Version: 1.0.0
-Date: 2017-05-07
+Date: 2017-05-15
 Title: Agreggate Marketing System Simulator
 Author: Google Inc. <[email protected]>
 Maintainer: Google Inc. <[email protected]>

diff --git a/NAMESPACE b/NAMESPACE
@@ -5,13 +5,11 @@ export(DefaultNatMigModule)
 export(DefaultSalesModule)
 export(DefaultSearchMediaModule)
 export(DefaultTraditionalMediaModule)
-export(HillTrans)
 export(SimulateAMSS)
 export(SimulateAR1)
 export(SimulateCorrelated)
 export(SimulateDummy)
 export(SimulateSinusoidal)
-export(amss.sim)
 export(kActivityStates)
 export(kAllStates)
 export(kAvailabilityStates)

diff --git a/R/attribution.R b/R/attribution.R
@@ -42,12 +42,18 @@ utils::globalVariables(c("pop", "revenue", "rep.index", "V1", "total.spend"))
 #'   the modified budget periods in order to include lagged effects in the
 #'   calculation.
 #' @param scaled.pop.size \code{CalculateROAS} scales up the population size to
-#'   reduce the variability of its estimates. This number should be chosen
-#'   to be as large as possible while avoiding integer overflow during
-#'   data simulation.
+#'   reduce the variability of its estimates. This is equivalent to running
+#'   the simulation for multiple repetitions to reduce variability. The
+#'   default value should provide sufficient accuracy in most use cases.
+#'   Extremely large values may result in numerical issues.
 #' @param min.reps integer representing the initial number of datasets to
-#'   generate from each budget setting. A reasonable number of initial
-#'   datasets is needed to estimate the amount of variability accurately
+#'   generate from each budget setting. The default value of 2 allows the user
+#'   to make a rough check that the accuracy is indeed good under the chosen
+#'   settings. This default was chosen under the assumption that the default
+#'   \code{scaled.pop.size} is large enough to accurately measure ROAS using 1
+#'   repetition, with the 2nd being used as confirmation of the accuracy. Higher
+#'   precision and more accurate measurement of the precision can be achieved
+#'   with more repetitions.
 #' @param max.coef.var numeric, the target coefficient of variation. The
 #'   function takes additional samples of the ROAS until it runs out of
 #'   time, attains the target coefficient of variation, or attains the
@@ -60,7 +66,9 @@ utils::globalVariables(c("pop", "revenue", "rep.index", "V1", "total.spend"))
 #'   function from taking additional samples beyond the initial sample
 #'   generated according to \code{min.reps}. The function takes additional
 #'   samples of the ROAS until it runs out of time, attains the target
-#'   coefficient of variation, or attains the target margin of error.
+#'   coefficient of variation, or attains the target margin of error. The
+#'   default value of 0 forces the function to use precisely \code{min.reps}
+#'   repetitions.
 #' @param verbose boolean. If TRUE, output measures of the accuracy of the
 #'   reported ROAS, including the full sample of ROAS values.
 #' @return numeric value for ROAS, or, if \code{verbose = TRUE}, a list with
@@ -76,9 +84,9 @@ CalculateROAS <- function(
     budget.proportion = rep(0, length(media.names)),
     t.start = 1,
     t.end = object$params$time.n,
-    scaled.pop.size = .Machine$integer.max / 100,
-    min.reps = 10,
-    max.coef.var = 0.01, max.margin.error = 0.01, max.time = 30,
+    scaled.pop.size = 1e18,
+    min.reps = 2,
+    max.coef.var = 0.01, max.margin.error = 0.01, max.time = 0,
     verbose = FALSE) {
 
   if (is.null(object$params$media.names) ||
@@ -91,11 +99,11 @@ CalculateROAS <- function(
   assertthat::assert_that(inherits(object, "amss.sim"))
 
   # Get the original budget settings.
-  orig.budget <- .GetBudget(object)
+  orig.budget <- GetBudget(object)
 
   # Check the other parameters.
   if (!is.null(new.budget)) {
-    assertthat::assert_that(dim(new.budget) == dim(orig.budget))
+    assertthat::assert_that(all(dim(new.budget) == dim(orig.budget)))
   }
   assertthat::assert_that(all(media.names %in% object$params$media.names))
   assertthat::assert_that(all(budget.periods >= 1))
@@ -123,26 +131,26 @@ CalculateROAS <- function(
   # Modified the object with an updated population size, to reduce runtime.
   # Note that, since only values in column "pop" affect the values in the
   # regenerated data. That is the only column that needs updating.
-  scaled.pop.size <- as.integer(scaled.pop.size)
-  orig.pop <- .GetPopulation(object)
-  pop.multiplier <- scaled.pop.size / orig.pop
+  orig.pop <- GetPopulation(object)
   mod.object <- data.table::copy(object)
   lapply(mod.object$data.full,
-         function(dt) dt[, pop := .AdjustPopulation(pop, scaled.pop.size)])
+         function(dt) dt[, pop := AdjustPopulation(pop, scaled.pop.size)])
+  scaled.pop.size <- mod.object$data.full[[1]][, sum(pop)]
+  pop.multiplier <- scaled.pop.size / orig.pop
   mod.object$params$nat.mig.params$population <- scaled.pop.size
 
   # Generate data from counterfactual budget settings.
   start.time <- Sys.time()
-  new.data <- .GenerateDataUnderNewBudget(
+  new.data <- GenerateDataUnderNewBudget(
       mod.object, new.budget * pop.multiplier,
       reps = min.reps, t.start = t.start, t.end = t.end)
   # Generate more data from the original budget settings to average out noise.
-  orig.data <- .GenerateDataUnderNewBudget(
+  orig.data <- GenerateDataUnderNewBudget(
       mod.object, orig.budget * pop.multiplier,
       reps = min.reps, t.start = t.start, t.end = t.end)
 
   # Calculate the ROAS based on the current sample.
-  roas.sample <- .CalculateSampleROAS(new.data, orig.data)
+  roas.sample <- CalculateSampleROAS(new.data, orig.data)
   # Get the estimate.
   roas.est <- mean(roas.sample)
   # Get its precision as a standard error.
@@ -168,14 +176,14 @@ CalculateROAS <- function(
     print(paste("Running additional sample", counter, "for more accuracy."))
     print(paste(round(difftime(Sys.time(), start.time, units = "mins")),
                 "minutes elapsed."))
-    new.data.1 <- .GenerateDataUnderNewBudget(
+    new.data.1 <- GenerateDataUnderNewBudget(
       object = mod.object, new.budget = new.budget * pop.multiplier,
       t.start = t.start, t.end = t.end)
-    orig.data.1 <- .GenerateDataUnderNewBudget(
+    orig.data.1 <- GenerateDataUnderNewBudget(
         object = mod.object, new.budget = orig.budget * pop.multiplier,
         t.start = t.start, t.end = t.end)
     roas.sample <- c(roas.sample,
-                     .CalculateSampleROAS(new.data.1, orig.data.1))
+                     CalculateSampleROAS(new.data.1, orig.data.1))
     roas.est <- mean(roas.sample)
     roas.standard.error <- sd(roas.sample) / sqrt(length(roas.sample))
     roas.margin.error <- roas.standard.error * qt(0.975, length(roas.sample))
@@ -209,8 +217,9 @@ CalculateROAS <- function(
 #'   match.
 #' @return vector of ROAS estimates from each pair of sample datasets in dt1
 #'   and dt2.
+#' @keywords internal
 
-.CalculateSampleROAS <- function(dt1, dt2) {
+CalculateSampleROAS <- function(dt1, dt2) {
 
   return((dt1[, sum(revenue), by = rep.index][, V1] -
           dt2[, sum(revenue), by = rep.index][, V1]) /
@@ -233,10 +242,11 @@ CalculateROAS <- function(
 #' @param t.start time point at which to start generating new data
 #' @param t.end time point at which to stop generating new data
 #' @return The list of all generated datasets or a vector of averaged values.
+#' @keywords internal
 
-.GenerateDataUnderNewBudget <- function(
+GenerateDataUnderNewBudget <- function(
     object,
-    new.budget = .GetBudget(object),
+    new.budget = GetBudget(object),
     response.metric = NULL,
     reps = 1,
     t.start = 1,
@@ -268,37 +278,32 @@ CalculateROAS <- function(
   # Generate data.
   new.data <- lapply(
       1:reps,
-      function(r) .SurfaceData(
-          do.call(`.SimulateData`, new.params[formalArgs(`.SimulateData`)]),
+      function(r) SurfaceData(
+          do.call(`SimulateData`, new.params[formalArgs(`SimulateData`)]),
           new.params$names.agg.const,
           new.params$names.agg.sum)[, rep.index := r])
   new.data <- rbindlist(new.data)
   if (is.null(response.metric)) {
     return(new.data)
   } else {
     return(new.data[,
-                    eval(.ParseT(paste0("mean(", response.metric, ")"))),
-                    by = c("time.index", "geo.index")][, V1])
+                    eval(ParseT(paste0("mean(", response.metric, ")"))),
+                    by = "time.index"][, V1])
   }
 }
 
 #' Adjust population vector to a new total
 #'
-#' Adjust population proportionally to a new total population size, rounding
-#' by the largest remainder method
+#' Adjust population proportionally to a new total population size, rounding up
+#' when necessary.
 #'
 #' @param orig.pop original population vector
 #' @param new.total.pop new total population size
 #' @return new population vector of class integer
+#' @keywords internal
 
-.AdjustPopulation <- function(orig.pop, new.total.pop) {
+AdjustPopulation <- function(orig.pop, new.total.pop) {
 
-  pop.multiplier <- new.total.pop / sum(orig.pop)
-  precise.pop <- orig.pop * pop.multiplier
-  quotient.pop <- floor(precise.pop)
-  remainder.pop <- precise.pop - quotient.pop
-  round.up.idx <- order(
-      remainder.pop,
-      decreasing = TRUE)[1:(new.total.pop - sum(quotient.pop))]
-  return(as.integer(quotient.pop + (1:length(quotient.pop) %in% round.up.idx)))
+  pop.multiplier <- ceiling(new.total.pop / sum(orig.pop))
+  return(orig.pop * pop.multiplier)
 }
diff --git a/R/migrate.R b/R/migrate.R
@@ -26,8 +26,9 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
 #'   transition matrix whose dimensionality matches the number of states
 #'   in vector.counts is used.
 #' @return number of individuals in each state after migration
+#' @keywords internal
 
-.ApplyTransitionMatrix <- function(vector.counts, transition.matrices) {
+ApplyTransitionMatrix <- function(vector.counts, transition.matrices) {
 
   # Given a transition matrix, simulate the migration process.
   if (!is.list(transition.matrices)) {
@@ -45,7 +46,7 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
     migrated.pop <- integer(length(vector.counts))
     for (iter in 1:length(vector.counts)) {
       migrated.pop <- migrated.pop +
-          drop(rmultinom(1, vector.counts[iter], transition.matrices[iter, ]))
+          drop(RMultinom(1, vector.counts[iter], transition.matrices[iter, ]))
     }
 
     # Return new population segmentation.
@@ -58,7 +59,7 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
   num.states <- sapply(transition.matrices, nrow)
   transition.matrix <-
       transition.matrices[[match(length(vector.counts), num.states)]]
-  return(.ApplyTransitionMatrix(vector.counts, transition.matrix))
+  return(ApplyTransitionMatrix(vector.counts, transition.matrix))
 }
 
 #' Simulate migration in a single dimension of population segmentation.
@@ -73,10 +74,11 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
 #'   kAllStates.
 #' @param transition.matrix transition matrix specifying probabilities of
 #'   migration between states.
-#' @return invisible(NULL). data.dt is updated by reference.
+#' @return \code{invisible(NULL)}. \code{data.dt} is updated by reference.
+#' @keywords internal
 
-.MigrateMarginal <- function(data.dt, migrating.pop.size,
-                             migration.dim, transition.matrix) {
+MigrateMarginal <- function(data.dt, migrating.pop.size,
+                            migration.dim, transition.matrix) {
 
   # Check input.
   func.env <- environment()
@@ -123,7 +125,7 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
     changing.dimensions <- migration.dim
   }
   data.dt[,
-          pop.in := .ApplyTransitionMatrix(pop.out, transition.matrices),
+          pop.in := ApplyTransitionMatrix(pop.out, transition.matrices),
           by = eval(setdiff(key(data.dt), changing.dimensions))]
   data.dt[, pop := pop - pop.out + pop.in]
   return(invisible(NULL))
@@ -141,19 +143,20 @@ utils::globalVariables(c("pop", "pop.in", "pop.out"))
 #' @param migration.dims a character vector of dimensions of migration, by
 #'   name.
 #' @param transition.matrices a list of transition matrices for each dimension.
-#' @return invisible(NULL). data.dt is updated by reference.
+#' @return \code{invisible(NULL)}. \code{data.dt} is updated by reference.
+#' @keywords internal
 
-.MigrateMultiple <- function(data.dt, migrating.pop.size,
-                             migration.dims = character(),
-                             transition.matrices = list()) {
+MigrateMultiple <- function(data.dt, migrating.pop.size,
+                            migration.dims = character(),
+                            transition.matrices = list()) {
 
   # Check input.
   stopifnot(length(transition.matrices) == length(migration.dims))
 
   # Perform each migration in sequence.
   if (length(migration.dims) > 0) {
     for (iter.dim in 1:length(migration.dims)) {
-      .MigrateMarginal(data.dt, migrating.pop.size, migration.dims[iter.dim],
+      MigrateMarginal(data.dt, migrating.pop.size, migration.dims[iter.dim],
                       transition.matrices[[iter.dim]])
       migrating.pop.size <- data.dt[, pop.in]
     }