Merge branch 'release-2.1.6'

tmatta · Jan 22, 2025 · da13d17 · da13d17
2 parents 43a70fe + 2bd104e
commit da13d17
Show file tree

Hide file tree

Showing 84 changed files with 636 additions and 589 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,6 +1,6 @@
 Package: lsasim
 Title: Functions to Facilitate the Simulation of Large Scale Assessment Data
-Version: 2.1.5
+Version: 2.1.6
 Authors@R: c(
   person("Tyler", "Matta",
          email = "[email protected]", role = "aut"),
@@ -30,12 +30,14 @@ Depends:
 License: GPL-3
 Encoding: UTF-8
 LazyData: true
-RoxygenNote: 7.3.1
+RoxygenNote: 7.3.2
 Suggests:
     testthat,
     knitr,
     formatR,
     rmarkdown,
     NAEPirtparams
 VignetteBuilder: knitr
-Date: 2024-05-06
+Roxygen: list(markdown = TRUE)
+Date: 2025-01-22
+Language: en-US
diff --git a/NAMESPACE b/NAMESPACE
@@ -52,4 +52,5 @@ importFrom(stats,rpois)
 importFrom(stats,runif)
 importFrom(stats,sd)
 importFrom(stats,setNames)
+importFrom(stats,var)
 importFrom(stats,weighted.mean)
diff --git a/NEWS.md b/NEWS.md
@@ -1,63 +1,49 @@
-# lsasim 2.1.5
+# lsasim 2.1.6
+
+* Improved output of `cluster_gen()`
+* Improved documentation of `cluster_gen()` (issue #52)
+* Improved Markdown formatting of documentation everywhere
+* Allow simultaneous specification of `cat_prop` and `rho` on `cluster_gen()` (issue #51)
 
-## Bug fixes
+# lsasim 2.1.5
 
 * Fixed `block_design()` for non-sequential items (issue #50)
 * Fixed typos in documentation
-* Reimplemented continuous integration routine (issue #47)
+* Re-implemented continuous integration routine (issue #47)
 
 # lsasim 2.1.4
 
-## Bug fixes
-
 * Fixed top-level package documentation (`?lsasim`)
 
 # lsasim 2.1.3
 
-## Bug fixes
-
 * Fixes generation of thresholds on `item_gen()` (issue #48)
 * Refactored usage of `class()` on if-statements
 
 # lsasim 2.1.2
 
-## Bug fixes
-
 * Refactoring to fix building on Fedora/Clang and M1-powered Macs
 
 # lsasim 2.1.1
 
-## Bug fixes
-
 * Fixes test unit dependency on GNU libiconv
 
 # lsasim 2.1.0
 
-## API changes
-
 * Adds `cluster_gen`, a function to generate background questionnaires (with `questionnaire_gen`) in a cluster sampling structure.
 * Package description now includes a literary reference for the theoretical framework behind the package (issue #38, pull request #42)
-
-## Bug fixes
-
 * Fixes item parameter generation by `irt_gen` when the guessing parameter is larger than zero (issue #40)
 
 # lsasim 2.0.2
 
-## Bug fixes
-
 * Solves [Issue #11](https://github.com/tmatta/lsasim/issues/11), which was causing `item_gen()` to produce out-of-bounds item difficulties in some situations.
 
 # lsasim 2.0.1
 
-## Bug fixes
-
 * Makes lsasim compliant with changes to be introduced in the next major release of R. lsasim 2.0.1 is functionally identical to 2.0.0, as all changes relate to how `if (class(x) == "matrix")` statements are written. Specifically, such statements were changed to `if (class(x)[1] == "matrix")` to accommodate cases where `x` may be a matrix with complex classes such as `"matrix" "array"`.
 
 # lsasim 2.0.0
 
-## API changes
-
 This release adds several features to the `questionnaire_gen` function, such as:
 
 * Generation of questionnaires given a much wider range of arguments.
@@ -71,20 +57,14 @@ Please read the function's help file for more details on how to access these new
 
 # lsasim 1.0.1
 
-## API changes
 * Added `cov_bounds` to `cor_gen`.
 * Removed `d1` from the returned data frame of `item_gen` when `thresholds = 1`.
 
-
 # lsasim 1.0.1.9000
 
-## API changes
 * Added `item_no` to `response_gen`.
-
-## Bug fixes
 * `response_gen` can now handle item subsets.
 
-
 # lsasim 1.0.0
 
 * Launched
diff --git a/R/anova.R b/R/anova.R
@@ -7,6 +7,7 @@
 #' @return Printed ANOVA table or list of parameters
 #' @note  If the rhos for different levels are varied in scale, the generated rho will be less accurate.
 #' @references Snijders, T. A. B., & Bosker, R. J. (1999). Multilevel Analysis. Sage Publications.
+#' @seealso [summary.lsasimcluster()]
 #' @importFrom stats anova
 #' @method anova lsasimcluster
 #' @export

diff --git a/R/attribute_cluster_labels.R b/R/attribute_cluster_labels.R
@@ -1,6 +1,6 @@
 #' @title Attribute Labels in Hierarchical Structure
 #' @description Attributes cluster and respondent labels in the context of `cluster_gen`.
-#' @seealso cluster_gen
+#' @seealso [cluster_gen()]
 #' @param n numeric vector or list
 #' @return list containing appropriate labels for the clusters and their respondents
 attribute_cluster_labels <- function(n)

diff --git a/R/beta_gen.R b/R/beta_gen.R
@@ -64,7 +64,7 @@
 #'   previous paragraph. The second one, called \code{vcov_YXW}, contains
 #'   the covariance matrix of the regression coefficients.
 #' @note The equation on this page is best formatted in PDF. We recommend issuing `help("beta_gen", help_type = "PDF")` in your terminal and opening the `beta_gen.pdf` file generated in your working directory. You may also set `help_type = "HTML"`, but the equations will have degraded formatting.
-#' @seealso questionnaire_gen
+#' @seealso [questionnaire_gen()]
 #' @export
 #' @examples
 #'

diff --git a/R/brr.R b/R/brr.R
@@ -8,7 +8,7 @@
 #' @param drop if `TRUE`, the observation that will not be part of the subsample is dropped from the dataset. Otherwise, it stays in the dataset but a new weight column is created to differentiate the selected observations
 #' @param id_col number of column in dataset containing subject IDs. Set 0 to use the row names as ID
 #' @return a list containing all the BRR replicates of `data`
-#' @seealso jackknife
+#' @seealso [jackknife()]
 #' @note PISA uses the BRR Fay method with \eqn{k = 0.5}.
 #' @references
 #' OECD (2015). Pisa Data Analysis Manual.

diff --git a/R/calc_n_tilde.R b/R/calc_n_tilde.R
@@ -1,6 +1,6 @@
 #' @title Calculate ñ
 #' @description Calculates n tilde
-#' @seealso ?lsasim:::summary.lsasimcluster
+#' @seealso [summary.lsasimcluster()]
 #' @param M total population size (i.e., sum of n_j over all j)
 #' @param N number of classes j
 #' @param n_j vector with size of each class j
@@ -11,4 +11,4 @@ calc_n_tilde <- function(M, N, n_j) {
     s2_n_j <- sum((n_j - n_bar) ^ 2) / (N - 1)
     n_tilde <- n_bar - s2_n_j / (N * n_bar)
     return(n_tilde)
-}
+}
diff --git a/R/calc_replicate_weights.R b/R/calc_replicate_weights.R
@@ -5,7 +5,7 @@
 #' @param k deflating weight factor (used only when `method = "BRR Fay"`)
 #' @details Replicate weights can be calculated using the Jackknife for unstratified two-stage sample designs or Balanced Repeated Replication (BRR) with or without Fay's modification.
 #' According to OECD (2015), PISA uses the Fay method with a factor of 0.5. This is why `k = .5` by default.
-#' @seealso cluster_gen jackknife, jackknife_var
+#' @seealso [cluster_gen()] [jackknife()]
 #' @references
 #' OECD (2015). Pisa Data Analysis Manual.
 #' Rust, K. F., & Rao, J. N. K. (1996). Variance estimation for complex surveys using replication techniques. Statistical methods in medical research, 5(3), 283-310.

diff --git a/R/calc_se_rho.R b/R/calc_se_rho.R
@@ -1,5 +1,5 @@
 #' @title Calculate Standard Error of Intraclass Correlation
-#' @seealso anova.lsasimcluster
+#' @seealso [anova.lsasimcluster()]
 #' @param rho intraclass correlation
 #' @param n_j number of elements in class j
 #' @param N number of classes j
@@ -14,4 +14,4 @@ calc_se_rho <- function(rho, n_j, N) {
         # TODO: Implement equation (6.1) from Donner (1986) - issue #22
     }
     return(se_rho)
-}
+}
diff --git a/R/calc_var_between.R b/R/calc_var_between.R
@@ -5,7 +5,7 @@
 #' @param n_tilde function of the variance of n_N, M and N. See documentation and code of \code{lsasim:::summary.lsasimcluster} for details
 #' @param N number of classes j
 #' @references Snijders, T. A. B., & Bosker, R. J. (1999). Multilevel Analysis. Sage Publications.
-#' @seealso anova.lsasimcluster
+#' @seealso [anova.lsasimcluster()]
 calc_var_between <- function(n_j, y_bar_j, y_bar, n_tilde, N) {
     if (!is(y_bar_j[1], "matrix")) {
         y_bar_j <- as.matrix(y_bar_j)

diff --git a/R/calc_var_tot.R b/R/calc_var_tot.R
@@ -1,5 +1,5 @@
 #' @title Calculate the total variance
-#' @seealso anova.lsasimcluster
+#' @seealso [anova.lsasimcluster()]
 #' @param n_tilde function of the variance of n_N, M and N. See documentation and code of \code{lsasim:::summary.lsasimcluster} for details
 #' @param M total sample size
 #' @param N number of classes j
@@ -18,4 +18,4 @@ calc_var_tot <- function(M, N, n_tilde, s2_within, s2_between) {
     }
     names(s2_tot) <- X
     return(s2_tot)
-}
+}
diff --git a/R/calc_var_within.R b/R/calc_var_within.R
@@ -1,5 +1,5 @@
 #' @title Calculate variance within classes
-#' @seealso anova.lsasimcluster
+#' @seealso [anova.lsasimcluster()]
 #' @param n_j number of elements in class j
 #' @param M total sample size
 #' @param N number of classes j
@@ -13,4 +13,4 @@ calc_var_within <- function(n_j, s2_j, M, N) {
     }
     names(s2_within) <- X
     return(s2_within)
-}
+}
diff --git a/R/check_n_N_class.R b/R/check_n_N_class.R
@@ -1,5 +1,5 @@
 #' @title Check class of n or N
-#' @seealso cluster_gen
+#' @seealso [cluster_gen()]
 #' @description Check the class of an object (usually n and N from `cluster_gen`)
 #' @param x either n or N from `cluster_gen`
 #' @note This function is primarily used as a way to simplify the classification of n and N in the `cluster_gen` function.

diff --git a/R/check_valid_structure.R b/R/check_valid_structure.R
@@ -2,7 +2,7 @@
 #' @description Checks if a list has a proper structure to be transformed into a hierarchical structure
 #' @param n list
 #' @return Error if the structure is improper. Otherwise, there's no output.
-#' @seealso check_condition
+#' @seealso [check_condition()]
 check_valid_structure <- function(n)
 {
     for (l in seq(length(n) - 1)) {
@@ -18,4 +18,4 @@ check_valid_structure <- function(n)
       )
     )
   }
-}
+}
diff --git a/R/cluster_gen.R b/R/cluster_gen.R
@@ -1,22 +1,22 @@
 #' @title Generate cluster sample
-#' @param n numeric vector with the number of sampled observations (clusters or subjects) on each level
+#' @param n numeric vector or list with the number of sampled observations (clusters or subjects) on each level
+#' @param N population size of each *sampled* cluster element on each level. Either a numeric vector or a list of numeric vectors. If `N` is a list, it must have the same length as `n` and each element of `N` must have the same length as the corresponding element of `n`
 #' @param cluster_labels character vector with the names of each cluster level
 #' @param resp_labels character vector with the names of the questionnaire respondents on each level
 #' @param collapse if `TRUE`, function output contains only one data frame with all answers. It can also be "none", "partial" and "full" for finer control on 3+ levels
 #' @param separate_questionnaires if `TRUE`, each level will have its own questionnaire
-#' @param N list of numeric vector with the population size of each *sampled* cluster element on each level
 #' @param calc_weights if `TRUE`, sampling weights are calculated
 #' @param sum_pop total population at each level (sampled or not)
 #' @param n_X list of `n_X` per cluster level
 #' @param n_W list of `n_W` per cluster level
 #' @param cat_prop list of cumulative proportions for each item. If \code{theta
 #'   = TRUE}, the first element of \code{cat_prop} must be a scalar 1, which
 #'   corresponds to the \code{theta}.
-#' @param c_mean vector of means for the continuous variables or list of vectors for the continuous variables for each level. Defaults to 0, but can change if `rho` is set.
-#' @param sigma vector of standard deviations for the continuous variables or list of vectors for the continuous variables for each level. Defaults to 1, but can change if `rho` is set.
+#' @param c_mean vector of means for the continuous variables or list of vectors for the continuous variables for each level. Defaults to 0, but may change if `rho` is set.
+#' @param sigma vector of standard deviations for the continuous variables or list of vectors for the continuous variables for each level. Defaults to 1, but may change if `rho` is set.
 #' @param cor_matrix Correlation matrix between all variables (except weights). By default, correlations are randomly generated.
-#' @param sampling_method can be "SRS" for Simple Random Sampling or "PPS" for Probabilities Proportional to Size
-#' @param rho estimated intraclass correlation
+#' @param sampling_method can be "SRS" for Simple Random Sampling, "PPS" for Probabilities Proportional to Size, "mixed" to use PPS for schools and SRS otherwise, or a vector with the sampling method for each level
+#' @param rho intraclass correlation (scalar, vector or list, as appropriate)
 #' @param theta if \code{TRUE}, the first continuous variable will be labeled
 #'   'theta'. Otherwise, it will be labeled 'q1'.
 #' @param verbose if `TRUE`, prints output messages
@@ -47,7 +47,7 @@
 #'   generated using the polychoric correlation matrix, with no distributional
 #'   assumptions.
 #'
-#' @seealso cluster_estimates cluster_gen_separate cluster_gen_together questionnaire_gen
+#' @seealso [cluster_gen_separate()] [cluster_gen_together()] [questionnaire_gen()]
 #' @export
 #' @examples
 #' # Simple structure of 3 schools with 5 students each
@@ -319,3 +319,16 @@ repeatXW <- function(n_X, n_W, n_levels) {
   }
   return(list(n_X = n_X, n_W = n_W))
 }
+
+get_n_X_from_cat_prop <- function(cat_prop, n_X = NULL) {
+  # Derive n_X from cat_prop by counting the elements of cat_prop whose entries all equal 1
+  if (!is.null(cat_prop)) {
+    if (is.null(n_X)) {
+      sum(vapply(cat_prop, function(x) all(x == 1), logical(1))) # n_X not given: count the all-1 elements
+    } else {
+      NULL # both cat_prop and n_X given: return NULL
+    }
+  } else {
+    n_X # no cat_prop: return n_X unchanged (possibly NULL)
+  }
+}
diff --git a/R/cluster_gen_separate.R b/R/cluster_gen_separate.R
@@ -1,29 +1,13 @@
 #' @title Generate cluster samples with individual questionnaires
 #' @description This is a sub-function of `cluster_gen` that performs cluster sampling, with the twist that each cluster level has its own questionnaire.
-#' @param n_levels number of cluster levels
-#' @param n numeric vector with the number of sampled observations (clusters or subjects) on each level
-#' @param cluster_labels character vector with the names of each cluster level
-#' @param resp_labels character vector with the names of the questionnaire respondents on each level
-#' @param collapse if `TRUE`, function output contains only one data frame with all answers
-#' @param N list of numeric vector with the population size of each *sampled* cluster element on each level
-#' @param sum_pop total population at the lowest level (sampled or not)
-#' @param calc_weights if `TRUE`, sampling weights are calculated
-#' @param sampling_method can be "SRS" for Simple Random Sampling or "PPS" for Probabilities Proportional to Size, "mixed" to use SRS for students and PPS otherwise or a vector with the sampling method for each level
-#' @param n_X list of `n_X` per cluster level
-#' @param n_W list of `n_W` per cluster level
-#' @param cat_prop list of cumulative proportions for each item. If \code{theta
-#'   = TRUE}, the first element of \code{cat_prop} must be a scalar 1, which
-#'   corresponds to the \code{theta}.
+#' @inheritParams cluster_gen
+#' @param cor_matrix Correlation matrix between all variables (except weights)
 #' @param c_mean vector of means for the continuous variables or list of vectors for the continuous variables for each level
 #' @param sigma vector of standard deviations for the continuous variables or list of vectors for the continuous variables for each level
-#' @param cor_matrix Correlation matrix between all variables (except weights)
-#' @param verbose if `TRUE`, prints output messages
-#' @param rho estimated intraclass correlation
-#' @param theta if \code{TRUE}, the first continuous variable will be labeled
-#'   'theta'. Otherwise, it will be labeled 'q1'.
+#' @param n_levels number of cluster levels
 #' @param whitelist used when `n = select(...)`, determines which PSUs get to generate questionnaires
 #' @param ... Additional parameters to be passed to `questionnaire_gen()`
-#' @seealso cluster_gen cluster_gen_together
+#' @seealso [cluster_gen()] [cluster_gen_together()]
 #' @importFrom stats rchisq
 #' @importFrom methods is
 #' @export
@@ -78,10 +62,15 @@ cluster_gen_separate <- function(
 
     ## Defining parameters for intraclass correlations -------------------------
     if (!is.null(rho)) {
+      if (is.null(n_X)) {
+        n_X <- get_n_X_from_cat_prop(cat_prop)
+      }
 
       ### Expanding rho to n_level width .......................................
       if (!is(rho, "list")) rho <- replicate(n_levels, list(rho))
-      if (length(rho[[l]]) == 1) rho[[l]] <- rep(rho[[l]], n_X[[l]] + theta)
+      if (length(rho[[l]]) == 1) {
+        rho[[l]] <- rep(rho[[l]], n_X[[l]] + theta)
+      }
 
       ### Defining sigma2 and tau2 .............................................
       n_j <- n[[l + 1]]
@@ -109,6 +98,7 @@ cluster_gen_separate <- function(
       ### Defining the group correlations (s2_j == s2 for all j) ...............
       Nn <- length(n_j)
       s2 <- sigma2 * (M - Nn) / sum(n_j - 1)
+      n_X <- get_n_X_from_cat_prop(cat_prop, n_X) # it is not needed anymore. Keeping it triggers warnings
     }
 
     ## Generating questionnaires for each cluster element of that level --------