Merge pull request #18 from EdoardoCostantini:develop
deploy 0.9.1
EdoardoCostantini authored Jul 21, 2023
2 parents 07b6b59 + d9a9875 commit c8a1094
Showing 90 changed files with 11,951 additions and 1,234 deletions.
9 changes: 7 additions & 2 deletions DESCRIPTION
@@ -1,17 +1,21 @@
Package: gspcr
Title: Generalized Supervised Principal Component Regression
Version: 0.0.0.9000
Version: 0.9.1
Authors@R:
person("Edoardo", "Costantini", , "[email protected]", role = c("aut", "cre"),
comment = c(ORCID = "YOUR-ORCID-ID"))
Description: The sparse principal component regression is computed. The regularization parameters and number of components are optimized by cross-validation.
License: MIT + file LICENSE
Encoding: UTF-8
Roxygen: list(markdown = TRUE)
RoxygenNote: 7.2.2
RoxygenNote: 7.2.3
Suggests:
klippy,
knitr,
lmtest,
patchwork,
rmarkdown,
superpc,
testthat (>= 3.0.0)
Config/testthat/edition: 3
Depends:
@@ -24,6 +28,7 @@ Imports:
MASS,
MLmetrics,
nnet,
PCAmixdata,
reshape2,
rlang
VignetteBuilder: knitr
2 changes: 0 additions & 2 deletions LICENSE

This file was deleted.

16 changes: 15 additions & 1 deletion NAMESPACE
@@ -1,15 +1,29 @@
# Generated by roxygen2: do not edit by hand

S3method(plot,gspcrout)
S3method(plot,gspcrcv)
S3method(predict,gspcrout)
export(LL_baseline)
export(LL_binomial)
export(LL_cumulative)
export(LL_gaussian)
export(LL_newdata)
export(LL_poisson)
export(compute_sc)
export(cp_AIC)
export(cp_BIC)
export(cp_F)
export(cp_LRT)
export(cp_gR2)
export(cp_pc_scores)
export(cp_thrs_LLS)
export(cp_thrs_NOR)
export(cp_thrs_PR2)
export(cp_validation_fit)
export(cv_average)
export(cv_choose)
export(cv_gspcr)
export(est_gspcr)
export(est_univ_mods)
export(pca_mix)
importFrom(dplyr,"%>%")
importFrom(rlang,.data)
19 changes: 19 additions & 0 deletions R/CFA-data.R
@@ -0,0 +1,19 @@
#' CFA example data
#'
#' Contains a data set used to develop and test the main features of the \code{gspcr} package. The data contains 30 predictors generated based on a known number of underlying principal components.
#'
#' @name CFA_data
#' @docType data
#' @format \code{CFA_data} is a list containing two objects:
#' - \code{X}: A data.frame with 5000 rows (observations) and 30 columns (possible predictors). This data was generated based on a CFA model describing 10 independent latent variables measured by 3 items each, with a factor loading matrix describing a simple structure.
#' - \code{y}: A numeric vector of length 1000. This variable was generated as a linear combination of 5 of the latent variables used to generate \code{X}.
#' @details
#' A supervised PCA approach should identify that only 5 components are useful for the prediction of \code{y} and that only the first 15 variables should be used to compute them.
#' @keywords datasets
#' @examples
#' # Check out the first 6 rows of the predictors
#' head(CFA_data$X)
#'
#' # Check out first 6 elements of the dependent variable
#' head(CFA_data$y)
NULL
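The intended use of this data set can be sketched with base R alone. This is a hedged illustration, not part of the commit: it assumes \code{CFA_data} is available from the package and that \code{y} aligns row-wise with \code{X}.

```r
# Hedged sketch: checking the documented structure of CFA_data with base R.
# Assumes the gspcr package is installed and CFA_data$y aligns row-wise
# with CFA_data$X.
library(gspcr)

# Principal components of the predictors
pca <- prcomp(CFA_data$X, scale. = TRUE)

# Variance explained per component; a supervised approach should find
# that only about 5 components are useful for predicting y
summary(pca)$importance["Proportion of Variance", 1:10]

# Regress y on the first 5 component scores
fit <- lm(CFA_data$y ~ pca$x[, 1:5])
summary(fit)$r.squared
```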
28 changes: 20 additions & 8 deletions R/GSPCRexdata.R
@@ -1,17 +1,29 @@
#' GSPCR example data
#'
#' A data.frame with a dependent variable and 50 predictors generated based on true principal components.
#' Contains a data set used to develop and test the main features of the \code{gspcr} package. The data contains a dependent variable and 50 predictors generated based on a known number of underlying principal components.
#'
#' @name GSPCRexdata
#' @docType data
#' @format \code{GSPCRexdata} is a list containing two data.frame objects:
#' \describe{
#' \item{X}{A data.frame with 1000 rows and 50 columns of possible predictors. These predictors were generated such that 30% of their total variability could be explained by 5 principal components.}
#' \item{y}{A data.frame with 1000 rows and 4 columns. The first column \code{cont} is a continuous variable produced using a linear model with the first two PCs underlying \code{X} as a data-generating model. The other columns are transformed versions of \code{cont} to match common discrete target distribution in the social sciences.}
#' }
#' - \code{X}: A list of data.frames with 1000 rows (observations) and 50 columns (possible predictors). The list contains matrices storing data coded with different measurement levels:
#' - \code{cont} with 50 continuous variables
#' - \code{bin} with 50 binary variables (factors)
#' - \code{ord} with 50 ordinal variables (ordered factors)
#' - \code{cat} with 50 categorical variables (unordered factors)
#' - \code{mix} with 20 continuous variables, 10 binary variables (factors), 10 ordinal variables (ordered factors), 10 categorical variables (unordered factors).
#' - \code{y}: A data.frame with 1000 rows and 5 columns. The first column \code{cont} is a continuous variable produced using a linear model with the first two PCs underlying \code{X} as a data-generating model.
#' The other columns are transformed versions of \code{cont} to match common discrete target distributions in the social sciences.
#' These are the variables stored:
#' - \code{cont} continuous dependent variable (numeric vector)
#' - \code{bin} binary dependent variable (factor)
#' - \code{ord} ordinal dependent variable (ordered factor)
#' - \code{cat} nominal dependent variable (unordered factor)
#' - \code{pois} count dependent variable (numeric vector)
#' @keywords datasets
#' @examples
#'
#' data <- GSPCRexdata
#' head(GSPCRexdata)
#' # Check out the first 6 rows of the continuous predictors
#' head(GSPCRexdata$X$cont)
#'
#' # Check out first 6 rows of the dv data.frame
#' head(GSPCRexdata$y)
NULL
4 changes: 2 additions & 2 deletions R/LL_baseline.R
@@ -11,8 +11,8 @@
#' A disjunctive table is a matrix representation of a multi-categorical variable. The dimensionality of the matrix is i times j, with i = number of observations, and j = number of categories. \code{y_{ij}} is equal to 1 if observation i responded with category j, and it is equal to 0 otherwise.
#' The log-likelihood equation is based on Agresti (2002, p. 192).
#' @return A list containing:
#' - \code{ll}, an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc}, a numeric matrix containing the systematic component for the input \code{x} and \code{mod}.
#' - \code{ll} atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc} numeric matrix containing the systematic component for the input \code{x} and \code{mod}.
#' @author Edoardo Costantini, 2023
#' @references
#'
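The disjunctive table described in the details above can be built with base R; a minimal sketch (the \code{gspcr} internals may construct it differently):

```r
# Minimal sketch of a disjunctive table: an i-by-j indicator matrix
# for a multi-categorical variable (i observations, j categories).
y <- factor(c("a", "b", "a", "c"))

# One column per category; entry (i, j) is 1 when observation i
# responded with category j, and 0 otherwise
D <- model.matrix(~ y - 1)
colnames(D) <- levels(y)
D
```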
4 changes: 2 additions & 2 deletions R/LL_binomial.R
@@ -9,8 +9,8 @@
#' If \code{x} and \code{y} are equal to the data on which \code{mod} has been trained, this function returns the same result as the default \code{logLik} function. If \code{x} and \code{y} are new, the function returns the log-likelihood of the new data under the trained model.
#' The log-likelihood equation is based on Agresti (2002, p. 192).
#' @return A list containing:
#' - \code{ll}, an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc}, an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' - \code{ll} an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc} an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' @author Edoardo Costantini, 2022
#' @references
#'
2 changes: 1 addition & 1 deletion R/LL_cumulative.R
@@ -9,7 +9,7 @@
#' If \code{x} and \code{y} are equal to the data on which \code{mod} has been trained, this function returns the same result as the default \code{logLik} function. If \code{x} and \code{y} are new, the function returns the log-likelihood of the new data under the trained model.
#' The log-likelihood equation is based on Agresti (2002, p. 192).
#' @return A list containing:
#' - \code{ll}, an atomic vector of length 1 containing the log-likelihood value.
#' - \code{ll} an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc}, a numeric matrix containing the systematic component for the input \code{x} and \code{mod}.
#' @author Edoardo Costantini, 2022
#' @references
4 changes: 2 additions & 2 deletions R/LL_gaussian.R
@@ -8,8 +8,8 @@
#' @details
#' If \code{x} and \code{y} are equal to the data on which \code{mod} has been trained, this function returns the same result as the default \code{logLik} function. If \code{x} and \code{y} are new, the function returns the log-likelihood of the new data under the trained model.
#' @return A list containing:
#' - \code{ll}, an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc}, an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' - \code{ll} an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc} an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' @author Edoardo Costantini, 2022
#'
#' @export
12 changes: 5 additions & 7 deletions R/LL_newdata.R
@@ -6,24 +6,21 @@
#' @param y_valid Vector of DV values in the validation dataset.
#' @param X_train Matrix of IV values in the training dataset. Can also be set to 1 to obtain the log-likelihood of the new data under the null model.
#' @param X_valid Matrix of IV values in the validation dataset. If \code{X_train} is set to 1 to obtain the log-likelihood of the new data under the null model, \code{X_valid} is ignored.
#' @param fam GLM framework for the dv.
#' @param fam Character vector of length 1 storing the description of the error distribution and link function to be used in the model (see [gspcr::cv_gspcr()] for the list of possible options).
#' @details
#' This function trains a GLM regressing \code{y_train} on \code{X_train} using the error distribution and link function specified in \code{fam}. Then, it computes predictions for the validation data on the scale of the linear predictors (e.g., the logit scale). The log-likelihood of the validation data under the trained model is returned.
#'
#' @return A list of objects.
#' @author Edoardo Costantini, 2023
#' @references
#'
#' Such, S. (2006). Such and such. Journal such and such, 101(473), 119-137.
#'
#' @export
LL_newdata <- function(y_train, y_valid, X_train, X_valid, fam) {

## Example inputs
# y_train = as.matrix(mtcars[1:20, 1])
# y_valid = as.matrix(mtcars[-c(1:20), 1])
# X_train = 1
# X_valid = 1
# X_train = as.matrix(mtcars[1:20, -1])
# X_valid = as.matrix(mtcars[-c(1:20), -1])
# fam = "gaussian"

## Body
@@ -54,7 +51,8 @@ LL_newdata <- function(y_train, y_valid, X_train, X_valid, fam) {
if (fam == "baseline") {
glm_fit_tr <- nnet::multinom(
formula = glm_formula,
data = train
data = train,
trace = FALSE
)
}
if (fam == "cumulative") {
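The logic documented for \code{LL_newdata} can be illustrated for \code{fam = "gaussian"} with base R, reusing the \code{mtcars} split from the commented example inputs. A hedged sketch of the concept, not the package's exact implementation:

```r
# Hedged sketch of the idea behind LL_newdata for fam = "gaussian":
# train on one portion of the data, then evaluate the log-likelihood
# of held-out data under the trained model.
train <- mtcars[1:20, ]
valid <- mtcars[-(1:20), ]

# Train a Gaussian GLM on the training portion
fit <- glm(mpg ~ ., data = train, family = gaussian)

# Predictions for the validation data (identity link, so the linear
# predictor equals the conditional mean)
mu <- predict(fit, newdata = valid, type = "link")

# Gaussian log-likelihood of the validation data, with the training
# residual standard deviation as a plug-in estimate
sigma <- sqrt(sum(residuals(fit)^2) / df.residual(fit))
sum(dnorm(valid$mpg, mean = mu, sd = sigma, log = TRUE))
```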
4 changes: 2 additions & 2 deletions R/LL_poisson.R
@@ -8,8 +8,8 @@
#' @details
#' If \code{x} and \code{y} are equal to the data on which \code{mod} has been trained, this function returns the same result as the default \code{logLik} function. If \code{x} and \code{y} are new, the function returns the log-likelihood of the new data under the trained model.
#' @return A list containing:
#' - \code{ll}, an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc}, an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' - \code{ll} an atomic vector of length 1 containing the log-likelihood value.
#' - \code{sc} an atomic vector containing the systematic component for the input \code{x} and \code{mod}.
#' @author Edoardo Costantini, 2023
#' @references
#'