|
| 1 | +create_var_overview <- function(path) { |
| 2 | + df <- read_csv(path, col_names = FALSE, n_max = 1, col_types = "c") |
| 3 | + |
| 4 | + out <- df %>% |
| 5 | + pivot_longer(everything(), names_to = "var_id", values_to = "var_full") |
| 6 | + |
| 7 | + write_csv(out, "data/processed/var_overview.csv") |
| 8 | + |
| 9 | + out |
| 10 | +} |
| 11 | + |
| 12 | +clean_raw_data <- function(df, wb_countries) { |
| 13 | + # remove test case |
| 14 | + df <- df %>% |
| 15 | + filter(X1 != "8") |
| 16 | + |
| 17 | + # remove cases that are not on a research contract |
| 18 | + df <- df %>% |
| 19 | + filter(X11 == "Yes") |
| 20 | + |
| 21 | + # remove tokens |
| 22 | + df <- df %>% |
| 23 | + select(-X6) |
| 24 | + |
| 25 | + # remove qualitative questions which might expose participants |
| 26 | + df <- df %>% |
| 27 | + select(-c(X14, X20, X42, X43, X46, X70, X74, X77, X79, X81, X83, X86, X92)) |
| 28 | + |
| 29 | + # remove further columns which are superfluous/uniform across all responses |
| 30 | + df <- df %>% |
| 31 | + select(-c(X2, X4, X5, X13)) |
| 32 | + |
| 33 | + # add country information |
| 34 | + # Categories from: |
| 35 | + # https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups |
| 36 | + wb_country_selection <- wb_countries %>% |
| 37 | + select(Economy, Region) |
| 38 | + |
| 39 | + df <- df %>% |
| 40 | + left_join(wb_country_selection, by = c("X12" = "Economy")) |
| 41 | + |
| 42 | + # check for errors |
| 43 | + no_missing_countries <- df %>% |
| 44 | + filter(is.na(Region)) %>% |
| 45 | + nrow(.) == 0 |
| 46 | + stopifnot(no_missing_countries) |
| 47 | + |
| 48 | + |
| 49 | + # return final df |
| 50 | + df |
| 51 | +} |
| 52 | + |
| 53 | + |
| 54 | +merge_disciplines <- function(df, disciplines_df) { |
| 55 | + left_join(df, disciplines_df, by = c("X90", "X91")) |
| 56 | +} |
| 57 | + |
| 58 | +read_clean_wb_countries <- function(file) { |
| 59 | + readxl::read_excel(file) %>% |
| 60 | + filter(!is.na(Region)) %>% |
| 61 | + mutate(Economy = str_remove(Economy, ",.*"), |
| 62 | + Economy = recode(Economy, `Russian Federation` = "Russia", |
| 63 | + `Slovak Republic` = "Slovakia")) |
| 64 | +} |
| 65 | + |
| 66 | + |
| 67 | +custom_adorns <- function(tabyl) { |
| 68 | + tabyl %>% |
| 69 | + adorn_totals(where = c("row", "col")) %>% |
| 70 | + adorn_percentages() %>% |
| 71 | + adorn_pct_formatting() %>% |
| 72 | + adorn_ns() |
| 73 | +} |
| 74 | + |
| 75 | + |
| 76 | +# functions for comparing personal and institutional scores |
| 77 | +# the function was further adapted to also incorporate other variants of how |
| 78 | +# variables were set up |
| 79 | +get_numeric_val <- function(x) { |
| 80 | + case_when( |
| 81 | + x == "Very important" ~ 1L, |
| 82 | + x == "Somewhat important" ~ 2L, |
| 83 | + x == "Important" ~ 2L, |
| 84 | + x == "Neither important nor unimportant" ~ 3L, |
| 85 | + x == "Somewhat unimportant" ~ 4L, |
| 86 | + x == "Unimportant" ~ 4L, |
| 87 | + x == "Very unimportant" ~ 5L, |
| 88 | + TRUE ~ NA_integer_ |
| 89 | + ) |
| 90 | +} |
| 91 | + |
| 92 | +get_values <- function(df, vars, source = c("institutional", "personal"), |
| 93 | + var_overview) { |
| 94 | + base <- df %>% |
| 95 | + select(X1, {{ vars }}) %>% |
| 96 | + pivot_longer(-X1, names_to = "var", values_to = "val") %>% |
| 97 | + mutate(num_val = get_numeric_val(val)) %>% |
| 98 | + filter(!is.na(num_val)) %>% |
| 99 | + mutate(source = source) |
| 100 | + |
| 101 | + # get labels |
| 102 | + vars_char <- deparse(substitute(vars)) |
| 103 | + begin <- str_extract(vars_char, "\\d{2}") %>% as.numeric() |
| 104 | + end <- str_extract(vars_char, "\\d{2}$") %>% as.numeric() |
| 105 | + |
| 106 | + labels <- var_overview %>% |
| 107 | + filter(var_id %in% paste0("X", begin:end)) %>% |
| 108 | + mutate(label = str_extract(var_full, "(?<=\\[).*?(?=\\s?\\])"), |
| 109 | + # clean up labels |
| 110 | + label = str_remove(label, "\\(.*"), |
| 111 | + label = str_remove(label, ", as assessed .*")) %>% |
| 112 | + select(var_id, label) |
| 113 | + |
| 114 | + base %>% |
| 115 | + left_join(labels, by = c("var" = "var_id")) %>% |
| 116 | + select(-var) |
| 117 | +} |
| 118 | + |
| 119 | +bootstrap_values <- function(df) { |
| 120 | + df %>% |
| 121 | + group_by(label, source) %>% |
| 122 | + summarise(res = list(Hmisc::smean.cl.boot(num_val))) %>% |
| 123 | + unnest_wider(res) |
| 124 | +} |
| 125 | + |
| 126 | +five_point_scale <- function(compact = FALSE) { |
| 127 | + if (compact) { |
| 128 | + list(scale_x_continuous( |
| 129 | + breaks = c(1, 5), |
| 130 | + labels = c("Important", "Unimportant")), |
| 131 | + coord_cartesian(xlim = c(1, 5)) |
| 132 | + ) |
| 133 | + } else { |
| 134 | + list(scale_x_continuous( |
| 135 | + breaks = c(1, 3, 5), |
| 136 | + labels = c("Very important", "Neither/nor", "Very unimportant")), |
| 137 | + coord_cartesian(xlim = c(1, 5)) |
| 138 | + ) |
| 139 | + } |
| 140 | +} |
| 141 | + |
| 142 | + |
| 143 | +# correspondence analysis functions ----- |
| 144 | +#' ca plots with ggplot2 |
| 145 | +#' |
| 146 | +#' @export |
| 147 | +plot_ca <- function(object, font_size = 3, dimensions = c(1, 2), |
| 148 | + show.legend = F, map = "symmetric", keep_labels = FALSE) { |
| 149 | + ca_class <- class(object) |
| 150 | + if (!identical(ca_class, "ca") & !identical(ca_class, "mjca")) { |
| 151 | + stop("Input object must be of type 'ca' or 'mjca'") |
| 152 | + } |
| 153 | + assertthat::assert_that(length(dimensions) == 2) |
| 154 | + |
| 155 | + |
| 156 | + # find variances for labelling of axes |
| 157 | + variances <- suppressWarnings(summary(object)) |
| 158 | + |
| 159 | + dim1 <- variances$scree[dimensions[1], 3] %>% round(., 2) %>% unname() |
| 160 | + dim2 <- variances$scree[dimensions[2], 3] %>% round(., 2) %>% unname() |
| 161 | + |
| 162 | + # create labels |
| 163 | + x_label <- paste0("Dimension ", dimensions[1], " (", dim1, "%)") |
| 164 | + y_label <- paste0("Dimension ", dimensions[2], " (", dim2, "%)") |
| 165 | + |
| 166 | + |
| 167 | + augmented_data <- extract_ca_data(object, dimensions, map, keep_labels) |
| 168 | + |
| 169 | + |
| 170 | + |
| 171 | + |
| 172 | + # separate plotting for 'ca' and 'mjca' |
| 173 | + if (identical(ca_class, "ca")) { |
| 174 | + if (sum(!is.na(augmented_data$row_data$sup_var)) == 0) { # catch case with no sup_vars |
| 175 | + # stop("Keine Sup_vars, muss ich noch implementieren") |
| 176 | + ggplot(augmented_data$full_data, aes(x = x, y = y, colour = Profil)) + |
| 177 | + ggrepel::geom_text_repel(aes(label = rowname), size = font_size, |
| 178 | + show.legend = F, max.iter = 5000, force = 4) + |
| 179 | + geom_point(show.legend = show.legend, size = 2) + |
| 180 | + geom_hline(yintercept = 0, linetype = "dashed", alpha = .5) + |
| 181 | + geom_vline(xintercept = 0, linetype = "dashed", alpha = .5) + |
| 182 | + labs(x = x_label, |
| 183 | + y = y_label, |
| 184 | + colour = NULL) |
| 185 | + } else { |
| 186 | + ggplot(augmented_data$full_data, aes(x = x, y = y, colour = Profil, |
| 187 | + shape = sup_var)) + |
| 188 | + ggrepel::geom_text_repel(aes(label = rowname), size = font_size, |
| 189 | + show.legend = F, max.iter = 5000, force = 4) + |
| 190 | + geom_point(show.legend = show.legend, size = 2) + |
| 191 | + geom_hline(yintercept = 0, linetype = "dashed", alpha = .5) + |
| 192 | + geom_vline(xintercept = 0, linetype = "dashed", alpha = .5) + |
| 193 | + labs(x = x_label, |
| 194 | + y = y_label, |
| 195 | + shape = NULL, |
| 196 | + colour = NULL) |
| 197 | + } |
| 198 | + } else if (identical(ca_class, "mjca")) { |
| 199 | + |
| 200 | + if (sum(!is.na(augmented_data$col_data$sup_var)) == 0) { |
| 201 | + # catch case with no sup_vars |
| 202 | + ggplot(augmented_data$col_data, aes(x = x, y = y)) + |
| 203 | + plot_parts(font_size = font_size, x_label = x_label, y_label = y_label, |
| 204 | + show.legend = show.legend) |
| 205 | + } else { |
| 206 | + ggplot(augmented_data$col_data, aes(x = x, y = y, colour = sup_var)) + |
| 207 | + plot_parts(font_size = font_size, x_label = x_label, y_label = y_label, |
| 208 | + show.legend = show.legend) |
| 209 | + } |
| 210 | + } |
| 211 | + |
| 212 | +} |
| 213 | + |
| 214 | + |
| 215 | + |
| 216 | + |
| 217 | +#' Parts for plotting ca |
| 218 | +plot_parts <- function(font_size = font_size, x_label = x_label, |
| 219 | + y_label = y_label, show.legend = show.legend) { |
| 220 | + list( |
| 221 | + geom_point(show.legend = show.legend), |
| 222 | + ggrepel::geom_text_repel(aes(label = rowname), size = font_size, show.legend = F, |
| 223 | + force = 2, max.iter = 5000), |
| 224 | + geom_hline(yintercept = 0, linetype = "dashed", alpha = .5), |
| 225 | + geom_vline(xintercept = 0, linetype = "dashed", alpha = .5), |
| 226 | + labs(x = x_label, |
| 227 | + y = y_label, |
| 228 | + shape = NULL, |
| 229 | + colour = NULL), |
| 230 | + theme(legend.position = "bottom", legend.direction = "horizontal") |
| 231 | + ) |
| 232 | +} |
| 233 | + |
| 234 | +#' Add rownames as column |
| 235 | +#' |
| 236 | +#' Helper function to add rownames |
| 237 | +prepare_data <- function(x) { |
| 238 | + x <- x %>% |
| 239 | + as.data.frame() %>% |
| 240 | + tibble::rownames_to_column() |
| 241 | + |
| 242 | + colnames(x) <- c("rowname", "x", "y") |
| 243 | + x |
| 244 | +} |
| 245 | + |
| 246 | +#' @export |
| 247 | +extract_ca_data <- function(object, dimensions = c(1, 2), |
| 248 | + map = "symmetric", keep_labels = FALSE) { |
| 249 | + |
| 250 | + ca_class <- class(object) |
| 251 | + |
| 252 | + # find points to plot |
| 253 | + pdf(file = NULL) |
| 254 | + base_data <- plot(object, dim = dimensions, map) |
| 255 | + dev.off() |
| 256 | + |
| 257 | + # add rownames |
| 258 | + all_data <- base_data %>% |
| 259 | + map(prepare_data) |
| 260 | + |
| 261 | + # extract col_data |
| 262 | + col_data <- all_data[["cols"]] %>% |
| 263 | + mutate(Profil = rep("Spaltenprofil", length(rowname)), |
| 264 | + rowname = if (!keep_labels) { |
| 265 | + stringr::str_replace(rowname, "(.*):", "") |
| 266 | + } else { rowname } ) |
| 267 | + |
| 268 | + |
| 269 | + # find supplementary cols |
| 270 | + if (identical(ca_class, "mjca")) { |
| 271 | + col_data <- col_data %>% |
| 272 | + slice(object$colsup) %>% |
| 273 | + mutate(sup_var = factor("Supplementary Variables", levels = |
| 274 | + c("Contributing Variables", "Supplementary Variables"))) %>% |
| 275 | + dplyr::select(rowname, sup_var) %>% |
| 276 | + full_join(col_data, by = "rowname") %>% |
| 277 | + replace_na(list(sup_var = "Contributing Variables")) |
| 278 | + |
| 279 | + return(list(col_data = col_data)) |
| 280 | + |
| 281 | + } else if (identical(ca_class, "ca")) { |
| 282 | + # create rowdata, in case class is 'ca' |
| 283 | + row_data <- all_data[["rows"]] %>% |
| 284 | + mutate(Profil = rep("Zeilenprofil", length(rowname))) |
| 285 | + |
| 286 | + if (length(object$rowsup) > 0) { |
| 287 | + row_data <- row_data %>% |
| 288 | + slice(object$rowsup) %>% |
| 289 | + mutate(sup_var = factor("Supplementary Variables", levels = |
| 290 | + c("Contributing Variables", "Supplementary Variables"))) %>% |
| 291 | + dplyr::select(rowname, sup_var) %>% |
| 292 | + full_join(row_data, by = "rowname") %>% |
| 293 | + replace_na(list(sup_var = "Contributing Variables")) |
| 294 | + } |
| 295 | + full_data <- bind_rows(col_data, row_data) %>% |
| 296 | + replace_na(list(sup_var = "Contributing Variables")) |
| 297 | + } |
| 298 | + |
| 299 | + list(full_data = full_data, col_data = col_data, row_data = row_data) |
| 300 | +} |
0 commit comments