Skip to content

Commit 3b22928

Browse files
committed
add R functions
1 parent 72cf575 commit 3b22928

File tree

3 files changed

+523
-0
lines changed

3 files changed

+523
-0
lines changed

R/functions.R

+300
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,300 @@
1+
create_var_overview <- function(path) {
2+
df <- read_csv(path, col_names = FALSE, n_max = 1, col_types = "c")
3+
4+
out <- df %>%
5+
pivot_longer(everything(), names_to = "var_id", values_to = "var_full")
6+
7+
write_csv(out, "data/processed/var_overview.csv")
8+
9+
out
10+
}
11+
12+
clean_raw_data <- function(df, wb_countries) {
13+
# remove test case
14+
df <- df %>%
15+
filter(X1 != "8")
16+
17+
# remove cases that are not on a research contract
18+
df <- df %>%
19+
filter(X11 == "Yes")
20+
21+
# remove tokens
22+
df <- df %>%
23+
select(-X6)
24+
25+
# remove qualitative questions which might expose participants
26+
df <- df %>%
27+
select(-c(X14, X20, X42, X43, X46, X70, X74, X77, X79, X81, X83, X86, X92))
28+
29+
# remove further columns which are superfluous/uniform across all responses
30+
df <- df %>%
31+
select(-c(X2, X4, X5, X13))
32+
33+
# add country information
34+
# Categories from:
35+
# https://datahelpdesk.worldbank.org/knowledgebase/articles/906519-world-bank-country-and-lending-groups
36+
wb_country_selection <- wb_countries %>%
37+
select(Economy, Region)
38+
39+
df <- df %>%
40+
left_join(wb_country_selection, by = c("X12" = "Economy"))
41+
42+
# check for errors
43+
no_missing_countries <- df %>%
44+
filter(is.na(Region)) %>%
45+
nrow(.) == 0
46+
stopifnot(no_missing_countries)
47+
48+
49+
# return final df
50+
df
51+
}
52+
53+
54+
merge_disciplines <- function(df, disciplines_df) {
55+
left_join(df, disciplines_df, by = c("X90", "X91"))
56+
}
57+
58+
read_clean_wb_countries <- function(file) {
59+
readxl::read_excel(file) %>%
60+
filter(!is.na(Region)) %>%
61+
mutate(Economy = str_remove(Economy, ",.*"),
62+
Economy = recode(Economy, `Russian Federation` = "Russia",
63+
`Slovak Republic` = "Slovakia"))
64+
}
65+
66+
67+
custom_adorns <- function(tabyl) {
68+
tabyl %>%
69+
adorn_totals(where = c("row", "col")) %>%
70+
adorn_percentages() %>%
71+
adorn_pct_formatting() %>%
72+
adorn_ns()
73+
}
74+
75+
76+
# functions for comparing personal and institutional scores
77+
# the function was further adapted to also incorporate other variants of how
78+
# variables were set up
79+
get_numeric_val <- function(x) {
80+
case_when(
81+
x == "Very important" ~ 1L,
82+
x == "Somewhat important" ~ 2L,
83+
x == "Important" ~ 2L,
84+
x == "Neither important nor unimportant" ~ 3L,
85+
x == "Somewhat unimportant" ~ 4L,
86+
x == "Unimportant" ~ 4L,
87+
x == "Very unimportant" ~ 5L,
88+
TRUE ~ NA_integer_
89+
)
90+
}
91+
92+
get_values <- function(df, vars, source = c("institutional", "personal"),
93+
var_overview) {
94+
base <- df %>%
95+
select(X1, {{ vars }}) %>%
96+
pivot_longer(-X1, names_to = "var", values_to = "val") %>%
97+
mutate(num_val = get_numeric_val(val)) %>%
98+
filter(!is.na(num_val)) %>%
99+
mutate(source = source)
100+
101+
# get labels
102+
vars_char <- deparse(substitute(vars))
103+
begin <- str_extract(vars_char, "\\d{2}") %>% as.numeric()
104+
end <- str_extract(vars_char, "\\d{2}$") %>% as.numeric()
105+
106+
labels <- var_overview %>%
107+
filter(var_id %in% paste0("X", begin:end)) %>%
108+
mutate(label = str_extract(var_full, "(?<=\\[).*?(?=\\s?\\])"),
109+
# clean up labels
110+
label = str_remove(label, "\\(.*"),
111+
label = str_remove(label, ", as assessed .*")) %>%
112+
select(var_id, label)
113+
114+
base %>%
115+
left_join(labels, by = c("var" = "var_id")) %>%
116+
select(-var)
117+
}
118+
119+
bootstrap_values <- function(df) {
120+
df %>%
121+
group_by(label, source) %>%
122+
summarise(res = list(Hmisc::smean.cl.boot(num_val))) %>%
123+
unnest_wider(res)
124+
}
125+
126+
five_point_scale <- function(compact = FALSE) {
127+
if (compact) {
128+
list(scale_x_continuous(
129+
breaks = c(1, 5),
130+
labels = c("Important", "Unimportant")),
131+
coord_cartesian(xlim = c(1, 5))
132+
)
133+
} else {
134+
list(scale_x_continuous(
135+
breaks = c(1, 3, 5),
136+
labels = c("Very important", "Neither/nor", "Very unimportant")),
137+
coord_cartesian(xlim = c(1, 5))
138+
)
139+
}
140+
}
141+
142+
143+
# correspondence analysis functions -----
144+
#' ca plots with ggplot2
145+
#'
146+
#' @export
147+
plot_ca <- function(object, font_size = 3, dimensions = c(1, 2),
148+
show.legend = F, map = "symmetric", keep_labels = FALSE) {
149+
ca_class <- class(object)
150+
if (!identical(ca_class, "ca") & !identical(ca_class, "mjca")) {
151+
stop("Input object must be of type 'ca' or 'mjca'")
152+
}
153+
assertthat::assert_that(length(dimensions) == 2)
154+
155+
156+
# find variances for labelling of axes
157+
variances <- suppressWarnings(summary(object))
158+
159+
dim1 <- variances$scree[dimensions[1], 3] %>% round(., 2) %>% unname()
160+
dim2 <- variances$scree[dimensions[2], 3] %>% round(., 2) %>% unname()
161+
162+
# create labels
163+
x_label <- paste0("Dimension ", dimensions[1], " (", dim1, "%)")
164+
y_label <- paste0("Dimension ", dimensions[2], " (", dim2, "%)")
165+
166+
167+
augmented_data <- extract_ca_data(object, dimensions, map, keep_labels)
168+
169+
170+
171+
172+
# separate plotting for 'ca' and 'mjca'
173+
if (identical(ca_class, "ca")) {
174+
if (sum(!is.na(augmented_data$row_data$sup_var)) == 0) { # catch case with no sup_vars
175+
# stop("Keine Sup_vars, muss ich noch implementieren")
176+
ggplot(augmented_data$full_data, aes(x = x, y = y, colour = Profil)) +
177+
ggrepel::geom_text_repel(aes(label = rowname), size = font_size,
178+
show.legend = F, max.iter = 5000, force = 4) +
179+
geom_point(show.legend = show.legend, size = 2) +
180+
geom_hline(yintercept = 0, linetype = "dashed", alpha = .5) +
181+
geom_vline(xintercept = 0, linetype = "dashed", alpha = .5) +
182+
labs(x = x_label,
183+
y = y_label,
184+
colour = NULL)
185+
} else {
186+
ggplot(augmented_data$full_data, aes(x = x, y = y, colour = Profil,
187+
shape = sup_var)) +
188+
ggrepel::geom_text_repel(aes(label = rowname), size = font_size,
189+
show.legend = F, max.iter = 5000, force = 4) +
190+
geom_point(show.legend = show.legend, size = 2) +
191+
geom_hline(yintercept = 0, linetype = "dashed", alpha = .5) +
192+
geom_vline(xintercept = 0, linetype = "dashed", alpha = .5) +
193+
labs(x = x_label,
194+
y = y_label,
195+
shape = NULL,
196+
colour = NULL)
197+
}
198+
} else if (identical(ca_class, "mjca")) {
199+
200+
if (sum(!is.na(augmented_data$col_data$sup_var)) == 0) {
201+
# catch case with no sup_vars
202+
ggplot(augmented_data$col_data, aes(x = x, y = y)) +
203+
plot_parts(font_size = font_size, x_label = x_label, y_label = y_label,
204+
show.legend = show.legend)
205+
} else {
206+
ggplot(augmented_data$col_data, aes(x = x, y = y, colour = sup_var)) +
207+
plot_parts(font_size = font_size, x_label = x_label, y_label = y_label,
208+
show.legend = show.legend)
209+
}
210+
}
211+
212+
}
213+
214+
215+
216+
217+
#' Parts for plotting ca
218+
plot_parts <- function(font_size = font_size, x_label = x_label,
219+
y_label = y_label, show.legend = show.legend) {
220+
list(
221+
geom_point(show.legend = show.legend),
222+
ggrepel::geom_text_repel(aes(label = rowname), size = font_size, show.legend = F,
223+
force = 2, max.iter = 5000),
224+
geom_hline(yintercept = 0, linetype = "dashed", alpha = .5),
225+
geom_vline(xintercept = 0, linetype = "dashed", alpha = .5),
226+
labs(x = x_label,
227+
y = y_label,
228+
shape = NULL,
229+
colour = NULL),
230+
theme(legend.position = "bottom", legend.direction = "horizontal")
231+
)
232+
}
233+
234+
#' Add rownames as column
235+
#'
236+
#' Helper function to add rownames
237+
prepare_data <- function(x) {
238+
x <- x %>%
239+
as.data.frame() %>%
240+
tibble::rownames_to_column()
241+
242+
colnames(x) <- c("rowname", "x", "y")
243+
x
244+
}
245+
246+
#' @export
247+
extract_ca_data <- function(object, dimensions = c(1, 2),
248+
map = "symmetric", keep_labels = FALSE) {
249+
250+
ca_class <- class(object)
251+
252+
# find points to plot
253+
pdf(file = NULL)
254+
base_data <- plot(object, dim = dimensions, map)
255+
dev.off()
256+
257+
# add rownames
258+
all_data <- base_data %>%
259+
map(prepare_data)
260+
261+
# extract col_data
262+
col_data <- all_data[["cols"]] %>%
263+
mutate(Profil = rep("Spaltenprofil", length(rowname)),
264+
rowname = if (!keep_labels) {
265+
stringr::str_replace(rowname, "(.*):", "")
266+
} else { rowname } )
267+
268+
269+
# find supplementary cols
270+
if (identical(ca_class, "mjca")) {
271+
col_data <- col_data %>%
272+
slice(object$colsup) %>%
273+
mutate(sup_var = factor("Supplementary Variables", levels =
274+
c("Contributing Variables", "Supplementary Variables"))) %>%
275+
dplyr::select(rowname, sup_var) %>%
276+
full_join(col_data, by = "rowname") %>%
277+
replace_na(list(sup_var = "Contributing Variables"))
278+
279+
return(list(col_data = col_data))
280+
281+
} else if (identical(ca_class, "ca")) {
282+
# create rowdata, in case class is 'ca'
283+
row_data <- all_data[["rows"]] %>%
284+
mutate(Profil = rep("Zeilenprofil", length(rowname)))
285+
286+
if (length(object$rowsup) > 0) {
287+
row_data <- row_data %>%
288+
slice(object$rowsup) %>%
289+
mutate(sup_var = factor("Supplementary Variables", levels =
290+
c("Contributing Variables", "Supplementary Variables"))) %>%
291+
dplyr::select(rowname, sup_var) %>%
292+
full_join(row_data, by = "rowname") %>%
293+
replace_na(list(sup_var = "Contributing Variables"))
294+
}
295+
full_data <- bind_rows(col_data, row_data) %>%
296+
replace_na(list(sup_var = "Contributing Variables"))
297+
}
298+
299+
list(full_data = full_data, col_data = col_data, row_data = row_data)
300+
}

0 commit comments

Comments
 (0)