initial commit

tonyfujs · tonyfujs · commit 223683285b55 · 2021-12-24T13:21:21.000-03:00
diff --git a/.gitignore b/.gitignore
@@ -0,0 +1,7 @@
+.Rproj.user
+.Rhistory
+.Rdata
+.httr-oauth
+.DS_Store
+/input
+/output
diff --git a/R/00-create-data-inputs.R b/R/00-create-data-inputs.R
@@ -0,0 +1,42 @@
+library(pipapi)
+library(arrow)
+
+# Lookups built by pipapi from the PIPAPI_DATA_ROOT_FOLDER env var; keep the latest release's survey lookup
+lkups <- pipapi::create_versioned_lkups(Sys.getenv("PIPAPI_DATA_ROOT_FOLDER"))
+svy_lkup <- lkups$versions_paths$latest_release$svy_lkup
+
+#' Read one survey file and tag every row with that survey's metadata
+#'
+#' @param lkup_row One row of the survey lookup; its `path`, `country_code`,
+#'   `reporting_year`, `reporting_level` and `welfare_type` fields are used.
+#' @return data.frame read by `fst::read_fst()` from `lkup_row$path`, plus the
+#'   columns `country_code`, `year`, `reporting_level` and `welfare_type`.
+create_svy_file <- function(lkup_row) {
+  svy <- fst::read_fst(lkup_row$path)
+  svy[["country_code"]] <- lkup_row$country_code
+  svy[["year"]] <- lkup_row$reporting_year # column is named `year` in the output
+  svy[["reporting_level"]] <- lkup_row$reporting_level
+  svy[["welfare_type"]] <- lkup_row$welfare_type
+  svy
+}
+
+
+#' Write a data.frame to `<path>/<file_name>.parquet` (for duckdb / arrow use)
+#' @param df Data to write; handed unchanged to `arrow::write_parquet()`.
+#' @param file_name File name without extension; ".parquet" is appended.
+#' @param path Destination folder; joined to the file name with "/".
+#' @return The value returned by `arrow::write_parquet()`.
+save_svy_file <- function(df, file_name, path) {
+  parquet_file <- paste0(path, "/", file_name, ".parquet")
+  arrow::write_parquet(df, sink = parquet_file)
+}
+
+# Convert each survey .fst file to .parquet; lookup rows 1..offset are skipped
+offset <- 0
+for (i in which(seq_along(svy_lkup$cache_id) > offset)) { # stays within the lookup
+  lkup_row <- svy_lkup[i, ]
+  tmp <- create_svy_file(lkup_row)
+  save_svy_file(tmp, file_name = lkup_row$cache_id, path = "./input")
+  message(i, ": ", lkup_row$cache_id) # print(cat()) appended a stray "NULL"
+  gc() # collect garbage before the next survey is read
+}
diff --git a/R/01-creating-duckdb.R b/R/01-creating-duckdb.R
@@ -0,0 +1,16 @@
+library(DBI)
+library(dplyr)
+library(duckdb)
+library(arrow)
+
+
+# Create database ---------------------------------------------------------
+dir.create("./output/duckdb", recursive = TRUE, showWarnings = FALSE) # git-ignored folder
+# Open the database file in read-write mode
+con <- dbConnect(drv = duckdb::duckdb(),
+                 dbdir = "./output/duckdb/pip.duckdb",
+                 read_only = FALSE)
+# Populate database with a table of all survey data. dbExecute() runs the
+# statement and releases its result; dbSendQuery() left the result open.
+dbExecute(con, "CREATE TABLE survey_data AS SELECT * FROM parquet_scan('input/*.parquet')")
+dbDisconnect(con, shutdown = TRUE) # also shut down the duckdb instance
diff --git a/R/02-analyze-duckdb.R b/R/02-analyze-duckdb.R
@@ -0,0 +1,19 @@
+library(DBI)
+library(dplyr)
+library(duckdb)
+library(arrow)
+library(tictoc)
+
+# Initiate a read-only connection to the duckdb file under ./output/duckdb
+con <- dbConnect(drv = duckdb::duckdb(),
+                 dbdir = "./output/duckdb/pip.duckdb",
+                 read_only = TRUE)
+# Distinct country codes of survey_data, collected into R
+tbl(con, "survey_data") %>%
+  # arrow::to_arrow() %>%
+  select(country_code) %>%
+  distinct() %>%
+  collect()
+tbl(con, "survey_data") %>%
+  count() # row count of the table; not collected, shown when auto-printed
+dbDisconnect(con, shutdown = TRUE) # the connection was previously left open
diff --git a/R/testing-arrow.R b/R/testing-arrow.R
@@ -0,0 +1,48 @@
+library(DBI)
+library(dplyr)
+library(duckdb)
+library(arrow)
+library(tictoc)
+
+
+# Check existing data -----------------------------------------------------
+# Count the files found under input/ (recursively) and add up their sizes
+svy_data <- summarise(
+  filter(fs::dir_info("input", recurse = TRUE), type == "file"),
+  n = n(), size = sum(size)
+)
+glue::glue("There are {svy_data$n} files, totaling {svy_data$size}!")
+# Open input/ as an arrow dataset (for row counts without loading it in memory)
+ds <- open_dataset("input")
+# full_collect <- summarise(ds, n = n()) %>% 
+#   collect() %>% 
+#   pull(n)
+# n_rows <- scales::unit_format(unit = "millions", scale = 1e-6, 
+#                               accuracy = 1)(full_collect)
+# glue::glue("There are approximately {n_rows} rows!")
+
+# Compute welfare means by sub-groups
+# Weighted means not supported yet
+# tic()
+# ds %>%
+#   select(country_code, year, welfare_type, reporting_level, welfare, weight) %>%
+#   # use arrow to populate directly into a duckdb
+#   arrow::to_duckdb() %>% 
+#   # calculate a new column, on disk!
+#   group_by(country_code, year, reporting_level, welfare_type) %>%
+#   summarise(
+#     mean = mean(welfare)
+#     ) %>%
+#   collect() %>%
+#   print()
+# toc()
+
+# Register the arrow dataset `ds` with duckdb under the name "pip"
+con <- dbConnect(drv = duckdb::duckdb(),
+                 dbdir = "./output/duckdb/pip.duckdb",
+                 read_only = FALSE)
+duckdb::duckdb_register_arrow(conn = con, name = "pip", arrow_scannable = ds)
+dbListTables(con)
+# TODO(review): the bare dbAppendTable() call that stood here always errored
+# (no conn/name/value given); supply all three if an append was intended.
+dbDisconnect(con, shutdown = TRUE) # the connection was previously left open
diff --git a/docs/SIGMOD2019-demo-duckdb.pdf b/docs/SIGMOD2019-demo-duckdb.pdf
diff --git a/testing-duckdb.Rproj b/testing-duckdb.Rproj
@@ -0,0 +1,13 @@
+Version: 1.0
+
+RestoreWorkspace: Default
+SaveWorkspace: Default
+AlwaysSaveHistory: Default
+
+EnableCodeIndexing: Yes
+UseSpacesForTab: Yes
+NumSpacesForTab: 2
+Encoding: UTF-8
+
+RnwWeave: Sweave
+LaTeX: pdfLaTeX