Skip to content

Commit 2236832

Browse files
committed
initial commit
0 parents  commit 2236832

7 files changed

+145
-0
lines changed

.gitignore

+7
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,7 @@
1+
.Rproj.user
2+
.Rhistory
3+
.Rdata
4+
.httr-oauth
5+
.DS_Store
6+
/input
7+
/output

R/00-create-data-inputs.R

+42
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
library(pipapi)
2+
library(arrow)
3+
4+
lkups <- pipapi::create_versioned_lkups(Sys.getenv("PIPAPI_DATA_ROOT_FOLDER"))
5+
svy_lkup <- lkups$versions_paths$latest_release$svy_lkup
6+
7+
8+
#' Create a dataframe from survey data and metadata
9+
#'
10+
#' @param lkup_row
11+
#'
12+
#' @return data.frame
13+
create_svy_file <- function(lkup_row) {
14+
tmp <- fst::read_fst(lkup_row$path)
15+
tmp$country_code <- lkup_row$country_code
16+
tmp$year <- lkup_row$reporting_year
17+
tmp$reporting_level <- lkup_row$reporting_level
18+
tmp$welfare_type <- lkup_row$welfare_type
19+
20+
return(tmp)
21+
}
22+
23+
24+
#' Save created data.frame as parquet file for ingestion into duckdb or use with arrow
25+
#'
26+
#' @param df
27+
#' @param file_name
28+
#' @param path
29+
#'
30+
save_svy_file <- function(df, file_name, path) {
31+
arrow::write_parquet(df, sink = paste0(path, "/", file_name, ".parquet"))
32+
}
33+
34+
# Loop over all available .fst files and create .parquet
35+
offset <- 0
36+
for (i in (seq_along(svy_lkup$cache_id) + offset)) {
37+
lkup_row <- svy_lkup[i, ]
38+
tmp <- create_svy_file(lkup_row)
39+
save_svy_file(tmp, file_name = lkup_row$cache_id, path = "./input")
40+
print(cat(offset + i, ": ", lkup_row$cache_id))
41+
gc()
42+
}

R/01-creating-duckdb.R

+16
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
library(DBI)
2+
library(dplyr)
3+
library(duckdb)
4+
library(arrow)
5+
6+
7+
# Check existing data -----------------------------------------------------
8+
9+
# Initiate empty database
10+
con <- dbConnect(drv = duckdb::duckdb(),
11+
dbdir = "./output/duckdb/pip.duckdb",
12+
read_only = FALSE)
13+
# Populate database with a table of all survey data
14+
dbSendQuery(con, "CREATE TABLE survey_data AS SELECT * FROM parquet_scan('input/*.parquet')")
15+
16+
dbDisconnect(con)

R/02-analyze-duckdb.R

+19
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
library(DBI)
2+
library(dplyr)
3+
library(duckdb)
4+
library(arrow)
5+
library(tictoc)
6+
7+
# Initiate connection
8+
con <- dbConnect(drv = duckdb::duckdb(),
9+
dbdir = "./output/duckdb/pip.duckdb",
10+
read_only = TRUE)
11+
12+
tbl(con, "survey_data") %>%
13+
# arrow::to_arrow() %>%
14+
select(country_code) %>%
15+
distinct() %>%
16+
collect()
17+
18+
tbl(con, "survey_data") %>%
19+
count()

R/testing-arrow.R

+48
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,48 @@
1+
library(DBI)
2+
library(dplyr)
3+
library(duckdb)
4+
library(arrow)
5+
library(tictoc)
6+
7+
8+
# Check existing data -----------------------------------------------------
9+
# Number of files and total size
10+
svy_data <- fs::dir_info("input", recurse = TRUE) %>%
11+
filter(type == "file") %>%
12+
summarise(n = n(), size = sum(size))
13+
glue::glue("There are {svy_data$n} files, totaling {svy_data$size}!")
14+
15+
# Total number of rows (without loading dataset in memory)
16+
ds <- open_dataset("input")
17+
# full_collect <- summarise(ds, n = n()) %>%
18+
# collect() %>%
19+
# pull(n)
20+
# n_rows <- scales::unit_format(unit = "millions", scale = 1e-6,
21+
# accuracy = 1)(full_collect)
22+
# glue::glue("There are approximately {n_rows} rows!")
23+
24+
# Compute welfare means by sub-groups
25+
# Weighted means not supported yet
26+
# tic()
27+
# ds %>%
28+
# select(country_code, year, welfare_type, reporting_level, welfare, weight) %>%
29+
# # use arrow to populate directly into a duckdb
30+
# arrow::to_duckdb() %>%
31+
# # calculate a new column, on disk!
32+
# group_by(country_code, year, reporting_level, welfare_type) %>%
33+
# summarise(
34+
# mean = mean(welfare)
35+
# ) %>%
36+
# collect() %>%
37+
# print()
38+
# toc()
39+
40+
con <- dbConnect(drv = duckdb::duckdb(),
41+
dbdir = "./output/duckdb/pip.duckdb",
42+
read_only = FALSE)
43+
duckdb::duckdb_register_arrow(conn = con,
44+
name = "pip",
45+
arrow_scannable = ds)
46+
47+
dbListTables(con)
48+
dbAppendTable()

docs/SIGMOD2019-demo-duckdb.pdf

952 KB
Binary file not shown.

testing-duckdb.Rproj

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
Version: 1.0
2+
3+
RestoreWorkspace: Default
4+
SaveWorkspace: Default
5+
AlwaysSaveHistory: Default
6+
7+
EnableCodeIndexing: Yes
8+
UseSpacesForTab: Yes
9+
NumSpacesForTab: 2
10+
Encoding: UTF-8
11+
12+
RnwWeave: Sweave
13+
LaTeX: pdfLaTeX

0 commit comments

Comments
 (0)