|
library(DBI)
library(dplyr)
library(duckdb)
library(arrow)
library(tictoc)


# Check existing data -----------------------------------------------------
# Number of files and total size of everything under "input" (recursive)
svy_data <- fs::dir_info("input", recurse = TRUE) %>%
  filter(type == "file") %>%
  summarise(n = n(), size = sum(size))
glue::glue("There are {svy_data$n} files, totaling {svy_data$size}!")

# Total number of rows (without loading dataset in memory)
ds <- open_dataset("input")
# full_collect <- summarise(ds, n = n()) %>%
#   collect() %>%
#   pull(n)
# n_rows <- scales::unit_format(unit = "millions", scale = 1e-6,
#                               accuracy = 1)(full_collect)
# glue::glue("There are approximately {n_rows} rows!")

# Compute welfare means by sub-groups
# Weighted means not supported yet
# tic()
# ds %>%
#   select(country_code, year, welfare_type, reporting_level, welfare, weight) %>%
#   # use arrow to populate directly into a duckdb
#   arrow::to_duckdb() %>%
#   # calculate a new column, on disk!
#   group_by(country_code, year, reporting_level, welfare_type) %>%
#   summarise(
#     mean = mean(welfare)
#   ) %>%
#   collect() %>%
#   print()
# toc()

# Expose the Arrow dataset to DuckDB ---------------------------------------
# Open a writable connection to the on-disk DuckDB database file
con <- dbConnect(
  drv = duckdb::duckdb(),
  dbdir = "./output/duckdb/pip.duckdb",
  read_only = FALSE
)

# Register the Arrow dataset on the connection under the name "pip"
duckdb::duckdb_register_arrow(
  conn = con,
  name = "pip",
  arrow_scannable = ds
)

dbListTables(con)

# TODO: the original script ended with a bare `dbAppendTable()` call, which
# always errors because DBI::dbAppendTable() requires `conn`, `name` and
# `value`. Decide which table should receive which rows, then restore it as:
# dbAppendTable(con, name = "<target_table>", value = <data_frame>)

# Release the connection and shut down the embedded database instance so the
# database file is not left locked once the script finishes
dbDisconnect(con, shutdown = TRUE)
0 commit comments