|
| 1 | +# Copyright 2022 Province of British Columbia |
| 2 | +# |
| 3 | +# Licensed under the Apache License, Version 2.0 (the "License"); |
| 4 | +# you may not use this file except in compliance with the License. |
| 5 | +# You may obtain a copy of the License at |
| 6 | +# |
| 7 | +# http://www.apache.org/licenses/LICENSE-2.0 |
| 8 | +# |
| 9 | +# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, |
| 10 | +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| 11 | +# See the License for the specific language governing permissions and limitations under the License. |
| 12 | + |
| 13 | +#' TO DO before sourcing this file: |
| 14 | +#' |
| 15 | +#' Get an RTRA account. (application form in directory `SAS`) |
| 16 | +#' upload the 2 .SAS files in directory `SAS` to https://www75.statcan.gc.ca/eft-tef/en/operations (To StatCan). |
| 17 | +#' grab a coffee... |
| 18 | +#' download the 2 resulting csv files (From StatCan) and place in directory "data/current". |
| 19 | + |
| 20 | +#' Note that Jan 2026 the SAS files will need to be updated. |
# Time the whole script; the matching tictoc::toc() at the bottom reports
# total elapsed time.
tictoc::tic()

# Libraries ----------------------------------------------------------------
library(tidyverse)
library(lubridate) # date arithmetic: months(), years()
library(readxl)    # read data/mapping.xlsx
library(XLConnect) # build the formatted excel workbook
library(scales)    # percent() / comma() display formatting
library(fpp3)      # feasts feature extraction for the PCA section

# Constants ----------------------------------------------------------------
# ma_months <- 3 # how many months to use for smoothing the data
accuracy_large <- 100 # levels rounded to nearest hundred
accuracy_small <- .1  # percentages rounded to nearest tenth

# Functions ----------------------------------------------------------------
# Project helpers used throughout: read_naics, separate_naics, fill_wrapper,
# agg_level, add_vars, get_smoothed, ytd_ave, unfill_var, indent_industry,
# clean_up, fix_column_names, write_sheet.
source(here::here("R", "functions.R"))
# Build a mapping from naics codes to three levels of aggregation -----------
# mapping.xlsx encodes the hierarchy with leading spaces, so whitespace must
# be preserved on read (trim_ws = FALSE).
human_mapping <- read_excel(here::here("data", "mapping.xlsx"), trim_ws = FALSE)

raw_mapping <- human_mapping %>%
  janitor::clean_names() %>%
  mutate(
    # "\\G " anchors each match to the end of the previous one, so this
    # counts only the run of leading spaces.
    spaces = str_count(industry, "\\G "),
    # depth of indentation -> level of aggregation
    agg = case_when(
      spaces == 0 ~ "high",
      spaces %in% 2:4 ~ "medium",
      spaces %in% 5:6 ~ "low"
    ),
    # indentation has served its purpose; strip it
    industry = trimws(industry)
  )
# Relationship between each industry and the three levels of aggregation
# (used for the excel layout) ----------------------------------------------
agg <- raw_mapping %>%
  select(-naics) %>%
  mutate(
    high   = if_else(agg == "high", industry, NA_character_),
    # a high-level row also starts a new medium-level run
    medium = if_else(agg %in% c("high", "medium"), industry, NA_character_),
    # low is deliberately NOT filled down: it applies only to its own row
    low    = if_else(agg == "low", industry, NA_character_)
  ) %>%
  fill(high, medium, .direction = "down") %>%
  select(industry, high, medium, low)
| 61 | + |
| 62 | +#write_csv(agg, here::here("temp","layout.csv")) |
| 63 | + |
| 64 | + |
# Get the naics codes for each level of aggregation -------------------------
# This was previously three copies of the same pipeline (low/medium/high);
# factored into one helper so the three stay consistent.

#' Extract the naics codes for one aggregation level.
#'
#' @param mapping_df the raw mapping; needs columns `agg`, `industry`, `naics`.
#' @param level one of "high", "medium", "low". Also becomes the name of the
#'   industry column in the result.
#' @return a data frame with one row per (industry, naics) pair for that level.
naics_for_level <- function(mapping_df, level) {
  mapping_df %>%
    filter(agg == level) %>%
    select(!!level := industry, naics) %>%
    group_by(across(all_of(level))) %>%
    nest() %>%
    mutate(
      data = map(data, separate_naics), # project helper (R/functions.R)
      data = map(data, fill_wrapper)    # project helper (R/functions.R)
    ) %>%
    unnest(data) %>%
    unnest(naics)
}

low <- naics_for_level(raw_mapping, "low")
medium <- naics_for_level(raw_mapping, "medium")
high <- naics_for_level(raw_mapping, "high")
# Join by naics to get the mapping file from naics to the 3 levels of
# aggregation (full joins: keep every code seen at any level).
mapping <- high %>%
  full_join(medium, by = "naics") %>%
  full_join(low, by = "naics") %>%
  relocate(naics)
| 110 | + |
| 111 | +#write_csv(mapping, here::here("temp","mapping.csv")) |
| 112 | + |
# Read in the RTRA csv data and join with the mapping file to attach the
# aggregation info (inner join drops codes not in the mapping) --------------
ftpt <- read_naics("ftptemp4digNAICS", ftpt) %>%
  inner_join(mapping, by = "naics")
status <- read_naics("lfsstat4digNAICS", lf_stat) %>%
  inner_join(mapping, by = "naics")
all_data <- bind_rows(ftpt, status)
| 119 | + |
# Data is zero padded to the end of the current year; find the last month
# that actually contains (non-zero) data.
max_date <- all_data %>%
  group_by(date, name) %>%
  summarize(value = mean(value), .groups = "drop") %>%
  filter(!near(value, 0)) %>%
  slice_max(date) %>% # keep all rows tied at the latest non-zero date
  pull(date) %>%
  unique()

# discard the zero padding
truncated <- all_data %>%
  filter(date <= max_date)
| 132 | + |
# The output file has dates as column headings; build the labels needed.
last_date <- max(truncated$date)
current        <- format(last_date, "%b-%y")
previous_month <- format(last_date - months(1), "%b-%y")
previous_year  <- format(last_date - years(1), "%b-%y")

# Aggregate the data to the three levels ------------------------------------
high_agg   <- agg_level(truncated, high)
medium_agg <- agg_level(truncated, medium)
low_agg    <- agg_level(truncated, low)
| 141 | + |
# Bind the 3 levels of aggregation together, then derive extra series.
# NOTE(review): the smoothing steps are currently disabled, so "smoothed"
# data is in fact unsmoothed — kept as-is to preserve behavior.
smoothed_data <- bind_rows(high_agg, medium_agg, low_agg) %>%
  na.omit() %>%
  mutate(
    # data = map(data, stl_smooth),                    # STL alternative (unused)
    # data = map(data, trail_ma, months = ma_months),  # moving average (unused)
    data = map(data, add_vars) # add labour force and unemployment rate
  )
| 148 | + |
# Attach the layout info and reshape so each (industry, series) row holds a
# wide data frame with one column per date.
smoothed_with_mapping <- smoothed_data %>%
  full_join(agg, by = c("agg_level" = "industry")) %>%
  mutate(data = map(data, na.omit)) %>%
  unnest(data) %>%
  group_by(agg_level, high, medium, low, name) %>%
  nest() %>%
  mutate(
    # "unemployment_rate" -> "Unemployment Rate", etc.
    name = str_to_title(str_replace_all(name, "_", " ")),
    data = map(data, pivot_wider, names_from = "date", values_from = "value")
  )

write_rds(smoothed_with_mapping, here::here("temp", "smoothed_with_mapping.rds"))
| 158 | + |
# Time-series features to retain for the principal components analysis.
keep_list <- c(
  "agg_level",
  "trend_strength",
  "seasonal_strength_year",
  "spikiness",
  "linearity",
  "curvature",
  "shift_level_max",
  "spectral_entropy",
  "coef_hurst"
)

# For each series, compute feasts features for the high-level industries only
# and run PCA on the resulting (scaled) feature matrix.
for_pca <- smoothed_with_mapping %>%
  unnest(data) %>%
  pivot_longer(
    cols = -c(agg_level, name, high, medium, low),
    names_to = "date",
    values_to = "value"
  ) %>%
  filter(agg_level == high) %>% # keep only rows that ARE a high-level aggregate
  ungroup() %>%
  select(-high, -medium, -low) %>%
  mutate(date = tsibble::yearmonth(date)) %>%
  group_by(name) %>%
  nest() %>%
  mutate(
    data = map(data, tsibble::tsibble, key = agg_level, index = date),
    features = map(data, function(tsbbl) {
      tsbbl %>% features(value, feature_set(pkgs = "feasts"))
    }),
    features = map(features, select, all_of(keep_list)),
    features = map(features, column_to_rownames, var = "agg_level"),
    features = map(features, fix_column_names), # project helper (R/functions.R)
    pcs = map(features, prcomp, scale = TRUE)
  )

write_rds(for_pca, here::here("temp", "for_pca.rds"))
| 188 | + |
# Pull point-in-time values out of each series and compute changes ----------
no_format <- smoothed_data %>%
  mutate(
    current          = map(data, get_smoothed, 0),  # latest value
    last_month       = map(data, get_smoothed, 1),  # one month back
    last_year        = map(data, get_smoothed, 12), # twelve months back
    current_ytd_ave  = map(data, ytd_ave, 0), # this year's year-to-date average
    previous_ytd_ave = map(data, ytd_ave, 1)  # last year's year-to-date average
  ) %>%
  select(-data) %>%
  # Join the five small frames together BEFORE unnesting; unnesting them
  # individually would create a sparse data frame. The join order below
  # determines the .x/.y suffixes fixed in rename() — do not reorder.
  mutate(
    combined = map2(current, last_month, full_join, by = "name"),
    combined = map2(combined, last_year, full_join, by = "name"),
    combined = map2(combined, current_ytd_ave, full_join, by = "name"),
    combined = map2(combined, previous_ytd_ave, full_join, by = "name")
  ) %>%
  select(agg_level, combined) %>%
  unnest(combined) %>%
  rename(
    current = value.x, # fix the suffixes produced by the joins above
    previous_month = value.y,
    previous_year = value,
    current_ytd_average = ytd_ave.x,
    previous_ytd_average = ytd_ave.y
  ) %>%
  mutate(
    # level and percent changes, year over year / month over month / ytd
    level_change_year = current - previous_year,
    level_change_month = current - previous_month,
    level_change_ytd = current_ytd_average - previous_ytd_average,
    percent_change_year = level_change_year / previous_year,
    percent_change_month = level_change_month / previous_month,
    percent_change_ytd = level_change_ytd / previous_ytd_average
  )
| 219 | + |
| 220 | + |
# Save the unformatted values (with layout info attached) for plotting.
for_plots <- no_format %>%
  full_join(agg, by = c("agg_level" = "industry")) %>%
  ungroup() %>%
  group_by(agg_level, high, medium, low, name) %>%
  nest() %>%
  mutate(name = str_to_title(str_replace_all(name, "_", " ")))

write_rds(for_plots, here::here("temp", "for_plots.rds"))
| 227 | + |
# Formatting the output for excel -------------------------------------------

#' Format a column of values for display.
#'
#' Unemployment-rate rows are shown as percentages; everything else as
#' comma-formatted levels. When `suppress` is TRUE, levels below the
#' confidentiality threshold are replaced with "suppressed".
#' (Replaces eight copy-pasted case_when/if_else blocks.)
#'
#' @param x numeric vector to format.
#' @param name series-name vector, same length as `x`.
#' @param suppress should small levels be suppressed? (point-in-time levels
#'   are; changes and averages are not, matching the original behavior)
#' @param threshold suppression cut-off, default 1500.
#' @return a character vector.
format_value <- function(x, name, suppress = FALSE, threshold = 1500) {
  case_when(
    name == "unemployment_rate" ~ percent(x, accuracy = accuracy_small),
    suppress & x < threshold ~ "suppressed",
    TRUE ~ comma(x, accuracy = accuracy_large)
  )
}

with_formatting <- no_format %>%
  mutate(
    percent_change_year = percent(percent_change_year, accuracy = accuracy_small),
    percent_change_month = percent(percent_change_month, accuracy = accuracy_small),
    percent_change_ytd = percent(percent_change_ytd, accuracy = accuracy_small),
    # point-in-time levels are subject to suppression...
    across(
      c(current, previous_year, previous_month),
      ~ format_value(.x, name, suppress = TRUE)
    ),
    # ...changes and year-to-date averages are not
    across(
      c(level_change_year, level_change_month, level_change_ytd,
        current_ytd_average, previous_ytd_average),
      ~ format_value(.x, name)
    )
  ) %>%
  left_join(agg, by = c("agg_level" = "industry")) %>% # industry -> 3 levels
  # prefix "1" so high-level industries sort to the top of their medium group
  mutate(medium = ifelse(agg_level == high, paste0("1", medium), medium)) %>%
  group_by(high) %>%
  nest() %>%
  mutate(
    data = map(data, arrange, name, medium), # sort by medium (high rows first)
    data = map(data, unfill_var, name),      # blank repeated values (excel)
    data = map(data, indent_industry),       # indent industry to show hierarchy
    data = map(data, select, -medium, -low), # drop aggregation helper columns
    data = map(data, clean_up)               # desired column names and order
  ) %>%
  filter(!is.na(high))

write_rds(with_formatting, here::here("temp", "for_tables.rds"))
| 272 | + |
# Write to excel -------------------------------------------------------------
wb <- loadWorkbook(here::here("data", "template.xlsx")) # has the sheet header

createSheet(wb, name = "Mapping for humans")
setColumnWidth(wb, sheet = "Mapping for humans", column = 1:2, width = c(24000, 7000))
writeWorksheet(wb, human_mapping, sheet = "Mapping for humans")

createSheet(wb, name = "Mapping for machines")
setColumnWidth(wb, sheet = "Mapping for machines", column = 2:4, width = c(16000, 24000, 16000))
writeWorksheet(wb, mapping, sheet = "Mapping for machines")

# Replicate the template sheet once per high-level industry and write each
# group's data to it. walk2() is called directly for its side effects; the
# original wrapped this in mutate(), which needlessly built (and printed) a
# column from walk2's invisible return value.
walk2(with_formatting$data, with_formatting$high, write_sheet)

removeSheet(wb, "layout") # get rid of the template sheet
saveWorkbook(wb, here::here("out", "current", "industry_snapshots.xlsx")) # write to file
tictoc::toc()
| 286 | + |
0 commit comments