Skip to content

Commit dbfe041

Browse files
Merge branch 'main' into repo-mountie/add-license
2 parents 135167b + 6525cae commit dbfe041

16 files changed

+1524
-4
lines changed

00_source_me.R

+52
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,52 @@
1+
# Copyright 2022 Province of British Columbia
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and limitations under the License.
12+
13+
#' TO DO before sourcing this file:
14+
#'
15+
#' Get an RTRA account. (application form in directory `SAS`)
16+
#' upload the 2 .SAS files in directory `SAS` to https://www75.statcan.gc.ca/eft-tef/en/operations (To StatCan).
17+
#' grab a coffee...
18+
#' download the 2 resulting csv files (From StatCan) and place in directory "data/current".
19+
20+
#' Note that Jan 2026 the SAS files will need to be updated.
21+
#'
22+
#' Output found in directory out/current.
23+
24+
library(assertthat)
library(here)

# Are the required input files where they are supposed to be? ------------------
assert_that(length(list.files(here("data","current"), pattern="ftptemp4digNAICS"))==2,
            msg="2 files with the pattern ftptemp4digNAICS must be in folder data/current")
assert_that(length(list.files(here("data","current"), pattern="lfsstat4digNAICS"))==2,
            msg="2 files with the pattern lfsstat4digNAICS must be in folder data/current")
assert_that(length(list.files(here("data"), pattern="mapping.xlsx"))==1,
            msg="The file mapping.xlsx must be in folder data")
assert_that(length(list.files(here("data"), pattern="template"))==1,
            msg="The file template.xlsx must be in folder data")

# Archive old output -----------------------------------------------------------
filesstrings::file.move(here("out","current", list.files(here("out", "current"))),
                        here("out", "old"))

# Create new output ------------------------------------------------------------
source("01_process_data.R")
# FIX: the original file name used base date() (which contains spaces AND
# colons -- colons are invalid in Windows file names) and called
# str_replace_all() without stringr being loaded in this script.
# Sys.Date() yields a clean ISO date (e.g. 2022-11-04) that needs no scrubbing.
rmarkdown::render("02_dashboard.Rmd",
                  output_file = paste0("LFS_industry_profiles_", Sys.Date(), ".html"),
                  output_dir = here::here("out","current"))
44+
45+
46+
#archive ita input files--------
47+
#filesstrings::file.move(here("data","current_ita", list.files(here("data", "current_ita"))), here("data", "old_ita"))
48+
49+
50+
51+
52+

01_process_data.R

+286
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,286 @@
1+
# Copyright 2022 Province of British Columbia
2+
#
3+
# Licensed under the Apache License, Version 2.0 (the "License");
4+
# you may not use this file except in compliance with the License.
5+
# You may obtain a copy of the License at
6+
#
7+
# http://www.apache.org/licenses/LICENSE-2.0
8+
#
9+
# Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and limitations under the License.
12+
13+
#' TO DO before sourcing this file:
14+
#'
15+
#' Get an RTRA account. (application form in directory `SAS`)
16+
#' upload the 2 .SAS files in directory `SAS` to https://www75.statcan.gc.ca/eft-tef/en/operations (To StatCan).
17+
#' grab a coffee...
18+
#' download the 2 resulting csv files (From StatCan) and place in directory "data/current".
19+
20+
#' Note that Jan 2026 the SAS files will need to be updated.
21+
tictoc::tic() # time the whole script; matched by tictoc::toc() at the end

# Libraries --------------------------------------------------------------------
library(tidyverse)
library(lubridate)
library(readxl)
library(XLConnect)
library(scales)
library(fpp3)

# Constants --------------------------------------------------------------------
#ma_months <- 3 #how many months to use for smoothing the data (smoothing currently disabled)
accuracy_large <- 100 #levels rounded to nearest hundred
accuracy_small <- .1 #percentages rounded to nearest tenth

# Functions --------------------------------------------------------------------
# Project helpers (read_naics, agg_level, separate_naics, fill_wrapper, etc.)
source(here::here("R", "functions.R"))
35+
# Start by creating a mapping file from naics to various levels of aggregation----------------
# input file mapping.xlsx uses leading spaces to indicate hierarchy....
# trim_ws = FALSE is essential here: the leading spaces ARE the data.
human_mapping <- read_excel(here::here("data", "mapping.xlsx"), trim_ws = FALSE)

raw_mapping <- human_mapping%>%
  janitor::clean_names() %>%
  mutate(
    # "\\G " anchors each match to the end of the previous one, so this counts
    # only the CONSECUTIVE leading spaces, not every space in the name.
    spaces = str_count(industry, "\\G "),
    # Map indent depth to aggregation level.
    # NOTE(review): 1 or >6 leading spaces would fall through to NA here --
    # presumably mapping.xlsx never uses those depths; verify when it changes.
    agg = case_when(
      spaces == 0 ~ "high",
      spaces %in% 2:4 ~ "medium",
      spaces %in% 5:6 ~ "low"
    ),
    industry = trimws(industry) # indent has served its purpose; drop it
  )
#relationship between each industry and the three levels of aggregation (used for excel layout)------------
# Each row gets the high/medium aggregate it sits under (filled downward);
# `low` is deliberately NOT filled -- it only applies to its own row.
agg <- raw_mapping %>%
  select(-naics) %>%
  mutate(
    high = if_else(agg == "high", industry, NA_character_),
    medium = if_else(agg %in% c("high", "medium"), industry, NA_character_),
    low = if_else(agg == "low", industry, NA_character_)
  ) %>%
  fill(high, .direction = "down") %>%
  fill(medium, .direction = "down") %>%
  select(industry, high, medium, low)
61+
62+
#write_csv(agg, here::here("temp","layout.csv"))
63+
64+
65+
# Get the naics codes for the industries at one aggregation level --------------
# The original repeated this pipeline verbatim for "low", "medium" and "high";
# it is factored into a single helper. `separate_naics` and `fill_wrapper`
# come from R/functions.R (sourced at the top of this script).
#
# `level` is one of "low", "medium", "high"; the result has a column named
# after the level plus one row per naics code.
naics_for_level <- function(mapping_df, level) {
  mapping_df %>%
    filter(agg == level) %>%
    select(industry, naics) %>%
    rename("{level}" := industry) %>%  # dynamic column name, e.g. `low`
    group_by(.data[[level]]) %>%
    nest() %>%
    mutate(
      data = map(data, separate_naics),
      data = map(data, fill_wrapper)
    ) %>%
    unnest(data) %>%
    unnest(naics)
}

low <- naics_for_level(raw_mapping, "low")
medium <- naics_for_level(raw_mapping, "medium")
high <- naics_for_level(raw_mapping, "high")

# Join by naics to get the mapping file from naics to the 3 levels of aggregation.
mapping <- high %>%
  full_join(medium, by = "naics") %>%
  full_join(low, by = "naics") %>%
  select(naics, everything())
110+
111+
#write_csv(mapping, here::here("temp","mapping.csv"))
112+
113+
# read in the data and join with mapping file to get aggregation info -------------------
# read_naics() is a project helper (R/functions.R); the second argument is
# passed unquoted -- NOTE(review): presumably an NSE column/label argument,
# confirm against the helper's definition.
ftpt <- read_naics("ftptemp4digNAICS", ftpt)%>%
  inner_join(mapping, by = "naics")
status <- read_naics("lfsstat4digNAICS", lf_stat)%>%
  inner_join(mapping, by = "naics")
all_data <- bind_rows(ftpt, status)

#data is zero padded to the end of the current year... figure out the last month from data.
# i.e. the latest date whose mean value across all series is not (near) zero.
max_date <- all_data%>%
  group_by(date, name)%>%
  summarize(value=mean(value))%>%
  ungroup()%>%
  filter(!near(value, 0))%>%      # drop the zero-padded future months
  filter(date==max(date))%>%
  pull(date)%>%
  unique()

# Keep only the months that actually contain data.
truncated <- all_data%>%
  filter(date <= max_date)
132+
133+
# output file has dates as column headings... get the necessary dates-----------
# Formatted as e.g. "Nov-22"; used downstream (e.g. by the dashboard/tables).
current <- format(max(truncated$date), "%b-%y")
previous_month <- format(max(truncated$date) - months(1), "%b-%y")
previous_year <- format(max(truncated$date) - years(1), "%b-%y")
# aggregate the data to the three levels-------------
# agg_level() is a project helper (R/functions.R); second argument is the
# unquoted aggregation column.
high_agg <- agg_level(truncated, high)
medium_agg <- agg_level(truncated, medium)
low_agg <- agg_level(truncated, low)

# bind the 3 levels of aggregation together then...
# NOTE(review): despite the name, no smoothing is currently applied -- both
# smoothers below are commented out; only add_vars() runs.
smoothed_data <- bind_rows(high_agg, medium_agg, low_agg)%>%
  na.omit() %>%
  mutate(#data = map(data, stl_smooth), #thought this might be better than simple moving average...
         #data = map(data, trail_ma, months = ma_months), # simple moving average smooth of data
         data = map(data, add_vars)) #add in labour force and unemployment rate
148+
149+
# Attach the aggregation hierarchy, re-nest one time series per
# industry/measure, prettify the measure names ("unemployment_rate" ->
# "Unemployment Rate"), and spread dates across columns (excel-style layout).
smoothed_with_mapping <- full_join(smoothed_data, agg, by=c("agg_level"="industry"))%>%
  mutate(data=map(data, na.omit))%>%
  unnest(data)%>%
  group_by(agg_level, high, medium, low, name)%>%
  nest()%>%
  mutate(name=str_to_title(str_replace_all(name, "_", " ")),
         data=map(data, pivot_wider, names_from="date", values_from="value"))

# Cached for the dashboard (02_dashboard.Rmd reads from temp/).
write_rds(smoothed_with_mapping, here::here("temp","smoothed_with_mapping.rds"))
158+
159+
# Time-series features (from feasts::feature_set) retained for the PCA.
keep_list <- c("agg_level",
               "trend_strength",
               "seasonal_strength_year",
               "spikiness",
               "linearity",
               "curvature",
               "shift_level_max",
               "spectral_entropy",
               "coef_hurst"
)

# For each measure: keep only the high-level industries, compute time-series
# features per industry, and run a principal component analysis on them.
for_pca <- smoothed_with_mapping %>%
  unnest(data)%>%
  pivot_longer(cols=-c(agg_level, name,high,medium,low), names_to = "date", values_to = "value")%>%
  filter(agg_level==high)%>%   # high-level industries only (agg_level equals its own high)
  ungroup()%>%
  select(-high, -medium, -low)%>%
  mutate(date=tsibble::yearmonth(date))%>%
  group_by(name)%>%
  nest()%>%
  mutate(data=map(data, tsibble::tsibble, key=agg_level, index=date),
         # all feasts features, then trimmed down to keep_list
         features=map(data, function(tsbbl) tsbbl %>% features(value, feature_set(pkgs = "feasts"))),
         features=map(features, select, all_of(keep_list)),
         features=map(features, column_to_rownames, var="agg_level"),
         features=map(features, fix_column_names), # project helper (R/functions.R)
         pcs=map(features, prcomp, scale=TRUE)     # scale: features have different units
  )

# Cached for the dashboard.
write_rds(for_pca, here::here("temp","for_pca.rds"))
188+
189+
# Unformatted (numeric) summary table: current / previous-month / previous-year
# levels plus year-to-date averages and the changes between them.
# get_smoothed() and ytd_ave() are project helpers (R/functions.R); their
# second argument is a lag in months (0 = current).
no_format <- smoothed_data %>%
  mutate(current = map(data, get_smoothed, 0), # get current value of smoothed data
         last_month = map(data, get_smoothed, 1),
         last_year = map(data, get_smoothed, 12),
         current_ytd_ave = map(data, ytd_ave, 0), # year to date average of smoothed data
         previous_ytd_ave = map(data, ytd_ave, 1)
  )%>%
  select(-data)%>%
  mutate(#join all the dataframes created above for unnesting below (unnesting individually creates sparse dataframe)
    data = map2(current, last_month, full_join, by = "name"),
    data = map2(data, last_year, full_join, by = "name"),
    data = map2(data, current_ytd_ave, full_join, by = "name"),
    data = map2(data, previous_ytd_ave, full_join, by = "name")
  ) %>%
  select(agg_level, data) %>%
  unnest(data)%>%
  rename(
    # fix the names messed up by joins above
    # CAUTION: the .x/.y suffixes depend on the exact join order above --
    # keep the two mutate blocks and this rename in sync.
    current = value.x,
    previous_month = value.y,
    previous_year = value,
    current_ytd_average = ytd_ave.x,
    previous_ytd_average = ytd_ave.y
  )%>%
  mutate(#create some variables
    level_change_year = current - previous_year,
    level_change_month = current - previous_month,
    level_change_ytd = current_ytd_average - previous_ytd_average,
    percent_change_year = level_change_year / previous_year,
    percent_change_month = level_change_month / previous_month,
    percent_change_ytd = level_change_ytd / previous_ytd_average)
219+
220+
221+
# Attach the aggregation hierarchy to the unformatted results, nest one row of
# statistics per industry/measure, and prettify the measure names for plot
# labels; cache the result for the dashboard.
for_plots <- full_join(no_format, agg, by = c("agg_level" = "industry")) %>%
  ungroup()

for_plots <- for_plots %>%
  group_by(agg_level, high, medium, low, name) %>%
  nest() %>%
  mutate(name = str_to_title(str_replace_all(name, "_", " ")))

write_rds(for_plots, here::here("temp", "for_plots.rds"))
227+
228+
# Formatting the output for excel ----------------------------------------------
# The original repeated the magic number 1500 three times and duplicated two
# formatting patterns across eight columns; both are factored out here.

# Threshold below which LFS counts are unreliable and must not be published.
suppression_cutoff <- 1500

# Format a column of levels: the unemployment rate as a percent, counts below
# the suppression cutoff as "suppressed", everything else as a rounded level.
# NOTE: case_when order matters -- the rate check must precede the cutoff check.
format_level <- function(x, name) {
  case_when(name == "unemployment_rate" ~ percent(x, accuracy = accuracy_small),
            x < suppression_cutoff ~ "suppressed",
            TRUE ~ comma(x, accuracy = accuracy_large))
}

# Format a column of changes/averages: percent for the unemployment rate,
# rounded level for everything else (no suppression on changes).
format_change <- function(x, name) {
  if_else(name == "unemployment_rate",
          percent(x, accuracy = accuracy_small),
          comma(x, accuracy = accuracy_large))
}

with_formatting <- no_format%>%
  mutate(percent_change_year = percent(percent_change_year, accuracy = accuracy_small),
         percent_change_month = percent(percent_change_month, accuracy = accuracy_small),
         percent_change_ytd = percent(percent_change_ytd, accuracy = accuracy_small),
         current = format_level(current, name),
         previous_year = format_level(previous_year, name),
         previous_month = format_level(previous_month, name),
         level_change_year = format_change(level_change_year, name),
         level_change_month = format_change(level_change_month, name),
         level_change_ytd = format_change(level_change_ytd, name),
         current_ytd_average = format_change(current_ytd_average, name),
         previous_ytd_average = format_change(previous_ytd_average, name)
  ) %>%
  left_join(agg, by = c("agg_level" = "industry"))%>% #agg is the mapping from industry to the 3 levels of aggregation
  mutate(medium = ifelse(agg_level == high, paste0("1", medium), medium)) %>%# allows high level industries to be at top of sorted medium industries.
  group_by(high) %>%
  nest() %>%
  mutate(
    data = map(data, arrange, name, medium), # arranges data by medium level of aggregation (except high level at top because of pasted 1)
    data = map(data, unfill_var, name), # replaces fixed values with blanks. (excel formatting)
    data = map(data, indent_industry), # indents industry to indicate hierarchy.
    data = map(data, select, -medium, -low), # gets rid of aggregation levels
    data = map(data, clean_up) # assigns the desired column names and puts in the correct order
  ) %>%
  filter(!is.na(high))

write_rds(with_formatting, here::here("temp","for_tables.rds"))
272+
273+
# Write to excel ----------------------------------------------------------------
wb <- loadWorkbook(here::here("data", "template.xlsx")) # get the desired sheet header
createSheet(wb, name = "Mapping for humans")
setColumnWidth(wb, sheet = "Mapping for humans", column = 1:2, width = c(24000,7000))
writeWorksheet(wb, human_mapping, sheet="Mapping for humans")
createSheet(wb, name = "Mapping for machines")
setColumnWidth(wb, sheet = "Mapping for machines", column = 2:4, width = c(16000, 24000, 16000))
writeWorksheet(wb, mapping, sheet="Mapping for machines")
# One sheet per high-level industry: write_sheet() (R/functions.R) replicates
# the template sheet and writes that industry's table to it.
# FIX: the original wrapped this side-effecting walk2() inside mutate(), which
# misuses mutate for side effects and discards the result; call walk2 directly.
walk2(with_formatting$data, with_formatting$high, write_sheet)
removeSheet(wb, "layout") # get rid of the template
saveWorkbook(wb, here::here("out", "current", "industry_snapshots.xlsx"))#write to file
tictoc::toc()
286+

0 commit comments

Comments
 (0)