|
| 1 | +# DC Data Kind |
| 2 | +# Analyst - Anirban Ghosh |
| 3 | +# Used R |
# Kept only rows where known_valid == "True" (i.e. dropped all records not known to be valid)
| 5 | + |
| 6 | + |
| 7 | +# Aim - |
| 8 | +# Create a dataset with one row per: |
| 9 | +# 1. establishment type |
| 10 | +# 2. risk category, |
| 11 | +# 3. each week, |
| 12 | +# 4. year, |
| 13 | +# 5. census block |
| 14 | + |
| 15 | + |
| 16 | +# We need the following features in the dataset |
| 17 | +# feature_id: The ID for the feature, in this case, "food_service_establishments" <- generate |
| 18 | +# feature_type: The establishment_type from the restaurant data set |
| 19 | +# feature_subtype: The risk_category from 1-5 |
| 20 | +# year: The ISO-8601 year of the feature value |
| 21 | +# week: The ISO-8601 week number of the feature value |
| 22 | +# census_block_2010: The 2010 Census Block of the feature value |
| 23 | +# value: The value of the feature, i.e. |
| 24 | +# the number of food service establishments with the given |
| 25 | +# types and risk categories in the specified week, year, and census block. |
| 26 | + |
| 27 | +setwd("C://Users//aghosh//Desktop//Temp//2017 09 DataKindDC//Restaurant Inspections//Scraped Restaurant Inspection Data To Be Validated") |
| 28 | + |
| 29 | +library(tidyverse) |
| 30 | +df <- read.csv("potential_inspection_summary_data.csv") |
| 31 | + |
| 32 | +# Reviewing the data. |
| 33 | + |
| 34 | +str(df) |
| 35 | +names(df) |
| 36 | +head(df, 10) |
| 37 | + |
# Check how the inspection date was read in.
# BUG FIX: the original read class(dfinspection_date), which references a
# non-existent object — the `$` between df and inspection_date was missing.
class(df$inspection_date)
# This is a factor. Should recode as date.
# Variables of interest:
#   1. establishment_type
#   2. risk_category
#   3. inspection_date - used to generate week and year
#
# Sum the number of establishments.
#
# Not in this data: the census block. Need to merge in geocode information.
| 48 | + |
# Count distinct rows to check for duplicates.
nrow(distinct(df))
# Looks like one duplicate.

# Drop the duplicated observation.
df1 <- unique(df)

# inspection_date was read as a factor; convert it to a proper Date.
df1$inspect_date <- parse_date(as.character(df1$inspection_date))
| 56 | + |
library(lubridate)

# The spec calls for the ISO-8601 week and year. lubridate::week() counts
# 7-day periods from Jan 1 (NOT ISO weeks) and year() is the calendar
# year, which disagrees with the ISO week around year boundaries. Use
# isoweek()/isoyear(), which are consistent with each other.
df2 <- df1 %>%
  mutate(
    week = isoweek(inspect_date),
    year = isoyear(inspect_date)
  )
| 63 | + |
| 64 | + |
# Getting the lat long data ----------------------------------------------
setwd("C://Users//aghosh//Desktop//Temp//2017 09 DataKindDC//Restaurant Inspections//Updated Geocodes")
geocodes <- read.csv("inspection_geocodes.csv")

View(geocodes)

# Attach coordinates to the cleaned inspection data.
df3 <- inner_join(geocodes, df2, by = "inspection_id")
# There is a one-to-one join between the geocodes and the cleaned dataset.
| 73 | + |
| 74 | + |
| 75 | +# Load Census Tract ------------------------------------------------------- |
| 76 | + |
| 77 | +#https://stackoverflow.com/questions/29872109/binning-longitude-latitude-labeled-data-by-census-block-id |
| 78 | + |
# Install the spatial packages only if they are missing; an unconditional
# install.packages() re-downloads and reinstalls on every run.
# NOTE(review): rgdal has since been archived on CRAN (2023); consider
# migrating this workflow to sf/terra.
for (pkg in c("rgdal", "raster")) {
  if (!requireNamespace(pkg, quietly = TRUE)) {
    install.packages(pkg, dependencies = TRUE, repos = "http://cran.rstudio.com/")
  }
}

library(rgdal)
library(sp)
library(raster)
library(readr)
library(dplyr)
| 87 | + |
#####
# Look up the 2010 Census block attributes for each point in `data`.
#
# Reads the DC census-block shapefile, reprojects it to WGS84, and does a
# point-in-polygon overlay for the coordinates held in the columns named
# by `lat_col_name` / `lon_col_name`.
#
# Returns the result of sp::over(): a data frame with one row per input
# row (in the same order), holding the attributes of the census block
# containing each point. FIX: the original comment claimed the input data
# frame was returned with a 'census_block' column appended — it is not;
# the caller joins the two results by row position.
#
add_census_block_data <- function(data, lat_col_name = "latitude", lon_col_name = "longitude") {

  census_block_data <- shapefile("dc_2010_block_shapefiles/tl_2016_11_tabblock10.shp")
  census_block_data <- spTransform(census_block_data, CRSobj = CRS("+proj=longlat +datum=WGS84"))

  # seq_len() is safe on an empty data frame (1:nrow would give c(1, 0)).
  row_numbers <- seq_len(nrow(data))  # used as a unique id

  # data_frame() is deprecated in the tidyverse; a plain data.frame is
  # what SpatialPointsDataFrame expects anyway.
  data_spatial <- SpatialPointsDataFrame(
    coords = data[, c(lon_col_name, lat_col_name)],
    data = data.frame(row_number = row_numbers),
    proj4string = CRS("+proj=longlat +datum=WGS84")
  )

  # Point-in-polygon overlay; return the matched block attributes
  # explicitly (the original relied on the invisible value of an
  # assignment as the function's last expression).
  over(x = data_spatial, y = census_block_data)
}
| 105 | + |
blocks <- add_census_block_data(df3, "latitude", "longitude")

# Attach a shared positional key to both data frames so they can be
# joined. Joining by row position is fragile in general, but over()
# returns exactly one row per input row in input order, so it lines up.
# FIX: use seq_len(nrow(x)) instead of 1:nrow(x) — the latter yields
# c(1, 0) when a data frame is empty.
blocks1 <- blocks
blocks1$row_numbers <- seq_len(nrow(blocks1))
df4 <- df3
df4$row_numbers <- seq_len(nrow(df4))

df5 <- inner_join(df4, blocks1, by = "row_numbers")
| 115 | + |
# Keep only validated records and tag every row with the feature ID.
# known_valid arrives as a factor, so compare on its character form.
df6 <- df5 %>%
  filter(as.character(known_valid) == "True") %>%
  mutate(
    known_valid = as.character(known_valid),
    feature_id = "food_service_establishments"
  )
| 124 | + |
| 125 | + |
| 126 | + |
# df6 has all the data i need --------------------------------------------

# Aggregate: one row per establishment type x risk category x week/year x
# census block, counting establishments in each cell.
# NOTE(review): BLOCKCE10 is only the 4-digit block code within a tract,
# not the full 15-digit GEOID10; if census_block_2010 must be globally
# unique, confirm whether GEOID10 should be used instead.
final <- df6 %>%
  group_by(
    establishment_type,
    risk_category,
    week,
    year,
    BLOCKCE10,
    feature_id
  ) %>%
  summarise(
    value = n()
  ) %>%
  # FIX: summarise() leaves the result grouped by the remaining variables;
  # drop the grouping metadata so it is not persisted in the RDS output.
  ungroup() %>%
  rename(
    feature_type = establishment_type,
    feature_subtype = risk_category,
    census_block_2010 = BLOCKCE10
  )

write_rds(final, "Number_of_Establishments.rds")
write_csv(final, "Number_of_Establishments.csv")
| 149 | + |
| 150 | +# End of Issue 20 -------------------------------------------------------- |
| 151 | + |
| 152 | + |
0 commit comments