Skip to content

Commit dfc07a5

Browse files
committed
Organize Files
After merging all forked branches, organized submissions by issue (if applicable) or topic. Copied additional uploads from Dropbox.
1 parent 29e285a commit dfc07a5

38 files changed

+375850
-0
lines changed
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.
File renamed without changes.

issue_20/AG_dropbox_upload/Number_of_Establishments.csv

+27,211
Large diffs are not rendered by default.
Binary file not shown.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,152 @@
# DC Data Kind
# Analyst - Anirban Ghosh
# Used R
# Filtered out all "known_valid != True"

# Aim ----
# Build a dataset with one row per:
#   1. establishment type
#   2. risk category
#   3. week
#   4. year
#   5. census block
#
# Required features:
#   feature_id        - the feature ID, here "food_service_establishments" (generated)
#   feature_type      - establishment_type from the restaurant data set
#   feature_subtype   - risk_category from 1-5
#   year              - the ISO-8601 year of the feature value
#   week              - the ISO-8601 week number of the feature value
#   census_block_2010 - the 2010 Census Block of the feature value
#   value             - the number of food service establishments with the given
#                       types and risk categories in the specified week, year,
#                       and census block

library(tidyverse)

# NOTE(review): hard-coded setwd() ties the script to one machine; consider
# relative paths or a project-root helper instead.
setwd("C://Users//aghosh//Desktop//Temp//2017 09 DataKindDC//Restaurant Inspections//Scraped Restaurant Inspection Data To Be Validated")

df <- read.csv("potential_inspection_summary_data.csv")

# First look at the raw data ----
str(df)
names(df)
head(df, 10)
# Check how inspection_date was read in. The original line referenced the
# undefined object `dfinspection_date` (missing `$`) and would error.
class(df$inspection_date)
# This is a factor. Should recode as date.

# Variables of interest:
# 1. establishment_type
# 2. risk_category
# 3. inspection_date - use to generate week and year
#
# Sum the number of establishments.
#
# Not in this data - the census block. Need to merge in geocode information.
nrow(distinct(df))
# Looks like one duplicate

df1 <- df %>% distinct()
# Dropped one observation

# inspection_date is a factor; readr::parse_date() wants character input.
df1$inspect_date <- parse_date(as.character(df1$inspection_date))

library(lubridate)

# The spec asks for ISO-8601 week numbers and years. lubridate::week()/year()
# return the Jan-1-based week and the calendar year, which disagree with
# ISO-8601 around year boundaries — use isoweek()/isoyear() instead.
df2 <- df1 %>%
  mutate(
    week = isoweek(inspect_date),
    year = isoyear(inspect_date)
  )
# Getting the lat long data ----------------------------------------------

setwd("C://Users//aghosh//Desktop//Temp//2017 09 DataKindDC//Restaurant Inspections//Updated Geocodes")
latlong <- read.csv("inspection_geocodes.csv")

View(latlong)

# Attach coordinates to the cleaned inspections.
df3 <- inner_join(latlong, df2, by = "inspection_id")
# There is a one-to-one join between latlong and the cleaned dataset.
# Load Census Tract -------------------------------------------------------

# https://stackoverflow.com/questions/29872109/binning-longitude-latitude-labeled-data-by-census-block-id

# Install only when missing, so re-runs of the script are cheap and idempotent
# (the original unconditionally reinstalled both packages every run).
if (!requireNamespace("rgdal", quietly = TRUE)) {
  install.packages("rgdal", dependencies = TRUE, repos = "http://cran.rstudio.com/")
}
if (!requireNamespace("raster", quietly = TRUE)) {
  install.packages("raster", dependencies = TRUE, repos = "http://cran.rstudio.com/")
}
# NOTE(review): rgdal/sp/raster are superseded by sf/terra — worth migrating.

library(rgdal)
library(sp)
library(raster)
library(readr)
library(dplyr)

#####
# Point-in-polygon lookup against the DC 2010 census block shapefile.
#
# data:         data frame with longitude/latitude columns in WGS84.
# lat_col_name, lon_col_name: names of the coordinate columns.
#
# Returns a data frame with one row per row of `data` (in input order),
# holding the attributes of the census block polygon each point falls in
# (NA rows for points outside every polygon). Join it back to `data` by
# row position. NOTE: the original header comment claimed the input was
# returned with an extra 'census_block' column — that was never the case.
#
add_census_block_data <- function(data, lat_col_name = "latitude", lon_col_name = "longitude") {

  census_block_data <- shapefile("dc_2010_block_shapefiles/tl_2016_11_tabblock10.shp")
  census_block_data <- spTransform(census_block_data, CRSobj = CRS("+proj=longlat +datum=WGS84"))

  row_numbers <- seq_len(nrow(data))  # unique id per input row; safe for 0 rows

  data_spatial <- SpatialPointsDataFrame(
    coords = data[, c(lon_col_name, lat_col_name)],
    data = data.frame(row_number = row_numbers),  # data_frame() is deprecated
    proj4string = CRS("+proj=longlat +datum=WGS84")
  )

  # Explicit return; the original relied on the invisible value of a trailing
  # assignment, which behaves the same for callers that assign the result.
  over(x = data_spatial, y = census_block_data)
}
blocks <- add_census_block_data(df3, "latitude", "longitude")

# over() yields one row per point, in input order, so a positional key is a
# valid join key here. Merging by row number is not ideal, but there is no
# lat/long key to merge on directly.
blocks1 <- blocks
blocks1$row_numbers <- seq_len(nrow(blocks1))  # seq_len() is safe for 0 rows
df4 <- df3
df4$row_numbers <- seq_len(nrow(df4))

df5 <- inner_join(df4, blocks1, by = "row_numbers")
# Keep only validated records and tag every row with the constant feature id.
df6 <- df5 %>%
  mutate(
    known_valid = as.character(known_valid),  # factor -> character for the comparison
    feature_id = "food_service_establishments"
  ) %>%
  filter(known_valid == "True")



# df6 has all the data i need --------------------------------------------
# Count establishments per type x risk x week x year x census block, then
# rename columns to the feature-spec names.
final <- df6 %>%
  group_by(
    establishment_type,
    risk_category,
    week,
    year,
    BLOCKCE10,
    feature_id
  ) %>%
  summarise(
    value = n()
  ) %>%
  # summarise() drops only the last grouping level, so the result is still
  # grouped; ungroup before writing so no grouping metadata is serialized.
  ungroup() %>%
  rename(
    feature_type = establishment_type,
    feature_subtype = risk_category,
    census_block_2010 = BLOCKCE10
  )

write_rds(final, "Number_of_Establishments.rds")
write_csv(final, "Number_of_Establishments.csv")

# End of Issue 20 --------------------------------------------------------

0 commit comments

Comments
 (0)