-
Notifications
You must be signed in to change notification settings - Fork 62
/
Copy pathOxford.Rmd
134 lines (99 loc) · 3.76 KB
/
Oxford.Rmd
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
---
title: "Reformatting the Oxford Data"
output: html_document
---
```{r setup, include=FALSE}
# Default for every later chunk: show the chunk's source code in the knitted output.
knitr::opts_chunk$set(echo = TRUE)
```
```{r }
# Load the libraries
library(arules)
library(arulesViz)
library(dplyr)
library(tidyr)
```
# Processing the "Connected Customers" files
First, we need to load the data in its original (wide) format.
```{r}
# Directory that holds the Oxford CSV exports; edit this for your machine.
# Keep the trailing slash: file names are appended to it directly.
path <- "C:/Users/st50/Desktop/Oxford/"

# Read the wide-format "connected customers" export, treating both empty
# strings and the literal "NA" as missing values.
connected_file <- paste0(path, "November connected Customers.csv")
df <- read.csv(connected_file, na.strings = c("", "NA"))

# Inspect the column names and types before reshaping.
str(df)
```
Now, reformat into skinny (tidy) format, for ease of use.
```{r}
# Reshape the wide customer table into tidy form. A wide row such as
#
# top_space_1_name top_space_1_dwell_min top_space_2_name top_space_2_dwell_min top_space_3_name ...
# La Senza 44 <NA> NA ...
#
# is first melted into long key/value records
#
# ID space type value
# 1 top_space_1 name La Senza
# 1 top_space_1 dwell_min 44
#
# and then spread so that `name` and `dwell_min` become columns again,
# giving one row per (ID, space).

# The first six columns hold demographic data. They are dropped here and can
# be joined back later through the row-based ID if necessary.
visits_wide <- df[, -seq_len(6)]

visits_long <- visits_wide %>%
  mutate(ID = row_number()) %>%
  select(ID, everything()) %>%
  gather(key, value, -ID) %>%
  mutate(
    # "top_space_1_name" -> "top_space_1"
    space = gsub("_name|_dwell_min", "", key),
    # "top_space_1_name" -> "name"
    type = gsub("top_space_\\d+_", "", key)
  ) %>%
  filter(!is.na(value)) %>%
  arrange(ID) %>%
  select(ID, space, type, value)

# convert = TRUE re-infers column types after spreading the mixed-type
# `value` column.
tidy <- spread(visits_long, type, value, convert = TRUE)

# Keep only visits of at least five minutes, as suggested by Manil.
# (Add further filters here if you want.)
tidy.filtered <- filter(tidy, dwell_min >= 5)
head(tidy.filtered)

# read.transactions() reads from a file, so write the tidy table to a CSV
# first and read it straight back.
tidy_file <- paste0(path, "November_connected_Customers_tidy.csv")
write.csv(tidy.filtered, tidy_file)

# "single" format: one (transaction id, item) pair per line, taken from the
# ID and name columns.
tidy.trans <- read.transactions(
  tidy_file,
  format = "single",
  sep = ",",
  cols = c("ID", "name")
)
# Done: tidy.trans can now be used with arules and related packages.
```
# Processing the "non-connected customers" file
```{r}
# Directory that holds the Oxford CSV exports; edit this for your machine.
# Keep the trailing slash: file names are appended to it directly.
path <- "C:/Users/st50/Desktop/Oxford/"

# Read the wide-format customer-journey export, treating both empty strings
# and the literal "NA" as missing values.
journey_file <- paste0(path, "Customer Journey for Month of December.csv")
df <- read.csv(journey_file, na.strings = c("", "NA"))

# Inspect the column names and types before reshaping.
str(df)
```
Now, reformat into skinny (tidy) format, for ease of use.
```{r}
# Reshape the wide customer-journey table into tidy form: one row per
# (ID, space), with Space.Dwell, Space.Name and Space.Type as columns.

# dplyr is already attached by the first chunk; both calls are kept so this
# chunk's own dependencies stay explicit.
library(dplyr)
library(tidyverse)

# Work on a sample for now; df.all keeps a backup of the entire table.
# head() is used rather than df.all[1:10000, ] because indexing past the last
# row pads the result with all-NA rows when the file has fewer than 10000 rows.
df.all <- df
df <- head(df.all, 10000)

tidy <- df %>%
  mutate(ID = row_number()) %>%
  select(ID, everything()) %>%
  # NOTE(review): X is presumably the unnamed index column of the CSV — confirm.
  select(-X) %>%
  # Give the unsuffixed columns an explicit ".0" so that every space column
  # follows the same <field>.<n> naming pattern before melting.
  rename(
    Space.Name.0 = Space.Name,
    Space.Type.0 = Space.Type,
    Space.Dwell.0 = Space.Dwell
  ) %>%
  gather(key, value, -ID, -Date, -Encrypted.Mac) %>%
  mutate(
    # "Space.Name.3" -> "3"
    space = gsub("Space.Name.|Space.Type.|Space.Dwell.", "", key),
    # "Space.Name.3" -> "Space.Name" (the dot before the number is removed too)
    type = gsub("\\.\\d+", "", key)
  ) %>%
  filter(!is.na(value)) %>%
  arrange(ID) %>%
  select(ID, Date, Encrypted.Mac, space, type, value) %>%
  # convert = TRUE re-infers column types after spreading the mixed-type
  # `value` column.
  spread(type, value, convert = TRUE) %>%
  filter(!is.na(Space.Dwell))
head(tidy, n = 200)
```
```{r}
# Keep only visits of at least five minutes, as suggested by Manil.
# (Add further filters here if you want.)
# Column names carry no trailing dot: the reshaping step strips the numeric
# suffix together with its dot, so the columns of `tidy` are Space.Dwell,
# Space.Name and Space.Type.
tidy.filtered <- tidy %>%
  filter(Space.Dwell >= 5)
head(tidy.filtered)

# read.transactions() reads from a file, so write the tidy table to a CSV
# first and read it straight back.
journey_tidy_file <- paste0(
  path,
  "Customer_Journey_for_Month_of_December_tidy.csv"
)
write.csv(tidy.filtered, journey_tidy_file)

# "single" format: one (transaction id, item) pair per line, taken from the
# ID and Space.Name columns.
tidy.trans <- read.transactions(
  journey_tidy_file,
  format = "single",
  sep = ",",
  cols = c("ID", "Space.Name")
)

# Done: tidy.trans can now be used with arules and related packages.
tidy.trans
```