-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathnetmis_rods_poulate.py
227 lines (179 loc) · 8.35 KB
/
netmis_rods_poulate.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 14 11:48:35 2016
@author: HarryMunro
This script pulls in raw NETMIS data and RODS data.
It then does the following:
- Filters out unnecessary stations from a station exclusions list.
- Calculates dwell times.
- Optionally filters dwell times.
- Excludes unwanted lines.
- Matches passenger numbers with stations and time of day.
- Adds SAF rate for each line.
- Adds train density for each line.
- Adds standing capacity for each line.
- Adds seating capacity for each line.
- Calculates availability of each line.
- Saves the new dataset.
Also populates line names.
"""
import pandas as pd
import numpy as np
import time
start_time = time.time()
netmis_file = 'November 2015 NETMIS Data Dates Version.csv'
#df = pd.read_csv(netmis_file, usecols = ['TIMESTAMP',
#'ACTUAL DEPARTURE TIME', 'SUTOR CODE', 'LINE ID', 'DIRECTION CODE'],
#nrows = 100000) # with nrows filter
df = pd.read_csv(netmis_file, usecols = ['TIMESTAMP',
'ACTUAL DEPARTURE TIME', 'SUTOR CODE', 'LINE ID', 'DIRECTION CODE']) # with nrows filter
original = len(df)
# read in the station exclusions list
exclusions_file = 'station_exclusions.txt'
exclusions_list = pd.read_table(exclusions_file, index_col = 'Sutor')
# convert to dictionary
exclusions_list = exclusions_list.to_dict()['Include']
# map the exclusions list to sutor codes
df['INCLUDE'] = df['SUTOR CODE'].map(exclusions_list)
# delete rows containing "n"
df = df[df['INCLUDE'].str.contains('n') == False]
# first need to convert to datetimes
# and drop rows where there is no dwell time data
df = df.dropna()
#df = df.dropna(subset = ['TIMESTAMP'])
#df = df.dropna(subset = ['ACTUAL DEPARTURE TIME'])
#df = df.dropna(subset = ['SUTOR CODE'])
#df = df.dropna(subset = ['LINE ID'])
#df = df.dropna(subset = ['DIRECTION CODE'])
#del df['DWELL TIME']
# convert timestamp data
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], format = '%d/%m/%Y %H:%M:%S')
df['ACTUAL DEPARTURE TIME'] = pd.to_datetime(df['ACTUAL DEPARTURE TIME'], format = '%d/%m/%Y %H:%M:%S')
# calculating dwell times - need to convert timestamp data first
df['DWELL TIME'] = df['ACTUAL DEPARTURE TIME'] - df['TIMESTAMP']
df['SUTOR DIRECTION LINE'] = df['SUTOR CODE'] + df['DIRECTION CODE'].map(str) + df['LINE ID'].map(str)
# Cleaning up again
df = df[df['DWELL TIME'] < pd.Timedelta('00:02:00')] # less than
df = df[df['DWELL TIME'] > pd.Timedelta('00:00:05')] # greater than
# deleting lines where we do not have RODs data
df = df[df['LINE ID'] != 11]
df = df[df['LINE ID'] != 14]
#names = list(df.columns.values)
col_list = ['TIMESTAMP', 'DWELL TIME', 'SUTOR CODE', 'LINE ID', 'DIRECTION CODE', 'SUTOR DIRECTION LINE']
df = df[col_list]
# convert dwell times to integers
df['DWELL TIME'] = df['DWELL TIME'].dt.seconds
df = df.reset_index() # resets the index from
# now import the Alighters data
alighters_filename = 'Alighters.csv'
alighters_df = pd.read_csv(alighters_filename, skiprows = 2)
column_headers = list(alighters_df.columns.values)
# field which column timestamp falls into
alighters = [] # empty list to subsequently merge with dataframe
errors = []
match_errors = []
resampled_column_headers = column_headers[16:]
for row in range(len(df)):
sample_time = df.TIMESTAMP[row]
h = sample_time.hour
m = sample_time.minute
# convert to strings
if h < 10:
h = '0' + str(h)
else:
h = str(h)
if m < 10:
m = '0' + str(m)
else:
m = str(m)
match = []
for n in resampled_column_headers:
x = int(n[7:9]) # this is to remove the error of int(m[7:9]) equalling 0
if x == 0:
x = 59
if n[0:2] == h and int(m) > int(n[2:4]) and int(m) <= x:
match.append(n) # match contains the column heading for the match
if len(match) > 1:
print('WARNING, MULTIPLE MATCHES FOUND')
if len(match) == 0:
match_errors.append([sample_time, row])
sample_s_d_l = df['SUTOR DIRECTION LINE'][row]
# test if sample_s_d_l exits in the alighters database
#x = alighters_df['Sutor Direction Line'] == sample_s_d_l
try:
location = alighters_df[alighters_df['Sutor Direction Line'] == sample_s_d_l].index[0]
alighters.append(alighters_df[match[0]][location])
except IndexError:
errors.append([sample_s_d_l])
alighters.append(np.nan)
df['ALIGHTERS'] = alighters
# Now join the boarders data to dataset
boarders_filename = 'Boarders.csv'
boarders_df = pd.read_csv(boarders_filename, skiprows = 2)
column_headers_boarders = list(boarders_df.columns.values)
# field which column timestamp falls into
boarders = [] # empty list to subsequently merge with dataframe
resampled_column_headers = column_headers_boarders[17:]
for row in range(len(df)):
sample_time = df.TIMESTAMP[row]
h = sample_time.hour
m = sample_time.minute
# convert to strings
if h < 10:
h = '0' + str(h)
else:
h = str(h)
if m < 10:
m = '0' + str(m)
else:
m = str(m)
match = []
for n in resampled_column_headers:
x = int(n[7:9]) # this is to remove the error of int(m[7:9]) equalling 0
if x == 0:
x = 59
if n[0:2] == h and int(m) > int(n[2:4]) and int(m) <= x:
match.append(n) # match contains the column heading for the match
if len(match) > 1:
print('WARNING, MULTIPLE MATCHES FOUND')
if len(match) == 0:
match_errors.append([sample_time, row])
sample_s_d_l = df['SUTOR DIRECTION LINE'][row]
# test if sample_s_d_l exits in the alighters database
#x = alighters_df['Sutor Direction Line'] == sample_s_d_l
try:
location = boarders_df[boarders_df['Sutor Direction Line'] == sample_s_d_l].index[0]
boarders.append(boarders_df[match[0]][location])
except IndexError:
errors.append([sample_s_d_l])
boarders.append(np.nan)
df['BOARDERS'] = boarders
df = df.dropna()
# calculate boarders and alighters
df['BOARDERS AND ALIGHTERS'] = df['BOARDERS'] + df['ALIGHTERS']
# populate with line names
line_names = {0:'Bakerloo', 2:'Central', 3:'Victoria', 4:'Metropolitan', 5:'Northern', 6:'Jubilee', 7:'Piccadilly', 8:'District', 13:'Circle Hammersmith & City'}
df['LINE NAME'] = df['LINE ID'].map(line_names)
# populate with failure data - safs per day per line
saf_rate = {'Bakerloo':6.3886121, 'Central': 14.29395018, 'Victoria': 6.844128113879, 'Metropolitan': 7.92918149466192, 'Northern': 13.0053380782918, 'Jubilee': 12.6241992882562, 'Piccadilly': 9.60747330960854, 'District': 9.30747330960854, 'Circle Hammersmith & City': 10.5996441281139}
df['SAF RATE'] = df['LINE NAME'].map(saf_rate)
# populate with train density (how many trains we have per station/km)
train_density = {'Bakerloo': 1.077586207, 'Central': 0.662162162, 'Victoria': 0.761904762, 'Metropolitan': 114.2941176, 'Northern': 122.96, 'Jubilee': 84.46666667, 'Piccadilly': 115.2075472}
df['TRAIN DENSITY'] = df['LINE NAME'].map(train_density)
# populate with standing capacity per train
standing_capacity = {'Bakerloo': 116.6, 'Central': 155.02, 'Victoria': 153.2, 'Metropolitan': 174, 'Northern': 110.36, 'Jubilee': 145.92, 'Piccadilly': 114}
df['STANDING CAPACITY'] = df['LINE NAME'].map(standing_capacity)
# populate with seating capacity per train
seating_capacity = {'Bakerloo': 268, 'Central': 272, 'Victoria': 252, 'Metropolitan': 306, 'Northern': 200, 'Jubilee': 234, 'Piccadilly': 228}
df['SEATING CAPACITY'] = df['LINE NAME'].map(seating_capacity)
# populate with line-direction availability
df['LINE DIRECTION'] = df['LINE NAME'] + df['DIRECTION CODE'].map(str)
availability = {'Bakerloo0': 0.992752, 'Bakerloo1': 0.9941, 'Central0': 0.9842, 'Central1':0.98, 'Victoria0': 0.9916, 'Victoria1': 0.9927, 'Metropolitan0': 0.9881, 'Metropolitan1': 0.9849, 'Northern0': 0.988, 'Northern1': 0.9879, 'Jubilee0': 0.9863, 'Jubilee1': 0.9884, 'Piccadilly0': 0.9885, 'Piccadilly1': 0.987}
df['LINE DIRECTION AVAILABILITY'] = df['LINE DIRECTION'].map(availability)
# Drop the district, circle and H&S
df = df.dropna()
# Data statistics
print(original - len(df))
print(len(df)/original)
print(time.time() - start_time)
df.to_csv('Populated NETMIS (including days) with RODS.csv')