resample_unique_location.py
# -*- coding: utf-8 -*-
"""
Created on Wed Dec 14 21:15:48 2016
@author: HarryMunro
This code takes in a NETMIS-ish file and resamples by unique location.
"""
# The script returns a dataframe of hourly-resampled values indexed by timestamp, with a
# 'SUTOR DIRECTION LINE' column identifying each unique location (an optional pivot to
# SUTOR codes as columns is sketched at the bottom of the file).
# It takes in some NETMIS data,
# deletes stations in the sutor code list,
# deletes rows where timestamp data is not available,
# and calculates mean dwell times per location.
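# Expected input columns (as referenced below): 'TIMESTAMP', 'DWELL TIME', 'BOARDERS',
# 'ALIGHTERS', 'SUTOR DIRECTION LINE', 'LINE DIRECTION AVAILABILITY', plus the leftover
# 'Unnamed: 0' and 'index' columns written by the earlier export.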
import pandas as pd
from datetime import datetime
import numpy as np
import time
start_time = time.time()
input_csv = 'Populated NETMIS (including days) with RODS.csv' ### TIMESTAMP column must be in the format yyyy-mm-dd hh:mm:ss
print('reading csv at time %d' % (time.time() - start_time))
#import
df = pd.read_csv(input_csv)
df = df.drop(columns=['Unnamed: 0', 'index']) # drop leftover index columns from the CSV export
# calculate total boarders and alighters
df['BOARDERS AND ALIGHTERS'] = df['BOARDERS'] + df['ALIGHTERS']
# convert timestamp data
df['TIMESTAMP'] = pd.to_datetime(df['TIMESTAMP'], format = '%Y-%m-%d %H:%M:%S')
df = df.sort_values('TIMESTAMP')
# Trimming dwell outliers
df = df[df['DWELL TIME'] < 45] # optional
# Split into each sutor group
grouped = df.groupby(df['SUTOR DIRECTION LINE'])
# new dataframe
# create columns for dataframe
#columns = ['DWELL TIME', 'LINE ID', 'DIRECTION CODE', 'ALIGHTERS', 'BOARDERS']
df2 = pd.DataFrame()
#del df['LINE ID']
#del df['DIRECTION CODE']
df['BOARDERS TO ALIGHTERS DIFFERENCE'] = df['BOARDERS'] - df['ALIGHTERS']
del df['ALIGHTERS']
del df['BOARDERS']
del df['LINE DIRECTION AVAILABILITY']
print('pruned dataframe at time %d' % (time.time() - start_time))
# Resample for each SUTOR group --- delete groups with too few data points
for group in df['SUTOR DIRECTION LINE'].unique(): # array of unique location codes
    print('processing unique location %s at time %d' % (str(group), (time.time() - start_time)))
    sample = grouped.get_group(group)
    if len(sample) > 0.25 / len(df['SUTOR DIRECTION LINE'].unique()) * len(df): # only keep groups larger than 25% of the average group size (len(df) / number of unique locations)
        sample = sample.set_index('TIMESTAMP')
        #sample = sample.sort_index()
        sample = sample.resample('60min').mean(numeric_only=True) # hourly mean of the numeric columns (dwell time, boarders/alighters, ...)
        #sample = sample.resample('60min').agg({'DWELL TIME': np.var, 'BOARDERS AND ALIGHTERS': np.mean, 'SAF RATE': np.mean, 'TRAIN DENSITY': np.mean, 'STANDING CAPACITY': np.mean, 'SEATING CAPACITY': np.mean}) # variance of dwell instead of mean
        sample['SUTOR DIRECTION LINE'] = group
        #df2 = pd.concat([sample, df2], axis = 1)
        #df2 = df2.rename(columns={'DWELL TIME': group})
        df2 = pd.concat([df2, sample]) # DataFrame.append was removed in pandas 2.0
df2 = df2.dropna() # remove NaN rows
# clean up dwell times
#df2 = df2[df2['DWELL TIME'] < 45] # less than
# write to csv
print('writing new csv at time %d' % (time.time() - start_time))
df2.to_csv('resampled_by_unique_location.csv')
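# Optional follow-up (a minimal sketch, not part of the pipeline above): pivot the long-format
# output so each SUTOR DIRECTION LINE becomes a column of mean dwell times, i.e. the
# "SUTOR codes as columns" layout. The output filename below is just illustrative.
#wide = df2.pivot_table(index=df2.index, columns='SUTOR DIRECTION LINE', values='DWELL TIME')
#wide.to_csv('resampled_by_unique_location_wide.csv')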