Skip to content

Commit e13b89b

Browse files
committed Aug 3, 2020
Add restore_sql_file rule; notsummarised module; diff platforms for heatmap_days_by_sensors
1 parent ac47939 commit e13b89b

10 files changed

+107
-17
lines changed
 

‎config.yaml

+4-3
Original file line numberDiff line numberDiff line change
@@ -148,7 +148,7 @@ APPLICATIONS_FOREGROUND:
148148
social: ["socialnetworks", "socialmediatools"]
149149
entertainment: ["entertainment", "gamingknowledge", "gamingcasual", "gamingadventure", "gamingstrategy", "gamingtoolscommunity", "gamingroleplaying", "gamingaction", "gaminglogic", "gamingsports", "gamingsimulation"]
150150
SINGLE_APPS: ["top1global", "com.facebook.moments", "com.google.android.youtube", "com.twitter.android"] # There's no entropy for single apps
151-
EXCLUDED_CATEGORIES: ["system_apps", "tvvideoapps"]
151+
EXCLUDED_CATEGORIES: ["system_apps"]
152152
EXCLUDED_APPS: ["com.fitbit.FitbitMobile", "com.aware.plugin.upmc.cancer"]
153153
FEATURES: ["count", "timeoffirstuse", "timeoflastuse", "frequencyentropy"]
154154

@@ -226,7 +226,7 @@ HEATMAP_DAYS_BY_SENSORS:
226226
MIN_VALID_HOURS_PER_DAY: *min_valid_hours_per_day
227227
MIN_VALID_BINS_PER_HOUR: *min_valid_bins_per_hour
228228
EXPECTED_NUM_OF_DAYS: -1
229-
PHONE_SENSORS_TABLES: ["accelerometer", "applications_foreground", "battery", "calls", "light", "locations", "messages", "screen", "plugin_google_activity_recognition", "plugin_studentlife_audio_android"]
229+
SENSORS: [accelerometer, activity_recognition, applications_foreground, conversation, battery, bluetooth, calls, light, locations, messages, screen]
230230

231231
HEATMAP_SENSED_BINS:
232232
PLOT: False
@@ -244,9 +244,10 @@ OVERALL_COMPLIANCE_HEATMAP:
244244
PARAMS_FOR_ANALYSIS:
245245
COMPUTE: False
246246
GROUNDTRUTH_TABLE: participant_info
247+
TARGET_TABLE: participant_target
247248
SOURCES: &sources ["phone_features", "fitbit_features", "phone_fitbit_features"]
248249
DAY_SEGMENTS: *day_segments
249-
PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen]
250+
PHONE_FEATURES: [accelerometer, activity_recognition, applications_foreground, battery, bluetooth, calls_incoming, calls_missed, calls_outgoing, conversation, light, location_doryab, messages_received, messages_sent, screen]
250251
FITBIT_FEATURES: [fitbit_heartrate, fitbit_step, fitbit_sleep]
251252
PHONE_FITBIT_FEATURES: "" # This array is merged in the input_merge_features_of_single_participant function in models.snakefile
252253
DEMOGRAPHIC_FEATURES: [age, gender, inpatientdays]

‎reports/figures/.gitkeep

Whitespace-only changes.

‎rules/models.snakefile

+3-2
Original file line numberDiff line numberDiff line change
@@ -14,7 +14,7 @@ rule days_to_analyse:
1414

1515
rule targets:
1616
input:
17-
participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["GROUNDTRUTH_TABLE"] + "_raw.csv"
17+
participant_info = "data/raw/{pid}/" + config["PARAMS_FOR_ANALYSIS"]["TARGET_TABLE"] + "_raw.csv"
1818
params:
1919
pid = "{pid}",
2020
summarised = "{summarised}",
@@ -142,7 +142,8 @@ rule merge_features_and_targets:
142142
summarised = "{summarised}",
143143
cols_var_threshold = "{cols_var_threshold}",
144144
numerical_operators = config["PARAMS_FOR_ANALYSIS"]["NUMERICAL_OPERATORS"],
145-
categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"]
145+
categorical_operators = config["PARAMS_FOR_ANALYSIS"]["CATEGORICAL_OPERATORS"],
146+
features_exclude_day_idx = config["PARAMS_FOR_ANALYSIS"]["FEATURES_EXCLUDE_DAY_IDX"],
146147
output:
147148
"data/processed/data_for_population_model/{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins/{rows_nan_threshold}|{cols_nan_threshold}_{days_before_threshold}|{days_after_threshold}_{cols_var_threshold}/{source}_{day_segment}_{summarised}.csv"
148149
script:

‎rules/preprocessing.snakefile

+12
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,14 @@
1+
# Runs src/data/restore_sql_file.py, giving it the example SQL dump and the
# .env credentials file as inputs.
rule restore_sql_file:
2+
input:
3+
# SQL dump declared as the file to restore
sql_file = "data/external/rapids_example.sql",
4+
# read by the script with configparser to obtain the database credentials
db_credentials = ".env"
5+
params:
6+
# the script uses this value as the section name when looking up credentials
group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"]
7+
output:
8+
# flag file; the script itself writes no output file
touch("data/interim/restore_sql_file.done")
9+
script:
10+
"../src/data/restore_sql_file.py"
11+
112
rule download_participants:
213
params:
314
group = config["DOWNLOAD_PARTICIPANTS"]["GROUP"],
@@ -23,6 +34,7 @@ rule download_dataset:
2334

2435
PHONE_SENSORS = []
2536
PHONE_SENSORS.extend([config["MESSAGES"]["DB_TABLE"], config["CALLS"]["DB_TABLE"], config["BARNETT_LOCATION"]["DB_TABLE"], config["DORYAB_LOCATION"]["DB_TABLE"], config["BLUETOOTH"]["DB_TABLE"], config["BATTERY"]["DB_TABLE"], config["SCREEN"]["DB_TABLE"], config["LIGHT"]["DB_TABLE"], config["ACCELEROMETER"]["DB_TABLE"], config["APPLICATIONS_FOREGROUND"]["DB_TABLE"], config["CONVERSATION"]["DB_TABLE"]["ANDROID"], config["CONVERSATION"]["DB_TABLE"]["IOS"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["ANDROID"], config["ACTIVITY_RECOGNITION"]["DB_TABLE"]["IOS"]])
37+
PHONE_SENSORS.extend(config["PHONE_VALID_SENSED_BINS"]["TABLES"])
2638

2739
if len(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"]) > 0:
2840
PHONE_SENSORS.append(config["WIFI"]["DB_TABLE"]["VISIBLE_ACCESS_POINTS"])

‎rules/reports.snakefile

+22-1
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,24 @@
1+
def optional_heatmap_days_by_sensors_input(wildcards):
    """Return the raw sensor CSV paths required by heatmap_days_by_sensors.

    The second line of the participant file ``data/external/<pid>`` is read as
    a comma-separated list of platforms. ``multiple``, or a list containing
    both ``android`` and ``ios``, is treated as ``android``; otherwise the
    first listed platform is used.

    For ``activity_recognition`` and ``conversation`` the table name depends
    on the platform and is looked up in
    ``config[<SENSOR>]["DB_TABLE"][<PLATFORM>]``; every other sensor name in
    ``config["HEATMAP_DAYS_BY_SENSORS"]["SENSORS"]`` is used as the table name
    directly.

    Args:
        wildcards: Snakemake wildcards object; only ``wildcards.pid`` is read.

    Returns:
        list of str: one ``data/raw/{pid}/<table>_with_datetime.csv`` path per
        configured sensor (the ``{pid}`` placeholder is left in the string).

    Raises:
        ValueError: if the participant file has no platform line, if a
            platform-specific sensor has no section in the config, or if the
            platform is not 'android', 'ios' or 'multiple'.
    """
    participant_file = "data/external/" + wildcards.pid
    with open(participant_file, encoding="ISO-8859-1") as external_file:
        external_file_content = external_file.readlines()

    # Fail with a clear message instead of a bare IndexError
    if len(external_file_content) < 2:
        raise ValueError("Participant file '" + participant_file + "' should list the platform(s) on line 2")

    platforms = external_file_content[1].strip().split(",")
    if platforms[0] == "multiple" or (len(platforms) > 1 and "android" in platforms and "ios" in platforms):
        platform = "android"
    else:
        platform = platforms[0]

    # Sensors whose table name differs between android and ios
    platform_specific_sensors = ("activity_recognition", "conversation")

    input_for_heatmap_days_by_sensors = []
    for sensor in config["HEATMAP_DAYS_BY_SENSORS"]["SENSORS"]:
        if sensor in platform_specific_sensors:
            if sensor.upper() not in config:
                raise ValueError("Please check SENSORS parameter in HEATMAP_DAYS_BY_SENSORS section of config.yaml")
            if platform not in ["android", "ios"]:
                # platforms is a list: join it, "str + list" would raise TypeError
                raise ValueError("Platform (line 2) in a participant file should be 'android', 'ios', or 'multiple'. You typed '" + ",".join(platforms) + "'")
            table = config[sensor.upper()]["DB_TABLE"][platform.upper()]
        else:
            table = sensor
        input_for_heatmap_days_by_sensors.append("data/raw/{pid}/" + table + "_with_datetime.csv")
    return input_for_heatmap_days_by_sensors
21+
122
rule heatmap_features_correlations:
223
input:
324
features = expand("data/processed/{pid}/{sensor}_{day_segment}.csv", pid=config["PIDS"], sensor=config["HEATMAP_FEATURES_CORRELATIONS"]["PHONE_FEATURES"]+config["HEATMAP_FEATURES_CORRELATIONS"]["FITBIT_FEATURES"], day_segment=config["DAY_SEGMENTS"]),
@@ -21,7 +42,7 @@ rule histogram_valid_sensed_hours:
2142

2243
rule heatmap_days_by_sensors:
2344
input:
24-
sensors = expand("data/raw/{{pid}}/{sensor}_with_datetime.csv", sensor=config["HEATMAP_DAYS_BY_SENSORS"]["PHONE_SENSORS_TABLES"]),
45+
sensors = optional_heatmap_days_by_sensors_input,
2546
phone_valid_sensed_days = "data/interim/{pid}/phone_valid_sensed_days_{min_valid_hours_per_day}hours_{min_valid_bins_per_hour}bins.csv"
2647
params:
2748
pid = "{pid}",

‎src/data/restore_sql_file.py

+21
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
import pandas as pd
import configparser
import os
import subprocess

# Restore the SQL dump given as the rule's "sql_file" input into the database
# described by the [<group>] section of the credentials file, unless that
# database already exists.

# read database credentials
group = snakemake.params["group"]
config = configparser.ConfigParser()
config.read(snakemake.input["db_credentials"])
credentials = config[group]
database = credentials["database"]

# Arguments are passed as a list (no shell), so credential values containing
# spaces or shell metacharacters cannot break or inject into the command.
mysql_cmd = ["mysql", "-h", credentials["host"], "-u", credentials["user"], "-p" + credentials["password"]]

# "use <database>" exits with a non-zero status when the database is missing.
# The exit status is checked explicitly because a failing command does not
# raise an exception on its own.
db_exists = subprocess.run(mysql_cmd + ["-e", "use " + database]).returncode == 0

if db_exists:
    print(database + " DB already exists.")
else:
    # check=True makes the rule fail (and the .done flag is not touched) if
    # creating or restoring the database fails
    subprocess.run(mysql_cmd + ["-e", "CREATE DATABASE IF NOT EXISTS " + database + ";"], check=True)
    # Feed the dump declared by the rule on stdin rather than rebuilding a
    # path from the database name
    with open(snakemake.input["sql_file"], "rb") as sql_file:
        subprocess.run(mysql_cmd + [database], stdin=sql_file, check=True)

‎src/models/merge_features_and_targets.py

+22-6
Original file line numberDiff line numberDiff line change
@@ -32,19 +32,35 @@ def summariseFeatures(features, numerical_operators, categorical_operators, cols
3232
cols_var_threshold = snakemake.params["cols_var_threshold"]
3333
numerical_operators = snakemake.params["numerical_operators"]
3434
categorical_operators = snakemake.params["categorical_operators"]
35-
36-
37-
features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
38-
demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"])
39-
targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"])
35+
features_exclude_day_idx = snakemake.params["features_exclude_day_idx"]
4036

4137

4238
# Extract summarised features based on daily features:
4339
# for categorical features: calculate variance across all days
4440
# for numerical features: calculate mode across all days
4541
if summarised == "summarised":
42+
43+
features = pd.read_csv(snakemake.input["cleaned_features"], parse_dates=["local_date"])
44+
demographic_features = pd.read_csv(snakemake.input["demographic_features"], index_col=["pid"])
45+
targets = pd.read_csv(snakemake.input["targets"], index_col=["pid"])
46+
4647
features = summariseFeatures(features, numerical_operators, categorical_operators, cols_var_threshold)
47-
data = pd.concat([features, demographic_features, targets], axis=1, join="inner")
48+
data = pd.concat([features, demographic_features, targets], axis=1, join="inner")
49+
50+
elif summarised == "notsummarised":
51+
52+
features = pd.read_csv(snakemake.input["cleaned_features"])
53+
demographic_features = pd.read_csv(snakemake.input["demographic_features"])
54+
55+
features = features.merge(demographic_features, on="pid", how="left").set_index(["pid", "local_date"])
56+
targets = pd.read_csv(snakemake.input["targets"], index_col=["pid", "local_date"])
57+
data = pd.concat([features, targets], axis=1, join="inner")
58+
59+
else:
60+
raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'")
61+
62+
if features_exclude_day_idx and ("day_idx" in data.columns):
63+
del data["day_idx"]
4864

4965
data.to_csv(snakemake.output[0], index=True)
5066

‎src/models/modeling.py

+7-1
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,13 @@ def preprocesFeatures(train_numerical_features, test_numerical_features, categor
6565

6666

6767
# Read data and split
68-
data = pd.read_csv(snakemake.input["data"], index_col=["pid"])
68+
if summarised == "summarised":
69+
data = pd.read_csv(snakemake.input["data"], index_col=["pid"])
70+
elif summarised == "notsummarised":
71+
data = pd.read_csv(snakemake.input["data"], index_col=["pid", "local_date"])
72+
else:
73+
raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'")
74+
6975
data_x, data_y = data.drop("target", axis=1), data[["target"]]
7076
categorical_feature_colnames = categorical_colnames_demographic_features + getMatchingColNames(categorical_operators, data_x)
7177

‎src/models/targets.py

+8-1
Original file line numberDiff line numberDiff line change
@@ -5,10 +5,10 @@
55
summarised = snakemake.params["summarised"]
66
targets_ratio_threshold = snakemake.params["targets_ratio_threshold"]
77
targets_value_threshold = snakemake.params["targets_value_threshold"]
8+
participant_info = pd.read_csv(snakemake.input["participant_info"])
89

910
if summarised == "summarised":
1011
targets = pd.DataFrame(columns=["pid", "target"])
11-
participant_info = pd.read_csv(snakemake.input["participant_info"])
1212

1313
if not participant_info.empty:
1414
cesds = participant_info.loc[0, ["preop_cesd_total", "inpatient_cesd_total", "postop_cesd_total", "3month_cesd_total"]]
@@ -17,4 +17,11 @@
1717
target = 1 if cesds.apply(lambda x : 1 if x >= targets_value_threshold else 0).sum() >= num_threshold else 0
1818
targets.loc[0, :] = [pid, target]
1919

20+
elif summarised == "notsummarised":
21+
targets = participant_info[["local_date", "target"]]
22+
targets.insert(0, "pid", pid)
23+
24+
else:
25+
raise ValueError("SUMMARISED parameter in config.yaml can only be 'summarised' or 'notsummarised'")
26+
2027
targets.to_csv(snakemake.output[0], index=False)

‎src/visualization/heatmap_days_by_sensors.py

+8-3
Original file line numberDiff line numberDiff line change
@@ -1,3 +1,4 @@
1+
import numpy as np
12
import pandas as pd
23
import plotly.io as pio
34
import plotly.graph_objects as go
@@ -20,8 +21,12 @@ def getRowCountHeatmap(row_count_sensors_normalized, row_count_sensors, pid, out
2021

2122
row_count_sensors = pd.DataFrame()
2223
for sensor_path in snakemake.input["sensors"]:
23-
# plugin_studentlife_audio_android => conversion; plugin_google_activity_recognition => AR; applications_foreground => apps
24-
sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "").replace("plugin_studentlife_audio_android", "conversion").replace("plugin_google_activity_recognition", "AR").replace("applications_foreground", "apps")
24+
sensor_name = sensor_path.split("/")[-1].replace("_with_datetime.csv", "")
25+
# plugin_studentlife_audio_android or plugin_studentlife_audio => conversion; plugin_google_activity_recognition or plugin_ios_activity_recognition => AR; applications_foreground => apps
26+
sensor_name = sensor_name.replace("plugin_studentlife_audio_android", "conversion").replace("plugin_studentlife_audio", "conversion") \
27+
.replace("plugin_google_activity_recognition", "AR").replace("plugin_ios_activity_recognition", "AR") \
28+
.replace("applications_foreground", "apps")
29+
2530
sensor_data = pd.read_csv(sensor_path, encoding="ISO-8859-1", parse_dates=["local_date"], dtype={"label": str})
2631
if sensor_data.empty:
2732
row_count_sensor = pd.DataFrame(columns=[sensor_name])
@@ -56,7 +61,7 @@ def getRowCountHeatmap(row_count_sensors_normalized, row_count_sensors, pid, out
5661

5762
# normalize each sensor (column)
5863
if row_count_sensors.count().max() > 1:
59-
row_count_sensors_normalized = (row_count_sensors-row_count_sensors.min())/(row_count_sensors.max()-row_count_sensors.min())
64+
row_count_sensors_normalized = row_count_sensors.fillna(np.nan).apply(lambda x: (x - np.nanmin(x)) / (np.nanmax(x) - np.nanmin(x)) if np.nanmax(x) != np.nanmin(x) else (x / np.nanmin(x)), axis=0)
6065
else:
6166
row_count_sensors_normalized = row_count_sensors
6267

0 commit comments

Comments
 (0)
Please sign in to comment.