-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathpandas_magic.py
130 lines (98 loc) · 4.05 KB
/
pandas_magic.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
import json
import seaborn as sns
from collections import defaultdict
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.preprocessing import MinMaxScaler
d = defaultdict(list)
with open("./data/output.jl", "r") as f:
for line in f:
item = json.loads(line)
dataset = list(item.keys())[0]
values = item[dataset]
d[dataset].append(values)
tmp = []
scaler = MinMaxScaler()
for dataset in sorted(d):
vals = d[dataset]
def filter_models(condition, models):
return {model for model in models if condition(model)}
# condition = lambda model: not(model.startswith("PRUN") or model.startswith("BOTH") or model.startswith("BAG")) # plain models (ES, ...)
# condition = lambda model: not(model.endswith("REP"))
#condition = lambda model: not(model.endswith("REP"))
#condition = lambda _: True
condition = lambda model: model not in ("BAG_ST_MLR", "BOTH_ST_MLR", "BOTH_ST_MLR", "PRUN_ST_MLR", "ST_MLR", "BAG_ST_MLR") # remove NaN models
condition2 = lambda model: not(model.startswith("PRUN") or model.startswith("BOTH") or model.startswith("BAG")) # plain models (ES, ...)
conditions = (
condition,
)
models = filter_models(lambda model: all(condition(model) for condition in conditions),
set(list(val.keys())[0] for val in vals))
models = sorted(models)
"""
models = {model for model in set(list(val.keys())[0] for val in vals)
if not(model.startswith("PRUN") or model.startswith("BOTH") or
model.startswith)}
"""
#models = set(list(val.keys())[0] for val in vals)
metrics = set()
for val in vals:
model = list(val.keys())[0]
metric = list(val[model].keys())[0]
metrics.add(metric)
metric_map = defaultdict(list)
metrics = sorted(metrics)
for metric in metrics:
for m in models:
total, cnt = 0., 0
for val in vals:
model = list(val.keys())[0]
met = list(val[model].keys())[0]
if m == model and metric == met:
total += val[model][met]
cnt += 1
try:
res = total / cnt
metric_map[metric].append((m, round(res, 3)))
except ZeroDivisionError:
# Value not available, replace with "nan"
metric_map[metric].append((m, float('nan')))
metric_map[metric] = dict(metric_map[metric])
rows = [[] for _ in models]
for i, model in enumerate(models):
for metric in metrics:
rows[i].append(metric_map[metric][model])
df = pd.DataFrame.from_records(rows, index=[m.replace("_", " ") for m in models], columns=metrics)
df_norm = (df - df.min()) / (df.max() - df.min())
# generate latex file with table of results for each dataset
with open("./{dataset}.tex".format(dataset=dataset), "w") as f:
f.write(df_norm.to_latex())
"""
tmp.append(df)
# to generate box plots over datasets
df_2 = pd.DataFrame(columns=["models"] + list(metrics))
lst = []
for df in tmp:
df_norm = (df - df.min()) / (df.max() - df.min())
lst.append(df_norm)
df = pd.concat(lst)
cnt=0
for index, row in df.iterrows():
a = [index] + list(row.values)
df_2.loc[cnt] = a
cnt+=1
met = "f1"
ranks = df_2.groupby("models")[met].median().fillna(0).sort_values()[::-1].index
print(type(ranks))
fig = plt.figure(figsize=[15,10])
# We define a fake subplot that is in fact only the plot.
plot = fig.add_subplot(111)
# We change the fontsize of minor ticks label
plot.tick_params(axis='both', which='major', labelsize=15)
sns.boxplot(data=df_2, x=met, y="models", order=ranks)
plt.xlabel("normalized " + met, fontsize=15)
plt.ylabel("")
#sns.stripplot(data=df_2, y="models", x=met, size=3, jitter= True, color = ".3", order=ranks)
plt.tight_layout()
plt.savefig("./%s_test_plot.pdf" %met, dpi=300)
"""