-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathsynthetic_data.py
152 lines (132 loc) · 7.63 KB
/
synthetic_data.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
import torch
import numpy as np
import sys
import torch
import torch.distributions as D
import logging
import sys
sys.path.append('../')
sys.path.append('../networks')
from .basedataset import BaseDataset
from networks.linear_net import Linear_net_sig
class SyntheticData(BaseDataset):
    """Synthetic binary dataset with a simulated human expert.

    Features are drawn from a uniform distribution or a Gaussian mixture.
    A randomly initialised ``Linear_net_sig`` acts as the reference
    rejector (1 = defer to the expert, 0 = keep with the machine) and a
    second one as the reference classifier. Labels follow the classifier
    on the non-deferred side (up to ``machine_nondeferred_error``); the
    expert's predictions agree with the labels at a rate that depends on
    which side of the rejector a sample falls.
    """

    def __init__(self, train_samples=1000, test_samples=1000, data_distribution="mix_of_guassians", d=10,
                 mean_scale=1, expert_deferred_error=0, expert_nondeferred_error=0.5,
                 machine_nondeferred_error=0, num_of_guassians=10,
                 val_split=0.1, batch_size=1000, transforms=None):
        """Store the configuration and immediately generate the data.

        Args:
            train_samples: number of samples shared by the train and
                validation splits.
            test_samples: number of samples in the test split.
            data_distribution: ``"uniform"`` draws features uniformly;
                any other value selects the mixture of Gaussians.
            d: dimension of the feature vectors.
            mean_scale: factor every sampled feature vector is multiplied by.
            expert_deferred_error: probability that the expert is wrong on
                samples the reference rejector defers.
            expert_nondeferred_error: probability that the expert is wrong
                on samples the reference rejector does not defer.
            machine_nondeferred_error: probability that a non-deferred
                sample keeps its random label instead of being set to the
                reference classifier's prediction.
            num_of_guassians: number of components in the Gaussian mixture.
            val_split: fraction of ``train_samples`` held out for validation.
            batch_size: batch size of the three data loaders.
            transforms: stored on the instance; not used by this class.
        """
        self.val_split = val_split
        self.batch_size = batch_size
        self.train_split = 1 - val_split
        self.transforms = transforms
        self.train_samples = train_samples
        self.test_samples = test_samples
        self.total_samples = train_samples + test_samples
        self.data_distribution = data_distribution
        self.d = d
        # presumably the number of label classes (labels are 0/1) - TODO confirm
        self.n_dataset = 2
        self.mean_scale = mean_scale
        self.expert_deferred_error = expert_deferred_error
        self.expert_nondeferred_error = expert_nondeferred_error
        self.machine_nondeferred_error = machine_nondeferred_error
        self.num_of_guassians = num_of_guassians
        self.generate_data()

    @staticmethod
    def _coin_flip(success_prob):
        """Draw one Bernoulli sample from NumPy's global RNG; True on success."""
        return bool(np.random.binomial(1, success_prob, 1)[0] == 1)

    @staticmethod
    def _binary_preds(net, data_x):
        """Return the rounded first output column of ``net`` on ``data_x`` as a 1-D tensor."""
        with torch.no_grad():
            outputs = net(data_x)
        return torch.round(outputs.data)[:, 0]

    def _sample_features(self):
        """Sample ``total_samples`` feature vectors of dimension ``d``."""
        if self.data_distribution == "uniform":
            return torch.rand((self.total_samples, self.d)) * self.mean_scale
        # Equally weighted mixture; component means come from a standard
        # normal and per-dimension standard deviations from U[0, 1).
        mix = D.Categorical(torch.ones(self.num_of_guassians,))
        comp = D.Independent(D.Normal(
            torch.randn(self.num_of_guassians, self.d), torch.rand(self.num_of_guassians, self.d)), 1)
        gmm = D.MixtureSameFamily(mix, comp)
        return gmm.sample((self.total_samples,)) * self.mean_scale

    def generate_data(self):
        """Generate features, labels and expert predictions, then build the splits.

        Sets ``train_x/val_x/test_x`` (and the ``_y`` / ``_h`` counterparts)
        as ``Subset`` objects, ``data_train/data_val/data_test`` as
        ``TensorDataset`` objects of (features, label, expert prediction),
        the matching ``data_*_loader`` attributes, and ``error_optimal``,
        the error of the reference rejector/classifier/expert system over
        all generated samples.

        Raises:
            ValueError: if ``train_samples + test_samples`` is not positive.
        """
        if self.total_samples <= 0:
            # With no samples the acceptance loops below could never finish.
            raise ValueError("train_samples + test_samples must be positive")

        data_x = self._sample_features()
        n_total = len(data_x)

        # Re-draw random rejectors until one defers between 20% and 80% of
        # the samples. The proportion is an exact count ratio so the
        # inclusive bounds are not affected by float32 rounding.
        mean_rej_prop = 0
        while not (0.2 <= mean_rej_prop <= 0.8):
            net_rej_opt = Linear_net_sig(self.d)
            opt_rej_preds = self._binary_preds(net_rej_opt, data_x)
            mean_rej_prop = opt_rej_preds.sum().item() / n_total

        # Re-draw random classifiers until one predicts 1 on between 20% and
        # 80% of the non-deferred samples.
        mean_class_prop = 0
        # NOTE(review): this instance is replaced on the first loop pass. It
        # is kept because constructing it presumably advances torch's RNG, so
        # removing it would change seeded output - confirm before dropping.
        net_mach_opt = Linear_net_sig(self.d)
        n_nondeferred = n_total - opt_rej_preds.sum().item()
        while not (0.2 <= mean_class_prop <= 0.8):
            net_mach_opt = Linear_net_sig(self.d)
            opt_mach_preds = self._binary_preds(net_mach_opt, data_x)
            n_positive = (opt_mach_preds * (1 - opt_rej_preds)).sum().item()
            mean_class_prop = n_positive / n_nondeferred

        # Start from uniformly random labels.
        data_y = torch.randint(low=0, high=2, size=(self.total_samples,))
        deferred = (opt_rej_preds == 1).tolist()
        nondeferred = (opt_rej_preds == 0).tolist()

        # On the non-deferred side, overwrite the label with the classifier's
        # prediction with probability 1 - machine_nondeferred_error. Coins
        # are drawn one per non-deferred sample, in index order.
        machine_hit_prob = 1 - self.machine_nondeferred_error
        follows_machine = torch.tensor(
            [kept and self._coin_flip(machine_hit_prob) for kept in nondeferred],
            dtype=torch.bool,
        )
        data_y = torch.where(follows_machine, opt_mach_preds.to(data_y.dtype), data_y)

        # The expert matches the label with probability
        # 1 - expert_deferred_error on deferred samples and
        # 1 - expert_nondeferred_error elsewhere; otherwise the label is flipped.
        hit_if_deferred = 1 - self.expert_deferred_error
        hit_if_kept = 1 - self.expert_nondeferred_error
        expert_correct = torch.tensor(
            [self._coin_flip(hit_if_deferred if is_deferred else hit_if_kept) for is_deferred in deferred],
            dtype=torch.bool,
        )
        human_predictions = torch.where(expert_correct, data_y, 1 - data_y)

        # Split into train, val, test. Only the validation size is truncated;
        # train takes the rest of train_samples so the test split holds
        # exactly test_samples items.
        val_size = int(self.train_samples * self.val_split)
        train_size = self.train_samples - val_size
        test_size = n_total - self.train_samples
        lengths = [train_size, val_size, test_size]

        def split(tensor):
            # A fresh generator with the same seed makes every call produce
            # the same permutation, keeping x, y and h aligned.
            return torch.utils.data.random_split(
                tensor, lengths, generator=torch.Generator().manual_seed(42))

        self.train_x, self.val_x, self.test_x = split(data_x)
        self.train_y, self.val_y, self.test_y = split(data_y)
        self.train_h, self.val_h, self.test_h = split(human_predictions)
        logging.info("train size: %d", len(self.train_x))
        logging.info("val size: %d", len(self.val_x))
        logging.info("test size: %d", len(self.test_x))

        def as_tensor_dataset(*subsets):
            # Materialise each Subset by indexing its underlying tensor.
            return torch.utils.data.TensorDataset(
                *(subset.dataset[subset.indices] for subset in subsets))

        self.data_train = as_tensor_dataset(self.train_x, self.train_y, self.train_h)
        self.data_val = as_tensor_dataset(self.val_x, self.val_y, self.val_h)
        self.data_test = as_tensor_dataset(self.test_x, self.test_y, self.test_h)
        self.data_train_loader = torch.utils.data.DataLoader(
            self.data_train, batch_size=self.batch_size, shuffle=True)
        self.data_val_loader = torch.utils.data.DataLoader(
            self.data_val, batch_size=self.batch_size, shuffle=True)
        self.data_test_loader = torch.utils.data.DataLoader(
            self.data_test, batch_size=self.batch_size, shuffle=True)

        # Double check the constructed solution: the expert answers on the
        # deferred samples, the reference classifier on all others.
        mistakes = torch.where(
            opt_rej_preds == 1,
            human_predictions != data_y,
            opt_mach_preds != data_y,
        )
        error_optimal_ = mistakes.sum() / len(data_y)
        self.error_optimal = error_optimal_
        logging.info(f'Data optimal: Accuracy Train {100-100*error_optimal_:.3f} with rej {mean_rej_prop*100} \n \n')