# Dataloader_elmo1.py
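"""Data loading utilities for multi-task sequence labeling.

Reads a preprocessed pickle file containing several tagging datasets
(e.g. CoNLL-2003 NER, CoNLL-2000 chunking, POS tagging) and builds one
train/dev/test DataLoader triple per task, with dynamic padding handled
by a custom collate_fn.
"""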
import itertools
import pickle

import torch
import torch.utils.data as data

USE_CUDA = torch.cuda.is_available()


def read_pkl(path):
    """Load and return the preprocessed data dictionary from a pickle file."""
    with open(path, "rb") as f:
        data_dict = pickle.load(f)
    return data_dict
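
# Expected top-level layout of the pickle file, inferred from the lookups below
# (the exact contents are not guaranteed; this is only a reading aid):
#   {
#       "datasets":   per-task dataset metadata (one entry per task),
#       "mappings":   feature/label-to-index maps, including "casing",
#                     "characters" and one "<task>_BIO" / "POS" map per task,
#       "embeddings": pretrained word embeddings (num_voc = len(embeddings)),
#       "data":       {dataset_name: {"trainMatrix": [...], "devMatrix": [...],
#                                     "testMatrix": [...]}},
#   }
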

def multitask_dataloader(path, num_task, batch_size=8):
    """Build train/dev/test DataLoaders for every task stored in the pickle file."""
    data_dict = read_pkl(path)
    datasets = data_dict["datasets"]
    mappings = data_dict["mappings"]
    num_feat = len(data_dict["mappings"]["casing"])
    num_voc = len(data_dict["embeddings"])
    num_char = len(data_dict["mappings"]["characters"])
    embeddings = data_dict["embeddings"]

    # Collect the label mapping for each task (BIO tags or POS),
    # keyed by the prefix before the underscore (e.g. "NER", "chunk").
    tgt_dict = {}
    for key, value in data_dict["mappings"].items():
        # Alternative tag scheme: if key.endswith("IOBES") or key.endswith("POS"):
        if key.endswith("BIO") or key.endswith("POS"):
            tgt_dict[key.split("_")[0]] = value

    data = data_dict["data"]
    assert num_task == len(datasets)

    task_holder = []
    task2id, id2task = {}, {}
    for data_name, lists in data.items():
        task2id[data_name] = len(task2id)
        id2task[len(id2task)] = data_name
        train_loader = get_loader(lists['trainMatrix'], mappings, data_name, batch_size)
        dev_loader = get_loader(lists['devMatrix'], mappings, data_name, batch_size * 10)
        test_loader = get_loader(lists['testMatrix'], mappings, data_name, batch_size * 10)
        task_holder.append({"train": train_loader,
                            "dev": dev_loader,
                            "test": test_loader})
    return task_holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings
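
# A hedged usage sketch for the values returned above (the pickle path below is
# a placeholder, not a file shipped with this repository):
#
#   holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings = \
#       multitask_dataloader("./pkl/your_datasets.pkl", num_task=2)
#   for batch in holder[task2id["conll2003"]]["train"]:
#       (src_seqs, src_masks, src_feats, tgt_seqs, tgt_masks,
#        src_chars, tgt_list, src_tokens) = batch
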

class Dataset(data.Dataset):
    """Custom data.Dataset compatible with data.DataLoader."""

    def __init__(self, data_list, name, mappings):
        """Store the preprocessed sentences of one split of one task.

        Args:
            data_list: list of per-sentence dicts for this split.
            name: task/dataset name, used to select the target label key.
            mappings: feature/label-to-index mappings from the pickle file.
        """
        self.name = name
        self.data_list = data_list
        self.mappings = mappings
        self.num_total_pairs = len(self.data_list)

    def __getitem__(self, index):
        """Returns one data pair (source and target)."""
        name2out = {"conll2003": "NER_BIO", "conll2000": "chunk_BIO", "unidep": "POS"}
        # Alternative tag schemes / POS corpora:
        # name2out = {"conll2003": "NER_IOBES", "conll2000": "chunk_IOBES", "unidep": "POS"}
        # name2out = {"conll2003": "NER_BIO", "conll2000": "chunk_BIO", "wsjpos": "POS"}
        # name2out = {"conll2003": "NER_IOBES", "conll2000": "chunk_IOBES", "wsjpos": "POS"}
        src_seq = self.data_list[index]["tokens"]
        trg_seq = self.data_list[index][name2out[self.name]]
        src_feats = self.data_list[index]["casing"]
        src_chars = self.data_list[index]["characters"]
        src_tokens = self.data_list[index]['raw_tokens']
        return src_seq, trg_seq, src_feats, src_chars, src_tokens

    def __len__(self):
        return self.num_total_pairs
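
# Each element of data_list is expected to be a dict of index sequences keyed as
# accessed in __getitem__ above; a toy example (all values made up):
#   {"tokens": [12, 7, 43], "casing": [1, 0, 0],
#    "characters": [[4, 5, 6], [9], [2, 2]],
#    "raw_tokens": ["Foo", "bar", "baz"], "NER_BIO": [3, 0, 0]}
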

def collate_fn(data):
    """Creates mini-batch tensors from a list of (src_seq, trg_seq, src_feats, src_chars, src_tokens) tuples.

    We need a custom collate_fn rather than the default one because the default
    does not support merging (and padding) variable-length sequences.
    Sequences are padded to the maximum length within the mini-batch (dynamic padding).

    Args:
        data: list of tuples (src_seq, trg_seq, src_feats, src_chars, src_tokens),
            where src_seq and trg_seq are index lists of variable length.

    Returns:
        src_seqs: LongTensor of shape (batch_size, padded_length); padded token indices.
        src_masks: FloatTensor of shape (batch_size, padded_length); 1 for real tokens, 0 for padding.
        src_feats: LongTensor of shape (batch_size, padded_length); padded casing features.
        tgt_seqs: LongTensor of shape (batch_size, padded_length); padded label indices.
        tgt_masks: FloatTensor of shape (batch_size, padded_length).
        src_chars: LongTensor of shape (batch_size, padded_length, padded_char_length).
        tgt_list: tuple of the original (unpadded) label sequences.
        src_tokens: tuple of the original raw token lists.
    """
    def merge(sequences):
        # Pad a batch of index sequences to the batch maximum length
        # and build the corresponding 0/1 mask.
        lengths = [len(seq) for seq in sequences]
        padded_seqs = torch.zeros(len(sequences), max(lengths)).long()
        mask_s = torch.zeros(len(sequences), max(lengths)).long()
        for i, seq in enumerate(sequences):
            end = lengths[i]
            mask_s[i, :end] = 1
            padded_seqs[i, :end] = torch.LongTensor(seq[:end])
        mask_s = mask_s.float()
        if USE_CUDA:
            padded_seqs = padded_seqs.cuda()
            mask_s = mask_s.cuda()
        return padded_seqs, mask_s

    def merge_char(sequences):
        # Pad character indices to (batch, max_sentence_length, max_word_length).
        num_sents = [len(sent) for sent in sequences]
        num_chars = [[len(word) for word in sent] for sent in sequences]
        max_seq_length = max(num_sents)
        max_char_length = max(itertools.chain.from_iterable(num_chars))
        padded_seqs = torch.zeros(len(sequences), max_seq_length, max_char_length).long()
        for i, sent in enumerate(sequences):
            for j, word in enumerate(sent):
                end = num_chars[i][j]
                padded_seqs[i, j, :end] = torch.LongTensor(word[:end])
        if USE_CUDA:
            padded_seqs = padded_seqs.cuda()
        return padded_seqs

    # Sort by source sequence length (descending) so pack_padded_sequence can be used downstream.
    data.sort(key=lambda x: len(x[0]), reverse=True)

    # Separate source and target sequences.
    src_seqs, tgt_seqs, src_feats, src_chars, src_tokens = zip(*data)

    # Merge sequences (from tuples of lists to padded 2D/3D tensors).
    tgt_list = tgt_seqs
    src_seqs, src_masks = merge(src_seqs)
    tgt_seqs, tgt_masks = merge(tgt_seqs)
    src_feats, _ = merge(src_feats)
    src_chars = merge_char(src_chars)
    return src_seqs, src_masks, src_feats, tgt_seqs, tgt_masks, src_chars, tgt_list, src_tokens
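
# A minimal sketch (not part of the original pipeline) showing what collate_fn
# produces for a toy batch; all index values below are made up for illustration.
def _collate_demo():
    toy_batch = [
        # (src_seq, trg_seq, src_feats, src_chars, src_tokens)
        ([5, 2, 9], [1, 0, 2], [1, 0, 0], [[4, 5], [9], [2, 2, 8]], ["Foo", "bar", "baz"]),
        ([7, 1], [0, 0], [1, 0], [[3], [6, 6]], ["Qux", "quux"]),
    ]
    (src_seqs, src_masks, src_feats, tgt_seqs, tgt_masks,
     src_chars, tgt_list, src_tokens) = collate_fn(toy_batch)
    # src_seqs  -> [[5, 2, 9], [7, 1, 0]]          (zero-padded token indices)
    # src_masks -> [[1., 1., 1.], [1., 1., 0.]]    (1 = real token, 0 = padding)
    # src_chars -> shape (2, 3, 3)                 (batch, max words, max chars)
    print(src_seqs)
    print(src_masks)
    print(src_chars.size())
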

def get_loader(data_list, mappings, name, batch_size=1):
    """Returns a data loader for one split of one task.

    Args:
        data_list: list of per-sentence dicts for this split.
        mappings: feature/label-to-index mappings.
        name: task/dataset name.
        batch_size: mini-batch size.

    Returns:
        data_loader: data loader for the custom dataset.
    """
    # Build a custom dataset.
    dataset = Dataset(data_list, name, mappings)

    # Data loader for the custom dataset. Each iteration yields
    # (src_seqs, src_masks, src_feats, tgt_seqs, tgt_masks, src_chars,
    # tgt_list, src_tokens); see collate_fn for details.
    data_loader = torch.utils.data.DataLoader(dataset=dataset,
                                              batch_size=batch_size,
                                              shuffle=True,
                                              collate_fn=collate_fn)
    return data_loader

def test():
    pkl_path = "./pkl/ontonotes_conll2003_embeddings.pkl"
    holder, task2id, id2task, num_feat, num_voc, num_char, tgt_dict, embeddings = \
        multitask_dataloader(pkl_path, 2)
    ll = []
    for item in holder:
        ll.append(item["train"])
    # Number of training batches: combined, then per task.
    print(len(ll[0]) + len(ll[1]))
    print(len(ll[0]))
    print(len(ll[1]))
    # Uncomment to inspect one batch from the last task:
    # data_iter = iter(ll[-1])
    # (src_seqs, src_masks, src_feats, tgt_seqs, tgt_masks,
    #  src_chars, tgt_list, src_tokens) = next(data_iter)
    # print(src_seqs.size())
    # print(src_masks.size())
    # print(tgt_seqs.size())
    # print(tgt_masks.size())


if __name__ == '__main__':
    test()