-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathmerger.py
51 lines (45 loc) · 1.72 KB
/
merger.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
import csv
def read_label_index(path, encoding="utf-8"):
    """Load a linking CSV and group its labels by knowledge-base id.

    The first line of the file is treated as a header and skipped. In every
    following row, column 0 is the embedding label and column 1 is the
    knowledge-base id (presumably a Wikidata QID -- confirm against the
    script that produces these files).

    Args:
        path: Path of the CSV file to read.
        encoding: Text encoding used to decode the file.

    Returns:
        A dict mapping each id to the list of labels linked to it, in file
        order. An empty file yields an empty dict.

    Raises:
        IndexError: If a non-blank row has fewer than two columns.
    """
    index = {}
    # newline="" lets the csv module do its own newline handling, which is
    # required for quoted fields that contain line breaks.
    with open(path, "r", newline="", encoding=encoding) as handle:
        reader = csv.reader(handle)
        next(reader, None)  # skip the header; tolerate a completely empty file
        for row in reader:
            if not row:
                continue  # csv yields [] for blank lines (e.g. a trailing one)
            label, kb_id = row[0], row[1]
            index.setdefault(kb_id, []).append(label)
    return index


def write_label_rows(path, index, kb_ids, encoding="utf-8"):
    """Write the (label, id) rows of the selected ids to a CSV file.

    The file is created or truncated and starts with the header row
    ``embedding_label,knowledgebase_id``.

    Args:
        path: Path of the CSV file to write.
        index: Dict mapping id -> list of labels, as returned by
            ``read_label_index``.
        kb_ids: Iterable of ids to export; rows are written in this order.
        encoding: Text encoding used to encode the file.

    Raises:
        KeyError: If an id in ``kb_ids`` is missing from ``index``.
    """
    # newline="" prevents the platform from turning the writer's "\r\n" row
    # terminator into "\r\r\n" on Windows.
    with open(path, "w", newline="", encoding=encoding) as handle:
        writer = csv.writer(handle)
        writer.writerow(["embedding_label", "knowledgebase_id"])
        for kb_id in kb_ids:
            for label in index[kb_id]:
                writer.writerow([label, kb_id])


def write_id_intersection(first_path, second_path,
                          first_output_path, second_output_path):
    """Restrict two linking CSVs to the ids that occur in both of them.

    Each output file keeps the labels of its own input, limited to the shared
    ids. Ids are written in sorted order so repeated runs produce identical
    files and both outputs list the ids in the same sequence.

    Args:
        first_path: First input linking CSV.
        second_path: Second input linking CSV.
        first_output_path: Destination for the filtered rows of ``first_path``.
        second_output_path: Destination for the filtered rows of ``second_path``.

    Returns:
        The sorted list of ids present in both inputs.
    """
    first_index = read_label_index(first_path)
    second_index = read_label_index(second_path)
    shared_ids = sorted(first_index.keys() & second_index.keys())
    write_label_rows(first_output_path, first_index, shared_ids)
    write_label_rows(second_output_path, second_index, shared_ids)
    return shared_ids


if __name__ == '__main__':
    fasttext_path = "wiki-news-300d-1M-linking.csv"
    glove_path = "glove-linking.csv"
    # output_fasttext_path = "fasttext-glove-linking.csv"
    output_fasttext_path = "fasttext-glove-id-intersect-for-fasttext.csv"
    output_glove_path = "fasttext-glove-id-intersect-for-glove.csv"
    write_id_intersection(fasttext_path, glove_path,
                          output_fasttext_path, output_glove_path)