-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathparse_results.py
executable file
·106 lines (88 loc) · 3.25 KB
/
parse_results.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import logging
import click
from collections import defaultdict
import csv
import re
import random
import json
logger = logging.getLogger(__name__)
def process_unit(unit_id, sentences):
    """Merge all judgment rows collected for one unit into a single record.

    Each row is a dict (as yielded by ``csv.DictReader``) holding the unit
    metadata (``_unit_id``, ``id``, ``sentence``, ``frame``, ``lu``,
    ``_golden``), the candidate chunks (``chunk_NN``), the frame elements
    available for the frame (``fe_NN``), the frame element chosen by the
    worker for each chunk (``answer_chunk_NN``) and, for gold units, the
    accepted answers (``answer_chunk_NN_gold``).

    :param unit_id: value every row must carry in its ``_unit_id`` column
    :param sentences: non-empty list of rows belonging to that unit
    :return: dict with keys ``id``, ``sentence``, ``frame``, ``lu``,
        ``fes`` (frame element -> chunk picked by majority vote, or ``None``
        when no chunk was assigned to it) and ``gold_fes`` (chunk -> list of
        accepted frame elements, ``None`` standing for "no frame element";
        empty for non-gold units)
    :raises AssertionError: if the rows disagree on unit id, sentence id,
        sentence text or frame
    """
    assert all(s['_unit_id'] == unit_id for s in sentences)
    assert len(set(s['id'] for s in sentences)) == 1
    assert len(set(s['sentence'] for s in sentences)) == 1
    assert len(set(s['frame'] for s in sentences)) == 1

    # Unit-level columns are identical across rows, so the first row is
    # used as the reference for metadata, column counts and gold answers.
    first = sentences[0]

    # Column indexes are zero-based, hence the "1 +".
    chunk_count = 1 + max(int(k.split('_')[1]) for k in first
                          if re.match(r'chunk_\d+', k))
    fe_count = 1 + max(int(k.split('_')[1]) for k in first
                       if re.match(r'fe_\d+', k))

    # build a mapping chunk --> all assigned FEs
    chunks = defaultdict(list)
    for row in sentences:
        for i in range(chunk_count):
            fe = row['answer_chunk_%02d' % i]
            chunk = row['chunk_%02d' % i]
            # skip unused columns and explicit "None" answers
            if fe and chunk and fe != 'None':
                chunks[chunk].append(fe)

    # build a mapping FE --> chunk determined with majority voting
    fes = {}
    for chunk, judgments in chunks.items():
        counts = defaultdict(int)
        for judged in judgments:
            counts[judged] += 1
        most = max(counts.values())
        # ties between equally voted FEs are broken at random
        fe = random.choice([c for c, j in counts.items() if j == most])
        fes[fe] = chunk

    # fill in missing FEs
    for i in range(fe_count):
        fe = first['fe_%02d' % i]
        if fe and fe not in fes:
            fes[fe] = None

    is_gold = first['_golden'].lower() in {'t', 'true', 'y', 'yes'}
    gold_fes = {}
    if is_gold:
        # Gold answers are stored per chunk, so walk the chunk columns
        # (not the FE columns, whose count may differ).
        for i in range(chunk_count):
            chunk = first['chunk_%02d' % i]
            gold = first['answer_chunk_%02d_gold' % i]
            if chunk and gold:
                # several accepted answers are separated by newlines
                gold_fes[chunk] = [
                    g if g.lower() != 'none' else None
                    for g in gold.split('\n')
                ]

    unit = {
        'id': first['id'],
        'sentence': first['sentence'],
        'frame': first['frame'],
        'lu': first['lu'],
        'fes': fes,
        'gold_fes': dict(gold_fes),
    }
    return unit
@click.command()
@click.argument('results', type=click.File('r'))
@click.option('--outfile', '-o', type=click.Path(dir_okay=False), default='output/training_set.jsonlines')
@click.option('--split-lus', is_flag=True)
def main(results, outfile, split_lus):
    """ Parses the CSV with the results from crowdflower

    Rows are grouped by unit and every unit is written as one JSON object
    per line to OUTFILE. With --split-lus, OUTFILE is used as a %-format
    template receiving the lexical unit, giving one output file per LU.
    """
    # NOTE: click shows the docstring above as the command's --help text,
    # so it is written for end users; implementation notes live in comments.
    logger.info("Parsing annotation results from '%s' ...", results.name)

    # group the judgment rows by the unit they belong to
    sentences = defaultdict(list)
    for row in csv.DictReader(results):
        sentences[row['_unit_id']].append(row)

    # output file name --> open handle, opened lazily on first use
    files = {}

    def get_file(lu):
        # When splitting, `outfile` must contain a %-placeholder for the LU.
        fname = outfile % lu if split_lus else outfile
        if fname not in files:
            files[fname] = open(fname, 'w')
        return files[fname]

    try:
        for unit_id, rows in sentences.items():
            processed = process_unit(unit_id, rows)
            out = get_file(processed['lu'])
            out.write(json.dumps(processed))
            out.write('\n')
    finally:
        # close every handle even if processing a unit failed
        for f in files.values():
            f.close()

    logger.info("Done, training data dumped to %s",
                ', '.join(files.keys()))