from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

-from haystack import MultiLabel, Label
+from haystack import MultiLabel, Label, BaseComponent, Document

from farm.evaluation.squad_evaluation import compute_f1 as calculate_f1_str
from farm.evaluation.squad_evaluation import compute_exact as calculate_em_str

logger = logging.getLogger(__name__)


-class EvalDocuments:
+class EvalDocuments(BaseComponent):
    """
    This is a pipeline node that should be placed after a node that returns a List of Document, e.g., Retriever or
    Ranker, in order to assess its performance. Performance metrics are stored in this class and updated as each
@@ -22,21 +22,22 @@ class EvalDocuments:
    a look at our evaluation tutorial for more info about open vs closed domain eval (
    https://haystack.deepset.ai/tutorials/evaluation).
    """
-    def __init__(self, debug: bool = False, open_domain: bool = True, top_k_eval_documents: int = 10, name="EvalDocuments"):
+
+    outgoing_edges = 1
+
+    def __init__(self, debug: bool = False, open_domain: bool = True, top_k: int = 10):
        """
        :param open_domain: When True, a document is considered correctly retrieved so long as the answer string can be found within it.
                            When False, correct retrieval is evaluated based on document_id.
        :param debug: When True, a record of each sample and its evaluation will be stored in EvalDocuments.log
        :param top_k: calculate eval metrics for top k results, e.g., recall@k
        """
-        self.outgoing_edges = 1
        self.init_counts()
        self.no_answer_warning = False
        self.debug = debug
        self.log: List = []
        self.open_domain = open_domain
-        self.top_k_eval_documents = top_k_eval_documents
-        self.name = name
+        self.top_k = top_k
        self.too_few_docs_warning = False
        self.top_k_used = 0

@@ -53,25 +54,25 @@ def init_counts(self):
        self.reciprocal_rank_sum = 0.0
        self.has_answer_reciprocal_rank_sum = 0.0

-    def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None, **kwargs):
+    def run(self, documents: List[Document], labels: List[Label], top_k: Optional[int] = None):  # type: ignore
        """Run this node on one sample and its labels"""
        self.query_count += 1
-        retriever_labels = get_label(labels, kwargs["node_id"])
-        if not top_k_eval_documents:
-            top_k_eval_documents = self.top_k_eval_documents
+        retriever_labels = get_label(labels, self.name)
+        if not top_k:
+            top_k = self.top_k

        if not self.top_k_used:
-            self.top_k_used = top_k_eval_documents
-        elif self.top_k_used != top_k_eval_documents:
+            self.top_k_used = top_k
+        elif self.top_k_used != top_k:
            logger.warning(f"EvalDocuments was last run with top_k_eval_documents={self.top_k_used} but is "
-                          f"being run again with top_k_eval_documents={self.top_k_eval_documents}. "
+                          f"being run again with top_k={self.top_k}. "
                           f"The evaluation counter is being reset from this point so that the evaluation "
                           f"metrics are interpretable.")
            self.init_counts()

-        if len(documents) < top_k_eval_documents and not self.too_few_docs_warning:
-            logger.warning(f"EvalDocuments is being provided less candidate documents than top_k_eval_documents "
-                          f"(currently set to {top_k_eval_documents}).")
+        if len(documents) < top_k and not self.too_few_docs_warning:
+            logger.warning(f"EvalDocuments is being provided less candidate documents than top_k "
+                          f"(currently set to {top_k}).")
            self.too_few_docs_warning = True

        # TODO retriever_labels is currently a Multilabel object but should eventually be a RetrieverLabel object
@@ -89,7 +90,7 @@ def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None,
        # If there are answer span annotations in the labels
        else:
            self.has_answer_count += 1
-            retrieved_reciprocal_rank = self.reciprocal_rank_retrieved(retriever_labels, documents, top_k_eval_documents)
+            retrieved_reciprocal_rank = self.reciprocal_rank_retrieved(retriever_labels, documents, top_k)
            self.reciprocal_rank_sum += retrieved_reciprocal_rank
            correct_retrieval = True if retrieved_reciprocal_rank > 0 else False
            self.has_answer_correct += int(correct_retrieval)
@@ -101,11 +102,11 @@ def run(self, documents, labels: dict, top_k_eval_documents: Optional[int]=None,
        self.recall = self.correct_retrieval_count / self.query_count
        self.mean_reciprocal_rank = self.reciprocal_rank_sum / self.query_count

-        self.top_k_used = top_k_eval_documents
+        self.top_k_used = top_k

        if self.debug:
-            self.log.append({"documents": documents, "labels": labels, "correct_retrieval": correct_retrieval, "retrieved_reciprocal_rank": retrieved_reciprocal_rank, **kwargs})
-        return {"documents": documents, "labels": labels, "correct_retrieval": correct_retrieval, "retrieved_reciprocal_rank": retrieved_reciprocal_rank, **kwargs}, "output_1"
+            self.log.append({"documents": documents, "labels": labels, "correct_retrieval": correct_retrieval, "retrieved_reciprocal_rank": retrieved_reciprocal_rank})
+        return {"correct_retrieval": correct_retrieval}, "output_1"

    def is_correctly_retrieved(self, retriever_labels, predictions):
        return self.reciprocal_rank_retrieved(retriever_labels, predictions) > 0
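Aside (not part of this commit): under the refactored node contract shown above, a component's run() receives upstream outputs as named parameters instead of digging through **kwargs, and returns only the keys it adds. A minimal sketch of a hypothetical downstream node consuming the "correct_retrieval" flag that EvalDocuments now emits; the class name is illustrative and the parameter matching is assumed from the same pattern EvalAnswers uses further down:

from haystack import BaseComponent


class LogCorrectRetrieval(BaseComponent):
    # Hypothetical example node: the pipeline is assumed to pass the
    # "correct_retrieval" value returned by EvalDocuments into this parameter.
    outgoing_edges = 1

    def run(self, correct_retrieval: bool):  # type: ignore
        # Only observe the upstream flag; contribute nothing new downstream.
        print(f"correct_retrieval={correct_retrieval}")
        return {}, "output_1"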
@@ -142,7 +143,7 @@ def print(self):
        print(f"mean_reciprocal_rank@{self.top_k_used}: {self.mean_reciprocal_rank:.4f}")


-class EvalAnswers:
+class EvalAnswers(BaseComponent):
    """
    This is a pipeline node that should be placed after a Reader in order to assess the performance of the Reader
    individually or to assess the extractive QA performance of the whole pipeline. Performance metrics are stored in
@@ -152,6 +153,8 @@ class EvalAnswers:
    open vs closed domain eval (https://haystack.deepset.ai/tutorials/evaluation).
    """

+    outgoing_edges = 1
+
    def __init__(self,
                 skip_incorrect_retrieval: bool = True,
                 open_domain: bool = True,
@@ -174,7 +177,6 @@ def __init__(self,
            - Large model for German only: "deepset/gbert-large-sts"
        :param debug: When True, a record of each sample and its evaluation will be stored in EvalAnswers.log
        """
-        self.outgoing_edges = 1
        self.log: List = []
        self.debug = debug
        self.skip_incorrect_retrieval = skip_incorrect_retrieval
@@ -203,14 +205,14 @@ def init_counts(self):
        self.top_1_sas = 0.0
        self.top_k_sas = 0.0

-    def run(self, labels, answers, **kwargs):
+    def run(self, labels: List[Label], answers: List[dict], correct_retrieval: bool):  # type: ignore
        """Run this node on one sample and its labels"""
        self.query_count += 1
        predictions = answers
-        skip = self.skip_incorrect_retrieval and not kwargs.get("correct_retrieval")
+        skip = self.skip_incorrect_retrieval and not correct_retrieval
        if predictions and not skip:
            self.correct_retrieval_count += 1
-            multi_labels = get_label(labels, kwargs["node_id"])
+            multi_labels = get_label(labels, self.name)
            # If this sample is impossible to answer and expects a no_answer response
            if multi_labels.no_answer:
                self.no_answer_count += 1
@@ -254,7 +256,7 @@ def run(self, labels, answers, **kwargs):
                self.top_k_em_count += top_k_em
                self.top_k_f1_sum += top_k_f1
        self.update_has_answer_metrics()
-        return {**kwargs}, "output_1"
+        return {}, "output_1"

    def evaluate_extraction(self, gold_labels, predictions):
        if self.open_domain:
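With both eval nodes now subclassing BaseComponent and declaring class-level outgoing_edges, they can be wired into a pipeline like any other node. A rough usage sketch, assuming the Haystack 0.x Pipeline.add_node API and pre-built retriever and reader objects; the variable names, node names, and labels plumbing are illustrative, not taken from this commit:

from haystack import Pipeline

eval_retriever = EvalDocuments(top_k=10)
eval_reader = EvalAnswers()

p = Pipeline()
p.add_node(component=retriever, name="Retriever", inputs=["Query"])
p.add_node(component=eval_retriever, name="EvalDocuments", inputs=["Retriever"])
p.add_node(component=reader, name="Reader", inputs=["EvalDocuments"])
p.add_node(component=eval_reader, name="EvalAnswers", inputs=["Reader"])

# One run() call per labelled query; `query_labels` stands in for whatever
# labels object your Haystack version expects for that query.
p.run(query="Who wrote Faust?", labels=query_labels)

# EvalDocuments exposes a print() summary (see the print() hunk above);
# EvalAnswers is assumed to provide the same.
eval_retriever.print()
eval_reader.print()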