
Commit b1a8f10

add callback_metadata to evaluate (#7952)
1 parent 48dce7f commit b1a8f10

3 files changed: +10 -2 lines changed


dspy/evaluate/evaluate.py (+5)
@@ -97,6 +97,7 @@ def __call__(
         display_table: Optional[bool] = None,
         return_all_scores: Optional[bool] = None,
         return_outputs: Optional[bool] = None,
+        callback_metadata: Optional[dict[str, Any]] = None,
     ):
         """
         Args:
@@ -113,6 +114,7 @@ def __call__(
                 use `self.return_all_scores`.
             return_outputs (bool): Whether to return the dspy program's outputs for every data in `devset`. if not
                 provided, use `self.return_outputs`.
+            callback_metadata (dict): Metadata to be used for evaluate callback handlers.

         Returns:
             The evaluation results are returned in different formats based on the flags:
@@ -139,6 +141,9 @@ def __call__(
         return_all_scores = return_all_scores if return_all_scores is not None else self.return_all_scores
         return_outputs = return_outputs if return_outputs is not None else self.return_outputs

+        if callback_metadata:
+            logger.debug(f"Evaluate is called with callback metadata: {callback_metadata}")
+
         tqdm.tqdm._instances.clear()

         executor = ParallelExecutor(
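
For context, a minimal usage sketch of the new parameter. The toy program, metric, and devset below are hypothetical placeholders, and an LM is assumed to be configured beforehand via `dspy.configure(lm=...)`; the sketch only illustrates passing `callback_metadata` through `Evaluate.__call__`:

    import dspy
    from dspy.evaluate import Evaluate

    # Hypothetical toy setup; assumes an LM has already been configured,
    # e.g. dspy.configure(lm=dspy.LM("openai/gpt-4o-mini")).
    program = dspy.Predict("question -> answer")
    devset = [dspy.Example(question="What is 2 + 2?", answer="4").with_inputs("question")]

    def exact_match(example, prediction, trace=None):
        return prediction.answer == example.answer

    evaluate = Evaluate(devset=devset, metric=exact_match, num_threads=1)

    # callback_metadata is optional; per this commit, a non-empty dict is
    # logged at DEBUG level and made available to evaluate callback handlers.
    score = evaluate(program, callback_metadata={"run": "smoke-test"})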

dspy/teleprompt/utils.py (+3 -2)
@@ -49,13 +49,14 @@ def eval_candidate_program(batch_size, trainset, candidate_program, evaluate, rn
     try:
         # Evaluate on the full trainset
         if batch_size >= len(trainset):
-            return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores)
+            return evaluate(candidate_program, devset=trainset, return_all_scores=return_all_scores, callback_metadata={"metric_key": "eval_full"})
         # Or evaluate on a minibatch
         else:
             return evaluate(
                 candidate_program,
                 devset=create_minibatch(trainset, batch_size, rng),
-                return_all_scores=return_all_scores
+                return_all_scores=return_all_scores,
+                callback_metadata={"metric_key": "eval_minibatch"}
             )
     except Exception:
         logger.error("An exception occurred during evaluation", exc_info=True)

tests/teleprompt/test_utils.py (+2)
@@ -23,6 +23,7 @@ def test_eval_candidate_program_full_trainset():
     evaluate.assert_called_once()
     _, called_kwargs = evaluate.call_args
     assert len(called_kwargs['devset']) == len(trainset)
+    assert called_kwargs['callback_metadata'] == {"metric_key": "eval_full"}
     assert result == 0

 def test_eval_candidate_program_minibatch():
@@ -36,6 +37,7 @@ def test_eval_candidate_program_minibatch():
     evaluate.assert_called_once()
     _, called_kwargs = evaluate.call_args
     assert len(called_kwargs['devset']) == batch_size
+    assert called_kwargs['callback_metadata'] == {"metric_key": "eval_minibatch"}
     assert result == 0

 @pytest.mark.parametrize("return_all_scores", [True, False])
