run_eval.py (forked from NVIDIA/NeMo-Skills)
# Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import sys
from argparse import ArgumentParser
from pathlib import Path

# adding nemo_skills to python path to avoid requiring installation
sys.path.append(str(Path(__file__).absolute().parents[1]))

from launcher import CLUSTER_CONFIG, NEMO_SKILLS_CODE, WRAPPER_HELP, get_server_command, launch_job
from nemo_skills.inference.generate_solutions import HELP_MESSAGE
from nemo_skills.utils import setup_logging

SCRIPT_HELP = """
This script evaluates a model on a set of benchmarks.
It supports both greedy decoding and sampling and can parallelize generations
across multiple nodes. It uses nemo_skills/inference/generate_solutions.py
to generate solutions and nemo_skills/evaluation/evaluate_results.py to
evaluate them. It sets reasonable defaults for most of the generation parameters,
but you can override any of them by directly providing the corresponding arguments
in Hydra format.
"""


def get_greedy_cmd(benchmark, output_name='output-greedy.jsonl', extra_arguments=""):
    return f"""echo "Evaluating benchmark {benchmark}" && \
python nemo_skills/inference/generate_solutions.py \
    server.server_type={{server_type}} \
    prompt.context_type=empty \
    +dataset={benchmark} \
    output_file=/results/{benchmark}/{output_name} \
    {extra_arguments} && \
python nemo_skills/evaluation/evaluate_results.py \
    prediction_jsonl_files=/results/{benchmark}/{output_name} && \
"""


def get_sampling_cmd(benchmark, random_seed, extra_arguments=""):
    extra_arguments = f" inference.random_seed={random_seed} inference.temperature=0.7 {extra_arguments}"
    return get_greedy_cmd(benchmark, output_name=f"output-rs{random_seed}.jsonl", extra_arguments=extra_arguments)
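
# Note that each command string produced above ends with "&& \" and keeps {server_type}
# as a literal placeholder (escaped with double braces in the f-string), so commands for
# several benchmarks can be chained into a single shell line and formatted with
# format_dict right before the job is launched.
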
# default number of samples for majority voting
BENCHMARKS = {
    "gsm8k": 8,
    "math": 4,
}
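
# For every benchmark the wrapper schedules one greedy run plus the number of sampled
# runs listed above (temperature 0.7, seeds counting up from --starting_seed), which are
# later used for majority voting.
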
SLURM_CMD = """
nvidia-smi && \
cd /code && \
export PYTHONPATH=$PYTHONPATH:/code && \
{server_start_cmd} && \
if [ $SLURM_LOCALID -eq 0 ]; then \
echo "Waiting for the server to start" && \
tail -n0 -f /tmp/server_logs.txt | sed '/Running on all addresses/ q' && \
{eval_cmds} \
kill %1; \
else \
sleep infinity; \
fi \
"""

MOUNTS = "{NEMO_SKILLS_CODE}:/code,{model_path}:/model,{output_dir}:/results"
JOB_NAME = "eval-{model_name}"


if __name__ == "__main__":
    setup_logging(disable_hydra_logs=False)
    parser = ArgumentParser(usage=WRAPPER_HELP + '\n\n' + SCRIPT_HELP + '\n\nscript arguments:\n\n' + HELP_MESSAGE)
    wrapper_args = parser.add_argument_group('wrapper arguments')
    wrapper_args.add_argument("--model_path", required=True)
    wrapper_args.add_argument("--server_type", choices=('nemo', 'tensorrt_llm'), default='tensorrt_llm')
    wrapper_args.add_argument("--output_dir", required=True)
    wrapper_args.add_argument("--num_gpus", type=int, required=True)
    wrapper_args.add_argument("--starting_seed", type=int, default=0)
    wrapper_args.add_argument(
        "--benchmarks",
        nargs="+",
        default=[],
        help="Needs to be in the format <benchmark>:<num samples for majority voting>. "
        "Use <benchmark>:0 to only run greedy decoding.",
    )
    wrapper_args.add_argument(
        "--num_nodes",
        type=int,
        default=-1,
        help="Will parallelize across this number of nodes. Set to -1 to run each decoding on a separate node.",
    )
    wrapper_args.add_argument(
        "--partition",
        required=False,
        help="Can be specified if you need interactive jobs or a specific non-default partition.",
    )
    args, unknown = parser.parse_known_args()
    extra_arguments = f'{" ".join(unknown)}'

    args.model_path = Path(args.model_path).absolute()
    args.output_dir = Path(args.output_dir).absolute()

    server_start_cmd, num_tasks = get_server_command(args.server_type, args.num_gpus)

    format_dict = {
        "model_path": args.model_path,
        "model_name": args.model_path.name,
        "output_dir": args.output_dir,
        "num_gpus": args.num_gpus,
        "server_start_cmd": server_start_cmd,
        "server_type": args.server_type,
        "NEMO_SKILLS_CODE": NEMO_SKILLS_CODE,
    }

    Path(args.output_dir).mkdir(exist_ok=True, parents=True)

    # if benchmarks are specified, only run those
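    # e.g. --benchmarks gsm8k:4 math:0 parses to {"gsm8k": 4, "math": 0} (0 = greedy only)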
    if args.benchmarks:
        BENCHMARKS = {k: int(v) for k, v in [b.split(":") for b in args.benchmarks]}

    eval_cmds = [get_greedy_cmd(benchmark, extra_arguments=extra_arguments) for benchmark in BENCHMARKS.keys()]
    eval_cmds += [
        get_sampling_cmd(benchmark, rs, extra_arguments=extra_arguments)
        for benchmark, rs_num in BENCHMARKS.items()
        for rs in range(args.starting_seed, args.starting_seed + rs_num)
    ]
    if args.num_nodes == -1:
        args.num_nodes = len(eval_cmds)

    # splitting eval cmds equally across num_nodes nodes
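    # e.g. 5 commands on 2 nodes: node 0 runs commands 0, 2, 4 and node 1 runs commands 1, 3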
    eval_cmds = [" ".join(eval_cmds[i :: args.num_nodes]) for i in range(args.num_nodes)]

    for idx, eval_cmd in enumerate(eval_cmds):
        extra_sbatch_args = ["--parsable", f"--output={args.output_dir}/slurm_logs_eval{idx}.log"]
        launch_job(
            cmd=SLURM_CMD.format(**format_dict, eval_cmds=eval_cmd.format(**format_dict)),
            num_nodes=1,
            tasks_per_node=num_tasks,
            gpus_per_node=format_dict["num_gpus"],
            job_name=JOB_NAME.format(**format_dict),
            container=CLUSTER_CONFIG["containers"][args.server_type],
            mounts=MOUNTS.format(**format_dict),
            partition=args.partition,
            with_sandbox=True,
            extra_sbatch_args=extra_sbatch_args,
        )