
Commit d9204ca

Author: Kasravi, Kam D (committed)

Provide defaults for args.noinstall depending on whether --docker-image is specified (default=False) or not (default=True)
1 parent 02a14e3 commit d9204ca


7 files changed: +60 −14 lines changed


Contribute.md (+4 −4)

@@ -71,10 +71,10 @@ required:
 first step where you setup the directories for your model. In this
 function, add commands to install any third-party dependencies within
 an `if [ ${NOINSTALL} != "True" ]; then` conditional block. The
-purpose of the `NOINSTALL` flag is to be able to skip the installs
-for quicker iteration when running on bare metal or debugging. If
-your model requires the `PYTHONPATH` environment variable to be setup
-to find model code or dependencies, that should be done in the
+purpose of the `--noinstall` flag or `NOINSTALL` env var is to be able
+to skip the installs for quicker iteration when running on bare metal
+or debugging. If your model requires the `PYTHONPATH` environment variable
+to be setup to find model code or dependencies, that should be done in the
 model's function. Next, setup the command that will be run. The
 standard launch script args are already added to the `CMD` variable,
 so your model function will only need to add on more args if you have

benchmarks/common/base_benchmark_util.py (+5)

@@ -97,6 +97,11 @@ def _define_args(self):
             help="Specify how many MPI processes to launch per socket",
             dest="num_mpi", default=1)

+        self._common_arg_parser.add_argument(
+            "--mpi_hostnames",
+            help="Specify MPI hostnames string of the form --mpi_hostnames host1,host2,host3",
+            dest="mpi_hostnames", default=None)
+
         self._common_arg_parser.add_argument(
             "-d", "--data-location",
             help="Specify the location of the data. If this parameter is not "

benchmarks/common/base_model_init.py (+17 −2)

@@ -57,8 +57,23 @@ def __init__(self, args, custom_args=[], platform_util=None):
         if not platform_util:
             raise ValueError("Did not find any platform info.")

-        # Invoke mpirun if mpi_num_processes env is not None
-        if os.environ["MPI_NUM_PROCESSES"] != "None":
+        # use case: bare-metal with openmpi, horovod and multi-node
+        if os.environ["MPI_HOSTNAMES"] != "None" and not "DOCKER" in os.environ or os.environ["DOCKER"] == "False":
+            if os.environ["MPI_NUM_PROCESSES"] != "None":
+                try:
+                    # slots per host calculation using MPI_NUM_PROCESSES and number of hosts
+                    host_names = os.environ["MPI_HOSTNAMES"]
+                    number_of_hosts = len(host_names.split(','))
+                    slots_per_host = int(int(os.environ["MPI_NUM_PROCESSES"]) / number_of_hosts)
+                    host_names = ",".join([ host + ":" + str(slots_per_host) for host in host_names.split(',') ])
+                    # see the [examples](https://horovod.readthedocs.io/en/latest/mpirun.html) for the mca flags
+                    self.python_exe = "mpirun " + " -x LD_LIBRARY_PATH " + " -x PYTHONPATH " + " --allow-run-as-root -n " + os.environ["MPI_NUM_PROCESSES"] + " -H " + host_names + " -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_exclude lo,docker0 --bind-to none --map-by slot " + self.python_exe
+                except Exception as exception:
+                    raise ValueError("Caught exception calculating slots per host {}".format(str(exception)))
+            else:
+                raise ValueError("MPI_NUM_PROCESSES is required for MPI_HOSTNAMES and will be split evenly across the hosts.")
+        # use case: docker with openmpi, single-node, multi-instance
+        elif os.environ["MPI_NUM_PROCESSES"] != "None":
             if os.environ["MPI_NUM_PROCESSES_PER_SOCKET"] == "1":
                 # Map by socket using OpenMPI by default (PPS=1).
                 self.python_exe = "mpirun --allow-run-as-root -n " + os.environ["MPI_NUM_PROCESSES"] + " --map-by socket " + self.python_exe

benchmarks/common/tensorflow/start.sh (+4 −1)

@@ -51,6 +51,9 @@ echo " NOINSTALL: ${NOINSTALL}"
 echo " OUTPUT_DIR: ${OUTPUT_DIR}"
 echo " MPI_NUM_PROCESSES: ${MPI_NUM_PROCESSES}"
 echo " MPI_NUM_PEOCESSES_PER_SOCKET: ${MPI_NUM_PROCESSES_PER_SOCKET}"
+echo " MPI_HOSTNAMES: ${MPI_HOSTNAMES}"
+echo " PYTHON_EXE: ${PYTHON_EXE}"
+echo " PYTHONPATH: ${PYTHONPATH}"

 # inference & training is supported right now
 if [ ${MODE} != "inference" ] && [ ${MODE} != "training" ]; then

@@ -77,7 +80,7 @@ if [[ ${NOINSTALL} != "True" ]]; then
 fi

 if [[ ${MPI_NUM_PROCESSES} != "None" ]]; then
-  # Installing OpenMPI
+  ## Installing OpenMPI
   apt-get install openmpi-bin openmpi-common openssh-client openssh-server libopenmpi-dev -y
   # Horovod Installation
   export HOROVOD_WITHOUT_PYTORCH=1

benchmarks/image_recognition/tensorflow/resnet50v1_5/README.md (+16)

@@ -618,3 +618,19 @@ $ python launch_benchmark.py \
 ```

 You can check output trained model accuracy by setting `--eval=True` in the command. After training is over, it automatically run inference and report accuracy results.
+
+Finally, the following command runs MPI across multiple nodes on bare-metal, with 2 MPI processes per node. Each node must have passwordless ssh enabled for the user running the command below. All hosts should have these additional packages installed: (apt-get) openmpi-bin openmpi-common libopenmpi-dev, (pip) horovod==0.19.2
+
+```
+$ python launch_benchmark.py \
+--verbose \
+--model-name=resnet50v1_5 \
+--precision=fp32 \
+--mode=training \
+--framework tensorflow \
+--noinstall \
+--checkpoint=/home/<user>/checkpoints \
+--data-location=/home/<user>/dataset/ImageNetData_directory \
+--mpi_hostnames='host1,host2' \
+--mpi_num_processes=4 2>&1
+```

benchmarks/launch_benchmark.py (+9 −6)

@@ -91,6 +91,12 @@ def parse_args(self):
             "--debug", help="Launches debug mode which doesn't execute "
             "start.sh when running in a docker container.", action="store_true")

+        arg_parser.add_argument(
+            "--noinstall",
+            help="whether to install packages for a given model when running in docker "
+            "(default --noinstall='False') or on bare metal (default --noinstall='True')",
+            dest="noinstall", action="store_true", default=None)
+
         return arg_parser.parse_known_args()

     def validate_args(self):

@@ -192,6 +198,7 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
             "NUM_CORES": args.num_cores,
             "NUM_INTER_THREADS": args.num_inter_threads,
             "NUM_INTRA_THREADS": args.num_intra_threads,
+            "NOINSTALL": str(args.noinstall) if args.noinstall is not None else "True" if not args.docker_image else "False",
             "DATA_NUM_INTER_THREADS": args.data_num_inter_threads,
             "NUM_TRAIN_STEPS": args.num_train_steps,
             "DATA_NUM_INTRA_THREADS": args.data_num_intra_threads,

@@ -203,7 +210,8 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
             "DOCKER": str(args.docker_image is not None),
             "PYTHON_EXE": sys.executable if not args.docker_image else "python",
             "MPI_NUM_PROCESSES": args.mpi,
-            "MPI_NUM_PROCESSES_PER_SOCKET": args.num_mpi
+            "MPI_NUM_PROCESSES_PER_SOCKET": args.num_mpi,
+            "MPI_HOSTNAMES": args.mpi_hostnames
             }

         # Add custom model args as env vars)

@@ -216,11 +224,6 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
                 split_arg[0] = split_arg[0].replace("-", "_").lstrip('_')
                 env_var_dict[split_arg[0]] = split_arg[1]

-
-        # Set the default value for NOINSTALL, if it's not explicitly set by the user
-        if "NOINSTALL" not in env_var_dict:
-            env_var_dict["NOINSTALL"] = "False"
-
         return env_var_dict

     def run_bare_metal(self, benchmark_scripts, intelai_models,
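To make the new NOINSTALL default explicit, a minimal sketch (the `resolve_noinstall` helper and `SimpleNamespace` args are illustrative assumptions, not the repo's API) of how the chained conditional added above resolves for docker vs. bare metal:

```
from types import SimpleNamespace

def resolve_noinstall(args):
    # Mirrors the expression added above:
    # str(args.noinstall) if args.noinstall is not None else "True" if not args.docker_image else "False"
    if args.noinstall is not None:
        return str(args.noinstall)
    return "True" if not args.docker_image else "False"

# Bare metal (no --docker-image): installs are skipped by default.
print(resolve_noinstall(SimpleNamespace(noinstall=None, docker_image=None)))       # "True"
# Docker run: installs happen by default.
print(resolve_noinstall(SimpleNamespace(noinstall=None, docker_image="img:tag")))  # "False"
# --noinstall passed explicitly: always honored.
print(resolve_noinstall(SimpleNamespace(noinstall=True, docker_image="img:tag")))  # "True"
```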

docs/general/tensorflow/LaunchBenchmark.md (+5 −1)

@@ -137,6 +137,9 @@ optional arguments:
                         argument can only be used in conjunction with a
                         --docker-image.
   --debug               Launches debug mode which doesn't execute start.sh
+  --noinstall           Whether to install packages for a given model when
+                        running in docker (default --noinstall='False') or on
+                        bare metal (default --noinstall='True')
 ```

 ## Volume mounts

@@ -292,7 +295,8 @@ Docker container, running on bare metal also will only work when running
 on Ubuntu.

 Before running a model, you must also install all the dependencies
-that are required to run that model.
+that are required to run that model. **(Note: the `--noinstall`
+flag defaults to 'True' when running on bare metal.)**

 Basic requirements for running all models include:
 * python (If the model's README file specifies to use a python3 TensorFlow docker image, then use python 3 on bare metal, otherwise use python 2.7)
