
Commit d9204ca

Author: Kasravi, Kam D (committed)

Provide defaults for args.noinstall depending on whether --docker-image is specified (default=False) or not (default=True)
1 parent 02a14e3 commit d9204ca


7 files changed: +60 −14 lines changed


Contribute.md (+4 −4)

@@ -71,10 +71,10 @@ required:
 first step where you setup the directories for your model. In this
 function, add commands to install any third-party dependencies within
 an `if [ ${NOINSTALL} != "True" ]; then` conditional block. The
-purpose of the `NOINSTALL` flag is to be able to skip the installs
-for quicker iteration when running on bare metal or debugging. If
-your model requires the `PYTHONPATH` environment variable to be setup
-to find model code or dependencies, that should be done in the
+purpose of the `--noinstall` flag or `NOINSTALL` env var is to be able
+to skip the installs for quicker iteration when running on bare metal
+or debugging. If your model requires the `PYTHONPATH` environment variable
+to be setup to find model code or dependencies, that should be done in the
 model's function. Next, setup the command that will be run. The
 standard launch script args are already added to the `CMD` variable,
 so your model function will only need to add on more args if you have

benchmarks/common/base_benchmark_util.py (+5)

@@ -97,6 +97,11 @@ def _define_args(self):
             help="Specify how many MPI processes to launch per socket",
             dest="num_mpi", default=1)

+        self._common_arg_parser.add_argument(
+            "--mpi_hostnames",
+            help="Specify MPI hostnames string of the form --mpi_hostnames host1,host2,host3",
+            dest="mpi_hostnames", default=None)
+
         self._common_arg_parser.add_argument(
             "-d", "--data-location",
             help="Specify the location of the data. If this parameter is not "

benchmarks/common/base_model_init.py (+17 −2)

@@ -57,8 +57,23 @@ def __init__(self, args, custom_args=[], platform_util=None):
         if not platform_util:
             raise ValueError("Did not find any platform info.")

-        # Invoke mpirun if mpi_num_processes env is not None
-        if os.environ["MPI_NUM_PROCESSES"] != "None":
+        # use case: bare-metal with openmpi, horovod and multi-node
+        if os.environ["MPI_HOSTNAMES"] != "None" and not "DOCKER" in os.environ or os.environ["DOCKER"] == "False":
+            if os.environ["MPI_NUM_PROCESSES"] != "None":
+                try:
+                    # slots per host calculation using MPI_NUM_PROCESSES and number of hosts
+                    host_names = os.environ["MPI_HOSTNAMES"]
+                    number_of_hosts = len(host_names.split(','))
+                    slots_per_host = int(int(os.environ["MPI_NUM_PROCESSES"]) / number_of_hosts)
+                    host_names = ",".join([ host + ":" + str(slots_per_host) for host in host_names.split(',') ])
+                    # see the [examples](https://horovod.readthedocs.io/en/latest/mpirun.html) for the mca flags
+                    self.python_exe = "mpirun " + " -x LD_LIBRARY_PATH " + " -x PYTHONPATH " + " --allow-run-as-root -n " + os.environ["MPI_NUM_PROCESSES"] + " -H " + host_names + " -mca pml ob1 -mca btl ^openib -mca btl_tcp_if_exclude lo,docker0 --bind-to none --map-by slot " + self.python_exe
+                except Exception as exception:
+                    raise ValueError("Caught exception calculating slots per host {}".format(str(exception)))
+            else:
+                raise ValueError("MPI_NUM_PROCESSES is required for MPI_HOSTNAMES and will be split evenly across the hosts.")
+        # use case: docker with openmpi, single-node, multi-instance
+        elif os.environ["MPI_NUM_PROCESSES"] != "None":
             if os.environ["MPI_NUM_PROCESSES_PER_SOCKET"] == "1":
                 # Map by socket using OpenMPI by default (PPS=1).
                 self.python_exe = "mpirun --allow-run-as-root -n " + os.environ["MPI_NUM_PROCESSES"] + " --map-by socket " + self.python_exe

benchmarks/common/tensorflow/start.sh (+4 −1)

@@ -51,6 +51,9 @@ echo " NOINSTALL: ${NOINSTALL}"
 echo " OUTPUT_DIR: ${OUTPUT_DIR}"
 echo " MPI_NUM_PROCESSES: ${MPI_NUM_PROCESSES}"
 echo " MPI_NUM_PEOCESSES_PER_SOCKET: ${MPI_NUM_PROCESSES_PER_SOCKET}"
+echo " MPI_HOSTNAMES: ${MPI_HOSTNAMES}"
+echo " PYTHON_EXE: ${PYTHON_EXE}"
+echo " PYTHONPATH: ${PYTHONPATH}"

 # inference & training is supported right now
 if [ ${MODE} != "inference" ] && [ ${MODE} != "training" ]; then

@@ -77,7 +80,7 @@ if [[ ${NOINSTALL} != "True" ]]; then
 fi

 if [[ ${MPI_NUM_PROCESSES} != "None" ]]; then
-  # Installing OpenMPI
+  ## Installing OpenMPI
   apt-get install openmpi-bin openmpi-common openssh-client openssh-server libopenmpi-dev -y
   # Horovod Installation
   export HOROVOD_WITHOUT_PYTORCH=1

benchmarks/image_recognition/tensorflow/resnet50v1_5/README.md (+16)

@@ -618,3 +618,19 @@ $ python launch_benchmark.py \
 ```

 You can check output trained model accuracy by setting `--eval=True` in the command. After training is over, it automatically run inference and report accuracy results.
+
+Finally, the following command runs MPI across multiple nodes on bare-metal, with 2 MPI processes per node. Each node must have passwordless ssh enabled for the user running the command below. All hosts should have these additional packages installed: (apt-get) openmpi-bin openmpi-common libopenmpi-dev, (pip) horovod==0.19.2
+
+```
+$ python launch_benchmark.py \
+--verbose \
+--model-name=resnet50v1_5 \
+--precision=fp32 \
+--mode=training \
+--framework tensorflow \
+--noinstall \
+--checkpoint=/home/<user>/checkpoints \
+--data-location=/home/<user>/dataset/ImageNetData_directory \
+--mpi_hostnames='host1,host2' \
+--mpi_num_processes=4 2>&1
+```

benchmarks/launch_benchmark.py (+9 −6)

@@ -91,6 +91,12 @@ def parse_args(self):
             "--debug", help="Launches debug mode which doesn't execute "
             "start.sh when running in a docker container.", action="store_true")

+        arg_parser.add_argument(
+            "--noinstall",
+            help="whether to install packages for a given model when running in docker "
+            "(default --noinstall='False') or on bare metal (default --noinstall='True')",
+            dest="noinstall", action="store_true", default=None)
+
         return arg_parser.parse_known_args()

     def validate_args(self):

@@ -192,6 +198,7 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
             "NUM_CORES": args.num_cores,
             "NUM_INTER_THREADS": args.num_inter_threads,
             "NUM_INTRA_THREADS": args.num_intra_threads,
+            "NOINSTALL": str(args.noinstall) if args.noinstall is not None else "True" if not args.docker_image else "False",
             "DATA_NUM_INTER_THREADS": args.data_num_inter_threads,
             "NUM_TRAIN_STEPS": args.num_train_steps,
             "DATA_NUM_INTRA_THREADS": args.data_num_intra_threads,

@@ -203,7 +210,8 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
             "DOCKER": str(args.docker_image is not None),
             "PYTHON_EXE": sys.executable if not args.docker_image else "python",
             "MPI_NUM_PROCESSES": args.mpi,
-            "MPI_NUM_PROCESSES_PER_SOCKET": args.num_mpi
+            "MPI_NUM_PROCESSES_PER_SOCKET": args.num_mpi,
+            "MPI_HOSTNAMES": args.mpi_hostnames
             }

         # Add custom model args as env vars)

@@ -216,11 +224,6 @@ def get_env_vars(self, benchmark_scripts, use_case, intelai_models,
                 split_arg[0] = split_arg[0].replace("-", "_").lstrip('_')
                 env_var_dict[split_arg[0]] = split_arg[1]

-
-        # Set the default value for NOINSTALL, if it's not explicitly set by the user
-        if "NOINSTALL" not in env_var_dict:
-            env_var_dict["NOINSTALL"] = "False"
-
         return env_var_dict

     def run_bare_metal(self, benchmark_scripts, intelai_models,
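To make the new NOINSTALL default explicit, a minimal sketch (the `resolve_noinstall` helper and `SimpleNamespace` args are illustrative assumptions, not the repo's API) of how the chained conditional added above resolves for docker vs. bare metal:

```
from types import SimpleNamespace

def resolve_noinstall(args):
    # Mirrors the expression added above:
    # str(args.noinstall) if args.noinstall is not None else "True" if not args.docker_image else "False"
    if args.noinstall is not None:
        return str(args.noinstall)
    return "True" if not args.docker_image else "False"

# Bare metal (no --docker-image): installs are skipped by default.
print(resolve_noinstall(SimpleNamespace(noinstall=None, docker_image=None)))       # "True"
# Docker run: installs happen by default.
print(resolve_noinstall(SimpleNamespace(noinstall=None, docker_image="img:tag")))  # "False"
# --noinstall passed explicitly: always honored.
print(resolve_noinstall(SimpleNamespace(noinstall=True, docker_image="img:tag")))  # "True"
```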

docs/general/tensorflow/LaunchBenchmark.md (+5 −1)

@@ -137,6 +137,9 @@ optional arguments:
                         argument can only be used in conjunction with a
                         --docker-image.
   --debug               Launches debug mode which doesn't execute start.sh
+  --noinstall           Whether to install packages for a given model when
+                        running in docker (default --noinstall='False') or on
+                        bare metal (default --noinstall='True')
 ```

 ## Volume mounts

@@ -292,7 +295,8 @@ Docker container, running on bare metal also will only work when running
 on Ubuntu.

 Before running a model, you must also install all the dependencies
-that are required to run that model.
+that are required to run that model. **(Note: the `--noinstall`
+flag defaults to 'True' when running on bare metal.)**

 Basic requirements for running all models include:
 * python (If the model's README file specifies to use a python3 TensorFlow docker image, then use python 3 on bare metal, otherwise use python 2.7)
