SSD-ResNet34 FP32 Training - update num inter/intra threads calculation and README notes

dmsuehir · ashahba · commit 6a8ed27262c0 · 2020-11-11T16:01:43.000-08:00
diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md b/benchmarks/object_detection/tensorflow/ssd-resnet34/README.md
@@ -385,18 +385,29 @@ $ pushd $MODEL_WORK_DIR
 
    To run for training, use the following command.
 
+   > Note: for best performance, use the same value for the `--num-cores` and `--num-intra-threads` arguments, as follows:
+   >   For a single instance run (mpi_num_processes=1): use the number of logical cores per socket.
+   >   For a multi-instance run (mpi_num_processes > 1): use (#_of_logical_cores_per_socket - 2).
+   >   If the `--num-cores` or `--num-intra-threads` args are not specified, they will be calculated based on
+   >   the number of logical cores on your system.
+
    ```bash
    $ cd $MODEL_WORK_DIR/models/benchmarks/
    
     $ python3 launch_benchmark.py \
     --data-location /path/to/coco-dataset \
     --model-source-dir $MODEL_WORK_DIR/tf_models \
-    --model-name ssd-resnet34 --framework tensorflow \
+    --model-name ssd-resnet34 \
+    --framework tensorflow \
     --precision fp32 --mode training \
-    --num-train-steps 100 --num-cores 52 \
-    --num-inter-threads 1 --num-intra-threads 52 \
-    --batch-size=52 --weight_decay=1e-4 \
-    --mpi_num_processes=1 --mpi_num_processes_per_socket=1 \
+    --num-train-steps 100 \
+    --num-cores 52 \
+    --num-inter-threads 1 \
+    --num-intra-threads 52 \
+    --batch-size=100 \
+    --weight_decay=1e-4 \
+    --mpi_num_processes=1 \
+    --mpi_num_processes_per_socket=1 \
     --docker-image intel/intel-optimized-tensorflow:2.3.0
    ```
 
@@ -408,22 +419,30 @@ $ pushd $MODEL_WORK_DIR
    2. Next, navigate to the benchmarks directory of the intelai/models repository that was cloned earlier.
       Use the below command to test performance by training the model for a limited number of steps:
 
-      Note: for best performance, use the same value for the arguments num-cores and num-intra-thread as follows:
-        For single instance run (mpi_num_processes=1): the value is equal to number of logical cores per socket.
-        For multi-instance run (mpi_num_processes > 1): the value is equal to (#_of_logical_cores_per_socket - 2).
+      > Note: for best performance, use the same value for the `--num-cores` and `--num-intra-threads` arguments, as follows:
+      >   For a single instance run (mpi_num_processes=1): use the number of logical cores per socket.
+      >   For a multi-instance run (mpi_num_processes > 1): use (#_of_logical_cores_per_socket - 2).
+      >   If the `--num-cores` or `--num-intra-threads` args are not specified, they will be calculated based on
+      >   the number of logical cores on your system.
 
       ```bash
       $ cd $MODEL_WORK_DIR/models/benchmarks/
       $ python3 launch_benchmark.py \
       --data-location <path to coco_training_dataset> \
       --model-source-dir <path to tf_models> \
-      --model-name ssd-resnet34 --framework tensorflow \
-      --precision bfloat16 --mode training \
-      --num-train-steps 100 --num-cores 52 \
-      --num-inter-threads 1 --num-intra-threads 52 \
-      --batch-size=100 --weight_decay=1e-4 \
+      --model-name ssd-resnet34 \
+      --framework tensorflow \
+      --precision bfloat16 \
+      --mode training \
+      --num-train-steps 100 \
+      --num-cores 52 \
+      --num-inter-threads 1 \
+      --num-intra-threads 52 \
+      --batch-size=100 \
+      --weight_decay=1e-4 \
       --num_warmup_batches=20 \
-      --mpi_num_processes=1 --mpi_num_processes_per_socket=1 \
+      --mpi_num_processes=1 \
+      --mpi_num_processes_per_socket=1 \
       --docker-image intel/intel-optimized-tensorflow:2.3.0
       ```
 
diff --git a/benchmarks/object_detection/tensorflow/ssd-resnet34/training/common_model_init.py b/benchmarks/object_detection/tensorflow/ssd-resnet34/training/common_model_init.py
@@ -42,15 +42,13 @@ def __init__(self, args, custom_args, platform_util):
         config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
         self.set_kmp_vars(config_file_path)
 
-        self.set_num_inter_intra_threads()
-
         # Train parameter parser
         parser = argparse.ArgumentParser(description="process custom_args")
         parser.add_argument('--weight_decay', type=float, default=5e-4)
         parser.add_argument('--num_warmup_batches', type=int, default=0)
         parser.add_argument('--num_train_steps', type=int, default=500, help='number of training batches')
         parser.add_argument('--num_inter_threads', type=int, default=1, help='number of inter-threads')
-        parser.add_argument('--num_intra_threads', type=int, default=28, help='number of intra-threads')
+        parser.add_argument('--num_intra_threads', type=int, default=-1, help='number of intra-threads')
         parser.add_argument('--epochs', dest="epochs", type=int, default=60,
                             help='number of training epochs. Pass 0 to train based on number of train_steps instead of number of epochs')  # noqa: E501
         parser.add_argument('--save_model_steps', dest="save_model_steps", type=int, default=10000,
@@ -59,6 +57,28 @@ def __init__(self, args, custom_args, platform_util):
 
         self.args = parser.parse_args(self.custom_args, namespace=self.args)
 
+        # Calculate num cores and num intra threads, if the values weren't provided.
+        # For a single instance run (MPI_NUM_PROCESSES unset, empty, or <= 1), use the
+        # number of logical cores per socket; for multi instance, use that number - 2.
+        # Note that most models use the number of physical cores for these values,
+        # but this model performs better when using logical cores.
+        if not self.args.num_cores or not self.args.num_intra_threads:
+            num_logical_cores_per_socket = \
+                platform_util.num_cores_per_socket * platform_util.num_threads_per_core
+
+            cores_to_use = num_logical_cores_per_socket \
+                if int(os.environ.get("MPI_NUM_PROCESSES") or 1) <= 1 else \
+                num_logical_cores_per_socket - 2
+
+            if not self.args.num_cores or self.args.num_cores == -1:
+                self.args.num_cores = cores_to_use
+
+            if not self.args.num_intra_threads or self.args.num_intra_threads == -1:
+                self.args.num_intra_threads = cores_to_use
+
+        if not self.args.num_inter_threads:
+            self.args.num_inter_threads = 1
+
         omp_num_threads = platform_util.num_cores_per_socket
 
         set_env_var("OMP_NUM_THREADS", omp_num_threads if self.args.num_cores == -1 else self.args.num_cores)