Skip to content

Commit 6a8ed27

Browse files
dmsuehirashahba
authored andcommitted
SSD-ResNet34 FP32 Training - update num inter/intra threads calculation and README notes
1 parent 71a7bc9 commit 6a8ed27

File tree

2 files changed

+56
-17
lines changed

2 files changed

+56
-17
lines changed

benchmarks/object_detection/tensorflow/ssd-resnet34/README.md

+33-14
Original file line numberDiff line numberDiff line change
@@ -385,18 +385,29 @@ $ pushd $MODEL_WORK_DIR
385385

386386
To run for training, use the following command.
387387

388+
> Note: for best performance, use the same value for the `--num-cores` and `--num-intra-threads` arguments, as follows:
389+
> For single instance run (mpi_num_processes=1): the value is equal to the number of logical cores per socket.
390+
> For multi-instance run (mpi_num_processes > 1): the value is equal to (#_of_logical_cores_per_socket - 2).
391+
> If the `--num-cores` or `--num-intra-threads` args are not specified, these args will be calculated based on
392+
> the number of logical cores on your system.
393+
388394
```bash
389395
$ cd $MODEL_WORK_DIR/models/benchmarks/
390396

391397
$ python3 launch_benchmark.py \
392398
--data-location /path/to/coco-dataset \
393399
--model-source-dir $MODEL_WORK_DIR/tf_models \
394-
--model-name ssd-resnet34 --framework tensorflow \
400+
--model-name ssd-resnet34 \
401+
--framework tensorflow \
395402
--precision fp32 --mode training \
396-
--num-train-steps 100 --num-cores 52 \
397-
--num-inter-threads 1 --num-intra-threads 52 \
398-
--batch-size=52 --weight_decay=1e-4 \
399-
--mpi_num_processes=1 --mpi_num_processes_per_socket=1 \
403+
--num-train-steps 100 \
404+
--num-cores 52 \
405+
--num-inter-threads 1 \
406+
--num-intra-threads 52 \
407+
--batch-size=100 \
408+
--weight_decay=1e-4 \
409+
--mpi_num_processes=1 \
410+
--mpi_num_processes_per_socket=1 \
400411
--docker-image intel/intel-optimized-tensorflow:2.3.0
401412
```
402413

@@ -408,22 +419,30 @@ $ pushd $MODEL_WORK_DIR
408419
2. Next, navigate to the benchmarks directory of the intelai/models repository that was cloned earlier.
409420
Use the below command to test performance by training the model for a limited number of steps:
410421

411-
Note: for best performance, use the same value for the arguments num-cores and num-intra-thread as follows:
412-
For single instance run (mpi_num_processes=1): the value is equal to number of logical cores per socket.
413-
For multi-instance run (mpi_num_processes > 1): the value is equal to (#_of_logical_cores_per_socket - 2).
422+
> Note: for best performance, use the same value for the `--num-cores` and `--num-intra-threads` arguments, as follows:
423+
> For single instance run (mpi_num_processes=1): the value is equal to the number of logical cores per socket.
424+
> For multi-instance run (mpi_num_processes > 1): the value is equal to (#_of_logical_cores_per_socket - 2).
425+
> If the `--num-cores` or `--num-intra-threads` args are not specified, these args will be calculated based on
426+
> the number of logical cores on your system.
414427
415428
```bash
416429
$ cd $MODEL_WORK_DIR/models/benchmarks/
417430
$ python3 launch_benchmark.py \
418431
--data-location <path to coco_training_dataset> \
419432
--model-source-dir <path to tf_models> \
420-
--model-name ssd-resnet34 --framework tensorflow \
421-
--precision bfloat16 --mode training \
422-
--num-train-steps 100 --num-cores 52 \
423-
--num-inter-threads 1 --num-intra-threads 52 \
424-
--batch-size=100 --weight_decay=1e-4 \
433+
--model-name ssd-resnet34 \
434+
--framework tensorflow \
435+
--precision bfloat16 \
436+
--mode training \
437+
--num-train-steps 100 \
438+
--num-cores 52 \
439+
--num-inter-threads 1 \
440+
--num-intra-threads 52 \
441+
--batch-size=100 \
442+
--weight_decay=1e-4 \
425443
--num_warmup_batches=20 \
426-
--mpi_num_processes=1 --mpi_num_processes_per_socket=1 \
444+
--mpi_num_processes=1 \
445+
--mpi_num_processes_per_socket=1 \
427446
--docker-image intel/intel-optimized-tensorflow:2.3.0
428447
```
429448

benchmarks/object_detection/tensorflow/ssd-resnet34/training/common_model_init.py

+23-3
Original file line numberDiff line numberDiff line change
@@ -42,15 +42,13 @@ def __init__(self, args, custom_args, platform_util):
4242
config_file_path = os.path.join(os.path.dirname(os.path.realpath(__file__)), "config.json")
4343
self.set_kmp_vars(config_file_path)
4444

45-
self.set_num_inter_intra_threads()
46-
4745
# Train parameter parser
4846
parser = argparse.ArgumentParser(description="process custom_args")
4947
parser.add_argument('--weight_decay', type=float, default=5e-4)
5048
parser.add_argument('--num_warmup_batches', type=int, default=0)
5149
parser.add_argument('--num_train_steps', type=int, default=500, help='number of training batches')
5250
parser.add_argument('--num_inter_threads', type=int, default=1, help='number of inter-threads')
53-
parser.add_argument('--num_intra_threads', type=int, default=28, help='number of intra-threads')
51+
parser.add_argument('--num_intra_threads', type=int, default=-1, help='number of intra-threads')
5452
parser.add_argument('--epochs', dest="epochs", type=int, default=60,
5553
help='number of training epochs. Pass 0 to train based on number of train_steps instead of number of epochs') # noqa: E501
5654
parser.add_argument('--save_model_steps', dest="save_model_steps", type=int, default=10000,
@@ -59,6 +57,28 @@ def __init__(self, args, custom_args, platform_util):
5957

6058
self.args = parser.parse_args(self.custom_args, namespace=self.args)
6159

60+
# Calculate num cores and num intra threads, if the values weren't provided.
61+
# For a single instance run, use the number of logical cores per socket
62+
# for multi instance, use the number of logical cores per socket - 2
63+
# Note that most models use the number of physical cores for these values,
64+
# but this model performs better when using logical cores.
65+
if not self.args.num_cores or not self.args.num_intra_threads:
66+
num_logical_cores_per_socket = \
67+
platform_util.num_cores_per_socket * platform_util.num_threads_per_core
68+
69+
cores_to_use = num_logical_cores_per_socket \
70+
if not os.environ["MPI_NUM_PROCESSES"] or int(os.environ["MPI_NUM_PROCESSES"]) <= 1 else \
71+
num_logical_cores_per_socket - 2
72+
73+
if not self.args.num_cores or self.args.num_cores == -1:
74+
self.args.num_cores = cores_to_use
75+
76+
if not self.args.num_intra_threads or self.args.self.args.num_intra_threads == -1:
77+
self.args.num_intra_threads = cores_to_use
78+
79+
if not self.args.num_inter_threads:
80+
self.args.num_inter_threads = 1
81+
6282
omp_num_threads = platform_util.num_cores_per_socket
6383

6484
set_env_var("OMP_NUM_THREADS", omp_num_threads if self.args.num_cores == -1 else self.args.num_cores)

0 commit comments

Comments
 (0)