Simplify running commands (single node and multiple nodes)

anxiangsir · anxiangsir · commit 0cc88e2a2449 · 2021-03-21T15:07:33.000+08:00
1. Update training logs(glint360k)
2. Update install docs
3. Fix distributed training
diff --git a/recognition/arcface_torch/README.md b/recognition/arcface_torch/README.md
@@ -1,48 +1,34 @@
 # Arcface Pytorch (Distributed Version of ArcFace)
 
-
 ## Contents
 
 ## Set Up
 ```shell
 torch >= 1.6.0
-```
-
-## Train on a single node 
-If you want to use 8 GPU to train, you should set `--nproc_per_node=8` and set `CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 `  
-If you want to use 4 GPU to train, you should set `--nproc_per_node=4` and set `CUDA_VISIBLE_DEVICES=0,1,2,3`  
-If you want to use 1 GPU to train, you should set `--nproc_per_node=1` ...  
+```  
+For more details, see [install.md](docs/install.md) in docs.
 
+## Training
+### 1. Single node, 1 GPU:
 ```shell
-export OMP_NUM_THREADS=4
-export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 
-python -m torch.distributed.launch \ 
---nproc_per_node=8 --nnodes=1 \
---node_rank=0 --master_addr="127.0.0.1" \
---master_port=1234 train.py
-ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh
+python -m torch.distributed.launch --nproc_per_node=1 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ```
-
-## Train on multi-node
+### 2. Single node, 8 GPUs:
 ```shell
-pass
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ```
-
-## Evaluation
+### 3. Multiple nodes, each node 8 GPUs:  
+Node 0:  
+```shell
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=0 --master_addr="ip1" --master_port=1234 train.py
+```
+Node 1:  
 ```shell
-# model-prefix       your model path
-# image-path         your IJBC path
-# result-dir         your result path
-# network            your backbone
-CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \
---model-prefix ms1mv3_arcface_r50/backbone.pth \
---image-path IJB_release/IJBC \
---result-dir ms1mv3_arcface_r50 \
---batch-size 128 \
---job ms1mv3_arcface_r50 \
---target IJBC \
---network iresnet50
+python -m torch.distributed.launch --nproc_per_node=8 --nnodes=2 --node_rank=1 --master_addr="ip1" --master_port=1234 train.py
 ```
+
+
+## Evaluation IJBC
 More details see [eval.md](docs/eval.md) in docs.
 
 ## Speed Benchmark
@@ -89,14 +75,12 @@ All Model Can be found in here.
 ### Glint360k
 |   Datasets          | log   |backbone               | IJBC(1e-05) | IJBC(1e-04) |agedb30|cfp_fp|lfw  | 
 | :---:               | :---  |:---                   | :---        | :---        |:---   |:---  |:--- |
-| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100/training.log)         |r100                 | 96.19 | 97.39 | 98.52 | 99.26 | 99.83 |
-| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-sample-0.1 | 95.95 | 97.35 | 98.57 | 99.30 | 99.85 |
-| Glint360k-Cosface   | - | - | - | - | - | - | - |
-| Glint360k-Cosface   | - | - | - | - | - | - | - |
-| Glint360k-Cosface   | - | - | - | - | - | - | - |
-
-
-
+| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r18_fp16_0.1/training.log) |r18-fp16-0.1  | 93.16 | 95.33 | 97.72 | 97.73 | 99.77 |
+| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r34_fp16_0.1/training.log) |r34-fp16-0.1  | 95.16 | 96.56 | 98.33 | 98.78 | 99.82 |
+| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r50_fp16_0.1/training.log) |r50-fp16-0.1  | 95.61 | 96.97 | 98.38 | 99.20 | 99.83 |
+| Glint360k-Cosface   |[log](https://raw.githubusercontent.com/anxiangsir/insightface_arcface_log/master/glint360k_cosface_r100_fp16_0.1/training.log)|r100-fp16-0.1 | 95.88 | 97.32 | 98.48 | 99.29 | 99.82 |
+  
+The `0.1` suffix in the backbone name means the sample rate is 0.1.  
 
 More details see [eval.md](docs/modelzoo.md) in docs.
 
diff --git a/recognition/arcface_torch/config.py b/recognition/arcface_torch/config.py
@@ -37,6 +37,9 @@ def lr_step_func(epoch):
     config.lr_func = lr_step_func
 
 elif config.dataset == "glint360k":
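+    # To make training faster, the dataset path below lives on a tmpfs (RAM-backed) mount.
+    # Our machine has 256G of RAM; the mount was created with:
+    # mount -t tmpfs -o size=140G  tmpfs /train_tmp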
     config.rec = "/train_tmp/glint360k"
     config.num_classes = 360232
     config.num_image = 17091657
diff --git a/recognition/arcface_torch/docs/eval.md b/recognition/arcface_torch/docs/eval.md
@@ -1,15 +1,20 @@
 ## Eval IJBC
 
 ```shell
+# model-prefix       your model path
+# image-path         your IJBC path
+# result-dir         your result path
+# network            your backbone
 CUDA_VISIBLE_DEVICES=0,1 python eval_ijbc.py \
---model-prefix tmp_models/backbone.pth \
---image-path /data/anxiang/IJB_release/IJBC \
---result-dir result \
+--model-prefix ms1mv3_arcface_r50/backbone.pth \
+--image-path IJB_release/IJBC \
+--result-dir ms1mv3_arcface_r50 \
 --batch-size 128 \
---job cosface \
+--job ms1mv3_arcface_r50 \
 --target IJBC \
 --network iresnet50
 ```
 
 ## Eval MegaFace
+pass
 
diff --git a/recognition/arcface_torch/docs/install.md b/recognition/arcface_torch/docs/install.md
@@ -0,0 +1,36 @@
+## v1.7.1  
+### Linux and Windows  
+```shell
+# CUDA 11.0
+pip install torch==1.7.1+cu110 torchvision==0.8.2+cu110 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 10.2
+pip install torch==1.7.1 torchvision==0.8.2 torchaudio==0.7.2
+
+# CUDA 10.1
+pip install torch==1.7.1+cu101 torchvision==0.8.2+cu101 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 9.2
+pip install torch==1.7.1+cu92 torchvision==0.8.2+cu92 torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU only
+pip install torch==1.7.1+cpu torchvision==0.8.2+cpu torchaudio==0.7.2 -f https://download.pytorch.org/whl/torch_stable.html
+```
+
+
+## v1.6.0  
+
+### Linux and Windows
+```shell
+# CUDA 10.2
+pip install torch==1.6.0 torchvision==0.7.0
+
+# CUDA 10.1
+pip install torch==1.6.0+cu101 torchvision==0.7.0+cu101 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CUDA 9.2
+pip install torch==1.6.0+cu92 torchvision==0.7.0+cu92 -f https://download.pytorch.org/whl/torch_stable.html
+
+# CPU only
+pip install torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
+```
diff --git a/recognition/arcface_torch/run.sh b/recognition/arcface_torch/run.sh
@@ -1,4 +1,2 @@
-export OMP_NUM_THREADS=4
-CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 \
---node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
+CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 python -m torch.distributed.launch --nproc_per_node=8 --nnodes=1 --node_rank=0 --master_addr="127.0.0.1" --master_port=1234 train.py
 ps -ef | grep "train" | grep -v grep | awk '{print "kill -9 "$2}' | sh
diff --git a/recognition/arcface_torch/train.py b/recognition/arcface_torch/train.py
@@ -22,11 +22,13 @@
 
 
 def main(args):
-    dist.init_process_group(backend='nccl', init_method='env://')
+
+    world_size = int(os.environ['WORLD_SIZE'])
+    rank = int(os.environ['RANK'])
+    dist_url = "tcp://{}:{}".format(os.environ["MASTER_ADDR"], os.environ["MASTER_PORT"])
+    dist.init_process_group(backend='nccl', init_method=dist_url, rank=rank, world_size=world_size)
     local_rank = args.local_rank
     torch.cuda.set_device(local_rank)
-    rank = dist.get_rank()
-    world_size = dist.get_world_size()
 
     if not os.path.exists(cfg.output) and rank is 0:
         os.makedirs(cfg.output)
@@ -124,8 +126,8 @@ def main(args):
 if __name__ == "__main__":
     parser = argparse.ArgumentParser(description='PyTorch ArcFace Training')
     parser.add_argument('--local_rank', type=int, default=0, help='local_rank')
-    parser.add_argument('--network', type=str, default="iresnet50", help="backbone network")
-    parser.add_argument('--loss', type=str, default="ArcFace", help="loss function")
-    parser.add_argument('--resume', type=int, default=0, help="model resuming")
+    parser.add_argument('--network', type=str, default='iresnet50', help='backbone network')
+    parser.add_argument('--loss', type=str, default='ArcFace', help='loss function')
+    parser.add_argument('--resume', type=int, default=0, help='model resuming')
     args_ = parser.parse_args()
     main(args_)