
Commit 295dd2d

Authored by flybird11111, BurkeHulk, pre-commit-ci[bot], GuangyaoZhang, and ver217
[zerobubble] rebase main (#6075)
* fp8 operators for compressed communication cast_to_fp8, cast_from_fp8, all_reduce_fp8 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix typo * fix scaling algorithm in FP8 casting * support fp8 communication in pipeline parallelism * add fp8_communication flag in the script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * shardformer fp8 * fix rebase * remove all to all * fix shardformer fp8 communication training degradation * [fp8] support all-gather flat tensor (#5932) * [fp8] add fp8 comm for low level zero * [test] add zero fp8 test case * [Feature] llama shardformer fp8 support (#5938) * add llama shardformer fp8 * Llama Shardformer Parity * fix typo * fix all reduce * fix pytest failure * fix reduce op and move function to fp8.py * fix typo * [FP8] rebase main (#5963) * add SimPO * fix dataloader * remove debug code * add orpo * fix style * fix colossalai, transformers version * fix colossalai, transformers version * fix colossalai, transformers version * fix torch colossalai version * update transformers version * [shardformer] DeepseekMoE support (#5871) * [Feature] deepseek moe expert parallel implement * [misc] fix typo, remove redundant file (#5867) * [misc] fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Feature] deepseek support & unit test * [misc] remove debug code & useless print * [misc] fix typos (#5872) * [Feature] remove modeling file, use auto config. (#5884) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [Deepseek] remove redundant code (#5888) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [misc] remove redundant code * [Feature/deepseek] resolve comment. (#5889) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [misc] remove redundant code * [misc] mv module replacement into if branch * [misc] add some warning message and modify some code in unit test * [misc] fix typos --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap Co-authored-by: Edenzzzz <[email protected]> * [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838) * Diffusion Model Inference support * Stable Diffusion 3 Support * pixartalpha support * [HotFix] CI,import,requirements-test for #5838 (#5892) * [Hot Fix] CI,import,requirements-test --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Feature] Enable PP + SP for llama (#5868) * fix cross-PP-stage position id length diff bug * fix typo * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use a one cross entropy func for all shardformer models --------- Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897) * add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. 
Support lora with gradient checkpoint * fix style * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix eval * hotfix citation * [zero] support all-gather overlap (#5898) * [zero] support all-gather overlap * [zero] add overlap all-gather flag * [misc] fix typo * [zero] update api * fix orpo cross entropy loss * [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446) * Remove unnecessary calls to deepcopy * Build DimSpec's difference dict only once This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough. * Fix documentation of DimSpec's difference method * [ShardFormer] fix qwen2 sp (#5903) * [compatibility] support torch 2.2 (#5875) * Support Pytorch 2.2.2 * keep build_on_pr file and update .compatibility * fix object_to_tensor usage when torch>=2.3.0 (#5820) * [misc] support torch2.3 (#5893) * [misc] support torch2.3 * [devops] update compatibility ci * [devops] update compatibility ci * [devops] add debug * [devops] add debug * [devops] add debug * [devops] add debug * [devops] remove debug * [devops] remove debug * [release] update version (#5912) * [plugin] support all-gather overlap for hybrid parallel (#5919) * [plugin] fixed all-gather overlap support for hybrid parallel * add kto * fix style, add kto data sample * [Examples] Add lazy init to OPT and GPT examples (#5924) Co-authored-by: Edenzzzz <[email protected]> * [ColossalChat] Hotfix for ColossalChat (#5910) * add ignore and tiny llama * fix path issue * run style * fix issue * update bash * add ignore and tiny llama * fix path issue * run style * fix issue * update bash * fix ddp issue * add Qwen 1.5 32B * refactor tokenization * [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931) * cannot access local variable 'default_conversation' where it is not associated with a value set default value for 'default_conversation' * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix test data * refactor evaluation * remove real data path * remove real data path * Add n_fused as an input from native_module (#5894) * [FIX BUG] convert env param to int in (#5934) * [Hotfix] Fix ZeRO typo #5936 Co-authored-by: Edenzzzz <[email protected]> * [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941) * Add a switch to control whether the model checkpoint needs to be saved after each epoch ends * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix style * fix style * fix style * [shardformer] hotfix attn mask (#5945) * [shardformer] hotfix attn mask (#5947) * [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895) * Distrifusion Support source * comp comm overlap optimization * sd3 benchmark * pixart distrifusion bug fix * sd3 bug fix and benchmark * generation bug fix * naming fix * add docstring, fix counter and shape error * add reference * readme and requirement * [zero] hotfix update master params (#5951) * [release] update version (#5952) * [Chat] Fix lora (#5946) * fix merging * remove filepath * fix style * Update 
README.md (#5958) * [hotfix] Remove unused plan section (#5957) * remove readme * fix readme * update * [test] add mixtral for sequence classification * [test] add mixtral transformer test * [moe] fix plugin * [test] mixtra pp shard test * [chore] handle non member group * [zero] solve hang * [test] pass mixtral shardformer test * [moe] implement transit between non moe tp and ep * [zero] solve hang * [misc] solve booster hang by rename the variable * solve hang when parallel mode = pp + dp * [moe] implement submesh initialization * [moe] add mixtral dp grad scaling when not all experts are activated * [chore] manually revert unintended commit * [chore] trivial fix * [chore] arg pass & remove drop token * [test] add mixtral modelling test * [moe] implement tp * [moe] test deepseek * [moe] clean legacy code * [Feature] MoE Ulysses Support (#5918) * moe sp support * moe sp bug solve * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [chore] minor fix * [moe] init moe plugin comm setting with sp * moe sp + ep bug fix * [moe] finalize test (no pp) * [moe] full test for deepseek and mixtral (pp + sp to fix) * [chore] minor fix after rebase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [chore] solve moe ckpt test failure and some other arg pass failure * [moe] remove ops * [test] fix test: test_zero1_2 * [bug] fix: somehow logger hangs the program * [moe] deepseek moe sp support * [test] add check * [deepseek] replace attn (a workaround for bug in transformers) * [misc] skip redunant test * [misc] remove debug/print code * [moe] refactor mesh assignment * Revert "[moe] implement submesh initialization" This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582. 
* [chore] change moe_pg_mesh to private * [misc] remove incompatible test config * [misc] fix ci failure: change default value to false in moe plugin * [misc] remove useless condition * [chore] docstring * [moe] remove force_overlap_comm flag and add warning instead * [doc] add MoeHybridParallelPlugin docstring * [moe] solve dp axis issue * [chore] remove redundant test case, print string & reduce test tokens * [feat] Dist Loader for Eval (#5950) * support auto distributed data loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * support auto distributed data loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix tp error * remove unused parameters * remove unused * update inference * update docs * update inference --------- Co-authored-by: Michelle <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [lora] lora support hybrid parallel plugin (#5956) * lora support hybrid plugin * fix * fix * fix * fix * fp8 operators for compressed communication cast_to_fp8, cast_from_fp8, all_reduce_fp8 * fix scaling algorithm in FP8 casting * support fp8 communication in pipeline parallelism * add fp8_communication flag in the script * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * shardformer fp8 * fix rebase * remove all to all * fix shardformer fp8 communication training degradation * [fp8] support all-gather flat tensor (#5932) * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * Update low_level_optim.py --------- Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Haze188 <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Runyu Lu <[email protected]> Co-authored-by: Guangyao Zhang <[email protected]> Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Hongxin Liu <[email protected]> Co-authored-by: Stephan Kö <[email protected]> Co-authored-by: アマデウス <[email protected]> Co-authored-by: Tong Li <[email protected]> Co-authored-by: zhurunhua <[email protected]> Co-authored-by: Insu Jang <[email protected]> Co-authored-by: Gao, Ruiyuan <[email protected]> Co-authored-by: hxwang <[email protected]> Co-authored-by: Michelle <[email protected]> Co-authored-by: Wang Binluo <[email protected]> Co-authored-by: HangXu <[email protected]> * [fp8]support all2all fp8 (#5953) * support all2all fp8 * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [fp8] add fp8 linear (#5967) * [fp8] add fp8 linear * [test] fix fp8 linear test condition * [test] fix fp8 linear test condition * [test] fix fp8 linear test condition * [fp8] support fp8 amp for hybrid parallel plugin (#5975) * [fp8] support fp8 amp for hybrid parallel plugin * [test] add fp8 hook test * [fp8] fix fp8 linear compatibility * fix (#5976) * [Feature]: support FP8 communication in DDP, FSDP, Gemini 
(#5928) * support fp8_communication in the Torch DDP grad comm, FSDP grad comm, and FSDP params comm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * implement communication hook for FSDP params all-gather * added unit test for fp8 operators * support fp8 communication in GeminiPlugin * update training scripts to support fsdp and fp8 communication * fixed some minor bugs observed in unit test * add all_gather_into_tensor_flat_fp8 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add skip the test if torch < 2.2.0 * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add skip the test if torch < 2.2.0 * add skip the test if torch < 2.2.0 * add fp8_comm flag * rebase latest fp8 operators * rebase latest fp8 operators * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [test ci]Feature/fp8 comm (#5981) * fix * fix * fix * [fp8] support gemini plugin (#5978) * [fp8] refactor hook * [fp8] support gemini plugin * [example] add fp8 option for llama benchmark * [fp8] use torch compile (torch >= 2.3.0) (#5979) * [fp8] use torch compile (torch >= 2.4.0) * [fp8] set use_fast_accum in linear * [chore] formal version check * [chore] fix sig * [fp8]Moe support fp8 communication (#5977) * fix * support moe fp8 * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix fix fi * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [fp8] support hybrid parallel plugin (#5982) * support fp8 comm for qwen2 model * support fp8 comm for qwen2 model * support fp8 comm for qwen2 model * fp8 * fix * bert and bloom * chatglm and command * gpt2,gptj,bert, falcon,blip2 * mistral,opy,sam,t5,vit,whisper * fix * fix * fix * [fp8] refactor fp8 linear with compile (#5993) * [fp8] refactor fp8 linear with compile * [fp8] fix linear test * [fp8] fix linear test * [fp8] support asynchronous FP8 communication (#5997) * fix * fix * fix * support async all2all * support async op for all gather * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [fp8] update torch.compile for linear_fp8 to >= 2.4.0 (#6004) * [fp8] linear perf enhancement * [fp8]update reduce-scatter test (#6002) * fix * fix * fix * fix * [fp8] add use_fp8 option for MoeHybridParallelPlugin (#6009) * [fp8] zero support fp8 linear. 
(#6006) * fix * fix * fix * zero fp8 * zero fp8 * Update requirements.txt * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the merge * fix the merge * fix the merge * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the merge * fix * fix * fix the merge * fix * fix * fix * fix * fix * fix the merge * fix * fix * fix * fix * [fp8] Merge feature/fp8_comm to main branch of Colossalai (#6016) * add SimPO * fix dataloader * remove debug code * add orpo * fix style * fix colossalai, transformers version * fix colossalai, transformers version * fix colossalai, transformers version * fix torch colossalai version * update transformers version * [shardformer] DeepseekMoE support (#5871) * [Feature] deepseek moe expert parallel implement * [misc] fix typo, remove redundant file (#5867) * [misc] fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Feature] deepseek support & unit test * [misc] remove debug code & useless print * [misc] fix typos (#5872) * [Feature] remove modeling file, use auto config. (#5884) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [Deepseek] remove redundant code (#5888) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [misc] remove redundant code * [Feature/deepseek] resolve comment. (#5889) * [misc] fix typos * [Feature] deepseek support via auto model, remove modeling file * [misc] delete useless file * [misc] fix typos * [misc] remove redundant code * [misc] mv module replacement into if branch * [misc] add some warning message and modify some code in unit test * [misc] fix typos --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Hoxfix] Fix CUDA_DEVICE_MAX_CONNECTIONS for comm overlap Co-authored-by: Edenzzzz <[email protected]> * [Feat] Diffusion Model(PixArtAlpha/StableDiffusion3) Support (#5838) * Diffusion Model Inference support * Stable Diffusion 3 Support * pixartalpha support * [HotFix] CI,import,requirements-test for #5838 (#5892) * [Hot Fix] CI,import,requirements-test --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Feature] Enable PP + SP for llama (#5868) * fix cross-PP-stage position id length diff bug * fix typo * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * use a one cross entropy func for all shardformer models --------- Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [ShardFormer] Add Ulysses Sequence Parallelism support for Command-R, Qwen2 and ChatGLM (#5897) * add benchmark for sft, dpo, simpo, orpo. Add benchmarking result. 
Support lora with gradient checkpoint * fix style * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix eval * hotfix citation * [zero] support all-gather overlap (#5898) * [zero] support all-gather overlap * [zero] add overlap all-gather flag * [misc] fix typo * [zero] update api * fix orpo cross entropy loss * [Auto Parallel]: Speed up intra-op plan generation by 44% (#5446) * Remove unnecessary calls to deepcopy * Build DimSpec's difference dict only once This change considerably speeds up construction speed of DimSpec objects. The difference_dict is the same for each DimSpec object, so a single copy of it is enough. * Fix documentation of DimSpec's difference method * [ShardFormer] fix qwen2 sp (#5903) * [compatibility] support torch 2.2 (#5875) * Support Pytorch 2.2.2 * keep build_on_pr file and update .compatibility * fix object_to_tensor usage when torch>=2.3.0 (#5820) * [misc] support torch2.3 (#5893) * [misc] support torch2.3 * [devops] update compatibility ci * [devops] update compatibility ci * [devops] add debug * [devops] add debug * [devops] add debug * [devops] add debug * [devops] remove debug * [devops] remove debug * [release] update version (#5912) * [plugin] support all-gather overlap for hybrid parallel (#5919) * [plugin] fixed all-gather overlap support for hybrid parallel * add kto * fix style, add kto data sample * [Examples] Add lazy init to OPT and GPT examples (#5924) Co-authored-by: Edenzzzz <[email protected]> * [ColossalChat] Hotfix for ColossalChat (#5910) * add ignore and tiny llama * fix path issue * run style * fix issue * update bash * add ignore and tiny llama * fix path issue * run style * fix issue * update bash * fix ddp issue * add Qwen 1.5 32B * refactor tokenization * [FIX BUG] UnboundLocalError: cannot access local variable 'default_conversation' where it is not associated with a value (#5931) * cannot access local variable 'default_conversation' where it is not associated with a value set default value for 'default_conversation' * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix test data * refactor evaluation * remove real data path * remove real data path * Add n_fused as an input from native_module (#5894) * [FIX BUG] convert env param to int in (#5934) * [Hotfix] Fix ZeRO typo #5936 Co-authored-by: Edenzzzz <[email protected]> * [Feature] Add a switch to control whether the model checkpoint needs to be saved after each epoch ends (#5941) * Add a switch to control whether the model checkpoint needs to be saved after each epoch ends * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix style * fix style * fix style * [shardformer] hotfix attn mask (#5945) * [shardformer] hotfix attn mask (#5947) * [Feat] Distrifusion Acceleration Support for Diffusion Inference (#5895) * Distrifusion Support source * comp comm overlap optimization * sd3 benchmark * pixart distrifusion bug fix * sd3 bug fix and benchmark * generation bug fix * naming fix * add docstring, fix counter and shape error * add reference * readme and requirement * [zero] hotfix update master params (#5951) * [release] update version (#5952) * [Chat] Fix lora (#5946) * fix merging * remove filepath * fix style * Update 
README.md (#5958) * [hotfix] Remove unused plan section (#5957) * remove readme * fix readme * update * [test] add mixtral for sequence classification * [test] add mixtral transformer test * [moe] fix plugin * [test] mixtra pp shard test * [chore] handle non member group * [zero] solve hang * [test] pass mixtral shardformer test * [moe] implement transit between non moe tp and ep * [zero] solve hang * [misc] solve booster hang by rename the variable * solve hang when parallel mode = pp + dp * [moe] implement submesh initialization * [moe] add mixtral dp grad scaling when not all experts are activated * [chore] manually revert unintended commit * [chore] trivial fix * [chore] arg pass & remove drop token * [test] add mixtral modelling test * [moe] implement tp * [moe] test deepseek * [moe] clean legacy code * [Feature] MoE Ulysses Support (#5918) * moe sp support * moe sp bug solve * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [chore] minor fix * [moe] init moe plugin comm setting with sp * moe sp + ep bug fix * [moe] finalize test (no pp) * [moe] full test for deepseek and mixtral (pp + sp to fix) * [chore] minor fix after rebase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [chore] solve moe ckpt test failure and some other arg pass failure * [moe] remove ops * [test] fix test: test_zero1_2 * [bug] fix: somehow logger hangs the program * [moe] deepseek moe sp support * [test] add check * [deepseek] replace attn (a workaround for bug in transformers) * [misc] skip redunant test * [misc] remove debug/print code * [moe] refactor mesh assignment * Revert "[moe] implement submesh initialization" This reverts commit 2f9bce6686d1415a83d5726dc5ff02222c742582. 
* [chore] change moe_pg_mesh to private * [misc] remove incompatible test config * [misc] fix ci failure: change default value to false in moe plugin * [misc] remove useless condition * [chore] docstring * [moe] remove force_overlap_comm flag and add warning instead * [doc] add MoeHybridParallelPlugin docstring * [moe] solve dp axis issue * [chore] remove redundant test case, print string & reduce test tokens * [feat] Dist Loader for Eval (#5950) * support auto distributed data loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * support auto distributed data loader * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix tp error * remove unused parameters * remove unused * update inference * update docs * update inference --------- Co-authored-by: Michelle <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [lora] lora support hybrid parallel plugin (#5956) * lora support hybrid plugin * fix * fix * fix * fix * Support overall loss, update KTO logging * [Docs] clarify launch port Co-authored-by: Edenzzzz <[email protected]> * [Hotfix] README link (#5966) * update ignore * update readme * run style * update readme * [Hotfix] Avoid fused RMSnorm import error without apex (#5985) Co-authored-by: Edenzzzz <[email protected]> * [Chat] fix readme (#5989) * fix readme * fix readme, tokenization fully tested * fix readme, tokenization fully tested * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix sync condition (#6000) * [plugin] add cast inputs option for zero (#6003) * [pre-commit.ci] pre-commit autoupdate (#5995) updates: - [github.com/psf/black-pre-commit-mirror: 24.4.2 → 24.8.0](https://github.com/psf/black-pre-commit-mirror/compare/24.4.2...24.8.0) Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [misc] Bypass the huggingface bug to solve the mask mismatch problem (#5991) * [Feature] Zigzag Ring attention (#5905) * halfway * fix cross-PP-stage position id length diff bug * fix typo * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unified cross entropy func for all shardformer models * remove redundant lines * add basic ring attn; debug cross entropy * fwd bwd logic complete * fwd bwd logic complete; add experimental triton rescale * precision tests passed * precision tests passed * fix typos and remove misc files * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add sp_mode to benchmark; fix varlen interface * update softmax_lse shape by new interface * change tester name * remove buffer clone; support packed seq layout * add varlen tests * fix typo * all tests passed * add dkv_group; fix mask * remove debug statements --------- Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [misc] update compatibility (#6008) * [misc] update compatibility * [misc] update requirements * [devops] disable requirements cache * [test] fix torch ddp test * [test] fix rerun on address in 
use * [test] fix lazy init * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the merge * fix the merge * overlap kv comm with output rescale (#6017) Co-authored-by: Edenzzzz <[email protected]> * fix the merge * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the merge * fix * fix * fix the merge * fix * [misc] Use dist logger in plugins (#6011) * use dist logger in plugins * remove trash * print on rank 0 --------- Co-authored-by: Edenzzzz <[email protected]> * fix * fix * fix * fix * fix the merge * fix * fix * fix * fix --------- Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Haze188 <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Runyu Lu <[email protected]> Co-authored-by: Guangyao Zhang <[email protected]> Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Hongxin Liu <[email protected]> Co-authored-by: Stephan Kö <[email protected]> Co-authored-by: アマデウス <[email protected]> Co-authored-by: Tong Li <[email protected]> Co-authored-by: zhurunhua <[email protected]> Co-authored-by: Insu Jang <[email protected]> Co-authored-by: Gao, Ruiyuan <[email protected]> Co-authored-by: hxwang <[email protected]> Co-authored-by: Michelle <[email protected]> Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local> * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update train_dpo.py * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update low_level_zero_plugin.py * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * [CI] Remove triton version for compatibility bug; update req torch >=2.2 (#6018) * remove triton version * remove torch 2.2 * remove torch 2.1 * debug * remove 2.1 build tests * require torch >=2.2 --------- Co-authored-by: Edenzzzz <[email protected]> * [plugin] hotfix zero plugin (#6036) * [plugin] hotfix zero plugin * [plugin] hotfix zero plugin * [Colossal-LLaMA] Refactor latest APIs (#6030) * refactor latest code * update api * add dummy dataset * update Readme * add setup * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update files * add PP support * update arguments * update argument * reorg folder * update version * remove IB infor * update utils * update readme * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update save for zero * update save * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * add apex * update --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * add fused norm (#6038) * [FP8] unsqueeze scale to make it compatible with torch.compile (#6040) * [colossalai/checkpoint_io/...] 
fix bug in load_state_dict_into_model; format error msg (#6020) * fix bug in load_state_dict_into_model; format error msg * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update utils.py to support checking missing_keys * Update general_checkpoint_io.py fix bug in missing_keys error message * retrigger tests --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [Hotfix] Remove deprecated install (#6042) * remove deprecated install * remove unused folder * [fp8] optimize all-gather (#6043) * [fp8] optimize all-gather * [fp8] fix all gather fp8 ring * [fp8] enable compile * [fp8] fix all gather fp8 ring * [fp8] fix linear hook (#6046) * [fp8] disable all_to_all_fp8 in intranode (#6045) * enhance all_to_all_fp8 with internode comm control * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * disable some fp8 ops due to performance issue * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [release] update version (#6041) * [release] update version * [devops] update comp test * [devops] update comp test debug * [devops] debug comp test * [devops] debug comp test * [devops] debug comp test * [devops] debug comp test * [devops] debug comp test * [Feature] Split cross-entropy computation in SP (#5959) * halfway * fix cross-PP-stage position id length diff bug * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unified cross entropy func for all shardformer models * remove redundant lines * add basic ring attn; debug cross entropy * fwd bwd logic complete * fwd bwd logic complete; add experimental triton rescale * precision tests passed * precision tests passed * fix typos and remove misc files * update softmax_lse shape by new interface * change tester name * remove buffer clone; support packed seq layout * add varlen tests * fix typo * all tests passed * add dkv_group; fix mask * remove debug statements * adapt chatglm, command-R, qwen * debug * halfway * fix cross-PP-stage position id length diff bug * fix typo * fix typo * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * unified cross entropy func for all shardformer models * remove redundant lines * add basic ring attn; debug cross entropy * fwd bwd logic complete * fwd bwd logic complete; add experimental triton rescale * precision tests passed * precision tests passed * fix typos and remove misc files * add sp_mode to benchmark; fix varlen interface * update softmax_lse shape by new interface * add varlen tests * fix typo * all tests passed * add dkv_group; fix mask * remove debug statements * add comments * q1 index only once * remove events to simplify stream sync * simplify forward/backward logic * 2d ring forward passed * 2d ring backward passed * fixes * fix ring attn loss * 2D ring backward + llama passed * merge * update logger * fix typo * rebase * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix typo * remove typos * fixes * support GPT --------- Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [hotfix] moe hybrid parallelism benchmark & follow-up fix (#6048) * [example] 
pass use_fp8_comm flag to all plugins * [example] add mixtral benchmark * [moe] refine assertion and check * [moe] fix mixtral & add more tests * [moe] consider checking dp * sp group and moe_dp_group * [mixtral] remove gate tp & add more tests * [deepseek] fix tp & sp for deepseek * [mixtral] minor fix * [deepseek] add deepseek benchmark * [fp8] hotfix backward hook (#6053) * [fp8] hotfix backward hook * [fp8] hotfix pipeline loss accumulation * [doc] update sp doc (#6055) * update sp doc * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * fix the sp * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix the attn * fix * fix * fix * fix * [zerobubble]Support ZeroBubble Pipeline (#6034) * [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble; * [feat] add dw test; * [fix] fix weight not close; * [update] update text; * [feat] add test run_fwd_bwd automatic scheduling; * [feat] split communication and calculation; fix pop empty send_bwd_buffer error; * [feat] add test for p & p grad; * [feat] add comments for ZBV func; * [fix] rm useless assign and comments; * [fix] fix ci test; add pytest; * [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass; * [feat] add apply v_schedule graph; p & p.grad assert err exist; * [fix] update * [feat] fix ci; add assert; * [feat] fix poc format * [feat] fix func name & ci; add comments; * [fix] fix poc test; add comments in poc; * [feat] add optim backward_b_by_grad * [feat] fix optimizer bwd b & w; support return accum loss & output * [feat] add fwd_bwd_step, run_fwd_only; * [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict; * [fix] fix communication_map; * [feat] update test; rm comments; * [fix] rm zbv in hybridplugin * [fix] fix optim bwd; * [fix] fix optim bwd; * [fix] rm output.data after send fwd; * [fix] fix bwd step if condition; remove useless comments and format info; * [fix] fix detach output & release output; * [fix] rm requir_grad for output; * [fix] fix requir grad position and detach position and input&output local buffer append position; * [feat] add memory assertation; * [fix] fix mem check; * [fix] mem assertation' * [fix] fix mem assertation * [fix] fix mem; use a new model shape; only assert mem less and equal than theo; * [fix] fix model zoo import; * [fix] fix redundant detach & clone; add buffer assertation in the end; * [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap; * [fix] update optim state dict assert (include param group & state); fix mem assert after add optim; * [fix] add testcase with microbatch 4; * [fp8] fix missing fp8_comm flag in mixtral (#6057) * fix * fix * fix * [fp8] Disable all_gather intranode. 
Disable Redundant all_gather fp8 (#6059) * all_gather only internode, fix pytest * fix cuda arch <89 compile pytest error * fix pytest failure * disable all_gather_into_tensor_flat_fp8 * fix fp8 format * fix pytest * fix conversations * fix chunk tuple to list * [doc] FP8 training and communication document (#6050) * Add FP8 training and communication document * add fp8 docstring for plugins * fix typo * fix typo * fix * fix * [moe] add parallel strategy for shared_expert && fix test for deepseek (#6063) * [ColossalEval] support for vllm (#6056) * support vllm * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * modify vllm and update readme * run pre-commit * remove dupilicated lines and refine code * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * update param name * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * refine code * update readme * refine code * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> * [release] update version (#6062) * [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble; * [update] update text; * [feat] add test run_fwd_bwd automatic scheduling; * [feat] fix poc format * [fix] fix poc test; add comments in poc; * [feat] add optim backward_b_by_grad * [feat] fix optimizer bwd b & w; support return accum loss & output * [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict; * [feat] update test; rm comments; * [fix] rm zbv in hybridplugin * [fix] fix optim bwd; * [fix] fix optim bwd; * [fix] rm output.data after send fwd; * [fix] fix bwd step if condition; remove useless comments and format info; * [fix] fix mem check; * [fix] fix mem assertation * [fix] fix mem; use a new model shape; only assert mem less and equal than theo; * [fix] fix model zoo import; * [feat] moehybrid support zerobubble; * [fix] fix zerobubble pp for shardformer type input; * [fix] fix require_grad & deallocate call; * [fix] fix mem assert; * [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs' * [fix] fix pipeline util func deallocate --> release_tensor_data; fix bwd_b loss bwd branch; * [fix] fix zerobubble; support shardformer model type; * [fix] fix test_pipeline_utils ci; * [plugin] hybrid support zero bubble pipeline (#6060) * hybrid support zbv * fix fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * Update zero_bubble_pp.py * fix * fix-ci * fix [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix * [zerobubble]Support ZeroBubble Pipeline (#6034) * [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble; * [feat] add dw test; * [fix] fix 
weight not close; * [update] update text; * [feat] add test run_fwd_bwd automatic scheduling; * [feat] split communication and calculation; fix pop empty send_bwd_buffer error; * [feat] add test for p & p grad; * [feat] add comments for ZBV func; * [fix] rm useless assign and comments; * [fix] fix ci test; add pytest; * [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass; * [feat] add apply v_schedule graph; p & p.grad assert err exist; * [fix] update * [feat] fix ci; add assert; * [feat] fix poc format * [feat] fix func name & ci; add comments; * [fix] fix poc test; add comments in poc; * [feat] add optim backward_b_by_grad * [feat] fix optimizer bwd b & w; support return accum loss & output * [feat] add fwd_bwd_step, run_fwd_only; * [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict; * [fix] fix communication_map; * [feat] update test; rm comments; * [fix] rm zbv in hybridplugin * [fix] fix optim bwd; * [fix] fix optim bwd; * [fix] rm output.data after send fwd; * [fix] fix bwd step if condition; remove useless comments and format info; * [fix] fix detach output & release output; * [fix] rm requir_grad for output; * [fix] fix requir grad position and detach position and input&output local buffer append position; * [feat] add memory assertation; * [fix] fix mem check; * [fix] mem assertation' * [fix] fix mem assertation * [fix] fix mem; use a new model shape; only assert mem less and equal than theo; * [fix] fix model zoo import; * [fix] fix redundant detach & clone; add buffer assertation in the end; * [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap; * [fix] update optim state dict assert (include param group & state); fix mem assert after add optim; * [fix] add testcase with microbatch 4; * hybrid support zbv * fix fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update zero_bubble_pp.py * fix * fix-ci * fix [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix * fix * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: duanjunwen <[email protected]> * [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble; * [update] update text; * [feat] add test run_fwd_bwd automatic scheduling; * [feat] fix poc format * [fix] fix poc test; add comments in poc; * [feat] add optim backward_b_by_grad * [feat] fix optimizer bwd b & w; support return accum loss & output * [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict; * [feat] update test; rm comments; * [fix] fix optim bwd; * [fix] fix optim bwd; * [fix] rm output.data after send fwd; * [fix] fix bwd step if condition; remove useless comments and format info; * [fix] fix mem check; * 
[fix] fix mem assertation * [fix] fix mem; use a new model shape; only assert mem less and equal than theo; * [fix] fix model zoo import; * [fix] fix mem assert; * [fix] fix fwd branch, fwd pass both micro_batch & internal_inputs' * [plugin] hybrid support zero bubble pipeline (#6060) * hybrid support zbv * fix fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * Update zero_bubble_pp.py * fix * fix-ci * fix [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix * [zerobubble]Support ZeroBubble Pipeline (#6034) * [feat] add zerobubble pp (just a frame now); add POC test for dx_dw; add test for zerobubble; * [feat] add dw test; * [fix] fix weight not close; * [update] update text; * [feat] add test run_fwd_bwd automatic scheduling; * [feat] split communication and calculation; fix pop empty send_bwd_buffer error; * [feat] add test for p & p grad; * [feat] add comments for ZBV func; * [fix] rm useless assign and comments; * [fix] fix ci test; add pytest; * [feat] add run_fwd_bwd_with_microbatch (replace input) & test; add p&p.grad assert close test & all pass; * [feat] add apply v_schedule graph; p & p.grad assert err exist; * [fix] update * [feat] fix ci; add assert; * [feat] fix poc format * [feat] fix func name & ci; add comments; * [fix] fix poc test; add comments in poc; * [feat] add optim backward_b_by_grad * [feat] fix optimizer bwd b & w; support return accum loss & output * [feat] add fwd_bwd_step, run_fwd_only; * [fix] fix optim bwd; add license for v_schedule; remove redundant attributes; fix schedule loop "while"--> "for"; add communication dict; * [fix] fix communication_map; * [feat] update test; rm comments; * [fix] rm zbv in hybridplugin * [fix] fix optim bwd; * [fix] fix optim bwd; * [fix] rm output.data after send fwd; * [fix] fix bwd step if condition; remove useless comments and format info; * [fix] fix detach output & release output; * [fix] rm requir_grad for output; * [fix] fix requir grad position and detach position and input&output local buffer append position; * [feat] add memory assertation; * [fix] fix mem check; * [fix] mem assertation' * [fix] fix mem assertation * [fix] fix mem; use a new model shape; only assert mem less and equal than theo; * [fix] fix model zoo import; * [fix] fix redundant detach & clone; add buffer assertation in the end; * [fix] add output_obj_grad assert None at bwd b step; replace input_obj.require_grad_ with treemap; * [fix] update optim state dict assert (include param group & state); fix mem assert after add optim; * [fix] add testcase with microbatch 4; * hybrid support zbv * fix fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * Update zero_bubble_pp.py * fix * fix-ci * fix [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci fix * fix * [pre-commit.ci] 
auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix * fix * fix * fix * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci * fix * fix * fix * fix --------- Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: duanjunwen <[email protected]> * [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --------- Co-authored-by: HangXu <[email protected]> Co-authored-by: pre-commit-ci[bot] <66853113+pre-commit-ci[bot]@users.noreply.github.com> Co-authored-by: GuangyaoZhang <[email protected]> Co-authored-by: Hongxin Liu <[email protected]> Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Haze188 <[email protected]> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Edenzzzz <[email protected]> Co-authored-by: Runyu Lu <[email protected]> Co-authored-by: YeAnbang <[email protected]> Co-authored-by: Stephan Kö <[email protected]> Co-authored-by: アマデウス <[email protected]> Co-authored-by: Tong Li <[email protected]> Co-authored-by: zhurunhua <[email protected]> Co-authored-by: Insu Jang <[email protected]> Co-authored-by: Gao, Ruiyuan <[email protected]> Co-authored-by: hxwang <[email protected]> Co-authored-by: Michelle <[email protected]> Co-authored-by: Wang Binluo <[email protected]> Co-authored-by: wangbluo <[email protected]> Co-authored-by: root <root@notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9-0.notebook-8f919155-6035-47b4-9c6f-1be133b9e2c9.colossal-ai.svc.cluster.local> Co-authored-by: duanjunwen <[email protected]> Co-authored-by: Camille Zhong <[email protected]>
1 parent: af6aa9e · commit: 295dd2d

172 files changed: +5780 / -2130 lines


.compatibility  (-1)

@@ -1,4 +1,3 @@
-2.1.0-12.1.0
 2.2.2-12.1.0
 2.3.0-12.1.0
 2.4.0-12.4.1
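
For orientation: each entry in `.compatibility` is a `<torch>-<cuda>` version pair, and the matching `hpcaitech/pytorch-cuda:<torch>-<cuda>` image tags appear in the workflow diffs below. A minimal sketch of that mapping (the loop and the optional `docker pull` are illustrative, not the repository's actual CI logic):

```bash
# Illustrative sketch: map each "<torch>-<cuda>" line in .compatibility to the
# corresponding hpcaitech/pytorch-cuda image tag referenced by the workflows below.
while IFS= read -r combo; do
  [ -z "$combo" ] && continue                # skip blank lines
  image="hpcaitech/pytorch-cuda:${combo}"
  echo "compatibility target -> ${image}"
  # docker pull "${image}"                   # uncomment to prefetch the image locally
done < .compatibility
```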

.github/workflows/build_on_pr.yml  (+1 / -1)

@@ -89,7 +89,7 @@ jobs:
 if: needs.detect.outputs.anyLibraryFileChanged == 'true'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /dev/shm -v /data/scratch:/data/scratch
 timeout-minutes: 90
 defaults:

.github/workflows/build_on_schedule.yml  (+1 / -1)

@@ -12,7 +12,7 @@ jobs:
 if: github.repository == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /dev/shm -v /data/scratch/:/data/scratch/
 timeout-minutes: 90
 steps:

.github/workflows/compatiblity_test_on_dispatch.yml  (+1 / -1)

@@ -64,7 +64,7 @@ jobs:

 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .
 pip install --no-cache-dir -r requirements/requirements-test.txt

 - name: Install tensornvme
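
The same `BUILD_EXT=1 pip install -v -e .` change recurs in the workflows below. A rough local equivalent of the updated step, assuming a fresh clone (the clone URL is inferred, not part of the diff):

```bash
# Rough local equivalent of the updated "Install Colossal-AI" CI step.
# BUILD_EXT=1 asks the setup script to compile the C++/CUDA kernels at install time,
# and -e installs the checkout in editable mode so the working tree is used in place.
git clone https://github.com/hpcaitech/ColossalAI.git   # assumed clone URL
cd ColossalAI
BUILD_EXT=1 pip install -v -e .
pip install --no-cache-dir -r requirements/requirements-test.txt
```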

.github/workflows/compatiblity_test_on_pr.yml  (+1 / -1)

@@ -58,7 +58,7 @@ jobs:

 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .
 pip install --no-cache-dir -r requirements/requirements-test.txt

 - name: Install tensornvme

.github/workflows/compatiblity_test_on_schedule.yml  (+1 / -1)

@@ -52,7 +52,7 @@ jobs:

 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .
 pip install --no-cache-dir -r requirements/requirements-test.txt

 - name: Install tensornvme

.github/workflows/cuda_ext_check_before_merge.yml  (+1 / -1)

@@ -51,4 +51,4 @@ jobs:

 - name: Build
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .

.github/workflows/doc_test_on_pr.yml  (+2 / -2)

@@ -56,7 +56,7 @@ jobs:
 needs: detect-changed-doc
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm
 timeout-minutes: 30
 defaults:
@@ -89,7 +89,7 @@ jobs:
 - name: Install ColossalAI
 run: |
 source activate pytorch
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .

 - name: Test the Doc
 run: |

.github/workflows/doc_test_on_schedule.yml  (+2 / -2)

@@ -12,7 +12,7 @@ jobs:
 name: Test the changed Doc
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm
 timeout-minutes: 60
 steps:
@@ -32,7 +32,7 @@

 - name: Install ColossalAI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .

 - name: Install Doc Test Requirements
 run: |

.github/workflows/example_check_on_dispatch.yml  (+2 / -2)

@@ -45,15 +45,15 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.manual_check_matrix_preparation.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
 timeout-minutes: 15
 steps:
 - name: 📚 Checkout
 uses: actions/checkout@v3
 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .
 - name: Test the example
 run: |
 dir=${{ matrix.directory }}

.github/workflows/example_check_on_pr.yml  (+3 / -2)

@@ -9,6 +9,7 @@ on:
 paths:
 - "examples/**"
 - "!examples/**.md"
+- ".github/workflows/example_check_on_pr.yml"

 jobs:
 # This is for changed example files detect and output a matrix containing all the corresponding directory name.
@@ -89,7 +90,7 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.detect-changed-example.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
 timeout-minutes: 30
 concurrency:
@@ -107,7 +108,7 @@

 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .

 - name: Store Colossal-AI Cache
 run: |

.github/workflows/example_check_on_schedule.yml  (+2 / -2)

@@ -34,7 +34,7 @@ jobs:
 fail-fast: false
 matrix: ${{fromJson(needs.matrix_preparation.outputs.matrix)}}
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/ -v /dev/shm
 timeout-minutes: 30
 steps:
@@ -43,7 +43,7 @@

 - name: Install Colossal-AI
 run: |
-BUILD_EXT=1 pip install -v .
+BUILD_EXT=1 pip install -v -e .

 - name: Traverse all files
 run: |

.github/workflows/run_chatgpt_examples.yml  (+1 / -1)

@@ -19,7 +19,7 @@ jobs:
 github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
 runs-on: [self-hosted, gpu]
 container:
-image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
 options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data --shm-size=10.24gb
 timeout-minutes: 60
 defaults:

.github/workflows/run_chatgpt_unit_tests.yml (+1 -1)
@@ -19,7 +19,7 @@ jobs:
  github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
  runs-on: [self-hosted, gpu]
  container:
- image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+ image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
  options: --gpus all --rm -v /data/scratch/examples-data:/data/scratch/examples-data
  timeout-minutes: 30
  defaults:

.github/workflows/run_colossalqa_unit_tests.yml (+1 -1)
@@ -19,7 +19,7 @@ jobs:
  github.event.pull_request.base.repo.full_name == 'hpcaitech/ColossalAI'
  runs-on: [self-hosted, gpu]
  container:
- image: hpcaitech/pytorch-cuda:2.1.0-12.1.0
+ image: hpcaitech/pytorch-cuda:2.2.2-12.1.0
  volumes:
  - /data/scratch/test_data_colossalqa:/data/scratch/test_data_colossalqa
  - /data/scratch/llama-tiny:/data/scratch/llama-tiny

README.md (+1 -1)
@@ -420,7 +420,7 @@ Please visit our [documentation](https://www.colossalai.org/) and [examples](htt
  ## Installation

  Requirements:
- - PyTorch >= 2.1
+ - PyTorch >= 2.2
  - Python >= 3.7
  - CUDA >= 11.0
  - [NVIDIA GPU Compute Capability](https://developer.nvidia.com/cuda-gpus) >= 7.0 (V100/RTX20 and higher)

applications/Colossal-LLaMA/README.md (+20 -16)
@@ -30,7 +30,7 @@ Colossal-LLaMA
  - [Install](#install)
  - [0. Pre-requisite](#0-pre-requisite)
  - [1. Install required packages](#1-install-required-packages)
- - [2. Install `xentropy`, `layer_norm` and `rotary`](#2-install-xentropy-layer_norm-and-rotary)
+ - [2. Install Apex](#2-install-apex)
  - [How to run](#how-to-run)
  - [1. Init Tokenizer Preparation](#1-init-tokenizer-preparation)
  - [2. Init Model Preparation](#2-init-model-preparation)
@@ -297,17 +297,13 @@ Here is details about CLI arguments:
  #### 1. Install required packages
  ```
  cd Colossal-LLaMA
- pip install -r requirements.txt
+ pip install -e .
  ```
- #### 2. Install `xentropy`, `layer_norm` and `rotary`
+
+ #### 2. Install Apex
  ```bash
- git clone [email protected]:Dao-AILab/flash-attention.git
- # At the root folder
- cd csrc/xentropy && pip install .
- # At the root folder
- cd csrc/layer_norm && pip install .
- # At the root folder
- cd csrc/rotary && pip install .
+ git clone [email protected]:NVIDIA/apex.git
+ # Install from source.
  ```

  ### How to run
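The new Apex step above only clones the repository and notes "Install from source." For readers following along, a typical from-source build with the C++/CUDA extensions enabled looks roughly like the sketch below; the flags mirror NVIDIA's Apex README, are not part of this commit, and may need adjusting for older pip or Apex versions:

```bash
# Sketch of an Apex source install with C++/CUDA extensions (flags may vary by version).
git clone https://github.com/NVIDIA/apex.git   # the diff uses the SSH URL instead
cd apex
pip install -v --no-build-isolation \
  --config-settings "--build-option=--cpp_ext" \
  --config-settings "--build-option=--cuda_ext" ./
```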
@@ -427,25 +423,33 @@ Make sure master node can access all nodes (including itself) by ssh without pas
  Here is details about CLI arguments:
  * Pre-trained model path: `--pretrained`. Path to the pre-trained model in Hugging Face format.
  * Dataset path: `--dataset`. Path to the pre-tokenized dataset.
- * Booster plugin: `--plugin`. `gemini`, `gemini_auto`, `zero2``zero2_cpu` and `3d` are supported.For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
+ * Booster plugin: `--plugin`. `ddp`,`gemini`, `gemini_auto`, `zero2``zero2_cpu` and `3d` are supported.For more details, please refer to [Booster plugins](https://colossalai.org/docs/basics/booster_plugins/).
  * Intermediate checkpoint to load: `--load_checkpoint`. Path to the intermediate checkpoint. Saved checkpoint contains the states for `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`. If `load_checkpoint` points to the `modelling` folder, only the model weights will be loaded without any other states to support multi-stage training.
  * Save interval: `--save_interval`. The interval (steps) of saving checkpoints. The default value is 1000.
  * Checkpoint directory: `--save_dir`. The directory path to save checkpoint and intermediate states. Intermediate states include `lr_scheduler`, `optimizer`,`running_states.json` and `modelling`.
  * Tensorboard directory: `--tensorboard_dir`. The path to save tensorboard logs.
  * Configuration file: `--config_file`. The path to save the configuration file.
  * Number of epochs: `--num_epochs`. Number of training epochs. The default value is 1.
- * Micro batch size: `--micro_batch_size`. Batch size per GPU. The default value is 1.
+ * Batch size: `--batch_size`. Batch size per GPU. The default value is 1. For PP, it refers to number of samples per step.
  * Learning rate: `--lr`. The default value is 3e-4.
  * Max length: `--max_length`. Max context length. The default value is 4096.
  * Mixed precision: `--mixed_precision`. The default value is "fp16". "fp16" and "bf16" are supported.
  * Gradient clipping: `--gradient_clipping`. The default value is 1.0.
- * Weight decay: `-w`, `--weight_decay`. The default value is 0.1.
- * Warmup steps: `-s`, `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
+ * Weight decay: `--weight_decay`. The default value is 0.1.
+ * Warmup steps: `--warmup_steps`. The default value is calculated by 0.025 warmup ratio.
  * Gradient checkpointing: `--use_grad_checkpoint`. The default value is `False`. This saves memory at the cost of speed. You'd better enable this option when training with a large batch size.
  * Flash attention: `--use_flash_attn`. If you want to use flash attention, you must install `flash-attn` and related packages. The default value is `False`. This is helpful to accelerate training while saving memory. We recommend you always use flash attention.
  * Freeze non-embedding parameters: `--freeze_non_embeds_params`. Freeze non-embedding parameters. It can be helpful to align embeddings after extending vocabulary size.
- * Tensor parallelism size: `--tp`. TP size for 3d Parallelism. The default value is 1.
- * Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1.
+ * Tensor parallelism size: `--tp`. TP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+ * Pipeline parallelism size: `--pp`. PP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+ * Sequence parallelism size: `--sp`. SP size for 3d parallelism. The default value is 1. Used for 3d plugin.
+ * Zero stage: `--zero`. Zero stage for 3d Parallelism. The default value is 1. Used for 3d plugin.
+ * Sequence parallelism mode: `--sp_mode`. SP mode, used for 3d plugin. Choose from "split_gather", "ring", "all_to_all".
+ * Switch for sequence parallelism: `--enable_sequence_parallelism`. Whether to enable SP, used for 3d plugin.
+ * Zero CPU offload: `--zero_cpu_offload`. Whether to use offloading, used for 3d plugin.
+ * Micro batch size: `--microbatch_size`. Batch size for each process in PP, used for 3d plugin.
+ * Number of dummy sample: `--num_samples`. Number of samples for benchmarking.
+ * Benchmark switch: `--benchmark`. Benchmark performance using random dataset.

  ##### 4.2 Arguments for Supervised Fine-tuning
  We add support for gradient accumulation and NEFTuning for supervised fine-tuning and thus there are two more arguments apart from the arguments listed in [4.1 Arguments for Pretraining](#41-arguments-for-pretraining).
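To show how the newly documented 3d-plugin flags fit together, here is a hypothetical single-node launch; the entry script name (`train.py`), the launcher invocation, and all paths are illustrative assumptions rather than part of this diff:

```bash
# Hypothetical 8-GPU pretraining run with the 3d plugin; script name and paths are placeholders.
colossalai run --nproc_per_node 8 train.py \
    --pretrained /path/to/llama-hf \
    --dataset /path/to/tokenized_dataset \
    --plugin 3d \
    --tp 2 --pp 2 --zero 1 \
    --microbatch_size 1 \
    --batch_size 8 \
    --max_length 4096 \
    --mixed_precision bf16 \
    --use_flash_attn \
    --benchmark --num_samples 100
```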
New file (+24 -0)
@@ -0,0 +1,24 @@
+ import torch
+ from torch.utils.data import Dataset
+
+ from colossalai.accelerator import get_accelerator
+
+
+ class RandomDataset(Dataset):
+     def __init__(self, num_samples: int = 1000, max_length: int = 2048, vocab_size: int = 32000):
+         self.num_samples = num_samples
+         self.max_length = max_length
+         self.input_ids = torch.randint(
+             0, vocab_size, (num_samples, max_length), device=get_accelerator().get_current_device()
+         )
+         self.attention_mask = torch.ones_like(self.input_ids)
+
+     def __len__(self):
+         return self.num_samples
+
+     def __getitem__(self, idx):
+         return {
+             "input_ids": self.input_ids[idx],
+             "attention_mask": self.attention_mask[idx],
+             "labels": self.input_ids[idx],
+         }
