Commit 4c20539

Merge branch 'HazyResearch:main' into main

2 parents: 08adf1f + 6b4a482
274 files changed: +23736 −854 lines

.gitignore (new file, +21)

@@ -0,0 +1,21 @@
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+
+# C extensions
+*.so
+
+# Distribution / packaging
+bin/
+build/
+develop-eggs/
+dist/
+eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+*.egg-info/
+.installed.cfg
+*.egg

MANIFEST.in (new file, +9)

@@ -0,0 +1,9 @@
+recursive-include csrc *.cu
+recursive-include csrc *.h
+recursive-include csrc *.cuh
+recursive-include csrc *.cpp
+
+recursive-include flash_attn *.cu
+recursive-include flash_attn *.h
+recursive-include flash_attn *.cuh
+recursive-include flash_attn *.cpp

Makefile (new file, +9)

@@ -0,0 +1,9 @@
+
+clean_dist:
+	rm -rf dist/*
+
+create_dist: clean_dist
+	python setup.py sdist
+
+upload_package: create_dist
+	twine upload dist/*
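
The targets above chain together (upload_package depends on create_dist, which depends on clean_dist), so a release can be driven by a single make invocation. A minimal sketch of how they might be used, assuming twine is installed and PyPI credentials are already configured:

```sh
# Hypothetical release flow using the Makefile targets above.
make create_dist     # clean dist/ and build a fresh source distribution
make upload_package  # rebuild and upload dist/* to PyPI via twine
```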

README.md (+33 −8)

@@ -8,7 +8,27 @@ Paper: https://arxiv.org/abs/2205.14135
 IEEE Spectrum [article](https://spectrum.ieee.org/mlperf-rankings-2022) about our submission to the MLPerf 2.0 benchmark using FlashAttention.
 ![FlashAttention](assets/flashattn_banner.jpg)
 
-#### Triton implementation of FlashAttention
+## Usage
+
+We've been very happy to see FlashAttention being widely adopted in such a short
+time after its release. This [page](https://github.com/HazyResearch/flash-attention/blob/main/usage.md)
+contains a partial list of places where FlashAttention is being used.
+
+## Full model code and training script
+
+We have released the full GPT model
+[implementation](https://github.com/HazyResearch/flash-attention/blob/main/flash_attn/models/gpt.py).
+We also provide optimized implementations of other layers (e.g., MLP, LayerNorm,
+cross-entropy loss, rotary embedding). Overall this speeds up training by 3-5x
+compared to the baseline implementation from Huggingface, reaching up to 189
+TFLOPs/sec per A100, equivalent to 60.6\% model FLOPs utilization (we don't need
+any activation checkpointing).
+
+We also include a training
+[script](https://github.com/HazyResearch/flash-attention/tree/main/training) to
+train GPT2 on Openwebtext and GPT3 on The Pile.
+
+## Triton implementation of FlashAttention
 
 Phil Tillet (OpenAI) has an experimental implementation of FlashAttention in Triton:
 https://github.com/openai/triton/blob/master/python/tutorials/06-fused-attention.py
@@ -18,9 +38,14 @@ and experiment with. The notations in the Triton implementation are also closer
 to what's used in our paper.
 
 
-## Alpha release (0.1).
+## Beta release (0.2).
+
+To install (requiring CUDA 11, NVCC, and a Turing or Ampere GPU):
+```sh
+pip install flash-attn
+```
 
-To compile (requiring CUDA 11, NVCC, and an Turing or Ampere GPU):
+Alternatively you can compile from source:
 ```
 python setup.py install
 ```
@@ -38,15 +63,15 @@ FlashAttention currently supports:
 3. Head dimensions that are multiples of 8, up to 128 (e.g., 8, 16, 24, ..., 128). Head dim > 64 backward requires A100.
 
 Our tentative roadmap:
-1. [Jun 2022] Make package pip-installable.
+1. ~~[Jun 2022] Make package pip-installable~~[Done, thanks to lucidrains].
 2. ~~[Jun 2022] Support SM86 GPUs (e.g., RTX 3080, 3090)~~[Done].
 3. [Jun 2022] Refactor to use Cutlass.
 4. ~~[Jun 2022] Support SM75 GPUs (e.g. T4)~~[Done].
 5. ~~[Jun 2022] Support bf16~~[Done].
 6. ~~[Jul 2022] Implement cross-attention~~[Done].
 7. ~~[Jul 2022] Support head dimension 128~~[Done].
 8. [Jul 2022] Support SM70 GPUs (V100).
-9. [Aug 2022] Fuse rotary embedding.
+9. ~~[Aug 2022] Fuse rotary embedding~~[Done].
 10. [Aug 2022] Support attention bias (e.g. ALiBi, relative positional encoding).
 
 ## Speedup and Memory Savings
@@ -148,10 +173,10 @@ and for his thoughtful answers to our questions about CUDA.
 ## Citation
 If you use this codebase, or otherwise found our work valuable, please cite:
 ```
-@article{dao2022flashattention,
-  title={FlashAttention: Fast and Memory-Efficient Exact Attention with IO-Awareness},
+@inproceedings{dao2022flashattention,
+  title={Flash{A}ttention: Fast and Memory-Efficient Exact Attention with {IO}-Awareness},
   author={Dao, Tri and Fu, Daniel Y. and Ermon, Stefano and Rudra, Atri and R{\'e}, Christopher},
-  journal={arXiv preprint arXiv:2205.14135},
+  booktitle={Advances in Neural Information Processing Systems},
   year={2022}
 }
 ```
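
For readers skimming this commit, here is a rough sketch of how the pip-installed package is typically called with the packed, unpadded layout that the C++ bindings below expect. This is illustrative only: the import path, function name (flash_attn_unpadded_qkvpacked_func), and keyword names are assumptions about the flash_attn Python interface, not something shown in this diff.

```python
# Hypothetical usage sketch; function name and signature are assumed, not taken from this commit.
import torch
from flash_attn.flash_attn_interface import flash_attn_unpadded_qkvpacked_func  # assumed import path

batch, seqlen, nheads, headdim = 4, 512, 16, 64   # head dim must be a multiple of 8, <= 128
# Packed, unpadded layout: (total_tokens, 3, nheads, headdim) with total_tokens = sum of seqlens.
qkv = torch.randn(batch * seqlen, 3, nheads, headdim, dtype=torch.float16, device="cuda")
# cu_seqlens marks sequence boundaries in the packed layout: [0, s_0, s_0 + s_1, ...].
cu_seqlens = torch.arange(0, (batch + 1) * seqlen, seqlen, dtype=torch.int32, device="cuda")

out = flash_attn_unpadded_qkvpacked_func(qkv, cu_seqlens, seqlen, dropout_p=0.0, causal=True)
# out has shape (batch * seqlen, nheads, headdim)
```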

assets/gpt2_training_curve.jpg (168 KB, image not shown)
assets/gpt2_training_efficiency.jpg (367 KB, image not shown)
assets/gpt3_training_curve.jpg (183 KB, image not shown)
assets/gpt3_training_efficiency.jpg (382 KB, image not shown)

csrc/flash_attn/fmha_api.cpp (+31 −18)

@@ -176,6 +176,16 @@ void set_params_dgrad(FMHA_dgrad_params &params,
     params.dsoftmax_sum = dsoftmax_sum_d;
 }
 
+void run_fmha_fwd(Launch_params<FMHA_fprop_params> &launch_params) {
+    if (launch_params.params.d <= 32) {
+        run_fmha_fwd_hdim32(launch_params);
+    } else if (launch_params.params.d <= 64) {
+        run_fmha_fwd_hdim64(launch_params);
+    } else if (launch_params.params.d <= 128) {
+        run_fmha_fwd_hdim128(launch_params);
+    }
+}
+
 std::vector<at::Tensor>
 mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q := \sum_{i=0}^{b} s_i
         const at::Tensor &k,         // total_k x num_heads x head_size, total_k := \sum_{i=0}^{b} s_i
@@ -299,21 +309,29 @@ mha_fwd(const at::Tensor &q,         // total_q x num_heads x head_size, total_q
     // state
     // We use a custom RNG that increases the offset by batch_size * nheads * 32.
     int64_t counter_offset = launch_params.params.b * launch_params.params.h * 32;
-    at::PhiloxCudaState rng_engine_inputs;
 
     if( is_dropout ) {
         // See Note [Acquire lock when using random generators]
         std::lock_guard<std::mutex> lock(gen->mutex_);
         launch_params.params.philox_args = gen->philox_cuda_state(counter_offset);
     }
 
-    run_fmha_fp16_sm80(launch_params);
+    run_fmha_fwd(launch_params);
 
     std::vector<at::Tensor> result = {softmax_lse};
     if (return_softmax) {result.push_back(s);}
     return result;
 }
 
+void run_fmha_bwd(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    if (params.d <= 32) {
+        run_fmha_bwd_hdim32(params, stream, configure);
+    } else if (params.d <= 64) {
+        run_fmha_bwd_hdim64(params, stream, configure);
+    } else if (params.d <= 128) {
+        run_fmha_bwd_hdim128(params, stream, configure);
+    }
+}
 
 std::vector<at::Tensor>
 mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
@@ -341,7 +359,7 @@ mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
     bool is_sm80 = dprops->major == 8 && dprops->minor == 0;
     bool is_sm8x = dprops->major == 8 && dprops->minor >= 0;
     TORCH_CHECK(is_sm8x || is_sm75);
-    auto launch = &run_fmha_dgrad_fp16_sm80;
+    auto launch = &run_fmha_bwd;
 
     bool is_dropout = p_dropout > 0.0;
     auto stream = at::cuda::getCurrentCUDAStream().stream();
@@ -454,17 +472,13 @@ mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
 
     launch(params, stream, /*configure=*/true);
 
-    at::Tensor dk_accum, dv_accum;
     if (params.num_splits > 1) {
-        // dk_accum = torch::zeros({total_k, num_heads, head_size}, opts.dtype(at::kFloat));
-        // dv_accum = torch::zeros({total_k, num_heads, head_size}, opts.dtype(at::kFloat));
-        // params.dk_accum_ptr = dk_accum.data_ptr();
-        // params.dv_accum_ptr = dv_accum.data_ptr();
-        dk.zero_();
-        dv.zero_();
-    } else {
-        // params.dk_accum_ptr = nullptr;
-        // params.dv_accum_ptr = nullptr;
+        if (!dq_tmp.defined()) {
+            dq_tmp = torch::zeros({total_q, num_heads, head_size}, opts.dtype(at::kFloat));
+            params.o_tmp_ptr = dq_tmp.data_ptr();  // o_tmp stores dq_tmp in the backward pass
+        } else {
+            dq_tmp.zero_();
+        }
     }
 
     auto gen = at::get_generator_or_default<at::CUDAGeneratorImpl>(
@@ -481,10 +495,10 @@ mha_bwd(const at::Tensor &dout,  // total_q x num_heads, x head_size
 
     launch(params, stream, /*configure=*/false);
 
-    // if (params.num_splits > 1) {
-    //     dk.copy_(dk_accum);
-    //     dv.copy_(dv_accum);
-    // }
+    if (params.num_splits > 1) {
+        dq.copy_(dq_tmp);
+    }
+
     return { dq, dk, dv, softmax_d };
 }
 
@@ -597,7 +611,6 @@ mha_fwd_block(const at::Tensor &q, // total_q x num_heads x head_size, t
     // number of times random will be generated per thread, to offset philox counter in thc random
     // state
     int64_t counter_offset = launch_params.elts_per_thread;
-    at::PhiloxCudaState rng_engine_inputs;
 
     if( is_dropout ) {
         // See Note [Acquire lock when using random generators]
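
A note on the mha_bwd change above: when the backward pass is split over the key/value sequence (num_splits > 1), partial dq contributions are accumulated into a float32 buffer (dq_tmp, reusing the o_tmp pointer) and copied back into dq at the end; the Gmem_tile_o::atomic_add added later in this commit does the per-split accumulation with atomicAdd. A conceptual sketch of that pattern, not the repo's API (compute_partial_dq is a hypothetical stand-in for one kernel split):

```python
# Conceptual sketch of split accumulation in fp32; not the actual kernel code.
import torch

def accumulate_dq(num_splits, total_q, num_heads, head_size, compute_partial_dq):
    # Accumulate per-split dq contributions in float32 for numerical accuracy
    # (the kernel does this with atomicAdd into dq_tmp), then cast back at the end.
    dq_tmp = torch.zeros(total_q, num_heads, head_size, dtype=torch.float32, device="cuda")
    for split in range(num_splits):
        dq_tmp += compute_partial_dq(split)  # hypothetical per-split contribution
    return dq_tmp.to(torch.float16)          # mha_bwd then does dq.copy_(dq_tmp)
```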

csrc/flash_attn/src/.DS_Store (−6 KB, binary file not shown)

csrc/flash_attn/src/fmha.h (+8 −3)

@@ -36,7 +36,8 @@
 #include <ATen/cuda/CUDAGeneratorImpl.h>
 #endif
 
-#include <ATen/cuda/CUDAGraphsUtils.cuh>
+#include <ATen/cuda/CUDAContext.h>
+#include <ATen/cuda/detail/UnpackRaw.cuh>
 
 #include <fmha_utils.h>
 
@@ -195,9 +196,13 @@ struct Launch_params{
 
 ////////////////////////////////////////////////////////////////////////////////////////////////////
 
-void run_fmha_fp16_sm80(Launch_params<FMHA_fprop_params> &launch_params);
+void run_fmha_fwd_hdim32(Launch_params<FMHA_fprop_params> &launch_params);
+void run_fmha_fwd_hdim64(Launch_params<FMHA_fprop_params> &launch_params);
+void run_fmha_fwd_hdim128(Launch_params<FMHA_fprop_params> &launch_params);
 
-void run_fmha_dgrad_fp16_sm80(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+void run_fmha_bwd_hdim64(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
+void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure);
 
 void run_fmha_block_fp16_sm80(Launch_params<FMHA_fprop_params> &launch_params, const bool configure);

csrc/flash_attn/src/fmha/gmem_tile.h (+21 −51)

@@ -34,20 +34,6 @@
 
 namespace fmha {
 
-// template <typename half2_t>
-// inline __device__ void atomic_add_CAS(half2_t *address, const half2_t val) {
-//     uint32_t *address_as_ui = (uint32_t *)address;
-//     uint32_t old = *address_as_ui;
-//     uint32_t assumed;
-//     do {
-//         assumed = old;
-//         half2_t sum = __hadd2(val, reinterpret_cast<half2_t(&)>(old));
-//         old = atomicCAS(address_as_ui, assumed, reinterpret_cast<uint32_t(&)>(sum));
-//     } while (assumed != old);
-// }
-
-////////////////////////////////////////////////////////////////////////////////////////////////////
-
 template<
     // The dimensions of the tile computed by the CTA.
     typename Cta_tile_,
@@ -148,43 +134,6 @@ struct Gmem_tile_qkv {
         }
     }
 
-    template <typename elem_type>
-    inline __device__ void atomic_add(const uint4 (&data)[LDGS]) {
-        int row_ = tidx_ / THREADS_PER_ROW;
-        #pragma unroll
-        for( int ii = 0; ii < LDGS; ++ii ) {
-            using elem2_type = typename std::conditional<std::is_same<elem_type, __half>::value, __half2, __nv_bfloat162>::type;
-            // char *ptr_ = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
-            elem2_type *ptr_ = reinterpret_cast<elem2_type *>(ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes);
-            if (col_predicate && (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)) {
-                #pragma unroll
-                for (int jj = 0; jj < 4; ++jj) {
-                    atomicAdd(ptr_ + jj, reinterpret_cast<const elem2_type(&)[4]>(data[ii])[jj]);
-                    // atomic_add_CAS(ptr_ + jj, reinterpret_cast<const elem2_type(&)[4]>(data[ii])[jj]);
-                }
-            }
-        }
-    }
-
-    // Not being used. This only supports converting from fp16 -> fp32 for now (not bf16 -> fp32).
-    inline __device__ void atomic_add_float(const uint4 (&data)[LDGS]) {
-        static_assert(BYTES_PER_ELEMENT == 4);  // Only support fp32
-        int row_ = tidx_ / THREADS_PER_ROW;
-        #pragma unroll
-        for( int ii = 0; ii < LDGS; ++ii ) {
-            // char *ptr_ = ptr + (int64_t)ii * ROWS_PER_LDG * row_stride_in_bytes;
-            float *ptr_ = reinterpret_cast<float *>(ptr + (uint32_t)ii * ROWS_PER_LDG * row_stride_in_bytes);
-            if (col_predicate && (row_ + ii * ROWS_PER_LDG) < min(ROWS, actual_seqlen)) {
-                #pragma unroll
-                for (int jj = 0; jj < 4; ++jj) {
-                    const float2 data_f = fmha::half2_unpack<__half>(reinterpret_cast<const uint32_t(&)[4]>(data[ii])[jj]);
-                    atomicAdd(ptr_ + jj * 2, data_f.x);
-                    atomicAdd(ptr_ + jj * 2 + 1, data_f.y);
-                }
-            }
-        }
-    }
-
     inline __device__ void move(const int steps = 1) {
         // ptr += (int64_t)ROWS * row_stride_in_bytes * steps;
         ptr += (uint32_t)ROWS * row_stride_in_bytes * steps;
@@ -306,6 +255,27 @@ struct Gmem_tile_o {
         }
     }
 
+    // Store data to global memory with atomicAdd.
+    inline __device__ void atomic_add(const uint4 (&src)[STGS_PER_LOOP], int mi) {
+        static_assert(BYTES_PER_ELEMENT == 4);  // Only do atomic add on floats
+        int row_ = tidx_ / THREADS_PER_ROW;
+        #pragma unroll
+        for( int ii = 0; ii < STGS_PER_LOOP; ++ii ) {
+            int jj = mi * STGS_PER_LOOP + ii;
+            if ((!col_predicate) || (row_ + jj * ROWS_PER_STG >= this->actual_seqlen_q)) {
+                break;
+            }
+
+            if( !HAS_INCOMPLETE_STG || (jj < STGS - 1 || this->is_active_for_last_stg_) ) {
+                float *ptr_ = reinterpret_cast<float *>(this->ptr_ + jj * ROWS_PER_STG * this->row_stride_in_bytes);
+                #pragma unroll
+                for (int jj = 0; jj < 4; ++jj) {
+                    atomicAdd(ptr_ + jj, reinterpret_cast<const float(&)[4]>(src[ii])[jj]);
+                }
+            }
+        }
+    }
+
     // Load data from global memory.
     inline __device__ void load(uint4 (&dst)[STGS_PER_LOOP], int mi) {
         static_assert(BYTES_PER_ELEMENT == 4);

(new file, +12; filename not shown in this view)

@@ -0,0 +1,12 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include "fmha_bwd_launch_template.h"
+
+void run_fmha_bwd_hdim128(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    FP16_SWITCH(params.is_bf16, ([&] {
+        using Kernel_traits = FMHA_kernel_traits<128, 128, 16, 1, 8, 0x100u, elem_type>;
+        run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+    }));
+}

(new file, +17; filename not shown in this view)

@@ -0,0 +1,17 @@
+// Copyright (c) 2022, Tri Dao.
+
+// Splitting the different head dimensions to different files to speed up compilation.
+
+#include "fmha_bwd_launch_template.h"
+
+void run_fmha_bwd_hdim32(FMHA_dgrad_params &params, cudaStream_t stream, const bool configure) {
+    FP16_SWITCH(params.is_bf16, ([&] {
+        if (params.seqlen_k == 128) {
+            using Kernel_traits = FMHA_kernel_traits<128, 32, 16, 1, 8, 0x08u, elem_type>;
+            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+        } else if (params.seqlen_k >= 256) {
+            using Kernel_traits = FMHA_kernel_traits<256, 32, 16, 1, 8, 0x08u, elem_type>;
+            run_fmha_bwd_loop<Kernel_traits>(params, stream, configure);
+        }
+    }));
+}
