Add Int8GEMM (#2)

LittleLittleCloud · web-flow · commit c98c3b3b392a · 2025-02-11T10:36:42.000-08:00
* add GEMMInt8

* cache context

* update benchmark
diff --git a/README.md b/README.md
@@ -1,4 +1,4 @@
-# TorchSharp.BitsAndBytes
+﻿# TorchSharp.BitsAndBytes
 `TorchSharp.BitsAndBytes` is a C# binding library for the [bitsandbytes](https://github.com/bitsandbytes-foundation/bitsandbytes) library from Hugging Face. It provides 4-bit and 8-bit quantization for TorchSharp models.
 
 ## Usage
@@ -17,4 +17,25 @@ int blockSize = 64; // can be [64, 128, 256, 512, 1024]
 var dequantizedTensor = BitsAndByteUtils.Dequantize4Bit(quantiedTensor, absMax, input.dtype, quantizedDType, n, input.shape, blockSize);
 ```
 
-For more examples, please refer to the *incoming benchmark* project.
+For more examples, please refer to the [Benchmark](#benchmark) section.
+
+## Benchmark
+```
+
+BenchmarkDotNet v0.14.0, Windows 11 (10.0.26100.3037)
+Intel Core i9-14900K, 1 CPU, 32 logical and 24 physical cores
+.NET SDK 9.0.102
+  [Host]     : .NET 8.0.12 (8.0.1224.60305), X64 RyuJIT AVX2
+  DefaultJob : .NET 8.0.12 (8.0.1224.60305), X64 RyuJIT AVX2
+
+
+```
+| Method         | Mean        | Error     | StdDev    |
+|--------------- |------------:|----------:|----------:|
+| Quantize4Bit   |   536.35 μs | 12.164 μs | 35.290 μs |
+| Dequantize4Bit | 2,257.89 μs | 44.542 μs | 51.294 μs |
+| GEMV_4Bit_FP4  |    84.16 μs |  1.673 μs |  3.223 μs |
+| GEMV_4Bit_NF4  |    82.69 μs |  4.329 μs | 12.629 μs |
+| GEMV_FP32      |    49.59 μs |  0.975 μs |  2.035 μs |
+| GEMM_INT8      | 2,994.86 μs | 12.144 μs | 11.360 μs |
+| GEMM_FP32      | 4,495.49 μs | 35.264 μs | 32.986 μs |
diff --git a/TorchSharp.BitsAndBytes.Benchmark/CudaBenchmark.cs b/TorchSharp.BitsAndBytes.Benchmark/CudaBenchmark.cs
@@ -9,7 +9,7 @@
 
 namespace TorchSharp.BitsAndBytes.Benchmark;
 
-public class CudaBenchmark : IDisposable
+public class CudaBenchmark
 {
     private Tensor a1;
     private Tensor b;
@@ -19,7 +19,7 @@ public class CudaBenchmark : IDisposable
 
     public CudaBenchmark()
     {
-        a1 = torch.rand(new long[] { dim * 4, dim }, dtype: ScalarType.Float32).cuda();
+        a1 = torch.rand([dim * 4, dim], dtype: ScalarType.Float32).cuda();
     }
 
     private torch.Tensor quantizedTensor;
@@ -53,13 +53,37 @@ public void GEMV_4Bit_FP4()
     }
 
     [Benchmark]
-    public void GEMV_FP32()
+    public void GEMV_4Bit_NF4()
     {
         using var input = torch.rand(new long[] { 1, dim }, dtype: ScalarType.Float32).cuda();
+        using var result = BitsAndByteUtils.Gemv4Bit(input, quantizedTensor, [4 * dim, dim], absMax, blockSize, "nf4");
+    }
+
+    /// <summary>
+    /// FP32 baseline: multiplies a random [1, dim] CUDA row vector by the transpose of <c>b</c>.
+    /// </summary>
+    [Benchmark]
+    public void GEMV_FP32()
+    {
+        // torch.rand allocates on the CPU and .cuda() returns a separate device tensor,
+        // so the host tensor needs its own using to be released deterministically.
+        using var hostInput = torch.rand([1, dim], dtype: ScalarType.Float32);
+        using var input = hostInput.cuda();
         using var result = torch.matmul(input, b.T);
     }
 
-    public void Dispose()
+    /// <summary>
+    /// Multiplies a random [1, dim] int8 input by a random [dim, dim] int8 weight on CUDA
+    /// through <c>Function.Int8GEMM</c>.
+    /// </summary>
+    [Benchmark]
+    public void GEMM_INT8()
+    {
+        // torch.randint allocates on the CPU and .cuda() returns a separate device tensor,
+        // so each host tensor gets its own using instead of being left to the finalizer.
+        using var hostInput = torch.randint(-128, 127, new long[] { 1, dim }, dtype: ScalarType.Int8);
+        using var input = hostInput.cuda();
+        using var hostWeight = torch.randint(-128, 127, new long[] { dim, dim }, dtype: ScalarType.Int8);
+        using var weight = hostWeight.cuda();
+        using var result = Function.Int8GEMM(input, weight);
+    }
+
+    /// <summary>
+    /// FP32 counterpart of <c>GEMM_INT8</c>: same shapes and value range, multiplied with
+    /// <c>torch.matmul</c>.
+    /// </summary>
+    [Benchmark]
+    public void GEMM_FP32()
+    {
+        // torch.randint allocates on the CPU and .cuda() returns a separate device tensor,
+        // so each host tensor gets its own using instead of being left to the finalizer.
+        using var hostInput = torch.randint(-128, 127, new long[] { 1, dim }, dtype: ScalarType.Float32);
+        using var input = hostInput.cuda();
+        using var hostWeight = torch.randint(-128, 127, new long[] { dim, dim }, dtype: ScalarType.Float32);
+        using var weight = hostWeight.cuda();
+        using var result = torch.matmul(input, weight);
+    }
+
+    [GlobalCleanup]
+    public void Cleanup()
     {
         a1.Dispose();
         b.Dispose();
diff --git a/TorchSharp.BitsAndBytes.Benchmark/Program.cs b/TorchSharp.BitsAndBytes.Benchmark/Program.cs
@@ -1,4 +1,3 @@
 ﻿using BenchmarkDotNet.Running;
 using TorchSharp.BitsAndBytes.Benchmark;
-
 BenchmarkRunner.Run<CudaBenchmark>();
diff --git a/TorchSharp.BitsAndBytes.Tests/BitsAndBytes4BitTests.cs b/TorchSharp.BitsAndBytes.Tests/BitsAndBytes4BitTests.cs
@@ -69,6 +69,88 @@ public void Test4BitQuant(ScalarType inputDType, string quantizedDType, int bloc
         Assert.True(avg.First() <= 0.2);
     }
 
+    /// <summary>
+    /// Compares <c>Function.Int8GEMM</c> with a float32 <c>torch.matmul</c> baseline for 2-D inputs
+    /// (all transpose combinations) and 3-D inputs (non-transposed input only), using 20 random
+    /// weight widths per case.
+    /// </summary>
+    [CudaTheory]
+    [InlineData(32, 1, false, false, 16)]
+    [InlineData(32, 1, false, true, 16)]
+    [InlineData(32, 1, true, false, 16)]
+    [InlineData(32, 1, true, true, 16)]
+    [InlineData(64, 1, true, true, 16)]
+    [InlineData(128, 1, true, true, 16)]
+    [InlineData(512, 1, true, true, 16)]
+    [InlineData(32, 1, true, true, 512)]
+    [InlineData(32, 16, false, false, 16)]
+    [InlineData(32, 16, false, true, 16)]
+    [InlineData(32, 8, true, false, 16)]
+    [InlineData(32, 4, true, true, 16)]
+    [InlineData(128, 32, true, true, 16)]
+    [InlineData(512, 32, true, true, 16)]
+    [InlineData(32, 4, true, true, 512)]
+    public void TestInt8GEMM(int hiddenDim, int batchDim, bool transposeInput, bool transposeWeight, int seqDim)
+    {
+        const int iterations = 20;
+
+        // 2-D input: stored as [hidden, batch] when it will be transposed before the multiply.
+        long[] inputShape2D = transposeInput ? [hiddenDim, batchDim] : [batchDim, hiddenDim];
+        for (int iteration = 0; iteration < iterations; iteration++)
+        {
+            AssertInt8GemmMatchesFloatBaseline(inputShape2D, hiddenDim, transposeInput, transposeWeight);
+        }
+
+        // 3-D input: the transposed-input case is not exercised for 3-D tensors.
+        if (transposeInput)
+        {
+            return;
+        }
+
+        long[] inputShape3D = [batchDim, seqDim, hiddenDim];
+        for (int iteration = 0; iteration < iterations; iteration++)
+        {
+            AssertInt8GemmMatchesFloatBaseline(inputShape3D, hiddenDim, transposeInput: false, transposeWeight);
+        }
+    }
+
+    /// <summary>
+    /// Runs one random int8 multiplication through <c>Function.Int8GEMM</c> and asserts that its mean
+    /// absolute difference from the float32 <c>torch.matmul</c> result is at most 1e-5.
+    /// </summary>
+    private static void AssertInt8GemmMatchesFloatBaseline(long[] inputShape, int hiddenDim, bool transposeInput, bool transposeWeight)
+    {
+        // Every tensor created below (including host copies, casts and views) is released with the scope.
+        using var scope = torch.NewDisposeScope();
+
+        int outputChannel = 32 * Random.Shared.Next(1, 10);
+        long[] weightShape = transposeWeight ? [outputChannel, hiddenDim] : [hiddenDim, outputChannel];
+
+        var input = torch.randint(-128, 127, inputShape, ScalarType.Int8).cuda();
+        var weight = torch.randint(-128, 127, weightShape, ScalarType.Int8).cuda();
+
+        var inputFloat = input.to_type(ScalarType.Float32);
+        var weightFloat = weight.to_type(ScalarType.Float32);
+        var baseline = torch.matmul(
+            transposeInput ? inputFloat.t() : inputFloat,
+            transposeWeight ? weightFloat.t() : weightFloat);
+
+        // The transposition is expressed through strided views; no transpose flags are passed.
+        var result = Function.Int8GEMM(
+            transposeInput ? input.t() : input,
+            transposeWeight ? weight.t() : weight);
+
+        float meanAbsError = (baseline - result.to_type(ScalarType.Float32)).abs().mean().item<float>();
+
+        Assert.True(meanAbsError <= 1e-5);
+    }
+    
     [CudaTheory]
     [InlineData(ScalarType.Float32, "fp4", 64, 1024)]
     [InlineData(ScalarType.Float32, "nf4", 64, 1024)]
@@ -174,4 +256,46 @@ public void TestGemv4Bit3D128(ScalarType dtype, string quantizedDType, int block
         Assert.Equal(1, avg.Count);
         Assert.True(avg.First() == 0);
     }
+
+    /// <summary>
+    /// A [2, 3] x [3, 2] product without transposition yields the output shape [2, 2].
+    /// </summary>
+    [Fact]
+    public void TestCheckMatmul_ValidInputs()
+    {
+        // Dispose the native tensors deterministically, as the other tests in this file do.
+        using var A = torch.randint(0, 10, new long[] { 2, 3 }, ScalarType.Int8);
+        using var B = torch.randint(0, 10, new long[] { 3, 2 }, ScalarType.Int8);
+
+        var result = BitsAndByteUtils.CheckMatmul(A, B, false, false, ScalarType.Int8);
+
+        Assert.Equal([2, 2], result);
+    }
+
+    /// <summary>
+    /// A [2, 3] x [2, 2] product has mismatched inner dimensions (3 vs 2) and must be rejected.
+    /// </summary>
+    [Fact]
+    public void TestCheckMatmul_InvalidInputs()
+    {
+        // Dispose the native tensors deterministically, as the other tests in this file do.
+        using var A = torch.randint(0, 10, new long[] { 2, 3 }, ScalarType.Int8);
+        using var B = torch.randint(0, 10, new long[] { 2, 2 }, ScalarType.Int8);
+
+        Assert.Throws<ArgumentException>(() => BitsAndByteUtils.CheckMatmul(A, B, false, false, ScalarType.Int8));
+    }
+
+    /// <summary>
+    /// With A transposed, a stored [3, 2] A times a [3, 2] B contracts over the dimension of
+    /// size 3 and yields the output shape [2, 2].
+    /// </summary>
+    [Fact]
+    public void TestCheckMatmul_TransposedInputs()
+    {
+        // Dispose the native tensors deterministically, as the other tests in this file do.
+        using var A = torch.randint(0, 10, new long[] { 3, 2 }, ScalarType.Int8);
+        using var B = torch.randint(0, 10, new long[] { 3, 2 }, ScalarType.Int8);
+
+        var result = BitsAndByteUtils.CheckMatmul(A, B, true, false, ScalarType.Int8);
+
+        Assert.Equal([2, 2], result);
+    }
+
+    /// <summary>
+    /// A [2, 3] x [3, 2] product without transposition yields the output shape [2, 2].
+    /// </summary>
+    [Fact]
+    public void TestCheckMatmul_NullOutput()
+    {
+        // NOTE(review): CheckMatmul has no output parameter, so this currently exercises the same
+        // inputs as TestCheckMatmul_ValidInputs — confirm whether a distinct case was intended.
+        using var A = torch.randint(0, 10, new long[] { 2, 3 }, ScalarType.Int8);
+        using var B = torch.randint(0, 10, new long[] { 3, 2 }, ScalarType.Int8);
+
+        var result = BitsAndByteUtils.CheckMatmul(A, B, false, false, ScalarType.Int8);
+
+        Assert.Equal([2, 2], result);
+    }
 }
diff --git a/TorchSharp.BitsAndBytes/BitsAndByteUtils.cs b/TorchSharp.BitsAndBytes/BitsAndByteUtils.cs
@@ -195,7 +195,7 @@ public static Tensor Dequantize4Bit(
         return dequantizedTensor;
     }
 
-
+    
 
     public static Tensor Get4BitType(string typename, string device = "cuda", int blocksize = 64)
     {
@@ -421,4 +421,146 @@ public static torch.Tensor CreateDynamicMap(bool signed = true, int maxExponentB
         data.Sort();
         return torch.tensor(data.ToArray());
     }
+
+    /// <summary>
+    /// Validates that <paramref name="A"/> and <paramref name="B"/> can be multiplied and returns the
+    /// shape of the product.
+    /// </summary>
+    /// <remarks>
+    /// Supported rank combinations are 2-D x 2-D, 3-D x 2-D and 3-D x 3-D. The last two dimensions of
+    /// each operand are treated as the matrix; for a 3-D <paramref name="A"/> its leading dimension is
+    /// carried into the result. The leading dimensions of two 3-D operands are not compared.
+    /// </remarks>
+    /// <param name="A">Left operand.</param>
+    /// <param name="B">Right operand.</param>
+    /// <param name="transposed_A">Treat the last two dimensions of <paramref name="A"/> as swapped.</param>
+    /// <param name="transposed_B">Treat the last two dimensions of <paramref name="B"/> as swapped.</param>
+    /// <param name="expectedType">Element type both operands must have.</param>
+    /// <returns>The output shape: two entries for 2-D x 2-D, otherwise three.</returns>
+    /// <exception cref="ArgumentException">
+    /// An operand has the wrong element type, the rank combination is unsupported, or the contracted
+    /// dimensions differ.
+    /// </exception>
+    public static int[] CheckMatmul(Tensor A, Tensor B, bool transposed_A, bool transposed_B, ScalarType expectedType = ScalarType.Int8)
+    {
+        if (A.dtype != expectedType || B.dtype != expectedType)
+        {
+            throw new ArgumentException($"Expected {expectedType} input tensors A and B, but got {A.dtype} and {B.dtype}");
+        }
+
+        int[] sA = A.IntShape().ToArray();
+        int[] sB = B.IntShape().ToArray();
+
+        bool supportedRanks =
+            (sA.Length == 2 && sB.Length == 2) ||
+            (sA.Length == 3 && (sB.Length == 2 || sB.Length == 3));
+        if (!supportedRanks)
+        {
+            // Previously this fell through and returned null, which only failed later in the caller.
+            throw new ArgumentException(
+                $"Unsupported tensor ranks for matrix multiplication: A x B: [{string.Join(", ", sA)}] x [{string.Join(", ", sB)}]."
+            );
+        }
+
+        // The matrix occupies the last two dimensions of each operand.
+        int rowA = sA.Length - 2;
+        int colA = sA.Length - 1;
+        int rowB = sB.Length - 2;
+        int colB = sB.Length - 1;
+
+        // A transposed operand swaps which of its two matrix dimensions is contracted.
+        int outerA = transposed_A ? sA[colA] : sA[rowA];
+        int innerA = transposed_A ? sA[rowA] : sA[colA];
+        int innerB = transposed_B ? sB[colB] : sB[rowB];
+        int outerB = transposed_B ? sB[rowB] : sB[colB];
+
+        if (innerA != innerB)
+        {
+            throw new ArgumentException(
+                $"Tensor dimensions incorrect for matrix multiplication: A x B: [{string.Join(", ", sA)}] x [{string.Join(", ", sB)}] with transpose for A x B: {transposed_A} x {transposed_B}."
+            );
+        }
+
+        return sA.Length == 2
+            ? [outerA, outerB]
+            : [sA[0], outerA, outerB];
+    }
 }
diff --git a/TorchSharp.BitsAndBytes/BitsAndBytes.cs b/TorchSharp.BitsAndBytes/BitsAndBytes.cs
diff --git a/TorchSharp.BitsAndBytes/Function.cs b/TorchSharp.BitsAndBytes/Function.cs

-Original file line number
+Diff line change
+{
     private const string DllName = "libbitsandbytes_cuda121";
 -    /// <summary>
 -    /// Represents the CUDA __nv_bfloat16 type
 -    /// </summary>
 -    [StructLayout(LayoutKind.Sequential)]
 -    public struct NvBFloat16
 -    {
 -        public ushort Value;
 -    }
+-
 -    [DllImport(DllName)]
 -    public static extern void cdequantize_blockwise_fp32(
 -        IntPtr code,        // float*
 -        IntPtr A,          // float*
 -        IntPtr absmax,     // float*
 -        IntPtr output,     // unsigned char*
 -        int blocksize,
 -        int n,             // total size
 -        IntPtr stream);
+-
 -    [DllImport(DllName)]
 -    public static extern void cdequantize_blockwise_fp16(
 -        IntPtr code,        // float*
 -        IntPtr A,          // float*
 -        IntPtr absmax,     // float*
 -        IntPtr output,     // unsigned char*
 -        int blocksize,
 -        int n,             // total size
 -        IntPtr stream);
+-
 -    [DllImport(DllName)]
 -    public static extern void cdequantize_blockwise_bf16(
 -        IntPtr code,        // float*
 -        IntPtr A,          // float*
 -        IntPtr absmax,     // float*
 -        IntPtr output,     // unsigned char*
 -        int blocksize,
 -        int n,             // total size
 -        IntPtr stream);
+-
     [DllImport(DllName)]
     public static extern void cdequantize_blockwise_fp32_fp4(
         IntPtr code,        // float*
         int size,
         IntPtr stream   // cudaStream_t
     );
++
 +    /// <summary>
 +    /// Native int8 matrix-multiplication entry point of the bitsandbytes CUDA library.
 +    /// </summary>
 +    /// <remarks>
 +    /// The two flags are marshalled as one-byte booleans; the default P/Invoke marshalling for
 +    /// <see cref="bool"/> is a four-byte Win32 BOOL, which does not match a C/C++ <c>bool</c> parameter.
 +    /// </remarks>
 +    [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
 +    public static extern void cigemm(
 +        IntPtr context,
 +        [MarshalAs(UnmanagedType.I1)] bool transposeA,
 +        [MarshalAs(UnmanagedType.I1)] bool transposeB,
 +        int m,
 +        int n,
 +        int k,
 +        IntPtr A, // first operand (Function.Int8GEMM passes the weight pointer here)
 +        IntPtr B, // second operand (Function.Int8GEMM passes the input pointer here)
 +        IntPtr C, // output
 +        int lda,
 +        int ldb,
 +        int ldc);
++
 +    /// <summary>
 +    /// Native <c>get_context</c> export; returns an opaque handle that is passed as the
 +    /// <c>context</c> argument of <c>cigemm</c>.
 +    /// </summary>
 +    [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
 +    public static extern IntPtr get_context();
++
 +    /// <summary>
 +    /// Native <c>get_cusparse</c> export; returns an opaque handle.
 +    /// NOTE(review): presumably a cuSPARSE context — not used by any code visible here, confirm.
 +    /// </summary>
 +    [DllImport(DllName, CallingConvention = CallingConvention.Cdecl)]
 +    public static extern IntPtr get_cusparse();
+}
-Original file line number
+Diff line change
 +using System;
 +using System.Collections.Generic;
 +using System.Linq;
 +using System.Text;
 +using System.Threading.Tasks;
 +using static TorchSharp.torch;
++
 +namespace TorchSharp.BitsAndBytes;
++
 +/// <summary>
 +/// Managed wrappers around bitsandbytes CUDA kernels that operate on TorchSharp tensors.
 +/// </summary>
 +public class Function
 +{
 +    // Native context handles returned by get_context(), cached per device index.
 +    private static readonly Lazy<Dictionary<int, IntPtr>> _context = new(() => new Dictionary<int, IntPtr>());
++
 +    /// <summary>
 +    /// Integer General Matrix Multiplication (IGEMM) for 8-bit integer data types.
 +    /// </summary>
 +    /// <param name="input">Left operand; validated as Int8 by <c>BitsAndByteUtils.CheckMatmul</c>.</param>
 +    /// <param name="weight">Right operand; validated as Int8 by <c>BitsAndByteUtils.CheckMatmul</c>.</param>
 +    /// <param name="transposeWeight">
 +    /// Treat <paramref name="weight"/> as transposed. For a 2-D weight the flag is overwritten when the
 +    /// tensor's strides match a contiguous or a transposed layout.
 +    /// </param>
 +    /// <param name="transposeInput">
 +    /// Treat <paramref name="input"/> as transposed. With a 2-D weight the flag is overwritten when the
 +    /// tensor's strides match a contiguous or a transposed layout.
 +    /// </param>
 +    /// <returns>An Int32 tensor on the device of <paramref name="input"/> holding the product.</returns>
 +    /// <exception cref="ArgumentException">The operand shapes or ranks are not supported.</exception>
 +    /// <exception cref="NotImplementedException">Both operands are 3-D with batched-matmul shapes.</exception>
 +    public static Tensor Int8GEMM(
 +        Tensor input,
 +        Tensor weight,
 +        bool transposeWeight = false,
 +        bool transposeInput = false)
 +    {
 +        // CheckMatmul takes (A, B, transposed_A, transposed_B); input is A and weight is B.
 +        var sout = BitsAndByteUtils.CheckMatmul(input, weight, transposeInput, transposeWeight);
++
 +        if (input.shape.Length == 3 && weight.shape.Length == 3
 +            && input.shape[0] == weight.shape[0] && input.shape[2] == weight.shape[1])
 +        {
 +            // Batched 3-D x 3-D multiplication is not implemented; fail before allocating the output.
 +            throw new NotImplementedException();
 +        }
++
 +        var inputShape = input.IntShape().ToArray();
 +        var weightShape = weight.IntShape().ToArray();
 +        if (transposeInput && inputShape.Length == 2)
 +        {
 +            inputShape = [inputShape[1], inputShape[0]];
 +        }
 +        else if (transposeInput && inputShape.Length == 3)
 +        {
 +            // Swap the last two dimensions (the previous code repeated index 0 in the last slot).
 +            inputShape = [inputShape[0], inputShape[2], inputShape[1]];
 +        }
 +        if (transposeWeight && weightShape.Length == 2)
 +        {
 +            weightShape = [weightShape[1], weightShape[0]];
 +        }
 +        else if (transposeWeight && weightShape.Length == 3)
 +        {
 +            // Swap the last two dimensions (the previous code repeated index 0 in the last slot).
 +            weightShape = [weightShape[0], weightShape[2], weightShape[1]];
 +        }
++
 +        // cuBLAS expects column-major matrices while the tensors here are row-major. A row-major
 +        // matrix read as column-major is its transpose, so instead of A @ B = C we compute
 +        // B^T @ A^T = C^T and swap the operands and dimensions handed to the native routine:
 +        // column major: A @ B = C: [m, k] @ [k, n] = [m, n]
 +        // row major: B^T @ A^T = C^T: [m, k] @ [k, n] = [m, n]
 +        // column major with row major layout: B^T @ A^T = C^T: [k, m] @ [n, k] = [n, m]
 +        int m, n = 0, k, lda, ldb = 0, ldc;
++
 +        if (weightShape.Length == 2)
 +        {
 +            // Derive the transpose flags from the actual memory layout when it is recognisable.
 +            if (weight.stride(0) == weight.shape[1])
 +            {
 +                transposeWeight = false;
 +            }
 +            else if (weight.stride(1) == weight.shape[0])
 +            {
 +                transposeWeight = true;
 +            }
++
 +            if (input.shape.Length == 2)
 +            {
 +                if (input.stride(0) == input.shape[1])
 +                {
 +                    transposeInput = false;
 +                }
 +                else if (input.stride(1) == input.shape[0])
 +                {
 +                    transposeInput = true;
 +                }
 +            }
 +            else
 +            {
 +                if (input.stride(1) == input.shape[2])
 +                {
 +                    transposeInput = false;
 +                }
 +                else if (input.stride(2) == input.shape[1])
 +                {
 +                    transposeInput = true;
 +                }
 +            }
++
 +            if (inputShape.Length == 2)
 +            {
 +                n = inputShape[0];
 +                ldb = (int)input.stride(transposeInput ? 1 : 0);
 +            }
 +            else if (inputShape.Length == 3)
 +            {
 +                // The leading two dimensions are flattened into the row count.
 +                n = inputShape[0] * inputShape[1];
 +                ldb = inputShape[2];
 +            }
++
 +            m = weightShape[1];
 +            k = weightShape[0];
 +            lda = (int)weight.stride(transposeWeight ? 1 : 0);
 +            ldc = weightShape[1];
 +        }
 +        else if (weightShape.Length == 3)
 +        {
 +            // special case
 +            if (!(inputShape[0] == weightShape[0] && inputShape[1] == weightShape[1]))
 +            {
 +                throw new ArgumentException($"Only bsi,bso->io supported for tensor contractions, but dims for A x B were: [{string.Join(", ", inputShape)}] x [{string.Join(", ", weightShape)}]");
 +            }
++
 +            transposeInput = true;
 +            transposeWeight = false;
 +            m = weightShape[2];
 +            n = inputShape[2];
 +            k = weightShape[0] * weightShape[1];
++
 +            lda = m;
 +            ldb = inputShape[2];
 +            ldc = m;
 +        }
 +        else
 +        {
 +            // Without this guard the native kernel would be launched with m = n = k = 0.
 +            throw new ArgumentException($"Unsupported weight rank {weightShape.Length}; expected a 2-D or 3-D tensor.");
 +        }
++
 +        if (!_context.Value.TryGetValue(input.device_index, out var context))
 +        {
 +            context = BitsAndBytesCudaNative.get_context();
 +            _context.Value[input.device_index] = context;
 +        }
++
 +        // Allocate the output only after every validation step above has passed.
 +        var result = torch.zeros((long[])[.. sout], dtype: torch.int32, device: input.device);
++
 +        var A = LibTorchNativeMethod.THSStorage_data_ptr(input.Handle);
 +        var B = LibTorchNativeMethod.THSStorage_data_ptr(weight.Handle);
 +        var C = LibTorchNativeMethod.THSStorage_data_ptr(result.Handle);
 +        BitsAndBytesCudaNative.cigemm(
 +            context: context,
 +            transposeA: transposeWeight, // cuBLAS expects column major, but PyTorch is row major
 +            transposeB: transposeInput, // So we have to transpose A and B
 +            m: m,
 +            n: n,
 +            k: k,
 +            A: B,   // out_T = B_T @ A_T
 +            B: A,
 +            C: C,
 +            lda: lda,
 +            ldb: ldb,
 +            ldc: ldc);
 +        return result;
 +    }
 +}