
Testx #216


Open · wants to merge 145 commits into base: main


Commits (145)
9ab4df0
add ck moe stage1
valarLip Feb 6, 2025
84c8164
fix ref implement
valarLip Feb 6, 2025
a8ca6a0
calculate tflops
valarLip Feb 6, 2025
63af9f4
cherrypick int4 moe and merge
Jan 27, 2025
5c31867
enable hipblas legacy
HaiShaw Jan 29, 2025
89c771b
fix initial sglang integration error
HaiShaw Jan 30, 2025
4550df1
Fix PREBUILD_KERNELS errors, comment some missing header files
Feb 3, 2025
92e6623
cherrypick int32 for int4 and merge
Feb 4, 2025
80be63d
int4moe silu to gelu
Feb 7, 2025
74ee788
fix build
Feb 9, 2025
a0f270b
debugging coredump
Feb 9, 2025
caf8b1f
add torch_moe stage 2
charlifu Feb 13, 2025
a8a5293
pass torch_moe_stage2
valarLip Feb 13, 2025
1c02b7c
add debug version
valarLip Feb 14, 2025
9d7c5e1
update
valarLip Feb 14, 2025
dc7073c
rename num_tokens_post_padded -> num_valid_ids
valarLip Feb 14, 2025
6238087
fix moe_stage1
valarLip Feb 14, 2025
375cc35
update test
valarLip Feb 14, 2025
9c10f56
Merge remote-tracking branch 'origin/main' into ck_moe_2stage
junhaha666 Feb 14, 2025
5014832
fix asm_moe.cpp
junhaha666 Feb 14, 2025
7493e51
add template ck moe gemm
junhaha666 Feb 14, 2025
f50725c
merge code...
valarLip Feb 14, 2025
ec4cb82
disable stage2
valarLip Feb 14, 2025
4c749e7
compile pass with dev/ck_moe_gemm_hotfix
valarLip Feb 14, 2025
66fb030
add fp8 quant case
valarLip Feb 14, 2025
c210603
debug moe stage2
Feb 15, 2025
6e8491d
update
valarLip Feb 15, 2025
213d5f6
fix
Feb 15, 2025
81e32a5
Merge branch 'main' into ck_moe_2stage
valarLip Feb 15, 2025
3d8c531
add ck_moe_2stages
Feb 15, 2025
f1e2752
update
valarLip Feb 15, 2025
e43e16f
do CK copy and rm in mainprocess only
valarLip Feb 16, 2025
3fe47da
fix quant
junhaha666 Feb 17, 2025
ed698e2
fix quant2
junhaha666 Feb 17, 2025
1aeb356
use new ck, slight opt perf
Feb 17, 2025
a2d9da2
Move moe_ck_2stages out of module_moe
junhaha666 Feb 17, 2025
67c0e99
Merge branch 'main' into ck_moe_2stage
valarLip Feb 18, 2025
3fc08ee
add int4 W to moe stage2
amd-zfyu Feb 20, 2025
fe5bebd
update function issues: int4
amd-zfyu Feb 20, 2025
f17baac
gemm1 int add
amd-zfyu Feb 20, 2025
91aa062
update gemm1 port
amd-zfyu Feb 20, 2025
1722f3c
fix bugs
amd-zfyu Feb 20, 2025
6dee16d
format code
amd-zfyu Feb 21, 2025
341176e
clean py test debug code
amd-zfyu Feb 21, 2025
6399993
fix gemm2 port issue
amd-zfyu Feb 21, 2025
fbcb245
update BK1 in GEMM1&2 Port
amd-zfyu Feb 21, 2025
10a575c
fix coredump
Feb 23, 2025
187d760
refine code, use int4 and fp8 tests together
Feb 23, 2025
fe3bbad
merge main
Feb 23, 2025
3988ece
fix dtype mismatch in torch test
Feb 23, 2025
56f962d
fix ut err
Feb 23, 2025
742b97d
cleanup useless codes
Feb 23, 2025
becb1f6
refine codes in pr
Feb 23, 2025
284aef6
revert ck branch
Feb 24, 2025
6e62a8f
add new int4 optimized kernel
Feb 24, 2025
aca940b
change unit test conf
Feb 24, 2025
2846661
merge main
Feb 24, 2025
882b276
fix merge err
Feb 24, 2025
80de9ce
add new int4 optimized kernel
Feb 24, 2025
f9c65cb
change unit test conf
Feb 24, 2025
52ec7f5
fix merge err
Feb 24, 2025
14cd92e
change shuffle logic
Feb 24, 2025
46cc594
change select tile logic for f8a16
Feb 24, 2025
15da23c
fix typo
Feb 24, 2025
08856d6
merge fix from felix: shuffle+quant
amd-zfyu Feb 24, 2025
ad08d0b
change tile strategy
Feb 25, 2025
3c3d3b8
merge main
Feb 25, 2025
0a854f5
change select tile logic int4
Feb 25, 2025
f7b5b90
fix typo
Feb 25, 2025
0c29953
Sync paged_attention_rocm() changes from 1b6ab3ce
poyenc Feb 25, 2025
f2e5481
Add kv_indptr, kv_page_indices, kv_last_page_lens params
poyenc Feb 14, 2025
5907ae8
Use single workspace buffer to accommodate intermediate tensors
poyenc Feb 18, 2025
8dcf62c
ignore kv_last_page_lens when page size = 1 (#126)
fsx950223 Feb 19, 2025
2f62813
merge upstream main
Feb 25, 2025
b82aae1
change tile config
Feb 26, 2025
e1b5f65
MulABScale->MulABScaleWin4(with out*16)
amd-zfyu Feb 26, 2025
ff77f8b
CK moe stage 1 passed with input&W == 1
amd-zfyu Feb 26, 2025
af3cda6
CK MOE Stage one PASS with randn input
amd-zfyu Feb 26, 2025
edcc5b2
fix int4 no smooth
Feb 27, 2025
b7c5dcc
fix bugs for stage2 : skip mblk32 for gemm2
amd-zfyu Feb 28, 2025
c540f32
remove smooth buffer loading
Feb 28, 2025
54027cb
Merge remote-tracking branch 'origin/main' into testx
junhaha666 Feb 28, 2025
f457564
add fused_moe_api : aiter_moe
junhaha666 Feb 28, 2025
e6601be
add back mask
Mar 1, 2025
738250d
revert testing codes
shengnxu Mar 1, 2025
d5979c8
fix gemm2 bugs: randn input pass~
amd-zfyu Mar 3, 2025
b1c6dbf
add CK 2 stage merge run wint4
amd-zfyu Mar 4, 2025
bec0d21
gemm tune: CShuffleMXDLPerWave 4->1
amd-zfyu Mar 4, 2025
47b38f9
sync input data size to ASM version(w1 = w1/10)
amd-zfyu Mar 5, 2025
3cb87f7
update ck
Mar 5, 2025
8a230e3
fix interface mismatch
valarLip Mar 5, 2025
e43a9b1
update torch stages
valarLip Mar 5, 2025
3301579
int4 ck impl per token per channel quant ok
Mar 6, 2025
296eb00
fix bf16 and blocksize 32
Mar 6, 2025
2cedaf0
add Mblk select func: token>128? 128 : 32
amd-zfyu Mar 7, 2025
d7d320b
use hip quant and recover per tensor quant instance
junhaha666 Mar 7, 2025
5f76725
fix
junhaha666 Mar 7, 2025
f3bba95
use int64_t as scatter idx
Mar 10, 2025
e70ee4d
revert tile switch logic
junhaha666 Mar 10, 2025
f1dc7ed
Merge remote-tracking branch 'origin/main' into ck_moe_2stage_int4
junhaha666 Mar 12, 2025
d0d7f87
update ck and support activation type
Mar 12, 2025
311249a
fix ut accuracy due to act type
Mar 12, 2025
4f151db
fix typo
Mar 12, 2025
7899c18
update ck
Mar 13, 2025
fa9da09
fix ck dst oob
Mar 14, 2025
f3ce6ac
use uint for 23w
Mar 17, 2025
b78248a
remove torch.zeros
Mar 17, 2025
cf938a2
Revert "remove torch.zeros"
Mar 17, 2025
b8ad779
fix sorting output
Mar 17, 2025
01b40ab
updated MPerBlock selection logic for int4 2stage moe
Mar 17, 2025
4f9f890
merge main
Mar 18, 2025
eeedb5a
update ck fix build
Mar 18, 2025
35b778d
merge testx
Mar 18, 2025
bdbbd07
merge testx and ut ok
Mar 18, 2025
06d7055
merge
Mar 18, 2025
4ab9e82
unify act type
Mar 18, 2025
8220d56
fix prebuild err in sampling
Mar 18, 2025
0463e18
fix miss file
Mar 18, 2025
79c8f79
rm sampling in prebuild
Mar 18, 2025
62af8b1
temp rm sampling
Mar 18, 2025
a9e2d00
Revert "add tree_speculative_sampling_target_only (#168)"
valarLip Mar 18, 2025
2b0bbfb
add more tests
Mar 18, 2025
bca85f3
fix fp8 per token quant
Mar 18, 2025
8d09eb2
merge main
valarLip Mar 18, 2025
455b993
prebuild pass
valarLip Mar 18, 2025
db0c6d8
typo
valarLip Mar 18, 2025
6634e55
merge prebuild fix
Mar 18, 2025
582c6e9
support 128x
Mar 20, 2025
3fd21ba
run fp8
lalala-sh Apr 2, 2025
0f9707d
update
amd-ruitang3 Apr 3, 2025
01ea661
fix moe gemm12
Apr 3, 2025
196aa36
3rdparty/composable_kernel
Apr 3, 2025
5c06d97
enable act switch
amd-ruitang3 Apr 3, 2025
1427c49
int4 gemm2 ok
amd-ruitang3 Apr 3, 2025
1ed44b4
update ck
lalala-sh Apr 3, 2025
f4a2f46
fp8 16x16 ok
Apr 3, 2025
af5ace6
fix int4
Apr 4, 2025
9144522
fix fused int4 2stage
Apr 4, 2025
6d23877
add llvm opt
Apr 4, 2025
41edcc9
remove useless comments
lalala-sh Apr 8, 2025
bb7dd0d
fix moe fused act in dtype:fp8, i4
lalala-sh Apr 8, 2025
17f1f01
fix act switch for moe i4
lalala-sh Apr 8, 2025
d302d02
remove kernel_suffix: compatible to triton 3.3
HaiShaw Apr 8, 2025
4e54917
Merge pull request #275 from HaiShaw/kernel_suffix
HaiShaw Apr 8, 2025
d9c1a73
update ck, fix bug
Apr 9, 2025
2 changes: 1 addition & 1 deletion 3rdparty/composable_kernel
Submodule composable_kernel updated 53 files
+3 −3 CHANGELOG.md
+52 −7 Jenkinsfile
+3 −0 client_example/10_grouped_convnd_bwd_data/CMakeLists.txt
+3 −3 client_example/10_grouped_convnd_bwd_data/README.md
+205 −0 client_example/10_grouped_convnd_bwd_data/grouped_conv2d_bwd_data_ngchw.cpp
+6 −6 example/01_gemm/CMakeLists.txt
+10 −1 example/09_convnd_fwd/CMakeLists.txt
+18 −2 example/15_grouped_gemm/run_grouped_gemm_example.inc
+6 −0 example/65_gemm_multiply_multiply/CMakeLists.txt
+61 −57 example/65_gemm_multiply_multiply/moe_gemm1_xdl_fp8.cpp
+39 −73 example/65_gemm_multiply_multiply/moe_gemm1_xdl_pk_i4.cpp
+52 −33 example/65_gemm_multiply_multiply/moe_gemm2_xdl_fp8.cpp
+12 −9 example/65_gemm_multiply_multiply/moe_gemm2_xdl_pk_i4.cpp
+1 −1 example/ck_tile/15_fused_moe/instances/fused_moe_api.cpp
+621 −0 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_dequant_v1.hpp
+573 −0 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_gufusion_v1.hpp
+99 −42 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_b_preshuffle_selector.hpp
+22 −3 include/ck/tensor_operation/gpu/block/blockwise_gemm_pipeline_xdlops_base.hpp
+3 −1 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v4r1_gather.hpp
+7 −7 include/ck/tensor_operation/gpu/block/thread_group_tensor_slice_transfer_v7r3_scatter.hpp
+397 −40 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_data_multiple_d_xdl_cshuffle_v1.hpp
+7 −0 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_two_stage_xdl_cshuffle.hpp
+19 −0 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_bwd_weight_xdl_cshuffle.hpp
+25 −2 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle.hpp
+26 −3 include/ck/tensor_operation/gpu/device/impl/device_grouped_conv_fwd_multiple_abd_xdl_cshuffle_v3.hpp
+61 −9 include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_multiple_d_dl.hpp
+49 −8 include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl.hpp
+51 −9 include/ck/tensor_operation/gpu/device/impl/device_grouped_gemm_xdl_splitk_cshuffle.hpp
+12 −10 include/ck/tensor_operation/gpu/device/impl/device_moe_gemm.hpp
+4 −3 include/ck/tensor_operation/gpu/grid/gridwise_elementwise_2d.hpp
+282 −100 include/ck/tensor_operation/gpu/grid/gridwise_moe_gemm.hpp
+4 −3 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1_gather.hpp
+15 −37 include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v7r3_scatter.hpp
+19 −13 include/ck/tensor_operation/operator_transform/transform_conv_ngchw_to_nhwgc.hpp
+32 −19 include/ck/utility/dynamic_buffer.hpp
+7 −0 include/ck/utility/tuple_helper.hpp
+10 −6 include/ck_tile/ops/fused_moe/kernel/moe_sorting_kernel.hpp
+74 −15 library/include/ck/library/reference_tensor_operation/cpu/reference_moe_gemm.hpp
+144 −0 ...tensor_operation_instance/gpu/grouped_conv_bwd_data/device_grouped_conv_bwd_data_transpose_xdl_instance.hpp
+61 −1 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data.hpp
+91 −0 library/include/ck/library/tensor_operation_instance/gpu/grouped_convolution_backward_data_xdl.inc
+3 −0 library/src/tensor_operation_instance/gpu/grouped_conv2d_bwd_data/CMakeLists.txt
+48 −0 ...ance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_bf16_instance.cpp
+48 −0 ...tance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f16_instance.cpp
+48 −0 ...tance/gpu/grouped_conv2d_bwd_data/xdl/device_grouped_conv2d_bwd_data_xdl_ngchw_gkyxc_ngkhw_f32_instance.cpp
+3 −0 library/src/tensor_operation_instance/gpu/grouped_conv3d_bwd_data/CMakeLists.txt
+49 −0 ...e/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_bf16_instance.cpp
+49 −0 ...ce/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f16_instance.cpp
+49 −0 ...ce/gpu/grouped_conv3d_bwd_data/xdl/device_grouped_conv3d_bwd_data_xdl_ngcdhw_gkzyxc_ngkdhw_f32_instance.cpp
+6 −1 profiler/include/profiler/profile_grouped_conv_bwd_data_impl.hpp
+33 −1 profiler/src/profile_grouped_conv_bwd_data.cpp
+2 −1 script/convert_miopen_driver_to_profiler.py
+7 −1 test/grouped_convnd_bwd_data/test_grouped_convnd_bwd_data_xdl.cpp
2 changes: 0 additions & 2 deletions aiter/__init__.py
@@ -31,8 +31,6 @@
 from .ops.rope import *
 from .ops.topk import *
 from .ops.mha import *
-from .ops.speculative_sampling import *
-from .ops.eagle_utils import *
 from .ops.gradlib import *
 from .aot.norm import *
 from . import mla
10 changes: 8 additions & 2 deletions aiter/aot/triton_compile.py
@@ -10,7 +10,6 @@
 from typing import List

 import triton
-from triton.compiler.code_generator import kernel_suffix
 from triton.backends.amd.driver import ty_to_cpp

 desc = """
@@ -104,7 +103,14 @@ def constexpr(s):
         arg_types += [signature[i]]

     # dump C stub code
-    suffix = kernel_suffix(signature.values(), attrs)
+    suffix = ''
+    for i, ty in enumerate(signature.values()):
+        suffix += str(i)
+        if hints.get((i, ), None) == 1:
+            suffix += 'c'
+        if hints.get((i, ), None) == 16:
+            suffix += 'd'
+
     func_name = '_'.join([out_name, sig_hash, suffix])

     hex_ = binascii.hexlify(ccinfo.asm["hsaco"]).decode('utf-8')
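For reference, a standalone sketch of the naming scheme the inline loop above produces, mirroring the convention of the removed kernel_suffix() helper; the signature and hints values here are hypothetical, not taken from the PR:

# Hypothetical repro of the suffix loop above; values are made up.
signature = {0: "*fp16", 1: "i32", 2: "i32"}
hints = {(1,): 1, (2,): 16}  # arg 1 hinted as constant 1, arg 2 as divisible by 16

suffix = ""
for i, ty in enumerate(signature.values()):
    suffix += str(i)
    if hints.get((i,), None) == 1:
        suffix += "c"
    if hints.get((i,), None) == 16:
        suffix += "d"

print(suffix)  # "01c2d": 'c' marks a constexpr-1 argument, 'd' a 16-divisible one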
95 changes: 95 additions & 0 deletions aiter/fused_moe_api.py
@@ -0,0 +1,95 @@
# SPDX-License-Identifier: MIT
# Copyright (c) 2024, Advanced Micro Devices, Inc. All rights reserved.

import torch
import torch.nn.functional as F
import numpy as np
import sys
import os
from typing import Any, Callable, Dict, Optional, Tuple
import aiter
from aiter import logger
from aiter.fused_moe_bf16_asm import asm_moe


def aiter_moe(hidden_states,  # not quant
              w1,  # [expert(local_expert:EP), inter_dim*2, dim] N,K
              w2,  # [expert(local_expert:EP), dim, inter_dim]
              topk_weight, topk_ids,
              # following for int8 quant
              fc1_scale=None,  # [expert(local_expert:EP), inter_dim, 1]
              fc2_scale=None,  # [expert(local_expert:EP), model_dim, 1]
              fc1_smooth_scale=None,  # [expert(local_expert:EP), 1, model_dim]
              fc2_smooth_scale=None,  # [expert(local_expert:EP), 1, inter_dim]
              a16=False,
              acitvation=None,
              per_tensor_quant_scale=None,
              block_shape=None,
              expert_mask=None,
              ):
    useInt4Weight = True if w1.dtype in [torch.int32, torch.uint32] else False
    lastdim_mul = 8 if useInt4Weight else 1
    g1u1 = True if w1.shape[1] == w2.shape[2] * 2 * lastdim_mul else False
    dtype = hidden_states.dtype
    if acitvation is None:
        acitvation = 'silu' if g1u1 else 'gelu'
    assert acitvation in ['silu', 'gelu'], "aiter moe only support silu and gelu activation,\
        by default, 'silu' is used for g1u1 and 'gelu' is used for g1u0"

    if a16 == True:
        assert dtype == torch.bfloat16, "aiter a16 asm_moe only support bfloat16 hidden_states"
        assert w2.shape[2] % 512 == 0 or w2.shape[2] % 320 == 0, "aiter a16 asm_moe only support w2.shape[2] % 512 == 0 or w2.shape[2] % 320 == 0"
        assert (g1u1 and w1.dtype == torch.float8_e4m3fnuz) or (not g1u1 and w1.dtype ==
                                                                torch.int8), "aiter a16 asm_moe only support g1u1 with fp8 or g1u0 with int8"
        assert fc1_smooth_scale is not None and fc2_smooth_scale is not None, "aiter a16 asm_moe need smoothquant(per channel)"
        assert fc1_scale is not None and fc2_scale is not None, "aiter a16 asm_moe need w_scale(per channel)"
        assert per_tensor_quant_scale is None, "aiter a16 asm_moe not support per_tensor_quant_scale"
        return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                       fc1_smooth_scale, fc2_smooth_scale, True, None, expert_mask=expert_mask)

    elif useInt4Weight:
        assert dtype == torch.bfloat16, "aiter a8wint4 asm_moe only support bfloat16 hidden_states"
        assert g1u1, "aiter a8wint4 asm_moe only support g1u1"
        assert fc1_smooth_scale is None and fc2_smooth_scale is None, "aiter a8wint4 asm_moe not support smoothquant"
        return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                       fc1_smooth_scale, fc2_smooth_scale, False, per_tensor_quant_scale, expert_mask=expert_mask, activation=acitvation)

    elif block_shape is not None:
        assert dtype == torch.bfloat16, "aiter moe for block_scale only support bfloat16 hidden_states"
        assert block_shape == (
            128, 128), "aiter moe for block_scale only support (128, 128)"
        assert fc1_smooth_scale is None and fc2_smooth_scale is None, "aiter moe for block_scale not support smoothquant"
        assert per_tensor_quant_scale is None, "aiter moe for block_scale not support per_tensor_quant_scale"
        assert g1u1, "aiter moe for block_scale only support g1u1"
        assert acitvation == 'silu', "aiter moe for block_scale only support silu acitvation"
        return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                       fc1_smooth_scale, fc2_smooth_scale, False, None, block_shape=block_shape, expert_mask=expert_mask)

    elif fc1_smooth_scale is not None and fc2_smooth_scale is not None and w1.dtype in [torch.float8_e4m3fnuz, torch.int8]:
        assert dtype == torch.bfloat16, "aiter asm_moe for smoothquant only support bfloat16 hidden_states"
        if g1u1:
            assert acitvation == 'silu', "aiter asm_moe for g1u1 smoothquant only support silu acitvation"
        else:
            assert acitvation == 'gelu', "aiter asm_moe for g1u0 smoothquant only support gelu acitvation"
        assert g1u1 or (not g1u1 and w1.dtype ==
                        torch.int8), "aiter asm_moe for smoothquant not support g1u0 fp8 smoothquant"
        return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                       fc1_smooth_scale, fc2_smooth_scale, False, per_tensor_quant_scale, expert_mask=expert_mask)

    elif fc1_smooth_scale is None and fc2_smooth_scale is None and w1.dtype in [torch.float8_e4m3fnuz, torch.int8]:
        assert dtype == torch.bfloat16, "aiter asm_moe for fp8/int8 quant only support bfloat16 hidden_states"
        assert g1u1, "aiter asm_moe for fp8/int8 quant only support g1u1"
        assert acitvation == 'silu', "aiter asm_moe for fp8/int8 quant only support silu acitvation"
        return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                       fc1_smooth_scale, fc2_smooth_scale, False, per_tensor_quant_scale, expert_mask=expert_mask)

    elif fc1_scale is None and fc2_scale is None:
        assert fc1_smooth_scale is None and fc2_smooth_scale is None, "aiter moe for no quant not support smoothquant"
        assert per_tensor_quant_scale is None, "aiter moe for no quant not support per_tensor_quant_scale"
        if not g1u1 and acitvation == 'gelu':
            return asm_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                           fc1_smooth_scale, fc2_smooth_scale, False, per_tensor_quant_scale, expert_mask=expert_mask)
        else:
            block_m = 32
            return aiter.ck_moe(hidden_states, w1, w2, topk_weight, topk_ids, fc1_scale, fc2_scale,
                                fc1_smooth_scale, fc2_smooth_scale, block_m, expert_mask, acitvation)
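A minimal usage sketch of the new aiter_moe entry point; the expert count, shapes, dtypes, and device below are illustrative assumptions rather than values from the PR, and it requires a ROCm build of aiter:

# Hypothetical example; shapes and dtypes are assumptions, not from the PR.
import torch
from aiter.fused_moe_api import aiter_moe

E, model_dim, inter_dim, tokens, topk = 8, 4096, 14336, 16, 2

hidden_states = torch.randn(tokens, model_dim, dtype=torch.bfloat16, device="cuda")
w1 = torch.randn(E, inter_dim * 2, model_dim, dtype=torch.bfloat16, device="cuda")  # gate+up (g1u1)
w2 = torch.randn(E, model_dim, inter_dim, dtype=torch.bfloat16, device="cuda")

score = torch.softmax(torch.randn(tokens, E, device="cuda"), dim=-1)
topk_weight, topk_ids = torch.topk(score, topk, dim=-1)

# With no quant scales and a g1u1 weight layout, this falls through to the
# aiter.ck_moe branch with block_m=32 and the default 'silu' activation.
out = aiter_moe(hidden_states, w1, w2, topk_weight, topk_ids)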