
Commit c407f65

jwfromm authored and facebook-github-bot committed
BF16I4 Preshuffled Grouped Gemm (#3917)
Summary:
X-link: facebookresearch/FBGEMM#1006
Pull Request resolved: #3917

This diff adds a preshuffled variant of BF16I4 Grouped Gemm. Notably, cutlass does not currently support zero points for grouped gemm, so this kernel must be used without them. That said, the accuracy of the kernel appears reasonable and the performance is very compelling.

{F1976716898}

Reviewed By: jiawenliu64

Differential Revision: D72337760

fbshipit-source-id: a2cf9e913d095da42f1cf88a5c08dbbe1f2794c9
1 parent 8cbb32c commit c407f65

File tree: 4 files changed (+573, -3 lines)


fbgemm_gpu/experimental/gen_ai/bench/quantize_ops.py (+59, -1)
@@ -1527,7 +1527,6 @@ def preprocess(self, x, w):
         # Convert m_values into offsets into grouped tensor.
         m_sizes = torch.tensor(m_values).to(dtype=torch.int32, device=x[0].device)
         # Quantize weights.
-        # TODO Only rowwise scaling is currently supported. This needs to be fixed.
         wq, scales = zip(*[quantize_int4_preshuffle(i) for i in w])
         group_scale, row_scale = zip(*scales)
         # Group weights as single tensor.
@@ -1573,6 +1572,65 @@ def cuda(self) -> bool:
         return True


+@register_quantize_op
+class BF16I4ShuffledGroupedGemm(QuantizeOpBase):
+    """
+    BF16 x Int4 mixed dtype grouped gemm with preshuffling.
+    """
+
+    def preprocess(self, x, w):
+        assert isinstance(x, list) and isinstance(
+            w, list
+        ), "Only supported for grouped inputs."
+        m_values = [i.shape[0] for i in x]
+        # Convert m_values into offsets into grouped tensor.
+        m_sizes = torch.tensor(m_values).to(dtype=torch.int32, device=x[0].device)
+        # Quantize weights.
+        wq, scales = zip(
+            *[quantize_int4_preshuffle(i, dtype="bf16", use_zp=False) for i in w]
+        )
+        # Group weights as single tensor.
+        group_scale, group_zero = zip(*scales)
+        wq = torch.stack(wq, dim=0).contiguous()
+        group_scale = torch.stack(group_scale, dim=0).contiguous()
+        group_zero = torch.stack(group_zero, dim=0).contiguous()
+        # Also view input as flattened.
+        x = torch.concat(x, dim=0).contiguous()
+        # Return processed tensors.
+        return x, wq, group_scale, group_zero, m_sizes
+
+    def quantize(self, x, wq, group_scale, group_zero, m_sizes):
+        return x, wq, group_scale, group_zero, m_sizes
+
+    def compute(self, x, wq, group_scale, group_zero, m_sizes):
+        # TODO: Zero points aren't currently supported in grouped gemm.
+        # We leave them as inputs for future compatibility, but they are ignored.
+        return torch.ops.fbgemm.bf16i4bf16_shuffled_grouped(
+            x, wq, group_scale, group_zero, m_sizes
+        )
+
+    def quantize_and_compute(self, x, wq, group_scale, group_zero, m_sizes):
+        x, wq, group_scale, group_zero, m_sizes = self.quantize(
+            x, wq, group_scale, group_zero, m_sizes
+        )
+        return self.compute(x, wq, group_scale, group_zero, m_sizes)
+
+    @property
+    def name(self) -> str:
+        if torch.version.cuda:
+            return "cutlass_bf16i4_grouped_preshuffle"
+        else:
+            return "ck_bf16i4_grouped_preshuffle"
+
+    @property
+    def hip(self) -> bool:
+        return False
+
+    @property
+    def cuda(self) -> bool:
+        return True
+
+
 @register_quantize_op
 class BF16GroupedStacked(QuantizeOpBase):
     """

fbgemm_gpu/experimental/gen_ai/gen_ai/quantize.py (+7, -2)
@@ -91,7 +91,7 @@ def int4_row_quantize(


 def quantize_int4_preshuffle(
-    w: torch.Tensor, group_size: int = 128, dtype: str = "fp8"
+    w: torch.Tensor, group_size: int = 128, dtype: str = "fp8", use_zp: bool = True
 ) -> Tuple[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
     """
     Quantizes an input weight tensor to int4 using preshuffling and scale packing.
@@ -102,6 +102,7 @@ def quantize_int4_preshuffle(
         w (Tensor): [N, K] Higher precision weight tensor to quantize. May optionally have a batch dimension.
         group_size (int): Number of elements to calculate group scale for, must be at least 128.
         dtype (torch.dtype): Type of corresponding activations. Must be fp8 or bf16.
+        use_zp (bool): If true, uses zero points during weight quantization. Only relevant for bf16 currently.
     Returns:
         wq (Tensor): [N, K // 2] Quantized int4 weight tensor packed into int8 elements.
         scales (Tuple[Tensor]): Scale tensors for the specified activation type. When FP8 is used,
@@ -128,7 +129,11 @@ def _quantize(
             return wq, (group_scale, row_scale)

         elif dtype == "bf16":
-            wq, group_scale, group_zero = int4_row_quantize_zp(w, group_size)
+            if use_zp:
+                wq, group_scale, group_zero = int4_row_quantize_zp(w, group_size)
+            else:
+                wq, group_scale = int4_row_quantize(w, group_size)
+                group_zero = torch.zeros_like(group_scale)
             # Set scales to activation type.
             group_scale = group_scale.to(torch.bfloat16)
             group_zero = group_zero.to(torch.bfloat16)
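
As a quick illustration of what the new flag changes (a sketch only; the import path is assumed from the file layout above and the shapes are arbitrary): with use_zp=False the bf16 branch falls back to symmetric int4_row_quantize and returns zero points that are all zeros, which is the form the grouped kernel expects.

# Sketch of the use_zp flag's effect; assumes a CUDA build and this import path.
import torch
from fbgemm_gpu.experimental.gen_ai.quantize import quantize_int4_preshuffle

w = torch.randn(4096, 4096, device="cuda", dtype=torch.bfloat16)

# Default path: asymmetric row quantization with real zero points (int4_row_quantize_zp).
wq_zp, (scale_zp, zero_zp) = quantize_int4_preshuffle(w, dtype="bf16")

# New path: symmetric row quantization (int4_row_quantize); zero points are filled
# with zeros so the returned tuple keeps the same structure.
wq_sym, (scale_sym, zero_sym) = quantize_int4_preshuffle(w, dtype="bf16", use_zp=False)
assert torch.count_nonzero(zero_sym) == 0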
