From 2783c28074714ea88851878e7ecdcb154464755b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:01:33 +0800 Subject: [PATCH 01/25] Create static_quant.py for ds --- examples/ds/static_quant.py | 140 ++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 examples/ds/static_quant.py diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py new file mode 100644 index 00000000000..c7077b0680e --- /dev/null +++ b/examples/ds/static_quant.py @@ -0,0 +1,140 @@ +""" +# Prerequisite +pip install -r requirements.txt + +# Note for static/dynamic W8FP8 quantization: +1. Name convention: + - weight scale name: "prefix.scale_weight" + - input scale name: "prefix.scale_input" +2. A json file mapping from tensor name to safetensor file name. + +Example: +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5, bias=False) + + def forward(self, inp): + x1 = self.fc1(inp) + return x1 + +1. state dict +{ + "fc1.weight": torch.Tensor(...), + "fc1.scale_weight": torch.Tensor(...), + "fc1.scale_input": torch.Tensor(...), +} + +2. json file, model.safetensors.index.json +{ + "fc1.weight": "qmodel.safetensors", + "fc1.scale_weight": "qmodel.safetensors", + "fc1.scale_input": "qmodel.safetensors" +} + +""" + +import os +import torch +import tqdm +from loguru import logger +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "scale_weight" +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 0.5 +""" +# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options +Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. +""" +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +def get_all_weight_filename(model_path): + all_files = os.listdir(model_path) + all_weight_filename = [] + for file in all_files: + if file.endswith(f".{SAFETENSORS}"): + all_weight_filename.append(file) + return all_weight_filename + + +# from _fp8_quant/_core/fp_utils.py +def calc_maxabs_scale(xmaxabs, fullscale, backoff=1): + scale = xmaxabs / (fullscale * backoff) + return scale + + +def quant_tensor(tensor): + # Note: + # 1. Check the scale dtype + # 2. 
Check the scale shape + amax = tensor.abs().max() + scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF) + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +def _maybe_create_dir(qmodel_path): + if not os.path.exists(qmodel_path): + os.makedirs(qmodel_path) + + +def static_quant_model(model_path, qmodel_path): + _maybe_create_dir(qmodel_path) + all_weight_filename = get_all_weight_filename(model_path) + logger.info(f"Got {len(all_weight_filename)} weight files") + qtensor_mappping = {} + for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): + if i >= 2: + break + file_path = os.path.join(model_path, filename) + qmodel_file_name = filename + qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) + qtensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for weight_name in f.keys(): + weight = f.get_tensor(weight_name) + scale, qtensor = quant_tensor(weight) + preifx_name = weight_name[: -len(".weight")] + scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" + qtensors[scale_name] = scale + qtensors[weight_name] = qtensor + qtensor_mappping[scale_name] = qmodel_file_name + qtensor_mappping[weight_name] = qmodel_file_name + logger.debug(f"Saving {len(qtensors)} tensors to {qmodel_file_path}") + save_file(qtensors, os.path.join(qmodel_path, qmodel_file_path)) + # Dump tensor mapping into json file + model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) + logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}") + with open(model_state_dict_mapping_file_path, "w") as f: + json.dump(qtensor_mappping, f, indent=4) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--qmodel_path", type=str, required=True) + args = parser.parse_args() + static_quant_model(args.model_path, args.qmodel_path) + +""" +model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" +model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" +qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" +static_quant_model(model_path, qmodel_path) +python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ + +""" From b5dc860b819bc53deb7d06c21e75158186f6a92c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:03:03 +0800 Subject: [PATCH 02/25] Update static_quant.py --- examples/ds/static_quant.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py index c7077b0680e..01a9a11ade9 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/static_quant.py @@ -96,8 +96,6 @@ def static_quant_model(model_path, qmodel_path): logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): - if i >= 2: - break file_path = os.path.join(model_path, filename) qmodel_file_name = filename qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) From 71d395d32c3490994923f02a8578d25d557e6260 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:04:36 +0800 Subject: [PATCH 03/25] add requirements.txt --- examples/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 
100644 examples/requirements.txt diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 00000000000..6b3691129b8 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,4 @@ +loguru +torch +safetensors +tqdm From 6415795221f4c12e82ba4bcb3f6fa6f3a87ac6d2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:04:57 +0800 Subject: [PATCH 04/25] Rename examples/requirements.txt to examples/ds/requirements.txt --- examples/{ => ds}/requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{ => ds}/requirements.txt (100%) diff --git a/examples/requirements.txt b/examples/ds/requirements.txt similarity index 100% rename from examples/requirements.txt rename to examples/ds/requirements.txt From cdc1fa085b9b03666fc2a7eff38d6cc9c317e00a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 08:29:47 +0200 Subject: [PATCH 05/25] use transformers Change-Id: Iee204311d566d528dd44eff0c227db45224ee242 Signed-off-by: Yi Liu --- examples/ds/requirements.txt | 1 + examples/ds/static_quant.py | 37 ++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/examples/ds/requirements.txt b/examples/ds/requirements.txt index 6b3691129b8..63026a2db80 100644 --- a/examples/ds/requirements.txt +++ b/examples/ds/requirements.txt @@ -2,3 +2,4 @@ loguru torch safetensors tqdm +transformers \ No newline at end of file diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py index 01a9a11ade9..8468dd88290 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/static_quant.py @@ -38,19 +38,23 @@ def forward(self, inp): import torch import tqdm from loguru import logger +import logging import safetensors from safetensors import safe_open from safetensors.torch import save_file import json +logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) # CONSTANTS SAFETENSORS = "safetensors" WEIGHT_SCALE_NAME = "scale_weight" +INPUT_SCALE_NAME = "scale_input" SCALE_FILE_NAME = f"scales.{SAFETENSORS}" FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 +QUANT_MODULE_TYPES = (torch.nn.Linear,) """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. 
@@ -90,7 +94,9 @@ def _maybe_create_dir(qmodel_path): os.makedirs(qmodel_path) -def static_quant_model(model_path, qmodel_path): +def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): + # FIXME: need to skip some layers like embedding + logger.warning("It will quantize all weight tensors") _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) logger.info(f"Got {len(all_weight_filename)} weight files") @@ -119,14 +125,40 @@ def static_quant_model(model_path, qmodel_path): json.dump(qtensor_mappping, f, indent=4) +@torch.no_grad() +def static_quant_model_tran(model_path, qmodel_path): + import transformers + from transformers.modeling_utils import no_init_weights + with no_init_weights(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + for name, module in model.named_modules(): + if isinstance(module, QUANT_MODULE_TYPES): + logger.debug(f"Processing {name}") + weight = module.weight + scale, qtensor = quant_tensor(weight) + module.weight.data = qtensor + setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) + logger.info(f"Saving quantized model to {qmodel_path}") + model.save_pretrained(qmodel_path) + + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") args = parser.parse_args() - static_quant_model(args.model_path, args.qmodel_path) + if args.low_cpu_mem: + static_quant_model_for_low_cpu_usage(args.model_path, args.qmodel_path) + else: + static_quant_model_tran(args.model_path, args.qmodel_path) """ model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" @@ -134,5 +166,6 @@ def static_quant_model(model_path, qmodel_path): qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" static_quant_model(model_path, qmodel_path) python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ +python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ """ From 3f16755423cfafabd11a3bccdc87ceffcfbaf61b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:28:18 +0200 Subject: [PATCH 06/25] add readme Change-Id: Id248e2b9cd2dc40af9f37320e813ea9eabcb8c60 Signed-off-by: Yi Liu --- examples/ds/README.md | 52 +++++++++ examples/ds/patch_for_ds.py | 132 ++++++++++++++++++++++ examples/ds/{static_quant.py => quant.py} | 109 +++++++++--------- examples/ds/requirements.txt | 3 +- 4 files changed, 237 insertions(+), 59 deletions(-) create mode 100644 examples/ds/README.md create mode 100644 examples/ds/patch_for_ds.py rename examples/ds/{static_quant.py => quant.py} (61%) diff --git a/examples/ds/README.md b/examples/ds/README.md new file mode 100644 index 00000000000..dfabe0473a9 --- /dev/null +++ b/examples/ds/README.md @@ -0,0 +1,52 @@ +Note for static quantize DeepSeek model + +## Prerequisite +``` +pip install -r requirements.txt +``` + +## Usage + +- Option 1 (Rec) +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 +``` + +- Option 2 handle weights only (If the DRAM size 
is less than ~700 GB) +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem +``` + +## Example +1. Name convention: + - weight scale name: "prefix.scale_weight" + - input scale name: "prefix.scale_input" (for static only) +2. A json file mapping from tensor name to safetensor file name. + +```python +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5, bias=False) + + def forward(self, inp): + x1 = self.fc1(inp) + return x1 +``` + +```bash +1. state dict +{ + "fc1.weight": torch.Tensor(...), + "fc1.scale_weight": torch.Tensor(...), + "fc1.scale_input": torch.Tensor(...), +} + +2. json file, model.safetensors.index.json +{ + "fc1.weight": "qmodel.safetensors", + "fc1.scale_weight": "qmodel.safetensors", + "fc1.scale_input": "qmodel.safetensors" +} +``` + diff --git a/examples/ds/patch_for_ds.py b/examples/ds/patch_for_ds.py new file mode 100644 index 00000000000..9375ac6d496 --- /dev/null +++ b/examples/ds/patch_for_ds.py @@ -0,0 +1,132 @@ +# ==--------------------------------------------------------------------------== +# Patch for loading DS models +from typing import Union, Optional +import torch +import os +from packaging import version +from zipfile import is_zipfile +from transformers.utils import is_safetensors_available, strtobool +from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import load_file as safe_load_file + from safetensors.torch import save_file as safe_save_file + + +def is_fsdp_enabled(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1 + and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 + ) + + +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], + is_quantized: bool = False, + map_location: Optional[Union[str, torch.device]] = None, + weights_only: bool = True, +): + """ + Reads a PyTorch checkpoint file, returning properly formatted errors if they arise. + """ + + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + # Check format of the archive + with safe_open(checkpoint_file, framework="pt") as f: + metadata = f.metadata() + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + return safe_load_file(checkpoint_file) + try: + if map_location is None: + if ( + ( + is_deepspeed_zero3_enabled() + and torch.distributed.is_initialized() + and torch.distributed.get_rank() > 0 + ) + or (is_fsdp_enabled() and not is_local_dist_rank_0()) + ) and not is_quantized: + map_location = "meta" + else: + map_location = "cpu" + extra_args = {} + # mmap can only be used with files serialized with zipfile-based format. 
+ if ( + isinstance(checkpoint_file, str) + and map_location != "meta" + and version.parse(torch.__version__) >= version.parse("2.1.0") + and is_zipfile(checkpoint_file) + ): + extra_args = {"mmap": True} + weights_only_kwarg = {"weights_only": weights_only} + return torch.load( + checkpoint_file, + map_location=map_location, + **weights_only_kwarg, + **extra_args, + ) + except Exception as e: + try: + with open(checkpoint_file) as f: + if f.read(7) == "version": + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " + "model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' " + f"at '{checkpoint_file}'. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." + ) + + +def set_initialized_submodules(model, state_dict_keys): + """ + Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state + dict. + """ + state_dict_keys = set(state_dict_keys) + not_initialized_submodules = {} + for module_name, module in model.named_modules(): + if module_name == "": + # When checking if the root module is loaded there's no need to prepend module_name. + module_keys = set(module.state_dict()) + else: + module_keys = {f"{module_name}.{k}" for k in module.state_dict()} + if module_keys.issubset(state_dict_keys): + module._is_hf_initialized = True + else: + not_initialized_submodules[module_name] = module + return not_initialized_submodules + + +# ==--------------------------------------------------------------------------== + + +def patch_transformers(): + import transformers + + transformers.modeling_utils.load_state_dict = load_state_dict + transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules diff --git a/examples/ds/static_quant.py b/examples/ds/quant.py similarity index 61% rename from examples/ds/static_quant.py rename to examples/ds/quant.py index 8468dd88290..9eb321d6fa8 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/quant.py @@ -1,39 +1,3 @@ -""" -# Prerequisite -pip install -r requirements.txt - -# Note for static/dynamic W8FP8 quantization: -1. Name convention: - - weight scale name: "prefix.scale_weight" - - input scale name: "prefix.scale_input" -2. A json file mapping from tensor name to safetensor file name. - -Example: -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5, bias=False) - - def forward(self, inp): - x1 = self.fc1(inp) - return x1 - -1. state dict -{ - "fc1.weight": torch.Tensor(...), - "fc1.scale_weight": torch.Tensor(...), - "fc1.scale_input": torch.Tensor(...), -} - -2. 
json file, model.safetensors.index.json -{ - "fc1.weight": "qmodel.safetensors", - "fc1.scale_weight": "qmodel.safetensors", - "fc1.scale_input": "qmodel.safetensors" -} - -""" - import os import torch import tqdm @@ -51,10 +15,12 @@ def forward(self, inp): SAFETENSORS = "safetensors" WEIGHT_SCALE_NAME = "scale_weight" INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 SCALE_FILE_NAME = f"scales.{SAFETENSORS}" FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = {"model.norm", "layernorm", "e_score_correction_bias", "lm_head.weight", "embed_tokens"} """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. @@ -62,6 +28,14 @@ def forward(self, inp): MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" +def skip_weight(weight_name): + return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + +def get_cpu_mem_size_in_gb(): + import psutil + mem = psutil.virtual_memory() + return mem.available + def get_all_weight_filename(model_path): all_files = os.listdir(model_path) all_weight_filename = [] @@ -83,25 +57,25 @@ def quant_tensor(tensor): # 2. Check the scale shape amax = tensor.abs().max() scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF) + scale = scale.to(SCALE_DTYPE) qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) return scale, cliped_qtensor_fp8 - def _maybe_create_dir(qmodel_path): if not os.path.exists(qmodel_path): os.makedirs(qmodel_path) -def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): +def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # FIXME: need to skip some layers like embedding - logger.warning("It will quantize all weight tensors") _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} - for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): + for i, filename in enumerate(all_weight_filename): + logger.info(f"Processing {i + 1}/{len(all_weight_filename)}: {filename}") file_path = os.path.join(model_path, filename) qmodel_file_name = filename qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) @@ -109,6 +83,12 @@ def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): with safe_open(file_path, framework="pt", device="cpu") as f: for weight_name in f.keys(): weight = f.get_tensor(weight_name) + if skip_weight(weight_name): + logger.debug(f"Skiping quantize {weight_name}") + qtensors[weight_name] = weight + qtensor_mappping[weight_name] = qmodel_file_name + continue + logger.debug(f"Processing {weight_name}" scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" @@ -125,38 +105,50 @@ def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): json.dump(qtensor_mappping, f, indent=4) +def _import_oh(): + import transformers + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + orig_check_support_param_buffer_assignment = transformers.modeling_utils.check_support_param_buffer_assignment + 
adapt_transformers_to_gaudi() + transformers.modeling_utils.check_support_param_buffer_assignment = orig_check_support_param_buffer_assignment + + @torch.no_grad() def static_quant_model_tran(model_path, qmodel_path): + # assert get_cpu_mem_size_in_gb(800), "Not enough memory, please use quant_model_weight_with_low_cpu_usage" import transformers - from transformers.modeling_utils import no_init_weights - with no_init_weights(): - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) + from patch_for_ds import patch_transformers + # import_oh() + patch_transformers() + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) for name, module in model.named_modules(): - if isinstance(module, QUANT_MODULE_TYPES): - logger.debug(f"Processing {name}") - weight = module.weight - scale, qtensor = quant_tensor(weight) - module.weight.data = qtensor - setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) + if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name): + logger.debug(f"Skiping quantize {name}") + continue + logger.debug(f"Processing {name}") + weight = module.weight + scale, qtensor = quant_tensor(weight) + module.weight.data = qtensor + setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) logger.info(f"Saving quantized model to {qmodel_path}") model.save_pretrained(qmodel_path) if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") args = parser.parse_args() if args.low_cpu_mem: - static_quant_model_for_low_cpu_usage(args.model_path, args.qmodel_path) + quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path) else: static_quant_model_tran(args.model_path, args.qmodel_path) @@ -165,7 +157,8 @@ def static_quant_model_tran(model_path, qmodel_path): model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" static_quant_model(model_path, qmodel_path) -python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ -python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q2/ --low_cpu_mem +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ """ diff --git a/examples/ds/requirements.txt b/examples/ds/requirements.txt index 63026a2db80..c224c095b1b 100644 --- a/examples/ds/requirements.txt +++ b/examples/ds/requirements.txt @@ -2,4 +2,5 @@ loguru torch safetensors tqdm -transformers \ No newline at end of file +transformers +psutil \ No newline at end of file From 
fbd0593069b8d12e98157d379135469d39d41595 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:30:47 +0200 Subject: [PATCH 07/25] update docs Change-Id: I2ae9ef7b8854b8abff5ea863bda9d567bd68ccb6 Signed-off-by: Yi Liu --- examples/ds/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index dfabe0473a9..4bc3b305f49 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -19,8 +19,8 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ## Example 1. Name convention: - - weight scale name: "prefix.scale_weight" - - input scale name: "prefix.scale_input" (for static only) + - weight scale name: `prefix.scale_weight` + - input scale name: `prefix.scale_input` (for static only) 2. A json file mapping from tensor name to safetensor file name. ```python @@ -42,7 +42,7 @@ class M(torch.nn.Module): "fc1.scale_input": torch.Tensor(...), } -2. json file, model.safetensors.index.json +2. json file, `model.safetensors.index.json` { "fc1.weight": "qmodel.safetensors", "fc1.scale_weight": "qmodel.safetensors", From 775d47ed172d8633b536cbbc75e86b011421e04f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:41:23 +0200 Subject: [PATCH 08/25] fix Change-Id: I158dc3d7dd541bed1b138e89266d0539229da7fe Signed-off-by: Yi Liu --- examples/ds/README.md | 4 ++-- examples/ds/quant.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 4bc3b305f49..a98e5d59e39 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -7,12 +7,12 @@ pip install -r requirements.txt ## Usage -- Option 1 (Rec) +- Option 1. (Rec) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -- Option 2 handle weights only (If the DRAM size is less than ~700 GB) +- Option 2. handle weights only (If the DRAM size is less than ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 9eb321d6fa8..bd9b3fde6e7 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -88,7 +88,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue - logger.debug(f"Processing {weight_name}" + logger.debug(f"Processing {weight_name}") scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" From fe232824231506cdbb3f203e192c7f01db984618 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:51:31 +0200 Subject: [PATCH 09/25] update noote Change-Id: Ie818584925c5b4ba23d1a541d6d987e4fffc5aaf Signed-off-by: Yi Liu --- examples/ds/README.md | 6 ++++++ examples/ds/quant.py | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index a98e5d59e39..cde5ac4fadb 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -17,6 +17,12 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` +> [!NOTE] +> - Skip quantize `lm-head`. +> - `WEIGHT_BACKOFF = 0.5` +> - `SCALE_DTYPE = torch.bfloat16` + + ## Example 1. 
Name convention: - weight scale name: `prefix.scale_weight` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index bd9b3fde6e7..d5a2062c24e 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -72,6 +72,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # FIXME: need to skip some layers like embedding _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) + files_cnt = len(all_weight_filename) logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} for i, filename in enumerate(all_weight_filename): @@ -88,7 +89,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue - logger.debug(f"Processing {weight_name}") + logger.debug(f"[{i+1}/{files_cnt}] Processing {weight_name}") scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" @@ -96,7 +97,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = qtensor qtensor_mappping[scale_name] = qmodel_file_name qtensor_mappping[weight_name] = qmodel_file_name - logger.debug(f"Saving {len(qtensors)} tensors to {qmodel_file_path}") + logger.debug(f"[{i+1}/{files_cnt}] Saving {len(qtensors)} tensors to {qmodel_file_path}") save_file(qtensors, os.path.join(qmodel_path, qmodel_file_path)) # Dump tensor mapping into json file model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) From 6be43ca01509d86079b3c3f119c0ca0a5d8fc7b1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:59:48 +0200 Subject: [PATCH 10/25] clean code Change-Id: I62f7ace7d647ac76918a6e4f13cdbfe8f0d90608 Signed-off-by: Yi Liu --- examples/ds/quant.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index d5a2062c24e..a9040e6dd3c 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -31,11 +31,14 @@ def skip_weight(weight_name): return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + def get_cpu_mem_size_in_gb(): import psutil + mem = psutil.virtual_memory() return mem.available + def get_all_weight_filename(model_path): all_files = os.listdir(model_path) all_weight_filename = [] @@ -63,6 +66,7 @@ def quant_tensor(tensor): cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) return scale, cliped_qtensor_fp8 + def _maybe_create_dir(qmodel_path): if not os.path.exists(qmodel_path): os.makedirs(qmodel_path) @@ -120,6 +124,7 @@ def static_quant_model_tran(model_path, qmodel_path): # assert get_cpu_mem_size_in_gb(800), "Not enough memory, please use quant_model_weight_with_low_cpu_usage" import transformers from patch_for_ds import patch_transformers + # import_oh() patch_transformers() model = transformers.AutoModelForCausalLM.from_pretrained( @@ -143,6 +148,7 @@ def static_quant_model_tran(model_path, qmodel_path): if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) @@ -152,14 +158,3 @@ def static_quant_model_tran(model_path, qmodel_path): quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path) else: static_quant_model_tran(args.model_path, args.qmodel_path) - -""" -model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" 
-model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" -qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" -static_quant_model(model_path, qmodel_path) -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q2/ --low_cpu_mem -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ - -""" From 144139c0dc7143c29e8a785c77472fe74bc5c860 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 11:32:30 +0200 Subject: [PATCH 11/25] update docs Change-Id: Ic79e2a40cc758a7715dfa45dc0cc75c30fdf231b Signed-off-by: Yi Liu --- examples/ds/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index cde5ac4fadb..d34bec78b56 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -7,14 +7,15 @@ pip install -r requirements.txt ## Usage -- Option 1. (Rec) +- Option 1 (Rec). handle weights only +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem + +- Option 2. Loading model using transformers (Requires DRAM > ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -- Option 2. handle weights only (If the DRAM size is less than ~700 GB) -```bash -python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` > [!NOTE] From e2cdd4729a47a740aa2cb184e90c9b8e499da923 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 11:33:30 +0200 Subject: [PATCH 12/25] update docs Change-Id: I1c719a6895e15688033c3de84a8ad333a15930ec Signed-off-by: Yi Liu --- examples/ds/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index d34bec78b56..bbfacd76ed4 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -10,14 +10,13 @@ pip install -r requirements.txt - Option 1 (Rec). handle weights only ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem +``` - Option 2. Loading model using transformers (Requires DRAM > ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -``` - > [!NOTE] > - Skip quantize `lm-head`. 
> - `WEIGHT_BACKOFF = 0.5` From 314bf10a32d4cc89d283ca34670bb7af4d5a5fc0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:04:02 +0200 Subject: [PATCH 13/25] add mlp.gate.weight Change-Id: I84acff14c99cb8b233aca0d003598d8fa5757f2e Signed-off-by: Yi Liu --- examples/ds/quant.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index a9040e6dd3c..f255de928d6 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -20,7 +20,14 @@ FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = {"model.norm", "layernorm", "e_score_correction_bias", "lm_head.weight", "embed_tokens"} +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. From 2c59fc54c8110b17f141a9ed34b582e30e32a0e0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:28:17 +0200 Subject: [PATCH 14/25] update the docs Change-Id: I236b189afb1e163d2bff1ab2d59d92cb601507c8 Signed-off-by: Yi Liu --- examples/ds/README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index bbfacd76ed4..78de448f726 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -1,29 +1,43 @@ -Note for static quantize DeepSeek model +# Note for static quantize DeepSeek model ## Prerequisite + ``` pip install -r requirements.txt ``` ## Usage -- Option 1 (Rec). handle weights only +### Step 1. quantize model weights + +- Option 1 (Recommended): Quantize weights directly + ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` -- Option 2. Loading model using transformers (Requires DRAM > ~700 GB) +- Option 2. Load the model using transformers (requires ~700 GB of DRAM) + ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` > [!NOTE] +> > - Skip quantize `lm-head`. > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` +### Step 2. copy model files for inference + +Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manually copy some model files. + +```bash +python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 +``` ## Example + 1. 
Name convention: - weight scale name: `prefix.scale_weight` - input scale name: `prefix.scale_input` (for static only) @@ -55,4 +69,3 @@ class M(torch.nn.Module): "fc1.scale_input": "qmodel.safetensors" } ``` - From 4800b2b21e2fa80c46c74b417445999adb5b785f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:30:52 +0200 Subject: [PATCH 15/25] add post process Change-Id: I501e7f14fdb35eeaad1571fb4b679f65ceb7cab5 Signed-off-by: Yi Liu --- examples/ds/README.md | 4 +- examples/ds/post_process.py | 103 ++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 examples/ds/post_process.py diff --git a/examples/ds/README.md b/examples/ds/README.md index 78de448f726..061673c2dab 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -16,7 +16,7 @@ pip install -r requirements.txt python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` -- Option 2. Load the model using transformers (requires ~700 GB of DRAM) +- Option 2: Load the model using transformers (requires ~700 GB of DRAM) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 @@ -36,7 +36,7 @@ Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manu python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -## Example +## More details 1. Name convention: - weight scale name: `prefix.scale_weight` diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py new file mode 100644 index 00000000000..fd1b11ade51 --- /dev/null +++ b/examples/ds/post_process.py @@ -0,0 +1,103 @@ +import json +from loguru import logger + +quantization_config = { + "_json_file": "/tmp/tmpe3ckugb_.json", + "allowlist": { + "names": [], + "types": [ + "Matmul", + "Linear", + "ParallelLMHead", + "RowParallelLinear", + "ColumnParallelLinear", + "MergedColumnParallelLinear", + "QKVParallelLinear", + "FalconLinear", + "KVCache", + "VLLMKVCache", + "Conv2d", + "LoRACompatibleLinear", + "LoRACompatibleConv", + "Softmax", + "ModuleFusedSDPA", + "MoeMatmul", + "ReplicatedLinear", + "FusedMoE", + "GaudiMixtralSparseMoeBlock", + "VllmMixtureOfExpertsOp", + "LinearLayer", + "LinearAllreduce", + "ScopedLinearAllReduce", + "LmHeadLinearAllreduce", + ], + }, + "blocklist": {}, + "dump_stats_path": "./hqt_output/measure", + "fake_quant": "False", + "fp8_config": "E4M3", + "hp_dtype": "bf16", + "measure_on_hpu": True, + "mod_dict": {}, + "mode": "LOAD", + "observer": "maxabs", + "scale_format": "const", + "scale_method": "maxabs_pow2_dynamic", + "scale_params": {}, + "use_qdq": "False", +} + + +# add the quantization config to config.json +def update_config(model_path, qmodel_path): + import json + import os + + # open config + with open(os.path.join(model_path, "config.json"), "r") as f: + config = json.load(f) + config["quantization_config"] = quantization_config + # save new config to qmodel_path + logger.info(f"Updated config: {config}") + logger.debug(f"Saving config to {qmodel_path}") + + with open(os.path.join(qmodel_path, "config.json"), "w") as f: + json.dump(config, f, indent=4) + + +MODEL_FILE_LST = [ + "configuration_deepseek.py", + "generation_config.json", + "modeling_deepseek.py", + "tokenizer.json", + "tokenizer_config.json", +] + + +def cp_model_files(model_path, qmodel_path): + # copy model files + import shutil + import os + + for file in MODEL_FILE_LST: + logger.debug(f"Copying {file} 
from {model_path} to {qmodel_path}") + file_path = os.path.join(model_path, file) + # check if file exists + if not os.path.exists(file_path): + logger.error(f"File {file_path} does not exist") + raise FileNotFoundError(f"File {file_path} does not exist") + shutil.copy(os.path.join(model_path, file), qmodel_path) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") + args = parser.parse_args() + # update the config + update_config(args.model_path, args.qmodel_path) + # copy model files + cp_model_files(args.model_path, args.qmodel_path) From 558b734de1581027b9397a93377bd807ae1041b3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 16:17:14 +0200 Subject: [PATCH 16/25] quant lm-head Change-Id: I086fe5228c55526630a2e480d7532e2727884d36 Signed-off-by: Yi Liu --- examples/ds/README.md | 2 -- examples/ds/quant.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 061673c2dab..41e63232813 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -23,8 +23,6 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ``` > [!NOTE] -> -> - Skip quantize `lm-head`. > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index f255de928d6..90cbade498f 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -24,7 +24,7 @@ "model.norm", "layernorm", "e_score_correction_bias", - "lm_head.weight", + # "lm_head.weight", "embed_tokens", "mlp.gate.weight", # mlp.gate is not linear } @@ -80,7 +80,6 @@ def _maybe_create_dir(qmodel_path): def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): - # FIXME: need to skip some layers like embedding _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) files_cnt = len(all_weight_filename) From c0b06329ba12fb5e407775c6fe099d14901fd91d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:14:42 +0000 Subject: [PATCH 17/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ds/quant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 90cbade498f..6c82c80d78c 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -95,7 +95,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): for weight_name in f.keys(): weight = f.get_tensor(weight_name) if skip_weight(weight_name): - logger.debug(f"Skiping quantize {weight_name}") + logger.debug(f"Skipping quantize {weight_name}") qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue @@ -141,7 +141,7 @@ def static_quant_model_tran(model_path, qmodel_path): ) for name, module in model.named_modules(): if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name): - logger.debug(f"Skiping quantize {name}") + logger.debug(f"Skipping quantize {name}") continue logger.debug(f"Processing {name}") weight = module.weight From 314038046f837bdf52084c5e989e3489450e7444 Mon Sep 17 00:00:00 2001 From: "Huang, Tai" Date: Tue, 11 Feb 2025 23:56:26 +0800 Subject: [PATCH 
18/25] Update README.md minor updates --- examples/ds/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 41e63232813..c8af4e5df13 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -1,4 +1,4 @@ -# Note for static quantize DeepSeek model +# Note for quantize DeepSeek model ## Prerequisite @@ -28,7 +28,7 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ### Step 2. copy model files for inference -Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manually copy some model files. +Since DeepSeek V3 and R1 are not yet supported by Transformers, we need to manually copy some model files. ```bash python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 From 6a4c67aa4de1f6e37c3fe5efa5471707552a73f2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 03:17:02 +0200 Subject: [PATCH 19/25] update thequant config Change-Id: I57753a1e6f87c887ff9d6f802854b83417d7d862 Signed-off-by: Yi Liu --- examples/ds/post_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index fd1b11ade51..f9e35c0b0ad 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -42,7 +42,7 @@ "mode": "LOAD", "observer": "maxabs", "scale_format": "const", - "scale_method": "maxabs_pow2_dynamic", + "scale_method": "maxabs_hw", "scale_params": {}, "use_qdq": "False", } From a958a339fe6a78729fbddfb50420dfe5aff42028 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 03:49:43 +0200 Subject: [PATCH 20/25] add weight_map Change-Id: I82fe09fb33ef5fc48139874ec9fb9aebd178a459 Signed-off-by: Yi Liu --- examples/ds/post_process.py | 8 +++----- examples/ds/quant.py | 6 +++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index f9e35c0b0ad..72101c5c57e 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -53,15 +53,13 @@ def update_config(model_path, qmodel_path): import json import os - # open config with open(os.path.join(model_path, "config.json"), "r") as f: config = json.load(f) config["quantization_config"] = quantization_config - # save new config to qmodel_path logger.info(f"Updated config: {config}") - logger.debug(f"Saving config to {qmodel_path}") - - with open(os.path.join(qmodel_path, "config.json"), "w") as f: + config_filepath = os.path.join(qmodel_path, "config.json") + logger.debug(f"Saving config to {config_filepath}") + with open(config_filepath, "w") as f: json.dump(config, f, indent=4) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 6c82c80d78c..615b6bf33bf 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -112,8 +112,12 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # Dump tensor mapping into json file model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}") + state_dict_mapping = { + "metadata":{}, + "weight_map": qtensor_mappping, + } with open(model_state_dict_mapping_file_path, "w") as f: - json.dump(qtensor_mappping, f, indent=4) + json.dump(state_dict_mapping, f, indent=4) def _import_oh(): From 54a88b7e04495474bb4fdd79065bdc20d83d06af Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 04:50:49 +0200 Subject: [PATCH 
21/25] revert scale_method Change-Id: I8e83e0c342d27ded41036d1e9349f30f832ecf2a Signed-off-by: Yi Liu --- examples/ds/README.md | 1 + examples/ds/post_process.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index c8af4e5df13..45df4738c7d 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -23,6 +23,7 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ``` > [!NOTE] +> - weight dtype is `torch.float8_e4m3fn` (full range is `-448` to `448`) > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index 72101c5c57e..a9fe929fbeb 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -42,7 +42,7 @@ "mode": "LOAD", "observer": "maxabs", "scale_format": "const", - "scale_method": "maxabs_hw", + "scale_method": "maxabs_pow2_dynamic", "scale_params": {}, "use_qdq": "False", } From 45a2c1af3d1183eceff95caf08671cfd4f377d51 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 17:43:10 +0800 Subject: [PATCH 22/25] Add qdq eval (#2121) * add eval Change-Id: I7ce64ede965976dd79e979aace82f4d251cc6803 Signed-off-by: Yi Liu * fix Change-Id: I72305d9d6ef6e3588bc8361f62baeeca06f42848 Signed-off-by: Yi Liu * add float model Change-Id: Ia46444d77d349b1a976e6d7031d06bb621d6d7e4 Signed-off-by: Yi Liu * add prompt Change-Id: Ie7b35f45d8f67a655dc9fb06eda824eb8a7f56c1 Signed-off-by: Yi Liu --------- Signed-off-by: Yi Liu Co-authored-by: Yi Liu --- examples/ds/eval.py | 143 ++++++++++++++++++++++++++++++++++++ examples/ds/patch_for_ds.py | 4 +- 2 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 examples/ds/eval.py diff --git a/examples/ds/eval.py b/examples/ds/eval.py new file mode 100644 index 00000000000..16d51f2697a --- /dev/null +++ b/examples/ds/eval.py @@ -0,0 +1,143 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "scale_weight" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 0.5 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} +""" +# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options +Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. 
+""" +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +def skip_weight(weight_name): + return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + + +def get_cpu_mem_size_in_gb(): + import psutil + + mem = psutil.virtual_memory() + return mem.available + + +from quant import quant_tensor + + +from torch import nn + + +# Adapted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/1d044fd82b15f1cedb197a288e50cc96a2c27205/inference/model.py#L91-L108 +class FP8QDQLinear(torch.nn.Linear): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None): + super().__init__(in_features, out_features, bias=bias) + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), requires_grad=True + ) + self.scale_weight = nn.Parameter(torch.tensor(0, dtype=FP8QDQLinear.dtype), requires_grad=False) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + + def dequant_weight_online(self): + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.scale_weight + return qdq_weight + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor(bf16_input) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + @classmethod + def create_from_linear(cls, linear: nn.Linear): + qdq_linear = cls(linear.in_features, linear.out_features) + qdq_linear.weight.data = linear.weight.data + if linear.bias is not None: + qdq_linear.bias = linear.bias + return qdq_linear + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + +def patch_lin(): + logger.warning("Patching torch.nn.Linear to FP8QDQLinear") + torch.nn.Linear = FP8QDQLinear + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + from patch_for_ds import patch_transformers + + if not not_patch_lin: + patch_lin() + + def _patch__initialize_weights(self, module): + print(f"Skipping init_weights ") + module._is_hf_initialized = True + + transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights + patch_transformers() + with no_init_weights(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=10) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) diff --git a/examples/ds/patch_for_ds.py 
b/examples/ds/patch_for_ds.py index 9375ac6d496..d82251e3e62 100644 --- a/examples/ds/patch_for_ds.py +++ b/examples/ds/patch_for_ds.py @@ -1,5 +1,5 @@ # ==--------------------------------------------------------------------------== -# Patch for loading DS models +# Patch for loading DS models from transformers from typing import Union, Optional import torch import os @@ -101,7 +101,7 @@ def load_state_dict( "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." ) - +# https://github.com/huggingface/transformers/pull/35493 def set_initialized_submodules(model, state_dict_keys): """ Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state From 0054ca01f724ed4e57021a31a88c28adae64846a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Feb 2025 12:40:58 +0200 Subject: [PATCH 23/25] eval bf16 model Change-Id: I7eeb46b5a0eb202b3f672fb75449197041e5949c Signed-off-by: Yi Liu --- ds/infer_bf16.py | 180 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 ds/infer_bf16.py diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py new file mode 100644 index 00000000000..5b0b7d3b652 --- /dev/null +++ b/ds/infer_bf16.py @@ -0,0 +1,180 @@ +# ==--------------------------------------------------------------------------== +# Patch for loading DS models +from typing import Union, Optional +import torch +import os +from packaging import version +from zipfile import is_zipfile +from transformers.utils import is_safetensors_available, strtobool +from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import load_file as safe_load_file + from safetensors.torch import save_file as safe_save_file + + +def is_fsdp_enabled(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1 + and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 + ) + + +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], + is_quantized: bool = False, + map_location: Optional[Union[str, torch.device]] = None, + weights_only: bool = True, +): + """ + Reads a PyTorch checkpoint file, returning properly formatted errors if they arise. + """ + + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + # Check format of the archive + with safe_open(checkpoint_file, framework="pt") as f: + metadata = f.metadata() + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + return safe_load_file(checkpoint_file) + try: + if map_location is None: + if ( + ( + is_deepspeed_zero3_enabled() + and torch.distributed.is_initialized() + and torch.distributed.get_rank() > 0 + ) + or (is_fsdp_enabled() and not is_local_dist_rank_0()) + ) and not is_quantized: + map_location = "meta" + else: + map_location = "cpu" + extra_args = {} + # mmap can only be used with files serialized with zipfile-based format. 
+        if (
+            isinstance(checkpoint_file, str)
+            and map_location != "meta"
+            and version.parse(torch.__version__) >= version.parse("2.1.0")
+            and is_zipfile(checkpoint_file)
+        ):
+            extra_args = {"mmap": True}
+        weights_only_kwarg = {"weights_only": weights_only}
+        return torch.load(
+            checkpoint_file,
+            map_location=map_location,
+            **weights_only_kwarg,
+            **extra_args,
+        )
+    except Exception as e:
+        try:
+            with open(checkpoint_file) as f:
+                if f.read(7) == "version":
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please install "
+                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+                        "you cloned."
+                    )
+                else:
+                    raise ValueError(
+                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+                        "model. Make sure you have saved the model properly."
+                    ) from e
+        except (UnicodeDecodeError, ValueError):
+            raise OSError(
+                f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
+                f"at '{checkpoint_file}'. "
+                "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
+            )
+
+
+def set_initialized_submodules(model, state_dict_keys):
+    """
+    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
+    dict.
+    """
+    state_dict_keys = set(state_dict_keys)
+    not_initialized_submodules = {}
+    for module_name, module in model.named_modules():
+        if module_name == "":
+            # When checking if the root module is loaded there's no need to prepend module_name.
+            module_keys = set(module.state_dict())
+        else:
+            module_keys = {f"{module_name}.{k}" for k in module.state_dict()}
+        if module_keys.issubset(state_dict_keys):
+            module._is_hf_initialized = True
+        else:
+            not_initialized_submodules[module_name] = module
+    return not_initialized_submodules
+
+
+# ==--------------------------------------------------------------------------==
+
+
+def patch_transformers():
+    import transformers
+
+    transformers.modeling_utils.load_state_dict = load_state_dict
+    transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules
+
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def eval(model_path):
+    import transformers
+    from transformers.modeling_utils import no_init_weights
+    # from patch_for_ds import patch_transformers
+
+    # if not not_patch_lin:
+    #     patch_lin()
+
+    def _patch__initialize_weights(self, module):
+        print(f"Skipping init_weights ")
+        module._is_hf_initialized = True
+
+    transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights
+    patch_transformers()
+    with no_init_weights():
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        )
+    logger.info(f"Patched model: {model}")
+    model.eval()
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+    prompt = "Hi, who"
+    encode = tokenizer.encode(prompt, return_tensors="pt")
+    with torch.no_grad():
+        output_tokens = model.generate(encode, max_length=10)
+    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    logger.info(f"Prompt: {prompt}")
+    logger.info(f"Output: {output}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--qmodel_path", type=str, required=True)
+    parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model")
+    args = parser.parse_args()
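+    # --not_patch_lin is parsed above but eval() does not use it; this script always evaluates the
+    # checkpoint as-is (BF16), without any FP8 QDQ patching of torch.nn.Linear.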
+    eval(args.qmodel_path)

From 49ec5a2e7b3ad23c67c199a83078617a3b13e609 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 18 Feb 2025 12:50:01 +0200
Subject: [PATCH 24/25] use info level

Change-Id: I156d0083a8cf29ba97450a8c5a2c047286ae2f4e
Signed-off-by: Yi Liu
---
 ds/infer_bf16.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py
index 5b0b7d3b652..2accfea138f 100644
--- a/ds/infer_bf16.py
+++ b/ds/infer_bf16.py
@@ -135,7 +135,7 @@ def patch_transformers():
 import logging
 
 logger = logging.getLogger(__name__)
-
+logger.setLevel(logging.INFO)
 
 def eval(model_path):
     import transformers

From cb2d5483f9d52836cd9e38bc7d654340d66322bc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 18 Feb 2025 10:51:43 +0000
Subject: [PATCH 25/25] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ds/infer_bf16.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py
index 2accfea138f..6fc88d11d0a 100644
--- a/ds/infer_bf16.py
+++ b/ds/infer_bf16.py
@@ -1,12 +1,13 @@
 # ==--------------------------------------------------------------------------==
 # Patch for loading DS models
-from typing import Union, Optional
-import torch
 import os
-from packaging import version
+from typing import Optional, Union
 from zipfile import is_zipfile
-from transformers.utils import is_safetensors_available, strtobool
+
+import torch
+from packaging import version
 from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
+from transformers.utils import is_safetensors_available, strtobool
 
 if is_safetensors_available():
     from safetensors import safe_open
@@ -37,9 +38,7 @@ def load_state_dict(
     map_location: Optional[Union[str, torch.device]] = None,
     weights_only: bool = True,
 ):
-    """
-    Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
-    """
+    """Reads a PyTorch checkpoint file, returning properly formatted errors if they arise."""
 
     if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
         # Check format of the archive
@@ -103,10 +102,8 @@ def load_state_dict(
 
 
 def set_initialized_submodules(model, state_dict_keys):
-    """
-    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
-    dict.
-    """
+    """Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
+    dict."""
     state_dict_keys = set(state_dict_keys)
     not_initialized_submodules = {}
     for module_name, module in model.named_modules():
@@ -137,16 +134,17 @@ def patch_transformers():
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+
 
 def eval(model_path):
     import transformers
     from transformers.modeling_utils import no_init_weights
-    # from patch_for_ds import patch_transformers
 
+    # from patch_for_ds import patch_transformers
     # if not not_patch_lin:
     #     patch_lin()
 
     def _patch__initialize_weights(self, module):
-        print(f"Skipping init_weights ")
+        print("Skipping init_weights ")
         module._is_hf_initialized = True
 
     transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights