From 2783c28074714ea88851878e7ecdcb154464755b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:01:33 +0800 Subject: [PATCH 01/25] Create static_quant.py for ds --- examples/ds/static_quant.py | 140 ++++++++++++++++++++++++++++++++++++ 1 file changed, 140 insertions(+) create mode 100644 examples/ds/static_quant.py diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py new file mode 100644 index 00000000000..c7077b0680e --- /dev/null +++ b/examples/ds/static_quant.py @@ -0,0 +1,140 @@ +""" +# Prerequisite +pip install -r requirements.txt + +# Note for static/dynamic W8FP8 quantization: +1. Name convention: + - weight scale name: "prefix.scale_weight" + - input scale name: "prefix.scale_input" +2. A json file mapping from tensor name to safetensor file name. + +Example: +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5, bias=False) + + def forward(self, inp): + x1 = self.fc1(inp) + return x1 + +1. state dict +{ + "fc1.weight": torch.Tensor(...), + "fc1.scale_weight": torch.Tensor(...), + "fc1.scale_input": torch.Tensor(...), +} + +2. json file, model.safetensors.index.json +{ + "fc1.weight": "qmodel.safetensors", + "fc1.scale_weight": "qmodel.safetensors", + "fc1.scale_input": "qmodel.safetensors" +} + +""" + +import os +import torch +import tqdm +from loguru import logger +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "scale_weight" +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 0.5 +""" +# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options +Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. +""" +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +def get_all_weight_filename(model_path): + all_files = os.listdir(model_path) + all_weight_filename = [] + for file in all_files: + if file.endswith(f".{SAFETENSORS}"): + all_weight_filename.append(file) + return all_weight_filename + + +# from _fp8_quant/_core/fp_utils.py +def calc_maxabs_scale(xmaxabs, fullscale, backoff=1): + scale = xmaxabs / (fullscale * backoff) + return scale + + +def quant_tensor(tensor): + # Note: + # 1. Check the scale dtype + # 2. 
Check the scale shape + amax = tensor.abs().max() + scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF) + qtensor = tensor / scale + cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) + cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) + return scale, cliped_qtensor_fp8 + + +def _maybe_create_dir(qmodel_path): + if not os.path.exists(qmodel_path): + os.makedirs(qmodel_path) + + +def static_quant_model(model_path, qmodel_path): + _maybe_create_dir(qmodel_path) + all_weight_filename = get_all_weight_filename(model_path) + logger.info(f"Got {len(all_weight_filename)} weight files") + qtensor_mappping = {} + for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): + if i >= 2: + break + file_path = os.path.join(model_path, filename) + qmodel_file_name = filename + qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) + qtensors = {} + with safe_open(file_path, framework="pt", device="cpu") as f: + for weight_name in f.keys(): + weight = f.get_tensor(weight_name) + scale, qtensor = quant_tensor(weight) + preifx_name = weight_name[: -len(".weight")] + scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" + qtensors[scale_name] = scale + qtensors[weight_name] = qtensor + qtensor_mappping[scale_name] = qmodel_file_name + qtensor_mappping[weight_name] = qmodel_file_name + logger.debug(f"Saving {len(qtensors)} tensors to {qmodel_file_path}") + save_file(qtensors, os.path.join(qmodel_path, qmodel_file_path)) + # Dump tensor mapping into json file + model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) + logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}") + with open(model_state_dict_mapping_file_path, "w") as f: + json.dump(qtensor_mappping, f, indent=4) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--qmodel_path", type=str, required=True) + args = parser.parse_args() + static_quant_model(args.model_path, args.qmodel_path) + +""" +model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" +model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" +qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" +static_quant_model(model_path, qmodel_path) +python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ + +""" From b5dc860b819bc53deb7d06c21e75158186f6a92c Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:03:03 +0800 Subject: [PATCH 02/25] Update static_quant.py --- examples/ds/static_quant.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py index c7077b0680e..01a9a11ade9 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/static_quant.py @@ -96,8 +96,6 @@ def static_quant_model(model_path, qmodel_path): logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): - if i >= 2: - break file_path = os.path.join(model_path, filename) qmodel_file_name = filename qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) From 71d395d32c3490994923f02a8578d25d557e6260 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:04:36 +0800 Subject: [PATCH 03/25] add requirements.txt --- examples/requirements.txt | 4 ++++ 1 file changed, 4 insertions(+) create mode 
100644 examples/requirements.txt diff --git a/examples/requirements.txt b/examples/requirements.txt new file mode 100644 index 00000000000..6b3691129b8 --- /dev/null +++ b/examples/requirements.txt @@ -0,0 +1,4 @@ +loguru +torch +safetensors +tqdm From 6415795221f4c12e82ba4bcb3f6fa6f3a87ac6d2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 13:04:57 +0800 Subject: [PATCH 04/25] Rename examples/requirements.txt to examples/ds/requirements.txt --- examples/{ => ds}/requirements.txt | 0 1 file changed, 0 insertions(+), 0 deletions(-) rename examples/{ => ds}/requirements.txt (100%) diff --git a/examples/requirements.txt b/examples/ds/requirements.txt similarity index 100% rename from examples/requirements.txt rename to examples/ds/requirements.txt From cdc1fa085b9b03666fc2a7eff38d6cc9c317e00a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 08:29:47 +0200 Subject: [PATCH 05/25] use transformers Change-Id: Iee204311d566d528dd44eff0c227db45224ee242 Signed-off-by: Yi Liu --- examples/ds/requirements.txt | 1 + examples/ds/static_quant.py | 37 ++++++++++++++++++++++++++++++++++-- 2 files changed, 36 insertions(+), 2 deletions(-) diff --git a/examples/ds/requirements.txt b/examples/ds/requirements.txt index 6b3691129b8..63026a2db80 100644 --- a/examples/ds/requirements.txt +++ b/examples/ds/requirements.txt @@ -2,3 +2,4 @@ loguru torch safetensors tqdm +transformers \ No newline at end of file diff --git a/examples/ds/static_quant.py b/examples/ds/static_quant.py index 01a9a11ade9..8468dd88290 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/static_quant.py @@ -38,19 +38,23 @@ def forward(self, inp): import torch import tqdm from loguru import logger +import logging import safetensors from safetensors import safe_open from safetensors.torch import save_file import json +logging.basicConfig(level=logging.DEBUG) torch.set_grad_enabled(False) # CONSTANTS SAFETENSORS = "safetensors" WEIGHT_SCALE_NAME = "scale_weight" +INPUT_SCALE_NAME = "scale_input" SCALE_FILE_NAME = f"scales.{SAFETENSORS}" FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 +QUANT_MODULE_TYPES = (torch.nn.Linear,) """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. 
@@ -90,7 +94,9 @@ def _maybe_create_dir(qmodel_path): os.makedirs(qmodel_path) -def static_quant_model(model_path, qmodel_path): +def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): + # FIXME: need to skip some layers like embedding + logger.warning("It will quantize all weight tensors") _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) logger.info(f"Got {len(all_weight_filename)} weight files") @@ -119,14 +125,40 @@ def static_quant_model(model_path, qmodel_path): json.dump(qtensor_mappping, f, indent=4) +@torch.no_grad() +def static_quant_model_tran(model_path, qmodel_path): + import transformers + from transformers.modeling_utils import no_init_weights + with no_init_weights(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + for name, module in model.named_modules(): + if isinstance(module, QUANT_MODULE_TYPES): + logger.debug(f"Processing {name}") + weight = module.weight + scale, qtensor = quant_tensor(weight) + module.weight.data = qtensor + setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) + logger.info(f"Saving quantized model to {qmodel_path}") + model.save_pretrained(qmodel_path) + + if __name__ == "__main__": import argparse parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") args = parser.parse_args() - static_quant_model(args.model_path, args.qmodel_path) + if args.low_cpu_mem: + static_quant_model_for_low_cpu_usage(args.model_path, args.qmodel_path) + else: + static_quant_model_tran(args.model_path, args.qmodel_path) """ model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" @@ -134,5 +166,6 @@ def static_quant_model(model_path, qmodel_path): qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" static_quant_model(model_path, qmodel_path) python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ +python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ """ From 3f16755423cfafabd11a3bccdc87ceffcfbaf61b Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:28:18 +0200 Subject: [PATCH 06/25] add readme Change-Id: Id248e2b9cd2dc40af9f37320e813ea9eabcb8c60 Signed-off-by: Yi Liu --- examples/ds/README.md | 52 +++++++++ examples/ds/patch_for_ds.py | 132 ++++++++++++++++++++++ examples/ds/{static_quant.py => quant.py} | 109 +++++++++--------- examples/ds/requirements.txt | 3 +- 4 files changed, 237 insertions(+), 59 deletions(-) create mode 100644 examples/ds/README.md create mode 100644 examples/ds/patch_for_ds.py rename examples/ds/{static_quant.py => quant.py} (61%) diff --git a/examples/ds/README.md b/examples/ds/README.md new file mode 100644 index 00000000000..dfabe0473a9 --- /dev/null +++ b/examples/ds/README.md @@ -0,0 +1,52 @@ +Note for static quantize DeepSeek model + +## Prerequisite +``` +pip install -r requirements.txt +``` + +## Usage + +- Option 1 (Rec) +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 +``` + +- Option 2 handle weights only (If the DRAM size 
is less than ~700 GB) +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem +``` + +## Example +1. Name convention: + - weight scale name: "prefix.scale_weight" + - input scale name: "prefix.scale_input" (for static only) +2. A json file mapping from tensor name to safetensor file name. + +```python +class M(torch.nn.Module): + def __init__(self) -> None: + super().__init__() + self.fc1 = torch.nn.Linear(10, 5, bias=False) + + def forward(self, inp): + x1 = self.fc1(inp) + return x1 +``` + +```bash +1. state dict +{ + "fc1.weight": torch.Tensor(...), + "fc1.scale_weight": torch.Tensor(...), + "fc1.scale_input": torch.Tensor(...), +} + +2. json file, model.safetensors.index.json +{ + "fc1.weight": "qmodel.safetensors", + "fc1.scale_weight": "qmodel.safetensors", + "fc1.scale_input": "qmodel.safetensors" +} +``` + diff --git a/examples/ds/patch_for_ds.py b/examples/ds/patch_for_ds.py new file mode 100644 index 00000000000..9375ac6d496 --- /dev/null +++ b/examples/ds/patch_for_ds.py @@ -0,0 +1,132 @@ +# ==--------------------------------------------------------------------------== +# Patch for loading DS models +from typing import Union, Optional +import torch +import os +from packaging import version +from zipfile import is_zipfile +from transformers.utils import is_safetensors_available, strtobool +from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import load_file as safe_load_file + from safetensors.torch import save_file as safe_save_file + + +def is_fsdp_enabled(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1 + and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 + ) + + +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], + is_quantized: bool = False, + map_location: Optional[Union[str, torch.device]] = None, + weights_only: bool = True, +): + """ + Reads a PyTorch checkpoint file, returning properly formatted errors if they arise. + """ + + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + # Check format of the archive + with safe_open(checkpoint_file, framework="pt") as f: + metadata = f.metadata() + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + return safe_load_file(checkpoint_file) + try: + if map_location is None: + if ( + ( + is_deepspeed_zero3_enabled() + and torch.distributed.is_initialized() + and torch.distributed.get_rank() > 0 + ) + or (is_fsdp_enabled() and not is_local_dist_rank_0()) + ) and not is_quantized: + map_location = "meta" + else: + map_location = "cpu" + extra_args = {} + # mmap can only be used with files serialized with zipfile-based format. 
+ if ( + isinstance(checkpoint_file, str) + and map_location != "meta" + and version.parse(torch.__version__) >= version.parse("2.1.0") + and is_zipfile(checkpoint_file) + ): + extra_args = {"mmap": True} + weights_only_kwarg = {"weights_only": weights_only} + return torch.load( + checkpoint_file, + map_location=map_location, + **weights_only_kwarg, + **extra_args, + ) + except Exception as e: + try: + with open(checkpoint_file) as f: + if f.read(7) == "version": + raise OSError( + "You seem to have cloned a repository without having git-lfs installed. Please install " + "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder " + "you cloned." + ) + else: + raise ValueError( + f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained " + "model. Make sure you have saved the model properly." + ) from e + except (UnicodeDecodeError, ValueError): + raise OSError( + f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' " + f"at '{checkpoint_file}'. " + "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." + ) + + +def set_initialized_submodules(model, state_dict_keys): + """ + Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state + dict. + """ + state_dict_keys = set(state_dict_keys) + not_initialized_submodules = {} + for module_name, module in model.named_modules(): + if module_name == "": + # When checking if the root module is loaded there's no need to prepend module_name. + module_keys = set(module.state_dict()) + else: + module_keys = {f"{module_name}.{k}" for k in module.state_dict()} + if module_keys.issubset(state_dict_keys): + module._is_hf_initialized = True + else: + not_initialized_submodules[module_name] = module + return not_initialized_submodules + + +# ==--------------------------------------------------------------------------== + + +def patch_transformers(): + import transformers + + transformers.modeling_utils.load_state_dict = load_state_dict + transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules diff --git a/examples/ds/static_quant.py b/examples/ds/quant.py similarity index 61% rename from examples/ds/static_quant.py rename to examples/ds/quant.py index 8468dd88290..9eb321d6fa8 100644 --- a/examples/ds/static_quant.py +++ b/examples/ds/quant.py @@ -1,39 +1,3 @@ -""" -# Prerequisite -pip install -r requirements.txt - -# Note for static/dynamic W8FP8 quantization: -1. Name convention: - - weight scale name: "prefix.scale_weight" - - input scale name: "prefix.scale_input" -2. A json file mapping from tensor name to safetensor file name. - -Example: -class M(torch.nn.Module): - def __init__(self) -> None: - super().__init__() - self.fc1 = torch.nn.Linear(10, 5, bias=False) - - def forward(self, inp): - x1 = self.fc1(inp) - return x1 - -1. state dict -{ - "fc1.weight": torch.Tensor(...), - "fc1.scale_weight": torch.Tensor(...), - "fc1.scale_input": torch.Tensor(...), -} - -2. 
json file, model.safetensors.index.json -{ - "fc1.weight": "qmodel.safetensors", - "fc1.scale_weight": "qmodel.safetensors", - "fc1.scale_input": "qmodel.safetensors" -} - -""" - import os import torch import tqdm @@ -51,10 +15,12 @@ def forward(self, inp): SAFETENSORS = "safetensors" WEIGHT_SCALE_NAME = "scale_weight" INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 SCALE_FILE_NAME = f"scales.{SAFETENSORS}" FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = {"model.norm", "layernorm", "e_score_correction_bias", "lm_head.weight", "embed_tokens"} """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. @@ -62,6 +28,14 @@ def forward(self, inp): MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" +def skip_weight(weight_name): + return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + +def get_cpu_mem_size_in_gb(): + import psutil + mem = psutil.virtual_memory() + return mem.available + def get_all_weight_filename(model_path): all_files = os.listdir(model_path) all_weight_filename = [] @@ -83,25 +57,25 @@ def quant_tensor(tensor): # 2. Check the scale shape amax = tensor.abs().max() scale = calc_maxabs_scale(amax, FULL_RANGE, WEIGHT_BACKOFF) + scale = scale.to(SCALE_DTYPE) qtensor = tensor / scale cliped_qtensor = torch.clamp(qtensor, -FULL_RANGE, FULL_RANGE) cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) return scale, cliped_qtensor_fp8 - def _maybe_create_dir(qmodel_path): if not os.path.exists(qmodel_path): os.makedirs(qmodel_path) -def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): +def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # FIXME: need to skip some layers like embedding - logger.warning("It will quantize all weight tensors") _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} - for i, filename in tqdm.tqdm(enumerate(all_weight_filename)): + for i, filename in enumerate(all_weight_filename): + logger.info(f"Processing {i + 1}/{len(all_weight_filename)}: {filename}") file_path = os.path.join(model_path, filename) qmodel_file_name = filename qmodel_file_path = os.path.join(qmodel_path, qmodel_file_name) @@ -109,6 +83,12 @@ def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): with safe_open(file_path, framework="pt", device="cpu") as f: for weight_name in f.keys(): weight = f.get_tensor(weight_name) + if skip_weight(weight_name): + logger.debug(f"Skiping quantize {weight_name}") + qtensors[weight_name] = weight + qtensor_mappping[weight_name] = qmodel_file_name + continue + logger.debug(f"Processing {weight_name}" scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" @@ -125,38 +105,50 @@ def static_quant_model_for_low_cpu_usage(model_path, qmodel_path): json.dump(qtensor_mappping, f, indent=4) +def _import_oh(): + import transformers + from optimum.habana.transformers.modeling_utils import adapt_transformers_to_gaudi + + orig_check_support_param_buffer_assignment = transformers.modeling_utils.check_support_param_buffer_assignment + 
adapt_transformers_to_gaudi() + transformers.modeling_utils.check_support_param_buffer_assignment = orig_check_support_param_buffer_assignment + + @torch.no_grad() def static_quant_model_tran(model_path, qmodel_path): + # assert get_cpu_mem_size_in_gb(800), "Not enough memory, please use quant_model_weight_with_low_cpu_usage" import transformers - from transformers.modeling_utils import no_init_weights - with no_init_weights(): - model = transformers.AutoModelForCausalLM.from_pretrained( - model_path, - torch_dtype="auto", - low_cpu_mem_usage=True, - trust_remote_code=True, - ) + from patch_for_ds import patch_transformers + # import_oh() + patch_transformers() + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) for name, module in model.named_modules(): - if isinstance(module, QUANT_MODULE_TYPES): - logger.debug(f"Processing {name}") - weight = module.weight - scale, qtensor = quant_tensor(weight) - module.weight.data = qtensor - setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) + if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name): + logger.debug(f"Skiping quantize {name}") + continue + logger.debug(f"Processing {name}") + weight = module.weight + scale, qtensor = quant_tensor(weight) + module.weight.data = qtensor + setattr(module, "scale_weight", torch.nn.Parameter(scale, requires_grad=False)) logger.info(f"Saving quantized model to {qmodel_path}") model.save_pretrained(qmodel_path) if __name__ == "__main__": import argparse - parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") args = parser.parse_args() if args.low_cpu_mem: - static_quant_model_for_low_cpu_usage(args.model_path, args.qmodel_path) + quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path) else: static_quant_model_tran(args.model_path, args.qmodel_path) @@ -165,7 +157,8 @@ def static_quant_model_tran(model_path, qmodel_path): model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" static_quant_model(model_path, qmodel_path) -python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ -python static_quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q2/ --low_cpu_mem +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ +python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ """ diff --git a/examples/ds/requirements.txt b/examples/ds/requirements.txt index 63026a2db80..c224c095b1b 100644 --- a/examples/ds/requirements.txt +++ b/examples/ds/requirements.txt @@ -2,4 +2,5 @@ loguru torch safetensors tqdm -transformers \ No newline at end of file +transformers +psutil \ No newline at end of file From 
fbd0593069b8d12e98157d379135469d39d41595 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:30:47 +0200 Subject: [PATCH 07/25] update docs Change-Id: I2ae9ef7b8854b8abff5ea863bda9d567bd68ccb6 Signed-off-by: Yi Liu --- examples/ds/README.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index dfabe0473a9..4bc3b305f49 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -19,8 +19,8 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ## Example 1. Name convention: - - weight scale name: "prefix.scale_weight" - - input scale name: "prefix.scale_input" (for static only) + - weight scale name: `prefix.scale_weight` + - input scale name: `prefix.scale_input` (for static only) 2. A json file mapping from tensor name to safetensor file name. ```python @@ -42,7 +42,7 @@ class M(torch.nn.Module): "fc1.scale_input": torch.Tensor(...), } -2. json file, model.safetensors.index.json +2. json file, `model.safetensors.index.json` { "fc1.weight": "qmodel.safetensors", "fc1.scale_weight": "qmodel.safetensors", From 775d47ed172d8633b536cbbc75e86b011421e04f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:41:23 +0200 Subject: [PATCH 08/25] fix Change-Id: I158dc3d7dd541bed1b138e89266d0539229da7fe Signed-off-by: Yi Liu --- examples/ds/README.md | 4 ++-- examples/ds/quant.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 4bc3b305f49..a98e5d59e39 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -7,12 +7,12 @@ pip install -r requirements.txt ## Usage -- Option 1 (Rec) +- Option 1. (Rec) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -- Option 2 handle weights only (If the DRAM size is less than ~700 GB) +- Option 2. handle weights only (If the DRAM size is less than ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 9eb321d6fa8..bd9b3fde6e7 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -88,7 +88,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue - logger.debug(f"Processing {weight_name}" + logger.debug(f"Processing {weight_name}") scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" From fe232824231506cdbb3f203e192c7f01db984618 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:51:31 +0200 Subject: [PATCH 09/25] update noote Change-Id: Ie818584925c5b4ba23d1a541d6d987e4fffc5aaf Signed-off-by: Yi Liu --- examples/ds/README.md | 6 ++++++ examples/ds/quant.py | 5 +++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index a98e5d59e39..cde5ac4fadb 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -17,6 +17,12 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` +> [!NOTE] +> - Skip quantize `lm-head`. +> - `WEIGHT_BACKOFF = 0.5` +> - `SCALE_DTYPE = torch.bfloat16` + + ## Example 1. 
Name convention: - weight scale name: `prefix.scale_weight` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index bd9b3fde6e7..d5a2062c24e 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -72,6 +72,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # FIXME: need to skip some layers like embedding _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) + files_cnt = len(all_weight_filename) logger.info(f"Got {len(all_weight_filename)} weight files") qtensor_mappping = {} for i, filename in enumerate(all_weight_filename): @@ -88,7 +89,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue - logger.debug(f"Processing {weight_name}") + logger.debug(f"[{i+1}/{files_cnt}] Processing {weight_name}") scale, qtensor = quant_tensor(weight) preifx_name = weight_name[: -len(".weight")] scale_name = f"{preifx_name}.{WEIGHT_SCALE_NAME}" @@ -96,7 +97,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): qtensors[weight_name] = qtensor qtensor_mappping[scale_name] = qmodel_file_name qtensor_mappping[weight_name] = qmodel_file_name - logger.debug(f"Saving {len(qtensors)} tensors to {qmodel_file_path}") + logger.debug(f"[{i+1}/{files_cnt}] Saving {len(qtensors)} tensors to {qmodel_file_path}") save_file(qtensors, os.path.join(qmodel_path, qmodel_file_path)) # Dump tensor mapping into json file model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) From 6be43ca01509d86079b3c3f119c0ca0a5d8fc7b1 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 10:59:48 +0200 Subject: [PATCH 10/25] clean code Change-Id: I62f7ace7d647ac76918a6e4f13cdbfe8f0d90608 Signed-off-by: Yi Liu --- examples/ds/quant.py | 17 ++++++----------- 1 file changed, 6 insertions(+), 11 deletions(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index d5a2062c24e..a9040e6dd3c 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -31,11 +31,14 @@ def skip_weight(weight_name): return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + def get_cpu_mem_size_in_gb(): import psutil + mem = psutil.virtual_memory() return mem.available + def get_all_weight_filename(model_path): all_files = os.listdir(model_path) all_weight_filename = [] @@ -63,6 +66,7 @@ def quant_tensor(tensor): cliped_qtensor_fp8 = cliped_qtensor.to(torch.float8_e4m3fn) return scale, cliped_qtensor_fp8 + def _maybe_create_dir(qmodel_path): if not os.path.exists(qmodel_path): os.makedirs(qmodel_path) @@ -120,6 +124,7 @@ def static_quant_model_tran(model_path, qmodel_path): # assert get_cpu_mem_size_in_gb(800), "Not enough memory, please use quant_model_weight_with_low_cpu_usage" import transformers from patch_for_ds import patch_transformers + # import_oh() patch_transformers() model = transformers.AutoModelForCausalLM.from_pretrained( @@ -143,6 +148,7 @@ def static_quant_model_tran(model_path, qmodel_path): if __name__ == "__main__": import argparse + parser = argparse.ArgumentParser() parser.add_argument("--model_path", type=str, required=True) parser.add_argument("--qmodel_path", type=str, required=True) @@ -152,14 +158,3 @@ def static_quant_model_tran(model_path, qmodel_path): quant_model_weight_with_low_cpu_usage(args.model_path, args.qmodel_path) else: static_quant_model_tran(args.model_path, args.qmodel_path) - -""" -model_path = "/software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16" 
-model_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/" -qmodel_path = "/software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/" -static_quant_model(model_path, qmodel_path) -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q2/ --low_cpu_mem -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/deepseekv3-bf16-4l-q/ -python quant.py --model_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16/ --qmodel_path /software/users/yiliu4/HF_HOME/hub/DeepSeek-V3-BF16-q/ - -""" From 144139c0dc7143c29e8a785c77472fe74bc5c860 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 11:32:30 +0200 Subject: [PATCH 11/25] update docs Change-Id: Ic79e2a40cc758a7715dfa45dc0cc75c30fdf231b Signed-off-by: Yi Liu --- examples/ds/README.md | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index cde5ac4fadb..d34bec78b56 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -7,14 +7,15 @@ pip install -r requirements.txt ## Usage -- Option 1. (Rec) +- Option 1 (Rec). handle weights only +```bash +python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem + +- Option 2. Loading model using transformers (Requires DRAM > ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -- Option 2. handle weights only (If the DRAM size is less than ~700 GB) -```bash -python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` > [!NOTE] From e2cdd4729a47a740aa2cb184e90c9b8e499da923 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 11:33:30 +0200 Subject: [PATCH 12/25] update docs Change-Id: I1c719a6895e15688033c3de84a8ad333a15930ec Signed-off-by: Yi Liu --- examples/ds/README.md | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index d34bec78b56..bbfacd76ed4 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -10,14 +10,13 @@ pip install -r requirements.txt - Option 1 (Rec). handle weights only ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem +``` - Option 2. Loading model using transformers (Requires DRAM > ~700 GB) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -``` - > [!NOTE] > - Skip quantize `lm-head`. 
> - `WEIGHT_BACKOFF = 0.5` From 314bf10a32d4cc89d283ca34670bb7af4d5a5fc0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:04:02 +0200 Subject: [PATCH 13/25] add mlp.gate.weight Change-Id: I84acff14c99cb8b233aca0d003598d8fa5757f2e Signed-off-by: Yi Liu --- examples/ds/quant.py | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index a9040e6dd3c..f255de928d6 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -20,7 +20,14 @@ FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max WEIGHT_BACKOFF = 0.5 QUANT_MODULE_TYPES = (torch.nn.Linear,) -SKIP_WEIGHT_LST = {"model.norm", "layernorm", "e_score_correction_bias", "lm_head.weight", "embed_tokens"} +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} """ # https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. From 2c59fc54c8110b17f141a9ed34b582e30e32a0e0 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:28:17 +0200 Subject: [PATCH 14/25] update the docs Change-Id: I236b189afb1e163d2bff1ab2d59d92cb601507c8 Signed-off-by: Yi Liu --- examples/ds/README.md | 21 +++++++++++++++++---- 1 file changed, 17 insertions(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index bbfacd76ed4..78de448f726 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -1,29 +1,43 @@ -Note for static quantize DeepSeek model +# Note for static quantize DeepSeek model ## Prerequisite + ``` pip install -r requirements.txt ``` ## Usage -- Option 1 (Rec). handle weights only +### Step 1. quantize model weights + +- Option 1 (Recommended): Quantize weights directly + ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` -- Option 2. Loading model using transformers (Requires DRAM > ~700 GB) +- Option 2. Load the model using transformers (requires ~700 GB of DRAM) + ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` > [!NOTE] +> > - Skip quantize `lm-head`. > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` +### Step 2. copy model files for inference + +Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manually copy some model files. + +```bash +python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 +``` ## Example + 1. 
Name convention: - weight scale name: `prefix.scale_weight` - input scale name: `prefix.scale_input` (for static only) @@ -55,4 +69,3 @@ class M(torch.nn.Module): "fc1.scale_input": "qmodel.safetensors" } ``` - From 4800b2b21e2fa80c46c74b417445999adb5b785f Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 15:30:52 +0200 Subject: [PATCH 15/25] add post process Change-Id: I501e7f14fdb35eeaad1571fb4b679f65ceb7cab5 Signed-off-by: Yi Liu --- examples/ds/README.md | 4 +- examples/ds/post_process.py | 103 ++++++++++++++++++++++++++++++++++++ 2 files changed, 105 insertions(+), 2 deletions(-) create mode 100644 examples/ds/post_process.py diff --git a/examples/ds/README.md b/examples/ds/README.md index 78de448f726..061673c2dab 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -16,7 +16,7 @@ pip install -r requirements.txt python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1-Dynamic-FP8 --low_cpu_mem ``` -- Option 2. Load the model using transformers (requires ~700 GB of DRAM) +- Option 2: Load the model using transformers (requires ~700 GB of DRAM) ```bash python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 @@ -36,7 +36,7 @@ Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manu python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 ``` -## Example +## More details 1. Name convention: - weight scale name: `prefix.scale_weight` diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py new file mode 100644 index 00000000000..fd1b11ade51 --- /dev/null +++ b/examples/ds/post_process.py @@ -0,0 +1,103 @@ +import json +from loguru import logger + +quantization_config = { + "_json_file": "/tmp/tmpe3ckugb_.json", + "allowlist": { + "names": [], + "types": [ + "Matmul", + "Linear", + "ParallelLMHead", + "RowParallelLinear", + "ColumnParallelLinear", + "MergedColumnParallelLinear", + "QKVParallelLinear", + "FalconLinear", + "KVCache", + "VLLMKVCache", + "Conv2d", + "LoRACompatibleLinear", + "LoRACompatibleConv", + "Softmax", + "ModuleFusedSDPA", + "MoeMatmul", + "ReplicatedLinear", + "FusedMoE", + "GaudiMixtralSparseMoeBlock", + "VllmMixtureOfExpertsOp", + "LinearLayer", + "LinearAllreduce", + "ScopedLinearAllReduce", + "LmHeadLinearAllreduce", + ], + }, + "blocklist": {}, + "dump_stats_path": "./hqt_output/measure", + "fake_quant": "False", + "fp8_config": "E4M3", + "hp_dtype": "bf16", + "measure_on_hpu": True, + "mod_dict": {}, + "mode": "LOAD", + "observer": "maxabs", + "scale_format": "const", + "scale_method": "maxabs_pow2_dynamic", + "scale_params": {}, + "use_qdq": "False", +} + + +# add the quantization config to config.json +def update_config(model_path, qmodel_path): + import json + import os + + # open config + with open(os.path.join(model_path, "config.json"), "r") as f: + config = json.load(f) + config["quantization_config"] = quantization_config + # save new config to qmodel_path + logger.info(f"Updated config: {config}") + logger.debug(f"Saving config to {qmodel_path}") + + with open(os.path.join(qmodel_path, "config.json"), "w") as f: + json.dump(config, f, indent=4) + + +MODEL_FILE_LST = [ + "configuration_deepseek.py", + "generation_config.json", + "modeling_deepseek.py", + "tokenizer.json", + "tokenizer_config.json", +] + + +def cp_model_files(model_path, qmodel_path): + # copy model files + import shutil + import os + + for file in MODEL_FILE_LST: + logger.debug(f"Copying {file} 
from {model_path} to {qmodel_path}") + file_path = os.path.join(model_path, file) + # check if file exists + if not os.path.exists(file_path): + logger.error(f"File {file_path} does not exist") + raise FileNotFoundError(f"File {file_path} does not exist") + shutil.copy(os.path.join(model_path, file), qmodel_path) + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--model_path", type=str, required=True) + parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--low_cpu_mem", action="store_true", help="Load weight file one by one to reduce memory usage") + args = parser.parse_args() + # update the config + update_config(args.model_path, args.qmodel_path) + # copy model files + cp_model_files(args.model_path, args.qmodel_path) From 558b734de1581027b9397a93377bd807ae1041b3 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 11 Feb 2025 16:17:14 +0200 Subject: [PATCH 16/25] quant lm-head Change-Id: I086fe5228c55526630a2e480d7532e2727884d36 Signed-off-by: Yi Liu --- examples/ds/README.md | 2 -- examples/ds/quant.py | 3 +-- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 061673c2dab..41e63232813 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -23,8 +23,6 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ``` > [!NOTE] -> -> - Skip quantize `lm-head`. > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` diff --git a/examples/ds/quant.py b/examples/ds/quant.py index f255de928d6..90cbade498f 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -24,7 +24,7 @@ "model.norm", "layernorm", "e_score_correction_bias", - "lm_head.weight", + # "lm_head.weight", "embed_tokens", "mlp.gate.weight", # mlp.gate is not linear } @@ -80,7 +80,6 @@ def _maybe_create_dir(qmodel_path): def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): - # FIXME: need to skip some layers like embedding _maybe_create_dir(qmodel_path) all_weight_filename = get_all_weight_filename(model_path) files_cnt = len(all_weight_filename) From c0b06329ba12fb5e407775c6fe099d14901fd91d Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 11 Feb 2025 15:14:42 +0000 Subject: [PATCH 17/25] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- examples/ds/quant.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 90cbade498f..6c82c80d78c 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -95,7 +95,7 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): for weight_name in f.keys(): weight = f.get_tensor(weight_name) if skip_weight(weight_name): - logger.debug(f"Skiping quantize {weight_name}") + logger.debug(f"Skipping quantize {weight_name}") qtensors[weight_name] = weight qtensor_mappping[weight_name] = qmodel_file_name continue @@ -141,7 +141,7 @@ def static_quant_model_tran(model_path, qmodel_path): ) for name, module in model.named_modules(): if not isinstance(module, QUANT_MODULE_TYPES) or skip_weight(name): - logger.debug(f"Skiping quantize {name}") + logger.debug(f"Skipping quantize {name}") continue logger.debug(f"Processing {name}") weight = module.weight From 314038046f837bdf52084c5e989e3489450e7444 Mon Sep 17 00:00:00 2001 From: "Huang, Tai" Date: Tue, 11 Feb 2025 23:56:26 +0800 Subject: [PATCH 
18/25] Update README.md minor updates --- examples/ds/README.md | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index 41e63232813..c8af4e5df13 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -1,4 +1,4 @@ -# Note for static quantize DeepSeek model +# Note for quantize DeepSeek model ## Prerequisite @@ -28,7 +28,7 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ### Step 2. copy model files for inference -Since DeepSeek V3 and R1 asre not yet supported by Transformers, we need to manually copy some model files. +Since DeepSeek V3 and R1 are not yet supported by Transformers, we need to manually copy some model files. ```bash python post_process.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/DeepSeek/R1/Dynamic-FP8 From 6a4c67aa4de1f6e37c3fe5efa5471707552a73f2 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 03:17:02 +0200 Subject: [PATCH 19/25] update thequant config Change-Id: I57753a1e6f87c887ff9d6f802854b83417d7d862 Signed-off-by: Yi Liu --- examples/ds/post_process.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index fd1b11ade51..f9e35c0b0ad 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -42,7 +42,7 @@ "mode": "LOAD", "observer": "maxabs", "scale_format": "const", - "scale_method": "maxabs_pow2_dynamic", + "scale_method": "maxabs_hw", "scale_params": {}, "use_qdq": "False", } From a958a339fe6a78729fbddfb50420dfe5aff42028 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 03:49:43 +0200 Subject: [PATCH 20/25] add weight_map Change-Id: I82fe09fb33ef5fc48139874ec9fb9aebd178a459 Signed-off-by: Yi Liu --- examples/ds/post_process.py | 8 +++----- examples/ds/quant.py | 6 +++++- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index f9e35c0b0ad..72101c5c57e 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -53,15 +53,13 @@ def update_config(model_path, qmodel_path): import json import os - # open config with open(os.path.join(model_path, "config.json"), "r") as f: config = json.load(f) config["quantization_config"] = quantization_config - # save new config to qmodel_path logger.info(f"Updated config: {config}") - logger.debug(f"Saving config to {qmodel_path}") - - with open(os.path.join(qmodel_path, "config.json"), "w") as f: + config_filepath = os.path.join(qmodel_path, "config.json") + logger.debug(f"Saving config to {config_filepath}") + with open(config_filepath, "w") as f: json.dump(config, f, indent=4) diff --git a/examples/ds/quant.py b/examples/ds/quant.py index 6c82c80d78c..615b6bf33bf 100644 --- a/examples/ds/quant.py +++ b/examples/ds/quant.py @@ -112,8 +112,12 @@ def quant_model_weight_with_low_cpu_usage(model_path, qmodel_path): # Dump tensor mapping into json file model_state_dict_mapping_file_path = os.path.join(qmodel_path, MODEL_STATE_DICT_MAPPING_FILENAME) logger.info(f"Saving tensor mapping to {model_state_dict_mapping_file_path}") + state_dict_mapping = { + "metadata":{}, + "weight_map": qtensor_mappping, + } with open(model_state_dict_mapping_file_path, "w") as f: - json.dump(qtensor_mappping, f, indent=4) + json.dump(state_dict_mapping, f, indent=4) def _import_oh(): From 54a88b7e04495474bb4fdd79065bdc20d83d06af Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 04:50:49 +0200 Subject: [PATCH 
21/25] revert scale_method Change-Id: I8e83e0c342d27ded41036d1e9349f30f832ecf2a Signed-off-by: Yi Liu --- examples/ds/README.md | 1 + examples/ds/post_process.py | 2 +- 2 files changed, 2 insertions(+), 1 deletion(-) diff --git a/examples/ds/README.md b/examples/ds/README.md index c8af4e5df13..45df4738c7d 100644 --- a/examples/ds/README.md +++ b/examples/ds/README.md @@ -23,6 +23,7 @@ python quant.py --model_path /path/to/DeepSeek/R1/BF16/ --qmodel_path /path/to/D ``` > [!NOTE] +> - weight dtype is `torch.float8_e4m3fn` (full range is `-448` to `448`) > - `WEIGHT_BACKOFF = 0.5` > - `SCALE_DTYPE = torch.bfloat16` diff --git a/examples/ds/post_process.py b/examples/ds/post_process.py index 72101c5c57e..a9fe929fbeb 100644 --- a/examples/ds/post_process.py +++ b/examples/ds/post_process.py @@ -42,7 +42,7 @@ "mode": "LOAD", "observer": "maxabs", "scale_format": "const", - "scale_method": "maxabs_hw", + "scale_method": "maxabs_pow2_dynamic", "scale_params": {}, "use_qdq": "False", } From 45a2c1af3d1183eceff95caf08671cfd4f377d51 Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Wed, 12 Feb 2025 17:43:10 +0800 Subject: [PATCH 22/25] Add qdq eval (#2121) * add eval Change-Id: I7ce64ede965976dd79e979aace82f4d251cc6803 Signed-off-by: Yi Liu * fix Change-Id: I72305d9d6ef6e3588bc8361f62baeeca06f42848 Signed-off-by: Yi Liu * add float model Change-Id: Ia46444d77d349b1a976e6d7031d06bb621d6d7e4 Signed-off-by: Yi Liu * add prompt Change-Id: Ie7b35f45d8f67a655dc9fb06eda824eb8a7f56c1 Signed-off-by: Yi Liu --------- Signed-off-by: Yi Liu Co-authored-by: Yi Liu --- examples/ds/eval.py | 143 ++++++++++++++++++++++++++++++++++++ examples/ds/patch_for_ds.py | 4 +- 2 files changed, 145 insertions(+), 2 deletions(-) create mode 100644 examples/ds/eval.py diff --git a/examples/ds/eval.py b/examples/ds/eval.py new file mode 100644 index 00000000000..16d51f2697a --- /dev/null +++ b/examples/ds/eval.py @@ -0,0 +1,143 @@ +import os +import torch +import tqdm +from loguru import logger +import logging +import safetensors +from safetensors import safe_open +from safetensors.torch import save_file +import json + +logging.basicConfig(level=logging.DEBUG) +torch.set_grad_enabled(False) + +# CONSTANTS +SAFETENSORS = "safetensors" +WEIGHT_SCALE_NAME = "scale_weight" +INPUT_SCALE_NAME = "scale_input" +SCALE_DTYPE = torch.bfloat16 +SCALE_FILE_NAME = f"scales.{SAFETENSORS}" +FULL_RANGE = torch.finfo(torch.float8_e4m3fn).max +WEIGHT_BACKOFF = 0.5 +QUANT_MODULE_TYPES = (torch.nn.Linear,) +SKIP_WEIGHT_LST = { + "model.norm", + "layernorm", + "e_score_correction_bias", + # "lm_head.weight", + "embed_tokens", + "mlp.gate.weight", # mlp.gate is not linear +} +""" +# https://docs.habana.ai/en/latest/PyTorch/Inference_on_PyTorch/Quantization/Inference_Using_FP8.html?highlight=backoff#supported-json-config-file-options +Similarly, the maxabs value of a weight is scaled to weight_backoff*FP8_143_FULLSCALE. The default values are input_backoff=0.25 and weight_backoff=0.5. 
+""" +MODEL_STATE_DICT_MAPPING_FILENAME = "model.safetensors.index.json" + + +def skip_weight(weight_name): + return any([skip_name in weight_name for skip_name in SKIP_WEIGHT_LST]) + + +def get_cpu_mem_size_in_gb(): + import psutil + + mem = psutil.virtual_memory() + return mem.available + + +from quant import quant_tensor + + +from torch import nn + + +# Adapted from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/1d044fd82b15f1cedb197a288e50cc96a2c27205/inference/model.py#L91-L108 +class FP8QDQLinear(torch.nn.Linear): + dtype = torch.bfloat16 + fp8_dtype = torch.float8_e4m3fn + + def __init__(self, in_features: int, out_features: int, bias: bool = True, device=None): + super().__init__(in_features, out_features, bias=bias) + self.in_features = in_features + self.out_features = out_features + self.weight = nn.Parameter( + torch.empty(out_features, in_features, dtype=FP8QDQLinear.fp8_dtype), requires_grad=True + ) + self.scale_weight = nn.Parameter(torch.tensor(0, dtype=FP8QDQLinear.dtype), requires_grad=False) + if bias: + self.bias = nn.Parameter(torch.empty(out_features)) + else: + self.register_parameter("bias", None) + + def dequant_weight_online(self): + fp8_weight = self.weight + qdq_weight = fp8_weight.to(FP8QDQLinear.dtype) * self.scale_weight + return qdq_weight + + def qdq_input(self, bf16_input: torch.Tensor): + input_scale, input_fp8 = quant_tensor(bf16_input) + qdq_input_bf16 = input_fp8.to(FP8QDQLinear.dtype) * input_scale + return qdq_input_bf16 + + @classmethod + def create_from_linear(cls, linear: nn.Linear): + qdq_linear = cls(linear.in_features, linear.out_features) + qdq_linear.weight.data = linear.weight.data + if linear.bias is not None: + qdq_linear.bias = linear.bias + return qdq_linear + + def forward(self, bf16_input: torch.Tensor) -> torch.Tensor: + qdq_input = self.qdq_input(bf16_input) + qdq_weight = self.dequant_weight_online() + out = torch.nn.functional.linear(qdq_input, qdq_weight, self.bias) + return out + + +def patch_lin(): + logger.warning("Patching torch.nn.Linear to FP8QDQLinear") + torch.nn.Linear = FP8QDQLinear + + +def qdq_eval(model_path, not_patch_lin=False): + import transformers + from transformers.modeling_utils import no_init_weights + from patch_for_ds import patch_transformers + + if not not_patch_lin: + patch_lin() + + def _patch__initialize_weights(self, module): + print(f"Skipping init_weights ") + module._is_hf_initialized = True + + transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights + patch_transformers() + with no_init_weights(): + model = transformers.AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype="auto", + low_cpu_mem_usage=True, + trust_remote_code=True, + ) + logger.info(f"Patched model: {model}") + model.eval() + tokenizer = transformers.AutoTokenizer.from_pretrained(model_path) + prompt = "Hi, who" + encode = tokenizer.encode(prompt, return_tensors="pt") + with torch.no_grad(): + output_tokens = model.generate(encode, max_length=10) + output = tokenizer.decode(output_tokens[0], skip_special_tokens=True) + logger.info(f"Prompt: {prompt}") + logger.info(f"Output: {output}") + + +if __name__ == "__main__": + import argparse + + parser = argparse.ArgumentParser() + parser.add_argument("--qmodel_path", type=str, required=True) + parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model") + args = parser.parse_args() + qdq_eval(args.qmodel_path, not_patch_lin=args.not_patch_lin) diff --git a/examples/ds/patch_for_ds.py 
b/examples/ds/patch_for_ds.py index 9375ac6d496..d82251e3e62 100644 --- a/examples/ds/patch_for_ds.py +++ b/examples/ds/patch_for_ds.py @@ -1,5 +1,5 @@ # ==--------------------------------------------------------------------------== -# Patch for loading DS models +# Patch for loading DS models from transformers from typing import Union, Optional import torch import os @@ -101,7 +101,7 @@ def load_state_dict( "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True." ) - +# https://github.com/huggingface/transformers/pull/35493 def set_initialized_submodules(model, state_dict_keys): """ Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state From 0054ca01f724ed4e57021a31a88c28adae64846a Mon Sep 17 00:00:00 2001 From: Yi Liu Date: Tue, 18 Feb 2025 12:40:58 +0200 Subject: [PATCH 23/25] eval bf16 model Change-Id: I7eeb46b5a0eb202b3f672fb75449197041e5949c Signed-off-by: Yi Liu --- ds/infer_bf16.py | 180 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 180 insertions(+) create mode 100644 ds/infer_bf16.py diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py new file mode 100644 index 00000000000..5b0b7d3b652 --- /dev/null +++ b/ds/infer_bf16.py @@ -0,0 +1,180 @@ +# ==--------------------------------------------------------------------------== +# Patch for loading DS models +from typing import Union, Optional +import torch +import os +from packaging import version +from zipfile import is_zipfile +from transformers.utils import is_safetensors_available, strtobool +from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled + +if is_safetensors_available(): + from safetensors import safe_open + from safetensors.torch import load_file as safe_load_file + from safetensors.torch import save_file as safe_save_file + + +def is_fsdp_enabled(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and strtobool(os.environ.get("ACCELERATE_USE_FSDP", "False")) == 1 + and strtobool(os.environ.get("FSDP_CPU_RAM_EFFICIENT_LOADING", "False")) == 1 + ) + + +def is_local_dist_rank_0(): + return ( + torch.distributed.is_available() + and torch.distributed.is_initialized() + and int(os.environ.get("LOCAL_RANK", -1)) == 0 + ) + + +def load_state_dict( + checkpoint_file: Union[str, os.PathLike], + is_quantized: bool = False, + map_location: Optional[Union[str, torch.device]] = None, + weights_only: bool = True, +): + """ + Reads a PyTorch checkpoint file, returning properly formatted errors if they arise. + """ + + if checkpoint_file.endswith(".safetensors") and is_safetensors_available(): + # Check format of the archive + with safe_open(checkpoint_file, framework="pt") as f: + metadata = f.metadata() + if metadata is not None and metadata.get("format") not in ["pt", "tf", "flax", "mlx"]: + raise OSError( + f"The safetensors archive passed at {checkpoint_file} does not contain the valid metadata. Make sure " + "you save your model with the `save_pretrained` method." + ) + return safe_load_file(checkpoint_file) + try: + if map_location is None: + if ( + ( + is_deepspeed_zero3_enabled() + and torch.distributed.is_initialized() + and torch.distributed.get_rank() > 0 + ) + or (is_fsdp_enabled() and not is_local_dist_rank_0()) + ) and not is_quantized: + map_location = "meta" + else: + map_location = "cpu" + extra_args = {} + # mmap can only be used with files serialized with zipfile-based format. 
+        if (
+            isinstance(checkpoint_file, str)
+            and map_location != "meta"
+            and version.parse(torch.__version__) >= version.parse("2.1.0")
+            and is_zipfile(checkpoint_file)
+        ):
+            extra_args = {"mmap": True}
+        weights_only_kwarg = {"weights_only": weights_only}
+        return torch.load(
+            checkpoint_file,
+            map_location=map_location,
+            **weights_only_kwarg,
+            **extra_args,
+        )
+    except Exception as e:
+        try:
+            with open(checkpoint_file) as f:
+                if f.read(7) == "version":
+                    raise OSError(
+                        "You seem to have cloned a repository without having git-lfs installed. Please install "
+                        "git-lfs and run `git lfs install` followed by `git lfs pull` in the folder "
+                        "you cloned."
+                    )
+                else:
+                    raise ValueError(
+                        f"Unable to locate the file {checkpoint_file} which is necessary to load this pretrained "
+                        "model. Make sure you have saved the model properly."
+                    ) from e
+        except (UnicodeDecodeError, ValueError):
+            raise OSError(
+                f"Unable to load weights from pytorch checkpoint file for '{checkpoint_file}' "
+                f"at '{checkpoint_file}'. "
+                "If you tried to load a PyTorch model from a TF 2.0 checkpoint, please set from_tf=True."
+            )
+
+
+def set_initialized_submodules(model, state_dict_keys):
+    """
+    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
+    dict.
+    """
+    state_dict_keys = set(state_dict_keys)
+    not_initialized_submodules = {}
+    for module_name, module in model.named_modules():
+        if module_name == "":
+            # When checking if the root module is loaded there's no need to prepend module_name.
+            module_keys = set(module.state_dict())
+        else:
+            module_keys = {f"{module_name}.{k}" for k in module.state_dict()}
+        if module_keys.issubset(state_dict_keys):
+            module._is_hf_initialized = True
+        else:
+            not_initialized_submodules[module_name] = module
+    return not_initialized_submodules
+
+
+# ==--------------------------------------------------------------------------==
+
+
+def patch_transformers():
+    import transformers
+
+    transformers.modeling_utils.load_state_dict = load_state_dict
+    transformers.modeling_utils.set_initialized_submodules = set_initialized_submodules
+
+
+import logging
+
+logger = logging.getLogger(__name__)
+
+
+def eval(model_path):
+    import transformers
+    from transformers.modeling_utils import no_init_weights
+    # from patch_for_ds import patch_transformers
+
+    # if not not_patch_lin:
+    #     patch_lin()
+
+    def _patch__initialize_weights(self, module):
+        print(f"Skipping init_weights ")
+        module._is_hf_initialized = True
+
+    transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights
+    patch_transformers()
+    with no_init_weights():
+        model = transformers.AutoModelForCausalLM.from_pretrained(
+            model_path,
+            torch_dtype="auto",
+            low_cpu_mem_usage=True,
+            trust_remote_code=True,
+        )
+    logger.info(f"Patched model: {model}")
+    model.eval()
+    tokenizer = transformers.AutoTokenizer.from_pretrained(model_path)
+    prompt = "Hi, who"
+    encode = tokenizer.encode(prompt, return_tensors="pt")
+    with torch.no_grad():
+        output_tokens = model.generate(encode, max_length=10)
+    output = tokenizer.decode(output_tokens[0], skip_special_tokens=True)
+    logger.info(f"Prompt: {prompt}")
+    logger.info(f"Output: {output}")
+
+
+if __name__ == "__main__":
+    import argparse
+
+    parser = argparse.ArgumentParser()
+    parser.add_argument("-m", "--qmodel_path", type=str, required=True)
+    parser.add_argument("--not_patch_lin", action="store_true", help="Measure float model")
+    args = parser.parse_args()
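+    # --not_patch_lin is parsed above but eval() does not use it; this script always evaluates the
+    # checkpoint as-is (BF16), without any FP8 QDQ patching of torch.nn.Linear.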
+    eval(args.qmodel_path)

From 49ec5a2e7b3ad23c67c199a83078617a3b13e609 Mon Sep 17 00:00:00 2001
From: Yi Liu
Date: Tue, 18 Feb 2025 12:50:01 +0200
Subject: [PATCH 24/25] use info level

Change-Id: I156d0083a8cf29ba97450a8c5a2c047286ae2f4e
Signed-off-by: Yi Liu
---
 ds/infer_bf16.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py
index 5b0b7d3b652..2accfea138f 100644
--- a/ds/infer_bf16.py
+++ b/ds/infer_bf16.py
@@ -135,7 +135,7 @@ def patch_transformers():
 import logging
 
 logger = logging.getLogger(__name__)
-
+logger.setLevel(logging.INFO)
 
 def eval(model_path):
     import transformers

From cb2d5483f9d52836cd9e38bc7d654340d66322bc Mon Sep 17 00:00:00 2001
From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com>
Date: Tue, 18 Feb 2025 10:51:43 +0000
Subject: [PATCH 25/25] [pre-commit.ci] auto fixes from pre-commit.com hooks

for more information, see https://pre-commit.ci
---
 ds/infer_bf16.py | 24 +++++++++++-------------
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/ds/infer_bf16.py b/ds/infer_bf16.py
index 2accfea138f..6fc88d11d0a 100644
--- a/ds/infer_bf16.py
+++ b/ds/infer_bf16.py
@@ -1,12 +1,13 @@
 # ==--------------------------------------------------------------------------==
 # Patch for loading DS models
-from typing import Union, Optional
-import torch
 import os
-from packaging import version
+from typing import Optional, Union
 from zipfile import is_zipfile
-from transformers.utils import is_safetensors_available, strtobool
+
+import torch
+from packaging import version
 from transformers.integrations import PeftAdapterMixin, deepspeed_config, is_deepspeed_zero3_enabled
+from transformers.utils import is_safetensors_available, strtobool
 
 if is_safetensors_available():
     from safetensors import safe_open
@@ -37,9 +38,7 @@ def load_state_dict(
     map_location: Optional[Union[str, torch.device]] = None,
     weights_only: bool = True,
 ):
-    """
-    Reads a PyTorch checkpoint file, returning properly formatted errors if they arise.
-    """
+    """Reads a PyTorch checkpoint file, returning properly formatted errors if they arise."""
 
     if checkpoint_file.endswith(".safetensors") and is_safetensors_available():
         # Check format of the archive
@@ -103,10 +102,8 @@ def load_state_dict(
 
 
 def set_initialized_submodules(model, state_dict_keys):
-    """
-    Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
-    dict.
-    """
+    """Sets the `_is_hf_initialized` flag in all submodules of a given model when all its weights are in the loaded state
+    dict."""
     state_dict_keys = set(state_dict_keys)
     not_initialized_submodules = {}
     for module_name, module in model.named_modules():
@@ -137,16 +134,17 @@ def patch_transformers():
 
 logger = logging.getLogger(__name__)
 logger.setLevel(logging.INFO)
+
 
 def eval(model_path):
     import transformers
     from transformers.modeling_utils import no_init_weights
-    # from patch_for_ds import patch_transformers
 
+    # from patch_for_ds import patch_transformers
     # if not not_patch_lin:
     #     patch_lin()
 
     def _patch__initialize_weights(self, module):
-        print(f"Skipping init_weights ")
+        print("Skipping init_weights ")
         module._is_hf_initialized = True
 
     transformers.modeling_utils.PreTrainedModel._initialize_weights = _patch__initialize_weights