Skip to content

Commit 5eeabc2

Browse files
[Bugfix] Fix bnb quantization for models with both HF-format and Mistral-format weights (vllm-project#14950)
1 parent 18551e8 commit 5eeabc2

File tree

2 files changed

+27
-6
lines changed

2 files changed

+27
-6
lines changed

tests/quantization/test_bitsandbytes.py

+2
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@
1515

1616
models_4bit_to_test = [
1717
("facebook/opt-125m", "quantize opt model inflight"),
18+
("mistralai/Mistral-7B-Instruct-v0.3",
19+
"quantize inflight model with both HF and Mistral format weights")
1820
]
1921

2022
models_pre_qaunt_4bit_to_test = [

vllm/model_executor/model_loader/loader.py

+25-6
Original file line numberDiff line numberDiff line change
@@ -762,7 +762,7 @@ def _get_weight_files(
762762
model_name_or_path: str,
763763
allowed_patterns: List[str],
764764
revision: Optional[str] = None,
765-
) -> Tuple[List[str], str]:
765+
) -> Tuple[str, List[str], str]:
766766
"""Retrieve weight files. Download the files if necessary.
767767
768768
Return the weight files and the file pattern."""
@@ -773,7 +773,7 @@ def _get_weight_files(
773773
weight_files = glob.glob(
774774
os.path.join(model_name_or_path, pattern))
775775
if weight_files:
776-
return weight_files, pattern
776+
return model_name_or_path, weight_files, pattern
777777
else:
778778
hf_api = HfApi()
779779
repo_files = hf_api.list_repo_files(repo_id=model_name_or_path)
@@ -787,7 +787,8 @@ def _get_weight_files(
787787
revision,
788788
ignore_patterns=self.load_config.ignore_patterns,
789789
)
790-
return glob.glob(os.path.join(hf_folder, pattern)), pattern
790+
return hf_folder, glob.glob(
791+
os.path.join(hf_folder, pattern)), pattern
791792

792793
raise RuntimeError(
793794
f"No model weights found in: `{model_name_or_path}`")
@@ -798,18 +799,36 @@ def _prepare_weights(self, model_name_or_path: str,
798799

799800
allowed_patterns = ["*.safetensors", "*.bin", "*.pt"]
800801

801-
hf_weights_files, matched_pattern = self._get_weight_files(
802+
hf_folder, hf_weights_files, matched_pattern = self._get_weight_files(
802803
model_name_or_path, allowed_patterns, revision)
803804

804-
if matched_pattern != "*.safetensors":
805+
use_safetensors = matched_pattern == "*.safetensors"
806+
is_local = os.path.isdir(model_name_or_path)
807+
index_file = SAFE_WEIGHTS_INDEX_NAME
808+
if use_safetensors:
809+
# For models like Mistral-7B-Instruct-v0.3
810+
# there are both sharded safetensors files and a consolidated
811+
# safetensors file. Using both breaks.
812+
# Here, we download the `model.safetensors.index.json` and filter
813+
# any files not found in the index.
814+
if not is_local:
815+
download_safetensors_index_file_from_hf(
816+
model_name_or_path,
817+
index_file,
818+
self.load_config.download_dir,
819+
revision,
820+
)
821+
hf_weights_files = filter_duplicate_safetensors_files(
822+
hf_weights_files, hf_folder, index_file)
823+
else:
805824
hf_weights_files = filter_files_not_needed_for_inference(
806825
hf_weights_files)
807826

808827
if len(hf_weights_files) == 0:
809828
raise RuntimeError(
810829
f"Cannot find any model weights with `{model_name_or_path}`")
811830

812-
return hf_weights_files, matched_pattern == "*.safetensors"
831+
return hf_weights_files, use_safetensors
813832

814833
def _hf_weight_iter(self, hf_weights_files, use_safetensors: bool):
815834
if use_safetensors:

0 commit comments

Comments (0)