
Commit f53a058

[Bugfix] Fix prompt format of GLM4V (vllm-project#14539)
Signed-off-by: DarkLight1337 <[email protected]>
1 parent b1cc4df commit f53a058

7 files changed (+43, -19 lines)

tests/models/decoder_only/vision_language/test_models.py

+11 -3

```diff
@@ -254,13 +254,21 @@
     "glm4v": VLMTestInfo(
         models=["THUDM/glm-4v-9b"],
         test_type=VLMTestType.IMAGE,
-        prompt_formatter=identity,
-        img_idx_to_prompt=lambda idx: "",
+        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",  # noqa: E501
+        single_image_prompts=IMAGE_ASSETS.prompts({
+            "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
+            "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
+        }),
         max_model_len=2048,
         max_num_seqs=2,
         dtype="bfloat16",
         get_stop_token_ids=lambda tok: [151329, 151336, 151338],
-        patch_hf_runner=model_utils.glm_patch_hf_runner,
+        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
+        # The image embeddings match with HF but the outputs of the language
+        # decoder are only consistent up to 2 decimal places.
+        # So, we need to reduce the number of tokens for the test to pass.
+        max_tokens=8,
+        num_logprobs=10,
         marks=[large_gpu_mark(min_gb=32)],
     ),
     "h2ovl": VLMTestInfo(
```

tests/models/decoder_only/vision_language/vlm_utils/core.py

+3 -1

```diff
@@ -61,7 +61,9 @@ def run_test(
     # if we run HF first, the cuda initialization will be done and it
     # will hurt multiprocessing backend with fork method (the default method).

-    vllm_runner_kwargs_: dict[str, Any] = {}
+    vllm_runner_kwargs_: dict[str, Any] = {
+        "disable_mm_preprocessor_cache": True,
+    }
     if model_info.tokenizer:
         vllm_runner_kwargs_["tokenizer"] = model_info.tokenizer
     if model_info.tokenizer_mode:
```
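As context, `disable_mm_preprocessor_cache` is a regular vLLM engine argument, so the default seeded here should ultimately reach the `LLM` constructor through the runner's kwargs. A rough sketch of the equivalent direct call, assuming the kwargs are forwarded unchanged (the model name and limits below are placeholders, not taken from this file):

```python
# Rough sketch (not from the diff): passing the same flag directly to vLLM.
# disable_mm_preprocessor_cache=True turns off caching of multimodal
# preprocessing outputs between requests.
from vllm import LLM

llm = LLM(
    model="THUDM/glm-4v-9b",             # placeholder model
    trust_remote_code=True,              # needed for this particular checkpoint
    disable_mm_preprocessor_cache=True,  # the default added by this commit
    max_model_len=2048,
    max_num_seqs=2,
)
```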

tests/models/decoder_only/vision_language/vlm_utils/model_utils.py

+13 -5

```diff
@@ -316,21 +316,29 @@ def processor(*args, **kwargs):
     return hf_model


-def glm_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
-    """Patches and returns an instance of the HfRunner to use for GLM4."""
+def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
+    """Patches and returns an instance of the HfRunner to use for GLM4V."""
     hf_processor = hf_model.processor
     patch_padding_side(hf_processor)

     def processor(*args, text="", images=None, **kwargs):
         if images is None:
             return hf_processor(*args, **kwargs)

+        images = [images] if isinstance(images, Image) else images
+
+        contents = re.findall(
+            r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
+            text,
+        )
+        assert len(contents) == len(images)
+
         return hf_processor.apply_chat_template(
             [{
                 "role": "user",
-                "image": images,
-                "content": text
-            }],
+                "image": image,
+                "content": content
+            } for image, content in zip(images, contents)],
             add_generation_prompt=True,
             tokenize=True,
             return_dict=True,
```
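To see what the new regex is doing, here is a small standalone sketch that splits a formatted multi-image prompt into per-image contents, mirroring the `re.findall` call above (the prompt text itself is made up for illustration):

```python
import re

# A prompt shaped like the formatter output in test_models.py: two user turns,
# each with one image placeholder. The questions are invented for this example.
text = (
    "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
    "What's in this image?<|assistant|>"
    "<|user|>\n<|begin_of_image|><|endoftext|><|end_of_image|>"
    "And in this one?<|assistant|>"
)

contents = re.findall(
    r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
    text,
)
print(contents)
# ["What's in this image?", 'And in this one?']
```

The length assertion in the patched processor then guarantees one extracted question per image before they are zipped into separate chat messages.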

vllm/config.py

+8 -4

```diff
@@ -286,14 +286,18 @@ def __init__(
         if rope_scaling is not None:
             hf_override: dict[str, Any] = {"rope_scaling": rope_scaling}
             hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-scaling` will be removed in a future release. "
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            hf_overrides_str = json.dumps(hf_overrides)
+            msg = (
+                "`--rope-scaling` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)
         if rope_theta is not None:
             hf_override = {"rope_theta": rope_theta}
             hf_overrides_kw.update(hf_override)
-            msg = ("`--rope-theta` will be removed in a future release. "
-                   f"'Please instead use `--hf-overrides '{hf_override!r}'`")
+            hf_overrides_str = json.dumps(hf_overrides)
+            msg = (
+                "`--rope-theta` will be removed in a future release. "
+                f"'Please instead use `--hf-overrides '{hf_overrides_str}'`")
             warnings.warn(DeprecationWarning(msg), stacklevel=2)

         self.maybe_pull_model_tokenizer_for_s3(model, tokenizer)
```
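The switch from `{hf_override!r}` to `json.dumps` matters because `repr()` of a dict uses Python single quotes, which is not valid JSON and so cannot be pasted into `--hf-overrides` as the message suggests. A quick illustration, with example override values that are not taken from the diff:

```python
import json

# Example override dict; the values here are only illustrative.
hf_overrides = {"rope_scaling": {"rope_type": "dynamic", "factor": 2.0}}

print(repr(hf_overrides))
# {'rope_scaling': {'rope_type': 'dynamic', 'factor': 2.0}}   <- single quotes, not valid JSON

print(json.dumps(hf_overrides))
# {"rope_scaling": {"rope_type": "dynamic", "factor": 2.0}}   <- JSON suitable for --hf-overrides
```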

vllm/entrypoints/chat_utils.py

+4 -3

```diff
@@ -403,16 +403,17 @@ def _placeholder_str(self, modality: ModalityStr,
         hf_config = self._model_config.hf_config
         model_type = hf_config.model_type

-        if modality in ["image", "image_embeds"]:
+        if modality in ("image", "image_embeds"):
+            if model_type == "chatglm":
+                return "<|begin_of_image|><|endoftext|><|end_of_image|>"
             if model_type == "phi3_v":
                 # Workaround since this token is not defined in the tokenizer
                 return f"<|image_{current_count}|>"
             if model_type == "phi4mm":
                 return "<|endoftext10|>"  # 200010 (see vocab.json in hf model)
             if model_type in ("minicpmo", "minicpmv"):
                 return "(<image>./</image>)"
-            if model_type in ("blip-2", "chatglm", "fuyu", "paligemma",
-                              "pixtral"):
+            if model_type in ("blip-2", "fuyu", "paligemma", "pixtral"):
                 # These models do not use image tokens in the prompt
                 return None
             if model_type == "qwen":
```

vllm/model_executor/models/chatglm.py

+2 -1

```diff
@@ -2,6 +2,7 @@
 # Adapted from
 # https://github.com/THUDM/ChatGLM2-6B
 """Inference-only ChatGLM model compatible with THUDM weights."""
+import json
 from typing import Iterable, Optional, Set, Tuple, Union

 import torch
@@ -463,7 +464,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             "The configuration of this model indicates that it supports "
             "vision inputs, but you instantiated the text-only version "
             "of this model. Please use the vision model by setting "
-            f"`--hf-overrides {hf_overrides!r}`")
+            f"`--hf-overrides '{json.dumps(hf_overrides)}'`")

         super().__init__(vllm_config=vllm_config, prefix=prefix)

```

vllm/model_executor/models/qwen.py

+2 -2

```diff
@@ -5,7 +5,7 @@
 # Copyright (c) Alibaba Cloud.
 # LICENSE: https://huggingface.co/Qwen/Qwen-7B/blob/main/LICENSE
 """Inference-only QWen model compatible with HuggingFace weights."""
-
+import json
 from typing import Any, Dict, Iterable, Optional, Set, Tuple, Union

 import torch
@@ -354,7 +354,7 @@ def __init__(self, *, vllm_config: VllmConfig, prefix: str = ""):
             "The configuration of this model indicates that it supports "
             "vision inputs, but you instantiated the text-only version "
             "of this model. Please use the vision model by setting "
-            f"`--hf-overrides {hf_overrides!r}`")
+            f"`--hf-overrides '{json.dumps(hf_overrides)}'`")

         super().__init__(vllm_config=vllm_config, prefix=prefix)

```