Hi everyone,
I am running into a serious issue with Qwen3-VL, specifically with its KV-cache mechanism: there is a significant discrepancy between the generation results with the KV-cache enabled and with it disabled.
With the KV-cache enabled, output quality degrades noticeably, often collapsing into repetition loops and incoherent sentences. With the KV-cache disabled, the output is normal and high quality.
Here is a comparison of the results:
1. With KV-cache (Degraded):
['This is a close-up photograph of a gray tabby a cat with a Scottish Fold cat with a cat, a gray cat with a gray cat, with large, its face, its face, with large, its eyes, its eyes, its eyes, its eyes, and, and, and, and a large, and a ls eyes, its eyes, its eyes, and, and, and, and a large, and a large, and a large, and a large, and a large, and a large, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a, and a']curious look. Its pink tongue is sticking out, and it appears t
2. Without KV-cache (Normal):
["This is a close-up, eye-level photograph of a grey cat with a very expressive face. The cat has large, round, golden-yellow eyes that are wide open, giving it a startled or sh Fold cat. The background is blurred, which makes the clear curious look. Its pink tongue is sticking out, and it appears to be licking its nose. The cat's fur is a mix of grey and white, with a distinct pattern of stripes and spots. Its ears are folded forward, a characteristic of a Scottish Fold cat. The background is blurred, which makes the cat the clear focus of the image. The overall mood of the image is playful and endearing.\n\n"]
Code Snippet: Here is a sample of the code I am using:
import torch
from transformers import Qwen3VLForConditionalGeneration as QwenVLForConditionalGeneration
from transformers import AutoProcessor
model_path = "Qwen/Qwen3-VL-2B-Instruct"
def greedy_generate_with_kv_cache(
    inputs: dict,
    max_new_tokens: int = 128,
    device: str = "cuda",
):
    dtype = torch.bfloat16 if device == "cuda" else torch.float32
    model = QwenVLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(device)
    model.eval()

    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))
    # everything except input_ids / attention_mask (pixel_values, image_grid_thw, ...)
    static_inputs = {k: v for k, v in inputs.items() if k not in ("input_ids", "attention_mask")}

    eos_token_id = None
    if hasattr(model, "config") and getattr(model.config, "eos_token_id", None) is not None:
        eos_token_id = model.config.eos_token_id

    past_key_values = None
    generated = []
    next_input_ids = input_ids

    for step in range(max_new_tokens):
        with torch.inference_mode():
            if step == 0:
                # prefill: full prompt plus image features, cache enabled
                outputs = model(
                    input_ids=next_input_ids,
                    use_cache=True,
                    past_key_values=None,
                    **static_inputs,
                )
            else:
                # decode: only the newly generated token plus the cache
                # (attention_mask and position information are left to the model's defaults)
                outputs = model(
                    input_ids=next_input_ids,
                    use_cache=True,
                    past_key_values=past_key_values,
                )
        logits = outputs.logits[:, -1, :]
        next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy
        generated.append(next_token)
        past_key_values = outputs.past_key_values
        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
        next_input_ids = next_token
        if eos_token_id is not None and torch.all(next_token == eos_token_id):
            break

    return torch.cat(generated, dim=-1)
def greedy_generate(
    inputs: dict,
    max_new_tokens: int = 128,
    device: str = "cuda",
):
    dtype = torch.bfloat16 if device == "cuda" else torch.float32
    model = QwenVLForConditionalGeneration.from_pretrained(
        model_path,
        torch_dtype=dtype,
    ).to(device)
    model.eval()

    input_ids = inputs["input_ids"]
    attention_mask = inputs.get("attention_mask", torch.ones_like(input_ids))
    static_inputs = {k: v for k, v in inputs.items() if k not in ("input_ids", "attention_mask")}

    eos_token_id = None
    if hasattr(model, "config") and getattr(model.config, "eos_token_id", None) is not None:
        eos_token_id = model.config.eos_token_id

    generated = []
    next_input_ids = input_ids

    for step in range(max_new_tokens):
        with torch.inference_mode():
            # no past_key_values are passed, so every step re-runs the full, growing sequence
            outputs = model(
                input_ids=next_input_ids,
                **static_inputs,
            )
        logits = outputs.logits[:, -1, :]
        next_token = torch.argmax(logits, dim=-1, keepdim=True)  # greedy
        generated.append(next_token)
        attention_mask = torch.cat([attention_mask, torch.ones_like(next_token)], dim=-1)
        next_input_ids = torch.cat([next_input_ids, next_token], dim=-1)
        if eos_token_id is not None and torch.all(next_token == eos_token_id):
            break

    return torch.cat(generated, dim=-1)
def main():
    device = "cuda" if torch.cuda.is_available() else "cpu"
    processor = AutoProcessor.from_pretrained(model_path)

    messages = [
        {
            "role": "user",
            "content": [
                {"type": "image", "image": "./cat.png"},
                {"type": "text", "text": "Describe this image."},
            ],
        }
    ]
    inputs = processor.apply_chat_template(
        messages,
        tokenize=True,
        add_generation_prompt=True,
        return_dict=True,
        return_tensors="pt",
    )
    inputs = {k: v.to(device) for k, v in inputs.items()}

    tokens_with_cache = greedy_generate_with_kv_cache(inputs, max_new_tokens=128, device=device)
    tokens_wo_cache = greedy_generate(inputs, max_new_tokens=128, device=device)

    text_with_cache = processor.batch_decode(tokens_with_cache, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    text_wo_cache = processor.batch_decode(tokens_wo_cache, skip_special_tokens=True, clean_up_tokenization_spaces=False)
    print(text_with_cache)
    print(text_wo_cache)


if __name__ == "__main__":
    main()
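One thing I noticed while writing this up: in the cached decode step of greedy_generate_with_kv_cache I only pass input_ids and past_key_values, leaving the attention mask and position handling to the model's defaults. I am not sure whether that is safe for Qwen3-VL's multimodal position encoding. Below is an untested variant of that decode step that also passes the running attention_mask and an explicit cache_position (both are standard forward arguments in recent transformers, but I have not verified that this changes anything):

            # hypothetical decode-step variant (untested); attention_mask at this point already
            # includes the token generated in the previous step, so the new token's position
            # in the cached sequence is its last index
            cache_position = torch.tensor([attention_mask.shape[1] - 1], device=next_input_ids.device)
            outputs = model(
                input_ids=next_input_ids,
                attention_mask=attention_mask,
                past_key_values=past_key_values,
                cache_position=cache_position,
                use_cache=True,
            )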
Environment:
- transformers version: 4.51.3
- torch version: 2.8.0+cu128
- Model: Qwen/Qwen3-VL-2B-Instruct
I am unsure whether this is caused by an incorrect KV-cache implementation on my part or by a bug in the model/library. Has anyone else run into this?
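For reference, one way to isolate this would be to run the same comparison through the library's own generate() with the cache toggled; this is an untested sketch that reuses model, processor, and inputs as prepared in main() above. If both generate() outputs are fine, the problem is probably in my manual loop; if generate() with use_cache=True degrades the same way, that would point at the model/library.

# sanity-check sketch: built-in greedy generate() with and without the KV-cache
with torch.inference_mode():
    out_cached = model.generate(**inputs, max_new_tokens=128, do_sample=False, use_cache=True)
    out_uncached = model.generate(**inputs, max_new_tokens=128, do_sample=False, use_cache=False)

prompt_len = inputs["input_ids"].shape[1]
print(processor.batch_decode(out_cached[:, prompt_len:], skip_special_tokens=True))
print(processor.batch_decode(out_uncached[:, prompt_len:], skip_special_tokens=True))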
Any insights would be appreciated. Thanks!