Skip to content

Support Qwen3 and Qwen2.5 Omni model quantization#1404

Open
lvliang-intel wants to merge 28 commits into main from
lvl/support_omni
Open

Support Qwen3 and Qwen2.5 Omni model quantization#1404
lvliang-intel wants to merge 28 commits into main from
lvl/support_omni

Conversation

@lvliang-intel
Copy link
Contributor

@lvliang-intel lvliang-intel commented Feb 4, 2026

Description

This update adds quantization support for Qwen3-Omni by integrating a custom MLLM processor and template, implementing dedicated forward logic for thinker/talker calibration, and introducing model-specific block discovery.

Note: This feature requires Transformers >= 5.1.0, as earlier versions contain compatibility issues with Qwen3-Omni.

Type of Change

  • Bug fix
  • New feature
  • Documentation update
  • Performance improvement
  • Code refactoring
  • Other (please specify):

Related Issues

#1387

Fixes or relates to #

Checklist Before Submitting

  • My code has been tested locally.
  • Documentation has been updated as needed.
  • New or updated tests are included where applicable.

lvliang-intel and others added 4 commits February 4, 2026 14:50
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
@wenhuach21
Copy link
Contributor

Thank you for the PR! Could you help verify all inferences (vLLM, Transformers 4, and Transformers 5) before merging?

@lvliang-intel
Copy link
Contributor Author

Quantize:

# Quantize Qwen3-Omni to W4A16 with AutoRound and export in auto_round format.
from auto_round import AutoRound

model_name_or_path = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

# lr=5e-3 / iters=100 are the tuning settings verified in this PR discussion.
ar = AutoRound(
    model=model_name_or_path,
    scheme="W4A16",
    lr=5e-3,
    iters=100,
)
ar.quantize_and_save(format="auto_round", output_dir="tmp_qwen_omni_w4a16")

Inference with transformers 5.1.0

#!/usr/bin/env python3
"""Verify a quantized Qwen3-Omni model with transformers.

Tests text-only, image, audio, and video inputs.
"""

import argparse
import os
import sys
import traceback

import soundfile as sf
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info


# Whether the audio track embedded in video inputs is also fed to the model.
USE_AUDIO_IN_VIDEO = True

# Demo resources from Qwen
DEMO_IMAGE = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
DEMO_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
DEMO_VIDEO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"


# One single-turn conversation per modality combination exercised by the
# verifier: text-only, image, audio, video, and image+audio together.
TEST_CASES = {
    "text_only": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is the capital of France? Answer in one short sentence."},
                ],
            },
        ],
    },
    "image": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": DEMO_IMAGE},
                    {"type": "text", "text": "Describe this image in one short sentence."},
                ],
            },
        ],
    },
    "audio": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": DEMO_AUDIO},
                    {"type": "text", "text": "What sound can you hear? Answer in one short sentence."},
                ],
            },
        ],
    },
    "video": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": DEMO_VIDEO},
                    {"type": "text", "text": "Describe what happens in this video in one short sentence."},
                ],
            },
        ],
    },
    "image_audio": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": DEMO_IMAGE},
                    {"type": "audio", "audio": DEMO_AUDIO},
                    {"type": "text", "text": "What can you see and hear? Answer in one short sentence."},
                ],
            },
        ],
    },
}


def run_test(model, processor, test_name, conversation, max_new_tokens, enable_audio_output):
    """Run one test conversation through the model.

    Prints the decoded text (and saves a .wav when audio output is enabled)
    and returns True if non-empty text was generated, False otherwise.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Test: {test_name}")
    print(f"{banner}")

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audio_in, image_in, video_in = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

    batch = processor(
        text=prompt,
        audio=audio_in,
        images=image_in,
        videos=video_in,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
    ).to(model.device).to(model.dtype)

    gen_kwargs = dict(batch)
    gen_kwargs.update(
        max_new_tokens=max_new_tokens,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        do_sample=False,
    )
    if enable_audio_output:
        gen_kwargs["speaker"] = "Ethan"
        gen_kwargs["thinker_return_dict_in_generate"] = True

    result = model.generate(**gen_kwargs)

    # generate() may hand back a (text_ids, audio) tuple or just the text ids.
    text_ids, audio_out = result if isinstance(result, tuple) else (result, None)

    # With thinker_return_dict_in_generate=True the text result is a
    # GenerateOutput exposing .sequences; otherwise it is a plain tensor.
    sequences = text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
    prompt_len = batch["input_ids"].shape[1]
    decode_ids = sequences[:, prompt_len:]

    generated_text = processor.batch_decode(
        decode_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    print(f"Output: {generated_text}")

    if enable_audio_output and audio_out is not None:
        wav_path = f"output_{test_name}.wav"
        sf.write(wav_path, audio_out.reshape(-1).detach().cpu().numpy(), samplerate=24000)
        print(f"Audio saved to {wav_path}")

    if not generated_text.strip():
        print(f"WARNING: Empty output for test '{test_name}'")
        return False
    return True


def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse sys.argv."""
    test_names = list(TEST_CASES.keys())
    parser = argparse.ArgumentParser(
        description="Verify a quantized Qwen3-Omni model with text, image, audio, and video inputs."
    )
    parser.add_argument(
        "--model-dir",
        required=True,
        help="Path to the quantized model directory.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=256,
        help="Maximum number of new tokens to generate.",
    )
    # Default is every known test; choices restrict values to the known names.
    parser.add_argument(
        "--tests",
        nargs="+",
        default=test_names,
        choices=test_names,
        help="Which tests to run (default: all).",
    )
    parser.add_argument(
        "--enable-audio-output",
        action="store_true",
        default=False,
        help="Enable audio output generation (requires talker model).",
    )
    return parser.parse_args()


def main() -> int:
    """Load the quantized model, run the selected tests, and print a summary.

    Returns 0 when every selected test passes; 1 when any test fails or the
    model directory does not exist.
    """
    args = parse_args()
    model_dir = os.path.abspath(args.model_dir)
    if not os.path.isdir(model_dir):
        print(f"Model directory not found: {model_dir}")
        return 1

    print(f"Loading model from {model_dir} ...")
    # NOTE(review): torch_dtype="auto" with flash_attention_2 triggers the
    # transformers "without specifying a torch dtype" warning seen in the
    # logs — presumably harmless, but confirm the resolved dtype is fp16/bf16.
    model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(model_dir)
    print("Model and processor loaded.\n")

    passed, failed = [], []
    for test_name in args.tests:
        tc = TEST_CASES[test_name]
        try:
            ok = run_test(
                model, processor, test_name, tc["conversation"],
                args.max_new_tokens, args.enable_audio_output,
            )
            (passed if ok else failed).append(test_name)
        except Exception as exc:
            # One failing modality must not abort the remaining tests.
            print(f"ERROR in test '{test_name}': {exc}")
            traceback.print_exc()
            failed.append(test_name)

    print(f"\n{'='*60}")
    print(f"Results: {len(passed)} passed, {len(failed)} failed out of {len(args.tests)} tests")
    if passed:
        print(f"  Passed: {', '.join(passed)}")
    if failed:
        print(f"  Failed: {', '.join(failed)}")
    print(f"{'='*60}")

    # Drop the model reference and release cached GPU memory before exit.
    del model
    torch.cuda.empty_cache()
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())


CUDA_VISIBLE_DEVICES=0 python verify_quantized_transformers.py --model-dir ./tmp_qwen_omni_w4a16/
Loading model from /mnt/disk1/lvl/auto-round/tmp_qwen_omni_w4a16 ...
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
2026-02-11 18:50:27 WARNING modeling_utils.py L4356: loss_type=None was set in the config but it is unrecognized. Using the default loss: ForCausalLMLoss.
2026-02-11 18:53:41 INFO moe_experts_interface.py L432: Unfused 68 MOE experts modules for quantization
2026-02-11 18:53:41 INFO replace_modules.py L80: Prepared 68 MOE modules for quantization
2026-02-11 18:53:45 WARNING backend.py L1088: Better backend is found, please install all the following requirements to enable it.
2026-02-11 18:53:45 WARNING backend.py L1088: pip install -v "gptqmodel>=2.0" --no-build-isolation
Loading weights: 100%|█| 54786/54786 [00:15<00:00, 3647.06it/s, Materiali
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Model and processor loaded.

Test: text_only
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
2026-02-11 18:54:24 WARNING utils.py L2088: The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Output: The capital of France is Paris.

Test: image
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: This composite image displays four different luxury vehicles: a white Rolls-Royce Phantom, a grey Mercedes-Benz GLE SUV, a red Ferrari Portofino M, and a white Porsche 911.

Test: audio
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: The sound of a person coughing is heard.

Test: video
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: A person uses a stylus to draw a guitar on a tablet.

Test: image_audio
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: The image displays four luxury vehicles—a Rolls-Royce, a Mercedes-Benz GLE SUV, a red Ferrari Portofino M, and a white Porsche 911—while the audio features a person coughing.

Results: 5 passed, 0 failed out of 5 tests
Passed: text_only, image, audio, video, image_audio

vLLM tests are currently blocked because the latest vLLM version depends on an outdated Transformers release. Qwen3-Omni requires Transformers >= 5.1.0 to address several known issues.

@lvliang-intel lvliang-intel marked this pull request as ready for review February 11, 2026 11:09
Copilot AI review requested due to automatic review settings February 11, 2026 11:09
Copy link
Contributor

Copilot AI left a comment

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Pull request overview

Adds quantization support for the Qwen3-Omni MoE model family by integrating model-specific loading/version gating, calibration forward behavior for thinker/talker, and custom multimodal block discovery.

Changes:

  • Added explicit Transformers version guard for qwen3_omni_moe.
  • Introduced Qwen3-Omni processor/template registration and model-specific multimodal block name discovery.
  • Implemented a Qwen3-Omni-specific forward path to run thinker (and optionally talker) during calibration.

Reviewed changes

Copilot reviewed 10 out of 10 changed files in this pull request and generated 4 comments.

Show a summary per file
File Description
pyproject.toml Adds a project-specific word to typos’ allowlist.
auto_round/utils/model.py Adds Transformers version guard and adjusts lm_head discovery logic.
auto_round/utils/common.py Adds _no_split_modules normalization and extends multimodal ignore-key lists.
auto_round/special_model_handler.py Adds Qwen3-Omni special forward + block discovery + ignore-layer rule.
auto_round/compressors/shard_writer.py Improves tie_word_embeddings lookup for nested multimodal configs.
auto_round/compressors/mllm/utils.py Extends multimodal ignore-key list for Qwen3-Omni components.
auto_round/compressors/mllm/template.py Registers a Qwen3-Omni model template with the new processor.
auto_round/compressors/mllm/processor.py Adds a custom processor for Qwen3-Omni chat-template inputs.
auto_round/compressors/base.py Imports the new normalization helper.
auto_round/auto_scheme/utils.py Uses normalized _no_split_modules when dispatching across devices.

)

# Run talker forward if available (for calibration purposes)
if hasattr(model, "talker") and model.has_talker:
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This can raise AttributeError when model.has_talker doesn’t exist (the hasattr only checks talker). Use getattr(model, "has_talker", False) (and optionally also ensure model.talker is not None) to make this guard safe.

Suggested change
if hasattr(model, "talker") and model.has_talker:
if getattr(model, "has_talker", False) and getattr(model, "talker", None) is not None:

Copilot uses AI. Check for mistakes.
Comment on lines +238 to +242
# Use text projection to convert thinker embeddings to talker space
if hasattr(model.talker, "text_projection"):
# Get thinker embeddings
thinker_embeds = model.thinker.get_input_embeddings()(input_ids)
talker_inputs_embeds = model.talker.text_projection(thinker_embeds)
Copy link

Copilot AI Feb 11, 2026

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This path assumes input_ids is provided; if calibration runs with inputs_embeds (or other modalities without input_ids), this will throw and then be silently ignored (due to the broad except), meaning the talker forward never runs. Consider deriving inputs from inputs_embeds when present, or projecting from thinker_output.hidden_states[-1] (which you already compute) instead of re-embedding input_ids.

Suggested change
# Use text projection to convert thinker embeddings to talker space
if hasattr(model.talker, "text_projection"):
# Get thinker embeddings
thinker_embeds = model.thinker.get_input_embeddings()(input_ids)
talker_inputs_embeds = model.talker.text_projection(thinker_embeds)
# Use text projection to convert thinker hidden states to talker space
if hasattr(model.talker, "text_projection"):
# Project thinker hidden states directly into the talker embedding space
talker_inputs_embeds = model.talker.text_projection(thinker_hidden)

Copilot uses AI. Check for mistakes.
lvliang-intel and others added 2 commits February 11, 2026 19:20
Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com>
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
@chensuyue chensuyue requested a review from xin3he February 12, 2026 06:29
@wenhuach21
Copy link
Contributor

Quantize:

# Quantize Qwen3-Omni to W4A16 with AutoRound and export in auto_round format.
from auto_round import AutoRound

model_name_or_path = "Qwen/Qwen3-Omni-30B-A3B-Instruct"

# lr=5e-3 / iters=100 are the tuning settings verified in this PR discussion.
ar = AutoRound(
    model=model_name_or_path,
    scheme="W4A16",
    lr=5e-3,
    iters=100,
)
ar.quantize_and_save(format="auto_round", output_dir="tmp_qwen_omni_w4a16")

Inference with transformers 5.1.0

#!/usr/bin/env python3
"""Verify a quantized Qwen3-Omni model with transformers.

Tests text-only, image, audio, and video inputs.
"""

import argparse
import os
import sys
import traceback

import soundfile as sf
import torch
from transformers import Qwen3OmniMoeForConditionalGeneration, Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info


# Whether the audio track embedded in video inputs is also fed to the model.
USE_AUDIO_IN_VIDEO = True

# Demo resources from Qwen
DEMO_IMAGE = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cars.jpg"
DEMO_AUDIO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/cough.wav"
DEMO_VIDEO = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"


# One single-turn conversation per modality combination exercised by the
# verifier: text-only, image, audio, video, and image+audio together.
TEST_CASES = {
    "text_only": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "What is the capital of France? Answer in one short sentence."},
                ],
            },
        ],
    },
    "image": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": DEMO_IMAGE},
                    {"type": "text", "text": "Describe this image in one short sentence."},
                ],
            },
        ],
    },
    "audio": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "audio", "audio": DEMO_AUDIO},
                    {"type": "text", "text": "What sound can you hear? Answer in one short sentence."},
                ],
            },
        ],
    },
    "video": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "video", "video": DEMO_VIDEO},
                    {"type": "text", "text": "Describe what happens in this video in one short sentence."},
                ],
            },
        ],
    },
    "image_audio": {
        "conversation": [
            {
                "role": "user",
                "content": [
                    {"type": "image", "image": DEMO_IMAGE},
                    {"type": "audio", "audio": DEMO_AUDIO},
                    {"type": "text", "text": "What can you see and hear? Answer in one short sentence."},
                ],
            },
        ],
    },
}


def run_test(model, processor, test_name, conversation, max_new_tokens, enable_audio_output):
    """Run one test conversation through the model.

    Prints the decoded text (and saves a .wav when audio output is enabled)
    and returns True if non-empty text was generated, False otherwise.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"Test: {test_name}")
    print(f"{banner}")

    prompt = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
    audio_in, image_in, video_in = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

    batch = processor(
        text=prompt,
        audio=audio_in,
        images=image_in,
        videos=video_in,
        return_tensors="pt",
        padding=True,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
    ).to(model.device).to(model.dtype)

    gen_kwargs = dict(batch)
    gen_kwargs.update(
        max_new_tokens=max_new_tokens,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        do_sample=False,
    )
    if enable_audio_output:
        gen_kwargs["speaker"] = "Ethan"
        gen_kwargs["thinker_return_dict_in_generate"] = True

    result = model.generate(**gen_kwargs)

    # generate() may hand back a (text_ids, audio) tuple or just the text ids.
    text_ids, audio_out = result if isinstance(result, tuple) else (result, None)

    # With thinker_return_dict_in_generate=True the text result is a
    # GenerateOutput exposing .sequences; otherwise it is a plain tensor.
    sequences = text_ids.sequences if hasattr(text_ids, "sequences") else text_ids
    prompt_len = batch["input_ids"].shape[1]
    decode_ids = sequences[:, prompt_len:]

    generated_text = processor.batch_decode(
        decode_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=False,
    )[0]

    print(f"Output: {generated_text}")

    if enable_audio_output and audio_out is not None:
        wav_path = f"output_{test_name}.wav"
        sf.write(wav_path, audio_out.reshape(-1).detach().cpu().numpy(), samplerate=24000)
        print(f"Audio saved to {wav_path}")

    if not generated_text.strip():
        print(f"WARNING: Empty output for test '{test_name}'")
        return False
    return True


def parse_args() -> argparse.Namespace:
    """Build the CLI parser and parse sys.argv."""
    test_names = list(TEST_CASES.keys())
    parser = argparse.ArgumentParser(
        description="Verify a quantized Qwen3-Omni model with text, image, audio, and video inputs."
    )
    parser.add_argument(
        "--model-dir",
        required=True,
        help="Path to the quantized model directory.",
    )
    parser.add_argument(
        "--max-new-tokens",
        type=int,
        default=256,
        help="Maximum number of new tokens to generate.",
    )
    # Default is every known test; choices restrict values to the known names.
    parser.add_argument(
        "--tests",
        nargs="+",
        default=test_names,
        choices=test_names,
        help="Which tests to run (default: all).",
    )
    parser.add_argument(
        "--enable-audio-output",
        action="store_true",
        default=False,
        help="Enable audio output generation (requires talker model).",
    )
    return parser.parse_args()


def main() -> int:
    """Load the quantized model, run the selected tests, and print a summary.

    Returns 0 when every selected test passes; 1 when any test fails or the
    model directory does not exist.
    """
    args = parse_args()
    model_dir = os.path.abspath(args.model_dir)
    if not os.path.isdir(model_dir):
        print(f"Model directory not found: {model_dir}")
        return 1

    print(f"Loading model from {model_dir} ...")
    # NOTE(review): torch_dtype="auto" with flash_attention_2 triggers the
    # transformers "without specifying a torch dtype" warning seen in the
    # logs — presumably harmless, but confirm the resolved dtype is fp16/bf16.
    model = Qwen3OmniMoeForConditionalGeneration.from_pretrained(
        model_dir,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
        attn_implementation="flash_attention_2",
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(model_dir)
    print("Model and processor loaded.\n")

    passed, failed = [], []
    for test_name in args.tests:
        tc = TEST_CASES[test_name]
        try:
            ok = run_test(
                model, processor, test_name, tc["conversation"],
                args.max_new_tokens, args.enable_audio_output,
            )
            (passed if ok else failed).append(test_name)
        except Exception as exc:
            # One failing modality must not abort the remaining tests.
            print(f"ERROR in test '{test_name}': {exc}")
            traceback.print_exc()
            failed.append(test_name)

    print(f"\n{'='*60}")
    print(f"Results: {len(passed)} passed, {len(failed)} failed out of {len(args.tests)} tests")
    if passed:
        print(f"  Passed: {', '.join(passed)}")
    if failed:
        print(f"  Failed: {', '.join(failed)}")
    print(f"{'='*60}")

    # Drop the model reference and release cached GPU memory before exit.
    del model
    torch.cuda.empty_cache()
    return 1 if failed else 0


if __name__ == "__main__":
    sys.exit(main())

CUDA_VISIBLE_DEVICES=0 python verify_quantized_transformers.py --model-dir ./tmp_qwen_omni_w4a16/
Loading model from /mnt/disk1/lvl/auto-round/tmp_qwen_omni_w4a16 ...
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
2026-02-11 18:50:27 WARNING modeling_utils.py L4356: loss_type=None was set in the config but it is unrecognized. Using the default loss: ForCausalLMLoss.
2026-02-11 18:53:41 INFO moe_experts_interface.py L432: Unfused 68 MOE experts modules for quantization
2026-02-11 18:53:41 INFO replace_modules.py L80: Prepared 68 MOE modules for quantization
2026-02-11 18:53:45 WARNING backend.py L1088: Better backend is found, please install all the following requirements to enable it.
2026-02-11 18:53:45 WARNING backend.py L1088: pip install -v "gptqmodel>=2.0" --no-build-isolation
Loading weights: 100%|█| 54786/54786 [00:15<00:00, 3647.06it/s, Materiali
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Model and processor loaded.
Test: text_only
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
2026-02-11 18:54:24 WARNING utils.py L2088: The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Output: The capital of France is Paris.
Test: image
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: This composite image displays four different luxury vehicles: a white Rolls-Royce Phantom, a grey Mercedes-Benz GLE SUV, a red Ferrari Portofino M, and a white Porsche 911.
Test: audio
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: The sound of a person coughing is heard.
Test: video
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: A person uses a stylus to draw a guitar on a tablet.
Test: image_audio
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Setting pad_token_id to eos_token_id:151645 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: The image displays four luxury vehicles—a Rolls-Royce, a Mercedes-Benz GLE SUV, a red Ferrari Portofino M, and a white Porsche 911—while the audio features a person coughing.
Results: 5 passed, 0 failed out of 5 tests
Passed: text_only, image, audio, video, image_audio

vLLM tests are currently blocked because the latest vLLM version depends on an outdated Transformers release. Qwen3-Omni requires Transformers >= 5.1.0 to address several known issues.

you could update transformers after installing vllm

@lvliang-intel lvliang-intel changed the title Support Qwen3 Omni model quantization Support Qwen3 and Qwen2.5 Omni model quantization Mar 6, 2026
Signed-off-by: lvliang-intel <liang1.lv@intel.com>
@lvliang-intel
Copy link
Contributor Author

Qwen2.5 Omni quantize and inference test pass:

CUDA_VISIBLE_DEVICES=3 python quantize_qwen25_omni.py --model /mnt/disk2/lvl/Qwen2.5-Omni-3B --output tmp_qwen25_omni_w4a16 --iters 200

CUDA_VISIBLE_DEVICES=6 python run_qwen25_omni.py --model-dir tmp_qwen25_omni_w4a16 --enable-audio-output
Loading model from /mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16 ...
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
Qwen2_5OmniToken2WavModel must inference with fp32, but Flash Attention only supports fp16 and bf16, attention implementation of Qwen2_5OmniToken2WavModel will fallback to sdpa.
2026-03-06 09:49:10 INFO device.py L1643: [Memory Monitor] Before Applying general replacements: 'peak_ram': 0.92GB
2026-03-06 09:49:10 WARNING modeling_utils.py L4430: loss_type=None was set in the config but it is unrecognized. Using the default loss: ForCausalLMLoss.
2026-03-06 09:49:10 INFO device.py L1646: [Memory Monitor] After Applying general replacements: 'peak_ram': 0.92GB
2026-03-06 09:49:10 WARNING backend.py L1084: Better backend is found, please install all the following requirements to enable it.
2026-03-06 09:49:10 WARNING backend.py L1084: pip install -v "gptqmodel>=2.0" --no-build-isolation
Loading weights: 100%|██████████████████████████████████████████| 3383/3383 [00:01<00:00, 2609.58it/s]
2026-03-06 09:49:12 INFO common.py L246: Patched Qwen2.5-Omni talker prepare_inputs_for_generation for transformers compat.
Model and processor loaded.

============================================================
Test: text_only

Output: The capital of France is Paris. If you want to know more about Paris or France, feel free to ask.
Audio saved to output_text_only.wav

============================================================
Test: image

Output: This image shows four different luxury cars. One is a white Rolls - Royce, another is a Mercedes - Benz GLE SUV, the third is a red Ferrari Portofino M, and the fourth is a white Porsche. What do you think about these cars?
Audio saved to output_image.wav

============================================================
Test: audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Output: I can hear a cough. What do you think about it?
Audio saved to output_audio.wav

============================================================
Test: video

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
Output: In the video, someone is drawing a guitar on a tablet.
Audio saved to output_video.wav

============================================================
Test: image_audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Output: I can see four different types of cars in the pictures, and I can hear a cough. What do you think about these cars?
Audio saved to output_image_audio.wav

============================================================
Results: 5 passed, 0 failed out of 5 tests
Passed: text_only, image, audio, video, image_audio

Qwen3 Omni quantize and inference test pass:

CUDA_VISIBLE_DEVICES=0 python quantize_qwen3_omni.py --model /mnt/disk2/lvl/Qwen3-VL-30B-A3B-Instruct/ --output ./tmp_qwen3_omni_w4a16

CUDA_VISIBLE_DEVICES=5 python run_qwen3_omni.py --model-dir ./tmp_qwen3_omni_w4a16 --enable-audio-output
Loading model from /mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16 ...
[patch] adding missing talker text_config.num_shared_experts = 128
You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
2026-03-06 09:19:14 INFO replace_modules.py L106: Experts (before replacement) [thinker.model.layers.0.mlp.experts] (Qwen3OmniMoeThinkerTextExperts):
Qwen3OmniMoeThinkerTextExperts(
(act_fn): SiLUActivation()
)
2026-03-06 09:19:14 INFO device.py L1643: [Memory Monitor] Before Applying custom replacements: 'peak_ram': 0.93GB
2026-03-06 09:19:14 INFO replace_modules.py L336: Found 68 modules to replace
Replacing modules: 100%|██████████████████████████████████████████████| 68/68 [00:04<00:00, 15.81it/s]
2026-03-06 09:19:19 INFO replace_modules.py L363: Replaced 68 modules
2026-03-06 09:19:19 INFO device.py L1646: [Memory Monitor] After Applying custom replacements: 'peak_ram': 1.04GB
2026-03-06 09:19:19 INFO device.py L1643: [Memory Monitor] Before Applying general replacements: 'peak_ram': 1.04GB
2026-03-06 09:19:19 WARNING modeling_utils.py L4430: loss_type=None was set in the config but it is unrecognized. Using the default loss: ForCausalLMLoss.
2026-03-06 09:19:22 INFO device.py L1646: [Memory Monitor] After Applying general replacements: 'peak_ram': 1.04GB
2026-03-06 09:19:22 INFO replace_modules.py L106: Experts (after replacement) [thinker.model.layers.0.mlp.experts] (SequentialQwen3OmniThinkerExperts):
SequentialQwen3OmniThinkerExperts(
(0-127): 128 x Qwen3OmniMoeThinkerTextMLP(
(gate_proj): Linear(in_features=2048, out_features=768, bias=False)
(up_proj): Linear(in_features=2048, out_features=768, bias=False)
(down_proj): Linear(in_features=768, out_features=2048, bias=False)
(act_fn): SiLUActivation()
)
)
2026-03-06 09:19:27 WARNING backend.py L1084: Better backend is found, please install all the following requirements to enable it.
2026-03-06 09:19:27 WARNING backend.py L1084: pip install -v "gptqmodel>=2.0" --no-build-isolation
Loading weights: 100%|████████████████████████████████████████| 80898/80898 [00:10<00:00, 7577.17it/s]
2026-03-06 09:19:55 INFO common.py L342: Patched Qwen3-Omni MoE talker prepare_inputs_for_generation for transformers compat.
Model and processor loaded.

============================================================
Test: text_only

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
2026-03-06 09:20:08 WARNING utils.py L1988: The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Output: The capital of France is Paris.
Audio saved to output_text_only.wav

============================================================
Test: image

The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: This composite image displays four different luxury vehicles: a white Rolls-Royce sedan, a grey Mercedes-Benz GLE SUV, a red Ferrari Portofino M convertible, and a white Porsche 911 coupe.
Audio saved to output_image.wav

============================================================
Test: audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: A person is coughing.
Audio saved to output_audio.wav

============================================================
Test: video

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: A person uses a stylus to draw a guitar on a tablet.
Audio saved to output_video.wav

============================================================
Test: image_audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's attention_mask to obtain reliable results.
Setting pad_token_id to eos_token_id:2150 for open-end generation.
Output: The image displays four luxury cars—a Rolls-Royce, a Mercedes-Benz GLE SUV, a red Ferrari Portofino M, and a white Porsche 911—while the audio captures a person coughing.
Audio saved to output_image_audio.wav

============================================================
Results: 5 passed, 0 failed out of 5 tests
Passed: text_only, image, audio, video, image_audio



SPECIAL_MULTIMODAL_BLOCK = {"deepseek_vl_v2": _get_deepseek_vl2_multimodal_block}
def _get_qwen2_5_omni_multimodal_block(model, quant_vision=False):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Since the code for these two models has grown to 300+ lines, it is making the main file quite cluttered. Shall we refactor this file later?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Sure, we will refactor this file later.

@wenhuach21
Copy link
Contributor

Awesome work, Liang Ge!

@lvliang-intel
Copy link
Contributor Author

vLLM inference test with the Qwen2.5 Omni quantized model; accuracy is good.

CUDA_VISIBLE_DEVICES=5 python run_qwen25_omni_vllm.py --model-dir ./tmp_qwen25_omni_w4a16
Warning: failed to apply auto_round_extension directly (No module named 'vllm.model_executor.layers.quantization.auto_round').
Fallback: set VLLM_ENABLE_AR_EXT=1 and continue.
Loading vLLM model from: /mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16
VLLM_ENABLE_AR_EXT=1
INFO 03-06 11:23:09 [utils.py:228] non-default args: {'trust_remote_code': True, 'disable_log_stats': True, 'quantization': 'auto-round', 'allow_deprecated_quantization': True, 'model': '/mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16'}
WARNING 03-06 11:23:09 [envs.py:1706] Unknown vLLM environment variable detected: VLLM_ENABLE_AR_EXT
The argument trust_remote_code is to be used with Auto classes. It has no effect here and is ignored.
The argument trust_remote_code is to be used with Auto classes. It has no effect here and is ignored.
Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_section'}
INFO 03-06 11:23:09 [model.py:531] Resolved architecture: Qwen2_5OmniModel
INFO 03-06 11:23:09 [model.py:1554] Using max model len 32768
INFO 03-06 11:23:10 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 03-06 11:23:10 [vllm.py:747] Asynchronous scheduling is enabled.
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:28 [core.py:101] Initializing a V1 LLM engine (v0.17.0rc1.dev92+gc012a8c47) with config: model='/mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16', speculative_config=None, tokenizer='/mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=inc, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 
'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:29 [parallel_state.py:1395] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.112.228.151:35297 backend=nccl
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:30 [parallel_state.py:1717] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank N/A, EPLB rank N/A
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:36 [gpu_model_runner.py:4261] Starting to load model /mnt/disk1/lvl/auto-round-main/tmp_qwen25_omni_w4a16...
(EngineCore_DP0 pid=3401586) You are attempting to use Flash Attention 2 without specifying a torch dtype. This might lead to unexpected behaviour
(EngineCore_DP0 pid=3401586) :1241: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
(EngineCore_DP0 pid=3401586) :1241: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:36 [cuda.py:453] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:36 [mm_encoder_attention.py:215] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:37 [vllm.py:747] Asynchronous scheduling is enabled.
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:37 [gptq_marlin.py:376] Using MarlinLinearKernel for GPTQMarlinLinearMethod
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:37 [cuda.py:405] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:37 [flash_attn.py:593] Using FlashAttention version 2
Loading safetensors checkpoint shards: 0% Completed | 0/8 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 12% Completed | 1/8 [00:00<00:05, 1.26it/s]
Loading safetensors checkpoint shards: 25% Completed | 2/8 [00:01<00:02, 2.12it/s]
Loading safetensors checkpoint shards: 38% Completed | 3/8 [00:01<00:01, 2.81it/s]
Loading safetensors checkpoint shards: 50% Completed | 4/8 [00:01<00:01, 3.39it/s]
Loading safetensors checkpoint shards: 62% Completed | 5/8 [00:01<00:00, 3.84it/s]
Loading safetensors checkpoint shards: 75% Completed | 6/8 [00:01<00:00, 4.77it/s]
Loading safetensors checkpoint shards: 100% Completed | 8/8 [00:01<00:00, 4.48it/s]
(EngineCore_DP0 pid=3401586)
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:39 [default_loader.py:293] Loading weights took 1.80 seconds
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:39 [gpu_model_runner.py:4344] Model loading took 5.0 GiB memory and 2.319511 seconds
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:39 [gpu_model_runner.py:5260] Encoder cache will be initialized with a budget of 32768 tokens, and profiled with 1 video items of the maximum feature size.
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:44 [decorators.py:465] Directly load AOT compilation from path /home/lianglv/.cache/vllm/torch_compile_cache/torch_aot_compile/a673b33b5a60a7ad62b531b8f037704f82238a10a2568f87fd1453da9f308ff9/rank_0_0/model
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:44 [backends.py:913] Using cache directory: /home/lianglv/.cache/vllm/torch_compile_cache/4f036895df/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:44 [backends.py:973] Dynamo bytecode transform time: 3.10 s
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:46 [backends.py:283] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 1.475 s
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:46 [monitor.py:35] torch.compile and initial profiling run took 5.03 s in total
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:46 [gpu_worker.py:424] Available KV cache memory: 65.65 GiB
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:46 [kv_cache_utils.py:1314] GPU KV cache size: 1,912,224 tokens
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:46 [kv_cache_utils.py:1319] Maximum concurrency for 32,768 tokens per request: 58.36x
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|████████| 51/51 [00:01<00:00, 25.77it/s]
Capturing CUDA graphs (decode, FULL): 100%|███████████████████████████| 35/35 [00:01<00:00, 27.89it/s]
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:50 [gpu_model_runner.py:5366] Graph capturing finished in 4 secs, took 0.64 GiB
(EngineCore_DP0 pid=3401586) INFO 03-06 11:23:50 [core.py:282] init engine (profile, create kv cache, warmup model) took 11.30 seconds
INFO 03-06 11:23:51 [llm.py:388] Supported tasks: ['generate']

============================================================
Test: text_only

Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 79.54it/s]
Processed prompts: 100%|█| 1/1 [00:00<00:00, 4.36it/s, est. speed input: 231.49 toks/s, output: 165.9
Output: The capital of France is Paris. It's a really big and important city. If you want to know more about Paris, like its famous landmarks or culture, just let me know.

============================================================
Test: image

Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.95it/s]
Processed prompts: 100%|█| 1/1 [00:01<00:00, 1.12s/it, est. speed input: 7047.24 toks/s, output: 48.9
Output: This image shows four different luxury cars. One is a white Rolls - Royce, another is a Mercedes - Benz GLE SUV, the third is a red Ferrari Portofino M, and the fourth is a white Porsche. What do you think about these cars?

============================================================
Test: audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 7.68it/s]
Processed prompts: 100%|█| 1/1 [00:00<00:00, 5.08it/s, est. speed input: 685.88 toks/s, output: 71.12
Output: I can hear a cough. What do you think about it?

============================================================
Test: video

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
[2026-03-06 11:24:48] INFO vision_process.py:213: torchvision: video_path='https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4', total_frames=197, video_fps=30.0, time=20.513s
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.86it/s]
Processed prompts: 100%|█| 1/1 [00:00<00:00, 1.47it/s, est. speed input: 6448.62 toks/s, output: 58.8
Output: In the video, someone is drawing a guitar on a tablet using a stylus. It looks like they're really focused on their work. What do you think about the way they're drawing?

============================================================
Test: image_audio

/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 55.07it/s]
Processed prompts: 100%|█| 1/1 [00:00<00:00, 6.74it/s, est. speed input: 54052.29 toks/s, output: 189
Output: I can see four different types of cars in the pictures, and I can hear a cough. What do you think about these cars?

============================================================
Model: Qwen2.5-Omni (vLLM)
Results: 5 passed, 0 failed out of 5 tests
Passed: text_only, image, audio, video, image_audio

[rank0]:[W306 11:25:00.572522223 ProcessGroupNCCL.cpp:1553] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator())

@lvliang-intel
Copy link
Contributor Author

vLLM inference test with the Qwen3 Omni quantized model: accuracy is not good. This looks like a vLLM issue, since the Transformers inference test is good for Qwen3 Omni.

CUDA_VISIBLE_DEVICES=5 python run_qwen3_omni_vllm.py --model-dir ./tmp_qwen3_omni_w4a16
Warning: failed to apply auto_round_extension directly (No module named 'vllm.model_executor.layers.quantization.auto_round').
Fallback: set VLLM_ENABLE_AR_EXT=1 and continue.
Loading vLLM model from: /mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16
VLLM_ENABLE_AR_EXT=1
tokenizer_mode=slow
INFO 03-06 11:27:42 [utils.py:228] non-default args: {'tokenizer_mode': 'slow', 'trust_remote_code': True, 'disable_log_stats': True, 'quantization': 'auto-round', 'allow_deprecated_quantization': True, 'model': '/mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16'}
WARNING 03-06 11:27:42 [envs.py:1706] Unknown vLLM environment variable detected: VLLM_ENABLE_AR_EXT
The argument trust_remote_code is to be used with Auto classes. It has no effect here and is ignored.
The argument trust_remote_code is to be used with Auto classes. It has no effect here and is ignored.
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_interleaved', 'interleaved', 'mrope_section'}
INFO 03-06 11:27:42 [model.py:531] Resolved architecture: Qwen3OmniMoeForConditionalGeneration
INFO 03-06 11:27:42 [model.py:1554] Using max model len 65536
INFO 03-06 11:27:42 [scheduler.py:231] Chunked prefill is enabled with max_num_batched_tokens=8192.
INFO 03-06 11:27:42 [vllm.py:747] Asynchronous scheduling is enabled.
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:00 [core.py:101] Initializing a V1 LLM engine (v0.17.0rc1.dev92+gc012a8c47) with config: model='/mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16', speculative_config=None, tokenizer='/mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16', skip_tokenizer_init=False, tokenizer_mode=slow, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=65536, download_dir=None, load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, decode_context_parallel_size=1, dcp_comm_backend=ag_rs, disable_custom_all_reduce=False, quantization=inc, enforce_eager=False, enable_return_routed_experts=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser='', reasoning_parser_plugin='', enable_in_reasoning=False), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, kv_cache_metrics=False, kv_cache_metrics_sample=0.01, cudagraph_metrics=False, enable_layerwise_nvtx_tracing=False, enable_mfu_metrics=False, enable_mm_processor_stats=False, enable_logging_iteration_details=False), seed=0, served_model_name=/mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16, enable_prefix_caching=True, enable_chunked_prefill=True, pooler_config=None, compilation_config={'mode': <CompilationMode.VLLM_COMPILE: 3>, 'debug_dump_path': None, 'cache_dir': '', 'compile_cache_save_format': 'binary', 'backend': 'inductor', 'custom_ops': ['none'], 'splitting_ops': ['vllm::unified_attention', 'vllm::unified_attention_with_output', 'vllm::unified_mla_attention', 'vllm::unified_mla_attention_with_output', 'vllm::mamba_mixer2', 'vllm::mamba_mixer', 'vllm::short_conv', 'vllm::linear_attention', 'vllm::plamo2_mamba_mixer', 'vllm::gdn_attention_core', 
'vllm::olmo_hybrid_gdn_full_forward', 'vllm::kda_attention', 'vllm::sparse_attn_indexer', 'vllm::rocm_aiter_sparse_attn_indexer', 'vllm::unified_kv_cache_update', 'vllm::unified_mla_kv_cache_update'], 'compile_mm_encoder': False, 'compile_sizes': [], 'compile_ranges_split_points': [8192], 'inductor_compile_config': {'enable_auto_functionalized_v2': False, 'combo_kernels': True, 'benchmark_combo_kernel': True}, 'inductor_passes': {}, 'cudagraph_mode': <CUDAGraphMode.FULL_AND_PIECEWISE: (2, 1)>, 'cudagraph_num_of_warmups': 1, 'cudagraph_capture_sizes': [1, 2, 4, 8, 16, 24, 32, 40, 48, 56, 64, 72, 80, 88, 96, 104, 112, 120, 128, 136, 144, 152, 160, 168, 176, 184, 192, 200, 208, 216, 224, 232, 240, 248, 256, 272, 288, 304, 320, 336, 352, 368, 384, 400, 416, 432, 448, 464, 480, 496, 512], 'cudagraph_copy_inputs': False, 'cudagraph_specialize_lora': True, 'use_inductor_graph_partition': False, 'pass_config': {'fuse_norm_quant': False, 'fuse_act_quant': False, 'fuse_attn_quant': False, 'enable_sp': False, 'fuse_gemm_comms': False, 'fuse_allreduce_rms': False}, 'max_cudagraph_capture_size': 512, 'dynamic_shapes_config': {'type': <DynamicShapesType.BACKED: 'backed'>, 'evaluate_guards': False, 'assume_32_bit_indexing': False}, 'local_cache_dir': None, 'fast_moe_cold_start': True, 'static_all_moe_layers': []}
(EngineCore_DP0 pid=3407242) Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_section', 'interleaved'}
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:02 [parallel_state.py:1395] world_size=1 rank=0 local_rank=0 distributed_init_method=tcp://10.112.228.151:36671 backend=nccl
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:03 [parallel_state.py:1717] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, PCP rank 0, TP rank 0, EP rank 0, EPLB rank N/A
(EngineCore_DP0 pid=3407242) Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_section', 'interleaved'}
(EngineCore_DP0 pid=3407242) Unrecognized keys in rope_parameters for 'rope_type'='default': {'mrope_section', 'interleaved'}
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [gpu_model_runner.py:4261] Starting to load model /mnt/disk1/lvl/auto-round-main/tmp_qwen3_omni_w4a16...
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [cuda.py:453] Using backend AttentionBackendEnum.FLASH_ATTN for vit attention
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [mm_encoder_attention.py:215] Using AttentionBackendEnum.FLASH_ATTN for MMEncoderAttention.
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [vllm.py:747] Asynchronous scheduling is enabled.
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [gptq_marlin.py:376] Using MarlinLinearKernel for GPTQMarlinLinearMethod
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [cuda.py:405] Using FLASH_ATTN attention backend out of potential backends: ['FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION'].
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:09 [flash_attn.py:593] Using FlashAttention version 2
(EngineCore_DP0 pid=3407242) :1241: FutureWarning: The cuda.cudart module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.runtime module instead.
(EngineCore_DP0 pid=3407242) :1241: FutureWarning: The cuda.nvrtc module is deprecated and will be removed in a future release, please switch to use the cuda.bindings.nvrtc module instead.
Loading safetensors checkpoint shards: 0% Completed | 0/11 [00:00<?, ?it/s]
Loading safetensors checkpoint shards: 9% Completed | 1/11 [00:01<00:15, 1.59s/it]
Loading safetensors checkpoint shards: 18% Completed | 2/11 [00:03<00:14, 1.65s/it]
Loading safetensors checkpoint shards: 27% Completed | 3/11 [00:04<00:13, 1.64s/it]
Loading safetensors checkpoint shards: 36% Completed | 4/11 [00:06<00:11, 1.63s/it]
Loading safetensors checkpoint shards: 45% Completed | 5/11 [00:08<00:09, 1.63s/it]
Loading safetensors checkpoint shards: 55% Completed | 6/11 [00:09<00:08, 1.61s/it]
Loading safetensors checkpoint shards: 64% Completed | 7/11 [00:11<00:06, 1.60s/it]
Loading safetensors checkpoint shards: 73% Completed | 8/11 [00:12<00:04, 1.38s/it]
Loading safetensors checkpoint shards: 82% Completed | 9/11 [00:12<00:02, 1.08s/it]
Loading safetensors checkpoint shards: 91% Completed | 10/11 [00:12<00:00, 1.19it/s]
Loading safetensors checkpoint shards: 100% Completed | 11/11 [00:12<00:00, 1.18s/it]
(EngineCore_DP0 pid=3407242)
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:22 [default_loader.py:293] Loading weights took 12.98 seconds
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:24 [gpu_model_runner.py:4344] Model loading took 18.04 GiB memory and 14.610588 seconds
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:24 [gpu_model_runner.py:5260] Encoder cache will be initialized with a budget of 62720 tokens, and profiled with 1 video items of the maximum feature size.
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:30 [decorators.py:465] Directly load AOT compilation from path /home/lianglv/.cache/vllm/torch_compile_cache/torch_aot_compile/1e43a1e6cc10ae2ccd27dbb34914089c68a7bff9208d779e004e03e0e68baac0/rank_0_0/model
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:30 [backends.py:913] Using cache directory: /home/lianglv/.cache/vllm/torch_compile_cache/613a967063/rank_0_0/backbone for vLLM's torch.compile
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:30 [backends.py:973] Dynamo bytecode transform time: 4.16 s
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:33 [backends.py:283] Directly load the compiled graph(s) for compile range (1, 8192) from the cache, took 2.013 s
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:33 [monitor.py:35] torch.compile and initial profiling run took 7.23 s in total
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:34 [gpu_worker.py:424] Available KV cache memory: 52.54 GiB
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:34 [kv_cache_utils.py:1314] GPU KV cache size: 573,872 tokens
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:34 [kv_cache_utils.py:1319] Maximum concurrency for 65,536 tokens per request: 8.76x
Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|████████| 51/51 [00:03<00:00, 14.22it/s]
Capturing CUDA graphs (decode, FULL): 100%|███████████████████████████| 35/35 [00:02<00:00, 15.66it/s]
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:40 [gpu_model_runner.py:5366] Graph capturing finished in 6 secs, took 1.03 GiB
(EngineCore_DP0 pid=3407242) INFO 03-06 11:28:40 [core.py:282] init engine (profile, create kv cache, warmup model) took 16.31 seconds
INFO 03-06 11:28:41 [llm.py:388] Supported tasks: ['generate', 'transcription']

============================================================
Test: text_only

Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 81.42it/s]
Processed prompts: 100%|█| 1/1 [00:01<00:00, 1.86s/it, est. speed input: 7.00 toks/s, output: 137.79
Output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

============================================================
Test: image

Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 3.53it/s]
Processed prompts: 100%|█| 1/1 [00:03<00:00, 3.00s/it, est. speed input: 2018.51 toks/s, output: 85.2
Output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

============================================================
Test: audio

Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Rendering prompts: 100%|███████████████████████████████████████████████| 1/1 [00:00<00:00, 101.73it/s]
Processed prompts: 100%|█| 1/1 [00:01<00:00, 1.98s/it, est. speed input: 30.33 toks/s, output: 129.41
Output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

============================================================
Test: video

Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
qwen-vl-utils using torchvision to read video.
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/torchvision/io/_video_deprecation_warning.py:9: UserWarning: The video decoding and encoding capabilities of torchvision are deprecated from version 0.22 and will be removed in version 0.24. We recommend that you migrate to TorchCodec, where we'll consolidate the future decoding/encoding capabilities of PyTorch: https://github.com/pytorch/torchcodec
warnings.warn(
[2026-03-06 11:29:42] INFO vision_process.py:213: torchvision: video_path='https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4', total_frames=197, video_fps=30.0, time=20.561s
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 2.43it/s]
Processed prompts: 100%|█| 1/1 [00:02<00:00, 2.44s/it, est. speed input: 1424.53 toks/s, output: 104.
Output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

============================================================
Test: image_audio

Unrecognized keys in rope_parameters for 'rope_type'='default': {'interleaved', 'mrope_section'}
/mnt/disk1/lvl/conda_envs/artest/lib/python3.11/site-packages/librosa/core/audio.py:172: FutureWarning: librosa.core.audio.__audioread_load
Deprecated as of librosa version 0.10.0.
It will be removed in librosa version 1.0.
y, sr_native = __audioread_load(path, offset, duration, dtype)
Rendering prompts: 100%|████████████████████████████████████████████████| 1/1 [00:00<00:00, 51.03it/s]
Processed prompts: 100%|█| 1/1 [00:02<00:00, 2.00s/it, est. speed input: 3051.97 toks/s, output: 127.
Output: !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!

============================================================

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

3 participants