OmniCaptioner-IF-3B

GitHub   Project Page   Paper   Trainset   Testset

Quick Start

Installation

# Create and activate an isolated environment for the model.
conda create -n omnicap_if python=3.12
conda activate omnicap_if
# torch is installed before flash-attn, which is installed without build isolation below.
pip install torch torchvision
pip install transformers==4.57.1
pip install accelerate
pip install flash-attn --no-build-isolation
# Quote the extras spec: unquoted square brackets are a glob pattern in zsh
# and make the command fail with "no matches found".
pip install -U "qwen-omni-utils[decord]"

Usage

# Caption one video (including its audio track) with OmniCaptioner-IF-3B and
# print the generated text.
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

MODEL_ID = "NJU-LINK/OmniCaptioner-IF-3B"
VIDEO_PATH = "example_video.mp4"
INSTRUCTION = (
    "Please describe this video in a Markdown table with columns "
    "'Timestamp', 'Visual Action', and 'Audio Content'. Include precise timestamps "
    "and mention the key audio-visual events."
)

MAX_PIXELS = 297920
VIDEO_MAX_PIXELS = 297920

# The same flag is passed to multimodal preprocessing, the processor, and
# generate(); keeping it in one place guarantees the three calls agree.
USE_AUDIO_IN_VIDEO = True

model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Only text output is requested below (return_audio=False), so the talker
# (speech generation) component is not needed.
model.disable_talker()

# Single-turn conversation: the text instruction followed by the video and
# its sampling/resolution limits.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": INSTRUCTION},
            {
                "type": "video",
                "video": VIDEO_PATH,
                "max_pixels": MAX_PIXELS,
                "max_frames": 160,
                "fps": 1.0,
                "video_max_pixels": VIDEO_MAX_PIXELS,
            },
        ],
    },
]

# Render the chat prompt as a string; tokenization happens in processor(...).
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
inputs = inputs.to(model.device).to(model.dtype)

with torch.inference_mode():
    # Text-only generation: with the talker disabled, only the thinker's
    # token budget is relevant.
    text_ids = model.generate(
        **inputs,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        return_audio=False,
        thinker_max_new_tokens=1536,
    )

# generate() returns prompt + completion; slice off the prompt tokens so only
# the newly generated caption is decoded.
prompt_length = inputs.input_ids[0].size(0)
response = processor.decode(text_ids[0][prompt_length:], skip_special_tokens=True)
print(response)

Citation

@article{wang2026omnicapif,
  title   = {OmniCap-IF: Benchmarking and Improving Instruction Following Abilities for Omni-Video Captioning},
  author  = {Wang, Jiahao and Ping, An and Wang, Yanghai and Zhang, Yuanxing and Li, Shihao and Bian, Hanyan and Ren, Yichi and Zhang, Yize and Wang, Han and Chen, Haowen and Li, Junze and Wang, Jiaqi and Hu, Yiyang and Xu, Zhuze and Zhang, Zijie and Liu, Jiaheng},
  journal = {Preprint},
  year    = {2026}
}
Downloads last month
35
Safetensors
Model size
6B params
Tensor type
BF16
·
F32
·
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support