OmniCaptioner-IF-7B

GitHub   Project Page   Paper   Trainset   Testset

Quick Start

Installation

# Create and activate an isolated environment for the model.
conda create -n omnicap_if python=3.12
conda activate omnicap_if
# torch is installed first: flash-attn is built with --no-build-isolation below,
# so its build uses the packages already present in this environment.
pip install torch torchvision
pip install transformers==4.57.1
pip install accelerate
pip install flash-attn --no-build-isolation
# Quote the extras spec so shells that glob on [...] (e.g. zsh) pass it to pip intact.
pip install "qwen-omni-utils[decord]" -U

Usage

# Minimal inference example: caption one video (including its audio track)
# while following a free-form formatting instruction, and print the caption.
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

MODEL_ID = "NJU-LINK/OmniCaptioner-IF-7B"
VIDEO_PATH = "example_video.mp4"
INSTRUCTION = (
    "Please describe this video in a Markdown table with columns "
    "'Timestamp', 'Visual Action', and 'Audio Content'. Include precise timestamps "
    "and mention the key audio-visual events."
)

# Pixel budgets forwarded with the video entry of the conversation below.
MAX_PIXELS = 297920
VIDEO_MAX_PIXELS = 297920

# The same flag has to reach process_mm_info(), the processor and generate();
# keeping it in one place prevents the three call sites from drifting apart.
USE_AUDIO_IN_VIDEO = True

# Upper bound on generated tokens, shared by the thinker and talker limits.
MAX_NEW_TOKENS = 1536

model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Only text is requested from generate() below (return_audio=False),
# so the talker is switched off up front.
model.disable_talker()

# Single-turn conversation: the text instruction followed by the video to caption.
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": INSTRUCTION},
            {
                "type": "video",
                "video": VIDEO_PATH,
                "max_pixels": MAX_PIXELS,
                "max_frames": 160,
                "fps": 1.0,
                "video_max_pixels": VIDEO_MAX_PIXELS,
            },
        ],
    },
]

# Render the prompt as a string (tokenize=False); the processor call below
# tokenizes it together with the extracted audio/image/video inputs.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
audios, images, videos = process_mm_info(conversation, use_audio_in_video=USE_AUDIO_IN_VIDEO)

inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=USE_AUDIO_IN_VIDEO,
)
# Move the batch onto the model's device and cast it to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

with torch.inference_mode():
    text_ids = model.generate(
        **inputs,
        use_audio_in_video=USE_AUDIO_IN_VIDEO,
        return_audio=False,
        thinker_max_new_tokens=MAX_NEW_TOKENS,
        # Spelled as generate() declares it (`talker_max_new_tokens`); the
        # previous `talker_max_tokens` did not match that parameter name.
        talker_max_new_tokens=MAX_NEW_TOKENS,
    )

# The returned sequence starts with the prompt tokens; slice them off so only
# the newly generated caption is decoded.
prompt_length = inputs.input_ids.shape[1]
response = processor.decode(text_ids[0][prompt_length:], skip_special_tokens=True)
print(response)

Citation

@misc{wang2026omnicapifbenchmarkingimprovinginstruction,
      title={OmniCap-IF: Benchmarking and Improving Instruction Following Abilities for Omni-Video Captioning}, 
      author={Jiahao Wang and An Ping and Yanghai Wang and Yuanxing Zhang and Shihao Li and Hanyan Bian and Yichi Ren and Yize Zhang and Han Wang and Haowen Chen and Junze Li and Jiaqi Wang and Yiyang Hu and Zhuze Xu and Zijie Zhang and Jiaheng Liu},
      year={2026},
      eprint={2606.08572},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2606.08572}, 
}
Downloads last month
26
Safetensors
Model size
11B params
Tensor type
BF16 · F32
Inference Providers NEW
This model isn't deployed by any Inference Provider. 🙋 Ask for provider support

Paper for NJU-LINK/OmniCaptioner-IF-7B