OmniCap-IF: Benchmarking and Improving Instruction Following Abilities for Omni-Video Captioning
Paper • 2606.08572 • Published • 12
How to use NJU-LINK/OmniCaptioner-IF-7B with Transformers:
# Use a pipeline as a high-level helper
from transformers import pipeline

# Build an image-text-to-text pipeline backed by the model checkpoint.
pipe = pipeline(
    "image-text-to-text",
    model="NJU-LINK/OmniCaptioner-IF-7B",
)

# A single user turn: an image referenced by URL, then a question about it.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
            },
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]

pipe(text=messages)

# Load model directly
from transformers import AutoProcessor, AutoModelForMultimodalLM

# Processor and model are both loaded from the same checkpoint.
processor = AutoProcessor.from_pretrained("NJU-LINK/OmniCaptioner-IF-7B")
model = AutoModelForMultimodalLM.from_pretrained("NJU-LINK/OmniCaptioner-IF-7B")

# A single user turn: an image referenced by URL, then a question about it.
messages = [
    {
        "role": "user",
        "content": [
            {
                "type": "image",
                "url": "https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/p-blog/candy.JPG",
            },
            {"type": "text", "text": "What animal is on the candy?"},
        ],
    },
]

# Apply the chat template and tokenize in one step, returning a dict of
# PyTorch tensors with the generation prompt appended.
inputs = processor.apply_chat_template(
    messages,
    add_generation_prompt=True,
    tokenize=True,
    return_dict=True,
    return_tensors="pt",
)
# Move the inputs to the device the model lives on.
inputs = inputs.to(model.device)

# Generate at most 40 new tokens.
outputs = model.generate(**inputs, max_new_tokens=40)
print(processor.decode(outputs[0][inputs["input_ids"].shape[-1]:]))
How to use NJU-LINK/OmniCaptioner-IF-7B with vLLM:
# Install vLLM from pip:
pip install vllm

# Start the vLLM server:
vllm serve "NJU-LINK/OmniCaptioner-IF-7B"

# Call the server using curl (OpenAI-compatible API).
# The JSON payload sends one user message with a text part and an image URL part.
curl -X POST "http://localhost:8000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
"model": "NJU-LINK/OmniCaptioner-IF-7B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'

# Use Docker (Docker Model Runner). This is a separate command and must start
# on its own line, after the closing quote of the curl payload above.
docker model run hf.co/NJU-LINK/OmniCaptioner-IF-7B
How to use NJU-LINK/OmniCaptioner-IF-7B with SGLang:
# Install SGLang from pip:
pip install sglang

# Start the SGLang server (listens on all interfaces, port 30000):
python3 -m sglang.launch_server \
    --model-path "NJU-LINK/OmniCaptioner-IF-7B" \
    --host 0.0.0.0 \
    --port 30000

# Call the server using curl (OpenAI-compatible API).
# The JSON payload sends one user message with a text part and an image URL part.
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
"model": "NJU-LINK/OmniCaptioner-IF-7B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'

# Alternatively, start the same server from the lmsysorg/sglang Docker image.
# This is a separate command and must start on its own line, after the closing
# quote of the curl payload above.
# Replace <secret> with your own HF_TOKEN value. The host Hugging Face cache
# directory is mounted into the container, and port 30000 is published.
docker run --gpus all \
    --shm-size 32g \
    -p 30000:30000 \
    -v ~/.cache/huggingface:/root/.cache/huggingface \
    --env "HF_TOKEN=<secret>" \
    --ipc=host \
    lmsysorg/sglang:latest \
    python3 -m sglang.launch_server \
        --model-path "NJU-LINK/OmniCaptioner-IF-7B" \
        --host 0.0.0.0 \
        --port 30000
# Call the server using curl (OpenAI-compatible API).
# The JSON payload sends one user message with a text part and an image URL part.
curl -X POST "http://localhost:30000/v1/chat/completions" \
    -H "Content-Type: application/json" \
    --data '{
"model": "NJU-LINK/OmniCaptioner-IF-7B",
"messages": [
{
"role": "user",
"content": [
{
"type": "text",
"text": "Describe this image in one sentence."
},
{
"type": "image_url",
"image_url": {
"url": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg"
}
}
]
}
]
}'
How to use NJU-LINK/OmniCaptioner-IF-7B with Docker Model Runner:
docker model run hf.co/NJU-LINK/OmniCaptioner-IF-7B
# Create and activate a dedicated conda environment (Python 3.12).
conda create -n omnicap_if python=3.12
conda activate omnicap_if

# Install the Python dependencies. transformers is pinned to 4.57.1.
pip install torch torchvision
pip install transformers==4.57.1
pip install accelerate
# --no-build-isolation builds flash-attn against the packages already installed
# in this environment, so it has to come after the torch install above.
pip install flash-attn --no-build-isolation
# Quote the extras specifier: unquoted square brackets are a glob pattern, and
# shells such as zsh abort with "no matches found" instead of passing it to pip.
pip install "qwen-omni-utils[decord]" -U
import torch
from transformers import Qwen2_5OmniForConditionalGeneration, Qwen2_5OmniProcessor
from qwen_omni_utils import process_mm_info

# Checkpoint to load and the local video file to caption.
MODEL_ID = "NJU-LINK/OmniCaptioner-IF-7B"
VIDEO_PATH = "example_video.mp4"
# Captioning instruction, sent as the text part of the user turn.
INSTRUCTION = (
    "Please describe this video in a Markdown table with columns "
    "'Timestamp', 'Visual Action', and 'Audio Content'. Include precise timestamps "
    "and mention the key audio-visual events."
)
# Pixel budgets attached to the video entry of the conversation below.
MAX_PIXELS = 297920
VIDEO_MAX_PIXELS = 297920

# Load the model in bfloat16 on the GPU, using the FlashAttention-2 attention
# implementation.
model = Qwen2_5OmniForConditionalGeneration.from_pretrained(
    MODEL_ID,
    torch_dtype=torch.bfloat16,
    device_map="cuda",
    attn_implementation="flash_attention_2",
)
processor = Qwen2_5OmniProcessor.from_pretrained(MODEL_ID)
# Only text is requested from generate() below (return_audio=False), so the
# talker is disabled up front.
model.disable_talker()

# A single user turn: the instruction text followed by the video and its
# sampling settings (1 fps, at most 160 frames).
conversation = [
    {
        "role": "user",
        "content": [
            {"type": "text", "text": INSTRUCTION},
            {
                "type": "video",
                "video": VIDEO_PATH,
                "max_pixels": MAX_PIXELS,
                "max_frames": 160,
                "fps": 1.0,
                "video_max_pixels": VIDEO_MAX_PIXELS,
            },
        ],
    },
]

# Render the prompt as a string; tokenization happens in the processor call.
text = processor.apply_chat_template(conversation, add_generation_prompt=True, tokenize=False)
# Collect the audio/image/video inputs for the conversation. use_audio_in_video
# is passed consistently to process_mm_info, the processor and generate().
audios, images, videos = process_mm_info(conversation, use_audio_in_video=True)
inputs = processor(
    text=text,
    audio=audios,
    images=images,
    videos=videos,
    return_tensors="pt",
    padding=True,
    use_audio_in_video=True,
)
# Move the batch to the model's device and cast it to the model's dtype.
inputs = inputs.to(model.device).to(model.dtype)

# The generate() call must be indented under the `with` statement; without the
# indentation the script fails to parse.
with torch.inference_mode():
    text_ids = model.generate(
        **inputs,
        use_audio_in_video=True,
        return_audio=False,
        thinker_max_new_tokens=1536,
        talker_max_tokens=1536,
    )

# Skip the prompt tokens so that only the newly generated text is decoded.
response = processor.decode(text_ids[0][inputs.input_ids[0].size(0):], skip_special_tokens=True)
print(response)
@misc{wang2026omnicapifbenchmarkingimprovinginstruction,
title={OmniCap-IF: Benchmarking and Improving Instruction Following Abilities for Omni-Video Captioning},
author={Jiahao Wang and An Ping and Yanghai Wang and Yuanxing Zhang and Shihao Li and Hanyan Bian and Yichi Ren and Yize Zhang and Han Wang and Haowen Chen and Junze Li and Jiaqi Wang and Yiyang Hu and Zhuze Xu and Zijie Zhang and Jiaheng Liu},
year={2026},
eprint={2606.08572},
archivePrefix={arXiv},
primaryClass={cs.CV},
url={https://arxiv.org/abs/2606.08572},
}