import os
import gradio as gr
import torch
import spaces # for @spaces.GPU on Hugging Face Spaces
# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If unavailable in your transformers version, we safely fall back to no quantization.
try:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
_HAS_TORCHAO = True
except Exception:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
TorchAoConfig = None # type: ignore
_HAS_TORCHAO = False
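# Note: even when TorchAoConfig imports successfully, int4 weight-only quantization also
# requires the `torchao` package at runtime; without it, loading with a quantization_config
# is expected to fail.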
# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
# Optional int4 weight-only quantization (saves VRAM on GPU)
quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
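# Example: enable int4 weight-only quantization at launch (assumes a CUDA GPU and the
# `torchao` package installed):
#   USE_INT4=1 python app.py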
# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
"""
A very light GPU-touch to satisfy ZeroGPU's startup detector.
Called at import-time (below). Never raise; return a short status string.
"""
    try:
        if torch.cuda.is_available():
            _ = torch.tensor([0], device="cuda")
            return "gpu-ready"
        return "cpu-only"
    except Exception as e:
        return f"warmup-error: {e}"
# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()
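# Log the warmup result so ZeroGPU startup problems are visible in the Space logs.
print(f"[startup] warmup status: {_WARMUP_STATUS}")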
# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
device_map="auto",
dtype=dtype, # (modern arg; replaces deprecated torch_dtype)
attn_implementation="sdpa",
quantization_config=quant_cfg,
)
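# If flash-attn is installed, attn_implementation="flash_attention_2" is usually faster for
# long video inputs; "sdpa" is used here because it needs no extra dependencies.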
# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
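# Qwen2.5-VL turns each 28x28-pixel patch into one visual token, so these bounds keep each
# sampled frame between roughly 256 and 1024 visual tokens.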
processor = AutoProcessor.from_pretrained(
MODEL_ID,
min_pixels=MIN_PIXELS,
max_pixels=MAX_PIXELS,
)
# ---- Conversation builder (safe) ----
SYSTEM_PROMPT = (
"You are a helpful assistant that watches a user-provided video and answers "
"questions about it concisely and accurately."
)
def build_conversation(video_path: str, question: str):
    # Use the 'video' content key per the Qwen2.5-VL examples and keep the system prompt as
    # structured content. Frame sampling (fps) is handled later by apply_chat_template.
return [
{
"role": "system",
"content": [
{"type": "text", "text": SYSTEM_PROMPT}
],
},
{
"role": "user",
"content": [
{"type": "video", "video": video_path}, # <— IMPORTANT
{"type": "text", "text": question},
],
},
]
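# Illustrative shape of the returned conversation (the path and question are examples only):
# [
#   {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
#   {"role": "user", "content": [
#       {"type": "video", "video": "/tmp/clip.mp4"},
#       {"type": "text", "text": "What happens in this video?"},
#   ]},
# ]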
# ========== Inference (robust decoding + explicit eos) ==========
@spaces.GPU  # request a GPU per call on ZeroGPU Spaces; a no-op on regular GPU/CPU hardware
@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.0, top_p=0.9):
if video is None:
return "Please upload or drag a video first."
if not question or question.strip() == "":
question = "Summarize this video and provide 5 representative question–answer pairs."
    conv = build_conversation(video, question)
inputs = processor.apply_chat_template(
conv,
fps=int(fps),
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
# move tensors to the right device
inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
    # be explicit about eos/pad so generation stops cleanly instead of trailing off
eos_id = model.generation_config.eos_token_id
if isinstance(eos_id, list) and len(eos_id) > 0:
eos_id = eos_id[0]
    do_sample = float(temperature) > 0.0
    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        pad_token_id=processor.tokenizer.eos_token_id,
        eos_token_id=eos_id,
    )
    if do_sample:
        # Pass sampling knobs only when sampling; transformers warns if they are set
        # alongside do_sample=False (greedy decoding).
        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
output_ids = model.generate(**inputs, **gen_kwargs)
# slice off the prompt for clean decoding
prompt_len = inputs["input_ids"].shape[1]
generated_ids = output_ids[0, prompt_len:]
# decode with tokenizer.decode (single sequence)
text = processor.tokenizer.decode(
generated_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
return text.strip()
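# Quick local smoke test (illustrative; assumes a file "sample.mp4" exists next to this script):
#   print(answer("sample.mp4", "Describe the main action.", fps=1, max_new_tokens=64))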
# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
gr.Markdown(
"""
# 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
- Drag or upload any video, type your question, then click **Ask**.
- Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
"""
)
with gr.Row():
video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
with gr.Column():
question = gr.Textbox(
label="Your question",
placeholder="e.g., What happens in this video? Provide 5 QA pairs."
)
ask = gr.Button("Ask", variant="primary")
output = gr.Textbox(label="Answer", lines=12)
with gr.Accordion("Advanced", open=False):
fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
ask.click(
fn=answer,
inputs=[video, question, fps, max_new_tokens, temperature, top_p],
outputs=[output],
)
# ========== Launch ==========
if __name__ == "__main__":
# Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
demo.launch(ssr_mode=False)