import os

import gradio as gr
import torch
import spaces  # for @spaces.GPU on Hugging Face Spaces

# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If it is unavailable in your transformers version, we safely fall back to no quantization.
try:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
    _HAS_TORCHAO = True
except Exception:
    from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
    TorchAoConfig = None  # type: ignore
    _HAS_TORCHAO = False
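
# Note (assumption about your environment): the int4 path also needs the `torchao` package
# installed at runtime; TorchAoConfig may import fine without it, but quantization would fail
# at model-load time, so keep USE_INT4=0 in that case.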

# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"

# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32

quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
    # Optional int4 weight-only quantization (saves VRAM on GPU)
    quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
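
# Example invocation (assumed values, not part of this Space's config): override the checkpoint
# and enable int4 via environment variables, e.g.
#   MODEL_ID=Qwen/Qwen2.5-VL-7B-Instruct USE_INT4=1 python app.py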

# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
    """
    A very light GPU touch to satisfy ZeroGPU's startup detector.
    Called at import time (below). Never raises; returns a short status string.
    """
    try:
        if torch.cuda.is_available():
            _ = torch.tensor([0], device="cuda")
        return "gpu-ready"
    except Exception as e:
        return f"warmup-error: {e}"

# Call _warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()

# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
    MODEL_ID,
    device_map="auto",
    dtype=dtype,  # modern argument; replaces the deprecated torch_dtype
    attn_implementation="sdpa",
    quantization_config=quant_cfg,
)
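
# "sdpa" uses PyTorch's built-in scaled-dot-product attention; if flash-attn is installed,
# attn_implementation="flash_attention_2" could be swapped in instead.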

# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
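# Each 28x28 pixel block corresponds to one visual token in Qwen2.5-VL, so these bounds
# roughly cap every frame at 256-1024 visual tokens.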
processor = AutoProcessor.from_pretrained(
    MODEL_ID,
    min_pixels=MIN_PIXELS,
    max_pixels=MAX_PIXELS,
)

# ---- Conversation builder (safe) ----
SYSTEM_PROMPT = (
    "You are a helpful assistant that watches a user-provided video and answers "
    "questions about it concisely and accurately."
)

def build_conversation(video_path: str, question: str, fps: int):
    # Use the 'video' key per Qwen examples; keep the system prompt as structured content.
    return [
        {
            "role": "system",
            "content": [
                {"type": "text", "text": SYSTEM_PROMPT}
            ],
        },
        {
            "role": "user",
            "content": [
                {"type": "video", "video": video_path},  # <-- IMPORTANT
                {"type": "text", "text": question},
            ],
        },
    ]
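
# A minimal sketch of the resulting message structure (example values assumed):
#   build_conversation("/tmp/clip.mp4", "What happens?", fps=1) ->
#   [{"role": "system", "content": [...]},
#    {"role": "user", "content": [{"type": "video", "video": "/tmp/clip.mp4"},
#                                 {"type": "text", "text": "What happens?"}]}]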

# ========== Inference (robust decoding + explicit eos) ==========
@spaces.GPU
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.0, top_p=0.9):
    if video is None:
        return "Please upload or drag a video first."
    if not question or question.strip() == "":
        question = "Summarize this video and provide 5 representative question–answer pairs."

    conv = build_conversation(video, question, int(fps))
    inputs = processor.apply_chat_template(
        conv,
        fps=int(fps),
        add_generation_prompt=True,
        tokenize=True,
        return_dict=True,
        return_tensors="pt",
    )
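    # In recent transformers versions, apply_chat_template loads the video from the path itself and
    # samples frames at roughly the requested fps (this relies on a video backend such as
    # torchvision, decord, or av being available).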

    # Move tensors to the right device
    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}

    # Be explicit about eos/pad to avoid weird tails
    eos_id = model.generation_config.eos_token_id
    if isinstance(eos_id, list) and len(eos_id) > 0:
        eos_id = eos_id[0]

    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        temperature=float(temperature),
        top_p=float(top_p),
        do_sample=(float(temperature) > 0.0),
        pad_token_id=processor.tokenizer.eos_token_id,
        eos_token_id=eos_id,
    )
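    # With temperature=0, do_sample is False, so decoding is greedy and top_p has no effect.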
    output_ids = model.generate(**inputs, **gen_kwargs)

    # Slice off the prompt for clean decoding
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]

    # Decode with tokenizer.decode (single sequence)
    text = processor.tokenizer.decode(
        generated_ids,
        skip_special_tokens=True,
        clean_up_tokenization_spaces=True,
    )
    return text.strip()

# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
    gr.Markdown(
        """
        # 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
        - Drag or upload any video, type your question, then click **Ask**.
        - Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
        """
    )
    with gr.Row():
        video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
        with gr.Column():
            question = gr.Textbox(
                label="Your question",
                placeholder="e.g., What happens in this video? Provide 5 QA pairs.",
            )
            ask = gr.Button("Ask", variant="primary")
    output = gr.Textbox(label="Answer", lines=12)
    with gr.Accordion("Advanced", open=False):
        fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
        max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
        temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
        top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")

    ask.click(
        fn=answer,
        inputs=[video, question, fps, max_new_tokens, temperature, top_p],
        outputs=[output],
    )
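
    # Optionally, pressing Enter in the question box could trigger the same handler, e.g.:
    #   question.submit(fn=answer, inputs=[video, question, fps, max_new_tokens, temperature, top_p], outputs=[output])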

# ========== Launch ==========
if __name__ == "__main__":
    # Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
    demo.launch(ssr_mode=False)
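
# A plausible requirements.txt for this Space (assumption; pin versions as needed):
#   gradio, spaces, torch, transformers, accelerate
#   (+ torchao if USE_INT4=1, and a video backend such as torchvision or av for frame extraction)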