import time

import gradio as gr
import torch
from transformers import AutoProcessor, AutoModelForImageTextToText

MODEL_ID = "HuggingFaceTB/SmolVLM2-256M-Video-Instruct"

# Pick a safe float dtype for your GPU (Ampere+ -> bf16; else fp16; CPU -> fp32)
if torch.cuda.is_available():
    major, _ = torch.cuda.get_device_capability()
    FLOAT_DTYPE = torch.bfloat16 if major >= 8 else torch.float16
else:
    FLOAT_DTYPE = torch.float32

# Load once at import time (faster subsequent runs)
model = AutoModelForImageTextToText.from_pretrained(
    MODEL_ID, torch_dtype=FLOAT_DTYPE, device_map="auto"
)
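# device_map="auto" lets accelerate place the weights on the available GPU(s),
# falling back to CPU; it requires the `accelerate` package to be installed.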
processor = AutoProcessor.from_pretrained(MODEL_ID)


def run_video(video_path, prompt, max_new_tokens=256, backend="decord", num_frames=32):
    """video_path is a local file path; backend is one of {'decord', 'pyav', 'opencv', 'torchvision'}."""
    messages = [{
        "role": "user",
        "content": [
            {"type": "video", "path": video_path},
            {"type": "text", "text": prompt},
        ],
    }]
    inputs = processor.apply_chat_template(
        messages,
        add_generation_prompt=True, tokenize=True, return_dict=True, return_tensors="pt",
        video_load_backend=backend, num_frames=num_frames,
    )
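    # The returned mapping typically holds input_ids, attention_mask, and the
    # sampled video frames as a float tensor (exact keys vary by processor version).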
    # Move tensors to the model device; keep integer token IDs as int64 and
    # cast only floating-point tensors (the video frames) to FLOAT_DTYPE.
    for k, v in list(inputs.items()):
        if isinstance(v, torch.Tensor):
            v = v.to(model.device)
            if torch.is_floating_point(v):
                v = v.to(dtype=FLOAT_DTYPE)
            inputs[k] = v
    gen_kwargs = {
        "do_sample": False,
        "max_new_tokens": max_new_tokens,
        "eos_token_id": getattr(model.generation_config, "eos_token_id", None)
                        or getattr(processor.tokenizer, "eos_token_id", None),
        "pad_token_id": getattr(model.generation_config, "pad_token_id", None)
                        or getattr(processor.tokenizer, "pad_token_id", None),
    }
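    # Falling back to the tokenizer covers checkpoints whose generation_config
    # leaves eos/pad unset; do_sample=False keeps decoding greedy and
    # deterministic, so the latency/VRAM numbers below are comparable.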
    if torch.cuda.is_available():
        torch.cuda.reset_peak_memory_stats()
    t0 = time.perf_counter()
    out_ids = model.generate(**inputs, **gen_kwargs)
    latency = time.perf_counter() - t0
    # Decode only the newly generated tokens so the prompt is not echoed back
    new_ids = out_ids[:, inputs["input_ids"].shape[-1]:]
    text = processor.batch_decode(new_ids, skip_special_tokens=True)[0]
    vram_gb = (torch.cuda.max_memory_allocated() / 1e9) if torch.cuda.is_available() else 0.0
    tokens_generated = int(new_ids.shape[-1])
    # Minimal pretty-printed summary
    pretty = (f"Latency: {latency:.3f}s | VRAM: {vram_gb:.2f} GB | Tokens: {tokens_generated}\n"
              f"{'-' * 40}\n{text.strip()}")
    return pretty


def infer(video, prompt, tokens, frames, backend):
    # gr.Video returns a plain file path in recent Gradio versions; older
    # versions may hand back a dict or a tempfile-like object. Normalize:
    if isinstance(video, str):
        path = video
    elif isinstance(video, dict):
        path = video.get("path") or video.get("name")
    else:
        path = getattr(video, "name", None)
    if not path:
        return "No video file received."
    return run_video(path, prompt, max_new_tokens=tokens, backend=backend, num_frames=frames)


with gr.Blocks() as demo:
    gr.Markdown("## SmolVLM2-256M Video Test\nUpload an MP4 and enter your prompt. "
                "This Space mirrors your Colab test.")
    with gr.Row():
        vid = gr.Video(label="Upload MP4", sources=["upload"], include_audio=False)
        with gr.Column():
            prompt = gr.Textbox(label="Prompt", value="Describe this video to me", lines=2)
            tokens = gr.Slider(32, 512, value=256, step=16, label="max_new_tokens")
            frames = gr.Slider(8, 64, value=32, step=8, label="num_frames (sampling)")
            backend = gr.Dropdown(choices=["decord", "pyav", "opencv", "torchvision"],
                                  value="decord", label="video_load_backend")
    btn = gr.Button("Run")
    out = gr.Textbox(label="Output", lines=15)
    btn.click(fn=infer, inputs=[vid, prompt, tokens, frames, backend], outputs=out)


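# Optional helper (a sketch, not part of the original app): sanity-check
# run_video from a Python shell without launching the UI. "sample.mp4" is a
# hypothetical placeholder; point it at any short local clip, and install the
# package for the chosen backend first (here: `av` for the "pyav" backend).
def smoke_test(path="sample.mp4"):
    print(run_video(path, "Describe this video to me",
                    max_new_tokens=64, backend="pyav", num_frames=16))

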
if __name__ == "__main__":
    demo.launch()

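# Note: the Space's requirements.txt should list at least torch, transformers,
# gradio, and accelerate (needed for device_map="auto"), plus the package for
# whichever video backend you select: decord, av (for "pyav"), or opencv-python.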