import os
import gradio as gr
import torch
import spaces # for @spaces.GPU on Hugging Face Spaces
# Try to import TorchAoConfig for optional 4-bit weight-only quantization.
# If unavailable in your transformers version, we safely fall back to no quantization.
try:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration, TorchAoConfig
_HAS_TORCHAO = True
except Exception:
from transformers import AutoProcessor, Qwen2_5_VLForConditionalGeneration
TorchAoConfig = None # type: ignore
_HAS_TORCHAO = False
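# Note: even when TorchAoConfig imports successfully, int4 weight-only quantization also
# requires the `torchao` package at runtime; without it, loading with a quantization_config
# is expected to fail.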
# ========== Basic Configuration ==========
MODEL_ID = os.environ.get("MODEL_ID", "Efficient-Large-Model/qwen2_5vl-7b-wolfv2-tuned")
USE_INT4 = os.environ.get("USE_INT4", "0") == "1"
# Prefer bfloat16 on GPU, float32 on CPU
dtype = torch.bfloat16 if torch.cuda.is_available() else torch.float32
quant_cfg = None
if USE_INT4 and _HAS_TORCHAO and TorchAoConfig is not None:
# Optional int4 weight-only quantization (saves VRAM on GPU)
quant_cfg = TorchAoConfig("int4_weight_only", group_size=128)
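# Example: enable int4 weight-only quantization at launch (assumes a CUDA GPU and the
# `torchao` package installed):
#   USE_INT4=1 python app.py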
# ---- ZeroGPU warm-up: must exist AND be called at import time ----
@spaces.GPU
def _warmup():
"""
A very light GPU-touch to satisfy ZeroGPU's startup detector.
Called at import-time (below). Never raise; return a short status string.
"""
    try:
        if torch.cuda.is_available():
            _ = torch.tensor([0], device="cuda")
            return "gpu-ready"
        return "cpu-only"
    except Exception as e:
        return f"warmup-error: {e}"
# Call warmup at import time so ZeroGPU detects a @spaces.GPU function during startup.
_WARMUP_STATUS = _warmup()
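# Log the warmup result so ZeroGPU startup problems are visible in the Space logs.
print(f"[startup] warmup status: {_WARMUP_STATUS}")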
# ========== Load Model & Processor ==========
model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
MODEL_ID,
device_map="auto",
dtype=dtype, # (modern arg; replaces deprecated torch_dtype)
attn_implementation="sdpa",
quantization_config=quant_cfg,
)
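# If flash-attn is installed, attn_implementation="flash_attention_2" is usually faster for
# long video inputs; "sdpa" is used here because it needs no extra dependencies.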
# Resolution bounds to balance quality vs. memory
MIN_PIXELS = 256 * 28 * 28
MAX_PIXELS = 1024 * 28 * 28
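# Qwen2.5-VL turns each 28x28-pixel patch into one visual token, so these bounds keep each
# sampled frame between roughly 256 and 1024 visual tokens.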
processor = AutoProcessor.from_pretrained(
MODEL_ID,
min_pixels=MIN_PIXELS,
max_pixels=MAX_PIXELS,
)
# ---- Conversation builder (safe) ----
SYSTEM_PROMPT = (
"You are a helpful assistant that watches a user-provided video and answers "
"questions about it concisely and accurately."
)
def build_conversation(video_path: str, question: str):
    # Use the 'video' content key per the Qwen2.5-VL examples and keep the system prompt as
    # structured content. Frame sampling (fps) is handled later by apply_chat_template.
return [
{
"role": "system",
"content": [
{"type": "text", "text": SYSTEM_PROMPT}
],
},
{
"role": "user",
"content": [
{"type": "video", "video": video_path}, # <— IMPORTANT
{"type": "text", "text": question},
],
},
]
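# Illustrative shape of the returned conversation (the path and question are examples only):
# [
#   {"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]},
#   {"role": "user", "content": [
#       {"type": "video", "video": "/tmp/clip.mp4"},
#       {"type": "text", "text": "What happens in this video?"},
#   ]},
# ]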
# ========== Inference (robust decoding + explicit eos) ==========
@spaces.GPU  # request a GPU per call on ZeroGPU Spaces; a no-op on regular GPU/CPU hardware
@torch.inference_mode()
def answer(video, question, fps=1, max_new_tokens=128, temperature=0.0, top_p=0.9):
if video is None:
return "Please upload or drag a video first."
if not question or question.strip() == "":
question = "Summarize this video and provide 5 representative question–answer pairs."
    conv = build_conversation(video, question)
inputs = processor.apply_chat_template(
conv,
fps=int(fps),
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
)
# move tensors to the right device
inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
    # be explicit about eos/pad so generation stops cleanly instead of trailing off
eos_id = model.generation_config.eos_token_id
if isinstance(eos_id, list) and len(eos_id) > 0:
eos_id = eos_id[0]
    do_sample = float(temperature) > 0.0
    gen_kwargs = dict(
        max_new_tokens=int(max_new_tokens),
        do_sample=do_sample,
        pad_token_id=processor.tokenizer.eos_token_id,
        eos_token_id=eos_id,
    )
    if do_sample:
        # Pass sampling knobs only when sampling; transformers warns if they are set
        # alongside do_sample=False (greedy decoding).
        gen_kwargs.update(temperature=float(temperature), top_p=float(top_p))
output_ids = model.generate(**inputs, **gen_kwargs)
# slice off the prompt for clean decoding
prompt_len = inputs["input_ids"].shape[1]
generated_ids = output_ids[0, prompt_len:]
# decode with tokenizer.decode (single sequence)
text = processor.tokenizer.decode(
generated_ids,
skip_special_tokens=True,
clean_up_tokenization_spaces=True,
)
return text.strip()
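# Quick local smoke test (illustrative; assumes a file "sample.mp4" exists next to this script):
#   print(answer("sample.mp4", "Describe the main action.", fps=1, max_new_tokens=64))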
# ========== Gradio UI ==========
with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
gr.Markdown(
"""
# 🎬 Video → Q&A (Qwen2.5-VL-7B WolfV2)
- Drag or upload any video, type your question, then click **Ask**.
- Default `fps=1` (1 frame per second) saves VRAM; for short or very detailed videos, increase fps slightly.
"""
)
with gr.Row():
video = gr.Video(label="Drop your video here (mp4, mov, webm)", interactive=True)
with gr.Column():
question = gr.Textbox(
label="Your question",
placeholder="e.g., What happens in this video? Provide 5 QA pairs."
)
ask = gr.Button("Ask", variant="primary")
output = gr.Textbox(label="Answer", lines=12)
with gr.Accordion("Advanced", open=False):
fps = gr.Slider(1, 6, value=1, step=1, label="Sampling FPS")
max_new_tokens = gr.Slider(32, 512, value=192, step=16, label="Max new tokens")
temperature = gr.Slider(0.0, 1.2, value=0.2, step=0.05, label="Temperature")
top_p = gr.Slider(0.1, 1.0, value=0.9, step=0.05, label="Top-p")
ask.click(
fn=answer,
inputs=[video, question, fps, max_new_tokens, temperature, top_p],
outputs=[output],
)
# ========== Launch ==========
if __name__ == "__main__":
# Disable SSR to avoid extra startup constraints; works well across CPU/GPU/ZeroGPU.
demo.launch(ssr_mode=False)