sunrainyg committed
Commit 3d14a12 · 1 Parent(s): 3f13efa
Files changed (1):
  1. app.py +27 -14
app.py CHANGED
@@ -61,9 +61,14 @@ processor = AutoProcessor.from_pretrained(
     max_pixels=MAX_PIXELS,
 )
 
-SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
+# ---- Conversation builder (safe) ----
+SYSTEM_PROMPT = (
+    "You are a helpful assistant that watches a user-provided video and answers "
+    "questions about it concisely and accurately."
+)
 
 def build_conversation(video_path: str, question: str, fps: int):
+    # Use 'video' key per Qwen examples; keep system as structured content
     return [
         {
             "role": "system",
@@ -74,21 +79,18 @@ def build_conversation(video_path: str, question: str, fps: int):
         {
             "role": "user",
             "content": [
-                {"type": "video", "path": video_path},
-                {"type": "text", "text": question},
+                {"type": "video", "video": video_path},  # <-- IMPORTANT
+                {"type": "text", "text": question},
             ],
         },
     ]
 
 
+
 # ========== Inference ==========
+# ---- Inference (robust decoding + explicit eos) ----
 @torch.inference_mode()
-def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
-    """
-    Main inference entry used by the Gradio UI.
-    - video: filepath from gr.Video
-    - question: user text; if empty, produce a summary + 5 QA pairs
-    """
+def answer(video, question, fps=1, max_new_tokens=128, temperature=0.0, top_p=0.9):
     if video is None:
         return "Please upload or drag a video first."
     if not question or question.strip() == "":
@@ -104,30 +106,41 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
         return_dict=True,
         return_tensors="pt",
     )
-    # move tensors to model device
+    # move tensors to the right device
    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
 
+    # be explicit about eos/pad to avoid weird tails
+    eos_id = model.generation_config.eos_token_id
+    if isinstance(eos_id, list) and len(eos_id) > 0:
+        eos_id = eos_id[0]
+
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
         do_sample=(float(temperature) > 0.0),
         pad_token_id=processor.tokenizer.eos_token_id,
+        eos_token_id=eos_id,
     )
 
     output_ids = model.generate(**inputs, **gen_kwargs)
-    # Remove the prompt portion for clean decoding
+
+    # slice off the prompt for clean decoding
     prompt_len = inputs["input_ids"].shape[1]
     generated_ids = output_ids[0, prompt_len:]
 
-    text = processor.batch_decode(
-        generated_ids.unsqueeze(0),
+    # decode with tokenizer.decode (single sequence)
+    text = processor.tokenizer.decode(
+        generated_ids,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=True,
-    )[0]
+    )
 
     return text.strip()
 
+
+
+
 # ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
     gr.Markdown(
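
For readers following the commit, below is a minimal, runnable sketch of the two generation fixes this diff makes: normalizing eos_token_id (which some checkpoints store as a list of ids rather than a single int), and slicing the prompt off generate()'s output before decoding the single remaining sequence with tokenizer.decode. The sketch uses the tiny text-only checkpoint sshleifer/tiny-gpt2 so it runs without a GPU or video inputs; the model name and prompt are illustrative stand-ins, not values from app.py, on the assumption that the pattern is model-agnostic.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Stand-in model for illustration; app.py applies the same pattern to Qwen2.5-VL.
    tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
    lm = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

    inputs = tok("The video shows", return_tensors="pt")

    # generation_config.eos_token_id may be an int or a list of ints depending
    # on the checkpoint; normalize it the same way the commit does.
    eos_id = lm.generation_config.eos_token_id
    if isinstance(eos_id, list) and len(eos_id) > 0:
        eos_id = eos_id[0]

    with torch.inference_mode():
        output_ids = lm.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tok.eos_token_id,
            eos_token_id=eos_id,
        )

    # generate() returns prompt + completion; drop the prompt tokens, then
    # decode the one remaining sequence (no batch_decode/unsqueeze needed).
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]
    text = tok.decode(generated_ids, skip_special_tokens=True)
    print(text.strip())

Decoding output_ids[0] directly would echo the user's prompt back into the Gradio textbox; slicing at prompt_len is what keeps the displayed answer clean, and it is why the commit can drop the batch_decode/unsqueeze round-trip in favor of a plain tokenizer.decode call.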