sunrainyg committed
Commit 3d14a12 · 1 Parent(s): 3f13efa
Files changed (1):
  1. app.py +27 -14
app.py CHANGED
@@ -61,9 +61,14 @@ processor = AutoProcessor.from_pretrained(
     max_pixels=MAX_PIXELS,
 )
 
-SYSTEM_PROMPT = "You are a helpful assistant that watches a user-provided video and answers questions about it concisely and accurately."
+# ---- Conversation builder (safe) ----
+SYSTEM_PROMPT = (
+    "You are a helpful assistant that watches a user-provided video and answers "
+    "questions about it concisely and accurately."
+)
 
 def build_conversation(video_path: str, question: str, fps: int):
+    # Use 'video' key per Qwen examples; keep system as structured content
     return [
         {
             "role": "system",
@@ -74,21 +79,18 @@ def build_conversation(video_path: str, question: str, fps: int):
         {
             "role": "user",
             "content": [
-                {"type": "video", "path": video_path},
-                {"type": "text", "text": question},
+                {"type": "video", "video": video_path},  # <-- IMPORTANT
+                {"type": "text", "text": question},
             ],
         },
     ]
 
 
+
 # ========== Inference ==========
+# ---- Inference (robust decoding + explicit eos) ----
 @torch.inference_mode()
-def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
-    """
-    Main inference entry used by the Gradio UI.
-    - video: filepath from gr.Video
-    - question: user text; if empty, produce a summary + 5 QA pairs
-    """
+def answer(video, question, fps=1, max_new_tokens=128, temperature=0.0, top_p=0.9):
     if video is None:
         return "Please upload or drag a video first."
     if not question or question.strip() == "":
@@ -104,30 +106,41 @@ def answer(video, question, fps=1, max_new_tokens=128, temperature=0.2, top_p=0.9):
         return_dict=True,
         return_tensors="pt",
     )
-    # move tensors to model device
+    # move tensors to the right device
    inputs = {k: (v.to(model.device) if hasattr(v, "to") else v) for k, v in inputs.items()}
 
+    # be explicit about eos/pad to avoid weird tails
+    eos_id = model.generation_config.eos_token_id
+    if isinstance(eos_id, list) and len(eos_id) > 0:
+        eos_id = eos_id[0]
+
     gen_kwargs = dict(
         max_new_tokens=int(max_new_tokens),
         temperature=float(temperature),
         top_p=float(top_p),
         do_sample=(float(temperature) > 0.0),
         pad_token_id=processor.tokenizer.eos_token_id,
+        eos_token_id=eos_id,
     )
 
     output_ids = model.generate(**inputs, **gen_kwargs)
-    # Remove the prompt portion for clean decoding
+
+    # slice off the prompt for clean decoding
     prompt_len = inputs["input_ids"].shape[1]
     generated_ids = output_ids[0, prompt_len:]
 
-    text = processor.batch_decode(
-        generated_ids.unsqueeze(0),
+    # decode with tokenizer.decode (single sequence)
+    text = processor.tokenizer.decode(
+        generated_ids,
         skip_special_tokens=True,
         clean_up_tokenization_spaces=True,
-    )[0]
+    )
 
     return text.strip()
 
+
+
+
 # ========== Gradio UI ==========
 with gr.Blocks(title="Video → Q&A (Qwen2.5-VL-7B WolfV2)") as demo:
     gr.Markdown(
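
For readers following the commit, below is a minimal, runnable sketch of the two generation fixes this diff makes: normalizing eos_token_id (which some checkpoints store as a list of ids rather than a single int), and slicing the prompt off generate()'s output before decoding the single remaining sequence with tokenizer.decode. The sketch uses the tiny text-only checkpoint sshleifer/tiny-gpt2 so it runs without a GPU or video inputs; the model name and prompt are illustrative stand-ins, not values from app.py, on the assumption that the pattern is model-agnostic.

    import torch
    from transformers import AutoModelForCausalLM, AutoTokenizer

    # Stand-in model for illustration; app.py applies the same pattern to Qwen2.5-VL.
    tok = AutoTokenizer.from_pretrained("sshleifer/tiny-gpt2")
    lm = AutoModelForCausalLM.from_pretrained("sshleifer/tiny-gpt2")

    inputs = tok("The video shows", return_tensors="pt")

    # generation_config.eos_token_id may be an int or a list of ints depending
    # on the checkpoint; normalize it the same way the commit does.
    eos_id = lm.generation_config.eos_token_id
    if isinstance(eos_id, list) and len(eos_id) > 0:
        eos_id = eos_id[0]

    with torch.inference_mode():
        output_ids = lm.generate(
            **inputs,
            max_new_tokens=16,
            do_sample=False,
            pad_token_id=tok.eos_token_id,
            eos_token_id=eos_id,
        )

    # generate() returns prompt + completion; drop the prompt tokens, then
    # decode the one remaining sequence (no batch_decode/unsqueeze needed).
    prompt_len = inputs["input_ids"].shape[1]
    generated_ids = output_ids[0, prompt_len:]
    text = tok.decode(generated_ids, skip_special_tokens=True)
    print(text.strip())

Decoding output_ids[0] directly would echo the user's prompt back into the Gradio textbox; slicing at prompt_len is what keeps the displayed answer clean, and it is why the commit can drop the batch_decode/unsqueeze round-trip in favor of a plain tokenizer.decode call.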