tuandunghcmut committed · Commit 398fce5 · 1 Parent(s): 412dc28
Files changed (2)
  1. app.py +124 -22
  2. models.py +49 -0
app.py CHANGED
@@ -8,6 +8,7 @@ import spaces
import cv2
import numpy as np
from PIL import Image
+ from models import get_model_list, get_model_info, DEFAULT_GENERATION_PARAMS

def progress_bar_html(label: str) -> str:
    """
@@ -54,16 +55,49 @@ def downsample_video(video_path):
    vidcap.release()
    return frames

- MODEL_ID = "Qwen/Qwen2.5-VL-7B-Instruct"  # Alternatively: "Qwen/Qwen2.5-VL-3B-Instruct"
- processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
- model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
-     MODEL_ID,
-     trust_remote_code=True,
-     torch_dtype=torch.bfloat16
- ).to("cuda").eval()
+ # Initial model will be loaded when the first request comes in
+ processor = None
+ model = None
+ current_model_name = None
+
+ def load_model(model_name):
+     """
+     Loads the model and processor based on the model name.
+     Returns the model and processor.
+     """
+     global processor, model, current_model_name
+
+     # If the model is already loaded, return it
+     if model is not None and current_model_name == model_name:
+         return model, processor
+
+     # Get model info
+     model_info = get_model_info(model_name)
+     MODEL_ID = model_info["id"]
+
+     # Set dtype based on model info
+     dtype = getattr(torch, model_info["dtype"])
+
+     # Load processor and model
+     processor = AutoProcessor.from_pretrained(MODEL_ID, trust_remote_code=True)
+     model = Qwen2_5_VLForConditionalGeneration.from_pretrained(
+         MODEL_ID,
+         trust_remote_code=True,
+         torch_dtype=dtype
+     ).to(model_info["device"]).eval()
+
+     # Update current model name
+     current_model_name = model_name
+
+     return model, processor

@spaces.GPU
- def model_inference(input_dict, history):
+ def model_inference(input_dict, history, model_name, temperature=DEFAULT_GENERATION_PARAMS["temperature"],
+                     top_p=DEFAULT_GENERATION_PARAMS["top_p"], top_k=DEFAULT_GENERATION_PARAMS["top_k"],
+                     max_new_tokens=DEFAULT_GENERATION_PARAMS["max_new_tokens"]):
+     # Load the selected model
+     model, processor = load_model(model_name)
+
    text = input_dict["text"]
    files = input_dict["files"]

@@ -102,11 +136,18 @@ def model_inference(input_dict, history):
        ).to("cuda")
        # Set up streaming generation.
        streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-         generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+         generation_kwargs = dict(
+             inputs,
+             streamer=streamer,
+             max_new_tokens=max_new_tokens,
+             temperature=temperature,
+             top_p=top_p,
+             top_k=top_k
+         )
        thread = Thread(target=model.generate, kwargs=generation_kwargs)
        thread.start()
        buffer = ""
-         yield progress_bar_html("Processing video with Qwen2.5VL Model")
+         yield progress_bar_html(f"Processing video with {model_name}")
        for new_text in streamer:
            buffer += new_text
            time.sleep(0.01)
@@ -144,11 +185,18 @@ def model_inference(input_dict, history):
        padding=True,
    ).to("cuda")
    streamer = TextIteratorStreamer(processor, skip_prompt=True, skip_special_tokens=True)
-     generation_kwargs = dict(inputs, streamer=streamer, max_new_tokens=1024)
+     generation_kwargs = dict(
+         inputs,
+         streamer=streamer,
+         max_new_tokens=max_new_tokens,
+         temperature=temperature,
+         top_p=top_p,
+         top_k=top_k
+     )
    thread = Thread(target=model.generate, kwargs=generation_kwargs)
    thread.start()
    buffer = ""
-     yield progress_bar_html("Processing with Qwen2.5VL Model")
+     yield progress_bar_html(f"Processing with {model_name}")
    for new_text in streamer:
        buffer += new_text
        time.sleep(0.01)
@@ -161,15 +209,69 @@ examples = [
    [{"text": "@video-infer Explain the content of the video.", "files": ["example_images/sky.mp4"]}],
]

- demo = gr.ChatInterface(
-     fn=model_inference,
-     description="# **Qwen2.5 Series (add `@video-infer` for video understanding)**",
-     examples=examples,
-     fill_height=True,
-     textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
-     stop_btn="Stop Generation",
-     multimodal=True,
-     cache_examples=False,
- )
+ def create_interface():
+     # Get the list of available models
+     model_options = get_model_list()
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# **Qwen2.5 Series (add `@video-infer` for video understanding)**")
+
+         with gr.Accordion("Model Settings", open=True):
+             with gr.Row():
+                 model_dropdown = gr.Dropdown(
+                     choices=model_options,
+                     value=model_options[0],
+                     label="Select Model"
+                 )
+
+             with gr.Row():
+                 temperature = gr.Slider(
+                     minimum=0.0,
+                     maximum=2.0,
+                     value=DEFAULT_GENERATION_PARAMS["temperature"],
+                     step=0.1,
+                     label="Temperature",
+                     info="Higher values produce more diverse outputs"
+                 )
+                 top_p = gr.Slider(
+                     minimum=0.0,
+                     maximum=1.0,
+                     value=DEFAULT_GENERATION_PARAMS["top_p"],
+                     step=0.05,
+                     label="Top P",
+                     info="Nucleus sampling: limit sampling to top P% of probability mass"
+                 )
+
+             with gr.Row():
+                 top_k = gr.Slider(
+                     minimum=1,
+                     maximum=100,
+                     value=DEFAULT_GENERATION_PARAMS["top_k"],
+                     step=1,
+                     label="Top K",
+                     info="Limit sampling to top K most likely tokens"
+                 )
+                 max_tokens = gr.Slider(
+                     minimum=64,
+                     maximum=2048,
+                     value=DEFAULT_GENERATION_PARAMS["max_new_tokens"],
+                     step=64,
+                     label="Max New Tokens",
+                     info="Maximum number of tokens to generate"
+                 )
+
+         chatbot = gr.ChatInterface(
+             fn=model_inference,
+             additional_inputs=[model_dropdown, temperature, top_p, top_k, max_tokens],
+             examples=examples,
+             fill_height=True,
+             textbox=gr.MultimodalTextbox(label="Query Input", file_types=["image", "video"], file_count="multiple"),
+             stop_btn="Stop Generation",
+             multimodal=True,
+             cache_examples=False,
+         )
+
+     return demo

+ demo = create_interface()
demo.launch(debug=True)
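
Note: gr.ChatInterface forwards the current values of the components listed in additional_inputs to the handler after the message and history, so with this change a single chat turn corresponds roughly to the call sketched below. This is a minimal sketch, not part of the commit: the prompt, history, and parameter values are illustrative, and in the running Space the call is made by Gradio on a GPU worker rather than directly.

# Illustrative sketch of how Gradio invokes the new handler signature.
for update in model_inference(
    {"text": "@video-infer Explain the content of the video.", "files": ["example_images/sky.mp4"]},  # message
    [],                          # chat history
    "Qwen2.5-VL-7B-Instruct",    # model_dropdown
    temperature=0.7,
    top_p=0.9,
    top_k=50,
    max_new_tokens=1024,
):
    print(update)  # progress HTML first, then the growing response text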
models.py ADDED
@@ -0,0 +1,49 @@
+ """
+ Module containing model recommendations and configurations for the Qwen2.5 VL application.
+ """
+
+ # Dictionary of recommended models with their specifications
+ RECOMMENDED_MODELS = {
+     "Qwen2.5-VL-7B-Instruct": {
+         "id": "Qwen/Qwen2.5-VL-7B-Instruct",
+         "description": "7B parameter vision-language model with instruction tuning",
+         "dtype": "bfloat16",
+         "device": "cuda"
+     },
+     "Qwen2.5-VL-3B-Instruct": {
+         "id": "Qwen/Qwen2.5-VL-3B-Instruct",
+         "description": "3B parameter vision-language model with instruction tuning",
+         "dtype": "bfloat16",
+         "device": "cuda"
+     }
+ }
+
+ # Default generation parameters
+ DEFAULT_GENERATION_PARAMS = {
+     "max_new_tokens": 1024,
+     "temperature": 0.7,
+     "top_p": 0.9,
+     "top_k": 50,
+     "repetition_penalty": 1.0
+ }
+
+ def get_model_info(model_name):
+     """
+     Returns the model information for a given model name.
+
+     Args:
+         model_name (str): Name of the model
+
+     Returns:
+         dict: Model specifications
+     """
+     return RECOMMENDED_MODELS.get(model_name, RECOMMENDED_MODELS["Qwen2.5-VL-7B-Instruct"])
+
+ def get_model_list():
+     """
+     Returns a list of available models for selection.
+
+     Returns:
+         list: List of model names
+     """
+     return list(RECOMMENDED_MODELS.keys())
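
The two getters and the defaults above are what app.py imports. A quick standalone check of the new module (a sketch, assuming models.py is importable from the working directory) behaves like this:

from models import get_model_list, get_model_info, DEFAULT_GENERATION_PARAMS

# List the selectable models and their specs, as the dropdown will show them.
for name in get_model_list():
    info = get_model_info(name)
    print(f"{name}: {info['id']} ({info['dtype']}, {info['device']})")

# Unknown names fall back to the 7B entry instead of raising a KeyError.
print(get_model_info("not-a-model")["id"])          # Qwen/Qwen2.5-VL-7B-Instruct
print(DEFAULT_GENERATION_PARAMS["max_new_tokens"])  # 1024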