prithivMLmods committed
Commit f002e6a · verified · 1 Parent(s): 8d83bb7

Update app.py

Files changed (1)
  1. app.py +24 -9
app.py CHANGED
@@ -116,6 +116,8 @@ def annotate_image(image: Image.Image, result: dict):
     if not isinstance(image, Image.Image) or not isinstance(result, dict):
         return image
 
+    # Ensure image is mutable
+    image = image.convert("RGB")
     original_width, original_height = image.size
 
     if "points" in result and result["points"]:
@@ -129,7 +131,8 @@ def annotate_image(image: Image.Image, result: dict):
         points_array = np.array(points_list).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
         vertex_annotator = sv.VertexAnnotator(radius=4, color=sv.Color.RED)
-        return vertex_annotator.annotate(scene=image.copy(), key_points=key_points)
+        annotated_image = vertex_annotator.annotate(scene=np.array(image.copy()), key_points=key_points)
+        return Image.fromarray(annotated_image)
 
     if "objects" in result and result["objects"]:
         boxes = []
@@ -139,17 +142,18 @@ def annotate_image(image: Image.Image, result: dict):
             x_max = obj.get("x_max", 0.0) * original_width
            y_max = obj.get("y_max", 0.0) * original_height
             boxes.append([x_min, y_min, x_max, y_max])
-
+
         if not boxes:
             return image
-
+
         detections = sv.Detections(xyxy=np.array(boxes))
 
         if len(detections) == 0:
             return image
 
         box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=2)
-        return box_annotator.annotate(scene=image.copy(), detections=detections)
+        annotated_image = box_annotator.annotate(scene=np.array(image.copy()), detections=detections)
+        return Image.fromarray(annotated_image)
 
     return image
 
@@ -243,6 +247,18 @@ def process_inputs(image, category, prompt):
 
     return qwen_annotated_image, qwen_text
 
+def on_category_change(category: str):
+    if category == "Query":
+        return gr.Textbox(placeholder="e.g., Count the total number of boats and describe the environment.")
+    elif category == "Caption":
+        return gr.Textbox(placeholder="e.g., short, normal, detailed")
+    elif category == "Point":
+        return gr.Textbox(placeholder="e.g., The gun held by the person.")
+    elif category == "Detect":
+        return gr.Textbox(placeholder="e.g., The headlight of the car.")
+    return gr.Textbox(placeholder="e.g., detect the object.")
+
+
 css = """
 #main-title h1 {
     font-size: 2.3em !important;
@@ -255,7 +271,7 @@ css = """
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
-
+
         with gr.Row():
             with gr.Column(scale=1):
                 image_input = gr.Image(type="pil", label="Upload Image")
@@ -266,7 +282,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                     interactive=True,
                 )
                 prompt_input = gr.Textbox(
-                    placeholder="e.g., detect the object.",
+                    placeholder="e.g., Count the total number of boats and describe the environment.",
                     label="Prompt",
                     lines=1,
                 )
@@ -283,12 +299,11 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                         ["examples/4.jpg", "Detect", "Headlight"],
                         ["examples/3.jpg", "Point", "Gun"],
                         ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
-                        ["examples/2.jpg", "Caption", "normal"],  # <-- FIX: Changed prompt to a valid length
-
+                        ["examples/2.jpg", "Caption", "a brief"],
                     ],
                     inputs=[image_input, category_select, prompt_input],
                 )
-
+
                 category_select.change(
                     fn=on_category_change,
                     inputs=[category_select],
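The core of this commit is feeding supervision's annotators NumPy arrays instead of PIL images and converting the result back with Image.fromarray, so the gr.Image(type="pil") output keeps receiving a PIL image. Below is a minimal sketch of that round-trip, assuming supervision, numpy, and Pillow are installed; the helper name, placeholder image, and box coordinates are illustrative and not taken from the repo.

```python
# Sketch of the PIL <-> NumPy round-trip used by the new annotate calls.
# draw_boxes, the placeholder image, and the box values are assumptions
# for illustration; they do not appear in app.py.
import numpy as np
import supervision as sv
from PIL import Image

def draw_boxes(image: Image.Image, xyxy: np.ndarray) -> Image.Image:
    # Convert the PIL image to an RGB NumPy array, mirroring the
    # np.array(image.copy()) calls added in this commit.
    scene = np.array(image.convert("RGB"))
    detections = sv.Detections(xyxy=xyxy)
    annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=2)
    annotated = annotator.annotate(scene=scene, detections=detections)
    # Convert back so downstream Gradio components still get a PIL image.
    return Image.fromarray(annotated)

if __name__ == "__main__":
    img = Image.new("RGB", (640, 480), "white")     # placeholder image
    boxes = np.array([[50.0, 60.0, 200.0, 220.0]])  # placeholder box, xyxy in pixels
    draw_boxes(img, boxes).save("annotated.png")
```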
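The new on_category_change helper returns a gr.Textbox whose placeholder matches the selected category, and category_select.change wires that handler to the selector. Here is a small self-contained sketch of the same pattern, assuming Gradio 4.x; the gr.Radio component and the outputs=[prompt_input] routing are assumptions, since the hunk above is cut off before the outputs argument.

```python
# Sketch of the category -> placeholder wiring added in this commit.
# gr.Radio and outputs=[prompt_input] are assumed, not confirmed by the diff.
import gradio as gr

def on_category_change(category: str):
    placeholders = {
        "Query": "e.g., Count the total number of boats and describe the environment.",
        "Caption": "e.g., short, normal, detailed",
        "Point": "e.g., The gun held by the person.",
        "Detect": "e.g., The headlight of the car.",
    }
    # Returning a gr.Textbox from the handler updates the bound textbox in place.
    return gr.Textbox(placeholder=placeholders.get(category, "e.g., detect the object."))

with gr.Blocks() as demo:
    category_select = gr.Radio(["Query", "Caption", "Point", "Detect"], value="Query", label="Category")
    prompt_input = gr.Textbox(label="Prompt", lines=1)
    # Assumed wiring: the updated placeholder is routed back to prompt_input.
    category_select.change(fn=on_category_change, inputs=[category_select], outputs=[prompt_input])

demo.launch()
```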