prithivMLmods committed
Commit f002e6a · verified · 1 Parent(s): 8d83bb7

Update app.py

Files changed (1)
  1. app.py +24 -9
app.py CHANGED
@@ -116,6 +116,8 @@ def annotate_image(image: Image.Image, result: dict):
     if not isinstance(image, Image.Image) or not isinstance(result, dict):
         return image
 
+    # Ensure image is mutable
+    image = image.convert("RGB")
     original_width, original_height = image.size
 
     if "points" in result and result["points"]:
@@ -129,7 +131,8 @@ def annotate_image(image: Image.Image, result: dict):
         points_array = np.array(points_list).reshape(1, -1, 2)
         key_points = sv.KeyPoints(xy=points_array)
         vertex_annotator = sv.VertexAnnotator(radius=4, color=sv.Color.RED)
-        return vertex_annotator.annotate(scene=image.copy(), key_points=key_points)
+        annotated_image = vertex_annotator.annotate(scene=np.array(image.copy()), key_points=key_points)
+        return Image.fromarray(annotated_image)
 
     if "objects" in result and result["objects"]:
         boxes = []
@@ -139,17 +142,18 @@ def annotate_image(image: Image.Image, result: dict):
             x_max = obj.get("x_max", 0.0) * original_width
            y_max = obj.get("y_max", 0.0) * original_height
             boxes.append([x_min, y_min, x_max, y_max])
-
+
         if not boxes:
             return image
-
+
         detections = sv.Detections(xyxy=np.array(boxes))
 
         if len(detections) == 0:
             return image
 
         box_annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=2)
-        return box_annotator.annotate(scene=image.copy(), detections=detections)
+        annotated_image = box_annotator.annotate(scene=np.array(image.copy()), detections=detections)
+        return Image.fromarray(annotated_image)
 
     return image
 
@@ -243,6 +247,18 @@ def process_inputs(image, category, prompt):
 
     return qwen_annotated_image, qwen_text
 
+def on_category_change(category: str):
+    if category == "Query":
+        return gr.Textbox(placeholder="e.g., Count the total number of boats and describe the environment.")
+    elif category == "Caption":
+        return gr.Textbox(placeholder="e.g., short, normal, detailed")
+    elif category == "Point":
+        return gr.Textbox(placeholder="e.g., The gun held by the person.")
+    elif category == "Detect":
+        return gr.Textbox(placeholder="e.g., The headlight of the car.")
+    return gr.Textbox(placeholder="e.g., detect the object.")
+
+
 css = """
 #main-title h1 {
     font-size: 2.3em !important;
@@ -255,7 +271,7 @@ css = """
 with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
     with gr.Column(elem_id="col-container"):
         gr.Markdown("# **Qwen-3VL: Multimodal Understanding**", elem_id="main-title")
-
+
         with gr.Row():
             with gr.Column(scale=1):
                 image_input = gr.Image(type="pil", label="Upload Image")
@@ -266,7 +282,7 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                     interactive=True,
                 )
                 prompt_input = gr.Textbox(
-                    placeholder="e.g., detect the object.",
+                    placeholder="e.g., Count the total number of boats and describe the environment.",
                     label="Prompt",
                     lines=1,
                 )
@@ -283,12 +299,11 @@ with gr.Blocks(css=css, theme=steel_blue_theme) as demo:
                         ["examples/4.jpg", "Detect", "Headlight"],
                         ["examples/3.jpg", "Point", "Gun"],
                         ["examples/1.jpg", "Query", "Count the total number of boats and describe the environment."],
-                        ["examples/2.jpg", "Caption", "normal"],  # <-- FIX: Changed prompt to a valid length
-
+                        ["examples/2.jpg", "Caption", "a brief"],
                     ],
                     inputs=[image_input, category_select, prompt_input],
                 )
-
+
                 category_select.change(
                     fn=on_category_change,
                     inputs=[category_select],
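The core of this commit is feeding supervision's annotators NumPy arrays instead of PIL images and converting the result back with Image.fromarray, so the gr.Image(type="pil") output keeps receiving a PIL image. Below is a minimal sketch of that round-trip, assuming supervision, numpy, and Pillow are installed; the helper name, placeholder image, and box coordinates are illustrative and not taken from the repo.

```python
# Sketch of the PIL <-> NumPy round-trip used by the new annotate calls.
# draw_boxes, the placeholder image, and the box values are assumptions
# for illustration; they do not appear in app.py.
import numpy as np
import supervision as sv
from PIL import Image

def draw_boxes(image: Image.Image, xyxy: np.ndarray) -> Image.Image:
    # Convert the PIL image to an RGB NumPy array, mirroring the
    # np.array(image.copy()) calls added in this commit.
    scene = np.array(image.convert("RGB"))
    detections = sv.Detections(xyxy=xyxy)
    annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=2)
    annotated = annotator.annotate(scene=scene, detections=detections)
    # Convert back so downstream Gradio components still get a PIL image.
    return Image.fromarray(annotated)

if __name__ == "__main__":
    img = Image.new("RGB", (640, 480), "white")     # placeholder image
    boxes = np.array([[50.0, 60.0, 200.0, 220.0]])  # placeholder box, xyxy in pixels
    draw_boxes(img, boxes).save("annotated.png")
```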
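The new on_category_change helper returns a gr.Textbox whose placeholder matches the selected category, and category_select.change wires that handler to the selector. Here is a small self-contained sketch of the same pattern, assuming Gradio 4.x; the gr.Radio component and the outputs=[prompt_input] routing are assumptions, since the hunk above is cut off before the outputs argument.

```python
# Sketch of the category -> placeholder wiring added in this commit.
# gr.Radio and outputs=[prompt_input] are assumed, not confirmed by the diff.
import gradio as gr

def on_category_change(category: str):
    placeholders = {
        "Query": "e.g., Count the total number of boats and describe the environment.",
        "Caption": "e.g., short, normal, detailed",
        "Point": "e.g., The gun held by the person.",
        "Detect": "e.g., The headlight of the car.",
    }
    # Returning a gr.Textbox from the handler updates the bound textbox in place.
    return gr.Textbox(placeholder=placeholders.get(category, "e.g., detect the object."))

with gr.Blocks() as demo:
    category_select = gr.Radio(["Query", "Caption", "Point", "Detect"], value="Query", label="Category")
    prompt_input = gr.Textbox(label="Prompt", lines=1)
    # Assumed wiring: the updated placeholder is routed back to prompt_input.
    category_select.change(fn=on_category_change, inputs=[category_select], outputs=[prompt_input])

demo.launch()
```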