Spaces:

prithivMLmods
/

Qwen3-VL-HF-Demo

Running on Zero

App Files Files Community

prithivMLmods committed 4 days ago

Commit

4306537

verified ·

1 Parent(s): 42957be

update app

Browse files

Files changed (1) hide show

app.py +236 -295

app.py CHANGED Viewed

@@ -1,319 +1,260 @@
-import spaces
-import json
-import math
-import os
-import traceback
-from io import BytesIO
-from typing import Any, Dict, List, Optional, Tuple, Iterable
-import re
-import time
-from threading import Thread
-from io import BytesIO
-import uuid
-import tempfile
 import gradio as gr
-import numpy as np
 import torch
-from PIL import Image
 import supervision as sv
 from transformers import (
     Qwen3VLForConditionalGeneration,
-    AutoModelForCausalLM,
-    AutoProcessor,
-)
-from gradio.themes import Soft
-from gradio.themes.utils import colors, fonts, sizes
-# --- Theme and CSS Definition ---
-# Define the SteelBlue color palette
-colors.steel_blue = colors.Color(
-    name="steel_blue",
-    c50="#EBF3F8",
-    c100="#D3E5F0",
-    c200="#A8CCE1",
-    c300="#7DB3D2",
-    c400="#529AC3",
-    c500="#4682B4",  # SteelBlue base color
-    c600="#3E72A0",
-    c700="#36638C",
-    c800="#2E5378",
-    c900="#264364",
-    c950="#1E3450",
 )
-class SteelBlueTheme(Soft):
-    def __init__(
-        self,
-        *,
-        primary_hue: colors.Color | str = colors.gray,
-        secondary_hue: colors.Color | str = colors.steel_blue,
-        neutral_hue: colors.Color | str = colors.slate,
-        text_size: sizes.Size | str = sizes.text_lg,
-        font: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("Outfit"), "Arial", "sans-serif",
-        ),
-        font_mono: fonts.Font | str | Iterable[fonts.Font | str] = (
-            fonts.GoogleFont("IBM Plex Mono"), "ui-monospace", "monospace",
-        ),
-    ):
-        super().__init__(
-            primary_hue=primary_hue,
-            secondary_hue=secondary_hue,
-            neutral_hue=neutral_hue,
-            text_size=text_size,
-            font=font,
-            font_mono=font_mono,
         )
-        super().set(
-            background_fill_primary="*primary_50",
-            background_fill_primary_dark="*primary_900",
-            body_background_fill="linear-gradient(135deg, *primary_200, *primary_100)",
-            body_background_fill_dark="linear-gradient(135deg, *primary_900, *primary_800)",
-            button_primary_text_color="white",
-            button_primary_text_color_hover="white",
-            button_primary_background_fill="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            button_primary_background_fill_hover="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_dark="linear-gradient(90deg, *secondary_600, *secondary_700)",
-            button_primary_background_fill_hover_dark="linear-gradient(90deg, *secondary_500, *secondary_600)",
-            slider_color="*secondary_500",
-            slider_color_dark="*secondary_600",
-            block_title_text_weight="600",
-            block_border_width="3px",
-            block_shadow="*shadow_drop_lg",
-            button_primary_shadow="*shadow_drop_lg",
-            button_large_padding="11px",
-            color_accent_soft="*primary_100",
-            block_label_background_fill="*primary_200",
         )
-# Instantiate the new theme
-steel_blue_theme = SteelBlueTheme()
-css = """
-#main-title h1 {
-    font-size: 2.3em !important;
-}
-#output-title h2 {
-    font-size: 2.1em !important;
-}
-"""
-# --- Constants and Model Setup ---
-device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
-print("--- System Information ---")
-print("CUDA_VISIBLE_DEVICES=", os.environ.get("CUDA_VISIBLE_DEVICES"))
-print("torch.__version__ =", torch.__version__)
-print("torch.version.cuda =", torch.version.cuda)
-print("CUDA available:", torch.cuda.is_available())
-print("CUDA device count:", torch.cuda.device_count())
-if torch.cuda.is_available():
-    print("Current device:", torch.cuda.current_device())
-    print("Device name:", torch.cuda.get_device_name(torch.cuda.current_device()))
-print("Using device:", device)
-print("--------------------------")
-# --- Model Loading ---
-# Load moondream3
-print("Loading moondream3-preview...")
-MODEL_ID_MD3 = "Qwen/Qwen3-VL-32B-Instruct"
-model_md3 = Qwen3VLForConditionalGeneration.from_pretrained(
-    MODEL_ID_MD3,
-    trust_remote_code=True,
-    torch_dtype=torch.bfloat16,
-    device_map={"": "cuda"},
-)
-model_md3.compile()
-print("moondream3-preview loaded and compiled.")
-# --- Moondream3 Utility Functions ---
-def create_annotated_image(image, detection_result, object_name="Object"):
-    if not isinstance(detection_result, dict) or "objects" not in detection_result:
-        return image
-    original_width, original_height = image.size
-    annotated_image = np.array(image.convert("RGB"))
-    bboxes = []
-    labels = []
-    for i, obj in enumerate(detection_result["objects"]):
-        x_min = int(obj["x_min"] * original_width)
-        y_min = int(obj["y_min"] * original_height)
-        x_max = int(obj["x_max"] * original_width)
-        y_max = int(obj["y_max"] * original_height)
-        x_min = max(0, min(x_min, original_width))
-        y_min = max(0, min(y_min, original_height))
-        x_max = max(0, min(x_max, original_width))
-        y_max = max(0, min(y_max, original_height))
-        if x_max > x_min and y_max > y_min:
-            bboxes.append([x_min, y_min, x_max, y_max])
-            labels.append(f"{object_name} {i+1}")
-    if not bboxes:
-        return image
-    detections = sv.Detections(
-        xyxy=np.array(bboxes, dtype=np.float32),
-        class_id=np.arange(len(bboxes))
     )
-    bounding_box_annotator = sv.BoxAnnotator(
-        thickness=3,
-        color_lookup=sv.ColorLookup.INDEX
     )
-    label_annotator = sv.LabelAnnotator(
-        text_thickness=2,
-        text_scale=0.6,
-        color_lookup=sv.ColorLookup.INDEX
     )
-    annotated_image = bounding_box_annotator.annotate(
-        scene=annotated_image, detections=detections
     )
-    annotated_image = label_annotator.annotate(
-        scene=annotated_image, detections=detections, labels=labels
-    )
-    return Image.fromarray(annotated_image)
-def create_point_annotated_image(image, point_result):
-    if not isinstance(point_result, dict) or "points" not in point_result:
-        return image
-    original_width, original_height = image.size
-    annotated_image = np.array(image.convert("RGB"))
-    points = []
-    for point in point_result["points"]:
-        x = int(point["x"] * original_width)
-        y = int(point["y"] * original_height)
-        points.append([x, y])
-    if points:
-        points_array = np.array(points).reshape(1, -1, 2)
-        key_points = sv.KeyPoints(xy=points_array)
-        vertex_annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
-        annotated_image = vertex_annotator.annotate(
-            scene=annotated_image, key_points=key_points
-        )
-    return Image.fromarray(annotated_image)
-@spaces.GPU()
-def detect_objects_md3(image, prompt, task_type, max_objects):
-    STANDARD_SIZE = (1024, 1024)
-    if image is None:
-        raise gr.Error("Please upload an image.")
-    image.thumbnail(STANDARD_SIZE)
-    t0 = time.perf_counter()
-    if task_type == "Object Detection":
-        settings = {"max_objects": max_objects} if max_objects > 0 else {}
-        result = model_md3.detect(image, prompt, settings=settings)
-        annotated_image = create_annotated_image(image, result, prompt)
-    elif task_type == "Point Detection":
-        result = model_md3.point(image, prompt)
-        annotated_image = create_point_annotated_image(image, result)
-    elif task_type == "Caption":
-        result = model_md3.caption(image, length="normal")
-        annotated_image = image
-    else:
-        result = model_md3.query(image=image, question=prompt, reasoning=True)
-        annotated_image = image
-    elapsed_ms = (time.perf_counter() - t0) * 1_000
-    if isinstance(result, dict):
-        if "objects" in result:
-          output_text = f"Found {len(result['objects'])} objects:\n"
-          for i, obj in enumerate(result['objects'], 1):
-              output_text += f"\n{i}. Bounding box: ({obj['x_min']:.3f}, {obj['y_min']:.3f}, {obj['x_max']:.3f}, {obj['y_max']:.3f})"
-        elif "points" in result:
-            output_text = f"Found {len(result['points'])} points:\n"
-            for i, point in enumerate(result['points'], 1):
-                output_text += f"\n{i}. Point: ({point['x']:.3f}, {point['y']:.3f})"
-        elif "caption" in result:
-            output_text = result['caption']
-        elif "answer" in result:
-            output_text = f"Reasoning: {result.get('reasoning', 'N/A')}\n\nAnswer: {result['answer']}"
-        else:
-            output_text = json.dumps(result, indent=2)
-    else:
-        output_text = str(result)
-    timing_text = f"Inference time: {elapsed_ms:.0f} ms"
-    return annotated_image, output_text, timing_text
-# --- Gradio Interface ---
-def create_gradio_interface():
-    """Builds and returns the Gradio web interface."""
-    with gr.Blocks(theme=steel_blue_theme, css=css) as demo:
-        gr.Markdown("# **🌝 Moondream3 Lab**", elem_id="main-title")
-        gr.Markdown("Explore the capabilities of the Moondream3 Vision Language Model for tasks like Object/Point Detection, VQA, and Captioning.")
-        with gr.Row():
-            with gr.Column(scale=1):
-                md3_image_input = gr.Image(label="Upload an image", type="pil", height=400)
-                md3_task_type = gr.Radio(
-                    choices=["Object Detection", "Point Detection", "Caption", "Visual Question Answering"],
-                    label="Task Type", value="Object Detection"
-                )
-                md3_prompt_input = gr.Textbox(
-                    label="Prompt (object to detect/question to ask)",
-                    placeholder="e.g., 'car', 'person', 'What's in this image?'"
-                )
-                md3_max_objects = gr.Number(
-                    label="Max Objects (for Object Detection only)",
-                    value=10, minimum=1, maximum=50, step=1, visible=True
-                )
-                md3_generate_btn = gr.Button(value="Submit", variant="primary")
-            with gr.Column(scale=1):
-                md3_output_image = gr.Image(type="pil", label="Result", height=400)
-                md3_output_textbox = gr.Textbox(label="Model Response", lines=10, show_copy_button=True)
-                md3_output_time = gr.Markdown()
-        gr.Examples(
-            examples=[
-                ["md3/1.jpg", "Object Detection", "boats", 7],
-                ["md3/2.jpg", "Point Detection", "children", 7],
-                ["md3/3.png", "Caption", "", 5],
-                ["md3/4.jpeg", "Visual Question Answering", "Analyze the GDP trend over the years.", 5],
-            ],
-            inputs=[md3_image_input, md3_task_type, md3_prompt_input, md3_max_objects],
-            label="Click an example to populate inputs"
-        )
-        # Event listeners for the interface
-        def update_max_objects_visibility(task):
-            return gr.update(visible=(task == "Object Detection"))
-        md3_task_type.change(fn=update_max_objects_visibility, inputs=[md3_task_type], outputs=[md3_max_objects])
-        md3_generate_btn.click(
-            fn=detect_objects_md3,
-            inputs=[md3_image_input, md3_prompt_input, md3_task_type, md3_max_objects],
-            outputs=[md3_output_image, md3_output_textbox, md3_output_time]
-        )
-    return demo
 if __name__ == "__main__":
-    demo = create_gradio_interface()
-    demo.queue(max_size=50).launch(ssr_mode=False, mcp_server=True, show_error=True)

 import gradio as gr
+from gradio.themes.ocean import Ocean
 import torch
+import numpy as np
 import supervision as sv
 from transformers import (
     Qwen3VLForConditionalGeneration,
+    Qwen3VLProcessor,
 )
+import json
+import ast
+import re
+from PIL import Image
+from spaces import GPU
+# --- Constants and Configuration ---
+# Use CUDA when torch reports it as available, otherwise fall back to CPU.
+# The same string is used as the model's `device_map` and as the target
+# device for the processor outputs in `run_qwen_inference`.
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
+# Forwarded as `torch_dtype` to `from_pretrained` below.
+DTYPE = "auto"
+# Task names offered by the "Select Task" radio; the first entry is its
+# initial value.
+CATEGORIES = ["Query", "Caption", "Point", "Detect"]
+# Placeholder text for the prompt textbox, keyed by task name.
+PLACEHOLDERS = {
+    "Query": "What is in this image?",
+    "Caption": "Select a caption length from the suggestions below.",
+    "Point": "Select an object from suggestions or enter a custom one.",
+    "Detect": "Select an object from suggestions or enter a custom one.",
+}
+# The model and its processor are loaded once, at import time, from the same
+# checkpoint id and are shared by every inference call in this module.
+# NOTE(review): the checkpoint id is repeated in both calls — keep them in sync.
+qwen_model = Qwen3VLForConditionalGeneration.from_pretrained(
+    "Qwen/Qwen3-VL-32B-Instruct",
+    torch_dtype=DTYPE,
+    device_map=DEVICE,
+).eval()
+qwen_processor = Qwen3VLProcessor.from_pretrained(
+    "Qwen/Qwen3-VL-32B-Instruct",
+)
+print("Model loaded successfully.")
+# --- Utility Functions ---
+def safe_parse_json(text: str):
+    """Safely parse JSON or Python literal from a string, cleaning it first."""
+    # Find the JSON object within the text
+    match = re.search(r'\{.*\}', text, re.DOTALL)
+    if not match:
+        return {}
+    text = match.group(0)
+    try:
+        return json.loads(text)
+    except json.JSONDecodeError:
+        try:
+            # Fallback for Python dictionary literals
+            return ast.literal_eval(text)
+        except (ValueError, SyntaxError):
+            return {}
+def annotate_image(image: Image.Image, result: dict, category: str):
+    """Draws annotations on the image based on the model's output."""
+    if not isinstance(image, Image.Image) or not isinstance(result, dict):
+        return image
+    image_np = np.array(image.convert("RGB"))
+    # Handle Point annotations
+    if category == "Point" and "points" in result and result["points"]:
+        points_xy = np.array(result["points"])
+        if points_xy.size == 0:
+            return image
+        # Denormalize points from [0, 1] range to image dimensions
+        points_xy *= np.array([image.width, image.height])
+        key_points = sv.KeyPoints(xy=points_xy.reshape(1, -1, 2))
+        annotator = sv.VertexAnnotator(radius=8, color=sv.Color.RED)
+        annotated_image = annotator.annotate(scene=image_np.copy(), key_points=key_points)
+        return Image.fromarray(annotated_image)
+    # Handle Detection annotations
+    if category == "Detect" and "objects" in result and result["objects"]:
+        boxes_xyxy = np.array(result["objects"])
+        if boxes_xyxy.size == 0:
+            return image
+        # Denormalize boxes from [0, 1] range to image dimensions
+        boxes_xyxy *= np.array([image.width, image.height, image.width, image.height])
+        detections = sv.Detections(xyxy=boxes_xyxy)
+        annotator = sv.BoxAnnotator(color_lookup=sv.ColorLookup.INDEX, thickness=4)
+        annotated_image = annotator.annotate(scene=image_np.copy(), detections=detections)
+        return Image.fromarray(annotated_image)
+    return image
+# --- Inference Functions ---
+def run_qwen_inference(image: Image.Image, prompt: str):
+    """Core function to run inference with the Qwen3-VL model."""
+    messages = [{"role": "user", "content": [{"type": "image"}, {"type": "text", "text": prompt}]}]
+    inputs = qwen_processor.apply_chat_template(
+        messages,
+        tokenize=True,
+        add_generation_prompt=True,
+        return_dict=True,
+        return_tensors="pt",
+    ).to(DEVICE)
+    with torch.inference_mode():
+        generated_ids = qwen_model.generate(**inputs, max_new_tokens=512)
+    generated_ids_trimmed = generated_ids[:, inputs.input_ids.shape[1]:]
+    output_text = qwen_processor.batch_decode(
+        generated_ids_trimmed,
+        skip_special_tokens=True,
+        clean_up_tokenization_spaces=False,
+    )[0]
+    return output_text
+@GPU
+def get_suggested_objects(image: Image.Image):
+    """Get suggested objects in the image using Qwen3-VL to populate radio buttons."""
+    if image is None:
+        return gr.Radio(choices=[], visible=False)
+    try:
+        prompt = "List the 3 most prominent objects in this image as a Python list of strings. Example: ['car', 'tree', 'person']"
+        result_text = run_qwen_inference(image, prompt)
+        match = re.search(r'\[.*?\]', result_text)
+        if match:
+            suggestions = ast.literal_eval(match.group())
+            if isinstance(suggestions, list) and suggestions:
+                return gr.Radio(choices=suggestions, visible=True, interactive=True)
+    except Exception as e:
+        print(f"Error getting suggestions with Qwen: {e}")
+    return gr.Radio(choices=[], visible=False)
+@GPU
+def process_qwen(image: Image.Image, category: str, prompt: str):
+    """Process inputs based on the selected category, returning text and data for annotation."""
+    if category == "Query":
+        return run_qwen_inference(image, prompt), {}
+    elif category == "Caption":
+        full_prompt = f"Provide a {prompt} length caption for the image."
+        return run_qwen_inference(image, full_prompt), {}
+    elif category == "Point":
+        full_prompt = (
+            f"Provide 2D point coordinates for '{prompt}'. Respond ONLY with a JSON object like "
+            f"`{{\"points\": [[x1, y1], [x2, y2], ...]}}`. The coordinates must be normalized between 0.0 and 1.0."
         )
+        output_text = run_qwen_inference(image, full_prompt)
+        parsed_json = safe_parse_json(output_text)
+        # Ensure the parsed data has the correct structure
+        if "points" not in parsed_json or not isinstance(parsed_json["points"], list):
+            return output_text, {}
+        return output_text, parsed_json
+    elif category == "Detect":
+        full_prompt = (
+            f"Provide bounding box coordinates for '{prompt}'. Respond ONLY with a JSON object like "
+            f"`{{\"objects\": [[x_min, y_min, x_max, y_max], ...]}}`. The coordinates must be normalized between 0.0 and 1.0."
         )
+        output_text = run_qwen_inference(image, full_prompt)
+        parsed_json = safe_parse_json(output_text)
+        if "objects" not in parsed_json or not isinstance(parsed_json["objects"], list):
+            return output_text, {}
+        return output_text, parsed_json
+    return "Invalid category", {}
+# --- Gradio Interface Logic ---
+def on_category_and_image_change(image, category):
+    """Handle UI changes when the image or category is updated."""
+    text_box = gr.Textbox(value="", placeholder=PLACEHOLDERS.get(category, ""), interactive=True)
+    if category == "Caption":
+        return gr.Radio(choices=["short", "normal", "long"], value="normal", visible=True), text_box
+    if image is None or category not in ["Point", "Detect"]:
+        return gr.Radio(choices=[], visible=False), text_box
+    return get_suggested_objects(image), text_box
+def process_inputs(image, category, prompt):
+    """Main function to handle the user's submission."""
+    if image is None:
+        raise gr.Error("Please upload an image.")
+    if not prompt and category not in ["Caption"]:
+         raise gr.Error("Please provide a prompt or select a suggestion.")
+    if category == "Caption" and not prompt:
+        prompt = "normal" # Default caption length
+    image.thumbnail((1024, 1024)) # Resize for faster inference
+    qwen_text, qwen_data = process_qwen(image, category, prompt)
+    qwen_annotated_image = annotate_image(image, qwen_data, category)
+    return qwen_annotated_image, qwen_text
+# --- Gradio UI Layout ---
+with gr.Blocks(theme=Ocean()) as demo:
+    gr.Markdown("# 👓 Object Understanding with Qwen3-VL")
+    gr.Markdown("### Explore object detection, keypoint detection, and captioning using natural language prompts.")
+    with gr.Row():
+        with gr.Column(scale=1):
+            image_input = gr.Image(type="pil", label="Input Image")
+            category_select = gr.Radio(
+                choices=CATEGORIES, value=CATEGORIES[0], label="Select Task", interactive=True
+            )
+            suggestions_radio = gr.Radio(
+                choices=[], label="Suggestions", visible=False, interactive=True
+            )
+            prompt_input = gr.Textbox(
+                placeholder=PLACEHOLDERS[CATEGORIES[0]], label="Prompt", lines=2
+            )
+            submit_btn = gr.Button("Generate", variant="primary")
+        with gr.Column(scale=2):
+            gr.Markdown("### Qwen/Qwen3-VL-4B-Instruct Output")
+            qwen_img_output = gr.Image(label="Annotated Image")
+            qwen_text_output = gr.Textbox(label="Text Output", lines=8, interactive=False, show_copy_button=True)
+    gr.Examples(
+        examples=[
+            ["examples/cars.jpg", "Query", "How many cars are in the image?"],
+            ["examples/dog_beach.jpg", "Detect", "dog"],
+            ["examples/person_skiing.jpg", "Point", "the person's head"],
+            ["examples/dog_beach.jpg", "Caption", "short"],
+        ],
+        inputs=[image_input, category_select, prompt_input],
     )
+    # --- Event Listeners ---
+    category_select.change(
+        fn=on_category_and_image_change,
+        inputs=[image_input, category_select],
+        outputs=[suggestions_radio, prompt_input],
     )
+    image_input.change(
+        fn=on_category_and_image_change,
+        inputs=[image_input, category_select],
+        outputs=[suggestions_radio, prompt_input],
     )
+    suggestions_radio.change(fn=lambda x: x, inputs=suggestions_radio, outputs=prompt_input)
+    submit_btn.click(
+        fn=process_inputs,
+        inputs=[image_input, category_select, prompt_input],
+        outputs=[qwen_img_output, qwen_text_output],
     )
 if __name__ == "__main__":
+    demo.launch()