Darius Morawiec committed
Commit 9401db3 · 1 Parent(s): 15693ed

Refactor object detection logic and update UI components for improved usability

Files changed (1)
  1. app.py +45 -99
app.py CHANGED
@@ -2,9 +2,9 @@ import gradio as gr
 import PIL.Image
 import torch
 from transformers import AutoModelForZeroShotObjectDetection, AutoProcessor
+from transformers.image_utils import load_image
 
-# DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
-DEVICE = "cpu"
+DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
 
 
 class Detector:
@@ -67,116 +67,62 @@ def _postprocess(detections):
 
 def detect_objects(image, labels, confidence_threshold):
     labels = [label.strip() for label in labels.split(",")]
-    return (
-        (
-            image,
-            _postprocess(
-                models["tiny"].detect(
-                    image,
-                    labels,
-                    threshold=confidence_threshold,
-                )
-            ),
-        ),
-        (
-            image,
-            _postprocess(
-                models["base"].detect(
-                    image,
-                    labels,
-                    threshold=confidence_threshold,
-                )
-            ),
-        ),
-        (
+
+    detections = []
+    for model_name in models.keys():
+        detection = models[model_name].detect(
             image,
-            _postprocess(
-                models["large"].detect(
-                    image,
-                    labels,
-                    threshold=confidence_threshold,
-                )
-            ),
-        ),
-    )
+            labels,
+            threshold=confidence_threshold,
+        )
+        detections.append(_postprocess(detection))
+
+    return tuple((image, det) for det in detections)
 
 
 with gr.Blocks() as demo:
-    gr.Markdown("# LLMDet Open Vocabulary Object Detection")
-
-    confidence_slider = gr.Slider(
-        0,
-        1,
-        value=0.4,
-        step=0.01,
-        interactive=True,
-        label="Confidence threshold",
-    )
+    gr.Markdown("# [LLMDet](https://arxiv.org/abs/2501.18954) Arena ")
 
-    labels = [
-        "backpack",
-        "bag",
-        "belt",
-        "blouse",
-        "boot",
-        "bracelet",
-        "cap",
-        "cardigan",
-        "coat",
-        "dress",
-        "earring",
-        "flipflop",
-        "glasses",
-        "glove",
-        "handbag",
-        "hat",
-        "heels",
-        "jacket",
-        "jeans",
-        "loafer",
-        "necklace",
-        "pullover",
-        "raincoat",
-        "ring",
-        "sandal",
-        "scarf",
-        "shirt",
-        "shoe",
-        "shorts",
-        "skirt",
-        "slippers",
-        "sneaker",
-        "socks",
-        "suitcase",
-        "sunglasses",
-        "sweater",
-        "tshirt",
-        "tie",
-        "top",
-        "trouser",
-        "umbrella",
-        "vest",
-        "watch",
-    ]
-
-    # Requested labels
-    text_input = gr.Textbox(
-        label="Object labels (comma separated)!",
-        placeholder="shirt, jeans, shoe",
-        lines=1,
-        value=",".join(labels),
-    )
+    with gr.Row():
+        with gr.Column():
+            gr.Markdown("## Input Image")
+
+            image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+            image = load_image(image_url)
+            image_input = gr.Image(type="pil", image_mode="RGB", value=image)
+
+        with gr.Column():
+            gr.Markdown("## Settings")
+
+            confidence_slider = gr.Slider(
+                0,
+                1,
+                value=0.4,
+                step=0.01,
+                interactive=True,
+                label="Confidence threshold:",
+            )
+
+            labels = ["a cat", "a remote control"]
+
+            text_input = gr.Textbox(
+                label="Object labels (comma separated):",
+                placeholder=",".join(labels),
+                lines=1,
+                value=",".join(labels),
+            )
+
+    with gr.Row():
+        detect_button = gr.Button("Run Object Detection")
 
     with gr.Row():
-        image_input = gr.Image(type="pil", image_mode="RGB")
+        gr.Markdown("## Output Annotated Images")
 
     with gr.Row():
         output_annotated_image_tiny = gr.AnnotatedImage(label="TINY")
         output_annotated_image_base = gr.AnnotatedImage(label="BASE")
         output_annotated_image_large = gr.AnnotatedImage(label="LARGE")
 
-    detect_button = gr.Button("Detect")
-
     # Connect the button to the detection function
     detect_button.click(
         fn=detect_objects,
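To see what the refactored detect_objects now hands to the three gr.AnnotatedImage outputs, here is a minimal, self-contained sketch. The Detector internals and the models registry are not part of this diff (only class Detector: and the call sites are visible), so they are stubbed out below as assumptions; the detect_objects body is copied from the new app.py, and the ((x1, y1, x2, y2), label) annotation format is simply what gr.AnnotatedImage accepts for bounding boxes.

```python
# Minimal sketch, not the repository's code: the Detector class and the real
# LLMDet checkpoints are not shown in this diff, so they are replaced with a
# stub that only mimics the data shapes flowing into gr.AnnotatedImage.
import PIL.Image


class StubDetector:
    """Stand-in for Detector: returns one fake box per requested label."""

    def detect(self, image, labels, threshold=0.4):
        w, h = image.size
        return [
            {"box": (10 * i, 10 * i, w // 2, h // 2), "label": label}
            for i, label in enumerate(labels)
        ]


def _postprocess(detections):
    # gr.AnnotatedImage accepts annotations as ((x1, y1, x2, y2), label) pairs.
    return [(d["box"], d["label"]) for d in detections]


# Assumed registry; the diff only shows the keys "tiny", "base", "large".
models = {"tiny": StubDetector(), "base": StubDetector(), "large": StubDetector()}


def detect_objects(image, labels, confidence_threshold):
    # Body copied from the refactored app.py above.
    labels = [label.strip() for label in labels.split(",")]

    detections = []
    for model_name in models.keys():
        detection = models[model_name].detect(
            image,
            labels,
            threshold=confidence_threshold,
        )
        detections.append(_postprocess(detection))

    return tuple((image, det) for det in detections)


if __name__ == "__main__":
    img = PIL.Image.new("RGB", (640, 480))
    tiny, base, large = detect_objects(img, "a cat, a remote control", 0.4)
    print(tiny[1])
    # [((0, 0, 320, 240), 'a cat'), ((10, 10, 320, 240), 'a remote control')]
```

Since models is a plain dict, the order of the returned tuple follows insertion order (tiny, base, large); this has to match the order of the outputs= list passed to detect_button.click(...), which is cut off at the end of the hunk above.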