MM_Grounding_DINO_demo

Running on Zero

App Files Community

developer0hye committed on Aug 22

Commit

ea17cac

verified ·

1 Parent(s): 077bef8

Update app.py

Browse files

Files changed (1) hide show

app.py +30 -14

app.py CHANGED Viewed

@@ -14,16 +14,28 @@ from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
 # Add supervision for better visualization
 import supervision as sv
-# Model ID for Hugging Face
-model_id = "rziga/mm_grounding_dino_base_all"
-# Load model and processor using Transformers
 device = "cuda" if torch.cuda.is_available() else "cpu"
-processor = AutoProcessor.from_pretrained(model_id)
-model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
 @spaces.GPU
-def run_grounding(input_image, grounding_caption, box_threshold, text_threshold):
     # Convert numpy array to PIL Image if needed
     if isinstance(input_image, np.ndarray):
         if input_image.ndim == 3:
@@ -63,8 +75,6 @@ def run_grounding(input_image, grounding_caption, box_threshold, text_threshold)
     for i, (box, score, label) in enumerate(zip(result["boxes"], result["scores"], result["labels"])):
         # box is xyxy format [xmin, ymin, xmax, ymax]
-        if label.strip() == "":
-            continue
         xyxy = box.tolist()
         boxes.append(xyxy)
         labels.append(label)
@@ -144,12 +154,18 @@ if __name__ == "__main__":
   }
 """
     with gr.Blocks(css=css) as demo:
-        gr.Markdown("<h1><center>MM Grounding DINO Base<h1><center>")
-        gr.Markdown("<h3><center>Open-World Detection with <a href='https://huggingface.co/openmmlab-community/mm_grounding_dino_base_all'>MM Grounding DINO</a><h3><center>")
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image", type="pil")
                 grounding_caption = gr.Textbox(
                     label="Detection Prompt (lowercase + each ends with a dot)",
                     value="a person. a car."
@@ -181,16 +197,16 @@ if __name__ == "__main__":
         run_button.click(
             fn=run_grounding,
-            inputs=[input_image, grounding_caption, box_threshold, text_threshold],
             outputs=[gallery, det_text]
         )
         gr.Examples(
             examples=[
-                ["000000039769.jpg", "a cat. a remote control.", 0.3, 0.25],
-                ["KakaoTalk_20250430_163200504.jpg", "cup. screen. hand.", 0.3, 0.25]
             ],
-            inputs=[input_image, grounding_caption, box_threshold, text_threshold],
             outputs=[gallery, det_text],
             fn=run_grounding,
             cache_examples=True,

 # Add supervision for better visualization
 import supervision as sv
+# Model IDs for Hugging Face
+MODEL_IDS = {
+    "MM Grounding DINO Large": "rziga/mm_grounding_dino_large_all",
+    "MM Grounding DINO Base": "rziga/mm_grounding_dino_base_all"
+}
+# Global variables for model caching
 device = "cuda" if torch.cuda.is_available() else "cpu"
+loaded_model_name = None
+processor = None
+model = None
 @spaces.GPU
+def run_grounding(input_image, grounding_caption, model_choice, box_threshold, text_threshold):
+    global loaded_model_name, processor, model
+    # Load or reload model if changed
+    if loaded_model_name != model_choice:
+        model_id = MODEL_IDS[model_choice]
+        processor = AutoProcessor.from_pretrained(model_id)
+        model = AutoModelForZeroShotObjectDetection.from_pretrained(model_id).to(device)
+        loaded_model_name = model_choice
     # Convert numpy array to PIL Image if needed
     if isinstance(input_image, np.ndarray):
         if input_image.ndim == 3:
     for i, (box, score, label) in enumerate(zip(result["boxes"], result["scores"], result["labels"])):
         # box is xyxy format [xmin, ymin, xmax, ymax]
         xyxy = box.tolist()
         boxes.append(xyxy)
         labels.append(label)
   }
 """
     with gr.Blocks(css=css) as demo:
+        gr.Markdown("<h1><center>MM Grounding DINO (Large & Base)<h1><center>")
+        gr.Markdown("<h3><center>Open-World Detection with MM Grounding DINO Models<h3><center>")
         with gr.Row():
             with gr.Column():
                 input_image = gr.Image(label="Input Image", type="pil")
+                model_choice = gr.Radio(
+                    choices=list(MODEL_IDS.keys()),
+                    value="MM Grounding DINO Large",
+                    label="Select Model",
+                    info="Choose between Large (better performance) or Base (faster) model"
+                )
                 grounding_caption = gr.Textbox(
                     label="Detection Prompt (lowercase + each ends with a dot)",
                     value="a person. a car."
         run_button.click(
             fn=run_grounding,
+            inputs=[input_image, grounding_caption, model_choice, box_threshold, text_threshold],
             outputs=[gallery, det_text]
         )
         gr.Examples(
             examples=[
+                ["000000039769.jpg", "a cat. a remote control.", "MM Grounding DINO Large", 0.3, 0.25],
+                ["KakaoTalk_20250430_163200504.jpg", "cup. screen. hand.", "MM Grounding DINO Base", 0.3, 0.25]
             ],
+            inputs=[input_image, grounding_caption, model_choice, box_threshold, text_threshold],
             outputs=[gallery, det_text],
             fn=run_grounding,
             cache_examples=True,