import gradio as gr
import spaces
import numpy as np
import supervision as sv
import torch
from autodistill.detection import CaptionOntology
from autodistill.helpers import load_image
from autodistill_grounded_sam_2 import GroundedSAM2
from autodistill_grounded_sam_2.helpers import combine_detections

base_model = GroundedSAM2(
    ontology=CaptionOntology({}),
    model="Grounding DINO",
    grounding_dino_box_threshold=0.25,
)


@spaces.GPU
def greet(image, prompt):
    """Segment the classes named in a comma-separated prompt.

    Grounding DINO (or Florence 2) detects boxes for each class, then
    SAM 2 is prompted with each box to produce a segmentation mask.
    """
    # Convert the Gradio input image to OpenCV (BGR numpy) format.
    image = load_image(image, return_format="cv2")

    if base_model.model == "Florence 2":
        detections = base_model.florence_2_predictor.predict(image)
    elif base_model.model == "Grounding DINO":
        # Run Grounding DINO once per comma-separated class in the prompt.
        detections_list = []

        for description in prompt.split(","):
            detections = base_model.grounding_dino_model.predict_with_classes(
                image=image,
                classes=[description],
                box_threshold=base_model.grounding_dino_box_threshold,
                text_threshold=base_model.grounding_dino_text_threshold,
            )
            detections_list.append(detections)

        # Merge the per-class results, giving each list its own class ID.
        detections = combine_detections(
            detections_list, overwrite_class_ids=range(len(detections_list))
        )

    # Prompt SAM 2 with each detected box to get a mask per detection.
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
        base_model.sam_2_predictor.set_image(image)
        result_masks = []

        for box in detections.xyxy:
            masks, scores, _ = base_model.sam_2_predictor.predict(
                box=box, multimask_output=False
            )
            index = np.argmax(scores)
            masks = masks.astype(bool)
            result_masks.append(masks[index])

    detections.mask = np.array(result_masks)

    # Keep only confident detections before annotating.
    detections = detections[detections.confidence > 0.3]

    mask_annotator = sv.MaskAnnotator()
    annotated_image = mask_annotator.annotate(
        image.copy(), detections=detections
    )

    return annotated_image


demo = gr.Interface(
    fn=greet,
    inputs=[gr.Image(), gr.Textbox(lines=2, label="Prompt")],
    outputs="image",
)
demo.launch()