import gradio as gr
import spaces
import numpy as np
import supervision as sv
import torch
from autodistill.detection import CaptionOntology
from autodistill.helpers import load_image
from autodistill_grounded_sam_2 import GroundedSAM2
from autodistill_grounded_sam_2.helpers import combine_detections

base_model = GroundedSAM2(
    ontology=CaptionOntology({}),
    model="Grounding DINO",
    grounding_dino_box_threshold=0.25,
)


@spaces.GPU
def greet(image, prompt):
    """Segment the classes named in a comma-separated prompt.

    Grounding DINO (or Florence 2) detects boxes for each class, then
    SAM 2 is prompted with each box to produce a segmentation mask.
    """
    # Convert the Gradio input image to OpenCV (BGR numpy) format.
    image = load_image(image, return_format="cv2")

    if base_model.model == "Florence 2":
        detections = base_model.florence_2_predictor.predict(image)
    elif base_model.model == "Grounding DINO":
        # Run Grounding DINO once per comma-separated class in the prompt.
        detections_list = []

        for description in prompt.split(","):
            detections = base_model.grounding_dino_model.predict_with_classes(
                image=image,
                classes=[description],
                box_threshold=base_model.grounding_dino_box_threshold,
                text_threshold=base_model.grounding_dino_text_threshold,
            )
            detections_list.append(detections)

        # Merge the per-class results, giving each list its own class ID.
        detections = combine_detections(
            detections_list, overwrite_class_ids=range(len(detections_list))
        )

    # Prompt SAM 2 with each detected box to get a mask per detection.
    with torch.inference_mode(), torch.autocast("cuda", dtype=torch.bfloat16):
        base_model.sam_2_predictor.set_image(image)
        result_masks = []

        for box in detections.xyxy:
            masks, scores, _ = base_model.sam_2_predictor.predict(
                box=box, multimask_output=False
            )
            index = np.argmax(scores)
            masks = masks.astype(bool)
            result_masks.append(masks[index])

    detections.mask = np.array(result_masks)

    # Keep only confident detections before annotating.
    detections = detections[detections.confidence > 0.3]

    mask_annotator = sv.MaskAnnotator()
    annotated_image = mask_annotator.annotate(
        image.copy(), detections=detections
    )

    return annotated_image


demo = gr.Interface(
    fn=greet,
    inputs=[gr.Image(), gr.Textbox(lines=2, label="Prompt")],
    outputs="image",
)
demo.launch()