import gradio as gr
from autodistill_gpt_4v import GPT4V
from autodistill.detection import CaptionOntology
from autodistill_grounded_sam import GroundedSAM
from autodistill.utils import plot
import cv2
from autodistill.core.custom_detection_model import CustomDetectionModel

MARKDOWN = """
# Grounded SAM-GPT4V

Use Grounding DINO, Meta AI's Segment Anything (SAM), and GPT-4V to label specific objects.

Visit the [awesome-openai-vision-api-experiments](https://github.com/roboflow/awesome-openai-vision-api-experiments)
repository to find more OpenAI Vision API experiments or contribute your own."""


def respond(api_key, input_image, dino_prompt, gpt_prompt):
    # Gradio provides the image as an RGB numpy array, while OpenCV's imwrite
    # assumes BGR, so colors in the saved file may be swapped unless converted.
    # input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
    cv2.imwrite("input.jpg", input_image)

    # Two-stage model: Grounded SAM localizes regions matching the detection
    # prompt, then GPT-4V classifies each region against the comma-separated
    # class list.
    DINOGPT = CustomDetectionModel(
        detection_model=GroundedSAM(
            CaptionOntology({dino_prompt: dino_prompt})
        ),
        classification_model=GPT4V(
            CaptionOntology({k: k for k in gpt_prompt.split(", ")}),
            api_key=api_key
        )
    )

    results = DINOGPT.predict("input.jpg")

    # Draw the predictions on the original image and return the annotated array.
    result = plot(
        image=cv2.imread("input.jpg"),
        detections=results,
        classes=gpt_prompt.split(", "),
        raw=True
    )

    return result


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            api_key_textbox = gr.Textbox(
                label="OpenAI API KEY", type="password")
            dino_prompt = gr.Textbox(label="Grounded SAM Prompt")
            gpt_prompt = gr.Textbox(label="GPT-4V Prompt")
            input_image = gr.Image(type="numpy", label="Input Image")
        with gr.Column():
            output_image = gr.Image(type="numpy", label="Output Image")
            submit_button = gr.Button()

    submit_button.click(
        fn=respond,
        inputs=[api_key_textbox, input_image, dino_prompt, gpt_prompt],
        outputs=[output_image]
    )

demo.launch()
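
# Setup note (a sketch, not part of the original script): the imports above
# assume the Autodistill base package, its Grounded SAM and GPT-4V modules,
# Gradio, and OpenCV are installed. The package names below follow
# Autodistill's usual naming convention and are an assumption rather than
# something stated in this file:
#
#   pip install autodistill autodistill-grounded-sam autodistill-gpt-4v gradio opencv-python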