import gradio as gr
from autodistill_gpt_4v import GPT4V
from autodistill.detection import CaptionOntology
from autodistill_grounded_sam import GroundedSAM
from autodistill.utils import plot
import cv2
from autodistill.core.custom_detection_model import CustomDetectionModel

MARKDOWN = """
# Grounded SAM-GPT4V

Use Grounding DINO, Meta AI's Segment Anything (SAM), and GPT-4V to label specific objects.

Visit the [awesome-openai-vision-api-experiments](https://github.com/roboflow/awesome-openai-vision-api-experiments)
repository to find more OpenAI Vision API experiments or contribute your own."""


def respond(api_key, input_image, dino_prompt, gpt_prompt):
    # Gradio provides the image as an RGB numpy array, while OpenCV's imwrite
    # assumes BGR, so colors in the saved file may be swapped unless converted.
    # input_image = cv2.cvtColor(input_image, cv2.COLOR_BGR2RGB)
    cv2.imwrite("input.jpg", input_image)

    # Two-stage model: Grounded SAM localizes regions matching the detection
    # prompt, then GPT-4V classifies each region against the comma-separated
    # class list.
    DINOGPT = CustomDetectionModel(
        detection_model=GroundedSAM(
            CaptionOntology({dino_prompt: dino_prompt})
        ),
        classification_model=GPT4V(
            CaptionOntology({k: k for k in gpt_prompt.split(", ")}),
            api_key=api_key
        )
    )

    results = DINOGPT.predict("input.jpg")

    # Draw the predictions on the original image and return the annotated array.
    result = plot(
        image=cv2.imread("input.jpg"),
        detections=results,
        classes=gpt_prompt.split(", "),
        raw=True
    )

    return result


with gr.Blocks() as demo:
    gr.Markdown(MARKDOWN)
    with gr.Row():
        with gr.Column():
            api_key_textbox = gr.Textbox(
                label="OpenAI API KEY", type="password")
            dino_prompt = gr.Textbox(label="Grounded SAM Prompt")
            gpt_prompt = gr.Textbox(label="GPT-4V Prompt")
            input_image = gr.Image(type="numpy", label="Input Image")
        with gr.Column():
            output_image = gr.Image(type="numpy", label="Output Image")
            submit_button = gr.Button()

    submit_button.click(
        fn=respond,
        inputs=[api_key_textbox, input_image, dino_prompt, gpt_prompt],
        outputs=[output_image]
    )

demo.launch()
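
# Setup note (a sketch, not part of the original script): the imports above
# assume the Autodistill base package, its Grounded SAM and GPT-4V modules,
# Gradio, and OpenCV are installed. The package names below follow
# Autodistill's usual naming convention and are an assumption rather than
# something stated in this file:
#
#   pip install autodistill autodistill-grounded-sam autodistill-gpt-4v gradio opencv-python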