"""Gradio demo: detect objects with Detic, then ask ChatGPT to caption the image.

Running this module installs gradio and detectron2, clones the Detic
repository, changes the working directory into it, builds a CPU predictor
and launches a Gradio Blocks UI.  The statement order below is significant:
the installs, the clone and the ``chdir`` must run before the imports that
depend on them.
"""
import logging
import os

from pyChatGPT import ChatGPT

# gradio is upgraded at start-up, so it can only be imported afterwards.
os.system("pip install -U gradio")

import sys

import gradio as gr

os.system(
    "pip install detectron2 -f https://dl.fbaipublicfiles.com/detectron2/wheels/cu102/torch1.9/index.html"
)

# clone and install Detic
os.system(
    "git clone https://github.com/facebookresearch/Detic.git --recurse-submodules"
)
# The relative config / metadata paths used below are resolved from inside
# the cloned repository.
os.chdir("Detic")

# Install detectron2
import torch

# Some basic setup:
# Setup detectron2 logger
import detectron2
from detectron2.utils.logger import setup_logger

setup_logger()

# import some common libraries
import json
import random

import cv2
import numpy as np

# import some common detectron2 utilities
from detectron2 import model_zoo
from detectron2.engine import DefaultPredictor
from detectron2.config import get_cfg
from detectron2.utils.visualizer import Visualizer
from detectron2.data import MetadataCatalog, DatasetCatalog

# Detic libraries
sys.path.insert(0, "third_party/CenterNet2/projects/CenterNet2/")
sys.path.insert(0, "third_party/CenterNet2/")
from centernet.config import add_centernet_config
from detic.config import add_detic_config
from detic.modeling.utils import reset_cls_test

from PIL import Image

logger = logging.getLogger(__name__)

# Build the detector and download our pretrained weights
cfg = get_cfg()
add_centernet_config(cfg)
add_detic_config(cfg)
cfg.MODEL.DEVICE = "cpu"
cfg.merge_from_file("configs/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.yaml")
cfg.MODEL.WEIGHTS = "https://dl.fbaipublicfiles.com/detic/Detic_LCOCOI21k_CLIP_SwinB_896b32_4x_ft4x_max-size.pth"
cfg.MODEL.ROI_HEADS.SCORE_THRESH_TEST = 0.5  # set threshold for this model
cfg.MODEL.ROI_BOX_HEAD.ZEROSHOT_WEIGHT_PATH = "rand"
# For better visualization purpose. Set to False for all classes.
cfg.MODEL.ROI_HEADS.ONE_CLASS_PER_PROPOSAL = True
predictor = DefaultPredictor(cfg)

# Setup the model's vocabulary using build-in datasets
BUILDIN_CLASSIFIER = {
    "lvis": "datasets/metadata/lvis_v1_clip_a+cname.npy",
    "objects365": "datasets/metadata/o365_clip_a+cnamefix.npy",
    "openimages": "datasets/metadata/oid_clip_a+cname.npy",
    "coco": "datasets/metadata/coco_clip_a+cname.npy",
}

BUILDIN_METADATA_PATH = {
    "lvis": "lvis_v1_val",
    "objects365": "objects365_v2_val",
    "openimages": "oid_val_expanded",
    "coco": "coco_2017_val",
}

vocabulary = "lvis"  # change to 'lvis', 'objects365', 'openimages', or 'coco'
metadata = MetadataCatalog.get(BUILDIN_METADATA_PATH[vocabulary])
classifier = BUILDIN_CLASSIFIER[vocabulary]
num_classes = len(metadata.thing_classes)
reset_cls_test(predictor.model, classifier, num_classes)

# Read from the environment; None when the "SessionToken" variable is unset.
session_token = os.environ.get("SessionToken")


def get_response_from_chatbot(text):
    """Send *text* to ChatGPT and return the reply message.

    A new ``ChatGPT`` client is created for every call using the module-level
    ``session_token``.  This is best-effort: any failure while talking to the
    service is logged and a fixed apology string is returned instead of
    raising.

    Args:
        text: Prompt to send.

    Returns:
        The ``"message"`` field of the reply, or the fallback string on error.
    """
    try:
        api = ChatGPT(session_token)
        resp = api.send_message(text)
        api.refresh_auth()
        api.reset_conversation()
        return resp["message"]
    except Exception:
        # ``Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt / SystemExit still propagate.
        logger.exception("ChatGPT request failed")
        return "Sorry, I'm busy. Try again later."


def _describe_detections(instances):
    """Return one ``"label - X:(.. Y: .. Width .. Height: ..)"`` string per box."""
    boxes = instances.pred_boxes.tensor.cpu().numpy()
    # Plain ints so they can index the ``thing_classes`` list directly.
    class_ids = instances.pred_classes.tolist()

    descriptions = []
    for (x0, y0, x1, y1), class_id in zip(boxes, class_ids):
        label = metadata.thing_classes[class_id]
        descriptions.append(
            f"{label} - X:({int(x0)} Y: {int(y0)} Width {int(x1 - x0)} Height: {int(y1 - y0)})"
        )
    return descriptions


def inference(img):
    """Run detection on an image file and ask ChatGPT to describe the result.

    Args:
        img: Path of the image to analyse (the input component uses
            ``type="filepath"``).  May be ``None`` when nothing was uploaded.

    Returns:
        A ``(visualization, description)`` tuple: a PIL RGB image with the
        predictions drawn on it, and the text returned by
        ``get_response_from_chatbot``.  When there is no usable input image
        the visualization is ``None`` and the text explains the problem.
    """
    if not img:
        return None, "Please provide an input image."

    im = cv2.imread(img)
    if im is None:
        # cv2.imread signals an unreadable file by returning None.
        return None, "The input image could not be read."

    outputs = predictor(im)
    instances = outputs["instances"].to("cpu")

    # cv2 loads BGR; reverse the channel axis before handing it to Visualizer.
    v = Visualizer(im[:, :, ::-1], metadata)
    out = v.draw_instance_predictions(instances)

    object_list_str = _describe_detections(instances)

    # The list is interpolated as-is, i.e. with its Python list formatting.
    prompt = (
        "You are an intelligent image captioner. I will hand you the objects and their position, "
        "and you should give me a detailed description for the photo. "
        f"In this photo we have the following objects\n{object_list_str}"
    )
    chat_gpt_response = get_response_from_chatbot(prompt)

    return (
        Image.fromarray(np.uint8(out.get_image())).convert("RGB"),
        chat_gpt_response,
    )


with gr.Blocks() as demo:
    gr.Markdown("# Detic+ChatGPT")
    gr.Markdown("Use Detic to detect objects in an image and then use ChatGPT to describe the image.")
    # NOTE(review): both HTML snippets below contain only plain text; any
    # link / badge markup they were meant to carry appears to be missing --
    # confirm against the deployed app.
    gr.HTML(
        "\n\nYou can duplicating this space and use your own session token: Duplicate Space\n\n"
    )
    gr.HTML(
        "\n\nInstruction on how to get session token can be seen in video here. "
        "Add your session token by going to settings and add under secrets.\n\n"
    )

    with gr.Column():
        inp = gr.Image(label="Input Image", type="filepath")
        btn_detic = gr.Button("Run Detic+ChatGPT")
    with gr.Column():
        outviz = gr.Image(label="Visualization", type="pil")
        output_desc = gr.Textbox(label="chatGPT Description", lines=5)

    btn_detic.click(fn=inference, inputs=inp, outputs=[outviz, output_desc])

demo.launch()