# Reference: https://huggingface.co/spaces/haotiz/glip-zeroshot-demo/blob/main/app.py
"""Gradio demo that runs the same image/caption through two DesCo models.

At import time this script builds the local package, loads a DesCo-GLIP and a
DesCo-FIBER predictor, and launches a Gradio interface whose callback is
``predict``.
"""
import requests
import os
from copy import deepcopy
from io import BytesIO
from PIL import Image
import numpy as np
from pathlib import Path
import gradio as gr

import warnings

warnings.filterwarnings("ignore")

# The build must run before the maskrcnn_benchmark imports below, which is why
# those imports are not grouped with the ones at the top of the file.
os.system("python setup.py build develop --user")

from maskrcnn_benchmark.config import cfg
from maskrcnn_benchmark.engine.predictor_glip import GLIPDemo

config_file = "configs/pretrain_new/desco_glip.yaml"
weight_file = "MODEL/desco_glip_tiny.pth"

# update the config options with the config file
# manual override some options
cfg.local_rank = 0
cfg.num_gpus = 1
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

glip_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False,
)

config_file = "configs/pretrain_new/desco_fiber.yaml"
weight_file = "MODEL/desco_fiber_base.pth"

# Work on a copy so the FIBER settings are merged on top of the GLIP-merged
# config without touching the object glip_demo was constructed with.
cfg = deepcopy(cfg)
cfg.merge_from_file(config_file)
cfg.merge_from_list(["MODEL.WEIGHT", weight_file])
cfg.merge_from_list(["MODEL.DEVICE", "cuda"])

fiber_demo = GLIPDemo(
    cfg,
    min_image_size=800,
    confidence_threshold=0.7,
    show_mask_heatmaps=False,
)

# Drawing options forwarded as keyword arguments to run_on_web_image().
# NOTE(review): the name looks like a typo for "aesthetics"; kept unchanged
# because it is a module-level name.
athetics_params = {
    "skip_name": False,  # whether we overlay the phrase over the box
    "override_color": (0, 90, 190),
    "text_size": 1.0,
    "text_pixel": 3,
    "box_alpha": 1.0,
    "box_pixel": 5,
    "text_offset_original": 8,  # distance between text and box
}


def resize_image_by_height(image, new_height=500):
    """Resize ``image`` to ``new_height`` rows, keeping its aspect ratio.

    Args:
        image: Array whose first two dimensions are (height, width). Both
            2-D and 3-D arrays are accepted.
        new_height: Target height in pixels.

    Returns:
        The array returned by ``cv2.resize``.
    """
    import cv2

    # Only the first two dimensions are needed; slicing instead of a
    # three-way unpack keeps 2-D (single-channel) inputs working.
    height, width = image.shape[:2]
    aspect_ratio = width / height
    # Clamp to 1 so extreme aspect ratios never request a 0-pixel dimension.
    new_width = max(1, int(new_height * aspect_ratio))
    resized_image = cv2.resize(image, (new_width, new_height))
    return resized_image


def resize_image_by_width(image, new_width=500):
    """Resize ``image`` to ``new_width`` columns, keeping its aspect ratio.

    Args:
        image: Array whose first two dimensions are (height, width). Both
            2-D and 3-D arrays are accepted.
        new_width: Target width in pixels.

    Returns:
        The array returned by ``cv2.resize``.
    """
    import cv2

    # Only the first two dimensions are needed; slicing instead of a
    # three-way unpack keeps 2-D (single-channel) inputs working.
    height, width = image.shape[:2]
    aspect_ratio = height / width
    # Clamp to 1 so extreme aspect ratios never request a 0-pixel dimension.
    new_height = max(1, int(new_width * aspect_ratio))
    resized_image = cv2.resize(image, (new_width, new_height))
    return resized_image


def _parse_ground_tokens(raw):
    """Split a ';'-separated string into a list of tokens, or return None."""
    if raw is None:
        return None
    tokens = [piece.strip() for piece in raw.split(";")]
    tokens = [piece for piece in tokens if piece]
    # An empty result is reported as None, matching what a blank input
    # produced before.
    return tokens or None


def _reverse_channels(image):
    """Return an independent copy of ``image`` with channels 0 and 2 swapped."""
    return deepcopy(image[:, :, [2, 1, 0]])


def predict(image, text, ground_tokens=""):
    """Run both DesCo models on one image/caption pair.

    Args:
        image: Array with at least three dimensions and at least three
            entries on the last axis (the channel order is reversed before
            inference). Presumably an RGB image supplied by Gradio, with the
            predictors expecting BGR -- TODO confirm.
        text: Caption passed unchanged to both predictors.
        ground_tokens: ';'-separated phrases. Blank or None means no phrases
            are passed (None is forwarded to the predictors).

    Returns:
        A ``(glip_image, fiber_image)`` tuple; each predictor result has its
        channel order reversed again before being returned.

    Raises:
        ValueError: If ``image`` is None.
    """
    if image is None:
        raise ValueError("An input image is required.")

    # Optional downscale, currently disabled:
    # image = resize_image_by_width(image)
    print(image.shape)

    tokens = _parse_ground_tokens(ground_tokens)

    # Each model receives its own copy of the channel-swapped image.
    result, _ = glip_demo.run_on_web_image(
        _reverse_channels(image), text, 0.5, tokens, **athetics_params
    )
    fiber_result, _ = fiber_demo.run_on_web_image(
        _reverse_channels(image), text, 0.5, tokens, **athetics_params
    )
    return result[:, :, [2, 1, 0]], fiber_result[:, :, [2, 1, 0]]


# NOTE(review): this component is not passed to the Interface below (which
# uses the "image" shortcut string); kept so the module-level name survives.
image = gr.inputs.Image()

gr.Interface(
    description="Object Recognition with DesCo (https://github.com/liunian-harold-li/DesCo)",
    fn=predict,
    inputs=["image", "text", "text"],
    outputs=[
        gr.outputs.Image(
            type="pil",
            label="DesCo-GLIP",
        ),
        gr.outputs.Image(
            type="pil",
            label="DesCo-FIBER",
        ),
    ],
    examples=[
        ["./1.jpg", "A clown making a balloon animal for a pretty lady.", "clown"],
        ["./1.jpg", "A clown kicking a soccer ball for a pretty lady.", "clown"],
        ["./2.jpg", "A kind of tool, wooden handle with a round head.", "tool"],
        ["./3.jpg", "Bumblebee, yellow with black accents.", "Bumblebee"],
    ],
    # Explicit encoding so the Markdown loads the same way regardless of the
    # host locale.
    article=Path("docs/intro.md").read_text(encoding="utf-8"),
).launch()