import gc
import multiprocessing as mp
import os
import shutil
import sys
import time
from os import path

import cv2
import torch
from huggingface_hub import hf_hub_download
from PIL import Image

import ape
import detectron2.data.transforms as T
import gradio as gr
from ape.model_zoo import get_config_file
from demo_lazy import get_parser, setup_cfg
from detectron2.config import CfgNode
from detectron2.data.detection_utils import read_image
from detectron2.evaluation.coco_evaluation import instances_to_coco_json
from detectron2.utils.logger import setup_logger
from predictor_lazy import VisualizationDemo

this_dir = path.dirname(path.abspath(__file__))

# os.system("git clone https://github.com/shenyunhang/APE.git")
# os.system("python3.10 -m pip install -e APE/")
example_list = [
    [
        this_dir + "/examples/Totoro01.png",
        # "Sky, Water, Tree, The biggest Chinchilla, The older girl wearing skirt on branch, Grass",
        "Girl with hat",
        # 0.05,
        0.25,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Totoro01.png",
        "Sky, Water, Tree, Chinchilla, Grass, Girl",
        0.15,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/199_3946193540.jpg",
        "chess piece of horse head",
        0.30,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/TheGreatWall.jpg",
        "The Great Wall",
        0.1,
        ["semantic segmentation"],
    ],
    [
        this_dir + "/examples/Pisa.jpg",
        "Pisa",
        0.01,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/SolvayConference1927.jpg",
        # "Albert Einstein, Madame Curie",
        "Madame Curie",
        # 0.01,
        0.03,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Transformers.webp",
        "Optimus Prime",
        0.11,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/Terminator3.jpg",
        "Humanoid Robot",
        0.10,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/MatrixRevolutionForZion.jpg",
        """machine killer with gun in fighting,
donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.20,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/094_56726435.jpg",
        # "donut with colored granules on the surface",
        """donut with colored granules on the surface,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        0.50,
        ["object detection", "instance segmentation"],
    ],
    [
        this_dir + "/examples/013_438973263.jpg",
        # "a male lion with a mane",
        """a male lion with a mane,
railings being crossed by horses,
a horse running or jumping,
equestrian rider's helmet,
outdoor dog led by rope,
a dog being touched,
clothed dog,
basketball in hand,
a basketball player with both feet off the ground,
player with basketball in the hand,
spoon on the plate,
coffee cup with coffee,
the nearest dessert to the coffee cup,
the bartender who is mixing wine,
a bartender in a suit,
wine glass with wine,
a person in aprons,
pot with food,
a knife being used to cut vegetables,
striped sofa in the room,
a sofa with pillows on it in the room,
lights on in the room,
an indoor lying pet,
a cat on the sofa,
one pet looking directly at the camera indoors,
a bed with patterns in the room,
the lamp on the table beside the bed,
pillow placed at the head of the bed,
a blackboard full of words in the classroom,
a blackboard or whiteboard with something pasted,
child sitting at desks in the classroom,
a person standing in front of bookshelves in the library,
the table someone is using in the library,
a person who touches books in the library,
a person standing in front of the cake counter,
a square plate full of cakes,
a cake decorated with cream,
hot dog with vegetables,
hot dog with sauce on the surface,
red sausage,
flowerpot with flowers potted inside,
monochrome flowerpot,
a flowerpot filled with black soil,
apple growing on trees,
red complete apple,
apple with a stalk,
a woman brushing her teeth,
toothbrush held by someone,
toilet brush with colored bristles,
a customer whose hair is being cut by barber,
a barber at work,
cloth covering the barber,
a plastic toy,
a plush toy,
a humanoid toy,
shopping cart pushed by people in the supermarket,
shopping cart with people in the supermarket,
shopping cart full of goods,
a child wearing a mask,
a mask on face with half a face exposed,
a mask on face with only eyes exposed,
refrigerator with fruit,
a drink bottle in the refrigerator,
refrigerator with more than two doors,
a watch placed on a table or cloth,
a watch with three or more watch hands can be seen,
a watch with one or more small dials,
clothes hanger,
a piece of clothing hanging on the hanger,
a piece of clothing worn on plastic models,
leather bag with glossy surface,
backpack,
open package,
a fish held by people,
a person who is fishing with a fishing rod,
a fisherman standing on the shore with his body soaked in water, camera hold on someone's shoulder,
a person being interviewed,
a person with microphone hold in hand,
""",
        # 0.25,
        0.50,
        ["object detection", "instance segmentation"],
    ],
]

ckpt_repo_id = "shenyunhang/APE"
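
# When save_memory is set, keep only the requested model on the inference
# device and park every other loaded model on CPU; otherwise all models
# already live on the right device and nothing needs to move.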
def setup_model(name):
    gc.collect()
    torch.cuda.empty_cache()
    if not save_memory:
        return
    for key, demo in all_demo.items():
        if key == name:
            demo.predictor.model.to(running_device)
        else:
            demo.predictor.model.to("cpu")
    gc.collect()
    torch.cuda.empty_cache()
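
# Thin wrappers that select a specific loaded model, apply the UI score
# threshold, and delegate to run_on_image(). Example call, using one of the
# bundled demo assets:
#   img, json_results = run_on_image_D(
#       this_dir + "/examples/Totoro01.png",
#       "Girl with hat",
#       0.25,
#       ["object detection", "instance segmentation"],
#   )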
def run_on_image_A(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_A")
    setup_model("APE_A")
    demo = all_demo["APE_A"]
    cfg = all_cfg["APE_A"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_C(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_C")
    setup_model("APE_C")
    demo = all_demo["APE_C"]
    cfg = all_cfg["APE_C"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_D(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_D")
    setup_model("APE_D")
    demo = all_demo["APE_D"]
    cfg = all_cfg["APE_D"]
    demo.predictor.model.model_vision.test_score_thresh = score_threshold
    return run_on_image(
        input_image_path,
        input_text,
        output_type,
        demo,
        cfg,
    )


def run_on_image_comparison(input_image_path, input_text, score_threshold, output_type):
    logger.info("run_on_image_comparison")
    r = []
    for key in all_demo.keys():
        logger.info("run_on_image_comparison {}".format(key))
        setup_model(key)
        demo = all_demo[key]
        cfg = all_cfg[key]
        demo.predictor.model.model_vision.test_score_thresh = score_threshold
        img, _ = run_on_image(
            input_image_path,
            input_text,
            output_type,
            demo,
            cfg,
        )
        r.append(img)
    return r
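
# Core inference path: read the image (and optional mask prompt), resize
# large inputs to the 1024px shortest-edge augmentation, run the model, and
# return the visualization plus COCO-style JSON results.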
def run_on_image(
    input_image_path,
    input_text,
    output_type,
    demo,
    cfg,
):
    with_box = False
    with_mask = False
    with_sseg = False
    if "object detection" in output_type:
        with_box = True
    if "instance segmentation" in output_type:
        with_mask = True
    if "semantic segmentation" in output_type:
        with_sseg = True

    if isinstance(input_image_path, dict):
        input_mask_path = input_image_path["mask"]
        input_image_path = input_image_path["image"]
        print("input_image_path", input_image_path)
        print("input_mask_path", input_mask_path)
    else:
        input_mask_path = None
    print("input_text", input_text)

    if isinstance(cfg, CfgNode):
        input_format = cfg.INPUT.FORMAT
    else:
        if "model_vision" in cfg.model:
            input_format = cfg.model.model_vision.input_format
        else:
            input_format = cfg.model.input_format

    input_image = read_image(input_image_path, format="BGR")
    # img = cv2.imread(input_image_path)
    # cv2.imwrite("tmp.jpg", img)
    # # input_image = read_image("tmp.jpg", format=input_format)
    # input_image = read_image("tmp.jpg", format="BGR")

    if input_mask_path is not None:
        input_mask = read_image(input_mask_path, "L").squeeze(2)
        print("input_mask", input_mask)
        print("input_mask", input_mask.shape)
    else:
        input_mask = None

    if not with_box and not with_mask and not with_sseg:
        # No output requested: return the unmodified image (BGR -> RGB) and an
        # empty result list so callers can always unpack two values.
        return input_image[:, :, ::-1], []

    if input_image.shape[0] > 1024 or input_image.shape[1] > 1024:
        transform = aug.get_transform(input_image)
        input_image = transform.apply_image(input_image)
    else:
        transform = None

    start_time = time.time()
    predictions, visualized_output, _, metadata = demo.run_on_image(
        input_image,
        text_prompt=input_text,
        mask_prompt=input_mask,
        with_box=with_box,
        with_mask=with_mask,
        with_sseg=with_sseg,
    )
    logger.info(
        "{} in {:.2f}s".format(
            "detected {} instances".format(len(predictions["instances"]))
            if "instances" in predictions
            else "finished",
            time.time() - start_time,
        )
    )

    output_image = visualized_output.get_image()
    print("output_image", output_image.shape)
    # if input_format == "RGB":
    #     output_image = output_image[:, :, ::-1]
    if transform:
        output_image = transform.inverse().apply_image(output_image)
        print("output_image", output_image.shape)
    output_image = Image.fromarray(output_image)

    gc.collect()
    torch.cuda.empty_cache()

    if "instances" in predictions:
        json_results = instances_to_coco_json(predictions["instances"].to(demo.cpu_device), 0)
        for json_result in json_results:
            json_result["category_name"] = metadata.thing_classes[json_result["category_id"]]
            del json_result["image_id"]
    else:
        # Semantic-segmentation-only runs may not produce instances.
        json_results = []
    return output_image, json_results
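
# Model loaders. Each downloads a checkpoint from the Hugging Face Hub,
# builds the config via setup_cfg, disables federated loss for inference,
# trims the CLIP vision tower config to a single layer (presumably to save
# memory, since only the text encoder is needed here), and registers the
# resulting VisualizationDemo in all_demo / all_cfg.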
def load_APE_A():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj_cp_720k_20230504_002019/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VG/ape_deta/ape_deta_vitl_eva02_lsj1024_cp_720k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_A"] = demo
    all_cfg["APE_A"] = cfg


def load_APE_B():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_225418/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_B"] = demo
    all_cfg["APE_B"] = cfg


def load_APE_C():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj_cp_1080k_20230702_210950/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO/ape_deta/ape_deta_vitl_eva02_vlf_lsj1024_cp_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1
    ape.modeling.text.eva01_clip.eva_clip._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["fusedLN"] = False

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_C"] = demo
    all_cfg["APE_C"] = cfg


def load_APE_D():
    # init_checkpoint = "output2/APE/configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
    init_checkpoint = "configs/LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k_mdl_20230829_162438/model_final.pth"
    init_checkpoint = hf_hub_download(repo_id=ckpt_repo_id, filename=init_checkpoint)

    args = get_parser().parse_args()
    args.config_file = get_config_file(
        "LVISCOCOCOCOSTUFF_O365_OID_VGR_SA1B_REFCOCO_GQA_PhraseCut_Flickr30k/ape_deta/ape_deta_vitl_eva02_clip_vlf_lsj1024_cp_16x4_1080k.py"
    )
    args.confidence_threshold = 0.01
    args.opts = [
        "train.init_checkpoint='{}'".format(init_checkpoint),
        "model.model_language.cache_dir=''",
        "model.model_vision.select_box_nums_for_evaluation=500",
        "model.model_vision.text_feature_bank_reset=True",
        "model.model_vision.backbone.net.xattn=False",
        "model.model_vision.transformer.encoder.pytorch_attn=True",
        "model.model_vision.transformer.decoder.pytorch_attn=True",
    ]
    if running_device == "cpu":
        args.opts += [
            "model.model_language.dtype='float32'",
        ]
    logger.info("Arguments: " + str(args))
    cfg = setup_cfg(args)

    cfg.model.model_vision.criterion[0].use_fed_loss = False
    cfg.model.model_vision.criterion[2].use_fed_loss = False
    cfg.train.device = running_device

    ape.modeling.text.eva02_clip.factory._MODEL_CONFIGS[cfg.model.model_language.clip_model][
        "vision_cfg"
    ]["layers"] = 1

    demo = VisualizationDemo(cfg, args=args)
    if save_memory:
        demo.predictor.model.to("cpu")
        # demo.predictor.model.half()
    else:
        demo.predictor.model.to(running_device)

    all_demo["APE_D"] = demo
    all_cfg["APE_D"] = cfg

def APE_A_tab():
    with gr.Tab("APE A"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if empty, only COCO objects are detected)",
                    info="Format: word1,word2,word3,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Select which outputs to display.",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
                example_data = gr.Dataset(
                    components=[input_image, input_text, score_threshold],
                    samples=example_list,
                    samples_per_page=5,
                )
                example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
                # add_tail_info()
                output_json = gr.JSON(label="json results")

    run_button.click(
        fn=run_on_image_A,
        inputs=[input_image, input_text, score_threshold, output_type],
        outputs=[gallery, output_json],
    )

def APE_C_tab():
    with gr.Tab("APE C"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if empty, only COCO objects are detected)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.3, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Select which outputs to display.",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
                example_data = gr.Dataset(
                    components=[input_image, input_text, score_threshold],
                    samples=example_list,
                    samples_per_page=5,
                )
                example_data.click(fn=set_example, inputs=example_data, outputs=example_data.components)
                # add_tail_info()
                output_json = gr.JSON(label="json results")

    run_button.click(
        fn=run_on_image_C,
        inputs=[input_image, input_text, score_threshold, output_type],
        outputs=[gallery, output_json],
    )

def APE_D_tab():
    with gr.Tab("APE D"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if empty, only COCO objects are detected)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Select which outputs to display.",
                )
                run_button = gr.Button("Run")
            with gr.Column(scale=2):
                gallery = gr.Image(
                    type="pil",
                )
                gr.Examples(
                    examples=example_list,
                    inputs=[input_image, input_text, score_threshold, output_type],
                    examples_per_page=20,
                )
                # add_tail_info()
                output_json = gr.JSON(label="json results")

    run_button.click(
        fn=run_on_image_D,
        inputs=[input_image, input_text, score_threshold, output_type],
        outputs=[gallery, output_json],
    )
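
# "APE all" tab: runs every model registered in all_demo on the same input
# and shows one output image per model for side-by-side comparison.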
def comparison_tab():
    with gr.Tab("APE all"):
        with gr.Row(equal_height=False):
            with gr.Column(scale=1):
                input_image = gr.Image(
                    sources=["upload"],
                    type="filepath",
                    # tool="sketch",
                    # brush_radius=50,
                )
                input_text = gr.Textbox(
                    label="Object Prompt (optional; if empty, only COCO objects are detected)",
                    info="Format: word1,word2,sentence1,sentence2,...",
                )
                score_threshold = gr.Slider(
                    label="Score Threshold", minimum=0.01, maximum=1.0, value=0.1, step=0.01
                )
                output_type = gr.CheckboxGroup(
                    ["object detection", "instance segmentation", "semantic segmentation"],
                    value=["object detection", "instance segmentation"],
                    label="Output Type",
                    info="Select which outputs to display.",
                )
                run_button = gr.Button("Run")

            gallery_all = []
            with gr.Column(scale=2):
                for key in all_demo.keys():
                    gallery = gr.Image(
                        label=key,
                        type="pil",
                    )
                    gallery_all.append(gallery)
                gr.Examples(
                    examples=example_list,
                    inputs=[input_image, input_text, score_threshold, output_type],
                    examples_per_page=20,
                )
                # add_tail_info()

    run_button.click(
        fn=run_on_image_comparison,
        inputs=[input_image, input_text, score_threshold, output_type],
        outputs=gallery_all,
    )
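
# True if something is already listening on localhost:port; used below to
# pick a free port for the server.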
def is_port_in_use(port: int) -> bool:
    import socket

    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        return s.connect_ex(("localhost", port)) == 0
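
# Static Markdown blocks for the page header and the (currently unused)
# location-prompt example footer.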
def add_head_info(max_available_memory):
    gr.Markdown(
        "# APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"
    )
    if max_available_memory:
        gr.Markdown(
            "Note that multiple models are deployed on a single GPU, so it may take several minutes to run the models and visualize the results."
        )
    else:
        gr.Markdown(
            "Note that multiple models are deployed on CPU, so it may take a while to run the models and visualize the results."
        )
        gr.Markdown(
            "Note that results computed on CPU differ slightly from results computed on GPU, and some libraries are disabled on CPU."
        )
    gr.Markdown(
        "If the demo runs out of memory, try to ***decrease*** the number of object prompts and ***increase*** the score threshold."
    )
    gr.Markdown("---")

def add_tail_info():
    gr.Markdown("---")
    gr.Markdown("### We also support location prompts")
    gr.Markdown(
        """
| Location prompt | result | Location prompt | result |
| ---- | ---- | ---- | ---- |
| ![Location prompt](/file=examples/prompt/20230627-131346_11.176.20.67_mask.PNG) | ![result](/file=examples/prompt/20230627-131346_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-131530_11.176.20.67_mask.PNG) | ![result](/file=examples/prompt/20230627-131530_11.176.20.67_pred.png) |
| ![Location prompt](/file=examples/prompt/20230627-131520_11.176.20.67_mask.PNG) | ![result](/file=examples/prompt/20230627-131520_11.176.20.67_pred.png) | ![Location prompt](/file=examples/prompt/20230627-114219_11.176.20.67_mask.PNG) | ![result](/file=examples/prompt/20230627-114219_11.176.20.67_pred.png) |
"""
    )
    gr.Markdown("---")
if __name__ == "__main__":
    available_port = [80, 8080]
    server_port = None
    for port in available_port:
        if not is_port_in_use(port):
            server_port = port
            break
    print("server_port", server_port)

    available_memory = [
        torch.cuda.mem_get_info(i)[0] / 1024**3 for i in range(torch.cuda.device_count())
    ]
    if len(available_memory) > 0:
        max_available_memory = max(available_memory)
        device_id = available_memory.index(max_available_memory)
        running_device = "cuda:" + str(device_id)
    else:
        max_available_memory = 0
        running_device = "cpu"

    save_memory = False
    if max_available_memory > 0 and max_available_memory < 40:
        save_memory = True

    print("available_memory", available_memory)
    print("max_available_memory", max_available_memory)
    print("running_device", running_device)
    print("save_memory", save_memory)

    # ==========================================================================================
    mp.set_start_method("spawn", force=True)

    setup_logger(name="fvcore")
    setup_logger(name="ape")
    logger = setup_logger()

    aug = T.ResizeShortestEdge([1024, 1024], 1024)

    all_demo = {}
    all_cfg = {}

    # load_APE_A()
    # load_APE_B()
    # load_APE_C()
    # NOTE: save_memory is forced off here, presumably because only APE_D is
    # loaded below, so swapping models between CPU and GPU is unnecessary.
    save_memory = False
    load_APE_D()

    title = "APE: Aligning and Prompting Everything All at Once for Universal Visual Perception"

    block = gr.Blocks(title=title).queue()
    with block:
        add_head_info(max_available_memory)
        # APE_A_tab()
        # APE_C_tab()
        APE_D_tab()
        comparison_tab()
        # add_tail_info()

    block.launch(
        share=False,
        # server_name="0.0.0.0",
        # server_port=server_port,
        show_api=False,
        show_error=True,
    )