# Hugging Face Spaces page header captured with this file (status: "Build error"); not part of the program.
# -------------------------------------------------------- | |
# X-Decoder -- Generalized Decoding for Pixel, Image, and Language | |
# Copyright (c) 2022 Microsoft | |
# Licensed under The MIT License [see LICENSE for details] | |
# Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu) | |
# -------------------------------------------------------- | |
import os | |
import subprocess
import sys

# The detectron2 fork is installed from GitHub at startup. Run pip through
# the interpreter that is executing this script so the package lands in the
# environment that will import it, instead of whichever "python" happens to
# be first on PATH. An argument list (no shell string) avoids shell parsing.
_detectron2_install = subprocess.run(
    [
        sys.executable or "python",
        "-m",
        "pip",
        "install",
        "git+https://github.com/MaureenZOU/detectron2-xyz.git",
    ],
    check=False,
)
if _detectron2_install.returncode != 0:
    # Keep going, as the original did: the package may already be installed.
    # Surface the failure so a later ImportError is easier to diagnose.
    print(
        "warning: pip install of detectron2-xyz exited with status "
        f"{_detectron2_install.returncode}",
        file=sys.stderr,
    )
import gradio as gr | |
import torch | |
import argparse | |
from xdecoder.BaseModel import BaseModel | |
from xdecoder import build_model | |
from utils.distributed import init_distributed | |
from utils.arguments import load_opt_from_config_files | |
from tasks import * | |
def parse_option(argv=None):
    """Parse the demo's command-line options.

    Args:
        argv: Optional list of argument strings to parse. When ``None``
            (the default) argparse reads ``sys.argv[1:]``, which is the
            behaviour existing callers rely on.

    Returns:
        argparse.Namespace: carries ``conf_files``, the path to the config
        file (defaults to ``configs/xdecoder/svlp_focalt_lang.yaml``).
    """
    parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
    parser.add_argument(
        '--conf_files',
        default="configs/xdecoder/svlp_focalt_lang.yaml",
        metavar="FILE",
        help='path to config file',
    )
    return parser.parse_args(argv)
''' | |
build args | |
''' | |
# Parse the command line, load the config file it names, and pass the
# resulting options through init_distributed before anything else uses them.
args = parse_option()
opt = init_distributed(load_opt_from_config_files(args.conf_files))
# META DATA | |
import shutil
import urllib.request

# Base URL under which the released X-Decoder checkpoints are published.
_CHECKPOINT_BASE_URL = "https://projects4jw.blob.core.windows.net/x-decoder/release/"


def _ensure_checkpoint(filename):
    """Return ``filename``, downloading it into the working directory if absent.

    The download is written to ``<filename>.part`` and renamed only once it
    has completed, so an interrupted transfer can never be mistaken for a
    finished checkpoint by the existence check on the next start.

    Args:
        filename: Checkpoint file name, relative to the working directory and
            to ``_CHECKPOINT_BASE_URL``.

    Returns:
        The same ``filename``, now guaranteed to exist.

    Raises:
        urllib.error.URLError / OSError: if the download or the write fails.
    """
    if not os.path.exists(filename):
        partial = filename + ".part"
        url = _CHECKPOINT_BASE_URL + filename
        # The timeout bounds each blocking socket operation, not the whole transfer.
        with urllib.request.urlopen(url, timeout=60) as response, open(partial, "wb") as out:
            shutil.copyfileobj(response, out)
        os.replace(partial, filename)
    return filename


pretrained_pth_last = _ensure_checkpoint("xdecoder_focalt_last.pt")
pretrained_pth_novg = _ensure_checkpoint("xdecoder_focalt_last_novg.pt")
''' | |
build model | |
''' | |
def _load_eval_model(checkpoint_path):
    """Build a model from ``opt``, load ``checkpoint_path``, set eval mode, move to CUDA."""
    wrapper = BaseModel(opt, build_model(opt))
    return wrapper.from_pretrained(checkpoint_path).eval().cuda()


model_last = _load_eval_model(pretrained_pth_last)
model_cap = _load_eval_model(pretrained_pth_novg)

# Call get_text_embeddings once on each model's language encoder with a
# placeholder vocabulary, without tracking gradients.
# NOTE(review): presumably this primes the encoder's default text embeddings
# before the first request - confirm against the lang_encoder implementation.
with torch.no_grad():
    for _loaded_model in (model_last, model_cap):
        _loaded_model.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(
            ["background", "background"], is_eval=True
        )
''' | |
inference model | |
''' | |
def inference(image, instruction, *args, **kwargs):
    """Apply a text instruction to an image and return the edited result.

    Args:
        image: Input image; must provide ``convert`` (the UI is configured to
            hand over a PIL image).
        instruction: Instruction text entered by the user.
        *args: Extra positional arguments forwarded unchanged to
            ``referring_inpainting_gpt3``.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``referring_inpainting_gpt3``.

    Returns:
        Whatever ``referring_inpainting_gpt3`` returns for ``model_last``.
    """
    image = image.convert("RGB")
    # Inference only: turn autograd off so no graph is recorded during the
    # forward pass, and run under float16 autocast on CUDA.
    with torch.no_grad(), torch.autocast(device_type='cuda', dtype=torch.float16):
        return referring_inpainting_gpt3(model_last, image, instruction, *args, **kwargs)
''' | |
launch app | |
''' | |
title = "Instructional Image Editing"
description = "<p style='text-align: center'> <a href='https://x-decoder-vl.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2212.11270.pdf' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='https://youtu.be/wYp6vmyolqE' target='_blank'>Video</a> </p>"
help_text = """
This demo is leveraging X-Decoder's fine-grained understanding for instruct-based image editing. You can use it to:
1. Remove object, e.g., remove the dog in the image
2. Change object, e.g., change the sky with a mountain
"""

# Use the top-level component classes (as already done for Textbox) rather
# than the deprecated gr.inputs / gr.outputs namespaces.
inputs = [gr.Image(type='pil'), gr.Textbox(label="instruction")]

demo = gr.Interface(
    fn=inference,
    inputs=inputs,
    outputs=[
        gr.Image(type="pil", label="edit result"),
    ],
    examples=[
        ["./images/apples.jpg", "change green apple to a red apple"],
        ["./images/girl_and_two_boys.png", "remove the boy with blue backbag"],
        ["./images/dog.png", "remove the dog"],
        ["./images/horse.png", "change the sky to mountain"],
    ],
    title=title,
    description=description,
    # A gr.Markdown created outside a Blocks context is never rendered, so the
    # help text is passed as the Interface's article to make it appear on the page.
    article=help_text,
    allow_flagging='never',
    cache_examples=True,
)
demo.launch()