# -------------------------------------------------------- # X-Decoder -- Generalized Decoding for Pixel, Image, and Language # Copyright (c) 2022 Microsoft # Licensed under The MIT License [see LICENSE for details] # Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu) # -------------------------------------------------------- import os os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git") import gradio as gr import torch import argparse from xdecoder.BaseModel import BaseModel from xdecoder import build_model from utils.distributed import init_distributed from utils.arguments import load_opt_from_config_files from tasks import * def parse_option(): parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False) parser.add_argument('--conf_files', default="configs/xdecoder/svlp_focalt_lang.yaml", metavar="FILE", help='path to config file', ) args = parser.parse_args() return args ''' build args ''' args = parse_option() opt = load_opt_from_config_files(args.conf_files) opt = init_distributed(opt) # META DATA pretrained_pth_last = os.path.join("xdecoder_focalt_last.pt") pretrained_pth_novg = os.path.join("xdecoder_focalt_last_novg.pt") if not os.path.exists(pretrained_pth_last): os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last.pt")) if not os.path.exists(pretrained_pth_novg): os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last_novg.pt")) ''' build model ''' model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda() model_cap = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_novg).eval().cuda() with torch.no_grad(): model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True) model_cap.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True) ''' inference model ''' @torch.no_grad() def inference(image, instruction, *args, **kwargs): image = image.convert("RGB") with torch.autocast(device_type='cuda', dtype=torch.float16): return referring_inpainting_gpt3(model_last, image, instruction, *args, **kwargs) ''' launch app ''' title = "Instructional Image Editing" description = "
Project Page | Paper | Github Repo | Video
" help_text = """ This demo is leveraging X-Decoder's fine-grained understanding for instruct-based image editing. You can use it to: 1. Remove object, e.g., remove the dog in the image 2. Change object, e.g., change the sky with a mountain """ gr.Markdown(help_text) inputs = [gr.inputs.Image(type='pil'), gr.Textbox(label="instruction")] gr.Interface( fn=inference, inputs=inputs, outputs=[ gr.outputs.Image( type="pil", label="edit result"), ], examples=[ ["./images/apples.jpg", "change green apple to a red apple"], ["./images/girl_and_two_boys.png", "remove the boy with blue backbag"], ["./images/dog.png", "remove the dog"], ["./images/horse.png", "change horse to zebra"], ], title=title, description=description, allow_flagging='never', cache_examples=True, ).launch()