MaureenZOU committed
Commit fcc479d
Parent: d80ff28
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full change set.
.gitattributes CHANGED
@@ -32,3 +32,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+*.psd filter=lfs diff=lfs merge=lfs -text
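The new rule routes Photoshop (*.psd) files through Git LFS, alongside the existing archive and TensorBoard-event patterns. For orientation only, gitattributes patterns are glob-style; the following minimal Python sketch (not part of this commit) approximates which rule a filename would hit using fnmatch. Note that git's attribute matching has extra path-separator semantics that fnmatch ignores.

# Minimal sketch: approximate .gitattributes LFS pattern matching with fnmatch.
# The pattern list mirrors the rules above; git's real matching also treats '/'
# specially, which fnmatch does not.
from fnmatch import fnmatch

LFS_PATTERNS = ["*.zip", "*.zst", "*tfevents*", "*.psd"]

def matching_patterns(filename: str) -> list[str]:
    """Return the LFS patterns that would route this file through LFS."""
    return [p for p in LFS_PATTERNS if fnmatch(filename, p)]

print(matching_patterns("mockup.psd"))               # ['*.psd']
print(matching_patterns("events.out.tfevents.1234")) # ['*tfevents*']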
README.md CHANGED
@@ -1,10 +1,10 @@
 ---
-title: Image Editing With GPT3
-emoji: 🐨
+title: X Decoder
+emoji: 📈
 colorFrom: purple
-colorTo: blue
+colorTo: gray
 sdk: gradio
-sdk_version: 3.16.1
+sdk_version: 3.14.0
 app_file: app.py
 pinned: false
 license: afl-3.0
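This header is the Hugging Face Spaces configuration block: YAML front matter that renames the Space from "Image Editing With GPT3" to "X Decoder", swaps the emoji and theme color, and pins the Gradio SDK to 3.14.0. A minimal sketch of reading that block follows; parse_front_matter is a hypothetical helper, not part of the repo.

# Minimal sketch: read the YAML front matter from README.md.
# parse_front_matter is a hypothetical helper, shown for illustration only.
import yaml

def parse_front_matter(path: str = "README.md") -> dict:
    """Return the YAML between the first two '---' markers as a dict."""
    text = open(path, encoding="utf-8").read()
    _, block, _ = text.split("---", 2)  # front matter sits between the markers
    return yaml.safe_load(block)

meta = parse_front_matter()
print(meta["title"], meta["sdk"], meta["sdk_version"])  # X Decoder gradio 3.14.0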
__init__.py ADDED
Empty file.
app.py ADDED
@@ -0,0 +1,98 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Jianwei Yang (jianwyan@microsoft.com), Xueyan Zou (xueyan@cs.wisc.edu)
+# --------------------------------------------------------
+
+import os
+os.system("python -m pip install git+https://github.com/MaureenZOU/detectron2-xyz.git")
+
+import gradio as gr
+import torch
+import argparse
+
+from xdecoder.BaseModel import BaseModel
+from xdecoder import build_model
+from utils.distributed import init_distributed
+from utils.arguments import load_opt_from_config_files
+
+from tasks import *
+
+def parse_option():
+    parser = argparse.ArgumentParser('X-Decoder All-in-One Demo', add_help=False)
+    parser.add_argument('--conf_files', default="configs/xdecoder/svlp_focalt_lang.yaml", metavar="FILE", help='path to config file', )
+    args = parser.parse_args()
+
+    return args
+
+'''
+build args
+'''
+args = parse_option()
+opt = load_opt_from_config_files(args.conf_files)
+opt = init_distributed(opt)
+
+# META DATA
+pretrained_pth_last = os.path.join("xdecoder_focalt_last.pt")
+pretrained_pth_novg = os.path.join("xdecoder_focalt_last_novg.pt")
+
+if not os.path.exists(pretrained_pth_last):
+    os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last.pt"))
+
+if not os.path.exists(pretrained_pth_novg):
+    os.system("wget {}".format("https://projects4jw.blob.core.windows.net/x-decoder/release/xdecoder_focalt_last_novg.pt"))
+
+
+'''
+build model
+'''
+model_last = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_last).eval().cuda()
+model_cap = BaseModel(opt, build_model(opt)).from_pretrained(pretrained_pth_novg).eval().cuda()
+
+with torch.no_grad():
+    model_last.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
+    model_cap.model.sem_seg_head.predictor.lang_encoder.get_text_embeddings(["background", "background"], is_eval=True)
+
+'''
+inference model
+'''
+
+@torch.no_grad()
+def inference(image, instruction, *args, **kwargs):
+    image = image.convert("RGB")
+    with torch.autocast(device_type='cuda', dtype=torch.float16):
+        return referring_inpainting_gpt3(model_last, image, instruction, *args, **kwargs)
+
+'''
+launch app
+'''
+title = "X-Decoder + GPT-3 Instructional Image Editing"
+description = "<p style='text-align: center'> <a href='https://x-decoder-vl.github.io/' target='_blank'>Project Page</a> | <a href='https://arxiv.org/pdf/2212.11270.pdf' target='_blank'>Paper</a> | <a href='https://github.com/microsoft/X-Decoder' target='_blank'>Github Repo</a> | <a href='https://youtu.be/wYp6vmyolqE' target='_blank'>Video</a> </p>"
+
+article = "This demo runs on X-Decoder (Focal-T)."
+
+inputs = [gr.inputs.Image(type='pil'), gr.Textbox(label="instruction")]
+gr.Interface(
+    fn=inference,
+    inputs=inputs,
+    outputs=[
+        gr.outputs.Image(
+            type="pil",
+            label="segmentation results"),
+        gr.Textbox(label="text results"),
+        gr.outputs.Image(
+            type="pil",
+            label="inpainting results"),
+    ],
+    examples=[
+        ["./images/apples.jpg", "change green apple to a red apple"],
+        ["./images/girl_and_two_boys.png", "remove the boy with blue backpack"],
+        ["./images/dog.png", "remove the dog"],
+    ],
+    title=title,
+    description=description,
+    article=article,
+    allow_flagging='never',
+    cache_examples=True,
+).launch(share=True)
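app.py installs a detectron2 fork at import time, downloads the two Focal-T checkpoints if absent, builds two models (the full one and a no-visual-grounding variant used for captioning), primes the language encoder with placeholder text embeddings, and exposes inference to a Gradio Interface with a segmentation image, a textbox, and an inpainting image as outputs. A minimal sketch of exercising inference outside the UI follows; it assumes a CUDA machine and that the trailing gr.Interface(...).launch(...) call is removed or guarded by if __name__ == "__main__" so the import does not block.

# Minimal sketch: call the demo's inference function directly, bypassing Gradio.
# Assumes app.py's module-level launch is guarded, a CUDA device is available,
# and the Interface's three declared outputs map to a 3-tuple return value.
from PIL import Image
import app  # importing app.py builds the models and downloads checkpoints

img = Image.open("./images/apples.jpg")
seg, text, inpainted = app.inference(img, "change green apple to a red apple")
seg.save("segmentation.png")
inpainted.save("inpainting.png")
print(text)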
configs/xdecoder/svlp_focalt_lang.yaml ADDED
@@ -0,0 +1,110 @@
+# --------------------------------------------------------
+# X-Decoder -- Generalized Decoding for Pixel, Image, and Language
+# Copyright (c) 2022 Microsoft
+# Licensed under The MIT License [see LICENSE for details]
+# Written by Xueyan Zou (xueyan@cs.wisc.edu)
+# --------------------------------------------------------
+
+##################
+# Task settings
+##################
+VERBOSE: true
+MODEL:
+  NAME: xdecoder_model
+  HEAD: xdecoder_head
+  DIM_PROJ: 512
+  BACKBONE_DIM: 768
+  TEXT:
+    ARCH: vlpencoder
+    NAME: transformer
+    TOKENIZER: clip
+    CONTEXT_LENGTH: 77 # 77
+    WIDTH: 512
+    HEADS: 8
+    LAYERS: 12 # 6
+    AUTOGRESSIVE: True
+  BACKBONE:
+    NAME: focal_dw
+    PRETRAINED: ''
+    LOAD_PRETRAINED: false
+    FOCAL:
+      PRETRAIN_IMG_SIZE: 224
+      PATCH_SIZE: 4
+      EMBED_DIM: 96
+      DEPTHS: [2, 2, 6, 2]
+      FOCAL_LEVELS: [3, 3, 3, 3]
+      FOCAL_WINDOWS: [3, 3, 3, 3]
+      DROP_PATH_RATE: 0.3
+      MLP_RATIO: 4.0
+      DROP_RATE: 0.0
+      PATCH_NORM: True
+      USE_CONV_EMBED: True
+      SCALING_MODULATOR: True
+      USE_CHECKPOINT: False
+      USE_POSTLN: true
+      USE_POSTLN_IN_MODULATION: false
+      USE_LAYERSCALE: True
+      OUT_FEATURES: ["res2", "res3", "res4", "res5"]
+      OUT_INDICES: [0, 1, 2, 3]
+  ENCODER:
+    NAME: transformer_encoder_fpn
+    IGNORE_VALUE: 255
+    NUM_CLASSES: 133
+    LOSS_WEIGHT: 1.0
+    CONVS_DIM: 512
+    MASK_DIM: 512
+    NORM: "GN"
+    IN_FEATURES: ["res2", "res3", "res4", "res5"]
+    DEFORMABLE_TRANSFORMER_ENCODER_IN_FEATURES: ["res3", "res4", "res5"]
+    COMMON_STRIDE: 4
+    TRANSFORMER_ENC_LAYERS: 6
+  DECODER:
+    NAME: xdecoder
+    TRANSFORMER_IN_FEATURE: "multi_scale_pixel_decoder"
+    MASK: True
+    GROUNDING:
+      ENABLED: True
+      MAX_LEN: 5
+      TEXT_WEIGHT: 2.0
+      CLASS_WEIGHT: 0.5
+    DETECTION: False
+    CAPTION:
+      ENABLED: True
+      PHRASE_PROB: 0.0
+      SIM_THRES: 0.95
+    CAPTIONING:
+      ENABLED: True
+      STEP: 50
+    RETRIEVAL:
+      ENABLED: True
+      DIM_IMG: 768
+      ENSEMBLE: True
+    HIDDEN_DIM: 512
+    NUM_OBJECT_QUERIES: 101
+    NHEADS: 8
+    DROPOUT: 0.0
+    DIM_FEEDFORWARD: 2048
+    PRE_NORM: False
+    ENFORCE_INPUT_PROJ: False
+    SIZE_DIVISIBILITY: 32
+    TRAIN_NUM_POINTS: 12544
+    OVERSAMPLE_RATIO: 3.0
+    IMPORTANCE_SAMPLE_RATIO: 0.75
+    DEC_LAYERS: 10 # 9 decoder layers, add one for the loss on learnable query
+    TOP_GROUNDING_LAYERS: 3
+    TOP_CAPTION_LAYERS: 3
+    TOP_CAPTIONING_LAYERS: 3
+    TOP_RETRIEVAL_LAYERS: 3
+    TOP_OPENIMAGE_LAYERS: 10
+    TEST:
+      SEMANTIC_ON: True
+      INSTANCE_ON: True
+      PANOPTIC_ON: True
+      OVERLAP_THRESHOLD: 0.8
+      OBJECT_MASK_THRESHOLD: 0.4
+      SEM_SEG_POSTPROCESSING_BEFORE_INFERENCE: false
+      DETECTIONS_PER_IMAGE: 100
+
+INPUT:
+  PIXEL_MEAN: [123.675, 116.280, 103.530]
+  PIXEL_STD: [58.395, 57.120, 57.375]
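app.py loads this file via load_opt_from_config_files and hands the resulting options to build_model. A minimal sketch of the loading step using plain PyYAML follows; the real helper may merge several files and apply defaults beyond this.

# Minimal sketch: load the config roughly the way app.py's helper does.
# load_opt_from_config_files may do extra merging/validation beyond this.
import yaml

with open("configs/xdecoder/svlp_focalt_lang.yaml") as f:
    opt = yaml.safe_load(f)

# Nested keys mirror the sections above.
print(opt["MODEL"]["DECODER"]["HIDDEN_DIM"])        # 512
print(opt["MODEL"]["BACKBONE"]["FOCAL"]["DEPTHS"])  # [2, 2, 6, 2]
print(opt["INPUT"]["PIXEL_MEAN"])                   # [123.675, 116.28, 103.53]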
images/apples.jpg ADDED
images/coco/000.jpg ADDED
images/coco/001.jpg ADDED
images/coco/002.jpg ADDED
images/coco/003.jpg ADDED
images/coco/004.jpg ADDED
images/coco/005.jpg ADDED
images/coco/006.jpg ADDED
images/coco/007.jpg ADDED
images/coco/008.jpg ADDED
images/coco/009.jpg ADDED
images/coco/010.jpg ADDED
images/coco/011.jpg ADDED
images/coco/012.jpg ADDED
images/coco/013.jpg ADDED
images/coco/014.jpg ADDED
images/coco/015.jpg ADDED
images/coco/016.jpg ADDED
images/coco/017.jpg ADDED
images/coco/018.jpg ADDED
images/coco/019.jpg ADDED
images/coco/020.jpg ADDED
images/coco/021.jpg ADDED
images/coco/022.jpg ADDED
images/coco/023.jpg ADDED
images/coco/024.jpg ADDED
images/coco/025.jpg ADDED
images/coco/026.jpg ADDED
images/coco/027.jpg ADDED
images/coco/028.jpg ADDED
images/coco/029.jpg ADDED
images/coco/030.jpg ADDED
images/coco/031.jpg ADDED
images/coco/032.jpg ADDED
images/coco/033.jpg ADDED
images/coco/034.jpg ADDED
images/coco/035.jpg ADDED
images/coco/036.jpg ADDED
images/coco/037.jpg ADDED
images/coco/038.jpg ADDED
images/coco/039.jpg ADDED
images/coco/040.jpg ADDED
images/coco/041.jpg ADDED
images/coco/042.jpg ADDED
images/coco/043.jpg ADDED