diff --git a/.gitattributes b/.gitattributes
index de5b51c979c83fa4ec5e62eaebfb389f17cc3131..79c8dd63a4292f0b8125c5ef683e17b49e9b1712 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -40,23 +40,33 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.gif filter=lfs diff=lfs merge=lfs -text
*.bmp filter=lfs diff=lfs merge=lfs -text
*.tiff filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rm_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rm_fg/image_edit_82314e18-c64c-4003-9ef9-52cebf254b2f_2.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rm_fg/mask_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rm_fg/masked_image_82314e18-c64c-4003-9ef9-52cebf254b2f.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_bg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_fg/hedgehog.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_fg/image_edit_5cab3448-5a3a-459c-9144-35cca3d34273_0.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_fg/mask_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
-assets/hedgehog_rp_fg/masked_image_5cab3448-5a3a-459c-9144-35cca3d34273.png filter=lfs diff=lfs merge=lfs -text
-assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png filter=lfs diff=lfs merge=lfs -text
-assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
-assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png filter=lfs diff=lfs merge=lfs -text
-assets/mona_lisa/mona_lisa.png filter=lfs diff=lfs merge=lfs -text
-assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png filter=lfs diff=lfs merge=lfs -text
-assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
-assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png filter=lfs diff=lfs merge=lfs -text
-assets/sunflower_girl/sunflower_girl.png filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas/angel_christmas.png filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas/prompt.txt filter=lfs diff=lfs merge=lfs -text
+assets/pigeon_rm filter=lfs diff=lfs merge=lfs -text
+assets/brushedit_teaser.png filter=lfs diff=lfs merge=lfs -text
+assets/chenduling filter=lfs diff=lfs merge=lfs -text
+assets/chinese_girl filter=lfs diff=lfs merge=lfs -text
+assets/example.png filter=lfs diff=lfs merge=lfs -text
+assets/frog filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rm_fg filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_fg filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_curl filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_cowboy_hat filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_crown filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_rm filter=lfs diff=lfs merge=lfs -text
+assets/angel_christmas filter=lfs diff=lfs merge=lfs -text
+assets/anime_flower filter=lfs diff=lfs merge=lfs -text
+assets/logo_brushedit.png filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_devil_horn filter=lfs diff=lfs merge=lfs -text
+assets/sunflower_girl filter=lfs diff=lfs merge=lfs -text
+assets/upload.png filter=lfs diff=lfs merge=lfs -text
+assets/demo_vis.png filter=lfs diff=lfs merge=lfs -text
+assets/girl_on_sun filter=lfs diff=lfs merge=lfs -text
+assets/hedgehog_rp_bg filter=lfs diff=lfs merge=lfs -text
+assets/mona_lisa filter=lfs diff=lfs merge=lfs -text
+assets/olsen filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_cat_ears filter=lfs diff=lfs merge=lfs -text
+assets/spider_man_witch_hat filter=lfs diff=lfs merge=lfs -text
diff --git a/README.md b/README.md
index 8e1640e30358f6cd7ca83b50f381ac260b5df920..2b150352112e2ea9e47bcb3702f3d7b04636a1b1 100644
--- a/README.md
+++ b/README.md
@@ -5,7 +5,7 @@ colorFrom: indigo
colorTo: gray
sdk: gradio
sdk_version: 4.38.1
-app_file: app/gpt4_o/brushedit_app.py
+app_file: app/src/brushedit_app.py
pinned: false
-python_version: 3.1
----
\ No newline at end of file
+python_version: 3.10
+---
diff --git a/app/down_load_brushedit.py b/app/down_load_brushedit.py
new file mode 100644
index 0000000000000000000000000000000000000000..040f0fb3cac524ba22eb7ef9244a9fc49ef98856
--- /dev/null
+++ b/app/down_load_brushedit.py
@@ -0,0 +1,13 @@
+import os
+from huggingface_hub import snapshot_download
+
+# download hf models
+BrushEdit_path = "models/"
+if not os.path.exists(BrushEdit_path):
+ BrushEdit_path = snapshot_download(
+ repo_id="TencentARC/BrushEdit",
+ local_dir=BrushEdit_path,
+ token=os.getenv("HF_TOKEN"),
+ )
+
+print("Downloaded BrushEdit to ", BrushEdit_path)
diff --git a/app/down_load_brushedit.sh b/app/down_load_brushedit.sh
new file mode 100644
index 0000000000000000000000000000000000000000..3f6c02d21e126b78d20db99d3c4232e7ebe3b125
--- /dev/null
+++ b/app/down_load_brushedit.sh
@@ -0,0 +1,3 @@
+export PYTHONPATH=.:$PYTHONPATH
+
+python app/down_load_brushedit.py
\ No newline at end of file
diff --git a/app/gpt4_o/brushedit_app.py b/app/gpt4_o/brushedit_app.py
deleted file mode 100644
index eee67bfa98551e21a3570f9fc8f626d3187c1c12..0000000000000000000000000000000000000000
--- a/app/gpt4_o/brushedit_app.py
+++ /dev/null
@@ -1,914 +0,0 @@
-##!/usr/bin/python3
-# -*- coding: utf-8 -*-
-import os, random
-import numpy as np
-import torch
-
-import gradio as gr
-import spaces
-
-from PIL import Image
-
-
-from huggingface_hub import hf_hub_download
-
-from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
-from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
-from scipy.ndimage import binary_dilation, binary_erosion
-
-from app.gpt4_o.vlm_pipeline import (
- vlm_response_editing_type,
- vlm_response_object_wait_for_edit,
- vlm_response_mask,
- vlm_response_prompt_after_apply_instruction
-)
-from app.gpt4_o.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
-from app.utils.utils import load_grounding_dino_model
-
-
-#### Description ####
-head = r"""
-
-
BrushEdit: All-In-One Image Inpainting and Editing
-
-
-
-"""
-descriptions = r"""
-Official Gradio Demo for BrushEdit: All-In-One Image Inpainting and Editing
-🧙 BrushEdit enables precise, user-friendly instruction-based image editing via a inpainting model.
-"""
-
-instructions = r"""
-Currently, we support two modes: fully automated command editing and interactive command editing.
-
-🛠️ Fully automated instruction-based editing:
-
- - ⭐️ step1: Upload or select one image from Example.
- - ⭐️ step2: Input the instructions (supports addition, deletion, and modification), e.g. remove xxx .
- - ⭐️ step3: Click Run button to automatic edit image.
-
-
-🛠️ Interactive instruction-based editing:
-
- - ⭐️ step1: Upload or select one image from Example.
- - ⭐️ step2: Use a brush to outline the area you want to edit.
- - ⭐️ step3: Input the instructions.
- - ⭐️ step4: Click Run button to automatic edit image.
-
-
-💡 Some tips:
-
- - 🤠 After input the instructions, you can click the Generate Mask button. The mask generated by VLM will be displayed in the preview panel on the right side.
- - 🤠 After generating the mask or when you use the brush to draw the mask, you can perform operations such as randomization, dilation, erosion, and movement.
- - 🤠 After input the instructions, you can click the Generate Target Prompt button. The target prompt will be displayed in the text box, and you can modify it according to your ideas.
-
-
-☕️ Have fun!
- """
-
-
-# - - - - - examples - - - - - #
-EXAMPLES = [
- # [
- # {"background": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
- # "layers": [Image.new("RGBA", (Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").width, Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").height), (0, 0, 0, 0))],
- # "composite": Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA")},
- # # Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.png").convert("RGBA"),
- # "add a shining necklace",
- # # [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
- # # [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
- # # [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
- # ],
-
- [
- # load_image_from_url("https://github.com/liyaowei-stu/BrushEdit/blob/main/assets/mona_lisa/mona_lisa.png"),
- Image.open("assets/mona_lisa/mona_lisa.png").convert("RGBA"),
- "add a shining necklace",
- # [Image.open("assets/mona_lisa/image_edit_aae09152-4495-4332-b691-a0c7bff524be_2.jpg")],
- # [Image.open("assets/mona_lisa/mask_aae09152-4495-4332-b691-a0c7bff524be.png")],
- # [Image.open("assets/mona_lisa/masked_image_aae09152-4495-4332-b691-a0c7bff524be.png")]
- ],
-
-
-
-
-]
-
-
-## init VLM
-from openai import OpenAI
-
-OPENAI_API_KEY = os.getenv('OPENAI_API_KEY')
-os.environ['OPENAI_API_KEY'] = OPENAI_API_KEY
-vlm = OpenAI(base_url="http://v2.open.venus.oa.com/llmproxy")
-device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
-
-
-
-# download hf models
-base_model_path = hf_hub_download(
- repo_id="Yw22/BrushEdit",
- subfolder="base_model/realisticVisionV60B1_v51VAE",
- token=os.getenv("HF_TOKEN"),
-)
-
-
-brushnet_path = hf_hub_download(
- repo_id="Yw22/BrushEdit",
- subfolder="brushnetX",
- token=os.getenv("HF_TOKEN"),
-)
-
-sam_path = hf_hub_download(
- repo_id="Yw22/BrushEdit",
- subfolder="sam",
- filename="sam_vit_h_4b8939.pth",
- token=os.getenv("HF_TOKEN"),
-)
-
-groundingdino_path = hf_hub_download(
- repo_id="Yw22/BrushEdit",
- subfolder="grounding_dino",
- filename="groundingdino_swint_ogc.pth",
- token=os.getenv("HF_TOKEN"),
-)
-
-
-# input brushnetX ckpt path
-brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch.float16)
-pipe = StableDiffusionBrushNetPipeline.from_pretrained(
- base_model_path, brushnet=brushnet, torch_dtype=torch.float16, low_cpu_mem_usage=False
- )
-# speed up diffusion process with faster scheduler and memory optimization
-pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
-# remove following line if xformers is not installed or when using Torch 2.0.
-# pipe.enable_xformers_memory_efficient_attention()
-pipe.enable_model_cpu_offload()
-
-
-## init SAM
-sam = build_sam(checkpoint=sam_path)
-sam.to(device=device)
-sam_predictor = SamPredictor(sam)
-sam_automask_generator = SamAutomaticMaskGenerator(sam)
-
-## init groundingdino_model
-config_file = 'third_party/Grounded-Segment-Anything/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py'
-groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
-
-## Ordinary function
-def crop_and_resize(image: Image.Image,
- target_width: int,
- target_height: int) -> Image.Image:
- """
- Crops and resizes an image while preserving the aspect ratio.
-
- Args:
- image (Image.Image): Input PIL image to be cropped and resized.
- target_width (int): Target width of the output image.
- target_height (int): Target height of the output image.
-
- Returns:
- Image.Image: Cropped and resized image.
- """
- # Original dimensions
- original_width, original_height = image.size
- original_aspect = original_width / original_height
- target_aspect = target_width / target_height
-
- # Calculate crop box to maintain aspect ratio
- if original_aspect > target_aspect:
- # Crop horizontally
- new_width = int(original_height * target_aspect)
- new_height = original_height
- left = (original_width - new_width) / 2
- top = 0
- right = left + new_width
- bottom = original_height
- else:
- # Crop vertically
- new_width = original_width
- new_height = int(original_width / target_aspect)
- left = 0
- top = (original_height - new_height) / 2
- right = original_width
- bottom = top + new_height
-
- # Crop and resize
- cropped_image = image.crop((left, top, right, bottom))
- resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
-
- return resized_image
-
-
-def move_mask_func(mask, direction, units):
- binary_mask = mask.squeeze()>0
- rows, cols = binary_mask.shape
-
- moved_mask = np.zeros_like(binary_mask, dtype=bool)
-
- if direction == 'down':
- # move down
- moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]
-
- elif direction == 'up':
- # move up
- moved_mask[:rows - units, :] = binary_mask[units:, :]
-
- elif direction == 'right':
- # move left
- moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]
-
- elif direction == 'left':
- # move right
- moved_mask[:, :cols - units] = binary_mask[:, units:]
-
- return moved_mask
-
-
-def random_mask_func(mask, dilation_type='square'):
- # Randomly select the size of dilation
- dilation_size = np.random.randint(20, 40) # Randomly select the size of dilation
- binary_mask = mask.squeeze()>0
-
- if dilation_type == 'square_dilation':
- structure = np.ones((dilation_size, dilation_size), dtype=bool)
- dilated_mask = binary_dilation(binary_mask, structure=structure)
- elif dilation_type == 'square_erosion':
- structure = np.ones((dilation_size, dilation_size), dtype=bool)
- dilated_mask = binary_erosion(binary_mask, structure=structure)
- elif dilation_type == 'bounding_box':
- # find the most left top and left bottom point
- rows, cols = np.where(binary_mask)
- if len(rows) == 0 or len(cols) == 0:
- return mask # return original mask if no valid points
-
- min_row = np.min(rows)
- max_row = np.max(rows)
- min_col = np.min(cols)
- max_col = np.max(cols)
-
- # create a bounding box
- dilated_mask = np.zeros_like(binary_mask, dtype=bool)
- dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True
-
- elif dilation_type == 'bounding_ellipse':
- # find the most left top and left bottom point
- rows, cols = np.where(binary_mask)
- if len(rows) == 0 or len(cols) == 0:
- return mask # return original mask if no valid points
-
- min_row = np.min(rows)
- max_row = np.max(rows)
- min_col = np.min(cols)
- max_col = np.max(cols)
-
- # calculate the center and axis length of the ellipse
- center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
- a = (max_col - min_col) // 2 # half long axis
- b = (max_row - min_row) // 2 # half short axis
-
- # create a bounding ellipse
- y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
- ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
- dilated_mask = np.zeros_like(binary_mask, dtype=bool)
- dilated_mask[ellipse_mask] = True
- else:
- raise ValueError("dilation_type must be 'square' or 'ellipse'")
-
- # use binary dilation
- dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
- return dilated_mask
-
-
-## Gradio component function
-@spaces.GPU(duration=180)
-def process(input_image,
- original_image,
- original_mask,
- prompt,
- negative_prompt,
- control_strength,
- seed,
- randomize_seed,
- guidance_scale,
- num_inference_steps,
- num_samples,
- blending,
- category,
- target_prompt,
- resize_and_crop):
-
- import ipdb; ipdb.set_trace()
- if original_image is None:
- raise gr.Error('Please upload the input image')
- if prompt is None:
- raise gr.Error("Please input your instructions, e.g., remove the xxx")
-
-
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- # load example image
- # if isinstance(original_image, str):
- # # image_name = image_examples[original_image][0]
- # # original_image = cv2.imread(image_name)
- # # original_image = cv2.cvtColor(original_image, cv2.COLOR_BGR2RGB)
- # original_image = input_image
- # num_samples = 1
- # blending = True
-
- if category is not None:
- pass
- else:
- category = vlm_response_editing_type(vlm, original_image, prompt)
-
-
- if original_mask is not None:
- original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
- else:
- object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
- category,
- prompt)
- original_mask = vlm_response_mask(vlm,
- category,
- original_image,
- prompt,
- object_wait_for_edit,
- sam,
- sam_predictor,
- sam_automask_generator,
- groundingdino_model,
- )[:,:,None]
-
-
- if len(target_prompt) <= 1:
- prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
- original_image,
- prompt)
- else:
- prompt_after_apply_instruction = target_prompt
-
- generator = torch.Generator("cuda").manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)
-
-
-
- image, mask_image = BrushEdit_Pipeline(pipe,
- prompt_after_apply_instruction,
- original_mask,
- original_image,
- generator,
- num_inference_steps,
- guidance_scale,
- control_strength,
- negative_prompt,
- num_samples,
- blending)
-
- masked_image = original_image * (1 - (original_mask>0))
- masked_image = masked_image.astype(np.uint8)
- masked_image = Image.fromarray(masked_image)
-
- import uuid
- uuid = str(uuid.uuid4())
- image[0].save(f"outputs/image_edit_{uuid}_0.png")
- image[1].save(f"outputs/image_edit_{uuid}_1.png")
- image[2].save(f"outputs/image_edit_{uuid}_2.png")
- image[3].save(f"outputs/image_edit_{uuid}_3.png")
- mask_image.save(f"outputs/mask_{uuid}.png")
- masked_image.save(f"outputs/masked_image_{uuid}.png")
- return image, [mask_image], [masked_image], ''
-
-
-def generate_target_prompt(input_image,
- original_image,
- prompt):
- # load example image
- if isinstance(original_image, str):
- original_image = input_image
-
- prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(vlm,
- original_image,
- prompt)
- return prompt_after_apply_instruction
-
-
-def process_mask(input_image,
- original_image,
- prompt,
- resize_and_crop):
- if original_image is None:
- raise gr.Error('Please upload the input image')
- if prompt is None:
- raise gr.Error("Please input your instructions, e.g., remove the xxx")
-
- ## load mask
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.array(alpha_mask)
-
- # load example image
- if isinstance(original_image, str):
- original_image = input_image["background"]
-
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
-
- if input_mask.max() == 0:
- category = vlm_response_editing_type(vlm, original_image, prompt)
-
- object_wait_for_edit = vlm_response_object_wait_for_edit(vlm,
- category,
- prompt)
- # original mask: h,w,1 [0, 255]
- original_mask = vlm_response_mask(
- vlm,
- category,
- original_image,
- prompt,
- object_wait_for_edit,
- sam,
- sam_predictor,
- sam_automask_generator,
- groundingdino_model,
- )[:,:,None]
- else:
- original_mask = input_mask[:,:,None]
- category = None
-
-
- mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")
-
- masked_image = original_image * (1 - (original_mask>0))
- masked_image = masked_image.astype(np.uint8)
- masked_image = Image.fromarray(masked_image)
-
- ## not work for image editor
- # background = input_image["background"]
- # mask_array = original_mask.squeeze()
- # layer_rgba = np.array(input_image['layers'][0])
- # layer_rgba[mask_array > 0] = [0, 0, 0, 255]
- # layer_rgba = Image.fromarray(layer_rgba, 'RGBA')
- # black_image = Image.new("RGBA", layer_rgba.size, (0, 0, 0, 255))
- # composite = Image.composite(black_image, background, layer_rgba)
- # output_base = {"layers": [layer_rgba], "background": background, "composite": composite}
-
-
- return [masked_image], [mask_image], original_mask.astype(np.uint8), category
-
-
-def process_random_mask(input_image, original_image, original_mask, resize_and_crop):
-
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
-
- dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
-
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
-
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
-
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
-
-
-def process_dilation_mask(input_image, original_image, original_mask, resize_and_crop):
-
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- dilation_type = np.random.choice(['square_dilation'])
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
-
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
-
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
-
-
-def process_erosion_mask(input_image, original_image, original_mask, resize_and_crop):
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- dilation_type = np.random.choice(['square_erosion'])
- random_mask = random_mask_func(original_mask, dilation_type).squeeze()
-
- mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")
-
- masked_image = original_image * (1 - (random_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
-
- return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
-
-
-def move_mask_left(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
-
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
-
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
- if moved_mask.max() <= 1:
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
- original_mask = moved_mask
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
-
-
-def move_mask_right(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()
-
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
-
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
-
- if moved_mask.max() <= 1:
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
- original_mask = moved_mask
-
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
-
-
-def move_mask_up(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
-
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
- if moved_mask.max() <= 1:
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
- original_mask = moved_mask
-
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
-
-
-def move_mask_down(input_image, original_image, original_mask, moving_pixels, resize_and_crop):
- alpha_mask = input_image["layers"][0].split()[3]
- input_mask = np.asarray(alpha_mask)
- if resize_and_crop:
- original_image = crop_and_resize(Image.fromarray(original_image), target_width=640, target_height=640)
- input_mask = crop_and_resize(Image.fromarray(input_mask), target_width=640, target_height=640)
- original_image = np.array(original_image)
- input_mask = np.array(input_mask)
-
- if input_mask.max() == 0:
- if original_mask is None:
- raise gr.Error('Please generate mask first')
- original_mask = original_mask
- else:
- original_mask = input_mask[:,:,None]
-
- moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
- mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")
-
- masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
- masked_image = masked_image.astype(original_image.dtype)
- masked_image = Image.fromarray(masked_image)
-
- if moved_mask.max() <= 1:
- moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
- original_mask = moved_mask
-
- return [masked_image], [mask_image], original_mask.astype(np.uint8)
-
-
-def store_img(base):
- import ipdb; ipdb.set_trace()
- image_pil = base["background"].convert("RGB")
- original_image = np.array(image_pil)
- # import ipdb; ipdb.set_trace()
- if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
- raise gr.Error('image aspect ratio cannot be larger than 2.0')
- return base, original_image, None, "", None, None, None, None, None
-
-
-def reset_func(input_image, original_image, original_mask, prompt, target_prompt):
- input_image = None
- original_image = None
- original_mask = None
- prompt = ''
- mask_gallery = []
- masked_gallery = []
- result_gallery = []
- target_prompt = ''
- return input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt
-
-
-block = gr.Blocks(
- theme=gr.themes.Soft(
- radius_size=gr.themes.sizes.radius_none,
- text_size=gr.themes.sizes.text_md
- )
- ).queue()
-with block as demo:
- with gr.Row():
- with gr.Column():
- gr.HTML(head)
-
- gr.Markdown(descriptions)
-
- with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
- with gr.Row(equal_height=True):
- gr.Markdown(instructions)
-
- original_image = gr.State(value=None)
- original_mask = gr.State(value=None)
- category = gr.State(value=None)
-
- with gr.Row():
- with gr.Column():
- with gr.Row():
- input_image = gr.ImageEditor(
- label="Input Image",
- type="pil",
- brush=gr.Brush(colors=["#000000"], default_size = 30, color_mode="fixed"),
- layers = False,
- interactive=True,
- height=800,
- # transforms=("crop"),
- # crop_size=(640, 640),
- )
-
- prompt = gr.Textbox(label="Prompt", placeholder="Please input your instruction.",value='',lines=1)
-
- with gr.Row():
- mask_button = gr.Button("Generate Mask")
- random_mask_button = gr.Button("Random Generated Mask")
- with gr.Row():
- dilation_mask_button = gr.Button("Dilation Generated Mask")
- erosion_mask_button = gr.Button("Erosion Generated Mask")
-
- with gr.Row():
- generate_target_prompt_button = gr.Button("Generate Target Prompt")
- run_button = gr.Button("Run")
-
-
- target_prompt = gr.Text(
- label="Target prompt",
- max_lines=5,
- placeholder="VLM-generated target prompt, you can first generate if and then modify it (optional)",
- value='',
- lines=2
- )
-
- resize_and_crop = gr.Checkbox(label="Resize and Crop (640 x 640)", value=False)
-
- with gr.Accordion("More input params (highly-recommended)", open=False, elem_id="accordion1"):
- negative_prompt = gr.Text(
- label="Negative Prompt",
- max_lines=5,
- placeholder="Please input your negative prompt",
- value='ugly, low quality',lines=1
- )
-
- control_strength = gr.Slider(
- label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
- )
- with gr.Group():
- seed = gr.Slider(
- label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
- )
- randomize_seed = gr.Checkbox(label="Randomize seed", value=False)
-
- blending = gr.Checkbox(label="Blending mode", value=True)
-
-
- num_samples = gr.Slider(
- label="Num samples", minimum=0, maximum=4, step=1, value=4
- )
-
- with gr.Group():
- with gr.Row():
- guidance_scale = gr.Slider(
- label="Guidance scale",
- minimum=1,
- maximum=12,
- step=0.1,
- value=7.5,
- )
- num_inference_steps = gr.Slider(
- label="Number of inference steps",
- minimum=1,
- maximum=50,
- step=1,
- value=50,
- )
-
-
- with gr.Column():
- with gr.Row():
- with gr.Tabs(elem_classes=["feedback"]):
- with gr.TabItem("Mask"):
- mask_gallery = gr.Gallery(label='Mask', show_label=False, elem_id="gallery", preview=True, height=360)
- with gr.Tabs(elem_classes=["feedback"]):
- with gr.TabItem("Masked Image"):
- masked_gallery = gr.Gallery(label='Masked Image', show_label=False, elem_id="gallery", preview=True, height=360)
-
- moving_pixels = gr.Slider(
- label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
- )
- with gr.Row():
- move_left_button = gr.Button("Move Left")
- move_right_button = gr.Button("Move Right")
- with gr.Row():
- move_up_button = gr.Button("Move Up")
- move_down_button = gr.Button("Move Down")
-
- with gr.Tabs(elem_classes=["feedback"]):
- with gr.TabItem("Outputs"):
- result_gallery = gr.Gallery(label='Output', show_label=False, elem_id="gallery", preview=True, height=360)
-
- reset_button = gr.Button("Reset")
-
-
- with gr.Row():
- # # example = gr.Examples(
- # # label="Quick Example",
- # # examples=EXAMPLES,
- # # inputs=[prompt, seed, result_gallery, mask_gallery, masked_gallery],
- # # examples_per_page=10,
- # # cache_examples=False,
- # # )
- example = gr.Examples(
- label="Quick Example",
- examples=EXAMPLES,
- inputs=[input_image, prompt],
- examples_per_page=10,
- cache_examples=False,
- )
- # def process_example(prompt, seed, eg_output):
- # import ipdb; ipdb.set_trace()
- # eg_output_path = os.path.join("assets/", eg_output)
- # return prompt, seed, [Image.open(eg_output_path)]
- # example = gr.Examples(
- # label="Quick Example",
- # examples=EXAMPLES,
- # inputs=[prompt, seed, eg_output],
- # outputs=[prompt, seed, result_gallery],
- # fn=process_example,
- # examples_per_page=10,
- # run_on_click=True,
- # cache_examples=False,
- # )
-
- input_image.upload(
- store_img,
- [input_image],
- [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt]
- )
-
-
- ips=[input_image,
- original_image,
- original_mask,
- prompt,
- negative_prompt,
- control_strength,
- seed,
- randomize_seed,
- guidance_scale,
- num_inference_steps,
- num_samples,
- blending,
- category,
- target_prompt,
- resize_and_crop]
-
- ## run brushedit
- run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, target_prompt])
-
- ## mask func
- mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask, category])
- random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
- dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[ masked_gallery, mask_gallery, original_mask])
- erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_and_crop], outputs=[ masked_gallery, mask_gallery, original_mask])
-
- ## move mask func
- move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
- move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
- move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
- move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_and_crop], outputs=[masked_gallery, mask_gallery, original_mask])
-
- ## prompt func
- generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt])
-
- ## reset func
- reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt])
-
-demo.launch(server_name="0.0.0.0")
diff --git a/app/gpt4_o/instructions.py b/app/gpt4_o/instructions.py
index 2775a9286234c9624c8cbb1b3265565f84e55505..04fbd47a7fd0cd09be176ab2e42fccd847b1f1f7 100644
--- a/app/gpt4_o/instructions.py
+++ b/app/gpt4_o/instructions.py
@@ -1,15 +1,16 @@
-def create_editing_category_messages(editing_prompt):
+def create_editing_category_messages_gpt4o(editing_prompt):
messages = [{
"role": "system",
"content": [
{
"type": "text",
- "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
- 1. Addition: Adding new objects within the images, e.g., add a bird to the image \n\
- 2. Remove: Removing objects, e.g., remove the mask \n\
- 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
- 4. Global: Edit the entire image, e.g., let's see it in winter \n\
- 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.",
+ "text": "I will give you an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
+ 1. Addition: Adding new objects within the images, e.g., add a bird \n\
+ 2. Remove: Removing objects, e.g., remove the mask \n\
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc. \n\
+ Only output a single word, e.g., 'Addition'.",
},]
},
{
@@ -24,7 +25,7 @@ def create_editing_category_messages(editing_prompt):
return messages
-def create_ori_object_messages(editing_prompt):
+def create_ori_object_messages_gpt4o(editing_prompt):
messages = [
{
@@ -49,7 +50,7 @@ def create_ori_object_messages(editing_prompt):
return messages
-def create_add_object_messages(editing_prompt, base64_image, height=640, width=640):
+def create_add_object_messages_gpt4o(editing_prompt, base64_image, height=640, width=640):
size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "
@@ -77,7 +78,7 @@ def create_add_object_messages(editing_prompt, base64_image, height=640, width=6
return messages
-def create_apply_editing_messages(editing_prompt, base64_image):
+def create_apply_editing_messages_gpt4o(editing_prompt, base64_image):
messages = [
{
"role": "system",
diff --git a/app/gpt4_o/requirements.txt b/app/gpt4_o/requirements.txt
deleted file mode 100644
index 5fdb90a79abce9bc11ed20b46b8b3bc5818498f4..0000000000000000000000000000000000000000
--- a/app/gpt4_o/requirements.txt
+++ /dev/null
@@ -1,18 +0,0 @@
-torchvision
-transformers>=4.25.1
-ftfy
-tensorboard
-datasets
-Pillow==9.5.0
-opencv-python
-imgaug
-accelerate==0.20.3
-image-reward
-hpsv2
-torchmetrics
-open-clip-torch
-clip
-# gradio==4.44.1
-gradio==4.38.1
-segment_anything
-openai
\ No newline at end of file
diff --git a/app/gpt4_o/vlm_pipeline.py b/app/gpt4_o/vlm_pipeline.py
deleted file mode 100644
index 40fa49da409386abe0b70d4cc1ffeabf2a47eaff..0000000000000000000000000000000000000000
--- a/app/gpt4_o/vlm_pipeline.py
+++ /dev/null
@@ -1,138 +0,0 @@
-import base64
-import re
-import torch
-
-from PIL import Image
-from io import BytesIO
-import numpy as np
-import gradio as gr
-
-
-from app.gpt4_o.instructions import (
- create_editing_category_messages,
- create_ori_object_messages,
- create_add_object_messages,
- create_apply_editing_messages)
-
-from app.utils.utils import run_grounded_sam
-
-
-def encode_image(img):
- img = Image.fromarray(img.astype('uint8'))
- buffered = BytesIO()
- img.save(buffered, format="PNG")
- img_bytes = buffered.getvalue()
- return base64.b64encode(img_bytes).decode('utf-8')
-
-
-def run_gpt4o_vl_inference(vlm,
- messages):
- response = vlm.chat.completions.create(
- model="gpt-4o-2024-08-06",
- messages=messages
- )
- response_str = response.choices[0].message.content
- return response_str
-
-
-def vlm_response_editing_type(vlm,
- image,
- editing_prompt):
-
- base64_image = encode_image(image)
-
- messages = create_editing_category_messages(editing_prompt)
-
- response_str = run_gpt4o_vl_inference(vlm, messages)
-
- for category_name in ["Addition","Remove","Local","Global","Background"]:
- if category_name.lower() in response_str.lower():
- return category_name
- raise ValueError("Please input correct commands, including add, delete, and modify commands.")
-
-
-def vlm_response_object_wait_for_edit(vlm,
- category,
- editing_prompt):
- if category in ["Background", "Global", "Addition"]:
- edit_object = "nan"
- return edit_object
-
- messages = create_ori_object_messages(editing_prompt)
-
- response_str = run_gpt4o_vl_inference(vlm, messages)
- return response_str
-
-
-def vlm_response_mask(vlm,
- category,
- image,
- editing_prompt,
- object_wait_for_edit,
- sam=None,
- sam_predictor=None,
- sam_automask_generator=None,
- groundingdino_model=None,
- ):
- mask = None
- if editing_prompt is None or len(editing_prompt)==0:
- raise gr.Error("Please input the editing instruction!")
- height, width = image.shape[:2]
- if category=="Addition":
- base64_image = encode_image(image)
- messages = create_add_object_messages(editing_prompt, base64_image, height=height, width=width)
- try:
- response_str = run_gpt4o_vl_inference(vlm, messages)
- pattern = r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]'
- box = re.findall(pattern, response_str)
- box = box[0][1:-1].split(",")
- for i in range(len(box)):
- box[i] = int(box[i])
- cus_mask = np.zeros((height, width))
- cus_mask[box[1]: box[1]+box[3], box[0]: box[0]+box[2]]=255
- mask = cus_mask
- except:
- raise gr.Error("Please set the mask manually, MLLM cannot output the mask!")
-
- elif category=="Background":
- labels = "background"
- elif category=="Global":
- mask = 255 * np.zeros((height, width))
- else:
- labels = object_wait_for_edit
-
- if mask is None:
- for thresh in [0.3,0.25,0.2,0.15,0.1,0.05,0]:
- try:
- device = "cuda" if torch.cuda.is_available() else "cpu"
- detections = run_grounded_sam(
- input_image={"image":Image.fromarray(image.astype('uint8')),
- "mask":None},
- text_prompt=labels,
- task_type="seg",
- box_threshold=thresh,
- text_threshold=0.25,
- iou_threshold=0.5,
- scribble_mode="split",
- sam=sam,
- sam_predictor=sam_predictor,
- sam_automask_generator=sam_automask_generator,
- groundingdino_model=groundingdino_model,
- device=device,
- )
- mask = np.array(detections[0,0,...].cpu()) * 255
- break
- except:
- print(f"wrong in threshhold: {thresh}, continue")
- continue
- return mask
-
-
-def vlm_response_prompt_after_apply_instruction(vlm,
- image,
- editing_prompt):
- base64_image = encode_image(image)
- messages = create_apply_editing_messages(editing_prompt, base64_image)
-
- response_str = run_gpt4o_vl_inference(vlm, messages)
- return response_str
\ No newline at end of file
diff --git a/app/llava/instructions.py b/app/llava/instructions.py
new file mode 100644
index 0000000000000000000000000000000000000000..2fb52760d7b9039acae4b2da75d50fc4fa9d02b8
--- /dev/null
+++ b/app/llava/instructions.py
@@ -0,0 +1,108 @@
+def create_editing_category_messages_llava(editing_prompt):
+ messages = [{
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
+ 1. Addition: Adding new objects within the images, e.g., add a bird \n\
+ 2. Remove: Removing objects, e.g., remove the mask \n\
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc. \n\
+ Only output a single word, e.g., 'Addition'.",
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image"
+ },
+ {
+ "type": "text",
+ "text": editing_prompt
+ },
+ ]
+ }]
+ return messages
+
+
+def create_ori_object_messages_llava(editing_prompt):
+
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will give you an editing instruction of the image. Please output the object needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun. \n \
+ For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image"
+ },
+ {
+ "type": "text",
+ "text": editing_prompt
+ }
+ ]
+ }
+ ]
+ return messages
+
+
+def create_add_object_messages_llava(editing_prompt, height=640, width=640):
+
+ size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image"
+ },
+ {
+ "type": "text",
+ "text": "I need to add an object to the image following the instruction: " + editing_prompt + ". " + size_str + " \n \
+ Can you give me a possible bounding box of the location for the added object? Please output with the format of [top - left x coordinate , top - left y coordinate , box width , box height]. You should only output the bounding box position and nothing else. Please refer to the example below for the desired format.\n\
+ [Examples]\n \
+ [19, 101, 32, 153]\n \
+ [54, 12, 242, 96]"
+ },
+ ]
+ }
+ ]
+ return messages
+
+
+def create_apply_editing_messages_llava(editing_prompt):
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will provide an image along with an editing instruction. Please describe the new content that should be present in the image after applying the instruction. \n \
+ For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'. The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example. Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct."
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "image"
+ },
+ {
+ "type": "text",
+ "text": editing_prompt
+ },
+ ]
+ },
+ ]
+ return messages
diff --git a/app/qwen2/instructions.py b/app/qwen2/instructions.py
new file mode 100644
index 0000000000000000000000000000000000000000..746c2d49426056255020c903b328f9da32a32e4f
--- /dev/null
+++ b/app/qwen2/instructions.py
@@ -0,0 +1,103 @@
+def create_editing_category_messages_qwen2(editing_prompt):
+ messages = [{
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will give you an image and an editing instruction of the image. Please output which type of editing category it is in. You can choose from the following categories: \n\
+ 1. Addition: Adding new objects within the images, e.g., add a bird to the image \n\
+ 2. Remove: Removing objects, e.g., remove the mask \n\
+ 3. Local: Replace local parts of an object and later the object's attributes (e.g., make it smile) or alter an object's visual appearance without affecting its structure (e.g., change the cat to a dog) \n\
+ 4. Global: Edit the entire image, e.g., let's see it in winter \n\
+ 5. Background: Change the scene's background, e.g., have her walk on water, change the background to a beach, make the hedgehog in France, etc.",
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": editing_prompt
+ },
+ ]
+ }]
+ return messages
+
+
+def create_ori_object_messages_qwen2(editing_prompt):
+
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will give you an editing instruction of the image. Please output the object needed to be edited. You only need to output the basic description of the object in no more than 5 words. The output should only contain one noun. \n \
+ For example, the editing instruction is 'Change the white cat to a black dog'. Then you need to output: 'white cat'. Only output the new content. Do not output anything else."
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": editing_prompt
+ }
+ ]
+ }
+ ]
+ return messages
+
+
+def create_add_object_messages_qwen2(editing_prompt, base64_image, height=640, width=640):
+
+ size_str = f"The image size is height {height}px and width {width}px. The top - left corner is coordinate [0 , 0]. The bottom - right corner is coordinnate [{height} , {width}]. "
+
+ messages = [
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": "I need to add an object to the image following the instruction: " + editing_prompt + ". " + size_str + " \n \
+ Can you give me a possible bounding box of the location for the added object? Please output with the format of [top - left x coordinate , top - left y coordinate , box width , box height]. You should only output the bounding box position and nothing else. Please refer to the example below for the desired format.\n\
+ [Examples]\n \
+ [19, 101, 32, 153]\n \
+ [54, 12, 242, 96]"
+ },
+ {
+ "type": "image",
+ "image": f"data:image;base64,{base64_image}",
+ }
+ ]
+ }
+ ]
+ return messages
+
+
+def create_apply_editing_messages_qwen2(editing_prompt, base64_image):
+ messages = [
+ {
+ "role": "system",
+ "content": [
+ {
+ "type": "text",
+ "text": "I will provide an image along with an editing instruction. Please describe the new content that should be present in the image after applying the instruction. \n \
+ For example, if the original image content shows a grandmother wearing a mask and the instruction is 'remove the mask', your output should be: 'a grandmother'. The output should only include elements that remain in the image after the edit and should not mention elements that have been changed or removed, such as 'mask' in this example. Do not output 'sorry, xxx', even if it's a guess, directly output the answer you think is correct."
+ },]
+ },
+ {
+ "role": "user",
+ "content": [
+ {
+ "type": "text",
+ "text": editing_prompt
+ },
+ {
+ "type": "image",
+ "image": f"data:image;base64,{base64_image}",
+ },
+ ]
+ }
+ ]
+ return messages
diff --git a/app/gpt4_o/run_app.sh b/app/run_app.sh
similarity index 65%
rename from app/gpt4_o/run_app.sh
rename to app/run_app.sh
index b064a0d6b4c233086ef28c8b425e7dfd3fb209f2..9aa0c5448254694ab9bb178a969e4d19cc3d5f7f 100644
--- a/app/gpt4_o/run_app.sh
+++ b/app/run_app.sh
@@ -2,4 +2,4 @@ export PYTHONPATH=.:$PYTHONPATH
export CUDA_VISIBLE_DEVICES=0
-python app/gpt4_o/brushedit_app.py
\ No newline at end of file
+python app/src/brushedit_app.py
\ No newline at end of file
diff --git a/app/src/aspect_ratio_template.py b/app/src/aspect_ratio_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..a08e13be514a25a326118eb0d18e8c44ad043170
--- /dev/null
+++ b/app/src/aspect_ratio_template.py
@@ -0,0 +1,88 @@
+# From https://github.com/TencentARC/PhotoMaker/pull/120 written by https://github.com/DiscoNova
+# Note: Since output width & height need to be divisible by 8, the w & h values do
+# not exactly match the stated aspect ratios... but they are "close enough" :)
+
+aspect_ratio_list = [
+ {
+ "name": "Small Square (1:1)",
+ "w": 640,
+ "h": 640,
+ },
+ {
+ "name": "Custom resolution",
+ "w": "",
+ "h": "",
+ },
+ {
+ "name": "Instagram (1:1)",
+ "w": 1024,
+ "h": 1024,
+ },
+ {
+ "name": "35mm film / Landscape (3:2)",
+ "w": 1024,
+ "h": 680,
+ },
+ {
+ "name": "35mm film / Portrait (2:3)",
+ "w": 680,
+ "h": 1024,
+ },
+ {
+ "name": "CRT Monitor / Landscape (4:3)",
+ "w": 1024,
+ "h": 768,
+ },
+ {
+ "name": "CRT Monitor / Portrait (3:4)",
+ "w": 768,
+ "h": 1024,
+ },
+ {
+ "name": "Widescreen TV / Landscape (16:9)",
+ "w": 1024,
+ "h": 576,
+ },
+ {
+ "name": "Widescreen TV / Portrait (9:16)",
+ "w": 576,
+ "h": 1024,
+ },
+ {
+ "name": "Widescreen Monitor / Landscape (16:10)",
+ "w": 1024,
+ "h": 640,
+ },
+ {
+ "name": "Widescreen Monitor / Portrait (10:16)",
+ "w": 640,
+ "h": 1024,
+ },
+ {
+ "name": "Cinemascope (2.39:1)",
+ "w": 1024,
+ "h": 424,
+ },
+ {
+ "name": "Widescreen Movie (1.85:1)",
+ "w": 1024,
+ "h": 552,
+ },
+ {
+ "name": "Academy Movie (1.37:1)",
+ "w": 1024,
+ "h": 744,
+ },
+ {
+ "name": "Sheet-print (A-series) / Landscape (297:210)",
+ "w": 1024,
+ "h": 720,
+ },
+ {
+ "name": "Sheet-print (A-series) / Portrait (210:297)",
+ "w": 720,
+ "h": 1024,
+ },
+]
+
+aspect_ratios = {k["name"]: (k["w"], k["h"]) for k in aspect_ratio_list}
diff --git a/app/src/base_model_template.py b/app/src/base_model_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..93e22f57637e2a533fc573ed268e4256ab1d39d2
--- /dev/null
+++ b/app/src/base_model_template.py
@@ -0,0 +1,61 @@
+import os
+import torch
+from huggingface_hub import snapshot_download
+
+from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
+
+
+
+torch_dtype = torch.float16
+device = "cpu"
+
+BrushEdit_path = "models/"
+if not os.path.exists(BrushEdit_path):
+ BrushEdit_path = snapshot_download(
+ repo_id="TencentARC/BrushEdit",
+ local_dir=BrushEdit_path,
+ token=os.getenv("HF_TOKEN"),
+ )
+brushnet_path = os.path.join(BrushEdit_path, "brushnetX")
+brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch_dtype)
+
+
+base_models_list = [
+ {
+ "name": "dreamshaper_8 (Preload)",
+ "local_path": "models/base_model/dreamshaper_8",
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
+ "models/base_model/dreamshaper_8", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
+ ).to(device)
+ },
+ {
+ "name": "epicrealism (Preload)",
+ "local_path": "models/base_model/epicrealism_naturalSinRC1VAE",
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
+ "models/base_model/epicrealism_naturalSinRC1VAE", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
+ ).to(device)
+ },
+ {
+ "name": "henmixReal (Preload)",
+ "local_path": "models/base_model/henmixReal_v5c",
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
+ "models/base_model/henmixReal_v5c", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
+ ).to(device)
+ },
+ {
+ "name": "meinamix (Preload)",
+ "local_path": "models/base_model/meinamix_meinaV11",
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
+ "models/base_model/meinamix_meinaV11", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
+ ).to(device)
+ },
+ {
+ "name": "realisticVision (Default)",
+ "local_path": "models/base_model/realisticVisionV60B1_v51VAE",
+ "pipe": StableDiffusionBrushNetPipeline.from_pretrained(
+ "models/base_model/realisticVisionV60B1_v51VAE", brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
+ ).to(device)
+ },
+]
+
+base_models_template = {k["name"]: (k["local_path"], k["pipe"]) for k in base_models_list}
diff --git a/app/gpt4_o/brushedit_all_in_one_pipeline.py b/app/src/brushedit_all_in_one_pipeline.py
similarity index 75%
rename from app/gpt4_o/brushedit_all_in_one_pipeline.py
rename to app/src/brushedit_all_in_one_pipeline.py
index 05a9192bd867c1c3e5a8f4b13ea988107e794bde..44083d1c28ebf8f5274a86f0f73eaf8f6baa5a1c 100644
--- a/app/gpt4_o/brushedit_all_in_one_pipeline.py
+++ b/app/src/brushedit_all_in_one_pipeline.py
@@ -22,10 +22,6 @@ def BrushEdit_Pipeline(pipe,
mask_np = mask_np / 255
height, width = mask_np.shape[0], mask_np.shape[1]
- # back/foreground
- # if mask_np[94:547,94:546].sum() < mask_np.sum() - mask_np[94:547,94:546].sum() and mask_np[0,:].sum()>0 and mask_np[-1,:].sum()>0 and mask_np[:,0].sum()>0 and mask_np[:,-1].sum()>0 and mask_np[1,:].sum()>0 and mask_np[-2,:].sum()>0 and mask_np[:,1].sum()>0 and mask_np[:,-2].sum()>0 :
- # mask_np = 1 - mask_np
-
## resize the mask and original image to the same size which is divisible by vae_scale_factor
image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
height_new, width_new = image_processor.get_default_height_width(original_image, height, width)
@@ -53,16 +49,13 @@ def BrushEdit_Pipeline(pipe,
height=height_new,
width=width_new,
).images
-
+ ## convert to vae shape format, must be divisible by 8
+ original_image_pil = Image.fromarray(original_image).convert("RGB")
+ init_image_np = np.array(image_processor.preprocess(original_image_pil, height=height_new, width=width_new).squeeze())
+ init_image_np = ((init_image_np.transpose(1,2,0) + 1.) / 2.) * 255
+ init_image_np = init_image_np.astype(np.uint8)
if blending:
-
mask_blurred = mask_blurred * 0.5 + 0.5
-
- ## convert to vae shape format, must be divisible by 8
- original_image_pil = Image.fromarray(original_image).convert("RGB")
- init_image_np = np.array(image_processor.preprocess(original_image_pil, height=height_new, width=width_new).squeeze())
- init_image_np = ((init_image_np.transpose(1,2,0) + 1.) / 2.) * 255
- init_image_np = init_image_np.astype(np.uint8)
image_all = []
for image_i in images:
image_np = np.array(image_i)
@@ -75,6 +68,6 @@ def BrushEdit_Pipeline(pipe,
image_all = images
- return image_all, mask_image
+ return image_all, mask_image, mask_np, init_image_np
diff --git a/app/src/brushedit_app.py b/app/src/brushedit_app.py
new file mode 100644
index 0000000000000000000000000000000000000000..68758961a19a37aa64763b587d0f322ec2a06aef
--- /dev/null
+++ b/app/src/brushedit_app.py
@@ -0,0 +1,1690 @@
+##!/usr/bin/python3
+# -*- coding: utf-8 -*-
+import os, random, sys
+import numpy as np
+import requests
+import torch
+import spaces
+
+
+import gradio as gr
+
+from PIL import Image
+
+
+from huggingface_hub import hf_hub_download, snapshot_download
+from scipy.ndimage import binary_dilation, binary_erosion
+from transformers import (LlavaNextProcessor, LlavaNextForConditionalGeneration,
+ Qwen2VLForConditionalGeneration, Qwen2VLProcessor)
+
+from segment_anything import SamPredictor, build_sam, SamAutomaticMaskGenerator
+from diffusers import StableDiffusionBrushNetPipeline, BrushNetModel, UniPCMultistepScheduler
+from diffusers.image_processor import VaeImageProcessor
+
+
+from app.src.vlm_pipeline import (
+ vlm_response_editing_type,
+ vlm_response_object_wait_for_edit,
+ vlm_response_mask,
+ vlm_response_prompt_after_apply_instruction
+)
+from app.src.brushedit_all_in_one_pipeline import BrushEdit_Pipeline
+from app.utils.utils import load_grounding_dino_model
+
+from app.src.vlm_template import vlms_template
+from app.src.base_model_template import base_models_template
+from app.src.aspect_ratio_template import aspect_ratios
+
+from openai import OpenAI
+# base_openai_url = ""
+
#### Description ####
# Raw HTML/text fragments injected into the Gradio page header.
logo = r"""

"""
# Title banner shown at the top of the demo page.
head = r"""


BrushEdit: All-In-One Image Inpainting and Editing



"""
# Short blurb rendered under the title. ("via a inpainting" -> "via an inpainting")
descriptions = r"""
Official Gradio Demo for BrushEdit: All-In-One Image Inpainting and Editing
🧙 BrushEdit enables precise, user-friendly instruction-based image editing via an inpainting model.
"""
+
# Usage walkthrough shown in the UI. Grammar fixed:
# "to automatic edit image" -> "to automatically edit the image",
# "And You can also" -> "You can also".
instructions = r"""
Currently, we support two modes: fully automated command editing and interactive command editing.

🛠️ Fully automated instruction-based editing:

 - ⭐️ 1.Choose Image: Upload or select one image from Example.
 - ⭐️ 2.Input ⌨️ Instructions: Input the instructions (supports addition, deletion, and modification), e.g. remove xxx .
 - ⭐️ 3.Run: Click 💫 Run button to automatically edit the image.


🛠️ Interactive instruction-based editing:

 - ⭐️ 1.Choose Image: Upload or select one image from Example.
 - ⭐️ 2.Finely Brushing: Use a brush to outline the area you want to edit. You can also use the eraser to restore.
 - ⭐️ 3.Input ⌨️ Instructions: Input the instructions.
 - ⭐️ 4.Run: Click 💫 Run button to automatically edit the image.


 We strongly recommend using GPT-4o for reasoning. After selecting the VLM model as gpt4-o, enter the API KEY and click the Submit and Verify button. If the output is success, you can use gpt4-o normally. Secondarily, we recommend using the Qwen2VL model.

 We recommend zooming out in your browser for a better viewing range and experience.

 For more detailed feature descriptions, see the bottom.

☕️ Have fun! 🎄 Wishing you a merry Christmas!
 """
+
# Feature tips shown in the UI. Fixes: "beeter" -> "better" (x2); the Base
# Model bullet pointed at vlm_template.py but the base-model presets live in
# base_model_template.py (see update_base_model's comment).
tips = r"""
💡 Some Tips:

 - 🤠 After input the instructions, you can click the Generate Mask button. The mask generated by VLM will be displayed in the preview panel on the right side.
 - 🤠 After generating the mask or when you use the brush to draw the mask, you can perform operations such as randomization, dilation, erosion, and movement.
 - 🤠 After input the instructions, you can click the Generate Target Prompt button. The target prompt will be displayed in the text box, and you can modify it according to your ideas.


💡 Detailed Features:

 - 🎨 Aspect Ratio: Select the aspect ratio of the image. To prevent OOM, 1024px is the maximum resolution.
 - 🎨 VLM Model: Select the VLM model. We use preloaded models to save time. To use other VLM models, download them and uncomment the relevant lines in vlm_template.py from our GitHub repo.
 - 🎨 Generate Mask: According to the input instructions, generate a mask for the area that may need to be edited.
 - 🎨 Square/Circle Mask: Based on the existing mask, generate masks for squares and circles. (The coarse-grained mask provides more editing imagination.)
 - 🎨 Invert Mask: Invert the mask to generate a new mask.
 - 🎨 Dilation/Erosion Mask: Expand or shrink the mask to include or exclude more areas.
 - 🎨 Move Mask: Move the mask to a new position.
 - 🎨 Generate Target Prompt: Generate a target prompt based on the input instructions.
 - 🎨 Target Prompt: Description for masking area, manual input or modification can be made when the content generated by VLM does not meet expectations.
 - 🎨 Blending: Blending brushnet's output and the original input, ensuring the original image details in the unedited areas. (turn off is better when removing.)
 - 🎨 Control length: The intensity of editing and inpainting.


💡 Advanced Features:

 - 🎨 Base Model: We use preloaded models to save time. To use other base models, download them and uncomment the relevant lines in base_model_template.py from our GitHub repo.
 - 🎨 Blending: Blending brushnet's output and the original input, ensuring the original image details in the unedited areas. (turn off is better when removing.)
 - 🎨 Control length: The intensity of editing and inpainting.
 - 🎨 Num samples: The number of samples to generate.
 - 🎨 Negative prompt: The negative prompt for the classifier-free guidance.
 - 🎨 Guidance scale: The guidance scale for the classifier-free guidance.



"""
+
+
+
# Footer markdown: star badge, BibTeX citation, contact e-mail.
# Fixed the duplicated "and and" in the author list.
citation = r"""
If BrushEdit is helpful, please help to ⭐ the Github Repo. Thanks!
[![GitHub Stars](https://img.shields.io/github/stars/TencentARC/BrushEdit?style=social)](https://github.com/TencentARC/BrushEdit)
---
📝 **Citation**

If our work is useful for your research, please consider citing:
```bibtex
@misc{li2024brushedit,
 title={BrushEdit: All-In-One Image Inpainting and Editing},
 author={Yaowei Li and Yuxuan Bian and Xuan Ju and Zhaoyang Zhang and Junhao Zhuang and Ying Shan and Yuexian Zou and Qiang Xu},
 year={2024},
 eprint={2412.10316},
 archivePrefix={arXiv},
 primaryClass={cs.CV}
}
```
📧 **Contact**

If you have any questions, please feel free to reach me out at liyaowei@gmail.com.
"""
+
# - - - - - examples - - - - - #
# Preset demo cases. Each entry is:
#   [input RGBA image, instruction text, seed, example id, example id (again),
#    blending flag, <bool flag>, VLM model label]
# NOTE(review): field meanings inferred from the values; confirm against the
# `inputs=` list of the gr.Examples component that consumes EXAMPLES.
EXAMPLES = [

    [
        Image.open("./assets/frog/frog.jpeg").convert("RGBA"),
        "add a magic hat on frog head.",
        642087011,
        "frog",
        "frog",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/chinese_girl/chinese_girl.png").convert("RGBA"),
        "replace the background to ancient China.",
        648464818,
        "chinese_girl",
        "chinese_girl",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/angel_christmas/angel_christmas.png").convert("RGBA"),
        "remove the deer.",
        648464818,
        "angel_christmas",
        "angel_christmas",
        False,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/sunflower_girl/sunflower_girl.png").convert("RGBA"),
        "add a wreath on head.",
        648464818,
        "sunflower_girl",
        "sunflower_girl",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/girl_on_sun/girl_on_sun.png").convert("RGBA"),
        "add a butterfly fairy.",
        648464818,
        "girl_on_sun",
        "girl_on_sun",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/spider_man_rm/spider_man.png").convert("RGBA"),
        "remove the christmas hat.",
        642087011,
        "spider_man_rm",
        "spider_man_rm",
        False,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/anime_flower/anime_flower.png").convert("RGBA"),
        "remove the flower.",
        642087011,
        "anime_flower",
        "anime_flower",
        False,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/chenduling/chengduling.jpg").convert("RGBA"),
        "replace the clothes to a delicated floral skirt.",
        648464818,
        "chenduling",
        "chenduling",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],
    [
        Image.open("./assets/hedgehog_rp_bg/hedgehog.png").convert("RGBA"),
        "make the hedgehog in Italy.",
        648464818,
        "hedgehog_rp_bg",
        "hedgehog_rp_bg",
        True,
        False,
        "GPT4-o (Highly Recommended)"
    ],

]
+
# Asset lookup tables for the bundled examples, all keyed by the example id
# used in EXAMPLES above.
# Source images:
INPUT_IMAGE_PATH = {
    "frog": "./assets/frog/frog.jpeg",
    "chinese_girl": "./assets/chinese_girl/chinese_girl.png",
    "angel_christmas": "./assets/angel_christmas/angel_christmas.png",
    "sunflower_girl": "./assets/sunflower_girl/sunflower_girl.png",
    "girl_on_sun": "./assets/girl_on_sun/girl_on_sun.png",
    "spider_man_rm": "./assets/spider_man_rm/spider_man.png",
    "anime_flower": "./assets/anime_flower/anime_flower.png",
    "chenduling": "./assets/chenduling/chengduling.jpg",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/hedgehog.png",
}
# Precomputed edit masks:
MASK_IMAGE_PATH = {
    "frog": "./assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
    "chinese_girl": "./assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png",
    "angel_christmas": "./assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
    "sunflower_girl": "./assets/sunflower_girl/mask_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
    "girl_on_sun": "./assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png",
    "spider_man_rm": "./assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
    "anime_flower": "./assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png",
    "chenduling": "./assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/mask_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
}
# Source images with the mask region blanked out:
MASKED_IMAGE_PATH = {
    "frog": "./assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png",
    "chinese_girl": "./assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png",
    "angel_christmas": "./assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png",
    "sunflower_girl": "./assets/sunflower_girl/masked_image_99cc50b4-7dc4-4de5-8748-ec10772f0317.png",
    "girl_on_sun": "./assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png",
    "spider_man_rm": "./assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png",
    "anime_flower": "./assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png",
    "chenduling": "./assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/masked_image_db7f8bf8-8349-46d3-b14e-43d67fbe25d3.png",
}
# Reference edited results:
OUTPUT_IMAGE_PATH = {
    "frog": "./assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png",
    "chinese_girl": "./assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png",
    "angel_christmas": "./assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png",
    "sunflower_girl": "./assets/sunflower_girl/image_edit_99cc50b4-7dc4-4de5-8748-ec10772f0317_3.png",
    "girl_on_sun": "./assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png",
    "spider_man_rm": "./assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png",
    "anime_flower": "./assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png",
    "chenduling": "./assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png",
    "hedgehog_rp_bg": "./assets/hedgehog_rp_bg/image_edit_db7f8bf8-8349-46d3-b14e-43d67fbe25d3_3.png",
}
+
# os.environ['GRADIO_TEMP_DIR'] = 'gradio_temp_dir'
# os.makedirs('gradio_temp_dir', exist_ok=True)

# Dropdown choices and defaults for the UI selectors, derived from the
# preset tables in the *_template modules.
VLM_MODEL_NAMES = list(vlms_template.keys())
DEFAULT_VLM_MODEL_NAME = "Qwen2-VL-7B-Instruct (Default)"
BASE_MODELS = list(base_models_template.keys())
DEFAULT_BASE_MODEL = "realisticVision (Default)"

ASPECT_RATIO_LABELS = list(aspect_ratios)
DEFAULT_ASPECT_RATIO = ASPECT_RATIO_LABELS[0]
+
+
## init device: prefer CUDA, then Apple-silicon MPS, else CPU.
try:
    if torch.cuda.is_available():
        device = "cuda"
    elif sys.platform == "darwin" and torch.backends.mps.is_available():
        device = "mps"
    else:
        device = "cpu"
except Exception:
    # Backend probing can raise on partial/broken torch builds; fall back to
    # CPU rather than crash at import time. (Was a bare `except:`, which would
    # also swallow KeyboardInterrupt/SystemExit.)
    device = "cpu"

## init torch dtype: half precision everywhere — the preloaded pipelines below
## are loaded with this dtype.
torch_dtype = torch.float16
+
+
+
# download hf models
# Fetch the BrushEdit weight bundle from the Hugging Face Hub unless a local
# "models/" directory already exists.
BrushEdit_path = "models/"
if not os.path.exists(BrushEdit_path):
    BrushEdit_path = snapshot_download(
        repo_id="TencentARC/BrushEdit",
        local_dir=BrushEdit_path,
        token=os.getenv("HF_TOKEN"),
        )

## init default VLM (preloaded instance comes from vlms_template)
vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[DEFAULT_VLM_MODEL_NAME]
if vlm_processor != "" and vlm_model != "":
    vlm_model.to(device)
else:
    # NOTE(review): gr.Error is constructed but not raised here, so a missing
    # default VLM is silently ignored — confirm whether `raise` was intended.
    gr.Error("Please Download default VLM model "+ DEFAULT_VLM_MODEL_NAME +" first.")


## init base model: checkpoint paths inside the downloaded bundle
base_model_path = os.path.join(BrushEdit_path, "base_model/realisticVisionV60B1_v51VAE")
brushnet_path = os.path.join(BrushEdit_path, "brushnetX")
sam_path = os.path.join(BrushEdit_path, "sam/sam_vit_h_4b8939.pth")
groundingdino_path = os.path.join(BrushEdit_path, "grounding_dino/groundingdino_swint_ogc.pth")


# input brushnetX ckpt path
brushnet = BrushNetModel.from_pretrained(brushnet_path, torch_dtype=torch_dtype)
pipe = StableDiffusionBrushNetPipeline.from_pretrained(
    base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
    )
# speed up diffusion process with faster scheduler and memory optimization
pipe.scheduler = UniPCMultistepScheduler.from_config(pipe.scheduler.config)
# remove following line if xformers is not installed or when using Torch 2.0.
# pipe.enable_xformers_memory_efficient_attention()
pipe.enable_model_cpu_offload()


## init SAM: predictor for point/box prompts, generator for automatic masks
sam = build_sam(checkpoint=sam_path)
sam.to(device=device)
sam_predictor = SamPredictor(sam)
sam_automask_generator = SamAutomaticMaskGenerator(sam)

## init groundingdino_model (open-vocabulary detector used for text->box)
config_file = 'app/utils/GroundingDINO_SwinT_OGC.py'
groundingdino_model = load_grounding_dino_model(config_file, groundingdino_path, device=device)
+
## Ordinary function
def crop_and_resize(image: Image.Image,
                    target_width: int,
                    target_height: int) -> Image.Image:
    """
    Center-crops an image to the target aspect ratio, then resizes it.

    Args:
        image (Image.Image): Input PIL image to be cropped and resized.
        target_width (int): Target width of the output image.
        target_height (int): Target height of the output image.

    Returns:
        Image.Image: Image of exactly (target_width, target_height).
    """
    # Original dimensions
    original_width, original_height = image.size
    original_aspect = original_width / original_height
    target_aspect = target_width / target_height

    # Calculate a centered crop box matching the target aspect ratio.
    # Integer division fixes the original code's fractional pixel coordinates
    # (e.g. left = 1.5) that `/ 2` produced for odd margins.
    if original_aspect > target_aspect:
        # Image too wide: crop the left/right edges.
        new_width = int(original_height * target_aspect)
        left = (original_width - new_width) // 2
        box = (left, 0, left + new_width, original_height)
    else:
        # Image too tall: crop the top/bottom edges.
        new_height = int(original_width / target_aspect)
        top = (original_height - new_height) // 2
        box = (0, top, original_width, top + new_height)

    # Crop and resize; NEAREST keeps the original (non-interpolating) behavior.
    cropped_image = image.crop(box)
    resized_image = cropped_image.resize((target_width, target_height), Image.NEAREST)
    return resized_image
+
+
## Ordinary function
def resize(image: Image.Image,
           target_width: int,
           target_height: int) -> Image.Image:
    """
    Resizes an image to the target size WITHOUT cropping or preserving the
    aspect ratio (the previous docstring was a copy-paste of crop_and_resize).

    Args:
        image (Image.Image): Input PIL image to be resized.
        target_width (int): Target width of the output image.
        target_height (int): Target height of the output image.

    Returns:
        Image.Image: Resized image (nearest-neighbor resampling).
    """
    resized_image = image.resize((target_width, target_height), Image.NEAREST)
    return resized_image
+
+
def move_mask_func(mask, direction, units):
    """
    Shift a binary mask by `units` pixels in the given direction.

    Args:
        mask: array of shape (H, W) or (H, W, 1); nonzero means foreground.
        direction: one of 'up', 'down', 'left', 'right'.
        units: non-negative shift in pixels (pixels shifted off the edge are
            dropped; the vacated area is filled with False).

    Returns:
        2-D boolean array of shape (H, W).
    """
    binary_mask = mask.squeeze()>0
    rows, cols = binary_mask.shape
    moved_mask = np.zeros_like(binary_mask, dtype=bool)

    # Fix: the original left/right comments were swapped and misleading.
    if direction == 'down':
        # shift content downward
        moved_mask[max(0, units):, :] = binary_mask[:rows - units, :]

    elif direction == 'up':
        # shift content upward
        moved_mask[:rows - units, :] = binary_mask[units:, :]

    elif direction == 'right':
        # shift content to the right
        moved_mask[:, max(0, units):] = binary_mask[:, :cols - units]

    elif direction == 'left':
        # shift content to the left
        moved_mask[:, :cols - units] = binary_mask[:, units:]

    return moved_mask
+
+
def random_mask_func(mask, dilation_type='square', dilation_size=20):
    """
    Derive a coarser mask from `mask`.

    Args:
        mask: array of shape (H, W) or (H, W, 1); nonzero means foreground.
        dilation_type: one of 'square_dilation', 'square_erosion',
            'bounding_box', 'bounding_ellipse'.
        dilation_size: side length of the square structuring element used by
            the dilation/erosion modes.

    Returns:
        uint8 array of shape (H, W, 1) with values in {0, 255}. The input
        `mask` is returned unchanged when it contains no foreground pixels
        (bounding modes only).

    Raises:
        ValueError: if `dilation_type` is not one of the supported modes.
    """
    binary_mask = mask.squeeze()>0

    if dilation_type == 'square_dilation':
        structure = np.ones((dilation_size, dilation_size), dtype=bool)
        dilated_mask = binary_dilation(binary_mask, structure=structure)
    elif dilation_type == 'square_erosion':
        structure = np.ones((dilation_size, dilation_size), dtype=bool)
        dilated_mask = binary_erosion(binary_mask, structure=structure)
    elif dilation_type == 'bounding_box':
        rows, cols = np.where(binary_mask)
        if len(rows) == 0 or len(cols) == 0:
            return mask  # return original mask if no valid points

        min_row, max_row = np.min(rows), np.max(rows)
        min_col, max_col = np.min(cols), np.max(cols)

        # Axis-aligned bounding box of the foreground.
        dilated_mask = np.zeros_like(binary_mask, dtype=bool)
        dilated_mask[min_row:max_row + 1, min_col:max_col + 1] = True

    elif dilation_type == 'bounding_ellipse':
        rows, cols = np.where(binary_mask)
        if len(rows) == 0 or len(cols) == 0:
            return mask  # return original mask if no valid points

        min_row, max_row = np.min(rows), np.max(rows)
        min_col, max_col = np.min(cols), np.max(cols)

        # Center and semi-axes of the bounding ellipse. Clamp the semi-axes to
        # >= 1 so degenerate (single-row/column) masks don't divide by zero,
        # which crashed the original implementation.
        center = ((min_col + max_col) // 2, (min_row + max_row) // 2)
        a = max((max_col - min_col) // 2, 1)  # horizontal semi-axis
        b = max((max_row - min_row) // 2, 1)  # vertical semi-axis

        y, x = np.ogrid[:mask.shape[0], :mask.shape[1]]
        ellipse_mask = ((x - center[0])**2 / a**2 + (y - center[1])**2 / b**2) <= 1
        dilated_mask = np.zeros_like(binary_mask, dtype=bool)
        dilated_mask[ellipse_mask] = True
    else:
        # Fix: the original message listed options ('square'/'ellipse') that
        # this function never accepted.
        raise ValueError(
            "dilation_type must be 'square_dilation', 'square_erosion', "
            "'bounding_box' or 'bounding_ellipse'")

    # Convert back to the (H, W, 1) uint8 {0, 255} convention.
    dilated_mask = np.uint8(dilated_mask[:,:,np.newaxis]) * 255
    return dilated_mask
+
+
## Gradio component function
def update_vlm_model(vlm_name):
    """Switch the global VLM to `vlm_name`.

    Frees the currently loaded model, then either reuses the preloaded
    instance from vlms_template or loads the weights from disk / the Hub.
    Returns "success" normally.

    NOTE(review): the two preloaded-model branches return the (module-level)
    `vlm_model_dropdown` component instead of "success" — confirm the caller
    tolerates both return types, and that `vlm_model_dropdown` is defined
    before this can run.
    """
    global vlm_model, vlm_processor
    if vlm_model is not None:
        del vlm_model
        torch.cuda.empty_cache()

    # Preset tuple: (family tag, local path, preloaded processor, preloaded model).
    vlm_type, vlm_local_path, vlm_processor, vlm_model = vlms_template[vlm_name]

    ## we recommend using preload models, otherwise it will take a long time to download the model. you can edit the code via vlm_template.py
    if vlm_type == "llava-next":
        if vlm_processor != "" and vlm_model != "":
            # Preloaded instance available: just move it to the device.
            vlm_model.to(device)
            return vlm_model_dropdown
        else:
            if os.path.exists(vlm_local_path):
                vlm_processor = LlavaNextProcessor.from_pretrained(vlm_local_path)
                vlm_model = LlavaNextForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
            else:
                # No local copy: pull the matching checkpoint from the Hub.
                if vlm_name == "llava-v1.6-mistral-7b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llama3-llava-next-8b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llama3-llava-next-8b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llama3-llava-next-8b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-v1.6-vicuna-13b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-v1.6-34b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-v1.6-34b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-v1.6-34b-hf", torch_dtype="auto", device_map="auto")
                elif vlm_name == "llava-next-72b-hf (Preload)":
                    vlm_processor = LlavaNextProcessor.from_pretrained("llava-hf/llava-next-72b-hf")
                    vlm_model = LlavaNextForConditionalGeneration.from_pretrained("llava-hf/llava-next-72b-hf", torch_dtype="auto", device_map="auto")
    elif vlm_type == "qwen2-vl":
        if vlm_processor != "" and vlm_model != "":
            # Preloaded instance available: just move it to the device.
            vlm_model.to(device)
            return vlm_model_dropdown
        else:
            if os.path.exists(vlm_local_path):
                vlm_processor = Qwen2VLProcessor.from_pretrained(vlm_local_path)
                vlm_model = Qwen2VLForConditionalGeneration.from_pretrained(vlm_local_path, torch_dtype="auto", device_map="auto")
            else:
                if vlm_name == "qwen2-vl-2b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-2B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-2B-Instruct", torch_dtype="auto", device_map="auto")
                elif vlm_name == "qwen2-vl-7b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-7B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-7B-Instruct", torch_dtype="auto", device_map="auto")
                elif vlm_name == "qwen2-vl-72b-instruct (Preload)":
                    vlm_processor = Qwen2VLProcessor.from_pretrained("Qwen/Qwen2-VL-72B-Instruct")
                    vlm_model = Qwen2VLForConditionalGeneration.from_pretrained("Qwen/Qwen2-VL-72B-Instruct", torch_dtype="auto", device_map="auto")
    elif vlm_type == "openai":
        # OpenAI models are configured via submit_GPT4o_KEY, nothing to load.
        pass
    return "success"
+
+
def update_base_model(base_model_name):
    """Swap the global diffusion pipeline to the selected base model.

    Reuses the preloaded pipeline from base_models_template when available,
    otherwise loads the checkpoint from disk. Returns "success"; raises
    gr.Error if the model files are missing.
    """
    global pipe
    ## we recommend using preload models, otherwise it will take a long time to download the model. you can edit the code via base_model_template.py
    if pipe is not None:
        del pipe
        torch.cuda.empty_cache()
    # Preset tuple: (checkpoint path, preloaded pipeline or "").
    base_model_path, pipe = base_models_template[base_model_name]
    if pipe != "":
        pipe.to(device)
    else:
        if os.path.exists(base_model_path):
            pipe = StableDiffusionBrushNetPipeline.from_pretrained(
                base_model_path, brushnet=brushnet, torch_dtype=torch_dtype, low_cpu_mem_usage=False
                )
            # pipe.enable_xformers_memory_efficient_attention()
            pipe.enable_model_cpu_offload()
        else:
            raise gr.Error(f"The base model {base_model_name} does not exist")
    return "success"
+
+
def submit_GPT4o_KEY(GPT4o_KEY):
    """Validate an OpenAI API key with a one-shot chat completion.

    On success the global VLM becomes the OpenAI client (processor cleared);
    returns a status string plus the dropdown label to select.
    """
    global vlm_model, vlm_processor

    # Drop whatever VLM is currently loaded to free GPU memory.
    if vlm_model is not None:
        del vlm_model
        torch.cuda.empty_cache()

    try:
        client = OpenAI(api_key=GPT4o_KEY)
        vlm_model = client
        vlm_processor = ""
        probe_messages = [
            {"role": "system", "content": "You are a helpful assistant."},
            {"role": "user", "content": "Say this is a test"}
        ]
        # A trivial completion proves the key is usable.
        response = client.chat.completions.create(
            model="gpt-4o-2024-08-06",
            messages=probe_messages
        )
        reply_text = response.choices[0].message.content
        return "Success, " + reply_text, "GPT4-o (Highly Recommended)"
    except Exception:
        return "Invalid GPT4o API Key", "GPT4-o (Highly Recommended)"
+
+
+
@spaces.GPU(duration=180)
def process(input_image,
    original_image,
    original_mask,
    prompt,
    negative_prompt,
    control_strength,
    seed,
    randomize_seed,
    guidance_scale,
    num_inference_steps,
    num_samples,
    blending,
    category,
    target_prompt,
    resize_default,
    aspect_ratio_name,
    invert_mask_state):
    """Run the full BrushEdit flow: resolve inputs, obtain a mask (brush or
    VLM), obtain a target prompt (user or VLM), then run the BrushNet pipeline.

    Returns (edited images, [mask image], [masked input], prompt, '',
    target prompt, False) — the trailing False resets invert_mask_state.
    """
    # Fall back to the editor's background layer when no cached original exists.
    if original_image is None:
        if input_image is None:
            raise gr.Error('Please upload the input image')
        else:
            image_pil = input_image["background"].convert("RGB")
            original_image = np.array(image_pil)
    if prompt is None or prompt == "":
        raise gr.Error("Please input your instructions, e.g., remove the xxx")

    # Brush strokes live in the alpha channel of the first editor layer.
    # NOTE(review): this is read even when original_image was supplied and
    # input_image is None, which would raise — confirm callers always pass
    # input_image.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution" preset: keep the source image's size.
        output_h, output_w = original_image.shape[:2]

        if resize_default:
            # Normalize the short side to 640 px, scaling image and masks alike.
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        if resize_default:
            # Fixed aspect-ratio preset: same 640-px short-side normalization.
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)

    # Mask precedence: an inverted mask wins; otherwise fresh brush strokes
    # override the cached mask. (The no-op assignments are kept as-is.)
    if invert_mask_state:
        original_mask = original_mask
    else:
        if input_mask.max() == 0:
            original_mask = original_mask
        else:
            original_mask = input_mask



    # Ask the VLM to classify the edit type unless the UI already set it.
    if category is not None:
        pass
    else:
        category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)


    if original_mask is not None:
        original_mask = np.clip(original_mask, 0, 255).astype(np.uint8)
    else:
        # No mask yet: ask the VLM what to edit, then ground it with
        # GroundingDINO + SAM to produce an (H, W, 1) mask in [0, 255].
        object_wait_for_edit = vlm_response_object_wait_for_edit(
                                                vlm_processor,
                                                vlm_model,
                                                original_image,
                                                category,
                                                prompt,
                                                device)

        original_mask = vlm_response_mask(vlm_processor,
                                          vlm_model,
                                          category,
                                          original_image,
                                          prompt,
                                          object_wait_for_edit,
                                          sam,
                                          sam_predictor,
                                          sam_automask_generator,
                                          groundingdino_model,
                                          device)
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]


    # Use the user's target prompt when given; otherwise derive one via VLM.
    if len(target_prompt) <= 1:
        prompt_after_apply_instruction = vlm_response_prompt_after_apply_instruction(
                                                                    vlm_processor,
                                                                    vlm_model,
                                                                    original_image,
                                                                    prompt,
                                                                    device)
    else:
        prompt_after_apply_instruction = target_prompt

    generator = torch.Generator(device).manual_seed(random.randint(0, 2147483647) if randomize_seed else seed)


    with torch.autocast(device):
        image, mask_image, mask_np, init_image_np = BrushEdit_Pipeline(pipe,
                                    prompt_after_apply_instruction,
                                    original_mask,
                                    original_image,
                                    generator,
                                    num_inference_steps,
                                    guidance_scale,
                                    control_strength,
                                    negative_prompt,
                                    num_samples,
                                    blending)
    # Use the pipeline's VAE-aligned copy of the input for the masked preview.
    original_image = np.array(init_image_np)
    masked_image = original_image * (1 - (mask_np>0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)
    # Save the images (optional)
    # import uuid
    # uuid = str(uuid.uuid4())
    # image[0].save(f"outputs/image_edit_{uuid}_0.png")
    # image[1].save(f"outputs/image_edit_{uuid}_1.png")
    # image[2].save(f"outputs/image_edit_{uuid}_2.png")
    # image[3].save(f"outputs/image_edit_{uuid}_3.png")
    # mask_image.save(f"outputs/mask_{uuid}.png")
    # masked_image.save(f"outputs/masked_image_{uuid}.png")
    return image, [mask_image], [masked_image], prompt, '', prompt_after_apply_instruction, False
+
+
def generate_target_prompt(input_image,
                           original_image,
                           prompt):
    """Ask the VLM for the post-edit description implied by `prompt`.

    Returns the refined prompt twice: once for the textbox, once for state.
    """
    # Examples pass a path string here; fall back to the uploaded image.
    if isinstance(original_image, str):
        original_image = input_image

    refined_prompt = vlm_response_prompt_after_apply_instruction(
        vlm_processor,
        vlm_model,
        original_image,
        prompt,
        device)
    return refined_prompt, refined_prompt
+
+
def process_mask(input_image,
                 original_image,
                 prompt,
                 resize_default,
                 aspect_ratio_name):
    """Produce an editing mask: use the brush strokes if any were drawn,
    otherwise ask the VLM + GroundingDINO + SAM to locate the edit target.

    Returns ([masked preview], [mask image], uint8 mask array, category).
    """
    if original_image is None:
        raise gr.Error('Please upload the input image')
    if prompt is None:
        raise gr.Error("Please input your instructions, e.g., remove the xxx")

    ## load mask (brush strokes live in the first layer's alpha channel)
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.array(alpha_mask)

    # load example image
    if isinstance(original_image, str):
        original_image = input_image["background"]

    if input_mask.max() == 0:
        # No brush strokes: derive the mask from the instruction via the VLM.
        category = vlm_response_editing_type(vlm_processor, vlm_model, original_image, prompt, device)

        object_wait_for_edit = vlm_response_object_wait_for_edit(vlm_processor,
                                                                 vlm_model,
                                                                 original_image,
                                                                 category,
                                                                 prompt,
                                                                 device)
        # original mask: h,w,1 [0, 255]
        original_mask = vlm_response_mask(
            vlm_processor,
            vlm_model,
            category,
            original_image,
            prompt,
            object_wait_for_edit,
            sam,
            sam_predictor,
            sam_automask_generator,
            groundingdino_model,
            device)
    else:
        original_mask = input_mask
        category = None

    ## resize mask if needed (same 640-px short-side policy as process())
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)



    # Normalize to (H, W, 1) for downstream consumers.
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    mask_image = Image.fromarray(original_mask.squeeze().astype(np.uint8)).convert("RGB")

    # Preview: zero out the masked region of the input image.
    masked_image = original_image * (1 - (original_mask>0))
    masked_image = masked_image.astype(np.uint8)
    masked_image = Image.fromarray(masked_image)

    return [masked_image], [mask_image], original_mask.astype(np.uint8), category
+
+
def process_random_mask(input_image,
                        original_image,
                        original_mask,
                        resize_default,
                        aspect_ratio_name,
                        ):
    """Replace the current mask with a randomly chosen coarse variant
    (bounding box or bounding ellipse) of it.

    Non-deterministic: the variant is picked with np.random.choice.
    Returns ([masked preview], [mask image], (H, W, 1) uint8 mask).
    """
    # Brush strokes live in the first layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    ## same 640-px short-side resize policy as process()
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)


    # Fresh brush strokes override the cached mask (no-op kept as-is).
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    # Randomly pick one of the coarse-mask styles.
    dilation_type = np.random.choice(['bounding_box', 'bounding_ellipse'])
    random_mask = random_mask_func(original_mask, dilation_type).squeeze()

    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")

    # Preview: zero out the masked region of the input image.
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)


    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
+
+
def process_dilation_mask(input_image,
                          original_image,
                          original_mask,
                          resize_default,
                          aspect_ratio_name,
                          dilation_size=20):
    """Dilate the active mask by ``dilation_size`` pixels (square kernel).

    Same resize/mask-selection preamble as ``process_random_mask`` —
    TODO(review): factor the duplicated boilerplate into a helper.

    Returns:
        ([masked PIL image], [mask PIL image], mask as uint8 (H, W, 1)).
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    # Single-element choice kept for symmetry with process_random_mask.
    dilation_type = np.random.choice(['square_dilation'])
    random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()

    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")

    # Black out the masked region in the preview image.
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
+
+
def process_erosion_mask(input_image,
                         original_image,
                         original_mask,
                         resize_default,
                         aspect_ratio_name,
                         dilation_size=20):
    """Erode the active mask by ``dilation_size`` pixels (square kernel).

    Mirror of ``process_dilation_mask`` with 'square_erosion' — the shared
    preamble is duplicated; TODO(review): factor into a helper.

    Returns:
        ([masked PIL image], [mask PIL image], mask as uint8 (H, W, 1)).
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    # Single-element choice kept for symmetry with process_random_mask.
    dilation_type = np.random.choice(['square_erosion'])
    random_mask = random_mask_func(original_mask, dilation_type, dilation_size).squeeze()

    mask_image = Image.fromarray(random_mask.astype(np.uint8)).convert("RGB")

    # Black out the masked region in the preview image.
    masked_image = original_image * (1 - (random_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)


    return [masked_image], [mask_image], random_mask[:,:,None].astype(np.uint8)
+
+
def move_mask_left(input_image,
                   original_image,
                   original_mask,
                   moving_pixels,
                   resize_default,
                   aspect_ratio_name):
    """Shift the active mask ``moving_pixels`` pixels to the left.

    Shares the resize/mask-selection preamble with the other mask handlers
    (TODO(review): deduplicate). Returns ([masked PIL], [mask PIL],
    moved mask as uint8 array).
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    moved_mask = move_mask_func(original_mask, 'left', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")

    # Black out the moved mask region in the preview image.
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    # Normalise a 0/1 mask to 0/255 uint8 with an explicit channel axis.
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
    original_mask = moved_mask
    return [masked_image], [mask_image], original_mask.astype(np.uint8)
+
+
def move_mask_right(input_image,
                    original_image,
                    original_mask,
                    moving_pixels,
                    resize_default,
                    aspect_ratio_name):
    """Shift the active mask ``moving_pixels`` pixels to the right.

    Mirror of ``move_mask_left`` — TODO(review): deduplicate.
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    moved_mask = move_mask_func(original_mask, 'right', int(moving_pixels)).squeeze()

    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")

    # Black out the moved mask region in the preview image.
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    # Normalise a 0/1 mask to 0/255 uint8 with an explicit channel axis.
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
    original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)
+
+
def move_mask_up(input_image,
                 original_image,
                 original_mask,
                 moving_pixels,
                 resize_default,
                 aspect_ratio_name):
    """Shift the active mask ``moving_pixels`` pixels upwards.

    Mirror of ``move_mask_left`` — TODO(review): deduplicate.
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    moved_mask = move_mask_func(original_mask, 'up', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")

    # Black out the moved mask region in the preview image.
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    # Normalise a 0/1 mask to 0/255 uint8 with an explicit channel axis.
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
    original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)
+
+
def move_mask_down(input_image,
                   original_image,
                   original_mask,
                   moving_pixels,
                   resize_default,
                   aspect_ratio_name):
    """Shift the active mask ``moving_pixels`` pixels downwards.

    Mirror of ``move_mask_left`` — TODO(review): deduplicate.
    """
    # Brush strokes drawn by the user live in the layer's alpha channel.
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)
    output_w, output_h = aspect_ratios[aspect_ratio_name]
    if output_w == "" or output_h == "":
        # "Custom resolution": derive the output size from the image itself.
        output_h, output_w = original_image.shape[:2]
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
            original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
            original_image = np.array(original_image)
            if input_mask is not None:
                input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
                input_mask = np.array(input_mask)
            if original_mask is not None:
                original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
                original_mask = np.array(original_mask)
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        else:
            gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
            pass
    else:
        # Fixed aspect ratio selected.
        if resize_default:
            short_side = min(output_w, output_h)
            scale_ratio = 640 / short_side
            output_w = int(output_w * scale_ratio)
            output_h = int(output_h * scale_ratio)
        gr.Info(f"Output aspect ratio: {output_w}:{output_h}")
        original_image = resize(Image.fromarray(original_image), target_width=int(output_w), target_height=int(output_h))
        original_image = np.array(original_image)
        if input_mask is not None:
            input_mask = resize(Image.fromarray(np.squeeze(input_mask)), target_width=int(output_w), target_height=int(output_h))
            input_mask = np.array(input_mask)
        if original_mask is not None:
            original_mask = resize(Image.fromarray(np.squeeze(original_mask)), target_width=int(output_w), target_height=int(output_h))
            original_mask = np.array(original_mask)

    # Prefer the freshly drawn brush mask; otherwise keep the stored one.
    if input_mask.max() == 0:
        original_mask = original_mask
    else:
        original_mask = input_mask

    if original_mask is None:
        raise gr.Error('Please generate mask first')

    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    moved_mask = move_mask_func(original_mask, 'down', int(moving_pixels)).squeeze()
    mask_image = Image.fromarray(((moved_mask>0).astype(np.uint8)*255)).convert("RGB")

    # Black out the moved mask region in the preview image.
    masked_image = original_image * (1 - (moved_mask[:,:,None]>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    # Normalise a 0/1 mask to 0/255 uint8 with an explicit channel axis.
    if moved_mask.max() <= 1:
        moved_mask = ((moved_mask * 255)[:,:,None]).astype(np.uint8)
    original_mask = moved_mask

    return [masked_image], [mask_image], original_mask.astype(np.uint8)
+
+
def invert_mask(input_image,
                original_image,
                original_mask,
                ):
    """Invert the active mask (user brush strokes if any, else the stored one).

    Args:
        input_image: Gradio ImageEditor value; layer 0's alpha channel holds
            the user's brush strokes.
        original_image: (H, W, 3) uint8 numpy image.
        original_mask: previously generated mask (numpy array) or None.

    Returns:
        ([masked PIL image], [mask PIL image], inverted mask as uint8
        (H, W, 1), True) — the trailing flag sets ``invert_mask_state``.

    Raises:
        gr.Error: if no mask exists yet (nothing drawn, nothing generated).
    """
    alpha_mask = input_image["layers"][0].split()[3]
    input_mask = np.asarray(alpha_mask)

    if input_mask.max() == 0:
        # No fresh brush strokes: invert the stored mask instead.
        # BUGFIX: guard BEFORE dereferencing — the original code checked for
        # None only after computing (original_mask > 0), so a missing mask
        # raised TypeError and the intended gr.Error was unreachable.
        if original_mask is None:
            raise gr.Error('Please generate mask first')
        original_mask = 1 - (original_mask > 0).astype(np.uint8)
    else:
        original_mask = 1 - (input_mask > 0).astype(np.uint8)

    original_mask = original_mask.squeeze()
    # Scale 0/1 to 0/255 for display.
    mask_image = Image.fromarray(original_mask*255).convert("RGB")

    # Ensure a trailing channel axis: (H, W) -> (H, W, 1).
    if original_mask.ndim == 2:
        original_mask = original_mask[:,:,None]

    if original_mask.max() <= 1:
        original_mask = (original_mask * 255).astype(np.uint8)

    # Black out the (now inverted) masked region in the preview image.
    masked_image = original_image * (1 - (original_mask>0))
    masked_image = masked_image.astype(original_image.dtype)
    masked_image = Image.fromarray(masked_image)

    return [masked_image], [mask_image], original_mask, True
+
+
def init_img(base,
             init_type,
             prompt,
             aspect_ratio,
             example_change_times
             ):
    """Handle an image upload: seed UI state, optionally loading cached example assets.

    Args:
        base: Gradio ImageEditor value; ``base["background"]`` is the uploaded PIL image.
        init_type: example key; when it matches MASK_IMAGE_PATH, cached galleries load.
        prompt: current instruction text (passed through for examples).
        aspect_ratio: currently selected aspect-ratio label.
        example_change_times: counter limiting how often example assets auto-load.

    Returns:
        A 14-tuple matching the ``input_image.upload`` outputs list.

    Raises:
        gr.Error: if the image aspect ratio exceeds 2.0.
    """
    image_pil = base["background"].convert("RGB")
    original_image = np.array(image_pil)
    # Reject extreme aspect ratios the pipeline is not meant to handle.
    if max(original_image.shape[0], original_image.shape[1]) * 1.0 / min(original_image.shape[0], original_image.shape[1])>2.0:
        raise gr.Error('image aspect ratio cannot be larger than 2.0')
    if init_type in MASK_IMAGE_PATH.keys() and example_change_times < 2:
        # Known example image: show its pre-computed mask/masked/result galleries.
        mask_gallery = [Image.open(MASK_IMAGE_PATH[init_type]).convert("L")]
        masked_gallery = [Image.open(MASKED_IMAGE_PATH[init_type]).convert("RGB")]
        result_gallery = [Image.open(OUTPUT_IMAGE_PATH[init_type]).convert("RGB")]
        width, height = image_pil.size
        # Snap sizes via the VAE image processor (presumably to a multiple of
        # the VAE scale factor — confirm against the pipeline).
        image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
        height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
        image_pil = image_pil.resize((width_new, height_new))
        mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
        masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
        result_gallery[0] = result_gallery[0].resize((width_new, height_new))
        original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
        return base, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, "", "", "", "Custom resolution", False, False, example_change_times
    else:
        # Fresh user upload: clear all derived state.
        return base, original_image, None, "", None, None, None, "", "", "", aspect_ratio, True, False, 0
+
+
def reset_func(input_image,
               original_image,
               original_mask,
               prompt,
               target_prompt,
               target_prompt_output):
    """Clear every piece of editor state back to its initial value.

    The incoming arguments are ignored; they exist only because Gradio passes
    the current component values to the callback.

    Returns:
        An 11-tuple matching the ``reset_button.click`` outputs list:
        cleared image/mask state, empty prompt and galleries, empty target
        prompts, then ``resize_default=True`` and ``invert_mask_state=False``.
    """
    # Release any GPU memory cached by previous generations.
    if torch.cuda.is_available():
        torch.cuda.empty_cache()
    return (None, None, None, '',
            [], [], [],
            '', '',
            True, False)
+
+
def update_example(example_type,
                   prompt,
                   example_change_times):
    """Load a quick-example's cached input, masks and result into the UI.

    Args:
        example_type: key into the INPUT/MASK/MASKED/OUTPUT_IMAGE_PATH tables.
        prompt: current instruction text, passed straight through.
        example_change_times: counter incremented on every example switch.

    Returns:
        A 12-tuple matching the ``example_type.change`` outputs list.
    """
    input_image = INPUT_IMAGE_PATH[example_type]
    image_pil = Image.open(input_image).convert("RGB")
    mask_gallery = [Image.open(MASK_IMAGE_PATH[example_type]).convert("L")]
    masked_gallery = [Image.open(MASKED_IMAGE_PATH[example_type]).convert("RGB")]
    result_gallery = [Image.open(OUTPUT_IMAGE_PATH[example_type]).convert("RGB")]
    width, height = image_pil.size
    # Snap sizes via the VAE image processor (presumably to a multiple of the
    # VAE scale factor — confirm against the pipeline) so the cached galleries
    # line up with pipeline output.
    image_processor = VaeImageProcessor(vae_scale_factor=pipe.vae_scale_factor, do_convert_rgb=True)
    height_new, width_new = image_processor.get_default_height_width(image_pil, height, width)
    image_pil = image_pil.resize((width_new, height_new))
    mask_gallery[0] = mask_gallery[0].resize((width_new, height_new))
    masked_gallery[0] = masked_gallery[0].resize((width_new, height_new))
    result_gallery[0] = result_gallery[0].resize((width_new, height_new))

    original_image = np.array(image_pil)
    original_mask = np.array(mask_gallery[0]).astype(np.uint8)[:,:,None] # h,w,1
    aspect_ratio = "Custom resolution"
    example_change_times += 1
    return input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, "", "", False, example_change_times
+
# ---------------------------------------------------------------------------
# Gradio UI: layout definition, then event wiring, then app launch.
# ---------------------------------------------------------------------------
block = gr.Blocks(
    theme=gr.themes.Soft(
        radius_size=gr.themes.sizes.radius_none,
        text_size=gr.themes.sizes.text_md
    )
).queue()
with block as demo:
    with gr.Row():
        with gr.Column():
            gr.HTML(head)

            gr.Markdown(descriptions)

            with gr.Accordion(label="🧭 Instructions:", open=True, elem_id="accordion"):
                with gr.Row(equal_height=True):
                    gr.Markdown(instructions)

    # Hidden per-session state shared between the event handlers below.
    original_image = gr.State(value=None)
    original_mask = gr.State(value=None)
    category = gr.State(value=None)
    status = gr.State(value=None)
    invert_mask_state = gr.State(value=False)
    example_change_times = gr.State(value=0)


    with gr.Row():
        with gr.Column():
            # Left column: image editor, model pickers and generation controls.
            with gr.Row():
                input_image = gr.ImageEditor(
                    label="Input Image",
                    type="pil",
                    brush=gr.Brush(colors=["#FFFFFF"], default_size = 30, color_mode="fixed"),
                    layers = False,
                    interactive=True,
                    height=1024,
                    sources=["upload"],
                )


            vlm_model_dropdown = gr.Dropdown(label="VLM model", choices=VLM_MODEL_NAMES, value=DEFAULT_VLM_MODEL_NAME, interactive=True)
            with gr.Group():
                with gr.Row():
                    GPT4o_KEY = gr.Textbox(label="GPT4o API Key", placeholder="Please input your GPT4o API Key when use GPT4o VLM (highly recommended).", value="", lines=1)

                    GPT4o_KEY_submit = gr.Button("Submit and Verify")


            aspect_ratio = gr.Dropdown(label="Output aspect ratio", choices=ASPECT_RATIO_LABELS, value=DEFAULT_ASPECT_RATIO)
            resize_default = gr.Checkbox(label="Short edge resize to 640px", value=True)


            prompt = gr.Textbox(label="⌨️ Instruction", placeholder="Please input your instruction.", value="",lines=1)

            run_button = gr.Button("💫 Run")


            with gr.Row():
                mask_button = gr.Button("Generate Mask")
                random_mask_button = gr.Button("Square/Circle Mask ")


            with gr.Row():
                generate_target_prompt_button = gr.Button("Generate Target Prompt")

            target_prompt = gr.Text(
                label="Input Target Prompt",
                max_lines=5,
                placeholder="VLM-generated target prompt, you can first generate if and then modify it (optional)",
                value='',
                lines=2
            )

            with gr.Accordion("Advanced Options", open=False, elem_id="accordion1"):
                base_model_dropdown = gr.Dropdown(label="Base model", choices=BASE_MODELS, value=DEFAULT_BASE_MODEL, interactive=True)
                negative_prompt = gr.Text(
                    label="Negative Prompt",
                    max_lines=5,
                    placeholder="Please input your negative prompt",
                    value='ugly, low quality',lines=1
                )

                control_strength = gr.Slider(
                    label="Control Strength: ", show_label=True, minimum=0, maximum=1.1, value=1, step=0.01
                )
                with gr.Group():
                    seed = gr.Slider(
                        label="Seed: ", minimum=0, maximum=2147483647, step=1, value=648464818
                    )
                    randomize_seed = gr.Checkbox(label="Randomize seed", value=False)

                blending = gr.Checkbox(label="Blending mode", value=True)


                num_samples = gr.Slider(
                    label="Num samples", minimum=0, maximum=4, step=1, value=4
                )

                with gr.Group():
                    with gr.Row():
                        guidance_scale = gr.Slider(
                            label="Guidance scale",
                            minimum=1,
                            maximum=12,
                            step=0.1,
                            value=7.5,
                        )
                        num_inference_steps = gr.Slider(
                            label="Number of inference steps",
                            minimum=1,
                            maximum=50,
                            step=1,
                            value=50,
                        )


        with gr.Column():
            # Right column: mask previews, mask manipulation tools, output.
            with gr.Row():
                with gr.Tab(elem_classes="feedback", label="Masked Image"):
                    masked_gallery = gr.Gallery(label='Masked Image', show_label=True, elem_id="gallery", preview=True, height=360)
                with gr.Tab(elem_classes="feedback", label="Mask"):
                    mask_gallery = gr.Gallery(label='Mask', show_label=True, elem_id="gallery", preview=True, height=360)

            invert_mask_button = gr.Button("Invert Mask")
            dilation_size = gr.Slider(
                label="Dilation size: ", minimum=0, maximum=50, step=1, value=20
            )
            with gr.Row():
                dilation_mask_button = gr.Button("Dilation Generated Mask")
                erosion_mask_button = gr.Button("Erosion Generated Mask")

            moving_pixels = gr.Slider(
                label="Moving pixels:", show_label=True, minimum=0, maximum=50, value=4, step=1
            )
            with gr.Row():
                move_left_button = gr.Button("Move Left")
                move_right_button = gr.Button("Move Right")
            with gr.Row():
                move_up_button = gr.Button("Move Up")
                move_down_button = gr.Button("Move Down")

            with gr.Tab(elem_classes="feedback", label="Output"):
                result_gallery = gr.Gallery(label='Output', show_label=True, elem_id="gallery", preview=True, height=400)

            target_prompt_output = gr.Text(label="Output Target Prompt", value="", lines=1, interactive=False)

            reset_button = gr.Button("Reset")

    # Hidden fields used by the example loader to route example metadata.
    init_type = gr.Textbox(label="Init Name", value="", visible=False)
    example_type = gr.Textbox(label="Example Name", value="", visible=False)



    with gr.Row():
        example = gr.Examples(
            label="Quick Example",
            examples=EXAMPLES,
            inputs=[input_image, prompt, seed, init_type, example_type, blending, resize_default, vlm_model_dropdown],
            examples_per_page=10,
            cache_examples=False,
        )


    with gr.Accordion(label="🎬 Feature Details:", open=True, elem_id="accordion"):
        with gr.Row(equal_height=True):
            gr.Markdown(tips)

    with gr.Row():
        gr.Markdown(citation)

    ## gr.examples can not be used to update the gr.Gallery, so we need to use the following two functions to update the gr.Gallery.
    ## And we need to solve the conflict between the upload and change example functions.
    input_image.upload(
        init_img,
        [input_image, init_type, prompt, aspect_ratio, example_change_times],
        [input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, target_prompt_output, init_type, aspect_ratio, resize_default, invert_mask_state, example_change_times]
    )
    example_type.change(fn=update_example, inputs=[example_type, prompt, example_change_times], outputs=[input_image, prompt, original_image, original_mask, mask_gallery, masked_gallery, result_gallery, aspect_ratio, target_prompt, target_prompt_output, invert_mask_state, example_change_times])

    ## vlm and base model dropdown
    vlm_model_dropdown.change(fn=update_vlm_model, inputs=[vlm_model_dropdown], outputs=[status])
    base_model_dropdown.change(fn=update_base_model, inputs=[base_model_dropdown], outputs=[status])


    GPT4o_KEY_submit.click(fn=submit_GPT4o_KEY, inputs=[GPT4o_KEY], outputs=[GPT4o_KEY, vlm_model_dropdown])
    invert_mask_button.click(fn=invert_mask, inputs=[input_image, original_image, original_mask], outputs=[masked_gallery, mask_gallery, original_mask, invert_mask_state])


    # Argument bundle for the main generation callback.
    ips=[input_image,
         original_image,
         original_mask,
         prompt,
         negative_prompt,
         control_strength,
         seed,
         randomize_seed,
         guidance_scale,
         num_inference_steps,
         num_samples,
         blending,
         category,
         target_prompt,
         resize_default,
         aspect_ratio,
         invert_mask_state]

    ## run brushedit
    run_button.click(fn=process, inputs=ips, outputs=[result_gallery, mask_gallery, masked_gallery, prompt, target_prompt, target_prompt_output, invert_mask_state])

    ## mask func
    mask_button.click(fn=process_mask, inputs=[input_image, original_image, prompt, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask, category])
    random_mask_button.click(fn=process_random_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    dilation_mask_button.click(fn=process_dilation_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])
    erosion_mask_button.click(fn=process_erosion_mask, inputs=[input_image, original_image, original_mask, resize_default, aspect_ratio, dilation_size], outputs=[ masked_gallery, mask_gallery, original_mask])

    ## move mask func
    move_left_button.click(fn=move_mask_left, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_right_button.click(fn=move_mask_right, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_up_button.click(fn=move_mask_up, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])
    move_down_button.click(fn=move_mask_down, inputs=[input_image, original_image, original_mask, moving_pixels, resize_default, aspect_ratio], outputs=[masked_gallery, mask_gallery, original_mask])

    ## prompt func
    generate_target_prompt_button.click(fn=generate_target_prompt, inputs=[input_image, original_image, prompt], outputs=[target_prompt, target_prompt_output])

    ## reset func
    reset_button.click(fn=reset_func, inputs=[input_image, original_image, original_mask, prompt, target_prompt, target_prompt_output], outputs=[input_image, original_image, original_mask, prompt, mask_gallery, masked_gallery, result_gallery, target_prompt, target_prompt_output, resize_default, invert_mask_state])


demo.launch(server_name="0.0.0.0", server_port=12345, share=False)
diff --git a/app/src/vlm_pipeline.py b/app/src/vlm_pipeline.py
new file mode 100644
index 0000000000000000000000000000000000000000..b5e8e61c491a49ff4fb0835c6b7fb33b1e76688e
--- /dev/null
+++ b/app/src/vlm_pipeline.py
@@ -0,0 +1,222 @@
+import base64
+import re
+import torch
+
+from PIL import Image
+from io import BytesIO
+import numpy as np
+import gradio as gr
+
+from openai import OpenAI
+from transformers import (LlavaNextForConditionalGeneration, Qwen2VLForConditionalGeneration)
+from qwen_vl_utils import process_vision_info
+
+from app.gpt4_o.instructions import (
+ create_editing_category_messages_gpt4o,
+ create_ori_object_messages_gpt4o,
+ create_add_object_messages_gpt4o,
+ create_apply_editing_messages_gpt4o)
+
+from app.llava.instructions import (
+ create_editing_category_messages_llava,
+ create_ori_object_messages_llava,
+ create_add_object_messages_llava,
+ create_apply_editing_messages_llava)
+
+from app.qwen2.instructions import (
+ create_editing_category_messages_qwen2,
+ create_ori_object_messages_qwen2,
+ create_add_object_messages_qwen2,
+ create_apply_editing_messages_qwen2)
+
+from app.utils.utils import run_grounded_sam
+
+
+def encode_image(img):
+ img = Image.fromarray(img.astype('uint8'))
+ buffered = BytesIO()
+ img.save(buffered, format="PNG")
+ img_bytes = buffered.getvalue()
+ return base64.b64encode(img_bytes).decode('utf-8')
+
+
+def run_gpt4o_vl_inference(vlm_model,
+ messages):
+ response = vlm_model.chat.completions.create(
+ model="gpt-4o-2024-08-06",
+ messages=messages
+ )
+ response_str = response.choices[0].message.content
+ return response_str
+
+def run_llava_next_inference(vlm_processor, vlm_model, messages, image, device="cuda"):
+ prompt = vlm_processor.apply_chat_template(messages, add_generation_prompt=True)
+ inputs = vlm_processor(images=image, text=prompt, return_tensors="pt").to(device)
+ output = vlm_model.generate(**inputs, max_new_tokens=200)
+ generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, output)
+ ]
+ response_str = vlm_processor.decode(generated_ids_trimmed[0], skip_special_tokens=True)
+
+ return response_str
+
+def run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device="cuda"):
+ text = vlm_processor.apply_chat_template(
+ messages, tokenize=False, add_generation_prompt=True
+ )
+ image_inputs, video_inputs = process_vision_info(messages)
+ inputs = vlm_processor(
+ text=[text],
+ images=image_inputs,
+ videos=video_inputs,
+ padding=True,
+ return_tensors="pt",
+ )
+ inputs = inputs.to(device)
+ generated_ids = vlm_model.generate(**inputs, max_new_tokens=128)
+ generated_ids_trimmed = [
+ out_ids[len(in_ids) :] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
+ ]
+ response_str = vlm_processor.decode(generated_ids_trimmed[0], skip_special_tokens=True)
+ return response_str
+
+
+### response editing type
+def vlm_response_editing_type(vlm_processor,
+ vlm_model,
+ image,
+ editing_prompt,
+ device):
+
+ if isinstance(vlm_model, OpenAI):
+ messages = create_editing_category_messages_gpt4o(editing_prompt)
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
+ messages = create_editing_category_messages_llava(editing_prompt)
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device=device)
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
+ messages = create_editing_category_messages_qwen2(editing_prompt)
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device=device)
+
+ for category_name in ["Addition","Remove","Local","Global","Background"]:
+ if category_name.lower() in response_str.lower():
+ return category_name
+ raise gr.Error("Please input correct commands, including add, delete, and modify commands. If it still does not work, please switch to a more powerful VLM.")
+
+
+### response object to be edited
+def vlm_response_object_wait_for_edit(vlm_processor,
+ vlm_model,
+ image,
+ category,
+ editing_prompt,
+ device):
+ if category in ["Background", "Global", "Addition"]:
+ edit_object = "nan"
+ return edit_object
+
+ if isinstance(vlm_model, OpenAI):
+ messages = create_ori_object_messages_gpt4o(editing_prompt)
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
+ messages = create_ori_object_messages_llava(editing_prompt)
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image , device)
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
+ messages = create_ori_object_messages_qwen2(editing_prompt)
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
+ return response_str
+
+
+### response mask
+def vlm_response_mask(vlm_processor,
+ vlm_model,
+ category,
+ image,
+ editing_prompt,
+ object_wait_for_edit,
+ sam=None,
+ sam_predictor=None,
+ sam_automask_generator=None,
+ groundingdino_model=None,
+ device=None,
+ ):
+ mask = None
+ if editing_prompt is None or len(editing_prompt)==0:
+ raise gr.Error("Please input the editing instruction!")
+ height, width = image.shape[:2]
+ if category=="Addition":
+ try:
+ if isinstance(vlm_model, OpenAI):
+ base64_image = encode_image(image)
+ messages = create_add_object_messages_gpt4o(editing_prompt, base64_image, height=height, width=width)
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
+ messages = create_add_object_messages_llava(editing_prompt, height=height, width=width)
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device)
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
+ base64_image = encode_image(image)
+ messages = create_add_object_messages_qwen2(editing_prompt, base64_image, height=height, width=width)
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
+ pattern = r'\[\d{1,3}(?:,\s*\d{1,3}){3}\]'
+ box = re.findall(pattern, response_str)
+ box = box[0][1:-1].split(",")
+ for i in range(len(box)):
+ box[i] = int(box[i])
+ cus_mask = np.zeros((height, width))
+ cus_mask[box[1]: box[1]+box[3], box[0]: box[0]+box[2]]=255
+ mask = cus_mask
+ except:
+ raise gr.Error("Please set the mask manually, currently the VLM cannot output the mask!")
+
+ elif category=="Background":
+ labels = "background"
+ elif category=="Global":
+ mask = 255 * np.zeros((height, width))
+ else:
+ labels = object_wait_for_edit
+
+ if mask is None:
+ for thresh in [0.3,0.25,0.2,0.15,0.1,0.05,0]:
+ try:
+ detections = run_grounded_sam(
+ input_image={"image":Image.fromarray(image.astype('uint8')),
+ "mask":None},
+ text_prompt=labels,
+ task_type="seg",
+ box_threshold=thresh,
+ text_threshold=0.25,
+ iou_threshold=0.5,
+ scribble_mode="split",
+ sam=sam,
+ sam_predictor=sam_predictor,
+ sam_automask_generator=sam_automask_generator,
+ groundingdino_model=groundingdino_model,
+ device=device,
+ )
+ mask = np.array(detections[0,0,...].cpu()) * 255
+ break
+ except:
+ print(f"wrong in threshhold: {thresh}, continue")
+ continue
+ return mask
+
+
+def vlm_response_prompt_after_apply_instruction(vlm_processor,
+ vlm_model,
+ image,
+ editing_prompt,
+ device):
+ if isinstance(vlm_model, OpenAI):
+ base64_image = encode_image(image)
+ messages = create_apply_editing_messages_gpt4o(editing_prompt, base64_image)
+ response_str = run_gpt4o_vl_inference(vlm_model, messages)
+ elif isinstance(vlm_model, LlavaNextForConditionalGeneration):
+ messages = create_apply_editing_messages_llava(editing_prompt)
+ response_str = run_llava_next_inference(vlm_processor, vlm_model, messages, image, device)
+ elif isinstance(vlm_model, Qwen2VLForConditionalGeneration):
+ base64_image = encode_image(image)
+ messages = create_apply_editing_messages_qwen2(editing_prompt, base64_image)
+ response_str = run_qwen2_vl_inference(vlm_processor, vlm_model, messages, image, device)
+ else:
+ raise gr.Error("Please select the correct VLM model!")
+ return response_str
diff --git a/app/src/vlm_template.py b/app/src/vlm_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..a30e3f6d52fb72dd99393a24de75486d8c6555b1
--- /dev/null
+++ b/app/src/vlm_template.py
@@ -0,0 +1,120 @@
+import os
+import sys
+import torch
+from openai import OpenAI
+from transformers import (
+ LlavaNextProcessor, LlavaNextForConditionalGeneration,
+ Qwen2VLForConditionalGeneration, Qwen2VLProcessor
+)
+## init device
+device = "cpu"
+torch_dtype = torch.float16
+
+
+vlms_list = [
+ # {
+ # "type": "llava-next",
+ # "name": "llava-v1.6-mistral-7b-hf",
+ # "local_path": "models/vlms/llava-v1.6-mistral-7b-hf",
+ # "processor": LlavaNextProcessor.from_pretrained(
+ # "models/vlms/llava-v1.6-mistral-7b-hf"
+ # ) if os.path.exists("models/vlms/llava-v1.6-mistral-7b-hf") else LlavaNextProcessor.from_pretrained(
+ # "llava-hf/llava-v1.6-mistral-7b-hf"
+ # ),
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
+ # "models/vlms/llava-v1.6-mistral-7b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-mistral-7b-hf") else
+ # LlavaNextForConditionalGeneration.from_pretrained(
+ # "llava-hf/llava-v1.6-mistral-7b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu"),
+ # },
+ {
+ "type": "llava-next",
+ "name": "llama3-llava-next-8b-hf (Preload)",
+ "local_path": "models/vlms/llama3-llava-next-8b-hf",
+ "processor": LlavaNextProcessor.from_pretrained(
+ "models/vlms/llama3-llava-next-8b-hf"
+ ) if os.path.exists("models/vlms/llama3-llava-next-8b-hf") else LlavaNextProcessor.from_pretrained(
+ "llava-hf/llama3-llava-next-8b-hf"
+ ),
+ "model": LlavaNextForConditionalGeneration.from_pretrained(
+ "models/vlms/llama3-llava-next-8b-hf", torch_dtype=torch_dtype, device_map=device
+ ).to("cpu") if os.path.exists("models/vlms/llama3-llava-next-8b-hf") else
+ LlavaNextForConditionalGeneration.from_pretrained(
+ "llava-hf/llama3-llava-next-8b-hf", torch_dtype=torch_dtype, device_map=device
+ ).to("cpu"),
+ },
+ # {
+ # "type": "llava-next",
+ # "name": "llava-v1.6-vicuna-13b-hf",
+ # "local_path": "models/vlms/llava-v1.6-vicuna-13b-hf",
+ # "processor": LlavaNextProcessor.from_pretrained(
+ # "models/vlms/llava-v1.6-vicuna-13b-hf"
+ # ) if os.path.exists("models/vlms/llava-v1.6-vicuna-13b-hf") else LlavaNextProcessor.from_pretrained(
+ # "llava-hf/llava-v1.6-vicuna-13b-hf"
+ # ),
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
+ # "models/vlms/llava-v1.6-vicuna-13b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-vicuna-13b-hf") else
+ # LlavaNextForConditionalGeneration.from_pretrained(
+ # "llava-hf/llava-v1.6-vicuna-13b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu"),
+ # },
+ # {
+ # "type": "llava-next",
+ # "name": "llava-v1.6-34b-hf",
+ # "local_path": "models/vlms/llava-v1.6-34b-hf",
+ # "processor": LlavaNextProcessor.from_pretrained(
+ # "models/vlms/llava-v1.6-34b-hf"
+ # ) if os.path.exists("models/vlms/llava-v1.6-34b-hf") else LlavaNextProcessor.from_pretrained(
+ # "llava-hf/llava-v1.6-34b-hf"
+ # ),
+ # "model": LlavaNextForConditionalGeneration.from_pretrained(
+ # "models/vlms/llava-v1.6-34b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu") if os.path.exists("models/vlms/llava-v1.6-34b-hf") else
+ # LlavaNextForConditionalGeneration.from_pretrained(
+ # "llava-hf/llava-v1.6-34b-hf", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu"),
+ # },
+ # {
+ # "type": "qwen2-vl",
+ # "name": "Qwen2-VL-2B-Instruct",
+ # "local_path": "models/vlms/Qwen2-VL-2B-Instruct",
+ # "processor": Qwen2VLProcessor.from_pretrained(
+ # "models/vlms/Qwen2-VL-2B-Instruct"
+ # ) if os.path.exists("models/vlms/Qwen2-VL-2B-Instruct") else Qwen2VLProcessor.from_pretrained(
+ # "Qwen/Qwen2-VL-2B-Instruct"
+ # ),
+ # "model": Qwen2VLForConditionalGeneration.from_pretrained(
+ # "models/vlms/Qwen2-VL-2B-Instruct", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu") if os.path.exists("models/vlms/Qwen2-VL-2B-Instruct") else
+ # Qwen2VLForConditionalGeneration.from_pretrained(
+ # "Qwen/Qwen2-VL-2B-Instruct", torch_dtype=torch_dtype, device_map=device
+ # ).to("cpu"),
+ # },
+ {
+ "type": "qwen2-vl",
+ "name": "Qwen2-VL-7B-Instruct (Default)",
+ "local_path": "models/vlms/Qwen2-VL-7B-Instruct",
+ "processor": Qwen2VLProcessor.from_pretrained(
+ "models/vlms/Qwen2-VL-7B-Instruct"
+ ) if os.path.exists("models/vlms/Qwen2-VL-7B-Instruct") else Qwen2VLProcessor.from_pretrained(
+ "Qwen/Qwen2-VL-7B-Instruct"
+ ),
+ "model": Qwen2VLForConditionalGeneration.from_pretrained(
+ "models/vlms/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
+ ).to("cpu") if os.path.exists("models/vlms/Qwen2-VL-7B-Instruct") else
+ Qwen2VLForConditionalGeneration.from_pretrained(
+ "Qwen/Qwen2-VL-7B-Instruct", torch_dtype=torch_dtype, device_map=device
+ ).to("cpu"),
+ },
+ {
+ "type": "openai",
+ "name": "GPT4-o (Highly Recommended)",
+ "local_path": "",
+ "processor": "",
+ "model": ""
+ },
+]
+
+vlms_template = {k["name"]: (k["type"], k["local_path"], k["processor"], k["model"]) for k in vlms_list}
\ No newline at end of file
diff --git a/app/utils/GroundingDINO_SwinT_OGC.py b/app/utils/GroundingDINO_SwinT_OGC.py
new file mode 100644
index 0000000000000000000000000000000000000000..9158d5f6260ec74bded95377d382387430d7cd70
--- /dev/null
+++ b/app/utils/GroundingDINO_SwinT_OGC.py
@@ -0,0 +1,43 @@
+batch_size = 1
+modelname = "groundingdino"
+backbone = "swin_T_224_1k"
+position_embedding = "sine"
+pe_temperatureH = 20
+pe_temperatureW = 20
+return_interm_indices = [1, 2, 3]
+backbone_freeze_keywords = None
+enc_layers = 6
+dec_layers = 6
+pre_norm = False
+dim_feedforward = 2048
+hidden_dim = 256
+dropout = 0.0
+nheads = 8
+num_queries = 900
+query_dim = 4
+num_patterns = 0
+num_feature_levels = 4
+enc_n_points = 4
+dec_n_points = 4
+two_stage_type = "standard"
+two_stage_bbox_embed_share = False
+two_stage_class_embed_share = False
+transformer_activation = "relu"
+dec_pred_bbox_embed_share = True
+dn_box_noise_scale = 1.0
+dn_label_noise_ratio = 0.5
+dn_label_coef = 1.0
+dn_bbox_coef = 1.0
+embed_init_tgt = True
+dn_labelbook_size = 2000
+max_text_len = 256
+text_encoder_type = "bert-base-uncased"
+use_text_enhancer = True
+use_fusion_layer = True
+use_checkpoint = True
+use_transformer_ckpt = True
+use_text_cross_attention = True
+text_dropout = 0.0
+fusion_dropout = 0.0
+fusion_droppath = 0.1
+sub_sentence_present = True
diff --git a/assets/angel_christmas/angel_christmas.png b/assets/angel_christmas/angel_christmas.png
new file mode 100644
index 0000000000000000000000000000000000000000..8677855dce32755cbcf5dfd08ec46d31996f68cf
--- /dev/null
+++ b/assets/angel_christmas/angel_christmas.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:90efa52308e2dc16274ddaef147d89979bf6bdb2c1f2b06f639b4e43fb96f8db
+size 1470393
diff --git a/assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png b/assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..c0dc0623392a76e1415ccd7253aeef6471c20b37
--- /dev/null
+++ b/assets/angel_christmas/image_edit_f15d9b45-c978-4e3d-9f5f-251e308560c3_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a259c2958d665532dfdf459ccb8d808967eee2d2f6e87dadd51ca1a01b590b44
+size 1425611
diff --git a/assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png b/assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png
new file mode 100644
index 0000000000000000000000000000000000000000..437626bad8c94ddb8603d47e485b929691c63678
--- /dev/null
+++ b/assets/angel_christmas/mask_f15d9b45-c978-4e3d-9f5f-251e308560c3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:14318679567d391ee5e08d96dae249ed1bca1a0f349b76f725cc70288ce04030
+size 3987
diff --git a/assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png b/assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png
new file mode 100644
index 0000000000000000000000000000000000000000..55a7fc12e05998f8bfe382a6d25bdf801b4d24ba
--- /dev/null
+++ b/assets/angel_christmas/masked_image_f15d9b45-c978-4e3d-9f5f-251e308560c3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:be7745d023596428d3ff449f48f3aad3aa8ae00a42c089a7b1311cdae3e39b70
+size 1432341
diff --git a/assets/angel_christmas/prompt.txt b/assets/angel_christmas/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3d7fd373e29f597f8cba71f7c7a88f91e3b4defe
--- /dev/null
+++ b/assets/angel_christmas/prompt.txt
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:89ed635310c87d5f2d8f32d813018acd5040edd745a29c5bf84a435916525789
+size 27
diff --git a/assets/anime_flower/anime_flower.png b/assets/anime_flower/anime_flower.png
new file mode 100644
index 0000000000000000000000000000000000000000..0bd9ca5d665d600064320282f23d7103236d9cf9
--- /dev/null
+++ b/assets/anime_flower/anime_flower.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1adc101088a1428361410fe3c637155da48d0eb21b3782377dd258a0a5df576a
+size 1316945
diff --git a/assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png b/assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..950c90f292b3c641ae536405d1ea96cb898480a4
--- /dev/null
+++ b/assets/anime_flower/image_edit_37553172-9b38-4727-bf2e-37d7e2b93461_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:3912e75e44c89a7ec8d0f6e34c90d4ea2212f80e5c2a12e6ba3dac405ca7be6c
+size 929544
diff --git a/assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png b/assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png
new file mode 100644
index 0000000000000000000000000000000000000000..7c27502ad076b39e2b867ca799115a3f39f8bc9c
--- /dev/null
+++ b/assets/anime_flower/mask_37553172-9b38-4727-bf2e-37d7e2b93461.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9f07ca0c719ffc09f282c424c66c869a9a31a1fc6386dba679e994f0b34bf51c
+size 4217
diff --git a/assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png b/assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png
new file mode 100644
index 0000000000000000000000000000000000000000..50460677d055fca2d717364cf34f5311e0762d9c
--- /dev/null
+++ b/assets/anime_flower/masked_image_37553172-9b38-4727-bf2e-37d7e2b93461.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:73cac864b127f287579a8d259ff1165841d4c5e63731b4ac54e872567137e5e6
+size 967136
diff --git a/assets/anime_flower/prompt.txt b/assets/anime_flower/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..10e8dc6fbf808df87496183e7a8d21166d4a60c8
--- /dev/null
+++ b/assets/anime_flower/prompt.txt
@@ -0,0 +1 @@
+648464818: remove the flower.
\ No newline at end of file
diff --git a/assets/brushedit_teaser.png b/assets/brushedit_teaser.png
new file mode 100644
index 0000000000000000000000000000000000000000..4ecb9ee5a77e4f08ebf51ce162887eb2ff45ffc6
--- /dev/null
+++ b/assets/brushedit_teaser.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bcd1d0c9f6fc083a33ec4565d98120bf1099914a0d2c0247eaa462052911ea59
+size 3449873
diff --git a/assets/chenduling/chengduling.jpg b/assets/chenduling/chengduling.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..18458a7a083c4fc078a3a896c7c7484a1ddb12ab
--- /dev/null
+++ b/assets/chenduling/chengduling.jpg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0df06c0394583181e5cdf92f997c1276deb27cf96dd36b6443fe9d347a1e013a
+size 167873
diff --git a/assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png b/assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..e747c8f4aebf754c321e25c403fc0ebcd1564335
--- /dev/null
+++ b/assets/chenduling/image_edit_68e3ff6f-da07-4b37-91df-13d6eed7b997_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:7bb60b8093291c9720e61160f7e598aadfc02f62bc08ad825d1ba9f2e8431b6a
+size 1386686
diff --git a/assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png b/assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png
new file mode 100644
index 0000000000000000000000000000000000000000..5a101efc9080eb29100698b5dab604d1ae9dd31c
--- /dev/null
+++ b/assets/chenduling/mask_68e3ff6f-da07-4b37-91df-13d6eed7b997.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:db47c2d9f18b0e25041c894945f6b52d3fcff473a0f0496b89dc2ac7d36536fc
+size 5678
diff --git a/assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png b/assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png
new file mode 100644
index 0000000000000000000000000000000000000000..4f5a69eb8d57d44a82513349294fe3917208d1cd
--- /dev/null
+++ b/assets/chenduling/masked_image_68e3ff6f-da07-4b37-91df-13d6eed7b997.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0200a970fc55930f9bcc9910cea3126b96c28e2bddca10b2b1969cbc979092be
+size 1106461
diff --git a/assets/chenduling/prompt.txt b/assets/chenduling/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7bec7baa5c7c6f1f652503bdc4ab695e1d676c9a
--- /dev/null
+++ b/assets/chenduling/prompt.txt
@@ -0,0 +1 @@
+648464818: replace the clothes to a delicated floral skirt
\ No newline at end of file
diff --git a/assets/chinese_girl/chinese_girl.png b/assets/chinese_girl/chinese_girl.png
new file mode 100644
index 0000000000000000000000000000000000000000..cfb660ff8c5bc456acb0d5e6313e2030f5308e4b
--- /dev/null
+++ b/assets/chinese_girl/chinese_girl.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:52f7dfc3333b48f677035180506650fb4ee9911a31426adb83c7e13fd5ac6693
+size 1259104
diff --git a/assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png b/assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..ca0f7cb75e84a79b49476dff279b587504f77d1f
--- /dev/null
+++ b/assets/chinese_girl/image_edit_54759648-0989-48e0-bc82-f20e28b5ec29_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:b330e926da856f1027dd09e2bb3dc5910bb0a2dc9bc4a402b107c3f7b18b7de0
+size 880660
diff --git a/assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png b/assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png
new file mode 100644
index 0000000000000000000000000000000000000000..5aac03e6d104e5add650ca7c7b04c9de74ba2878
--- /dev/null
+++ b/assets/chinese_girl/mask_54759648-0989-48e0-bc82-f20e28b5ec29.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:d46957b4cd0af13f57ace1cf181a13c8da7feebf9a9f37e8e5d582086a337843
+size 10352
diff --git a/assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png b/assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png
new file mode 100644
index 0000000000000000000000000000000000000000..f6054fe6221b46bbcb2fa5aafae02f9795b19561
--- /dev/null
+++ b/assets/chinese_girl/masked_image_54759648-0989-48e0-bc82-f20e28b5ec29.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f26a9d923a432b80e91b09d711f4000e9b1afe7edece788c9b8b86a3cce45855
+size 411917
diff --git a/assets/chinese_girl/prompt.txt b/assets/chinese_girl/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..44f01ef9547fc2659cdd77351606fc6627859e13
--- /dev/null
+++ b/assets/chinese_girl/prompt.txt
@@ -0,0 +1 @@
+648464818: replace the background to ancient China.
\ No newline at end of file
diff --git a/assets/demo_vis.png b/assets/demo_vis.png
new file mode 100644
index 0000000000000000000000000000000000000000..b44c8f626e70e20b184d2d8128360c9b601dc828
--- /dev/null
+++ b/assets/demo_vis.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:755ecfc61a70da9eb3abde0d4353590c0344a0d55e5d3622da4fe58837ca457b
+size 1044589
diff --git a/assets/example.png b/assets/example.png
new file mode 100644
index 0000000000000000000000000000000000000000..0ed2b10c0667cbbfb0d6f356b5c7d285cec28769
--- /dev/null
+++ b/assets/example.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1e86dbd1cb8d4c787a910d17400b081fa1d0daac35645f808c088fd316d1861b
+size 3223
diff --git a/assets/frog/frog.jpeg b/assets/frog/frog.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..a4730c1fd139f8da340b957d079250dc056c6697
--- /dev/null
+++ b/assets/frog/frog.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bff47418f10bcbebdced638256fce1e075d93ccedc3b44ca83d04f7c7145ab1e
+size 896298
diff --git a/assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png b/assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..ce0f4e36a88c15f0bf22c689e1c71c332bc66f23
--- /dev/null
+++ b/assets/frog/image_edit_f7b350de-6f2c-49e3-b535-995c486d78e7_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c9c1dfe00fd70e1cee76037941876c03a64863b3d598f925e7d0a39f3065db89
+size 923482
diff --git a/assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png b/assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png
new file mode 100644
index 0000000000000000000000000000000000000000..3b1a76ef65eaabc6102ad39e0f9243fbd6e4a6fa
--- /dev/null
+++ b/assets/frog/mask_f7b350de-6f2c-49e3-b535-995c486d78e7.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:2df1f32f92028ef8dbd677d039af09acb82db62d60bf4dea7812eefab340f553
+size 3341
diff --git a/assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png b/assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png
new file mode 100644
index 0000000000000000000000000000000000000000..34dd02d1bde1c476fe99b1c8789dc063b64c0e2e
--- /dev/null
+++ b/assets/frog/masked_image_f7b350de-6f2c-49e3-b535-995c486d78e7.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:bd9730b6c718a44614cfc8873e65bf660183adb4fbf2352f6488a33be5d4d7a1
+size 880582
diff --git a/assets/frog/prompt.txt b/assets/frog/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..7a0a8e62105e00235560f231311b39ceee868df2
--- /dev/null
+++ b/assets/frog/prompt.txt
@@ -0,0 +1 @@
+648464818: add a magic hat on frog head.
\ No newline at end of file
diff --git a/assets/girl_on_sun/girl_on_sun.png b/assets/girl_on_sun/girl_on_sun.png
new file mode 100644
index 0000000000000000000000000000000000000000..e5dba786c6595fa4b8f54bc6c42bb8c782dd8d79
--- /dev/null
+++ b/assets/girl_on_sun/girl_on_sun.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ec304a50b692e2b898b59b22c84cda84663738aacf5e9bf64cdfed1cde853e2a
+size 1589915
diff --git a/assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png b/assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..053e554a0f6a1b80544b9976d9044a5dfcb0cd62
--- /dev/null
+++ b/assets/girl_on_sun/image_edit_264eac8b-8b65-479c-9755-020a60880c37_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:800a5e5f953290472898247f974893506ca41a3c7acda02d7eb1a69844ad6d7c
+size 1096873
diff --git a/assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png b/assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png
new file mode 100644
index 0000000000000000000000000000000000000000..e7242fedc124316a07c4db4e86da33ec919587d3
--- /dev/null
+++ b/assets/girl_on_sun/mask_264eac8b-8b65-479c-9755-020a60880c37.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:9ebe3b43581718a525c258d7a1f28d7b4acc4e61150b008f5e909946717ce73f
+size 3907
diff --git a/assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png b/assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png
new file mode 100644
index 0000000000000000000000000000000000000000..29e66bd1828347199c6c39f0d0af6ab14c4728a3
--- /dev/null
+++ b/assets/girl_on_sun/masked_image_264eac8b-8b65-479c-9755-020a60880c37.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:a3c165dde2730c2648191e84fc2edb7d27babb8b80ffa159f567f2d670028b18
+size 1196415
diff --git a/assets/girl_on_sun/prompt.txt b/assets/girl_on_sun/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..404d411dedfaac4137e2e7cf15375cd38e64d626
--- /dev/null
+++ b/assets/girl_on_sun/prompt.txt
@@ -0,0 +1 @@
+648464818: add a butterfly fairy.
\ No newline at end of file
diff --git a/assets/logo_brushedit.png b/assets/logo_brushedit.png
new file mode 100644
index 0000000000000000000000000000000000000000..aa06fc15777df09bf6347760a6acc302a0a796bf
--- /dev/null
+++ b/assets/logo_brushedit.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f1a3d64b0b12dbbb0f71a1ebe318d284d2df258250054f2b3b95cf03399defdc
+size 215304
diff --git a/assets/mona_lisa/mona_lisa.jpeg b/assets/mona_lisa/mona_lisa.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..bbf46dd72efab36cbdb62fa5f75690b9dfe860a7
--- /dev/null
+++ b/assets/mona_lisa/mona_lisa.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:0d595dd22668a914d36a38e6f07e35799233d66305be3984ffa37df48d7fb27c
+size 195591
diff --git a/assets/olsen/image_edit_dcab0b6e-fa16-40a4-919d-df353d1491d4_0.png b/assets/olsen/image_edit_dcab0b6e-fa16-40a4-919d-df353d1491d4_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..14b56a07869cf44907ad0d960b9471d971a300ee
--- /dev/null
+++ b/assets/olsen/image_edit_dcab0b6e-fa16-40a4-919d-df353d1491d4_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c0eeb7c2d699ae0ad801509000d2a9bb33bdd9d0a5b4b46cd9081984c6a90e1b
+size 1172633
diff --git a/assets/olsen/mask_dcab0b6e-fa16-40a4-919d-df353d1491d4.png b/assets/olsen/mask_dcab0b6e-fa16-40a4-919d-df353d1491d4.png
new file mode 100644
index 0000000000000000000000000000000000000000..4684ecf54fa48f4993601dfbaae02ee127a62cb0
--- /dev/null
+++ b/assets/olsen/mask_dcab0b6e-fa16-40a4-919d-df353d1491d4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:1b05ddda5cb9ee6699e562658cebc24a5c19480efd2919266f261904e3082ae2
+size 20042
diff --git a/assets/olsen/masked_image_dcab0b6e-fa16-40a4-919d-df353d1491d4.png b/assets/olsen/masked_image_dcab0b6e-fa16-40a4-919d-df353d1491d4.png
new file mode 100644
index 0000000000000000000000000000000000000000..2207fbdb448b366e792880d568a217e8c332ed33
--- /dev/null
+++ b/assets/olsen/masked_image_dcab0b6e-fa16-40a4-919d-df353d1491d4.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffe313076e42e1291f9cdcbc7aecb0b9115422234e9a13b3906f6cda8d36e741
+size 678522
diff --git a/assets/olsen/olsen.jpeg b/assets/olsen/olsen.jpeg
new file mode 100644
index 0000000000000000000000000000000000000000..2cebe0e90b7d3f8a8005a3de1211503a6a3dc1c8
--- /dev/null
+++ b/assets/olsen/olsen.jpeg
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:627eebaa105097cb1661754f4f559ffb94e7502cb280def682bbead14bea2068
+size 180926
diff --git a/assets/olsen/prompt.txt b/assets/olsen/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..b3f74c90cdd99b65247f0fc7db9affe9ee25614a
--- /dev/null
+++ b/assets/olsen/prompt.txt
@@ -0,0 +1 @@
+816907019: replace the background to scene of Alice's adventures.
\ No newline at end of file
diff --git a/assets/pigeon_rm/image_edit_b5fef1c4-11cf-4eb0-8e47-1d08733b042d_2.png b/assets/pigeon_rm/image_edit_b5fef1c4-11cf-4eb0-8e47-1d08733b042d_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..c361000689be573a58abbf113d0be61fe1750e91
--- /dev/null
+++ b/assets/pigeon_rm/image_edit_b5fef1c4-11cf-4eb0-8e47-1d08733b042d_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:f0a3e4cb81568930a42b439e6a63cc2ff9858acb97d00509616e4382ab7bc6b3
+size 969436
diff --git a/assets/pigeon_rm/mask_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png b/assets/pigeon_rm/mask_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png
new file mode 100644
index 0000000000000000000000000000000000000000..04e82bea86678e7f17528b4cd7bf2fcd67ea42d5
--- /dev/null
+++ b/assets/pigeon_rm/mask_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:18db9511253b860fd044c25e19566cf2249a37d29f2072d8af1178adff30fdfd
+size 3231
diff --git a/assets/pigeon_rm/masked_image_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png b/assets/pigeon_rm/masked_image_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png
new file mode 100644
index 0000000000000000000000000000000000000000..d2b8af5ba45f3bfdee63ee90677a9d915db19d78
--- /dev/null
+++ b/assets/pigeon_rm/masked_image_b5fef1c4-11cf-4eb0-8e47-1d08733b042d.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5f5e421aed17ea6a547c6f9a92b97f7c962ca046eb8f35eb80e367553ed88cec
+size 786446
diff --git a/assets/pigeon_rm/pigeon.png b/assets/pigeon_rm/pigeon.png
new file mode 100644
index 0000000000000000000000000000000000000000..15e67eae9009710afce5cc4138505c9969b125d7
--- /dev/null
+++ b/assets/pigeon_rm/pigeon.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ca6f75999eb2063be4104f307b73f20d901f3faa184b978910fa9ae062b06088
+size 1233814
diff --git a/assets/pigeon_rm/prompt.txt b/assets/pigeon_rm/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..3193bdc876f20b9944d5131f8135cbd7d4b88f18
--- /dev/null
+++ b/assets/pigeon_rm/prompt.txt
@@ -0,0 +1 @@
+648464818: remove the pigeon.
\ No newline at end of file
diff --git a/assets/spider_man_cat_ears/image_edit_ab41476a-2613-403a-90fa-062ac5785679_3.png b/assets/spider_man_cat_ears/image_edit_ab41476a-2613-403a-90fa-062ac5785679_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..449f4f383c70293a62e539373a6db3d1e0923abc
--- /dev/null
+++ b/assets/spider_man_cat_ears/image_edit_ab41476a-2613-403a-90fa-062ac5785679_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c409a42a2fbc34874af9fd516b3ef869d02081b034040afa1cc51baef58341d8
+size 665533
diff --git a/assets/spider_man_cat_ears/mask_ab41476a-2613-403a-90fa-062ac5785679.png b/assets/spider_man_cat_ears/mask_ab41476a-2613-403a-90fa-062ac5785679.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_cat_ears/mask_ab41476a-2613-403a-90fa-062ac5785679.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_cat_ears/masked_image_ab41476a-2613-403a-90fa-062ac5785679.png b/assets/spider_man_cat_ears/masked_image_ab41476a-2613-403a-90fa-062ac5785679.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_cat_ears/masked_image_ab41476a-2613-403a-90fa-062ac5785679.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_cat_ears/prompt.txt b/assets/spider_man_cat_ears/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4376b46e645a54097563ea58fefb4d51fdfa9fe6
--- /dev/null
+++ b/assets/spider_man_cat_ears/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: cat ears.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_cat_ears/spider_man.png b/assets/spider_man_cat_ears/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_cat_ears/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_cowboy_hat/image_edit_c5f992d9-8829-474f-b0de-4f51204a4e85_0.png b/assets/spider_man_cowboy_hat/image_edit_c5f992d9-8829-474f-b0de-4f51204a4e85_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..bf3508dc802f05a125ddc4e861da9c41663e2858
--- /dev/null
+++ b/assets/spider_man_cowboy_hat/image_edit_c5f992d9-8829-474f-b0de-4f51204a4e85_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:de50240eb80165fd67013b49c078fbf819710e90352abf93f5f0b5e30cd7d991
+size 670373
diff --git a/assets/spider_man_cowboy_hat/mask_c5f992d9-8829-474f-b0de-4f51204a4e85.png b/assets/spider_man_cowboy_hat/mask_c5f992d9-8829-474f-b0de-4f51204a4e85.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_cowboy_hat/mask_c5f992d9-8829-474f-b0de-4f51204a4e85.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_cowboy_hat/masked_image_c5f992d9-8829-474f-b0de-4f51204a4e85.png b/assets/spider_man_cowboy_hat/masked_image_c5f992d9-8829-474f-b0de-4f51204a4e85.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_cowboy_hat/masked_image_c5f992d9-8829-474f-b0de-4f51204a4e85.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_cowboy_hat/prompt.txt b/assets/spider_man_cowboy_hat/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..bf31117d94af5d41c1e86ce7eb1cfc9fc5af29a7
--- /dev/null
+++ b/assets/spider_man_cowboy_hat/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: cowboy hat.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_cowboy_hat/spider_man.png b/assets/spider_man_cowboy_hat/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_cowboy_hat/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_crown/image_edit_c34eea03-b431-4b43-95c8-359e5e8d1d3f_1.png b/assets/spider_man_crown/image_edit_c34eea03-b431-4b43-95c8-359e5e8d1d3f_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..aec876c4547c8e62283e3d0ad26e907cad470257
--- /dev/null
+++ b/assets/spider_man_crown/image_edit_c34eea03-b431-4b43-95c8-359e5e8d1d3f_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:019bfdb7492ee784b5f0b0044af10f4ec1d2d37d5bac5d113691f30271135574
+size 671956
diff --git a/assets/spider_man_crown/mask_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png b/assets/spider_man_crown/mask_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_crown/mask_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_crown/masked_image_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png b/assets/spider_man_crown/masked_image_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_crown/masked_image_c34eea03-b431-4b43-95c8-359e5e8d1d3f.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_crown/prompt.txt b/assets/spider_man_crown/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f298e5630ede36badc92fefb483b1328e98db73
--- /dev/null
+++ b/assets/spider_man_crown/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: crown.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_crown/spider_man.png b/assets/spider_man_crown/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_crown/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_curl/image_edit_6f0071ee-fc74-4602-a2a0-7b1970372dda_3.png b/assets/spider_man_curl/image_edit_6f0071ee-fc74-4602-a2a0-7b1970372dda_3.png
new file mode 100644
index 0000000000000000000000000000000000000000..1d81e321b69baaa6b48ca6c6e3053bf2702368bd
--- /dev/null
+++ b/assets/spider_man_curl/image_edit_6f0071ee-fc74-4602-a2a0-7b1970372dda_3.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:5b95eee6a8174797570d4254c039918b6ef6610851c1722728a30fba3297d137
+size 673262
diff --git a/assets/spider_man_curl/mask_6f0071ee-fc74-4602-a2a0-7b1970372dda.png b/assets/spider_man_curl/mask_6f0071ee-fc74-4602-a2a0-7b1970372dda.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_curl/mask_6f0071ee-fc74-4602-a2a0-7b1970372dda.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_curl/masked_image_6f0071ee-fc74-4602-a2a0-7b1970372dda.png b/assets/spider_man_curl/masked_image_6f0071ee-fc74-4602-a2a0-7b1970372dda.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_curl/masked_image_6f0071ee-fc74-4602-a2a0-7b1970372dda.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_curl/prompt.txt b/assets/spider_man_curl/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..6f298e5630ede36badc92fefb483b1328e98db73
--- /dev/null
+++ b/assets/spider_man_curl/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: crown.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_curl/spider_man.png b/assets/spider_man_curl/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_curl/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_devil_horn/image_edit_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034_1.png b/assets/spider_man_devil_horn/image_edit_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034_1.png
new file mode 100644
index 0000000000000000000000000000000000000000..8f1c6deec7fdc06714173c2536ea89109b1ebd69
--- /dev/null
+++ b/assets/spider_man_devil_horn/image_edit_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034_1.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ffe8b21fb4aa65acd9f454728a273273ef0cdd31d59c7611f793cae77a3b8c64
+size 666989
diff --git a/assets/spider_man_devil_horn/mask_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png b/assets/spider_man_devil_horn/mask_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_devil_horn/mask_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_devil_horn/masked_image_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png b/assets/spider_man_devil_horn/masked_image_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_devil_horn/masked_image_8fd388c3-1e7e-4ae7-8dfe-a516b85d4034.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_devil_horn/prompt.txt b/assets/spider_man_devil_horn/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..4376b46e645a54097563ea58fefb4d51fdfa9fe6
--- /dev/null
+++ b/assets/spider_man_devil_horn/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: cat ears.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_devil_horn/spider_man.png b/assets/spider_man_devil_horn/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_devil_horn/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png b/assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png
new file mode 100644
index 0000000000000000000000000000000000000000..4cc99a5de42129afb8df8a19a87a9401aacc8d56
--- /dev/null
+++ b/assets/spider_man_rm/image_edit_a5d410e6-8e8d-432f-8144-defbc3e1eae9_0.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:71ef3385a7ae94b9a33e6bfe7af91a1a7288616ef66de9646faab9d4661cafa8
+size 668965
diff --git a/assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png b/assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_rm/mask_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png b/assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_rm/masked_image_a5d410e6-8e8d-432f-8144-defbc3e1eae9.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_rm/prompt.txt b/assets/spider_man_rm/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..212b9442a341d9ec92847a7d30fee856416d2fd0
--- /dev/null
+++ b/assets/spider_man_rm/prompt.txt
@@ -0,0 +1,2 @@
+642087011: remove the Christmas hat.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_rm/spider_man.png b/assets/spider_man_rm/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_rm/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/spider_man_witch_hat/image_edit_5199d448-73b9-4423-a1a9-7d50c8f5191a_2.png b/assets/spider_man_witch_hat/image_edit_5199d448-73b9-4423-a1a9-7d50c8f5191a_2.png
new file mode 100644
index 0000000000000000000000000000000000000000..68de731177bd99cf54d09523b37e38d641574905
--- /dev/null
+++ b/assets/spider_man_witch_hat/image_edit_5199d448-73b9-4423-a1a9-7d50c8f5191a_2.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:903043f1b5870c7c8d5a6a4cbe8d98e2100820cae890f6d90aa8dd6f24abe7aa
+size 664358
diff --git a/assets/spider_man_witch_hat/mask_5199d448-73b9-4423-a1a9-7d50c8f5191a.png b/assets/spider_man_witch_hat/mask_5199d448-73b9-4423-a1a9-7d50c8f5191a.png
new file mode 100644
index 0000000000000000000000000000000000000000..6fbef0e584a8922d8f0106902c71a14f66b7394e
--- /dev/null
+++ b/assets/spider_man_witch_hat/mask_5199d448-73b9-4423-a1a9-7d50c8f5191a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c1ae5b65fc4a8cba0ec2fbfe713cfd82dec2a5cf098ce5d840670836b742b15d
+size 2200
diff --git a/assets/spider_man_witch_hat/masked_image_5199d448-73b9-4423-a1a9-7d50c8f5191a.png b/assets/spider_man_witch_hat/masked_image_5199d448-73b9-4423-a1a9-7d50c8f5191a.png
new file mode 100644
index 0000000000000000000000000000000000000000..d7e969f9f7c22fa34f6bb2f1d97afcc70de6575e
--- /dev/null
+++ b/assets/spider_man_witch_hat/masked_image_5199d448-73b9-4423-a1a9-7d50c8f5191a.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:6ffb647bfea527d9e91d39279d4fb375b21e8af2bd2d9e5567a8a7351ee069e1
+size 588127
diff --git a/assets/spider_man_witch_hat/prompt.txt b/assets/spider_man_witch_hat/prompt.txt
new file mode 100644
index 0000000000000000000000000000000000000000..ebc0c0ce296aa5ab24a198034749a05187d2145c
--- /dev/null
+++ b/assets/spider_man_witch_hat/prompt.txt
@@ -0,0 +1,3 @@
+seed: 642087011
+target prompt: witch hat.
+blending: False.
\ No newline at end of file
diff --git a/assets/spider_man_witch_hat/spider_man.png b/assets/spider_man_witch_hat/spider_man.png
new file mode 100644
index 0000000000000000000000000000000000000000..d16cd2fb2a9e5bbdcf25420a606c4d9d2082221f
--- /dev/null
+++ b/assets/spider_man_witch_hat/spider_man.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ae10561e698a1df0bc6265a0dbfcfef8cb338bc9c42cd70fd6388889196fe2c5
+size 787117
diff --git a/assets/sunflower_girl/prompt.txt b/assets/sunflower_girl/prompt.txt
index ffb1d2d0451501acbd382389bcfeb7e9b0e5f8bb..bef43ea31f5782f22a45d7e33851e778b7ec7e06 100644
--- a/assets/sunflower_girl/prompt.txt
+++ b/assets/sunflower_girl/prompt.txt
@@ -1 +1 @@
-648464818: add a wreath on head..
\ No newline at end of file
+648464818: add a wreath on head.
\ No newline at end of file
diff --git a/assets/upload.png b/assets/upload.png
new file mode 100644
index 0000000000000000000000000000000000000000..29c81ece9b0aefa7dcedbc06b31bc9e5d1020cb5
--- /dev/null
+++ b/assets/upload.png
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:c98867f5c90c2ee87c2d7e5713183df39f87ea368f63e2a2f901de35a4962fae
+size 4687
diff --git a/requirements.txt b/requirements.txt
index 577aca5b55355517dee82e0f000b8adda9ac6cc9..08d8026068c3a547e31be9478b0306c650bd619e 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,8 +1,8 @@
-torch
-torchvision
-torchaudio
-transformers>=4.25.1
-gradio==4.38.1
+torch==2.0.1
+torchvision==0.15.2
+torchaudio==2.0.2
+numpy==1.24.0
+transformers==4.46.3
ftfy
tensorboard
datasets
@@ -16,5 +16,6 @@ torchmetrics
open-clip-torch
clip
segment_anything
-git+https://github.com/liyaowei-stu/BrushEdit.git
-git+https://github.com/IDEA-Research/Grounded-Segment-Anything/tree/main/GroundingDINO
\ No newline at end of file
+openai
+git+https://github.com/TencentARC/BrushEdit?tab=readme-ov-file
+git+https://github.com/IDEA-Research/GroundingDINO.git
\ No newline at end of file