Spaces: Running on Zero

refract

edit_app.py  (+66 -86)  CHANGED
--- a/edit_app.py
+++ b/edit_app.py
@@ -1,8 +1,9 @@
 from __future__ import annotations
 
 import math
-import random
 from glob import glob
+from functools import partial
+import random
 
 import gradio as gr
 import torch
@@ -11,7 +12,58 @@ from datasets import load_dataset
 from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler
 
 
-help_text = """
+def generate(
+    input_image: Image.Image,
+    instruction: str,
+    steps: int,
+    randomize_seed: bool,
+    seed: int,
+    randomize_cfg: bool,
+    text_cfg_scale: float,
+    image_cfg_scale: float,
+    pipe: StableDiffusionInstructPix2PixPipeline
+):
+    seed = random.randint(0, 100000) if randomize_seed else seed
+    text_cfg_scale = round(random.uniform(6.0, 9.0), ndigits=2) if randomize_cfg else text_cfg_scale
+    image_cfg_scale = round(random.uniform(1.2, 1.8), ndigits=2) if randomize_cfg else image_cfg_scale
+
+    width, height = input_image.size
+    factor = 512 / max(width, height)
+    factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
+    width = int((width * factor) // 64) * 64
+    height = int((height * factor) // 64) * 64
+    input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
+
+    if instruction == "":
+        return [seed, text_cfg_scale, image_cfg_scale, input_image]
+
+    generator = torch.manual_seed(seed)
+    edited_image = pipe(
+        instruction, image=input_image,
+        guidance_scale=text_cfg_scale, image_guidance_scale=image_cfg_scale,
+        num_inference_steps=steps, generator=generator,
+    ).images[0]
+    return [seed, text_cfg_scale, image_cfg_scale, edited_image]
+
+
+def show_image(image_name, image_options):
+    if image_name is None:
+        return
+
+    return image_options[image_name]
+
+
+def reset():
+    return [0, "Randomize Seed", 1371, "Fix CFG", 7.5, 1.5, None, None, None, ""]
+
+
+def sample(dataset):
+    sample_id = random.choice(list(range(len(dataset["train"]))))
+    sample = dataset["train"][sample_id]
+    return [sample["input_image"], sample["output_image"], sample["edit"], sample["inverse_edit"]]
+
+
+HELP_TEXT = """
 If you're not getting what you want, there may be a few reasons:
 1. Is the image not changing enough? Your Image CFG weight may be too high. This value dictates how similar the output should be to the input. It's possible your edit requires larger changes from the original image, and your Image CFG weight isn't allowing that. Alternatively, your Text CFG weight may be too low. This value dictates how much to listen to the text instruction. The default Image CFG of 1.5 and Text CFG of 7.5 are a good starting point, but aren't necessarily optimal for each edit. Try:
     * Decreasing the Image CFG weight, or
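A quick worked pass through the resize arithmetic in the new module-level generate(), using an assumed 1000x750 input (the numbers are illustrative, not from the commit). The code scales the long side toward 512 and snaps both sides to multiples of 64 before handing the image to the pipeline:

import math

width, height = 1000, 750                                                        # assumed example size
factor = 512 / max(width, height)                                                # 0.512: shrink the long side toward 512
factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)   # still 0.512: round the short side up to a multiple of 64
width = int((width * factor) // 64) * 64                                         # 512
height = int((height * factor) // 64) * 64                                       # 384
# ImageOps.fit(input_image, (512, 384), ...) then crops and resizes to exactly those dimensions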
@@ -27,29 +79,8 @@ If you're not getting what you want, there may be a few reasons:
 """
 
 
-example_instructions = [
-    "Make it a picasso painting",
-    "as if it were by modigliani",
-    "convert to a bronze statue",
-    "Turn it into an anime.",
-    "have it look like a graphic novel",
-    "make him gain weight",
-    "what would he look like bald?",
-    "Have him smile",
-    "Put him in a cocktail party.",
-    "move him at the beach.",
-    "add dramatic lighting",
-    "Convert to black and white",
-    "What if it were snowing?",
-    "Give him a leather jacket",
-    "Turn him into a cyborg!",
-    "make him wear a beanie",
-]
-
-# model_id = "timbrooks/instruct-pix2pix"
-model_id = "MudeHui/ip2p-warp-gpt4v"
-
 def main():
+    model_id = "MudeHui/ip2p-warp-gpt4v"
     if torch.cuda.is_available():
         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None)
         pipe = pipe.to('cuda')
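For orientation, the two knobs described in HELP_TEXT map onto the guidance arguments that generate() passes to the pipeline: Text CFG is guidance_scale and Image CFG is image_guidance_scale. Below is a self-contained sketch of a single edit with the weights nudged the way the help text suggests when the image is not changing enough; the input path, prompt, and exact values are assumptions for illustration, not part of this commit:

import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline, EulerAncestralDiscreteScheduler

pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "MudeHui/ip2p-warp-gpt4v",
    torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
    safety_checker=None,
)
if torch.cuda.is_available():
    pipe = pipe.to("cuda")
pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)

input_image = Image.open("imgs/example.png").convert("RGB").resize((512, 512))  # assumed local file
edited = pipe(
    "Turn it into an anime.",          # one of the removed example_instructions
    image=input_image,
    guidance_scale=8.5,                # Text CFG: raise it to follow the instruction more strongly
    image_guidance_scale=1.2,          # Image CFG: lower it to allow bigger changes to the input
    num_inference_steps=20,
).images[0]
edited.save("edited.png")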
@@ -57,59 +88,8 @@ def main():
         pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float, safety_checker=None)
     pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
 
-    def generate(
-        input_image: Image.Image,
-        instruction: str,
-        steps: int,
-        randomize_seed: bool,
-        seed: int,
-        randomize_cfg: bool,
-        text_cfg_scale: float,
-        image_cfg_scale: float,
-    ):
-        seed = random.randint(0, 100000) if randomize_seed else seed
-        text_cfg_scale = round(random.uniform(6.0, 9.0), ndigits=2) if randomize_cfg else text_cfg_scale
-        image_cfg_scale = round(random.uniform(1.2, 1.8), ndigits=2) if randomize_cfg else image_cfg_scale
-
-        width, height = input_image.size
-        factor = 512 / max(width, height)
-        factor = math.ceil(min(width, height) * factor / 64) * 64 / min(width, height)
-        width = int((width * factor) // 64) * 64
-        height = int((height * factor) // 64) * 64
-        input_image = ImageOps.fit(input_image, (width, height), method=Image.Resampling.LANCZOS)
-
-        if instruction == "":
-            return [input_image, seed]
-
-        generator = torch.manual_seed(seed)
-        edited_image = pipe(
-            instruction, image=input_image,
-            guidance_scale=text_cfg_scale, image_guidance_scale=image_cfg_scale,
-            num_inference_steps=steps, generator=generator,
-        ).images[0]
-        return [seed, text_cfg_scale, image_cfg_scale, edited_image]
-
-    def reset():
-        return [0, "Randomize Seed", 1371, "Fix CFG", 7.5, 1.5, None]
-
     image_options = {path.split("/")[-1].split(".")[0]: path for path in sorted(glob("imgs/*png"))}
 
-    def show_image(image_name):
-        # Retrieve the image file path from the dictionary based on the selected name
-        return image_options[image_name]
-
-    dataset = load_dataset("UCSC-VLAA/HQ-Edit-data-demo")
-
-    def sample():
-        sample_id = random.choice(list(range(len(dataset["train"]))))
-        sample = dataset["train"][sample_id]
-        return [sample["input_image"], sample["output_image"], sample["edit"], sample["inverse_edit"]]
-
-    def show_large_image(image_info):
-        # Returns the PIL image and caption for larger display
-        # return image_info['image'], image_info['caption']
-        return image_info
-
     with gr.Blocks() as demo:
         gr.HTML("""<h1 style="font-weight: 900; margin-bottom: 7px;">
     HQ-Edit: A High-Quality and High-Coverage Dataset for General Image Editing
@@ -133,8 +113,6 @@ def main():
         with gr.Row():
             input_image = gr.Image(label="Input Image", type="pil", interactive=True, height=512, width=512)
             edited_image = gr.Image(label=f"Edited Image", type="pil", interactive=False, height=512, width=512)
-            # input_image.style(height=512, width=512)
-            # edited_image.style(height=512, width=512)
 
         with gr.Row():
             steps = gr.Number(value=20, precision=0, label="Steps", interactive=True)
@@ -156,26 +134,23 @@ def main():
             text_cfg_scale = gr.Number(value=7.0, label=f"Text CFG", interactive=True)
             image_cfg_scale = gr.Number(value=1.5, label=f"Image CFG", interactive=True)
 
-        gr.Markdown(help_text)
+        gr.Markdown(HELP_TEXT)
 
         with gr.Row():
             gr.Markdown("## Dataset Preview")
             sample_button = gr.Button("See Another Sample")
 
         with gr.Row():
-            # Set up the Gallery component with a specific number of columns
-            # gallery = gr.Gallery(value=image_data, label="Image Gallery", type="pil", columns=2)
-            # Display for larger image
             input_image_preview = gr.Image(label="Input Image", type="pil", height=512, width=512)
             output_image_preview = gr.Image(label="Output Image", type="pil", height=512, width=512)
 
             edit_text = gr.Textbox(label="Edit Instruction")
             inv_edit_text = gr.Textbox(label="Inverse Edit Instruction")
 
-
+        generate_func = partial(generate, pipe=pipe)
 
         generate_button.click(
-            fn=generate,
+            fn=generate_func,
             inputs=[
                 input_image,
                 instruction,
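The added generate_func line is the heart of the refactor: generate() now lives at module level and takes the pipeline explicitly, and functools.partial pre-binds it before the click handler is registered, so the callback signature Gradio sees contains only UI inputs. A minimal runnable sketch of the same pattern; greet, greeting, and the components here are toy stand-ins, not part of this Space:

from functools import partial

import gradio as gr

def greet(name, greeting):
    # stand-in for generate(...): the last argument is not a UI component
    return f"{greeting}, {name}!"

with gr.Blocks() as demo:
    name = gr.Textbox(label="Name")
    out = gr.Textbox(label="Output")
    btn = gr.Button("Go")
    # pre-bind the non-UI argument, mirroring generate_func = partial(generate, pipe=pipe)
    btn.click(fn=partial(greet, greeting="Hello"), inputs=[name], outputs=[out])

demo.launch()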
@@ -191,15 +166,20 @@ def main():
         reset_button.click(
             fn=reset,
             inputs=[],
-            outputs=[steps, randomize_seed, seed, randomize_cfg, text_cfg_scale, image_cfg_scale, edited_image],
+            outputs=[steps, randomize_seed, seed, randomize_cfg, text_cfg_scale, image_cfg_scale, input_image, edited_image, dropdown, instruction],
         )
 
+        show_image_func = partial(show_image, image_options=image_options)
+        dropdown.change(show_image_func, inputs=dropdown, outputs=input_image)
+
+        dataset = load_dataset("UCSC-VLAA/HQ-Edit-data-demo")
+        sample_func = partial(sample, dataset=dataset)
         sample_button.click(
-            fn=sample,
+            fn=sample_func,
             inputs=[],
             outputs=[input_image_preview, output_image_preview, edit_text, inv_edit_text]
         )
-
+
     demo.queue()
     demo.launch(share=True, max_threads=1)
 
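The dataset-preview wiring added above pulls one random pair from UCSC-VLAA/HQ-Edit-data-demo per click, again with partial() binding the loaded dataset. A hedged standalone sketch of what sample() hands to the preview widgets; the field names come from the code above, and saving to disk assumes the image columns decode to PIL images (consistent with gr.Image(type="pil") in the UI):

import random

from datasets import load_dataset

dataset = load_dataset("UCSC-VLAA/HQ-Edit-data-demo")
row = dataset["train"][random.randrange(len(dataset["train"]))]

print(row["edit"])          # forward instruction shown in the "Edit Instruction" box
print(row["inverse_edit"])  # instruction that would undo the edit
row["input_image"].save("sample_input.png")
row["output_image"].save("sample_output.png")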