diffusers-pipeline (#2)
- Use Diffusers pipeline. (2b69f2a727338ff151654eecfbe4fbf7d7541c2c)
- Remove constraint on transformers. (2e3e5e89399c1d39c86dab315217faebf79c3ffe)
Co-authored-by: Pedro Cuenca <pcuenq@users.noreply.huggingface.co>
- edit_app.py +11 -89
- requirements.txt +3 -2
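
In short: the app no longer loads the CompVis checkpoint and runs its own k-diffusion sampling loop; it calls the Diffusers StableDiffusionInstructPix2PixPipeline instead, which handles image encoding, the dual classifier-free guidance, and decoding internally. Below is a minimal sketch of the new path using only the calls that appear in the diff; the model id, guidance defaults, seed, and example assets come from the app itself, while the step count and output filename are arbitrary choices for the sketch.

import torch
from PIL import Image
from diffusers import StableDiffusionInstructPix2PixPipeline

# Load the pipeline once, in fp16 on the GPU and with the safety checker disabled,
# as main() does in the updated app.
pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(
    "timbrooks/instruct-pix2pix", torch_dtype=torch.float16, safety_checker=None
).to("cuda")

# Run one edit. guidance_scale is the Text CFG weight, image_guidance_scale the Image CFG weight.
generator = torch.manual_seed(1371)
input_image = Image.open("imgs/example.jpg").convert("RGB")
edited_image = pipe(
    "make him wear a beanie", image=input_image,
    guidance_scale=7.5, image_guidance_scale=1.5,
    num_inference_steps=20, generator=generator,  # step count chosen arbitrarily for this sketch
).images[0]
edited_image.save("edited.jpg")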
edit_app.py
CHANGED
@@ -2,30 +2,18 @@ from __future__ import annotations
 
 import math
 import random
-import sys
 
-import einops
 import gradio as gr
-import k_diffusion as K
-import numpy as np
 import torch
-import torch.nn as nn
-from einops import rearrange
-from omegaconf import OmegaConf
 from PIL import Image, ImageOps
-from
-from huggingface_hub import hf_hub_download
-
-sys.path.append("./stable_diffusion")
-
-from stable_diffusion.ldm.util import instantiate_from_config
+from diffusers import StableDiffusionInstructPix2PixPipeline
 
 
 help_text = """
 If you're not getting what you want, there may be a few reasons:
 1. Is the image not changing enough? Your Image CFG weight may be too high. This value dictates how similar the output should be to the input. It's possible your edit requires larger changes from the original image, and your Image CFG weight isn't allowing that. Alternatively, your Text CFG weight may be too low. This value dictates how much to listen to the text instruction. The default Image CFG of 1.5 and Text CFG of 7.5 are a good starting point, but aren't necessarily optimal for each edit. Try:
 * Decreasing the Image CFG weight, or
-*
+* Increasing the Text CFG weight, or
 2. Conversely, is the image changing too much, such that the details in the original image aren't preserved? Try:
 * Increasing the Image CFG weight, or
 * Decreasing the Text CFG weight
@@ -56,55 +44,10 @@ example_instructions = [
 "make him wear a beanie",
 ]
 
-
-class CFGDenoiser(nn.Module):
-    def __init__(self, model):
-        super().__init__()
-        self.inner_model = model
-
-    def forward(self, z, sigma, cond, uncond, text_cfg_scale, image_cfg_scale):
-        cfg_z = einops.repeat(z, "1 ... -> n ...", n=3)
-        cfg_sigma = einops.repeat(sigma, "1 ... -> n ...", n=3)
-        cfg_cond = {
-            "c_crossattn": [torch.cat([cond["c_crossattn"][0], uncond["c_crossattn"][0], uncond["c_crossattn"][0]])],
-            "c_concat": [torch.cat([cond["c_concat"][0], cond["c_concat"][0], uncond["c_concat"][0]])],
-        }
-        out_cond, out_img_cond, out_uncond = self.inner_model(cfg_z, cfg_sigma, cond=cfg_cond).chunk(3)
-        return out_uncond + text_cfg_scale * (out_cond - out_img_cond) + image_cfg_scale * (out_img_cond - out_uncond)
-
-
-def load_model_from_config(config, ckpt, vae_ckpt=None, verbose=False):
-    print(f"Loading model from {ckpt}")
-    pl_sd = torch.load(ckpt, map_location="cpu")
-    if "global_step" in pl_sd:
-        print(f"Global Step: {pl_sd['global_step']}")
-    sd = pl_sd["state_dict"]
-    if vae_ckpt is not None:
-        print(f"Loading VAE from {vae_ckpt}")
-        vae_sd = torch.load(vae_ckpt, map_location="cpu")["state_dict"]
-        sd = {
-            k: vae_sd[k[len("first_stage_model.") :]] if k.startswith("first_stage_model.") else v
-            for k, v in sd.items()
-        }
-    model = instantiate_from_config(config.model)
-    m, u = model.load_state_dict(sd, strict=False)
-    if len(m) > 0 and verbose:
-        print("missing keys:")
-        print(m)
-    if len(u) > 0 and verbose:
-        print("unexpected keys:")
-        print(u)
-    return model
-
+model_id = "timbrooks/instruct-pix2pix"
 
 def main():
-
-    config = OmegaConf.load("configs/generate.yaml")
-    model = load_model_from_config(config, ckpt)
-    model.eval().cuda()
-    model_wrap = K.external.CompVisDenoiser(model)
-    model_wrap_cfg = CFGDenoiser(model_wrap)
-    null_token = model.get_learned_conditioning([""])
+    pipe = StableDiffusionInstructPix2PixPipeline.from_pretrained(model_id, torch_dtype=torch.float16, safety_checker=None).to("cuda")
     example_image = Image.open("imgs/example.jpg").convert("RGB")
 
     def load_example(
@@ -151,34 +94,13 @@ def main():
         if instruction == "":
             return [input_image, seed]
 
-
-
-
-
-
-
-
-            uncond = {}
-            uncond["c_crossattn"] = [null_token]
-            uncond["c_concat"] = [torch.zeros_like(cond["c_concat"][0])]
-
-            sigmas = model_wrap.get_sigmas(steps)
-
-            extra_args = {
-                "cond": cond,
-                "uncond": uncond,
-                "text_cfg_scale": text_cfg_scale,
-                "image_cfg_scale": image_cfg_scale,
-            }
-            torch.manual_seed(seed)
-            z = torch.randn_like(cond["c_concat"][0]) * sigmas[0]
-            z = K.sampling.sample_euler_ancestral(model_wrap_cfg, z, sigmas, extra_args=extra_args)
-            x = model.decode_first_stage(z)
-            x = torch.clamp((x + 1.0) / 2.0, min=0.0, max=1.0)
-            x = 255.0 * rearrange(x, "1 c h w -> h w c")
-            edited_image = Image.fromarray(x.type(torch.uint8).cpu().numpy())
-
-            return [seed, text_cfg_scale, image_cfg_scale, edited_image]
+        generator = torch.manual_seed(seed)
+        edited_image = pipe(
+            instruction, image=input_image,
+            guidance_scale=text_cfg_scale, image_guidance_scale=image_cfg_scale,
+            num_inference_steps=steps, generator=generator,
+        ).images[0]
+        return [seed, text_cfg_scale, image_cfg_scale, edited_image]
 
     def reset():
         return [0, "Randomize Seed", 1371, "Fix CFG", 7.5, 1.5, None]
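For reference, the deleted CFGDenoiser combined three denoiser outputs per step (conditioned on text and image, conditioned on the image only, and unconditioned); the pipeline's guidance_scale and image_guidance_scale arguments now control the corresponding guidance internally. A small sketch of that combination, with placeholder tensors standing in for the three noise predictions:

import torch

# Placeholder predictions for one denoising step; shapes are illustrative only.
out_cond = torch.randn(1, 4, 64, 64)      # conditioned on text + image
out_img_cond = torch.randn(1, 4, 64, 64)  # conditioned on image only
out_uncond = torch.randn(1, 4, 64, 64)    # unconditioned

text_cfg_scale, image_cfg_scale = 7.5, 1.5
# The same combination the removed CFGDenoiser.forward returned:
guided = (
    out_uncond
    + text_cfg_scale * (out_cond - out_img_cond)
    + image_cfg_scale * (out_img_cond - out_uncond)
)
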
requirements.txt
CHANGED
@@ -15,7 +15,7 @@ test-tube>=0.7.5
 streamlit>=0.73.1
 einops==0.3.0
 torch-fidelity==0.3.0
-transformers
+transformers
 torchmetrics==0.6.0
 kornia==0.6
 -e git+https://github.com/CompVis/taming-transformers.git@master#egg=taming-transformers
@@ -23,4 +23,5 @@ kornia==0.6
 huggingface-hub
 openai
 seaborn
-git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/crowsonkb/k-diffusion.git
+git+https://github.com/huggingface/diffusers