import gradio as gr
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
from diffusers import LMSDiscreteScheduler
from share_btn import community_icon_html, loading_icon_html
from tqdm.auto import tqdm
from PIL import Image
# PARAMS
MANUAL_SEED = 42
HEIGHT = 512
WIDTH = 512
NUM_INFERENCE_STEPS = 50  # assumed number of denoising steps (the common Stable Diffusion default)
ETA = 1e-1  # step size for the measurement-consistency gradient update
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipe.to(torch_device)
# 1. Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
# 2. Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
# 3. The UNet model for generating the latents.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
scheduler = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
# scheduler.timesteps and scheduler.num_inference_steps (both used in predict) are only defined after set_timesteps.
scheduler.set_timesteps(NUM_INFERENCE_STEPS)
vae = vae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)
generator = torch.manual_seed(MANUAL_SEED)  # Seed generator to create the initial latent noise
def read_content(file_path: str) -> str:
    """Read the content of the target file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content
# def predict(dict, prompt=""):
#     init_image = dict["image"].convert("RGB").resize((512, 512))
#     mask = dict["mask"].convert("RGB").resize((512, 512))
#     output = pipe(prompt=prompt, image=init_image, mask_image=mask, guidance_scale=7.5)
#     return output.images[0], gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
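# The commented-out predict above delegates inpainting to the stock StableDiffusionPipeline.
# The predict below instead runs a custom sampling loop: at every denoising step it estimates
# the clean latent, decodes it, measures the mismatch against the known (unmasked) pixels, and
# nudges the next latent along the gradient of that error with step size ETA.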
def predict(dict, prompt=""):
    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""], padding="max_length", max_length=max_length, return_tensors="pt"
    )
    with torch.no_grad():
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
    init_image = dict["image"].convert("RGB").resize((512, 512))
    mask = dict["mask"].convert("RGB").resize((512, 512))
    # convert input image to a float32 tensor in [-1, 1]
    init_image = torch.tensor(2 * (np.asarray(init_image) / 255) - 1, dtype=torch.float32, device=torch_device)
    # mask in [0, 1]: 1 where the user erased (to be inpainted), 0 where the original pixels are kept
    mask = torch.tensor(np.asarray(mask) / 255, dtype=torch.float32, device=torch_device)
    # add one dimension for the batch and bring channels first
    init_image = init_image.permute(2, 0, 1).unsqueeze(0)
    mask = mask.permute(2, 0, 1).unsqueeze(0)
    latents = torch.randn(
        (1, unet.in_channels, HEIGHT // 8, WIDTH // 8),
        generator=generator,
    )
    latents = latents.to(torch_device)
    # scale the initial noise by the scheduler's starting sigma, as LMSDiscreteScheduler expects
    latents = latents * scheduler.init_noise_sigma
    for i, t in enumerate(tqdm(scheduler.timesteps)):
        z_t = torch.clone(latents.detach())
        z_t.requires_grad = True
        # scale the latent input to the range the scheduler expects at this timestep
        latent_model_input = scheduler.scale_model_input(z_t, t)
        # predict the noise residual (conditioned on the empty-prompt embedding)
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=uncond_embeddings).sample
        # compute z_0 using Tweedie's formula
        indx = scheduler.num_inference_steps - i - 1
        z_0 = (1 / torch.sqrt(scheduler.alphas_cumprod[indx])) \
            * (z_t + (1 - scheduler.alphas_cumprod[indx]) * noise_pred)
        # pass through the decoder
        z_0 = 1 / 0.18215 * z_0
        image_pred = vae.decode(z_0).sample
        # clip to the valid image range
        image_pred = torch.clamp(image_pred, min=-1.0, max=1.0)
        inpainted_image = (1 - mask) * init_image + mask * image_pred
        # data-consistency error on the known (unmasked) pixels
        error_measurement = (1 / 2) * torch.linalg.norm((1 - mask) * (init_image - image_pred)) ** 2
        # TODO(giannisdaras): add LPIPS?
        error = error_measurement
        gradients = torch.autograd.grad(error, inputs=z_t)[0]
        # compute the previous noisy sample x_t -> x_t-1, then nudge it along the error gradient
        z_t_next = scheduler.step(noise_pred, t, z_t).prev_sample
        latents = z_t_next - ETA * gradients
    # scale and decode the final image latents with the vae
    latents = 1 / 0.18215 * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    # three return values, matching the three outputs wired up in btn.click below
    return images[0], gr.update(visible=True), gr.update(visible=True)
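# A minimal sketch of calling predict outside the UI, assuming local files "photo.png" and a
# binary "mask.png" (illustrative filenames, not part of this app); the Gradio sketch tool
# normally builds this dict from the uploaded image and the drawn mask.
# out_image, _, _ = predict({"image": Image.open("photo.png"), "mask": Image.open("mask.png")},
#                           prompt="a vase of flowers")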
css = '''
.container {max-width: 1150px;margin: auto;padding-top: 1.5rem}
#image_upload{min-height:400px}
#image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 400px}
#mask_radio .gr-form{background:transparent; border: none}
#word_mask{margin-top: .75em !important}
#word_mask textarea:disabled{opacity: 0.3}
.footer {margin-bottom: 45px;margin-top: 35px;text-align: center;border-bottom: 1px solid #e5e5e5}
.footer>p {font-size: .8rem; display: inline-block; padding: 0 10px;transform: translateY(10px);background: white}
.dark .footer {border-color: #303030}
.dark .footer>p {background: #0b0f19}
.acknowledgments h4{margin: 1.25em 0 .25em 0;font-weight: bold;font-size: 115%}
#image_upload .touch-none{display: flex}
@keyframes spin {
    from {
        transform: rotate(0deg);
    }
    to {
        transform: rotate(360deg);
    }
}
#share-btn-container {
    display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
}
#share-btn {
    all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
}
#share-btn * {
    all: unset;
}
#share-btn-container div:nth-child(-n+2){
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
'''
image_blocks = gr.Blocks(css=css)
with image_blocks as demo:
    gr.HTML(read_content("header.html"))
    with gr.Group():
        with gr.Box():
            with gr.Row():
                with gr.Column():
                    image = gr.Image(source='upload', tool='sketch', elem_id="image_upload", type="pil", label="Upload").style(height=400)
                    with gr.Row(elem_id="prompt-container").style(mobile_collapse=False, equal_height=True):
                        prompt = gr.Textbox(placeholder='Your prompt (what you want in place of what is erased)', show_label=False, elem_id="input-text")
                        btn = gr.Button("Inpaint!").style(
                            margin=False,
                            rounded=(False, True, True, False),
                            full_width=False,
                        )
                with gr.Column():
                    image_out = gr.Image(label="Output", elem_id="output-img").style(height=400)
                    with gr.Group(elem_id="share-btn-container"):
                        community_icon = gr.HTML(community_icon_html, visible=False)
                        loading_icon = gr.HTML(loading_icon_html, visible=False)
    btn.click(fn=predict, inputs=[image, prompt], outputs=[image_out, community_icon, loading_icon])

image_blocks.launch()