import gradio as gr
import numpy as np
import torch
from diffusers import StableDiffusionPipeline
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, PNDMScheduler
from diffusers import LMSDiscreteScheduler
from share_btn import community_icon_html, loading_icon_html
from tqdm.auto import tqdm
from PIL import Image
# PARAMS
MANUAL_SEED = 42
HEIGHT = 512
WIDTH = 512
NUM_INFERENCE_STEPS = 50  # assumed number of denoising steps (the common Stable Diffusion default)
ETA = 1e-1  # step size for the measurement-consistency gradient update
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4", torch_dtype=torch.float16)
torch_device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = pipe.to(torch_device)
# 1. Load the autoencoder model which will be used to decode the latents into image space.
vae = AutoencoderKL.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="vae")
# 2. Load the tokenizer and text encoder to tokenize and encode the text.
tokenizer = CLIPTokenizer.from_pretrained("openai/clip-vit-large-patch14")
text_encoder = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")
# 3. The UNet model for generating the latents.
unet = UNet2DConditionModel.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="unet")
scheduler = LMSDiscreteScheduler.from_pretrained("CompVis/stable-diffusion-v1-4", subfolder="scheduler")
# scheduler.timesteps and scheduler.num_inference_steps (both used in predict) are only defined after set_timesteps.
scheduler.set_timesteps(NUM_INFERENCE_STEPS)
vae = vae.to(torch_device)
text_encoder = text_encoder.to(torch_device)
unet = unet.to(torch_device)
generator = torch.manual_seed(MANUAL_SEED)  # Seed generator to create the initial latent noise
def read_content(file_path: str) -> str:
    """Read the content of the target file."""
    with open(file_path, 'r', encoding='utf-8') as f:
        content = f.read()
    return content
# def predict(dict, prompt=""):
#     init_image = dict["image"].convert("RGB").resize((512, 512))
#     mask = dict["mask"].convert("RGB").resize((512, 512))
#     output = pipe(prompt=prompt, image=init_image, mask_image=mask, guidance_scale=7.5)
#     return output.images[0], gr.update(visible=True), gr.update(visible=True), gr.update(visible=True)
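# The commented-out predict above delegates inpainting to the stock StableDiffusionPipeline.
# The predict below instead runs a custom sampling loop: at every denoising step it estimates
# the clean latent, decodes it, measures the mismatch against the known (unmasked) pixels, and
# nudges the next latent along the gradient of that error with step size ETA.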
def predict(dict, prompt=""):
    text_input = tokenizer(prompt, padding="max_length", max_length=tokenizer.model_max_length, truncation=True, return_tensors="pt")
    max_length = text_input.input_ids.shape[-1]
    uncond_input = tokenizer(
        [""], padding="max_length", max_length=max_length, return_tensors="pt"
    )
    with torch.no_grad():
        uncond_embeddings = text_encoder(uncond_input.input_ids.to(torch_device))[0]
    init_image = dict["image"].convert("RGB").resize((512, 512))
    mask = dict["mask"].convert("RGB").resize((512, 512))
    # convert input image to a float32 tensor in [-1, 1]
    init_image = torch.tensor(2 * (np.asarray(init_image) / 255) - 1, dtype=torch.float32, device=torch_device)
    # mask in [0, 1]: 1 where the user erased (to be inpainted), 0 where the original pixels are kept
    mask = torch.tensor(np.asarray(mask) / 255, dtype=torch.float32, device=torch_device)
    # add one dimension for the batch and bring channels first
    init_image = init_image.permute(2, 0, 1).unsqueeze(0)
    mask = mask.permute(2, 0, 1).unsqueeze(0)
    latents = torch.randn(
        (1, unet.in_channels, HEIGHT // 8, WIDTH // 8),
        generator=generator,
    )
    latents = latents.to(torch_device)
    # scale the initial noise by the scheduler's starting sigma, as LMSDiscreteScheduler expects
    latents = latents * scheduler.init_noise_sigma
    for i, t in enumerate(tqdm(scheduler.timesteps)):
        z_t = torch.clone(latents.detach())
        z_t.requires_grad = True
        # scale the latent input to the range the scheduler expects at this timestep
        latent_model_input = scheduler.scale_model_input(z_t, t)
        # predict the noise residual (conditioned on the empty-prompt embedding)
        noise_pred = unet(latent_model_input, t, encoder_hidden_states=uncond_embeddings).sample
        # compute z_0 using Tweedie's formula
        indx = scheduler.num_inference_steps - i - 1
        z_0 = (1 / torch.sqrt(scheduler.alphas_cumprod[indx])) \
            * (z_t + (1 - scheduler.alphas_cumprod[indx]) * noise_pred)
        # pass through the decoder
        z_0 = 1 / 0.18215 * z_0
        image_pred = vae.decode(z_0).sample
        # clip to the valid image range
        image_pred = torch.clamp(image_pred, min=-1.0, max=1.0)
        inpainted_image = (1 - mask) * init_image + mask * image_pred
        # data-consistency error on the known (unmasked) pixels
        error_measurement = (1 / 2) * torch.linalg.norm((1 - mask) * (init_image - image_pred)) ** 2
        # TODO(giannisdaras): add LPIPS?
        error = error_measurement
        gradients = torch.autograd.grad(error, inputs=z_t)[0]
        # compute the previous noisy sample x_t -> x_t-1, then nudge it along the error gradient
        z_t_next = scheduler.step(noise_pred, t, z_t).prev_sample
        latents = z_t_next - ETA * gradients
    # scale and decode the final image latents with the vae
    latents = 1 / 0.18215 * latents
    with torch.no_grad():
        image = vae.decode(latents).sample
    image = (image / 2 + 0.5).clamp(0, 1)
    image = image.detach().cpu().permute(0, 2, 3, 1).numpy()
    images = (image * 255).round().astype("uint8")
    # three return values, matching the three outputs wired up in btn.click below
    return images[0], gr.update(visible=True), gr.update(visible=True)
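# A minimal sketch of calling predict outside the UI, assuming local files "photo.png" and a
# binary "mask.png" (illustrative filenames, not part of this app); the Gradio sketch tool
# normally builds this dict from the uploaded image and the drawn mask.
# out_image, _, _ = predict({"image": Image.open("photo.png"), "mask": Image.open("mask.png")},
#                           prompt="a vase of flowers")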
css = '''
.container {max-width: 1150px;margin: auto;padding-top: 1.5rem}
#image_upload{min-height:400px}
#image_upload [data-testid="image"], #image_upload [data-testid="image"] > div{min-height: 400px}
#mask_radio .gr-form{background:transparent; border: none}
#word_mask{margin-top: .75em !important}
#word_mask textarea:disabled{opacity: 0.3}
.footer {margin-bottom: 45px;margin-top: 35px;text-align: center;border-bottom: 1px solid #e5e5e5}
.footer>p {font-size: .8rem; display: inline-block; padding: 0 10px;transform: translateY(10px);background: white}
.dark .footer {border-color: #303030}
.dark .footer>p {background: #0b0f19}
.acknowledgments h4{margin: 1.25em 0 .25em 0;font-weight: bold;font-size: 115%}
#image_upload .touch-none{display: flex}
@keyframes spin {
    from {
        transform: rotate(0deg);
    }
    to {
        transform: rotate(360deg);
    }
}
#share-btn-container {
    display: flex; padding-left: 0.5rem !important; padding-right: 0.5rem !important; background-color: #000000; justify-content: center; align-items: center; border-radius: 9999px !important; width: 13rem;
}
#share-btn {
    all: initial; color: #ffffff;font-weight: 600; cursor:pointer; font-family: 'IBM Plex Sans', sans-serif; margin-left: 0.5rem !important; padding-top: 0.25rem !important; padding-bottom: 0.25rem !important;
}
#share-btn * {
    all: unset;
}
#share-btn-container div:nth-child(-n+2){
    width: auto !important;
    min-height: 0px !important;
}
#share-btn-container .wrap {
    display: none !important;
}
'''
image_blocks = gr.Blocks(css=css)
with image_blocks as demo:
    gr.HTML(read_content("header.html"))
    with gr.Group():
        with gr.Box():
            with gr.Row():
                with gr.Column():
                    image = gr.Image(source='upload', tool='sketch', elem_id="image_upload", type="pil", label="Upload").style(height=400)
                    with gr.Row(elem_id="prompt-container").style(mobile_collapse=False, equal_height=True):
                        prompt = gr.Textbox(placeholder='Your prompt (what you want in place of what is erased)', show_label=False, elem_id="input-text")
                        btn = gr.Button("Inpaint!").style(
                            margin=False,
                            rounded=(False, True, True, False),
                            full_width=False,
                        )
                with gr.Column():
                    image_out = gr.Image(label="Output", elem_id="output-img").style(height=400)
                    with gr.Group(elem_id="share-btn-container"):
                        community_icon = gr.HTML(community_icon_html, visible=False)
                        loading_icon = gr.HTML(loading_icon_html, visible=False)
    btn.click(fn=predict, inputs=[image, prompt], outputs=[image_out, community_icon, loading_icon])

image_blocks.launch()