"""Gradio front end that renders an "infinite zoom" video with a Stable Diffusion inpainting pipeline."""

import os

import gradio as gr
import numpy as np
import torch
from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
from PIL import Image

from inpaint_zoom.utils.zoom_in_utils import image_grid, shrink_and_paste_on_blank, dummy, write_video

os.environ["CUDA_VISIBLE_DEVICES"] = "0"


# Model ids offered in the UI dropdown.
stable_paint_model_list = [
    "stabilityai/stable-diffusion-2-inpainting",
    "runwayml/stable-diffusion-inpainting",
]

# Example prompts; the first entry is the default value of the prompt textbox.
stable_paint_prompt_list = [
    "children running in the forest , sunny, bright, by studio ghibli painting, superior quality, masterpiece, traditional Japanese colors, by Grzegorz Rutkowski, concept art",
    "A beautiful landscape of a mountain range with a lake in the foreground",
]

# Example negative prompts; the first entry is the default of the negative prompt textbox.
stable_paint_negative_prompt_list = [
    "blurry, bad art, blurred, text, watermark",
]


def _transparency_mask(image):
    """Return an RGB mask built from the inverted alpha channel of ``image``.

    Fully transparent pixels (alpha 0) become white (255) and fully opaque
    pixels become black, i.e. the white area is the region to be inpainted.
    ``image`` must have an alpha channel at index 3 (e.g. mode ``RGBA``).
    """
    alpha = np.array(image)[:, :, 3]
    return Image.fromarray(255 - alpha).convert("RGB")


def _interpolated_frame(current_image, prev_image, frame_index, num_frames, mask_width, width, height):
    """Build one in-between frame of the transition from ``prev_image`` to ``current_image``.

    ``current_image`` is centre-cropped and scaled back to ``(width, height)``;
    the crop margin shrinks as ``frame_index`` grows, so consecutive frames
    show progressively more of ``current_image``. ``prev_image`` is then
    pasted back over the centre to avoid the quality drop caused by
    upscaling the crop.
    """
    crop_margin = round(
        (1 - (1 - 2 * mask_width / height) ** (1 - (frame_index + 1) / num_frames)) * height / 2
    )
    frame = current_image.crop(
        (crop_margin, crop_margin, width - crop_margin, height - crop_margin)
    )
    # PIL sizes are (width, height).
    frame = frame.resize((width, height))

    # Border that prev_image needs so that it lines up with its (now
    # magnified) copy inside the cropped frame.
    paste_margin = round(
        (1 - (height - 2 * mask_width) / (height - 2 * crop_margin)) / 2 * height
    )
    # Presumably returns an RGBA image with a transparent border, since it is
    # used as its own paste mask -- TODO confirm against zoom_in_utils.
    overlay = shrink_and_paste_on_blank(prev_image, paste_margin)
    frame.paste(overlay, mask=overlay)
    return frame


class StableDiffusionZoomIn:
    """Generates a zoom video by repeatedly outpainting a shrunken copy of the last image."""

    def __init__(self):
        self.pipe = None      # loaded inpainting pipeline, or None before the first load
        self.model_id = None  # id of the model currently held in ``self.pipe``
        self.g_cuda = None    # CUDA random generator used to seed the first generation

    def load_model(self, model_id):
        """Return a ready-to-use pipeline for ``model_id``, loading it if necessary.

        The pipeline is cached per instance and reused while the same
        ``model_id`` is requested; asking for a different id replaces it.
        """
        if self.pipe is not None and self.model_id == model_id:
            return self.pipe

        # Drop the reference to any previous pipeline before loading the new
        # weights so both do not have to be alive at the same time.
        self.pipe = None
        self.model_id = None

        # NOTE(review): newer diffusers releases select half-precision weights
        # with variant="fp16" instead of revision="fp16" -- confirm against the
        # pinned diffusers version before changing.
        pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, revision="fp16")
        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
        pipe = pipe.to("cuda")
        pipe.safety_checker = dummy
        pipe.enable_attention_slicing()
        pipe.enable_xformers_memory_efficient_attention()

        if self.g_cuda is None:
            self.g_cuda = torch.Generator(device="cuda")

        # Publish the pipeline only once it is fully configured, so a failure
        # above never leaves a half-initialised pipeline cached.
        self.pipe = pipe
        self.model_id = model_id
        return self.pipe

    def generate_video(
        self,
        model_id,
        prompt,
        negative_prompt,
        guidance_scale,
        num_inference_steps,
    ):
        """Render the zoom video and return the path of the written file.

        Args:
            model_id: Inpainting model id passed to ``load_model``.
            prompt: Text prompt used for every generation step.
            negative_prompt: Negative text prompt used for every step.
            guidance_scale: Guidance scale forwarded to the pipeline.
            num_inference_steps: Denoising steps per generation; converted
                to ``int`` because UI sliders may deliver a float.

        Returns:
            The path ``"infinite_zoom_out.mp4"`` (relative to the working
            directory) that the frames were written to.
        """
        pipe = self.load_model(model_id)
        num_inference_steps = int(num_inference_steps)

        num_init_images = 2         # candidates generated for the first frame
        selected_init_image = 1     # 1-based index of the candidate to start from
        seed = 9999
        height = 512
        width = height
        num_outpainting_steps = 20  # number of outpainted key frames
        mask_width = 128            # border (in pixels) regenerated at every step
        num_interpol_frames = 30    # frames per key-frame transition
        fps = 30

        # A fully transparent canvas yields an all-white mask, so the first
        # call generates the whole picture. PIL sizes are (width, height).
        blank = Image.new(mode="RGBA", size=(width, height))
        init_images = pipe(
            prompt=[prompt] * num_init_images,
            negative_prompt=[negative_prompt] * num_init_images,
            image=blank.convert("RGB"),
            guidance_scale=guidance_scale,
            height=height,
            width=width,
            generator=self.g_cuda.manual_seed(seed),
            mask_image=_transparency_mask(blank),
            num_inference_steps=num_inference_steps,
        )[0]

        # NOTE(review): the returned grid is discarded; the call is kept in
        # case the helper has side effects -- confirm and remove if not.
        image_grid(init_images, rows=1, cols=num_init_images)

        current_image = init_images[selected_init_image - 1]
        all_frames = [current_image]

        for step in range(num_outpainting_steps):
            print(f"Generating image: {step + 1} / {num_outpainting_steps}")

            previous_full = current_image
            previous_shrunk = shrink_and_paste_on_blank(previous_full, mask_width)

            # Inpainting step. No generator is passed here: seeding would make
            # the whole video deterministic but the output less varied.
            current_image = pipe(
                prompt=prompt,
                negative_prompt=negative_prompt,
                image=previous_shrunk.convert("RGB"),
                guidance_scale=guidance_scale,
                height=height,
                width=width,
                mask_image=_transparency_mask(previous_shrunk),
                num_inference_steps=num_inference_steps,
            )[0][0]
            # Restore the untouched centre on top of the generated image.
            current_image.paste(previous_shrunk, mask=previous_shrunk)

            # Interpolation frames between the two key frames (sequential
            # zoom and crop), followed by the new key frame itself.
            all_frames.extend(
                _interpolated_frame(
                    current_image,
                    previous_full,
                    frame_index,
                    num_interpol_frames,
                    mask_width,
                    width,
                    height,
                )
                for frame_index in range(num_interpol_frames - 1)
            )
            all_frames.append(current_image)

        video_file_name = "infinite_zoom_out"
        save_path = video_file_name + ".mp4"
        write_video(save_path, all_frames, fps)
        return save_path

    def app():
        """Create the Gradio components for this tool and wire the button to ``generate_video``.

        Takes no ``self`` and is intended to be called on the class itself
        (``StableDiffusionZoomIn.app()``). The ``gr.Blocks`` context is
        neither bound nor returned, so the function returns ``None``.
        NOTE(review): presumably meant to be called inside an enclosing
        Blocks/Tab context that does the launching -- confirm against the caller.
        """
        with gr.Blocks():
            with gr.Row():
                with gr.Column():
                    text2image_in_model_path = gr.Dropdown(
                        choices=stable_paint_model_list,
                        value=stable_paint_model_list[0],
                        label='Text-Image Model Id',
                    )

                    text2image_in_prompt = gr.Textbox(
                        lines=1,
                        value=stable_paint_prompt_list[0],
                        label='Prompt',
                    )

                    text2image_in_negative_prompt = gr.Textbox(
                        lines=1,
                        value=stable_paint_negative_prompt_list[0],
                        label='Negative Prompt',
                    )

                    with gr.Row():
                        with gr.Column():
                            text2image_in_guidance_scale = gr.Slider(
                                minimum=0.1,
                                maximum=15,
                                step=0.1,
                                value=7.5,
                                label='Guidance Scale',
                            )

                            text2image_in_num_inference_step = gr.Slider(
                                minimum=1,
                                maximum=100,
                                step=1,
                                value=50,
                                label='Num Inference Step',
                            )

                    text2image_in_predict = gr.Button(value='Generator')

                with gr.Column():
                    output_image = gr.Video(label='Output')

            # A fresh generator instance is created per UI build; its pipeline
            # is loaded lazily on the first click.
            text2image_in_predict.click(
                fn=StableDiffusionZoomIn().generate_video,
                inputs=[
                    text2image_in_model_path,
                    text2image_in_prompt,
                    text2image_in_negative_prompt,
                    text2image_in_guidance_scale,
                    text2image_in_num_inference_step,
                ],
                outputs=output_image,
            )