ifire committed on
Commit a8f08a2
1 Parent(s): adc915a

Update for spaces.

Files changed (1)
  1. gradio_app.py +18 -35
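
Note: this change drops the manual memory_management load/unload calls, keeps the models on "cuda" from import time, and wraps each GPU-using Gradio handler in @spaces.GPU so it runs on Hugging Face Spaces' ZeroGPU hardware. A minimal sketch of that pattern follows, assuming the `spaces` package that Spaces provides and a CUDA device; the model and functions below are stand-ins, not the repo's actual UNet/VAE:

import spaces
import torch

torch.set_grad_enabled(False)  # inference only, mirroring this commit

# The model is placed on the GPU once at startup instead of being swapped in and out.
model = torch.nn.Linear(4, 4).to(torch.float16).to("cuda")

@spaces.GPU()  # ZeroGPU attaches a GPU only while this call runs
def run(x: torch.Tensor) -> torch.Tensor:
    return model(x.to("cuda", torch.float16))

@spaces.GPU(duration=360)  # request a longer allocation for slow jobs such as video generation
def run_long(x: torch.Tensor) -> torch.Tensor:
    return run(x)
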
gradio_app.py CHANGED
@@ -12,7 +12,6 @@ import gradio as gr
 import numpy as np
 import torch
 import wd14tagger
-import memory_management
 import uuid
 
 from PIL import Image
@@ -24,7 +23,10 @@ from diffusers.models.attention_processor import AttnProcessor2_0
 from transformers import CLIPTextModel, CLIPTokenizer
 from diffusers_vdm.pipeline import LatentVideoDiffusionPipeline
 from diffusers_vdm.utils import resize_and_center_crop, save_bcthw_as_mp4
+import spaces
 
+# Disable gradients globally
+torch.set_grad_enabled(False)
 
 class ModifiedUNet(UNet2DConditionModel):
     @classmethod
@@ -37,9 +39,9 @@ class ModifiedUNet(UNet2DConditionModel):
 
 model_name = 'lllyasviel/paints_undo_single_frame'
 tokenizer = CLIPTokenizer.from_pretrained(model_name, subfolder="tokenizer")
-text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16)
-vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16)  # bfloat16 vae
-unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16)
+text_encoder = CLIPTextModel.from_pretrained(model_name, subfolder="text_encoder").to(torch.float16).to("cuda")
+vae = AutoencoderKL.from_pretrained(model_name, subfolder="vae").to(torch.bfloat16).to("cuda")  # bfloat16 vae
+unet = ModifiedUNet.from_pretrained(model_name, subfolder="unet").to(torch.float16).to("cuda")
 
 unet.set_attn_processor(AttnProcessor2_0())
 vae.set_attn_processor(AttnProcessor2_0())
@@ -47,12 +49,7 @@ vae.set_attn_processor(AttnProcessor2_0())
 video_pipe = LatentVideoDiffusionPipeline.from_pretrained(
     'lllyasviel/paints_undo_multi_frame',
     fp16=True
-)
-
-memory_management.unload_all_models([
-    video_pipe.unet, video_pipe.vae, video_pipe.text_encoder, video_pipe.image_projection, video_pipe.image_encoder,
-    unet, vae, text_encoder
-])
+).to("cuda")
 
 k_sampler = KDiffusionSampler(
     unet=unet,
@@ -74,19 +71,16 @@ def find_best_bucket(h, w, options):
     return best_bucket
 
 
-@torch.inference_mode()
 def encode_cropped_prompt_77tokens(txt: str):
-    memory_management.load_models_to_gpu(text_encoder)
     cond_ids = tokenizer(txt,
                          padding="max_length",
                          max_length=tokenizer.model_max_length,
                          truncation=True,
-                         return_tensors="pt").input_ids.to(device=text_encoder.device)
+                         return_tensors="pt").input_ids.to(device="cuda")
     text_cond = text_encoder(cond_ids, attention_mask=None).last_hidden_state
     return text_cond
 
 
-@torch.inference_mode()
 def pytorch2numpy(imgs):
     results = []
     for x in imgs:
@@ -97,7 +91,6 @@ def pytorch2numpy(imgs):
     return results
 
 
-@torch.inference_mode()
 def numpy2pytorch(imgs):
     h = torch.from_numpy(np.stack(imgs, axis=0)).float() / 127.5 - 1.0
     h = h.movedim(-1, 1)
@@ -110,29 +103,26 @@ def resize_without_crop(image, target_width, target_height):
     return np.array(resized_image)
 
 
-@torch.inference_mode()
 def interrogator_process(x):
-    return wd14tagger.default_interrogator(x)
+    image_description = wd14tagger.default_interrogator(x)
+    return image_description
 
 
-@torch.inference_mode()
+@spaces.GPU()
 def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed, steps, n_prompt, cfg,
             progress=gr.Progress()):
-    rng = torch.Generator(device=memory_management.gpu).manual_seed(int(seed))
+    rng = torch.Generator(device="cuda").manual_seed(int(seed))
 
-    memory_management.load_models_to_gpu(vae)
     fg = resize_and_center_crop(input_fg, image_width, image_height)
-    concat_conds = numpy2pytorch([fg]).to(device=vae.device, dtype=vae.dtype)
+    concat_conds = numpy2pytorch([fg]).clone().detach().to(device="cuda", dtype=vae.dtype)
     concat_conds = vae.encode(concat_conds).latent_dist.mode() * vae.config.scaling_factor
 
-    memory_management.load_models_to_gpu(text_encoder)
     conds = encode_cropped_prompt_77tokens(prompt)
     unconds = encode_cropped_prompt_77tokens(n_prompt)
 
-    memory_management.load_models_to_gpu(unet)
-    fs = torch.tensor(input_undo_steps).to(device=unet.device, dtype=torch.long)
+    fs = torch.tensor(input_undo_steps).to(device="cuda", dtype=torch.long)
     initial_latents = torch.zeros_like(concat_conds)
-    concat_conds = concat_conds.to(device=unet.device, dtype=unet.dtype)
+    concat_conds = concat_conds.to(device="cuda", dtype=unet.dtype)
     latents = k_sampler(
         initial_latent=initial_latents,
         strength=1.0,
@@ -147,7 +137,6 @@ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed,
         progress_tqdm=functools.partial(progress.tqdm, desc='Generating Key Frames')
     ).to(vae.dtype) / vae.config.scaling_factor
 
-    memory_management.load_models_to_gpu(vae)
     pixels = vae.decode(latents).sample
     pixels = pytorch2numpy(pixels)
     pixels = [fg] + pixels + [np.zeros_like(fg) + 255]
@@ -155,7 +144,6 @@ def process(input_fg, prompt, input_undo_steps, image_width, image_height, seed,
     return pixels
 
 
-@torch.inference_mode()
 def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=7.5, fs=3, progress_tqdm=None):
     random.seed(seed)
     np.random.seed(seed)
@@ -174,25 +162,21 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
     input_frames = numpy2pytorch([image_1, image_2])
     input_frames = input_frames.unsqueeze(0).movedim(1, 2)
 
-    memory_management.load_models_to_gpu(video_pipe.text_encoder)
     positive_text_cond = video_pipe.encode_cropped_prompt_77tokens(prompt)
     negative_text_cond = video_pipe.encode_cropped_prompt_77tokens("")
 
-    memory_management.load_models_to_gpu([video_pipe.image_projection, video_pipe.image_encoder])
-    input_frames = input_frames.to(device=video_pipe.image_encoder.device, dtype=video_pipe.image_encoder.dtype)
+    input_frames = input_frames.to(device="cuda", dtype=video_pipe.image_encoder.dtype)
     positive_image_cond = video_pipe.encode_clip_vision(input_frames)
     positive_image_cond = video_pipe.image_projection(positive_image_cond)
     negative_image_cond = video_pipe.encode_clip_vision(torch.zeros_like(input_frames))
    negative_image_cond = video_pipe.image_projection(negative_image_cond)
 
-    memory_management.load_models_to_gpu([video_pipe.vae])
-    input_frames = input_frames.to(device=video_pipe.vae.device, dtype=video_pipe.vae.dtype)
+    input_frames = input_frames.to(device="cuda", dtype=video_pipe.vae.dtype)
     input_frame_latents, vae_hidden_states = video_pipe.encode_latents(input_frames, return_hidden_states=True)
     first_frame = input_frame_latents[:, :, 0]
     last_frame = input_frame_latents[:, :, 1]
     concat_cond = torch.stack([first_frame] + [torch.zeros_like(first_frame)] * (frames - 2) + [last_frame], dim=2)
 
-    memory_management.load_models_to_gpu([video_pipe.unet])
     latents = video_pipe(
         batch_size=1,
         steps=int(steps),
@@ -206,12 +190,11 @@ def process_video_inner(image_1, image_2, prompt, seed=123, steps=25, cfg_scale=
         progress_tqdm=progress_tqdm
     )
 
-    memory_management.load_models_to_gpu([video_pipe.vae])
     video = video_pipe.decode_latents(latents, vae_hidden_states)
     return video, image_1, image_2
 
 
-@torch.inference_mode()
+@spaces.GPU(duration=360)
 def process_video(keyframes, prompt, steps, cfg, fps, seed, progress=gr.Progress()):
     result_frames = []
     cropped_images = []
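
For reference, a small before/after sketch of the gradient handling this commit switches to (the function is illustrative, not from the repo; torch.inference_mode() is stricter than merely disabling grad, so the two forms are close but not strictly identical):

import torch

@torch.inference_mode()  # old style: each entry point wrapped individually
def double_old(x: torch.Tensor) -> torch.Tensor:
    return x * 2

torch.set_grad_enabled(False)  # new style: gradients disabled once, globally

def double_new(x: torch.Tensor) -> torch.Tensor:
    return x * 2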