Linoy Tsaban committed
Commit 8623f65
1 Parent(s): 32fdae0

Update preprocess_utils.py


change in dm components loading
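
The diff below changes Preprocess to receive the diffusion-model components (vae, tokenizer, text_encoder, unet, scheduler) through its constructor instead of loading them with from_pretrained inside __init__, so the weights can be built once and shared with the rest of the pipeline. A minimal caller sketch of the new usage follows; the model_key, device, and opt values are illustrative assumptions and are not part of this commit:

# Hypothetical caller sketch (not part of this commit): build the shared
# components once and hand them to the updated Preprocess constructor.
# model_key, device and opt are illustrative assumptions.
import torch
from transformers import CLIPTextModel, CLIPTokenizer
from diffusers import AutoencoderKL, UNet2DConditionModel, DDIMScheduler

device = 'cuda'
model_key = "stabilityai/stable-diffusion-2-1-base"  # assumed; chosen per opt.sd_version

vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae",
                                    torch_dtype=torch.float16).to(device)
tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder",
                                             torch_dtype=torch.float16).to(device)
unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet",
                                            torch_dtype=torch.float16).to(device)
scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")

# opt is the options/config namespace Preprocess already expects.
model = Preprocess(device, opt, vae, tokenizer, text_encoder, unet, scheduler)

Constructing the components once and passing them in avoids a second from_pretrained load and a duplicate fp16 copy of the weights when preprocessing runs in the same process as the editing stage.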

Files changed (1)
  1. preprocess_utils.py +22 -42
preprocess_utils.py CHANGED
@@ -10,7 +10,7 @@ import torch.nn as nn
 import argparse
 from torchvision.io import write_video
 from pathlib import Path
-from utils import *
+from util import *
 import torchvision.transforms as T
 
 
@@ -25,7 +25,7 @@ def get_timesteps(scheduler, num_inference_steps, strength, device):
 
 
 class Preprocess(nn.Module):
-    def __init__(self, device, opt, hf_key=None):
+    def __init__(self, device, opt, vae, tokenizer, text_encoder, unet, scheduler, hf_key=None):
         super().__init__()
 
         self.device = device
@@ -47,15 +47,23 @@ class Preprocess(nn.Module):
             model_key = "stabilityai/stable-diffusion-2-depth"
         else:
             raise ValueError(f'Stable-diffusion version {self.sd_version} not supported.')
+
         self.model_key = model_key
+
         # Create model
-        self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", revision="fp16",
-                                                 torch_dtype=torch.float16).to(self.device)
-        self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
-        self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder", revision="fp16",
-                                                          torch_dtype=torch.float16).to(self.device)
-        self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", revision="fp16",
-                                                         torch_dtype=torch.float16).to(self.device)
+        # self.vae = AutoencoderKL.from_pretrained(model_key, subfolder="vae", revision="fp16",
+        #                                          torch_dtype=torch.float16).to(self.device)
+        # self.tokenizer = CLIPTokenizer.from_pretrained(model_key, subfolder="tokenizer")
+        # self.text_encoder = CLIPTextModel.from_pretrained(model_key, subfolder="text_encoder", revision="fp16",
+        #                                                   torch_dtype=torch.float16).to(self.device)
+        # self.unet = UNet2DConditionModel.from_pretrained(model_key, subfolder="unet", revision="fp16",
+        #                                                  torch_dtype=torch.float16).to(self.device)
+
+        self.vae = vae
+        self.tokenizer = tokenizer
+        self.text_encoder = text_encoder
+        self.unet = unet
+        self.scheduler = scheduler
         self.total_inverted_latents = {}
 
         self.paths, self.frames, self.latents = self.get_data(self.config["data_path"], self.config["n_frames"])
@@ -74,11 +82,12 @@ class Preprocess(nn.Module):
             self.canny_cond = self.get_canny_cond()
         elif self.sd_version == 'depth':
             self.depth_maps = self.prepare_depth_maps()
-        self.scheduler = DDIMScheduler.from_pretrained(model_key, subfolder="scheduler")
+        self.scheduler = scheduler
 
-        # self.unet.enable_xformers_memory_efficient_attention()
+        self.unet.enable_xformers_memory_efficient_attention()
         print(f'[INFO] loaded stable diffusion!')
-
+
+
     @torch.no_grad()
     def prepare_depth_maps(self, model_type='DPT_Large', device='cuda'):
         depth_maps = []
@@ -363,33 +372,4 @@ def prep(opt):
 
 
     return frames, latents, total_inverted_latents, rgb_reconstruction
-    # if not os.path.isdir(os.path.join(save_path, f'frames')):
-    #     os.mkdir(os.path.join(save_path, f'frames'))
-    # for i, frame in enumerate(recon_frames):
-    #     T.ToPILImage()(frame).save(os.path.join(save_path, f'frames', f'{i:05d}.png'))
-    # frames = (recon_frames * 255).to(torch.uint8).cpu().permute(0, 2, 3, 1)
-    # write_video(os.path.join(save_path, f'inverted.mp4'), frames, fps=10)
-
-
-# if __name__ == "__main__":
-#     device = 'cuda'
-#     parser = argparse.ArgumentParser()
-#     parser.add_argument('--data_path', type=str,
-#                         default='data/woman-running.mp4')
-#     parser.add_argument('--H', type=int, default=512,
-#                         help='for non-square videos, we recommand using 672 x 384 or 384 x 672, aspect ratio 1.75')
-#     parser.add_argument('--W', type=int, default=512,
-#                         help='for non-square videos, we recommand using 672 x 384 or 384 x 672, aspect ratio 1.75')
-#     parser.add_argument('--save_dir', type=str, default='latents')
-#     parser.add_argument('--sd_version', type=str, default='2.1', choices=['1.5', '2.0', '2.1', 'ControlNet', 'depth'],
-#                         help="stable diffusion version")
-#     parser.add_argument('--steps', type=int, default=500)
-#     parser.add_argument('--batch_size', type=int, default=40)
-#     parser.add_argument('--save_steps', type=int, default=50)
-#     parser.add_argument('--n_frames', type=int, default=40)
-#     parser.add_argument('--inversion_prompt', type=str, default='a woman running')
-#     opt = parser.parse_args()
-#     video_path = opt.data_path
-#     save_video_frames(video_path, img_size=(opt.H, opt.W))
-#     opt.data_path = os.path.join('data', Path(video_path).stem)
-#     prep(opt)
+