Spaces:

maxin-cn
/

Latte-1

Running on Zero

App Files Files Community

Latte-1 / sample /sample.py

maxin-cn

Upload folder using huggingface_hub

94bafa8 verified 6 months ago

raw

history blame

4.74 kB

	# All rights reserved.

	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	Sample new images from a pre-trained Latte.
	"""
	import os
	import sys
	try:
	import utils

	from diffusion import create_diffusion
	from utils import find_model
	except:
	sys.path.append(os.path.split(sys.path[0])[0])

	import utils

	from diffusion import create_diffusion
	from utils import find_model

	import torch
	import argparse
	import torchvision

	from einops import rearrange
	from models import get_models
	from torchvision.utils import save_image
	from diffusers.models import AutoencoderKL
	from models.clip import TextEmbedder
	import imageio
	from omegaconf import OmegaConf

	torch.backends.cuda.matmul.allow_tf32 = True
	torch.backends.cudnn.allow_tf32 = True

	def main(args):
	# Setup PyTorch:
	# torch.manual_seed(args.seed)
	torch.set_grad_enabled(False)
	device = "cuda" if torch.cuda.is_available() else "cpu"
	# device = "cpu"

	if args.ckpt is None:
	assert args.model == "Latte-XL/2", "Only Latte-XL/2 models are available for auto-download."
	assert args.image_size in [256, 512]
	assert args.num_classes == 1000

	using_cfg = args.cfg_scale > 1.0

	# Load model:
	latent_size = args.image_size // 8
	args.latent_size = latent_size
	model = get_models(args).to(device)

	if args.use_compile:
	model = torch.compile(model)

	# a pre-trained model or load a custom Latte checkpoint from train.py:
	ckpt_path = args.ckpt
	state_dict = find_model(ckpt_path)
	model.load_state_dict(state_dict)

	model.eval() # important!
	diffusion = create_diffusion(str(args.num_sampling_steps))
	# vae = AutoencoderKL.from_pretrained(f"stabilityai/sd-vae-ft-{args.vae}").to(device)
	vae = AutoencoderKL.from_pretrained(args.pretrained_model_path, subfolder="vae").to(device)
	# text_encoder = TextEmbedder().to(device)

	if args.use_fp16:
	print('WARNING: using half percision for inferencing!')
	vae.to(dtype=torch.float16)
	model.to(dtype=torch.float16)
	# text_encoder.to(dtype=torch.float16)

	# Labels to condition the model with (feel free to change):

	# Create sampling noise:
	if args.use_fp16:
	z = torch.randn(1, args.num_frames, 4, latent_size, latent_size, dtype=torch.float16, device=device) # b c f h w
	else:
	z = torch.randn(1, args.num_frames, 4, latent_size, latent_size, device=device)

	# Setup classifier-free guidance:
	# z = torch.cat([z, z], 0)
	if using_cfg:
	z = torch.cat([z, z], 0)
	y = torch.randint(0, args.num_classes, (1,), device=device)
	y_null = torch.tensor([101] * 1, device=device)
	y = torch.cat([y, y_null], dim=0)
	model_kwargs = dict(y=y, cfg_scale=args.cfg_scale, use_fp16=args.use_fp16)
	sample_fn = model.forward_with_cfg
	else:
	sample_fn = model.forward
	model_kwargs = dict(y=None, use_fp16=args.use_fp16)

	# Sample images:
	if args.sample_method == 'ddim':
	samples = diffusion.ddim_sample_loop(
	sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
	)
	elif args.sample_method == 'ddpm':
	samples = diffusion.p_sample_loop(
	sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True, device=device
	)

	print(samples.shape)
	if args.use_fp16:
	samples = samples.to(dtype=torch.float16)
	b, f, c, h, w = samples.shape
	samples = rearrange(samples, 'b f c h w -> (b f) c h w')
	samples = vae.decode(samples / 0.18215).sample
	samples = rearrange(samples, '(b f) c h w -> b f c h w', b=b)
	# Save and display images:

	if not os.path.exists(args.save_video_path):
	os.makedirs(args.save_video_path)


	video_ = ((samples[0] * 0.5 + 0.5) * 255).add_(0.5).clamp_(0, 255).to(dtype=torch.uint8).cpu().permute(0, 2, 3, 1).contiguous()
	video_save_path = os.path.join(args.save_video_path, 'sample' + '.mp4')
	print(video_save_path)
	imageio.mimwrite(video_save_path, video_, fps=8, quality=9)
	print('save path {}'.format(args.save_video_path))


	if __name__ == "__main__":
	parser = argparse.ArgumentParser()
	parser.add_argument("--config", type=str, default="./configs/ucf101/ucf101_sample.yaml")
	parser.add_argument("--ckpt", type=str, default="")
	parser.add_argument("--save_video_path", type=str, default="./sample_videos/")
	args = parser.parse_args()
	omega_conf = OmegaConf.load(args.config)
	omega_conf.ckpt = args.ckpt
	omega_conf.save_video_path = args.save_video_path
	main(omega_conf)