|
"""Text-to-video wrapper around a Tune-A-Video style pipeline.

Builds a TuneAVideoPipeline from a pretrained Stable Diffusion checkpoint,
inflating its 2D UNet to 3D, and exposes an `inference` method that turns a
text prompt into a short video.
"""

import torch
from diffusers import AutoencoderKL, DDIMScheduler
from transformers import CLIPTextModel, CLIPTokenizer

from text_to_video.tuneavideo.models.unet import UNet3DConditionModel
from text_to_video.tuneavideo.pipelines.pipeline_text_to_video import TuneAVideoPipeline

class TextToVideo:

    def __init__(self, sd_path=None, motion_field_strength=12, video_length=8,
                 t0=881, t1=941, use_cf_attn=True, use_motion_field=True) -> None:
        # Fixed-seed CUDA generator so results are reproducible across runs.
        g = torch.Generator(device='cuda')
        g.manual_seed(22)
        self.g = g

        assert sd_path is not None, "sd_path must point to a Stable Diffusion checkpoint"
        print(f"Loading Stable Diffusion model files from {sd_path}")

        self.dtype = torch.float16

        # Load the Stable Diffusion components from the checkpoint.
        scheduler = DDIMScheduler.from_pretrained(sd_path, subfolder="scheduler")
        tokenizer = CLIPTokenizer.from_pretrained(sd_path, subfolder="tokenizer")
        text_encoder = CLIPTextModel.from_pretrained(sd_path, subfolder="text_encoder")
        vae = AutoencoderKL.from_pretrained(sd_path, subfolder="vae")

        # Inflate the pretrained 2D UNet to 3D; use_cf_attn swaps self-attention
        # for cross-frame attention so frames stay consistent with each other.
        unet = UNet3DConditionModel.from_pretrained_2d(
            sd_path, subfolder="unet", use_cf_attn=use_cf_attn)

        # Reuse the scheduler loaded above instead of loading it a second time;
        # the pipeline sets its own timesteps from num_inference_steps at call time.
        self.pipe = TuneAVideoPipeline(
            vae=vae, text_encoder=text_encoder, tokenizer=tokenizer,
            unet=unet, scheduler=scheduler,
        ).to('cuda').to(self.dtype)

        # DDIM timestep window [t0, t1] within which motion dynamics are
        # applied to the latent codes.
        self.t0 = t0
        self.t1 = t1

        # Motion-field controls that translate latents between frames.
        self.use_foreground_motion_field = False
        self.motion_field_strength = motion_field_strength
        self.use_motion_field = use_motion_field

        # Background smoothing between frames (disabled by default).
        self.smooth_bg = False
        self.smooth_bg_strength = 0.4

        # Number of frames to generate per prompt.
        self.video_length = video_length

    def inference(self, prompt):
        # The pipeline expects a list of prompts; wrap the single prompt.
        prompt_compute = [prompt]

        # Initial latent noise for the first frame: batch 1, 4 latent channels,
        # 1 frame, 64x64 latent grid (decodes to 512x512 pixels).
        xT = torch.randn((1, 4, 1, 64, 64), dtype=self.dtype, device="cuda")

        result = self.pipe(
            prompt_compute,
            video_length=self.video_length,
            height=512,
            width=512,
            num_inference_steps=50,
            guidance_scale=7.5,
            guidance_stop_step=1.0,
            t0=self.t0,
            t1=self.t1,
            xT=xT,
            use_foreground_motion_field=self.use_foreground_motion_field,
            motion_field_strength=self.motion_field_strength,
            use_motion_field=self.use_motion_field,
            smooth_bg=self.smooth_bg,
            smooth_bg_strength=self.smooth_bg_strength,
            generator=self.g,
        )

        # The pipeline returns a batch of videos; return the only one.
        return result.videos[0]
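

# A minimal usage sketch, assuming a local Stable Diffusion v1.5 checkpoint
# directory and that the returned video is a (frames, height, width, channels)
# tensor with values in [0, 1]; the checkpoint path, output layout, and
# imageio-based saving are assumptions, not guarantees of this module.
if __name__ == "__main__":
    import imageio

    model = TextToVideo(sd_path="./checkpoints/stable-diffusion-v1-5")  # assumed path
    video = model.inference("a panda surfing a wave")

    # Convert the assumed [0, 1] float frames to uint8 and write a GIF.
    frames = (video * 255).to(torch.uint8).cpu().numpy()
    imageio.mimsave("output.gif", list(frames), fps=4)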
|
|