| | import torch |
| | from torch import nn |
| | import einops |
| | from typing import Tuple |
| | import random |
| | import numpy as np |
| | from tqdm import tqdm |
| | from .modules import DuoFrameDownEncoder,Upsampler,MapConv,MotionDownEncoder |
| | from .loss import l1,l2 |
| | from .transformer import (MotionTransformer, |
| | AMDDiffusionTransformerModel, |
| | MotionEncoderLearnTokenTransformer, |
| | AMDReconstructTransformerModel, |
| | AMDDiffusionTransformerModelDualStream, |
| | AMDDiffusionTransformerModelImgSpatial, |
| | AMDDiffusionTransformerModelImgSpatialDoubleRef, |
| | AMDReconstructTransformerModelSpatial) |
| | from .rectified_flow import RectifiedFlow |
| | from diffusers.configuration_utils import ConfigMixin, register_to_config |
| | from diffusers.models.modeling_utils import ModelMixin |
| | from diffusers.models.resnet import ResnetBlock2D |
| | import einops |
| | import torch.nn.functional as F |
| |
|
| | from diffusers.utils import export_to_gif |
| |
|
class AMDModel(ModelMixin, ConfigMixin):
    """Animated-motion diffusion model.

    A transformer motion encoder compresses (reference image + video) pairs into
    per-frame motion tokens; a rectified-flow diffusion transformer is then
    trained to predict velocity for the target latents conditioned on those
    motion tokens and the reference latents.
    """
    # Opt into diffusers' gradient-checkpointing machinery.
    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(self,
                 # --- latent image / video geometry ---
                 image_inchannel :int = 4,        # channels of each latent frame
                 image_height :int = 32,
                 image_width :int = 32,
                 video_frames :int = 16,          # number of target frames T
                 scheduler_num_step :int = 1000,  # rectified-flow discretization steps

                 # --- motion encoder ---
                 motion_token_num:int = 12,       # learned motion tokens per frame
                 motion_token_channel: int = 128, # channel dim of each motion token
                 enc_num_layers:int = 8,
                 enc_nhead:int = 8,
                 enc_ndim:int = 64,
                 enc_dropout:float = 0.0,
                 motion_need_norm_out:bool = False,

                 # --- optional motion transformer refining target motion ---
                 need_motion_transformer :bool = False,
                 motion_transformer_attn_head_dim:int = 64,
                 motion_transformer_attn_num_heads:int = 16,
                 motion_transformer_num_layers:int = 4,

                 # --- diffusion transformer ---
                 # one of: 'default' | 'dual' | 'spatial' | 'doubleref'
                 diffusion_model_type : str = 'default',
                 diffusion_attn_head_dim : int = 64,
                 diffusion_attn_num_heads : int = 16,
                 diffusion_out_channels : int = 4,
                 diffusion_num_layers : int = 16,
                 image_patch_size : int = 2,
                 motion_patch_size : int = 1,     # NOTE(review): not referenced in this class body
                 motion_drop_ratio: float = 0.0,  # NOTE(review): not referenced in this class body
                 refimg_drop: bool = False,       # if True, forward() zeroes the reference latent

                 extract_motion_with_motion_transformer = False,
                 **kwargs,
                 ):
        super().__init__()

        self.num_step = scheduler_num_step
        self.scheduler = RectifiedFlow(num_steps=scheduler_num_step)
        self.need_motion_transformer = need_motion_transformer
        self.extract_motion_with_motion_transformer = extract_motion_with_motion_transformer
        self.diffusion_model_type = diffusion_model_type
        self.target_frame = video_frames
        self.refimg_drop = refimg_drop

        # Motion encoder: maps concatenated (reference, video) frames to
        # `motion_token_num` tokens of `motion_token_channel` channels per frame.
        self.motion_encoder = MotionEncoderLearnTokenTransformer(img_height = image_height,
                                                                 img_width=image_width,
                                                                 img_inchannel=image_inchannel,
                                                                 img_patch_size = image_patch_size,
                                                                 motion_token_num = motion_token_num,
                                                                 motion_channel = motion_token_channel,
                                                                 need_norm_out = motion_need_norm_out,
                                                                 num_attention_heads=enc_nhead,
                                                                 attention_head_dim=enc_ndim,
                                                                 num_layers=enc_num_layers,
                                                                 dropout=enc_dropout,
                                                                 attention_bias= True,)

        # Optional refinement transformer applied to the target motion tokens.
        if need_motion_transformer:
            self.motion_transformer = MotionTransformer(motion_token_num=motion_token_num,
                                                        motion_token_channel=motion_token_channel,
                                                        attention_head_dim=motion_transformer_attn_head_dim,
                                                        num_attention_heads=motion_transformer_attn_num_heads,
                                                        num_layers=motion_transformer_num_layers,)

        # Diffusion backbone selection. All variants except 'doubleref' consume
        # the reference latent concatenated channel-wise with the noisy latent,
        # hence image_inchannel * 2.
        if diffusion_model_type == 'default':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModel(num_attention_heads= diffusion_attn_num_heads,
                                                                      attention_head_dim= diffusion_attn_head_dim,
                                                                      out_channels = diffusion_out_channels,
                                                                      num_layers= diffusion_num_layers,
                                                                      image_width= image_width,
                                                                      image_height= image_height,
                                                                      image_patch_size= image_patch_size,
                                                                      image_in_channels = dit_image_inchannel,
                                                                      motion_token_num = motion_token_num,
                                                                      motion_in_channels = motion_token_channel,)
        elif diffusion_model_type == 'dual':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModelDualStream(num_attention_heads= diffusion_attn_num_heads,
                                                                                attention_head_dim= diffusion_attn_head_dim,
                                                                                out_channels = diffusion_out_channels,
                                                                                num_layers= diffusion_num_layers,
                                                                                image_width= image_width,
                                                                                image_height= image_height,
                                                                                image_patch_size= image_patch_size,
                                                                                image_in_channels = dit_image_inchannel,
                                                                                motion_token_num = motion_token_num,
                                                                                motion_in_channels = motion_token_channel,
                                                                                motion_target_num_frame = video_frames)
        elif diffusion_model_type == 'spatial':
            dit_image_inchannel = image_inchannel * 2
            self.diffusion_transformer = AMDDiffusionTransformerModelImgSpatial(num_attention_heads= diffusion_attn_num_heads,
                                                                                attention_head_dim= diffusion_attn_head_dim,
                                                                                out_channels = diffusion_out_channels,
                                                                                num_layers= diffusion_num_layers,
                                                                                image_width= image_width,
                                                                                image_height= image_height,
                                                                                image_patch_size= image_patch_size,
                                                                                image_in_channels = dit_image_inchannel,
                                                                                motion_token_num = motion_token_num,
                                                                                motion_in_channels = motion_token_channel,
                                                                                motion_target_num_frame = video_frames)
        elif diffusion_model_type == 'doubleref':
            # 'doubleref' feeds the extra reference through a dedicated input
            # instead of channel concatenation, so in-channels are not doubled.
            dit_image_inchannel = image_inchannel
            self.diffusion_transformer = AMDDiffusionTransformerModelImgSpatialDoubleRef(num_attention_heads= diffusion_attn_num_heads,
                                                                                         attention_head_dim= diffusion_attn_head_dim,
                                                                                         out_channels = diffusion_out_channels,
                                                                                         num_layers= diffusion_num_layers,
                                                                                         image_width= image_width,
                                                                                         image_height= image_height,
                                                                                         image_patch_size= image_patch_size,
                                                                                         image_in_channels = dit_image_inchannel,
                                                                                         motion_token_num = motion_token_num,
                                                                                         motion_in_channels = motion_token_channel,
                                                                                         motion_target_num_frame = video_frames)
        else:
            # NOTE(review): IndexError is an odd choice for an invalid config
            # value — ValueError would be conventional. Left as-is.
            raise IndexError

    def forward(self,
                video:torch.Tensor,
                ref_img:torch.Tensor ,
                randomref_img:torch.Tensor = None,
                time_step:torch.Tensor = None,
                return_meta_info=False,
                mask_ratio=None,
                **kwargs,):
        """Training step: encode motion, build a rectified-flow pair and predict velocity.

        Args:
            video: (N,T,C,H,W) target latent frames.
            ref_img: (N,T,C,H,W) reference latent frames (same shape as video).
            randomref_img : (N,T,C,H,W) extra reference, required for 'doubleref'.
            time_step: optional pre-sampled timesteps; sampled here if None.
            return_meta_info: return intermediate tensors instead of (pre, vel, loss_dict).
            mask_ratio: upper bound for a uniformly sampled motion-token mask ratio.
        """

        device = video.device
        n,t,c,h,w = video.shape

        assert video.shape == ref_img.shape ,f'video.shape:{video.shape}should be equal to ref_img.shape:{ref_img.shape}'
        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Random masking ratio in [0, mask_ratio) — re-drawn each call.
        if mask_ratio is not None:
            mask_ratio = torch.rand(1).item() * mask_ratio

        # Build the 2T-frame sequence fed to the motion encoder:
        # first T frames are the reference, last T frames the target video.
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            if randomref_img.dim()==4:
                # (N,C,H,W) -> (N,T,C,H,W) by repeating over time.
                randomref_img = randomref_img.unsqueeze(1).repeat(1,t,1,1,1)
            refimg_and_video = torch.cat([randomref_img,video],dim=1)
        else:
            refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video,mask_ratio)

        # Split per-frame motion tokens into source (reference) and target halves,
        # flattening batch and time into one axis.
        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        # Optionally refine target motion with the motion transformer
        # (operates on (n, f, l, d), so un-flatten, run, re-flatten).
        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # zi: reference latents, zj: target latents, both flattened to (N*T,C,H,W).
        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.flatten(0,1)

        if time_step is None:
            time_step = self.prepare_timestep(batch_size= zj.shape[0],device= device)
            if self.diffusion_model_type != 'default':
                # Non-default variants share one timestep across all frames of a
                # video: sample per-video, then repeat per-frame.
                time_step = self.prepare_timestep(batch_size= n,device= device)
                time_step = time_step.repeat_interleave(t)
        # Rectified-flow training pair: noisy latent zt and target velocity vel.
        zt,vel = self.scheduler.get_train_tuple(z1=zj,time_step=time_step)

        # NOTE(review): this zeroes the reference latent unconditionally whenever
        # refimg_drop is set — possibly intended as a *random* drop (CFG-style);
        # confirm against training config.
        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(video.device)
        # Channel-concatenate reference and noisy latents for the backbone.
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                         motion_target_hidden_states = target_motion,
                                         image_hidden_states = image_hidden_states,
                                         randomref_image_hidden_states = randomref_img,
                                         timestep = time_step,)

        # Velocity-matching loss (the only term actually optimized).
        diff_loss = l2(pre,vel)

        # Reconstruction metric for monitoring only; not added to `loss`.
        rec_zj = self.scheduler.get_target_with_zt_vel(zt,pre,time_step)
        rec_loss = l2(rec_zj,zj)

        loss = diff_loss

        loss_dict = {'loss':loss,'diff_loss':diff_loss,'rec_loss':rec_loss}

        if return_meta_info:
            return {'motion' : motion,
                    'zi' : zi,
                    'zj' : zj,
                    'zt' : zt,
                    'gt' : vel,
                    'pre': pre,
                    'time_step': time_step,
                    }
        else:
            return pre,vel,loss_dict

    def get_noise_latent_pair(self,
                              video:torch.Tensor,
                              ref_img:torch.Tensor ,
                              randomref_img:torch.Tensor,
                              sample_step:int = 50,
                              ):
        # Placeholder — not implemented yet.
        pass

    @torch.no_grad()
    def sample(self,video:torch.Tensor,
               ref_img:torch.Tensor ,
               randomref_img:torch.Tensor = None,
               sample_step:int = 50,
               mask_ratio = None,
               start_step:int = None,
               return_meta_info=False,
               **kwargs,):
        """Euler-integrate the learned velocity field from noise toward the target latents.

        Motion is extracted from the given `video` (reconstruction-style sampling).
        """

        device = video.device
        n,t,c,h,w = video.shape

        if start_step is None:
            start_step = self.scheduler.num_step
        assert start_step <= self.scheduler.num_step , 'start_step cant be larger than scheduler.num_step'

        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Broadcast a single reference frame over time if needed.
        if ref_img.dim()==4:
            ref_img = ref_img.unsqueeze(1).repeat(1,t,1,1,1)

        if mask_ratio is not None:
            print(f'* Sampling with Mask_Ratio = {mask_ratio}')
            mask_ratio = mask_ratio  # no-op; kept as-is

        # Same 2T-frame motion-encoder input layout as forward().
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            if randomref_img.dim()==4:
                randomref_img = randomref_img.unsqueeze(1).repeat(1,t,1,1,1)
            refimg_and_video = torch.cat([randomref_img,video],dim=1)
        else:
            refimg_and_video = torch.cat([ref_img,video],dim=1)

        motion = self.motion_encoder(refimg_and_video,mask_ratio)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # Initial timestep vector (overwritten per-step inside the loop below).
        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.flatten(0,1)
        # zt at the start step; `noise` recovers the pure-noise endpoint
        # (assumes vel = z1 - z0 in RectifiedFlow — confirm there).
        zt,vel = self.scheduler.get_train_tuple(z1=zj,time_step=time_step)
        noise = zj - vel

        pre_cache = []
        sample_cache = []

        # Descending step sequence start_step -> (near) 0, `sample_step` entries.
        step_seq = np.linspace(0, start_step, num=sample_step+1, endpoint=True,dtype=int)
        step_seq = list(reversed(step_seq[1:]))

        # Fixed Euler step size.
        dt = 1./sample_step

        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(video.device)

        for i in tqdm(step_seq):
            time_step = torch.ones((zt.shape[0],)).to(zt.device)
            time_step = time_step * i

            zt = zt.to(video.dtype)
            image_hidden_states = torch.cat((zi,zt),dim=1)

            pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                             motion_target_hidden_states = target_motion,
                                             image_hidden_states = image_hidden_states,
                                             randomref_image_hidden_states = randomref_img,
                                             timestep = time_step,)
            # Euler update along the predicted velocity.
            zt = zt + pre * dt
            pre_cache.append(pre)
            sample_cache.append(zt)

        # Restore (N,T,C,H,W) layout.
        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        zt = einops.rearrange(zt,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    'sample' : zt,
                    'pre_cache' : pre_cache,
                    'sample_cache' : sample_cache,
                    'step_seq' : step_seq,
                    'motion' : target_motion,
                    "noise" : noise
                    }
        else:
            return zi,zt,zj

    @torch.no_grad()
    def sample_with_refimg_motion(self,
                                  ref_img:torch.Tensor,
                                  motion=torch.Tensor,
                                  randomref_img:torch.Tensor = None,
                                  sample_step:int = 10,
                                  mask_ratio = None,
                                  return_meta_info=False,
                                  **kwargs,):
        """Generate a video from a single reference image and precomputed motion tokens.

        Args:
            ref_img : (N,C,H,W)
            randomref_img : (N,C,H,W)
            motion : (N,F,L,D)
        Return:
            video : (N,T,C,H,W)

        NOTE(review): `motion=torch.Tensor` assigns the *class* as a default
        value — almost certainly `motion: torch.Tensor` (annotation) was meant.
        """
        device = motion.device
        n,t,l,d = motion.shape

        start_step = self.scheduler.num_step

        refimg = ref_img.unsqueeze(1)
        if self.diffusion_model_type == 'doubleref' :
            assert randomref_img is not None, "when diffusion_model_type == doubleref, randomref_img should be given"

        # Source motion comes from a single (reference) frame.
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            print('* Warnning * diffusion_model_type:doubleref')
            if randomref_img.dim()==4:
                randomref_img = randomref_img.unsqueeze(1)
            source_motion = self.motion_encoder(randomref_img,mask_ratio)
        else:
            source_motion = self.motion_encoder(refimg,mask_ratio)

        # Repeat the single-frame source motion over all T target frames.
        source_motion = source_motion.repeat(1,t,1,1).flatten(0,1)
        target_motion = motion.flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        # Only refine here if the motion wasn't already refined at extraction time.
        if self.need_motion_transformer and not self.extract_motion_with_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # Initial timestep vector (overwritten per-step inside the loop below).
        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = refimg.repeat(1,t,1,1,1).flatten(0,1)
        zj = zi
        if self.diffusion_model_type == 'doubleref' and randomref_img is not None:
            randomref_img = randomref_img.repeat(1,t,1,1,1)
            randomref_img = randomref_img.flatten(0,1)
        # Start from pure Gaussian noise.
        zt = torch.randn_like(zj)

        # NOTE(review): these caches are never appended to in this method.
        pre_cache = []
        sample_cache = []

        step_seq = np.linspace(0, start_step, num=sample_step+1, endpoint=True,dtype=int)
        step_seq = list(reversed(step_seq[1:]))

        dt = 1./sample_step

        if self.refimg_drop:
            zi = torch.zeros_like(zi).to(ref_img.device)

        for i in tqdm(step_seq):
            time_step = torch.ones((zt.shape[0],)).to(zt.device)
            time_step = time_step * i

            zt = zt.to(ref_img.dtype)
            image_hidden_states = torch.cat((zi,zt),dim=1)

            pre = self.diffusion_transformer(motion_source_hidden_states = source_motion,
                                             motion_target_hidden_states = target_motion,
                                             image_hidden_states = image_hidden_states,
                                             randomref_image_hidden_states = randomref_img,
                                             timestep = time_step,)

            # Euler update along the predicted velocity.
            zt = zt + pre * dt

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n,t=t)
        zt = einops.rearrange(zt,'(n t) c h w -> n t c h w',n=n,t=t)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    'sample' : zt,
                    'pre_cache' : pre_cache,
                    'sample_cache' : sample_cache,
                    'step_seq' : step_seq,
                    'motion' : target_motion,
                    }
        else:
            return zi,zt,zj

    def extract_motion(self,video:torch.Tensor,mask_ratio=None):
        """Encode a video into motion tokens, optionally refined by the motion transformer."""
        n,t,c,h,w = video.shape

        motion = self.motion_encoder(video,mask_ratio)

        if self.need_motion_transformer and self.extract_motion_with_motion_transformer:
            motion = self.motion_transformer(motion)

        return motion

    def prepare_timestep(self,batch_size:int,device,time_step = None):
        """Return the given timesteps on `device`, or sample uniformly from [0, num_step] (inclusive)."""
        if time_step is not None:
            return time_step.to(device)
        else:
            return torch.randint(0,self.num_step+1,(batch_size,)).to(device)

    def prepare_encoder_input(self,video:torch.Tensor):
        """Channel-concatenate each consecutive frame pair and flatten batch/time.

        (B,T,C,H,W) -> (B*(T-1), 2C, H, W): frame t paired with frame t+1.
        """
        assert len(video.shape) == 5 , f'only support video data : 5D tensor , but got {video.shape}'

        pre = video[:,:-1,:,:,:]
        post= video[:,1:,:,:,:]
        duo_frame_mix = torch.cat([pre,post],dim=2)
        duo_frame_mix = einops.rearrange(duo_frame_mix,'b t c h w -> (b t) c h w')

        return duo_frame_mix

    def unpatchify(self, x ,patch_size):
        """
        x: (N, S, patch_size**2 *C)
        imgs: (N, C, H, W)

        Assumes a square token grid (S must be a perfect square).
        """
        p = patch_size
        h = w = int(x.shape[1]**.5)

        c = x.shape[2] // (p**2)
        assert h * w == x.shape[1]

        x = x.reshape(shape=(x.shape[0], h, w, p, p, c))
        x = torch.einsum('nhwpqc->nchpwq', x)
        # h == w here, so using h for both spatial dims is equivalent.
        imgs = x.reshape(shape=(x.shape[0], c, h * p, h * p))
        return imgs

    def reset_infer_num_frame(self, num:int):
        """Override the diffusion transformer's target frame count for inference."""
        old_num = self.diffusion_transformer.target_frame
        self.diffusion_transformer.target_frame = num
        print(f'* Reset infer frame from {old_num} to {self.diffusion_transformer.target_frame} *')
| |
|
| |
|
class AMDModel_Rec(ModelMixin, ConfigMixin):
    """Reconstruction variant of AMDModel.

    Instead of diffusion, a learned `zt_token` placeholder is concatenated with
    the reference latent and a reconstruction transformer regresses the target
    latents directly (single forward pass, L2 reconstruction loss).
    """
    # Opt into diffusers' gradient-checkpointing machinery.
    _supports_gradient_checkpointing = True

    @register_to_config
    def __init__(self,
                 # --- latent image / video geometry ---
                 image_inchannel :int = 4,
                 image_height :int = 32,
                 image_width :int = 32,
                 video_frames :int = 16,
                 scheduler_num_step :int = 1000,

                 # --- motion encoder ---
                 motion_token_num:int = 12,
                 motion_token_channel: int = 128,
                 enc_num_layers:int = 8,
                 enc_nhead:int = 8,
                 enc_ndim:int = 64,
                 enc_dropout:float = 0.0,
                 motion_need_norm_out:bool = True,

                 # --- optional motion transformer ---
                 need_motion_transformer :bool = False,
                 motion_transformer_attn_head_dim:int = 64,
                 motion_transformer_attn_num_heads:int = 16,
                 motion_transformer_num_layers:int = 4,

                 # --- reconstruction transformer ---
                 # one of: 'default' | 'spatial'
                 diffusion_model_type : str = 'default',
                 diffusion_attn_head_dim : int = 64,
                 diffusion_attn_num_heads : int = 16,
                 diffusion_out_channels : int = 4,
                 diffusion_num_layers : int = 16,
                 image_patch_size : int = 2,
                 motion_patch_size : int = 1,     # NOTE(review): not referenced in this class body
                 motion_drop_ratio: float = 0.0,  # NOTE(review): not referenced in this class body
                 **kwargs,
                 ):
        super().__init__()

        self.num_step = scheduler_num_step
        self.scheduler = RectifiedFlow(num_steps=scheduler_num_step)
        self.need_motion_transformer = need_motion_transformer

        # Learned placeholder latent that stands in for the noisy latent of the
        # diffusion variant; repeated per sample at forward time.
        INIT_CONST = 0.02
        self.zt_token = nn.Parameter(torch.randn(1, image_inchannel, image_height,image_width) * INIT_CONST)

        self.motion_encoder = MotionEncoderLearnTokenTransformer(img_height = image_height,
                                                                 img_width=image_width,
                                                                 img_inchannel=image_inchannel,
                                                                 img_patch_size = image_patch_size,
                                                                 motion_token_num = motion_token_num,
                                                                 motion_channel = motion_token_channel,
                                                                 need_norm_out = motion_need_norm_out,
                                                                 num_attention_heads=enc_nhead,
                                                                 attention_head_dim=enc_ndim,
                                                                 num_layers=enc_num_layers,
                                                                 dropout=enc_dropout,
                                                                 attention_bias= True,)

        if need_motion_transformer:
            self.motion_transformer = MotionTransformer(motion_token_num=motion_token_num,
                                                        motion_token_channel=motion_token_channel,
                                                        attention_head_dim=motion_transformer_attn_head_dim,
                                                        num_attention_heads=motion_transformer_attn_num_heads,
                                                        num_layers=motion_transformer_num_layers,)

        # NOTE(review): no else branch — any other diffusion_model_type leaves
        # self.transformer undefined and fails later with AttributeError.
        if diffusion_model_type == 'default':
            dit_image_inchannel = image_inchannel * 2
            self.transformer = AMDReconstructTransformerModel(num_attention_heads= diffusion_attn_num_heads,
                                                              attention_head_dim= diffusion_attn_head_dim,
                                                              out_channels = diffusion_out_channels,
                                                              num_layers= diffusion_num_layers,
                                                              image_width= image_width,
                                                              image_height= image_height,
                                                              image_patch_size= image_patch_size,
                                                              image_in_channels = dit_image_inchannel,
                                                              motion_token_num = motion_token_num,
                                                              motion_in_channels = motion_token_channel,)
        elif diffusion_model_type == 'spatial':
            dit_image_inchannel = image_inchannel * 2
            self.transformer = AMDReconstructTransformerModelSpatial(num_attention_heads= diffusion_attn_num_heads,
                                                                     attention_head_dim= diffusion_attn_head_dim,
                                                                     out_channels = diffusion_out_channels,
                                                                     num_layers= diffusion_num_layers,
                                                                     image_width= image_width,
                                                                     image_height= image_height,
                                                                     image_patch_size= image_patch_size,
                                                                     image_in_channels = dit_image_inchannel,
                                                                     motion_token_num = motion_token_num,
                                                                     motion_in_channels = motion_token_channel,
                                                                     motion_target_num_frame = video_frames)

    def forward(self,
                video:torch.Tensor,
                ref_img:torch.Tensor ,
                time_step:torch.Tensor = None,
                return_meta_info=False,
                **kwargs,):
        """Training step: encode motion and directly regress the target latents.

        Args:
            video: (N,T,C,H,W) target latent frames.
            ref_img: (N,T,C,H,W) reference latent frames (same shape as video).
            time_step: unused except for pass-through in return_meta_info.
        """

        device = video.device
        n,t,c,h,w = video.shape

        assert video.shape == ref_img.shape ,f'video.shape:{video.shape}should be equal to ref_img.shape:{ref_img.shape}'

        # 2T-frame motion-encoder input: reference frames then target frames.
        refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # zi: reference latents, zj: regression targets; zt: learned placeholder
        # repeated to match the flattened batch.
        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        image_hidden_states = torch.cat((zi,zt),dim=1)
        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        # Pure reconstruction objective.
        rec_loss = l2(pre,zj)

        loss = rec_loss

        loss_dict = {'loss':loss,'rec_loss':rec_loss}

        if return_meta_info:
            return {'motion' : motion,
                    'zi' : zi,
                    'zj' : zj,
                    'zt' : zt,
                    'pre': pre,
                    'time_step': time_step,
                    }
        else:
            return pre,zj,loss_dict

    @torch.no_grad()
    def sample(self,
               video:torch.Tensor,
               ref_img:torch.Tensor ,
               sample_step:int = 50,
               start_step:int = None,
               return_meta_info=False,
               **kwargs,):
        """Single-pass reconstruction of `video` from its motion tokens and `ref_img`.

        NOTE(review): `sample_step`/`start_step` are accepted for interface
        symmetry with AMDModel but no iterative sampling happens here.
        """

        device = video.device
        n,t,c,h,w = video.shape

        if start_step is None:
            start_step = self.scheduler.num_step
        assert start_step <= self.scheduler.num_step , 'start_step cant be larger than scheduler.num_step'

        refimg_and_video = torch.cat([ref_img,video],dim=1)
        motion = self.motion_encoder(refimg_and_video)

        source_motion = motion[:,:t].flatten(0,1)
        target_motion = motion[:,t:].flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        zi = ref_img.flatten(0,1)
        zj = video.flatten(0,1)
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        zt = zt.to(video.dtype)
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        # Return the reconstruction as `zt` in (N,T,C,H,W) layout.
        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        zt = einops.rearrange(pre,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    }
        else:
            return zi,zt,zj

    def sample_with_refimg_motion(self,
                                  ref_img:torch.Tensor,
                                  motion=torch.Tensor,
                                  sample_step:int = 10,
                                  return_meta_info=False,
                                  **kwargs,):
        """Reconstruct a video from a single reference image and precomputed motion tokens.

        Args:
            ref_img : (N,C,H,W)
            motion : (N,F,L,D)
        Return:
            video : (N,T,C,H,W)

        NOTE(review): `motion=torch.Tensor` assigns the *class* as a default
        value — almost certainly `motion: torch.Tensor` (annotation) was meant.
        """
        device = motion.device
        n,t,l,d = motion.shape

        start_step = self.scheduler.num_step

        # Single-frame source motion from the reference image.
        refimg = ref_img.unsqueeze(1)
        source_motion = self.motion_encoder(refimg)

        # Repeat the source motion over all T target frames.
        source_motion = source_motion.repeat(1,t,1,1).flatten(0,1)
        target_motion = motion.flatten(0,1)

        assert source_motion.shape == target_motion.shape , f'source_motion.shape {source_motion.shape} != target_motion.shape {target_motion.shape}'

        if self.need_motion_transformer:
            target_motion = einops.rearrange(target_motion,'(n f) l d -> n f l d',n=n)
            target_motion = self.motion_transformer(target_motion)
            target_motion = einops.rearrange(target_motion,'n f l d -> (n f) l d',n=n)

        # NOTE(review): computed but unused in this method.
        time_step = torch.ones((source_motion.shape[0],)).to(device)
        time_step = time_step * start_step

        zi = refimg.repeat(1,t,1,1,1).flatten(0,1)
        zj = zi
        zt = self.zt_token.repeat(zj.shape[0],1,1,1)

        zt = zt.to(zj.dtype)
        image_hidden_states = torch.cat((zi,zt),dim=1)

        pre = self.transformer(motion_source_hidden_states = source_motion,
                               motion_target_hidden_states = target_motion,
                               image_hidden_states = image_hidden_states,)

        zi = einops.rearrange(zi,'(n t) c h w -> n t c h w',n=n)
        zt = einops.rearrange(pre,'(n t) c h w -> n t c h w',n=n)
        zj = einops.rearrange(zj,'(n t) c h w -> n t c h w',n=n)

        if return_meta_info:
            return {'zi' : zi,
                    'zj' : zj,
                    }
        else:
            return zi,zt,zj

    def extract_motion(self,video:torch.Tensor):
        """Encode a video into motion tokens, refined if the motion transformer is enabled."""

        motion = self.motion_encoder(video)

        if self.need_motion_transformer:
            motion = self.motion_transformer(motion)

        return motion
| | |
| |
|
def AMD_S(**kwargs) -> AMDModel:
    """Build the small AMD configuration (8-layer encoder, 12-layer DiT)."""
    preset = {
        # motion encoder
        "enc_num_layers": 8,
        "enc_nhead": 8,
        "enc_ndim": 64,
        # diffusion transformer
        "diffusion_attn_head_dim": 64,
        "diffusion_attn_num_heads": 16,
        "diffusion_out_channels": 4,
        "diffusion_num_layers": 12,
    }
    # Duplicate keys in kwargs raise TypeError, same as explicit keywords would.
    return AMDModel(**preset, **kwargs)
| |
|
def AMD_L(**kwargs) -> AMDModel:
    """Build the large AMD configuration (16-head encoder, 16-layer DiT with 96-dim heads)."""
    preset = {
        # motion encoder
        "enc_num_layers": 8,
        "enc_nhead": 16,
        "enc_ndim": 64,
        # diffusion transformer
        "diffusion_attn_head_dim": 96,
        "diffusion_attn_num_heads": 16,
        "diffusion_out_channels": 4,
        "diffusion_num_layers": 16,
    }
    # Duplicate keys in kwargs raise TypeError, same as explicit keywords would.
    return AMDModel(**preset, **kwargs)
| |
|
def AMD_S_Rec(**kwargs) -> AMDModel_Rec:
    """Build the small reconstruction-variant configuration.

    Fix: the return annotation previously said ``AMDModel`` although the
    function constructs and returns an ``AMDModel_Rec``.
    """
    return AMDModel_Rec(
        # motion encoder
        enc_num_layers = 8,
        enc_nhead = 8,
        enc_ndim = 64,
        # reconstruction transformer
        diffusion_attn_head_dim = 64,
        diffusion_attn_num_heads = 16,
        diffusion_out_channels = 4,
        diffusion_num_layers = 12,
        **kwargs)
| |
|
def AMD_S_RecSplit(**kwargs) -> AMDModel_Rec:
    """Build the small reconstruction-variant configuration with ``is_split=True``.

    Fix: the return annotation previously said ``AMDModel`` although the
    function constructs and returns an ``AMDModel_Rec``.
    ``is_split`` is absorbed by ``AMDModel_Rec``'s ``**kwargs``.
    """
    return AMDModel_Rec(
        # motion encoder
        enc_num_layers = 8,
        enc_nhead = 8,
        enc_ndim = 64,
        # reconstruction transformer
        diffusion_attn_head_dim = 64,
        diffusion_attn_num_heads = 16,
        diffusion_out_channels = 4,
        diffusion_num_layers = 12,
        is_split = True,
        **kwargs)
| |
|
| |
|
# Registry mapping config-friendly model names to their factory functions.
AMD_models = {
    "AMD_S": AMD_S,
    "AMD_L": AMD_L,
    "AMD_S_Rec": AMD_S_Rec,
    "AMD_S_RecSplit" : AMD_S_RecSplit,
}