# NExT-GPT: code/model/anyToImageVideoAudio.py
import logging
import os.path
from typing import List
import torch
from header import *
import torch.nn.functional as F
from .ImageBind import *
from .ImageBind import data
from .modeling_llama import LlamaForCausalLM
from transformers import StoppingCriteria, StoppingCriteriaList
# from diffusers import StableDiffusionPipeline
from .custom_sd import StableDiffusionPipeline
from .custom_vd import TextToVideoSDPipeline
from .custom_ad import AudioLDMPipeline
from .layers import *
from .common.utils import *
class StoppingCriteriaSub(StoppingCriteria):
def __init__(self, stops: List = None, encounters: int = 1):
super().__init__()
self.stops = stops
self.ENCOUNTERS = encounters
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor):
        # Count how many times any stop sequence occurs in the generated ids and
        # halt generation once the count reaches the configured number of encounters.
        stop_count = 0
        for stop in self.stops:
            _stop = torch.tensor(stop, device=input_ids.device)
            for i in range(input_ids.shape[1] - len(_stop) + 1):
                if torch.all(input_ids[0, i:i + len(_stop)] == _stop):
                    stop_count += 1
        return stop_count >= self.ENCOUNTERS
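# A minimal usage sketch for the criterion above (the stop ids mirror the format expected
# by `generate` below, e.g. [[835], [2277, 29937]] for the two tokenizations of '###'):
#
#   stopping_criteria = StoppingCriteriaList(
#       [StoppingCriteriaSub(stops=[[835], [2277, 29937]], encounters=1)])
#   llama_model.generate(..., stopping_criteria=stopping_criteria)
#
# Generation halts once any stop sequence has appeared `encounters` times.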
class NextGPTModel(nn.Module):
"""LoRA for LLaMa model"""
def __init__(self, **args):
super(NextGPTModel, self).__init__()
self.args = args
self.max_length = args['max_length']
self.device = torch.cuda.current_device()
self.stage = args['stage']
print('args max_length', args['max_length'])
imagebind_ckpt_path = os.path.join(self.args['pretrained_ckpt_path'], 'imagebind_ckpt',
self.args['imagebind_version'])
print(f'Initializing visual encoder from {imagebind_ckpt_path} ...')
self.visual_encoder, self.visual_hidden_size = \
imagebind_model.imagebind_huge(pretrained=True, store_path=imagebind_ckpt_path)
        # freeze the vision encoder (ImageBind); it is kept fixed during training
for name, param in self.visual_encoder.named_parameters():
param.requires_grad = False
self.visual_encoder.eval()
print('Visual encoder initialized.')
self.vicuna_ckpt_path = os.path.join(self.args['pretrained_ckpt_path'], 'vicuna_ckpt',
self.args['vicuna_version'])
print(f'Initializing language decoder from {self.vicuna_ckpt_path} ...')
self.llama_model = LlamaForCausalLM.from_pretrained(self.vicuna_ckpt_path)
if self.args.get('freeze_lm'):
print("Freezing the LLaMa ...")
for param in self.llama_model.parameters():
param.requires_grad = False
self.llama_model.eval()
else:
print("Instruct tuning the LLaMa ...")
# add the lora module
peft_config = LoraConfig(
task_type=TaskType.CAUSAL_LM,
inference_mode=False,
r=self.args['lora_r'],
lora_alpha=self.args['lora_alpha'],
lora_dropout=self.args['lora_dropout'],
target_modules=['q_proj', 'k_proj', 'v_proj', 'o_proj']
)
self.llama_model = get_peft_model(self.llama_model, peft_config)
self.llama_model.print_trainable_parameters()
print('Language decoder initialized.')
# use the new trained tokenizer
tokenizer_path = self.vicuna_ckpt_path
print(f'Initializing tokenizer from {tokenizer_path} ...')
self.llama_tokenizer = LlamaTokenizer.from_pretrained(tokenizer_path, use_fast=False)
self.llama_tokenizer.pad_token = self.llama_tokenizer.eos_token
self.llama_tokenizer.padding_side = "right"
# self.llama_tokenizer.add_special_tokens({"mask_token": "[MASK]"})
self._add_image_token()
self._add_video_token()
self._add_audio_token()
self.llama_model.resize_token_embeddings(len(self.llama_tokenizer))
print('Tokenizer initialized.')
self.llama_proj = nn.Linear(
self.visual_hidden_size, self.llama_model.config.hidden_size
)
if self.args.get('freeze_input_proj'):
for param in self.llama_proj.parameters():
param.requires_grad = False
self.input_embeddings = self.llama_model.get_input_embeddings()
# the alignment module for LLM-TO-IMAGE
self.sd_ckpt_path = self.args['image_diffusion']
self.gen_text_hidden_fcs = nn.ModuleList([])
for layer_idx in self.args['text_emb_to_img_layers']:
if layer_idx == -1 or layer_idx == self.llama_model.config.num_hidden_layers:
in_dim = self.llama_model.config.hidden_size
self.gen_text_hidden_fcs.append(
TextFcLayer(in_dim, 768, num_input_tokens=self.args['num_gen_img_tokens'],
num_output_tokens=self.args['num_clip_tokens'],
mode=self.args['text_fc_to_img_mode']))
# self.sd_pipe.text_encoder.config.hidden_size
elif layer_idx < self.llama_model.config.num_hidden_layers:
self.gen_text_hidden_fcs.append(
TextFcLayer(self.llama_model.config.hidden_size, 768,
num_input_tokens=self.args['num_gen_img_tokens'],
num_output_tokens=self.args['num_clip_tokens'],
mode=self.args['text_fc_to_img_mode']))
else:
raise ValueError(
f'Embedding of layer {layer_idx} was requested but model only has {self.llama_model.config.num_hidden_layers} layers.')
# the alignment module for LLM-TO-VIDEO
self.vd_ckpt_path = self.args['video_diffusion']
self.gen_text_hidden_fcs_video = nn.ModuleList([])
for layer_idx in self.args['text_emb_to_video_layers']:
if layer_idx == -1 or layer_idx == self.llama_model.config.num_hidden_layers:
in_dim = self.llama_model.config.hidden_size # 4096
self.gen_text_hidden_fcs_video.append(
TextFcLayer(in_dim, 1024, num_input_tokens=self.args['num_gen_video_tokens'],
num_output_tokens=self.args['num_clip_tokens'],
mode=self.args['text_fc_to_video_mode']))
# self.vd_pipe.text_encoder.config.hidden_size
elif layer_idx < self.llama_model.config.num_hidden_layers:
self.gen_text_hidden_fcs_video.append(
TextFcLayer(self.llama_model.config.hidden_size, 1024,
num_input_tokens=self.args['num_gen_video_tokens'],
num_output_tokens=self.args['num_clip_tokens'],
mode=self.args['text_fc_to_video_mode']))
else:
raise ValueError(
f'Embedding of layer {layer_idx} was requested but model only has {self.llama_model.config.num_hidden_layers} layers.')
# the alignment module for LLM-TO-AUDIO
self.ad_ckpt_path = self.args['audio_diffusion']
self.gen_text_hidden_fcs_audio = nn.ModuleList([])
for layer_idx in self.args['text_emb_to_audio_layers']:
if layer_idx == -1 or layer_idx == self.llama_model.config.num_hidden_layers:
in_dim = self.llama_model.config.hidden_size
self.gen_text_hidden_fcs_audio.append(
TextFcLayer(in_dim, 512,
num_input_tokens=self.args['num_gen_audio_tokens'],
num_output_tokens=1,
mode=self.args['text_fc_to_audio_mode']))
# self.ad_pipe.text_encoder.config.projection_dim
elif layer_idx < self.llama_model.config.num_hidden_layers:
self.gen_text_hidden_fcs_audio.append(
TextFcLayer(self.llama_model.config.hidden_size, 512,
num_input_tokens=self.args['num_gen_audio_tokens'],
num_output_tokens=1,
mode=self.args['text_fc_to_audio_mode']))
else:
raise ValueError(
f'Embedding of layer {layer_idx} was requested but model only has {self.llama_model.config.num_hidden_layers} layers.')
if self.args.get('freeze_output_proj'):
for name, param in self.gen_text_hidden_fcs.named_parameters():
param.requires_grad = False
for name, param in self.gen_text_hidden_fcs_video.named_parameters():
param.requires_grad = False
for name, param in self.gen_text_hidden_fcs_audio.named_parameters():
param.requires_grad = False
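    # Sketch of the `args` dict consumed by __init__ (the keys are the ones read above;
    # every concrete value below is an illustrative assumption, not an official config):
    #
    #   args = {
    #       'pretrained_ckpt_path': './pretrained_ckpt', 'imagebind_version': 'imagebind_huge.pth',
    #       'vicuna_version': 'vicuna_7b', 'max_length': 512, 'stage': 1, 'prompt': '',
    #       'freeze_lm': True, 'freeze_input_proj': False, 'freeze_output_proj': False,
    #       'lora_r': 32, 'lora_alpha': 32, 'lora_dropout': 0.1,
    #       'image_diffusion': 'runwayml/stable-diffusion-v1-5',
    #       'video_diffusion': 'cerspense/zeroscope_v2_576w',
    #       'audio_diffusion': 'cvssp/audioldm-l-full',
    #       'num_gen_img_tokens': 4, 'num_gen_video_tokens': 24, 'num_gen_audio_tokens': 8,
    #       'num_clip_tokens': 77,
    #       'text_emb_to_img_layers': [-1], 'text_emb_to_video_layers': [-1],
    #       'text_emb_to_audio_layers': [-1],
    #       'text_fc_to_img_mode': 'transformer', 'text_fc_to_video_mode': 'transformer',
    #       'text_fc_to_audio_mode': 'transformer',
    #   }
    #   model = NextGPTModel(**args)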
def _add_image_token(self):
# Add an image token for loss masking (and visualization) purposes.
self.llama_tokenizer.add_tokens(["<Img>"]) # add special image token to tokenizer
self.llama_tokenizer.add_tokens(["</Img>"]) # add special image token to tokenizer
# Add [IMG] tokens to the vocabulary.
self.args['gen_img_token_idx'] = []
for i in range(self.args['num_gen_img_tokens']):
print(f'Adding [IMG{i}] token to vocabulary.')
print(f'Before adding new token, tokenizer("[IMG{i}]") =',
self.llama_tokenizer(f'[IMG{i}]', add_special_tokens=False))
num_added_tokens = self.llama_tokenizer.add_tokens(f'[IMG{i}]')
print(f'After adding {num_added_tokens} new tokens, tokenizer("[IMG{i}]") =',
self.llama_tokenizer(f'[IMG{i}]', add_special_tokens=False))
gen_token_idx = self.llama_tokenizer(f'[IMG{i}]', add_special_tokens=False).input_ids
assert len(gen_token_idx) == 1, gen_token_idx
self.args['gen_img_token_idx'].append(gen_token_idx[0])
def _add_video_token(self):
# self.llama_tokenizer.add_tokens({"<Vid>"}) # add special video token to tokenizer
# self.llama_tokenizer.add_tokens({"</Vid>"}) # add special video token to tokenizer
# Add [VID] tokens to the vocabulary.
self.args['gen_video_token_idx'] = []
for i in range(self.args['num_gen_video_tokens']):
print(f'Adding [VID{i}] token to vocabulary.')
print(f'Before adding new token, tokenizer("[VID{i}]") =',
self.llama_tokenizer(f'[VID{i}]', add_special_tokens=False))
num_added_tokens = self.llama_tokenizer.add_tokens(f'[VID{i}]')
print(f'After adding {num_added_tokens} new tokens, tokenizer("[VID{i}]") =',
self.llama_tokenizer(f'[VID{i}]', add_special_tokens=False))
gen_token_idx = self.llama_tokenizer(f'[VID{i}]', add_special_tokens=False).input_ids
assert len(gen_token_idx) == 1, gen_token_idx
self.args['gen_video_token_idx'].append(gen_token_idx[0])
def _add_audio_token(self):
# self.llama_tokenizer.add_tokens({"<Aud>"}) # add special audio token to tokenizer
# self.llama_tokenizer.add_tokens({"</Aud>"}) # add special audio token to tokenizer
# Add [AUD] tokens to the vocabulary.
self.args['gen_audio_token_idx'] = []
for i in range(self.args['num_gen_audio_tokens']):
print(f'Adding [AUD{i}] token to vocabulary.')
print(f'Before adding new token, tokenizer("[AUD{i}]") =',
self.llama_tokenizer(f'[AUD{i}]', add_special_tokens=False))
num_added_tokens = self.llama_tokenizer.add_tokens(f'[AUD{i}]')
print(f'After adding {num_added_tokens} new tokens, tokenizer("[AUD{i}]") =',
self.llama_tokenizer(f'[AUD{i}]', add_special_tokens=False))
gen_token_idx = self.llama_tokenizer(f'[AUD{i}]', add_special_tokens=False).input_ids
assert len(gen_token_idx) == 1, gen_token_idx
self.args['gen_audio_token_idx'].append(gen_token_idx[0])
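    # After the three helpers above, the tokenizer holds <Img>/</Img> plus the generation
    # "signal" tokens, so a training target for, say, image generation looks roughly like
    # (sketch, assuming num_gen_img_tokens == 4):
    #   "... a photo of a corgi on the beach. [IMG0] [IMG1] [IMG2] [IMG3]"
    # with self.args['gen_img_token_idx'] holding the vocabulary ids of [IMG0]..[IMG3].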
def encode_video(self, video_paths):
inputs = {ModalityType.VISION: data.load_and_transform_video_data(video_paths, self.device)}
        # cast the inputs to the LLM dtype so the frozen encoder runs in the same precision
inputs = {key: inputs[key].to(self.llama_model.dtype) for key in inputs}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
video_embeds = embeddings[ModalityType.VISION] # bsz x 1024
inputs_llama = self.llama_proj(video_embeds).unsqueeze(1) # bsz x 1 x llama_size
atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(self.device) # bsz x 1
return inputs_llama, atts_llama
def encode_audio(self, audio_paths):
inputs = {ModalityType.AUDIO: data.load_and_transform_audio_data(audio_paths, self.device)}
        # cast the inputs to the LLM dtype so the frozen encoder runs in the same precision
inputs = {key: inputs[key].to(self.llama_model.dtype) for key in inputs}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
audio_embeds = embeddings[ModalityType.AUDIO] # bsz x 1024
inputs_llama = self.llama_proj(audio_embeds).unsqueeze(1) # bsz x 1 x llama_size
atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(self.device) # bsz x 1
return inputs_llama, atts_llama
def encode_image(self, image_paths):
inputs = {ModalityType.VISION: data.load_and_transform_vision_data(image_paths, self.device)}
        # cast the inputs to the LLM dtype so the frozen encoder runs in the same precision
inputs = {key: inputs[key].to(self.llama_model.dtype) for key in inputs}
with torch.no_grad():
embeddings = self.visual_encoder(inputs)
            image_embeds = embeddings[ModalityType.VISION]  # bsz x 1024
inputs_llama = self.llama_proj(image_embeds).unsqueeze(1) # bsz x 1 x llama_size
atts_llama = torch.ones(inputs_llama.size()[:-1], dtype=torch.long).to(self.device) # bsz x 1
return inputs_llama, atts_llama
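    # The three encoders share one pattern: ImageBind returns a single modality embedding
    # per sample (bsz x 1024), which self.llama_proj maps to the LLaMA hidden size and
    # unsqueezes into a length-1 soft prompt. Sketch (the path is illustrative):
    #   img_embeds, img_atts = model.encode_image(['./examples/dog.jpg'])
    #   # img_embeds: (1, 1, hidden_size), img_atts: (1, 1)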
def prompt_wrap(self, img_embeds, input_ids, target_ids, attention_mask):
'''
input_ids, target_ids, attention_mask: bsz x s2
'''
input_ids = input_ids.to(self.device) # bsz x s2
target_ids = target_ids.to(self.device) # bsz x s2
attention_mask = attention_mask.to(self.device) # bsz x s2
batch_size = input_ids.shape[0]
bos = torch.ones([batch_size, 1], dtype=input_ids.dtype,
device=input_ids.device) * self.llama_tokenizer.bos_token_id # bsz x 1
if self.args['freeze_lm']:
p_after_embeds = self.llama_model.model.embed_tokens(input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
bos_embeds = self.llama_model.model.embed_tokens(bos) # bsz x 1 x embed_dim
else:
p_after_embeds = self.llama_model.model.model.embed_tokens(input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
bos_embeds = self.llama_model.model.model.embed_tokens(bos) # bsz x 1 x embed_dim
if img_embeds is not None:
p_before = '### Human: <Img>'
p_before_tokens = self.llama_tokenizer(p_before, return_tensors="pt", add_special_tokens=False).to(
self.device)
# peft model need deeper call
if self.args['freeze_lm']:
p_before_embeds = self.llama_model.model.embed_tokens(p_before_tokens.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
else:
p_before_embeds = self.llama_model.model.model.embed_tokens(p_before_tokens.input_ids).expand(
batch_size, -1, -1) # bsz x s1 x embed_dim
inputs_embeds = torch.cat([bos_embeds, p_before_embeds, img_embeds, p_after_embeds], dim=1).to(
self.device) # bsz x (1+s1+1+s2) x embed_dim
# create targets
empty_targets = (
torch.ones([batch_size, 1 + p_before_embeds.size()[1] + 1], # 1 (bos) + s1 + 1
dtype=torch.long).to(self.device).fill_(-100)
) # bsz x (1 + s1)
targets = torch.cat([empty_targets, target_ids], dim=1).to(self.device) # bsz x (1 + s1 + 1 + s2)
assert inputs_embeds.size()[1] == targets.size()[1]
atts_prefix = torch.ones([batch_size, 1 + p_before_embeds.size()[1] + 1], dtype=torch.long).to(
self.device) # bsz x (1 + s1 + 1)
attention_mask = torch.cat([atts_prefix, attention_mask], dim=1).to(self.device)
assert attention_mask.size() == targets.size() # bsz x (1 + s1 + 1 + s2)
else:
p_before = '### Human: '
p_before_tokens = self.llama_tokenizer(p_before, return_tensors="pt", add_special_tokens=False).to(
self.device)
# peft model need deeper call
if self.args['freeze_lm']:
p_before_embeds = self.llama_model.model.embed_tokens(p_before_tokens.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
else:
p_before_embeds = self.llama_model.model.model.embed_tokens(p_before_tokens.input_ids).expand(
batch_size, -1, -1) # bsz x s1 x embed_dim
inputs_embeds = torch.cat([bos_embeds, p_before_embeds, p_after_embeds], dim=1).to(
self.device) # bsz x (1+s1+s2) x embed_dim
# create targets
empty_targets = (
torch.ones([batch_size, 1 + p_before_embeds.size()[1]], # 1 (bos) + s1
dtype=torch.long).to(self.device).fill_(-100)
) # bsz x (1 + s1)
targets = torch.cat([empty_targets, target_ids], dim=1).to(self.device) # bsz x (1 + s1 + s2)
assert inputs_embeds.size()[1] == targets.size()[1]
atts_prefix = torch.ones([batch_size, 1 + p_before_embeds.size()[1]], dtype=torch.long).to(
self.device) # bsz x (1 + s1)
attention_mask = torch.cat([atts_prefix, attention_mask], dim=1).to(self.device)
assert attention_mask.size() == targets.size() # bsz x (1 + s1 + s2)
return inputs_embeds, targets, attention_mask
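    # Layout produced by prompt_wrap (sketch). With multimodal input the embedded sequence is
    #   [BOS] "### Human: <Img>" [img_embeds] [embedded input_ids ...]
    # and the corresponding targets are filled with -100 over the BOS/prefix/image positions,
    # so the LM loss is only computed on target_ids; without img_embeds the "<Img>" prefix and
    # its extra -100/attention slot are simply dropped.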
    def _train_with_mode(self, texts, img_embeds=None, modality='text', num_gen_tokens=8,
                         text_hidden_fcs=None, gen_token_idx=None, text_emb_layers=None,
                         text_prompt_embeddings=None, loss_scale=1.0, stage=2):
        """
        :param texts: the batch of conversations/captions
        :param img_embeds: optional multimodal input embeddings produced by the encoders
        :param modality: one of 'image' / 'video' / 'audio' / 'text'
        :param num_gen_tokens: the number of generation (signal) tokens
        :param text_hidden_fcs: the output-side alignment module
        :param gen_token_idx: list of vocabulary ids of the signal tokens
        :param text_emb_layers: the layer indices of the LLM hidden states to align
        :param text_prompt_embeddings: the caption/prompt embeddings from the diffusion text encoder
        :param loss_scale: the scale on the MSE loss for alignment
        :param stage: the training stage
        """
if stage == 2:
input_ids, target_ids, attention_mask = process_batch_stage_2(self.llama_tokenizer, texts,
self.max_length,
num_gen_tokens,
modality
)
elif stage == 3:
input_ids, target_ids, attention_mask = process_batch_stage_3(self.llama_tokenizer, texts, self.max_length,
self.args['num_gen_img_tokens'],
self.args['num_gen_video_tokens'],
self.args['num_gen_audio_tokens']
)
else:
raise NotImplementedError
inputs_embeds, targets, attention_mask = self.prompt_wrap(img_embeds, input_ids, target_ids, attention_mask)
outputs = self.llama_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
return_dict=True,
output_hidden_states=True,
labels=targets,
)
loss = outputs.loss
# calculate the token accuracy
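        # The logits at position t predict token t + 1, so the argmax over positions 1..S-2 is
        # compared against targets[:, 2:]; the +1.0 in the denominator below merely guards
        # against division by zero when every label in the batch is masked.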
chosen_tokens = torch.max(outputs.logits, dim=-1)[1][:, 1:-1] # [B, S-1]
labels = targets[:, 2:]
gen_acc = (chosen_tokens.reshape(-1) == labels.reshape(-1)).to(torch.long) # [B*S]
valid_mask = (labels != -100).reshape(-1)
valid_tokens = gen_acc & valid_mask # [B*S]
gen_acc = valid_tokens.sum().item() / (valid_mask.sum().item() + 1.0)
if modality == 'text':
return loss, gen_acc, torch.zeros_like(loss)
else:
hidden_states = []
# text_hidden_fcs = self.gen_text_hidden_fcs
# based on the targets to obtain the hidden state, targets includes the [BOS] token
start_pos = (targets == gen_token_idx[0]).nonzero(as_tuple=False)[:, 1].tolist()
end_pos = (targets == gen_token_idx[-1]).nonzero(as_tuple=False)[:, 1].tolist()
# logging.info(f'targets : {targets}')
# logging.info(f'start_pos : {start_pos}')
# logging.info(f'end_pos : {end_pos}')
assert 0 < len(start_pos) == len(end_pos) == input_ids.size(0) and len(end_pos) > 0, (start_pos, end_pos)
for idx, fc_layer in zip(text_emb_layers, text_hidden_fcs):
hidden_embedding = []
input_embedding = []
for b, (s, e) in enumerate(zip(start_pos, end_pos)):
assert e - s + 1 == num_gen_tokens, (s, e)
hidden_embedding.append(outputs.hidden_states[idx][b, s:e + 1, :])
input_embedding.append(self.input_embeddings(targets[b, s:e + 1]))
hidden_embedding = torch.stack(hidden_embedding, dim=0)
input_embedding = torch.stack(input_embedding, dim=0)
hidden_states.append(fc_layer(hidden_embedding, input_embedding)) # (N, seq_len, 2048)
embeddings = torch.stack(hidden_states, dim=-1).sum(dim=-1) # (N, 77, 768)
# embeddings = embeddings / embeddings.norm(dim=-1, keepdim=True) # (N, T_I_V_A.txt, 256)
# Obtain the embeddings produced by the text encoder of a frozen text-to-image generation model
input_text = [conversation for conversation in texts]
            if modality == 'image' or modality == 'video':
                mse_loss = l2_loss(embeddings, torch.stack(text_prompt_embeddings, dim=0).to(self.device))
            else:
                # audio: a single pooled caption embedding per sample; reshape it to (bsz, 1, dim)
                text_prompt_embeddings = torch.stack(text_prompt_embeddings, dim=0).to(self.device)
                assert len(text_prompt_embeddings.shape) == 2, text_prompt_embeddings.shape
                text_prompt_embeddings = text_prompt_embeddings.view(
                    text_prompt_embeddings.size(0), 1, text_prompt_embeddings.size(1))
                mse_loss = l2_loss(embeddings, text_prompt_embeddings)
mse_loss = mse_loss.mean()
loss += loss_scale * mse_loss
return loss, gen_acc, mse_loss
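    # Typical decoding-side call (sketch mirroring _dec_align_training_stage_2 below; `batch`
    # stands for the collated inputs dict and only the keys used here are assumed):
    #   loss, gen_acc, mse_loss = self._train_with_mode(
    #       texts=batch['output_texts'], modality='image',
    #       num_gen_tokens=self.args['num_gen_img_tokens'],
    #       text_hidden_fcs=self.gen_text_hidden_fcs,
    #       gen_token_idx=self.args['gen_img_token_idx'],
    #       text_emb_layers=self.args['text_emb_to_img_layers'],
    #       text_prompt_embeddings=batch['caption_embs'], stage=2)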
def _enc_align_training_stage_1(self, inputs):
"""
        Stage 1: train the encoding-side alignment via image/video/audio captioning tasks.
        modality: the input modality for each captioning task; it can be 'image', 'video' or 'audio'.
"""
dataset_type = inputs['dataset_types'][0]
if dataset_type == 'ImageToText':
image_paths = inputs['mm_paths']
mm_embeds, _ = self.encode_image(image_paths)
elif dataset_type == 'VideoToText':
video_paths = inputs['mm_paths']
mm_embeds, _ = self.encode_video(video_paths)
elif dataset_type == 'AudioToText':
audio_paths = inputs['mm_paths']
mm_embeds, _ = self.encode_audio(audio_paths)
else:
raise NotImplementedError
input_ids, target_ids, attention_mask = process_batch_stage_1(self.llama_tokenizer,
inputs['output_texts'],
self.max_length,
self.args['prompt'])
# print(input_ids)
inputs_embeds, targets, attention_mask = self.prompt_wrap(mm_embeds, input_ids, target_ids, attention_mask)
outputs = self.llama_model(
inputs_embeds=inputs_embeds,
attention_mask=attention_mask,
return_dict=True,
output_hidden_states=True,
labels=targets,
)
loss = outputs.loss
# calculate the token accuracy
chosen_tokens = torch.max(outputs.logits, dim=-1)[1][:, 1:-1] # [B, S-1]
labels = targets[:, 2:]
gen_acc = (chosen_tokens.reshape(-1) == labels.reshape(-1)).to(torch.long) # [B*S]
valid_mask = (labels != -100).reshape(-1)
valid_tokens = gen_acc & valid_mask # [B*S]
gen_acc = valid_tokens.sum().item() / (valid_mask.sum().item() + 1.0)
return loss, gen_acc
def _dec_align_training_stage_2(self, inputs):
"""
        Stage 2: train the decoding-side alignment by minimizing the distance between the
        representations of the signal tokens and the caption embeddings produced by the text
        encoder of the respective diffusion model.
        modality: the output modality for each caption.
"""
dataset_type = inputs['dataset_types'][0]
if dataset_type == 'TextToImage':
loss, gen_acc, mse_loss = self._train_with_mode(texts=inputs['output_texts'],
modality='image',
num_gen_tokens=self.args['num_gen_img_tokens'],
text_hidden_fcs=self.gen_text_hidden_fcs,
gen_token_idx=self.args['gen_img_token_idx'],
text_emb_layers=self.args['text_emb_to_img_layers'],
                                                            text_prompt_embeddings=inputs['caption_embs'],
stage=self.stage)
elif dataset_type == 'TextToVideo':
loss, gen_acc, mse_loss = self._train_with_mode(texts=inputs['output_texts'],
modality='video',
num_gen_tokens=self.args['num_gen_video_tokens'],
text_hidden_fcs=self.gen_text_hidden_fcs_video,
gen_token_idx=self.args['gen_video_token_idx'],
text_emb_layers=self.args['text_emb_to_video_layers'],
                                                            text_prompt_embeddings=inputs['caption_embs'],
stage=self.stage)
elif dataset_type == 'TextToAudio':
loss, gen_acc, mse_loss = self._train_with_mode(texts=inputs['output_texts'],
modality='audio',
num_gen_tokens=self.args['num_gen_audio_tokens'],
text_hidden_fcs=self.gen_text_hidden_fcs_audio,
gen_token_idx=self.args['gen_audio_token_idx'],
text_emb_layers=self.args['text_emb_to_audio_layers'],
                                                            text_prompt_embeddings=inputs['caption_embs'],
stage=self.stage)
else:
raise NotImplementedError
return loss, gen_acc, mse_loss
def _instruction_tuning_stage_3(self, inputs):
"""
        Stage 3: instruction tuning on the instruction-following dataset.
"""
loss = 0
gen_acc = 0
mse_loss = []
dataset_type = inputs['dataset_types'][0]
if dataset_type == 'TextToImage':
loss, gen_acc, mse_loss = self._train_with_mode(inputs['output_texts'], None, 'image',
self.args['num_gen_img_tokens'],
self.gen_text_hidden_fcs,
self.args['gen_img_token_idx'],
self.args['text_emb_to_img_layers'],
inputs['caption_embs'], stage=self.stage)
elif dataset_type == 'TextToVideo':
loss, gen_acc, mse_loss = self._train_with_mode(inputs['output_texts'], None, 'video',
self.args['num_gen_video_tokens'],
self.gen_text_hidden_fcs_video,
self.args['gen_video_token_idx'],
self.args['text_emb_to_video_layers'],
inputs['caption_embs'], loss_scale=2,
stage=self.stage)
elif dataset_type == 'TextToAudio':
loss, gen_acc, mse_loss = self._train_with_mode(inputs['output_texts'], None, 'audio',
self.args['num_gen_audio_tokens'],
self.gen_text_hidden_fcs_audio,
self.args['gen_audio_token_idx'],
self.args['text_emb_to_audio_layers'],
inputs['caption_embs'], stage=self.stage)
elif dataset_type == 'ImageToText':
image_paths = inputs['mm_paths']
img_embeds, _ = self.encode_image(image_paths)
loss, gen_acc, _ = self._train_with_mode(inputs['output_texts'], img_embeds, modality='text',
stage=self.stage)
elif dataset_type == 'TextToText':
loss, gen_acc, _ = self._train_with_mode(inputs['output_texts'], None, modality='text',
stage=self.stage)
else:
raise NotImplementedError
return loss, gen_acc, mse_loss
def _stage_4_training(self, inputs):
"""
        Stage 4: instruction-tune the overall framework with the modality-switching dataset (not implemented in this file).
"""
pass
def forward(self, inputs):
loss = 0
gen_acc = 0
mse_loss = None
if self.stage == 1:
loss, gen_acc = self._enc_align_training_stage_1(inputs)
elif self.stage == 2:
loss, gen_acc, mse_loss = self._dec_align_training_stage_2(inputs)
elif self.stage == 3:
loss, gen_acc, mse_loss = self._instruction_tuning_stage_3(inputs)
else:
raise NotImplementedError(f"stage {self.stage} is not implemented, now it only support [1, 2, 3]")
return loss, gen_acc, mse_loss
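    # Training-loop sketch (the optimizer and dataloader are assumptions, not part of this file):
    #   loss, gen_acc, mse_loss = model(batch)   # batch is built by the stage-specific collator
    #   loss.backward(); optimizer.step(); optimizer.zero_grad()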
def extract_multimodal_feature(self, inputs):
features = []
if inputs['image_paths']:
image_embeds, _ = self.encode_image(inputs['image_paths'])
features.append(image_embeds)
if inputs['audio_paths']:
audio_embeds, _ = self.encode_audio(inputs['audio_paths'])
features.append(audio_embeds)
if inputs['video_paths']:
video_embeds, _ = self.encode_video(inputs['video_paths'])
features.append(video_embeds)
feature_embeds = torch.cat(features).sum(dim=0).unsqueeze(0)
return feature_embeds
def _prepare_image_embed(self, text, batch_size):
pattern = r'Image>(.*?)<\/Image'
matches = re.findall(pattern, text)
features = []
p_before_token = self.llama_tokenizer('<Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
p_after_token = self.llama_tokenizer('</Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
if self.args['freeze_lm']:
p_before_embeds = self.llama_model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
else:
p_before_embeds = self.llama_model.model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
for m in matches:
print('image path: ', m)
if m.startswith('temp'):
m = os.path.join('./', m)
print('image path: ', m)
_temp_embedding, _ = self.encode_image([m])
features.append(_temp_embedding)
feature_embeds = torch.cat(features).sum(dim=0).unsqueeze(0)
return torch.cat([p_before_embeds, feature_embeds, p_after_embeds], dim=1)
def _prepare_video_embed(self, text, batch_size):
pattern = r'Video>(.*?)<\/Video'
matches = re.findall(pattern, text)
features = []
p_before_token = self.llama_tokenizer('<Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
p_after_token = self.llama_tokenizer('</Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
if self.args['freeze_lm']:
p_before_embeds = self.llama_model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
else:
p_before_embeds = self.llama_model.model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
for m in matches:
print('Video path: ', m)
if m.startswith('temp'):
m = os.path.join('./', m)
print('Video path: ', m)
_temp_embedding, _ = self.encode_video([m])
features.append(_temp_embedding)
feature_embeds = torch.cat(features).sum(dim=0).unsqueeze(0)
return torch.cat([p_before_embeds, feature_embeds, p_after_embeds], dim=1)
def _prepare_audio_embed(self, text, batch_size):
pattern = r'Audio>(.*?)<\/Audio'
matches = re.findall(pattern, text)
features = []
p_before_token = self.llama_tokenizer('<Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
p_after_token = self.llama_tokenizer('</Img>', add_special_tokens=False, return_tensors='pt').to(self.device)
if self.args['freeze_lm']:
p_before_embeds = self.llama_model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
else:
p_before_embeds = self.llama_model.model.model.embed_tokens(p_before_token.input_ids).expand(batch_size, -1,
-1) # bsz x s1 x embed_dim
p_after_embeds = self.llama_model.model.model.embed_tokens(p_after_token.input_ids).expand(batch_size, -1,
-1) # bsz x s2 x embed_dim
for m in matches:
print('Audio path: ', m)
if m.startswith('temp'):
m = os.path.join('./', m)
                print('Audio path: ', m)
_temp_embedding, _ = self.encode_audio([m])
features.append(_temp_embedding)
feature_embeds = torch.cat(features).sum(dim=0).unsqueeze(0)
return torch.cat([p_before_embeds, feature_embeds, p_after_embeds], dim=1)
def prepare_generation_embedding(self, inputs):
prompt = inputs['prompt']
text = prompt + '\n### Assistant:'
print("text prompt: ", text)
batch_size = 1
input_embeds = []
split_text = re.split(r' <|> ', text)
for st in split_text:
if st.startswith('Image>'):
input_embeds.append(self._prepare_image_embed(st, batch_size))
elif st.startswith('Audio>'):
input_embeds.append(self._prepare_audio_embed(st, batch_size))
elif st.startswith('Video>'):
input_embeds.append(self._prepare_video_embed(st, batch_size))
else:
text_tokens = self.llama_tokenizer(st, add_special_tokens=False, return_tensors='pt').to(self.device)
bos = torch.ones([batch_size, 1],
dtype=text_tokens.input_ids.dtype,
device=text_tokens.input_ids.device) * self.llama_tokenizer.bos_token_id # bsz x 1
if self.args['freeze_lm']:
text_embeds = self.llama_model.model.embed_tokens(text_tokens.input_ids).expand(batch_size, -1, -1)
bos_embeds = self.llama_model.model.embed_tokens(bos) # bsz x 1 x embed_dim
else:
text_embeds = self.llama_model.model.model.embed_tokens(text_tokens.input_ids).expand(batch_size,
-1, -1)
bos_embeds = self.llama_model.model.model.embed_tokens(bos) # bsz x 1 x embed_dim
input_embeds.append(bos_embeds)
input_embeds.append(text_embeds)
inputs_embeds = torch.cat(input_embeds, dim=1) # bsz x (1+s2) x embed_dim
return inputs_embeds
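    # Prompt format consumed above (sketch; the path and question are illustrative). Multimodal
    # inputs are spliced into the text as
    #   "### Human: <Image>./temp/dog.jpg</Image> What is in the photo?"
    # The re.split on ' <|> ' separates plain-text chunks from segments starting with 'Image>',
    # 'Audio>' or 'Video>', which are routed to the matching _prepare_*_embed helper; everything
    # else is embedded as ordinary text.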
def generate_tokens_embeddings(self, inputs, input_embeds, temperature: float = 0.0, top_p: float = 1.0):
"""
        This function generates the output token ids and the hidden-state embeddings that are used to synthesize images/videos/audios.
inputs: dict
input_embeds: tensor
return:
out: the output tokens index
output_embeddings: output embeddings for synthesizing images
video_output_embedding: output embeddings for synthesizing video
audio_output_embedding: output embeddings for synthesizing audio
"""
stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=inputs['stops_id'], encounters=1)])
outputs = self.llama_model.generate(
inputs_embeds=input_embeds,
max_new_tokens=inputs['max_tgt_len'],
top_p=inputs['top_p'],
temperature=inputs['temperature'],
# repeat_pen,
do_sample=True,
use_cache=True,
stopping_criteria=stopping_criteria,
output_hidden_states=True,
return_dict_in_generate=True,
output_attentions=True
)
output_embeddings = []
video_output_embedding = []
audio_output_embedding = []
out = outputs.sequences
for _hidden_states in outputs.hidden_states[1:]:
for idx in self.args['text_emb_to_img_layers']:
output_embeddings.append(_hidden_states[idx])
for idx in self.args['text_emb_to_video_layers']:
video_output_embedding.append(_hidden_states[idx])
for idx in self.args['text_emb_to_audio_layers']:
audio_output_embedding.append(_hidden_states[idx])
output_embeddings = torch.cat(output_embeddings, dim=1)
video_output_embedding = torch.cat(video_output_embedding, dim=1)
audio_output_embedding = torch.cat(audio_output_embedding, dim=1)
return out, output_embeddings, video_output_embedding, audio_output_embedding
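    # Note: outputs.hidden_states holds one tuple of layer outputs per generation step; the loops
    # above skip index 0 (the forward pass over the prompt), pick the configured layer for each
    # modality, and concatenate the per-step states along the sequence dimension.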
def generate_images(self, generated_ids, embeddings, all_gen_idx, generation_model=None,
guidance_scale=7.5, num_inference_steps=40):
"""
To generate the images based on the embeddings
generated_ids: the index of the generated tokens
        embeddings: the hidden-state embeddings used to synthesize images
all_gen_idx: the index of [IMG0] in the generated_ids
"""
last_ret_idx = 0
return_outputs = []
generation_model = StableDiffusionPipeline.from_pretrained(self.sd_ckpt_path, torch_dtype=torch.float16).to(
"cuda")
for gen_idx in all_gen_idx:
assert generated_ids[0,
gen_idx:gen_idx + self.args['num_gen_img_tokens']].cpu().detach().numpy().tolist() == self.args[
'gen_img_token_idx'], (
generated_ids[0, gen_idx:gen_idx + self.args['num_gen_img_tokens']], self.args['gen_img_token_idx'])
raw_emb = embeddings[:, gen_idx - 1:gen_idx - 1 + self.args['num_gen_img_tokens'], :] # (1, 8, 4096)
# Produce generation embedding.
gen_prefix = ' '.join([f'[IMG{i}]' for i in range(self.args['num_gen_img_tokens'])])
gen_prefx_ids = self.llama_tokenizer(gen_prefix, add_special_tokens=False,
return_tensors="pt").input_ids.to(self.device)
gen_prefix_embs = self.input_embeddings(gen_prefx_ids) # (1, T_I_V_A.txt, D)
gen_emb = self.gen_text_hidden_fcs[-1](raw_emb, gen_prefix_embs) # (1, 77, 768)
if gen_emb.shape[1] != 77:
bs = gen_emb.shape[0]
clip_emb = 768
gen_emb = gen_emb.reshape(bs, -1, clip_emb) # (bs, T_I_V_A.txt, 768)
seq_len = gen_emb.shape[1]
gen_emb = torch.cat([gen_emb, torch.zeros((bs, 77 - seq_len, clip_emb), device=gen_emb.device,
dtype=gen_emb.dtype)], dim=1)
image_outputs = generation_model(prompt_embeds=gen_emb,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps).images
caption = \
self.llama_tokenizer.batch_decode(generated_ids[:, last_ret_idx:gen_idx], skip_special_tokens=True)[
0]
last_ret_idx = gen_idx + 1
return_outputs.append(caption + f' {gen_prefix}')
# return_outputs.append(truncate_caption(caption) + f' {gen_prefix}')
return_outputs.append(image_outputs)
return return_outputs
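    # The image and video branches hand the aligned embedding to the custom diffusion pipelines
    # via `prompt_embeds` (a 77-token CLIP-style sequence, zero-padded above when shorter) instead
    # of a text prompt; the audio branch below instead collapses its output to one pooled vector.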
def generate_videos(self, generated_ids, embeddings, all_gen_idx, generation_model=None,
guidance_scale=7.5, num_inference_steps=40, height=320, width=576, num_frames=16):
"""
To generate videos based on the embeddings
generated_ids: the index of the generated tokens
        embeddings: the hidden-state embeddings used to synthesize videos
all_gen_idx: the index of [VID0] in the generated_ids
"""
return_outputs = []
last_ret_idx = 0
generation_model = TextToVideoSDPipeline.from_pretrained(self.vd_ckpt_path, torch_dtype=torch.float16).to(
"cuda")
for gen_idx in all_gen_idx:
assert generated_ids[0,
gen_idx:gen_idx + self.args['num_gen_video_tokens']].cpu().detach().numpy().tolist() == \
self.args[
'gen_video_token_idx'], (
generated_ids[0, gen_idx:gen_idx + self.args['num_gen_video_tokens']],
self.args['gen_video_token_idx'])
raw_emb = embeddings[:, gen_idx - 1:gen_idx - 1 + self.args['num_gen_video_tokens'], :] # (1, 8, 4096)
# print(f'gen_idx: {gen_idx}')
# print('4', raw_emb.size())
# assert len(self.args['text_emb_to_video_layers']) == 1
# Produce generation embedding.
gen_prefix = ' '.join([f'[VID{i}]' for i in range(self.args['num_gen_video_tokens'])])
gen_prefx_ids = self.llama_tokenizer(gen_prefix, add_special_tokens=False,
return_tensors="pt").input_ids.to(self.device)
gen_prefix_embs = self.input_embeddings(gen_prefx_ids) # (1, T_I_V_A.txt, D)
gen_emb = self.gen_text_hidden_fcs_video[-1](raw_emb, gen_prefix_embs) # (1, 77, 768)
if gen_emb.shape[1] != 77:
print(f"Padding {gen_emb.shape} with zeros")
bs = gen_emb.shape[0]
clip_emb = 768
gen_emb = gen_emb.reshape(bs, -1, clip_emb) # (bs, T_I_V_A.txt, 768)
seq_len = gen_emb.shape[1]
gen_emb = torch.cat([gen_emb, torch.zeros((bs, 77 - seq_len, clip_emb), device=gen_emb.device,
dtype=gen_emb.dtype)], dim=1)
print('Padded to', gen_emb.shape)
video_outputs = generation_model(prompt_embeds=gen_emb,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps, height=height,
width=width, num_frames=num_frames).frames
caption = \
self.llama_tokenizer.batch_decode(generated_ids[:, last_ret_idx:gen_idx], skip_special_tokens=True)[
0]
last_ret_idx = gen_idx + 1
return_outputs.append(caption + f' {gen_prefix}')
# return_outputs.append(truncate_caption(caption) + f' {gen_prefix}')
return_outputs.append(video_outputs)
return return_outputs
def generate_audios(self, generated_ids, embeddings, all_gen_idx, generation_model=None,
guidance_scale=7.5, num_inference_steps=40, audio_length_in_s=5.0):
"""
        To generate audios based on the embeddings
        generated_ids: the index of the generated tokens
        embeddings: the hidden-state embeddings used to synthesize audios
all_gen_idx: the index of [AUD0] in the generated_ids
"""
return_outputs = []
last_ret_idx = 0
generation_model = AudioLDMPipeline.from_pretrained(self.ad_ckpt_path, torch_dtype=torch.float16).to("cuda")
for gen_idx in all_gen_idx:
assert generated_ids[0,
gen_idx:gen_idx + self.args['num_gen_audio_tokens']].cpu().detach().numpy().tolist() == \
self.args[
'gen_audio_token_idx'], (
generated_ids[0, gen_idx:gen_idx + self.args['num_gen_audio_tokens']],
self.args['gen_audio_token_idx'])
raw_emb = embeddings[:, gen_idx - 1:gen_idx - 1 + self.args['num_gen_audio_tokens'], :] # (1, 8, 4096)
# print(f'gen_idx: {gen_idx}')
# print('raw_emb 4', raw_emb.size())
# assert len(self.args['text_emb_to_video_layers']) == 1
# Produce generation embedding.
gen_prefix = ' '.join([f'[AUD{i}]' for i in range(self.args['num_gen_audio_tokens'])])
gen_prefx_ids = self.llama_tokenizer(gen_prefix, add_special_tokens=False,
return_tensors="pt").input_ids.to(self.device)
gen_prefix_embs = self.input_embeddings(gen_prefx_ids) # (1, T_I_V_A.txt, D)
gen_emb = self.gen_text_hidden_fcs_audio[-1](raw_emb, gen_prefix_embs) # (1, 77, 768)
# print('gen_emb size:', gen_emb.size())
bs = gen_emb.shape[0]
hid_emb_size = gen_emb.shape[2]
gen_emb = gen_emb.view(bs, hid_emb_size)
audio_outputs = generation_model(prompt_embeds=gen_emb,
guidance_scale=guidance_scale,
num_inference_steps=num_inference_steps,
audio_length_in_s=audio_length_in_s).audios[0]
caption = \
self.llama_tokenizer.batch_decode(generated_ids[:, last_ret_idx:gen_idx], skip_special_tokens=True)[
0]
last_ret_idx = gen_idx + 1
return_outputs.append(caption + f' {gen_prefix}')
# return_outputs.append(truncate_caption(caption) + f' {gen_prefix}')
return_outputs.append(audio_outputs)
return return_outputs
def generate(self, inputs):
"""
inputs = {
                'image_paths': optional list of image paths,
                'audio_paths': optional list of audio paths,
                'video_paths': optional list of video paths,
                'thermal_paths': optional list of thermal-image paths,
                'mode': generation mode,
                'prompt': human input prompt,
                'max_tgt_len': maximum number of tokens to generate,
                'top_p': top_p for nucleus sampling,
                'temperature': temperature used to modulate the logit distribution,
                'modality_embeds': None or torch.tensor,
                'modality_cache': cache of the multimodal (e.g. image) embeddings,
                'filter_value': value to assign to tokens that should never be generated,
                'min_word_tokens': minimum number of word tokens to generate before allowing an [IMG] output,
'gen_scale_factor': float = 1.0,
                'stops_id': stop token ids; the default is [[835], [2277, 29937]], i.e. the two ways '###' can be tokenized,
                'ENCOUNTERS': how many times a stop sequence must occur before generation is ended,
                'load_sd': whether to use SD (Stable Diffusion) for image generation,
                'max_num_imgs': maximum number of images to return in one generation pass,
                'guidance_scale_for_img': the classifier-free guidance scale; if None, the SD default is applied,
                'num_inference_steps_for_img': the number of inference steps for image generation in the SD model,
                'load_vd': whether to use VD (the video diffusion model) for video generation,
                'max_num_vids': maximum number of videos to return in one generation pass,
                'guidance_scale_for_vid': the classifier-free guidance scale; if None, the VD default is applied,
                'num_inference_steps_for_vid': the number of inference steps for video generation in the VD model,
                'height': (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                    The height in pixels of the generated video.
                'width': (`int`, *optional*, defaults to self.unet.config.sample_size * self.vae_scale_factor):
                    The width in pixels of the generated video.
                'num_frames': (`int`, *optional*, defaults to 16):
                    The number of video frames that are generated. Defaults to 16 frames, which at 8 frames
                    per second amounts to 2 seconds of video.
                'load_ad': whether to use AD (AudioLDM) for audio generation,
                'max_num_auds': maximum number of audios to return in one generation pass,
                'guidance_scale_for_aud': the classifier-free guidance scale; if None, the AD default is applied,
                'num_inference_steps_for_aud': the number of inference steps for audio generation in the AD model,
                'audio_length_in_s': the length of the generated audio in seconds,
}
"""
# init output with image tokens
input_embeds = self.prepare_generation_embedding(inputs)
generated_ids, generated_image_embeddings, generated_video_embeddings, generated_audio_embeddings = self.generate_tokens_embeddings(
inputs, input_embeds)
return_outputs = []
# Find up to max_num_rets [IMG] tokens, and their corresponding scores.
all_gen_img_idx = [i for i, x in enumerate(generated_ids[0, :] == self.args['gen_img_token_idx'][0]) if x][
:inputs['max_num_imgs']]
print('all_gen_img_idx: ', all_gen_img_idx)
# Find up to max_num_rest [VID] tokens, and their corresponding scores.
all_gen_vid_idx = [i for i, x in enumerate(generated_ids[0, :] == self.args['gen_video_token_idx'][0]) if x][
:inputs['max_num_vids']]
print('all_gen_vid_idx: ', all_gen_vid_idx)
# Find up to max_num_rest [AUD] tokens, and their corresponding scores.
all_gen_aud_idx = [i for i, x in enumerate(generated_ids[0, :] == self.args['gen_audio_token_idx'][0]) if x][
:inputs['max_num_auds']]
print('all_gen_aud_idx: ', all_gen_aud_idx)
if len(all_gen_img_idx) == 0 and len(all_gen_vid_idx) == 0 and len(all_gen_aud_idx) == 0:
# No [IMG], [VID], [AUD] tokens.
caption = self.llama_tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
# return_outputs.append(truncate_caption(caption))
return_outputs.append(caption)
else:
if len(all_gen_img_idx) > 0:
img_outputs = self.generate_images(generated_ids, generated_image_embeddings, all_gen_img_idx, None,
guidance_scale=inputs['guidance_scale_for_img'],
num_inference_steps=inputs['num_inference_steps_for_img'],
)
return_outputs.append({'img': img_outputs})
if len(all_gen_vid_idx) > 0:
vid_outputs = self.generate_videos(generated_ids, generated_video_embeddings, all_gen_vid_idx, None,
guidance_scale=inputs['guidance_scale_for_vid'],
num_inference_steps=inputs['num_inference_steps_for_vid'],
height=inputs['height'], width=inputs['width'],
num_frames=inputs['num_frames'])
return_outputs.append({'vid': vid_outputs})
if len(all_gen_aud_idx) > 0:
aud_outputs = self.generate_audios(generated_ids, generated_audio_embeddings, all_gen_aud_idx, None,
guidance_scale=inputs['guidance_scale_for_aud'],
num_inference_steps=inputs['num_inference_steps_for_aud'],
audio_length_in_s=inputs['audio_length_in_s'])
return_outputs.append({'aud': aud_outputs})
return return_outputs
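# End-to-end inference sketch. The keys follow the docstring of `generate`; every concrete value
# below is an illustrative assumption:
#
#   inputs = {
#       'prompt': '### Human: <Image>./temp/dog.jpg</Image> show me a video of this dog running',
#       'image_paths': [], 'audio_paths': [], 'video_paths': [], 'thermal_paths': [],
#       'max_tgt_len': 256, 'top_p': 1.0, 'temperature': 0.4,
#       'modality_embeds': None, 'modality_cache': None,
#       'stops_id': [[835], [2277, 29937]], 'ENCOUNTERS': 1,
#       'load_sd': True, 'max_num_imgs': 1,
#       'guidance_scale_for_img': 7.5, 'num_inference_steps_for_img': 40,
#       'load_vd': True, 'max_num_vids': 1,
#       'guidance_scale_for_vid': 7.5, 'num_inference_steps_for_vid': 40,
#       'height': 320, 'width': 576, 'num_frames': 16,
#       'load_ad': True, 'max_num_auds': 1,
#       'guidance_scale_for_aud': 7.5, 'num_inference_steps_for_aud': 40,
#       'audio_length_in_s': 5.0,
#   }
#   outputs = model.generate(inputs)   # list of caption strings and {'img'/'vid'/'aud': ...} dicts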