Spaces:

fffiloni
/

Open-Sora-Plan-v1-0-0

Runtime error

App Files Files Community

Open-Sora-Plan-v1-0-0 / opensora /train /train_t2v.py

fffiloni

Upload 244 files

b3f324b verified 5 months ago

raw

history blame contribute delete

No virus

37.9 kB

	# Copyright (c) Meta Platforms, Inc. and affiliates.
	# All rights reserved.

	# This source code is licensed under the license found in the
	# LICENSE file in the root directory of this source tree.

	"""
	A minimal training script for text-to-video diffusion models (e.g. LatteT2V) using Hugging Face Accelerate.
	"""
	import argparse
	import logging
	import math
	import os
	import shutil
	from pathlib import Path
	from typing import Optional

	import numpy as np
	from einops import rearrange
	from tqdm import tqdm
	from dataclasses import field, dataclass
	from torch.utils.data import DataLoader
	from copy import deepcopy

	import accelerate
	import torch
	from torch.nn import functional as F
	import transformers
	from accelerate import Accelerator
	from accelerate.logging import get_logger
	from accelerate.utils import ProjectConfiguration, set_seed
	from huggingface_hub import create_repo
	from packaging import version
	from tqdm.auto import tqdm
	from transformers import HfArgumentParser, TrainingArguments, AutoTokenizer

	import diffusers
	from diffusers import DDPMScheduler, PNDMScheduler
	from diffusers.optimization import get_scheduler
	from diffusers.training_utils import EMAModel, compute_snr
	from diffusers.utils import check_min_version, is_wandb_available

	from opensora.dataset import getdataset, ae_denorm
	from opensora.models.ae import getae, getae_wrapper
	from opensora.models.ae.videobase import CausalVQVAEModelWrapper, CausalVAEModelWrapper
	from opensora.models.diffusion.diffusion import create_diffusion_T as create_diffusion
	from opensora.models.diffusion.latte.modeling_latte import LatteT2V
	from opensora.models.text_encoder import get_text_enc
	from opensora.utils.dataset_utils import Collate
	from opensora.models.ae import ae_stride_config, ae_channel_config
	from opensora.models.diffusion import Diffusion_models

	# Will error at import time if the minimal version of diffusers (0.24.0) is not installed.
	# Remove at your own risk.
	check_min_version("0.24.0")
	# Module-level logger created with accelerate.logging.get_logger; main() passes it
	# accelerate-specific keywords such as `main_process_only`.
	logger = get_logger(__name__)


	def generate_timestep_weights(args, num_timesteps):
	    """Build per-timestep sampling weights, optionally biased toward a sub-range.

	    Args:
	        args: Namespace-like object providing ``timestep_bias_strategy``,
	            ``timestep_bias_portion``, ``timestep_bias_multiplier`` and, for the
	            ``"range"`` strategy, ``timestep_bias_begin`` / ``timestep_bias_end``.
	        num_timesteps: Length of the weight vector to create.

	    Returns:
	        A 1-D ``torch.Tensor`` of length ``num_timesteps``. For the ``"later"``,
	        ``"earlier"`` and ``"range"`` strategies the selected slice is scaled by
	        the multiplier and the vector is normalized to sum to 1. For any other
	        strategy (including ``"none"``) the un-normalized all-ones vector is
	        returned unchanged.

	    Raises:
	        ValueError: If the ``"range"`` bounds fall outside ``[0, num_timesteps]``
	            or if the multiplier is not strictly positive.
	    """
	    weights = torch.ones(num_timesteps)

	    # Number of timesteps affected by the "later" / "earlier" strategies.
	    num_to_bias = int(args.timestep_bias_portion * num_timesteps)

	    if args.timestep_bias_strategy == "later":
	        bias_indices = slice(-num_to_bias, None)
	    elif args.timestep_bias_strategy == "earlier":
	        bias_indices = slice(0, num_to_bias)
	    elif args.timestep_bias_strategy == "range":
	        # Out of the possible 1000 timesteps, we might want to focus on eg. 200-500.
	        range_begin = args.timestep_bias_begin
	        range_end = args.timestep_bias_end
	        if range_begin < 0:
	            raise ValueError(
	                "When using the range strategy for timestep bias, you must provide a beginning timestep greater or equal to zero."
	            )
	        if range_end > num_timesteps:
	            raise ValueError(
	                "When using the range strategy for timestep bias, you must provide an ending timestep smaller than the number of timesteps."
	            )
	        bias_indices = slice(range_begin, range_end)
	    else:  # 'none' or any other string
	        return weights

	    if args.timestep_bias_multiplier <= 0:
	        # Raise (the original returned the exception object, which callers would
	        # then have treated as the weight tensor).
	        raise ValueError(
	            "The parameter --timestep_bias_multiplier is not intended to be used to disable the training of specific timesteps."
	            " If it was intended to disable timestep bias, use `--timestep_bias_strategy none` instead."
	            " A timestep bias multiplier less than or equal to 0 is not allowed."
	        )

	    # Apply the bias to the selected slice only.
	    weights[bias_indices] *= args.timestep_bias_multiplier

	    # Normalize so the weights sum to 1.
	    weights /= weights.sum()

	    return weights


	#################################################################################
	# Training Loop #
	#################################################################################

	def main(args):
	# Entry point for training, driven by the parsed CLI namespace `args`.
	# Visible steps, in order: build the Accelerator, load the autoencoder and text encoder,
	# instantiate the diffusion model, set up the optimizer / dataloader / LR scheduler,
	# optionally resume from a checkpoint, then run the training loop with periodic
	# checkpointing and optional validation sampling. No value is returned.
	# NOTE(review): this copy of the file has lost its indentation (every line sits at a
	# single tab), so block nesting cannot be read from the text; restore it from the
	# upstream source before running.
	logging_dir = Path(args.output_dir, args.logging_dir)

	accelerator_project_config = ProjectConfiguration(project_dir=args.output_dir, logging_dir=logging_dir)

	accelerator = Accelerator(
	gradient_accumulation_steps=args.gradient_accumulation_steps,
	mixed_precision=args.mixed_precision,
	log_with=args.report_to,
	project_config=accelerator_project_config,
	)

	if args.report_to == "wandb":
	if not is_wandb_available():
	raise ImportError("Make sure to install wandb if you want to use it for logging during training.")
	import wandb

	# Make one log on every process with the configuration for debugging.
	logging.basicConfig(
	format="%(asctime)s - %(levelname)s - %(name)s - %(message)s",
	datefmt="%m/%d/%Y %H:%M:%S",
	level=logging.INFO,
	)
	logger.info(accelerator.state, main_process_only=False)
	if accelerator.is_local_main_process:
	transformers.utils.logging.set_verbosity_warning()
	diffusers.utils.logging.set_verbosity_info()
	else:
	transformers.utils.logging.set_verbosity_error()
	diffusers.utils.logging.set_verbosity_error()

	# If passed along, set the training seed now.
	if args.seed is not None:
	set_seed(args.seed)

	# Handle the repository creation
	if accelerator.is_main_process:
	if args.output_dir is not None:
	os.makedirs(args.output_dir, exist_ok=True)

	# if args.push_to_hub:
	# repo_id = create_repo(
	# repo_id=args.hub_model_id or Path(args.output_dir).name, exist_ok=True, token=args.hub_token
	# ).repo_id

	# Create model:

	diffusion = create_diffusion(timestep_respacing="") # default: 1000 steps, linear noise schedule
	ae = getae_wrapper(args.ae)(args.ae_path).eval()
	if args.enable_tiling:
	ae.vae.enable_tiling()
	ae.vae.tile_overlap_factor = args.tile_overlap_factor
	text_enc = get_text_enc(args).eval()

	# Temporal / height / width strides of the chosen autoencoder, also mirrored onto `args`.
	ae_stride_t, ae_stride_h, ae_stride_w = ae_stride_config[args.ae]
	args.ae_stride_t, args.ae_stride_h, args.ae_stride_w = ae_stride_t, ae_stride_h, ae_stride_w
	args.ae_stride = args.ae_stride_h
	# The last three characters of the model name are read as the (t, h, w) patch sizes,
	# e.g. "122" in the CLI default "DiT-XL/122".
	patch_size = args.model[-3:]
	patch_size_t, patch_size_h, patch_size_w = int(patch_size[0]), int(patch_size[1]), int(patch_size[2])
	args.patch_size = patch_size_h
	args.patch_size_t, args.patch_size_h, args.patch_size_w = patch_size_t, patch_size_h, patch_size_w
	assert ae_stride_h == ae_stride_w, f"Support only ae_stride_h == ae_stride_w now, but found ae_stride_h ({ae_stride_h}), ae_stride_w ({ae_stride_w})"
	assert patch_size_h == patch_size_w, f"Support only patch_size_h == patch_size_w now, but found patch_size_h ({patch_size_h}), patch_size_w ({patch_size_w})"
	# assert args.num_frames % ae_stride_t == 0, f"Num_frames must be divisible by ae_stride_t, but found num_frames ({args.num_frames}), ae_stride_t ({ae_stride_t})."
	assert args.max_image_size % ae_stride_h == 0, f"Image size must be divisible by ae_stride_h, but found max_image_size ({args.max_image_size}), ae_stride_h ({ae_stride_h})."

	latent_size = (args.max_image_size // ae_stride_h, args.max_image_size // ae_stride_w)

	# The two causal wrappers get one extra latent frame (num_frames // stride + 1).
	# NOTE(review): `args.video_length` is assigned only in the causal branch; the other
	# branch sets just the local `video_length` - confirm nothing downstream reads it from args.
	if getae_wrapper(args.ae) == CausalVQVAEModelWrapper or getae_wrapper(args.ae) == CausalVAEModelWrapper:
	args.video_length = video_length = args.num_frames // ae_stride_t + 1
	else:
	video_length = args.num_frames // ae_stride_t
	# The model predicts twice as many channels as it takes in (out_channels = in_channels * 2).
	model = Diffusion_models[args.model](
	in_channels=ae_channel_config[args.ae],
	out_channels=ae_channel_config[args.ae] * 2,
	# caption_channels=4096,
	# cross_attention_dim=1152,
	attention_bias=True,
	sample_size=latent_size,
	num_vector_embeds=None,
	activation_fn="gelu-approximate",
	num_embeds_ada_norm=1000,
	use_linear_projection=False,
	only_cross_attention=False,
	double_self_attention=False,
	upcast_attention=False,
	# norm_type="ada_norm_single",
	norm_elementwise_affine=False,
	norm_eps=1e-6,
	attention_type='default',
	video_length=video_length,
	attention_mode=args.attention_mode,
	# compress_kv=args.compress_kv
	)
	model.gradient_checkpointing = args.gradient_checkpointing

	# # use pretrained model?
	# Non-strict load: missing / unexpected keys are only counted and logged, never fatal.
	# NOTE(review): torch.load unpickles the file - only point --pretrained at trusted checkpoints.
	if args.pretrained:
	checkpoint = torch.load(args.pretrained, map_location='cpu')['model']
	model_state_dict = model.state_dict()
	missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
	logger.info(f'missing_keys {len(missing_keys)}, unexpected_keys {len(unexpected_keys)}')
	logger.info(f'Successfully load {len(model.state_dict()) - len(missing_keys)}/{len(model_state_dict)} keys from {args.pretrained}!')
	# load from pixart-alpha
	# pixelart_alpha = torch.load(args.pretrained, map_location='cpu')['state_dict']
	# checkpoint = {}
	# for k, v in pixelart_alpha.items():
	# if 'x_embedder' in k or 't_embedder' in k or 'y_embedder' in k:
	# checkpoint[k] = v
	# if k.startswith('blocks'):
	# k_spilt = k.split('.')
	# blk_id = str(int(k_spilt[1]) * 2)
	# k_spilt[1] = blk_id
	# new_k = '.'.join(k_spilt)
	# checkpoint[new_k] = v
	# missing_keys, unexpected_keys = model.load_state_dict(checkpoint, strict=False)
	# logger.info(f'Successfully load {len(model.state_dict()) - len(missing_keys)} keys from {args.pretrained}!')

	# Freeze vae and text encoders.
	ae.requires_grad_(False)
	text_enc.requires_grad_(False)
	# Set model as trainable.
	model.train()

	# For mixed precision training we cast all non-trainable weights to half-precision
	# as these weights are only used for inference, keeping weights in full precision is not required.
	weight_dtype = torch.float32
	if accelerator.mixed_precision == "fp16":
	weight_dtype = torch.float16
	elif accelerator.mixed_precision == "bf16":
	weight_dtype = torch.bfloat16

	# Move the autoencoder and text encoder to the device; only the text encoder is cast to weight_dtype.
	# The VAE is in float32 to avoid NaN losses.
	ae.to(accelerator.device, dtype=torch.float32)
	text_enc.to(accelerator.device, dtype=weight_dtype)

	# Create EMA for the model.
	# NOTE(review): no `ema_model.step(...)` call is visible anywhere in this function, so the
	# EMA copy appears never to be updated during training - confirm against upstream.
	if args.use_ema:
	ema_model = deepcopy(model)
	ema_model = EMAModel(ema_model.parameters(), model_cls=LatteT2V, model_config=ema_model.config)

	# `accelerate` 0.16.0 will have better support for customized saving
	if version.parse(accelerate.__version__) >= version.parse("0.16.0"):
	# create custom saving & loading hooks so that `accelerator.save_state(...)` serializes in a nice format
	def save_model_hook(models, weights, output_dir):
	if accelerator.is_main_process:
	if args.use_ema:
	ema_model.save_pretrained(os.path.join(output_dir, "model_ema"))

	# Every entry of `models` is written to the same "model" subfolder.
	for i, model in enumerate(models):
	model.save_pretrained(os.path.join(output_dir, "model"))
	if weights: # Don't pop if empty
	# make sure to pop weight so that corresponding model is not saved again
	weights.pop()

	def load_model_hook(models, input_dir):
	if args.use_ema:
	load_model = EMAModel.from_pretrained(os.path.join(input_dir, "model_ema"), LatteT2V)
	ema_model.load_state_dict(load_model.state_dict())
	ema_model.to(accelerator.device)
	del load_model

	for i in range(len(models)):
	# pop models so that they are not loaded again
	model = models.pop()

	# load diffusers style into model
	load_model = LatteT2V.from_pretrained(input_dir, subfolder="model")
	model.register_to_config(**load_model.config)

	model.load_state_dict(load_model.state_dict())
	del load_model

	accelerator.register_save_state_pre_hook(save_model_hook)
	accelerator.register_load_state_pre_hook(load_model_hook)

	# Enable TF32 for faster training on Ampere GPUs,
	# cf https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices
	if args.allow_tf32:
	torch.backends.cuda.matmul.allow_tf32 = True

	# Scale the base learning rate by accumulation steps * per-device batch size * process count.
	if args.scale_lr:
	args.learning_rate = (
	args.learning_rate * args.gradient_accumulation_steps * args.train_batch_size * accelerator.num_processes
	)

	# Use 8-bit Adam for lower memory usage or to fine-tune the model in 16GB GPUs
	if args.use_8bit_adam:
	try:
	import bitsandbytes as bnb
	except ImportError:
	raise ImportError(
	"To use 8-bit Adam, please install the bitsandbytes library: `pip install bitsandbytes`."
	)

	optimizer_class = bnb.optim.AdamW8bit
	else:
	optimizer_class = torch.optim.AdamW

	# Optimizer creation
	params_to_optimize = model.parameters()
	optimizer = optimizer_class(
	params_to_optimize,
	lr=args.learning_rate,
	betas=(args.adam_beta1, args.adam_beta2),
	weight_decay=args.adam_weight_decay,
	eps=args.adam_epsilon,
	)

	# Setup data:
	train_dataset = getdataset(args)
	train_dataloader = torch.utils.data.DataLoader(
	train_dataset,
	shuffle=True,
	# collate_fn=Collate(args), # TODO: do not enable dynamic mask in this point
	batch_size=args.train_batch_size,
	num_workers=args.dataloader_num_workers,
	)

	# Scheduler and math around the number of training steps.
	overrode_max_train_steps = False
	num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
	if args.max_train_steps is None:
	args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
	overrode_max_train_steps = True

	# Warmup and total step counts are both multiplied by gradient_accumulation_steps here.
	lr_scheduler = get_scheduler(
	args.lr_scheduler,
	optimizer=optimizer,
	num_warmup_steps=args.lr_warmup_steps * args.gradient_accumulation_steps,
	num_training_steps=args.max_train_steps * args.gradient_accumulation_steps,
	)

	# Prepare everything with our `accelerator`.
	model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(
	model, optimizer, train_dataloader, lr_scheduler
	)

	# We need to recalculate our total training steps as the size of the training dataloader may have changed.
	num_update_steps_per_epoch = math.ceil(len(train_dataloader) / args.gradient_accumulation_steps)
	if overrode_max_train_steps:
	args.max_train_steps = args.num_train_epochs * num_update_steps_per_epoch
	# Afterwards we recalculate our number of training epochs
	args.num_train_epochs = math.ceil(args.max_train_steps / num_update_steps_per_epoch)

	# We need to initialize the trackers we use, and also store our configuration.
	# The trackers initializes automatically on the main process.
	if accelerator.is_main_process:
	accelerator.init_trackers(args.output_dir, config=vars(args))

	# Train!
	total_batch_size = args.train_batch_size * accelerator.num_processes * args.gradient_accumulation_steps

	logger.info("*** Running training ***")
	logger.info(f" Num examples = {len(train_dataset)}")
	logger.info(f" Num Epochs = {args.num_train_epochs}")
	logger.info(f" Instantaneous batch size per device = {args.train_batch_size}")
	logger.info(f" Total train batch size (w. parallel, distributed & accumulation) = {total_batch_size}")
	logger.info(f" Gradient Accumulation steps = {args.gradient_accumulation_steps}")
	logger.info(f" Total optimization steps = {args.max_train_steps}")
	global_step = 0
	first_epoch = 0

	# Potentially load in the weights and states from a previous save
	# Only the basename of an explicit checkpoint path is kept; it is later joined onto output_dir.
	if args.resume_from_checkpoint:
	if args.resume_from_checkpoint != "latest":
	path = os.path.basename(args.resume_from_checkpoint)
	else:
	# Get the most recent checkpoint
	# Directories are ordered by the integer after the first "-" in their name.
	dirs = os.listdir(args.output_dir)
	dirs = [d for d in dirs if d.startswith("checkpoint")]
	dirs = sorted(dirs, key=lambda x: int(x.split("-")[1]))
	path = dirs[-1] if len(dirs) > 0 else None

	if path is None:
	accelerator.print(
	f"Checkpoint '{args.resume_from_checkpoint}' does not exist. Starting a new training run."
	)
	args.resume_from_checkpoint = None
	initial_global_step = 0
	else:
	accelerator.print(f"Resuming from checkpoint {path}")
	accelerator.load_state(os.path.join(args.output_dir, path))
	# The global step is recovered from the checkpoint directory name.
	global_step = int(path.split("-")[1])

	initial_global_step = global_step
	first_epoch = global_step // num_update_steps_per_epoch

	else:
	initial_global_step = 0

	progress_bar = tqdm(
	range(0, args.max_train_steps),
	initial=initial_global_step,
	desc="Steps",
	# Only show the progress bar once on each machine.
	disable=not accelerator.is_local_main_process,
	)

	for epoch in range(first_epoch, args.num_train_epochs):
	train_loss = 0.0
	# Each batch is unpacked as (x, input_ids, cond_mask).
	for step, (x, input_ids, cond_mask) in enumerate(train_dataloader):
	with accelerator.accumulate(model):
	# Move the batch to the training device.
	x = x.to(accelerator.device) # B C T+num_images H W, 16 + 4
	# attn_mask = attn_mask.to(device) # B T H W
	# assert torch.all(attn_mask.bool()), 'do not enable dynamic input'
	attn_mask = None
	input_ids = input_ids.to(accelerator.device) # B L or B 1+num_images L
	cond_mask = cond_mask.to(accelerator.device) # B L or B 1+num_images L

	with torch.no_grad():
	# Map input images to latent space + normalize latents
	if args.use_image_num == 0:
	x = ae.encode(x) # B C T H W
	cond = text_enc(input_ids, cond_mask) # B L -> B L D
	else:
	# The last `use_image_num` entries along dim 2 are split off, encoded one frame
	# at a time, and concatenated back after the encoded video frames.
	videos, images = x[:, :, :-args.use_image_num], x[:, :, -args.use_image_num:]
	videos = ae.encode(videos) # B C T H W
	images = rearrange(images, 'b c t h w -> (b t) c 1 h w')
	images = ae.encode(images)
	images = rearrange(images, '(b t) c 1 h w -> b c t h w', t=args.use_image_num)
	x = torch.cat([videos, images], dim=2) # b c 16+4, h, w

	# use for loop to avoid OOM, because T5 is too huge...
	B, _, _ = input_ids.shape # B T+num_images L b 1+4, L
	cond = torch.stack([text_enc(input_ids[i], cond_mask[i]) for i in range(B)]) # B 1+num_images L D

	model_kwargs = dict(encoder_hidden_states=cond, attention_mask=attn_mask,
	encoder_attention_mask=cond_mask, use_image_num=args.use_image_num)
	# One uniformly random timestep is drawn per sample.
	t = torch.randint(0, diffusion.num_timesteps, (x.shape[0],), device=accelerator.device)
	loss_dict = diffusion.training_losses(model, x, t, model_kwargs)
	loss = loss_dict["loss"].mean()

	# Gather the losses across all processes for logging (if we use distributed training).
	avg_loss = accelerator.gather(loss.repeat(args.train_batch_size)).mean()
	train_loss += avg_loss.item() / args.gradient_accumulation_steps

	# Backpropagate
	accelerator.backward(loss)
	if accelerator.sync_gradients:
	params_to_clip = model.parameters()
	accelerator.clip_grad_norm_(params_to_clip, args.max_grad_norm)
	optimizer.step()
	lr_scheduler.step()
	optimizer.zero_grad()

	# Checks if the accelerator has performed an optimization step behind the scenes
	if accelerator.sync_gradients:
	progress_bar.update(1)
	global_step += 1
	accelerator.log({"train_loss": train_loss}, step=global_step)
	train_loss = 0.0

	if args.use_deepspeed or accelerator.is_main_process:
	if global_step % args.checkpointing_steps == 0:
	# _before_ saving state, check if this save would set us over the `checkpoints_total_limit`
	if args.checkpoints_total_limit is not None:
	checkpoints = os.listdir(args.output_dir)
	checkpoints = [d for d in checkpoints if d.startswith("checkpoint")]
	checkpoints = sorted(checkpoints, key=lambda x: int(x.split("-")[1]))

	# before we save the new checkpoint, we need to have at _most_ `checkpoints_total_limit - 1` checkpoints
	if len(checkpoints) >= args.checkpoints_total_limit:
	num_to_remove = len(checkpoints) - args.checkpoints_total_limit + 1
	removing_checkpoints = checkpoints[0:num_to_remove]

	logger.info(
	f"{len(checkpoints)} checkpoints already exist, removing {len(removing_checkpoints)} checkpoints"
	)
	logger.info(f"removing checkpoints: {', '.join(removing_checkpoints)}")

	for removing_checkpoint in removing_checkpoints:
	removing_checkpoint = os.path.join(args.output_dir, removing_checkpoint)
	shutil.rmtree(removing_checkpoint)

	save_path = os.path.join(args.output_dir, f"checkpoint-{global_step}")
	accelerator.save_state(save_path)
	logger.info(f"Saved state to {save_path}")

	logs = {"step_loss": loss.detach().item(), "lr": lr_scheduler.get_last_lr()[0]}
	progress_bar.set_postfix(**logs)

	if global_step >= args.max_train_steps:
	break

	# Validation sampling on the main process, on the same step cadence as checkpointing.
	if accelerator.is_main_process:
	validation_prompt = "The majestic beauty of a waterfall cascading down a cliff into a serene lake. The camera angle provides a bird's eye view of the waterfall."
	if global_step % args.checkpointing_steps == 0:
	logger.info(f"Running validation... \n"
	f"Generating {args.num_validation_videos} videos with prompt: {validation_prompt}")
	if args.use_ema:
	# Store the model parameters temporarily and load the EMA parameters to perform inference.
	# NOTE(review): no matching `ema_model.restore(...)` is visible after validation -
	# confirm the training weights are not left overwritten by the EMA copy.
	ema_model.store(model.parameters())
	ema_model.copy_to(model.parameters())

	if args.enable_tracker:
	with torch.no_grad():
	# create pipeline
	# A fresh autoencoder and a model reloaded from `save_path` are used for sampling;
	# the training-time text encoder `text_enc` is reused.
	# NOTE(review): `save_path` is assigned only where a checkpoint is written above -
	# confirm it is always defined when this line runs.
	ae_ = getae_wrapper(args.ae)(args.ae_path).to(accelerator.device).eval()
	if args.enable_tiling:
	ae_.vae.enable_tiling()
	ae_.vae.tile_overlap_factor = args.tile_overlap_factor
	# text_enc_ = get_text_enc(args).to(accelerator.device).eval()
	model_ = LatteT2V.from_pretrained(save_path, subfolder="model").to(accelerator.device).eval()
	diffusion_ = create_diffusion(str(250))
	tokenizer_ = AutoTokenizer.from_pretrained(args.text_encoder_name, cache_dir='./cache_dir')
	videos = []
	for idx in range(args.num_validation_videos):
	# NOTE(review): autocast is hard-coded to device_type='cuda'.
	with torch.autocast(device_type='cuda', dtype=weight_dtype):
	z = torch.randn(1, model_.in_channels, video_length,
	latent_size[0], latent_size[1], device=accelerator.device)
	text_tokens_and_mask = tokenizer_(
	validation_prompt,
	max_length=args.model_max_length,
	padding='max_length',
	truncation=True,
	return_attention_mask=True,
	add_special_tokens=True,
	return_tensors='pt'
	)
	input_ids = text_tokens_and_mask['input_ids'].to(accelerator.device)
	cond_mask = text_tokens_and_mask['attention_mask'].to(accelerator.device)
	# cond = text_enc_(input_ids, cond_mask) # B L D
	cond = text_enc(input_ids, cond_mask) # B L D
	model_kwargs = dict(encoder_hidden_states=cond, attention_mask=None, encoder_attention_mask=cond_mask)
	sample_fn = model_.forward
	# Sample images:
	samples = diffusion_.p_sample_loop(
	sample_fn, z.shape, z, clip_denoised=False, model_kwargs=model_kwargs, progress=True,
	device=accelerator.device
	)
	samples = ae_.decode(samples)
	# Save and display images:
	# Denormalize, scale to 0..255 with rounding (+0.5), clamp, and convert to uint8 on CPU.
	video = (ae_denorm[args.ae](samples[0]) * 255).add_(0.5).clamp_(0, 255).to(
	dtype=torch.uint8).cpu().contiguous() # t c h w
	videos.append(video)

	videos = torch.stack(videos).numpy()
	for tracker in accelerator.trackers:
	if tracker.name == "tensorboard":
	np_videos = np.stack([np.asarray(vid) for vid in videos])
	tracker.writer.add_video("validation", np_videos, global_step, fps=10)
	if tracker.name == "wandb":
	tracker.log(
	{
	"validation": [
	wandb.Video(video, caption=f"{i}: {validation_prompt}", fps=10)
	for i, video in enumerate(videos)
	]
	}
	)

	# del ae_, text_enc_, model_, diffusion_, tokenizer_
	del ae_, model_, diffusion_, tokenizer_
	torch.cuda.empty_cache()

	accelerator.wait_for_everyone()
	accelerator.end_training()


	if __name__ == "__main__":
	    # Command-line entry point: declare every option read by main() and run training.
	    # argparse expands help strings with %-formatting, so a literal percent sign in a
	    # help text must be written as "%%" (see --timestep_bias_portion below).
	    parser = argparse.ArgumentParser()

	    # Data / model selection.
	    parser.add_argument("--dataset", type=str, required=True)
	    parser.add_argument("--data_path", type=str, required=True)
	    parser.add_argument("--model", type=str, choices=list(Diffusion_models.keys()), default="DiT-XL/122")
	    parser.add_argument("--num_classes", type=int, default=1000)
	    parser.add_argument("--ae", type=str, default="stabilityai/sd-vae-ft-mse")
	    parser.add_argument("--ae_path", type=str, default="stabilityai/sd-vae-ft-mse")
	    parser.add_argument("--sample_rate", type=int, default=4)
	    parser.add_argument("--num_frames", type=int, default=16)
	    parser.add_argument("--max_image_size", type=int, default=128)
	    parser.add_argument("--dynamic_frames", action="store_true")
	    parser.add_argument("--compress_kv", action="store_true")
	    parser.add_argument("--attention_mode", type=str, choices=['xformers', 'math', 'flash'], default="math")
	    parser.add_argument("--pretrained", type=str, default=None)

	    # Autoencoder tiling.
	    parser.add_argument('--tile_overlap_factor', type=float, default=0.25)
	    parser.add_argument('--enable_tiling', action='store_true')

	    # Text conditioning.
	    parser.add_argument("--video_folder", type=str, default='')
	    parser.add_argument("--text_encoder_name", type=str, default='DeepFloyd/t5-v1_1-xxl')
	    parser.add_argument("--model_max_length", type=int, default=120)

	    # Validation, run control and checkpointing.
	    parser.add_argument("--enable_tracker", action="store_true")
	    parser.add_argument("--use_image_num", type=int, default=0)
	    parser.add_argument("--use_img_from_vid", action="store_true")
	    parser.add_argument("--use_deepspeed", action="store_true")
	    parser.add_argument("--seed", type=int, default=None, help="A seed for reproducible training.")
	    parser.add_argument(
	        "--num_validation_videos",
	        type=int,
	        default=2,
	        help="Number of images that should be generated during validation with `validation_prompt`.",
	    )
	    parser.add_argument(
	        "--output_dir",
	        type=str,
	        default=None,
	        help="The output directory where the model predictions and checkpoints will be written.",
	    )
	    parser.add_argument(
	        "--train_batch_size", type=int, default=16, help="Batch size (per device) for the training dataloader."
	    )
	    parser.add_argument("--num_train_epochs", type=int, default=100)
	    parser.add_argument(
	        "--max_train_steps",
	        type=int,
	        default=None,
	        help="Total number of training steps to perform. If provided, overrides num_train_epochs.",
	    )
	    parser.add_argument(
	        "--checkpointing_steps",
	        type=int,
	        default=500,
	        help=(
	            "Save a checkpoint of the training state every X updates. These checkpoints can be used both as final"
	            " checkpoints in case they are better than the last checkpoint, and are also suitable for resuming"
	            " training using `--resume_from_checkpoint`."
	        ),
	    )
	    parser.add_argument(
	        "--checkpoints_total_limit",
	        type=int,
	        default=None,
	        help=("Max number of checkpoints to store."),
	    )
	    parser.add_argument(
	        "--resume_from_checkpoint",
	        type=str,
	        default=None,
	        help=(
	            "Whether training should be resumed from a previous checkpoint. Use a path saved by"
	            ' `--checkpointing_steps`, or `"latest"` to automatically select the last available checkpoint.'
	        ),
	    )

	    # Optimization schedule.
	    parser.add_argument(
	        "--gradient_accumulation_steps",
	        type=int,
	        default=1,
	        help="Number of updates steps to accumulate before performing a backward/update pass.",
	    )
	    parser.add_argument(
	        "--gradient_checkpointing",
	        action="store_true",
	        help="Whether or not to use gradient checkpointing to save memory at the expense of slower backward pass.",
	    )
	    parser.add_argument(
	        "--learning_rate",
	        type=float,
	        default=1e-4,
	        help="Initial learning rate (after the potential warmup period) to use.",
	    )
	    parser.add_argument(
	        "--scale_lr",
	        action="store_true",
	        default=False,
	        help="Scale the learning rate by the number of GPUs, gradient accumulation steps, and batch size.",
	    )
	    parser.add_argument(
	        "--lr_scheduler",
	        type=str,
	        default="constant",
	        help=(
	            'The scheduler type to use. Choose between ["linear", "cosine", "cosine_with_restarts", "polynomial",'
	            ' "constant", "constant_with_warmup"]'
	        ),
	    )
	    parser.add_argument(
	        "--lr_warmup_steps", type=int, default=500, help="Number of steps for the warmup in the lr scheduler."
	    )

	    # Timestep bias options (consumed by generate_timestep_weights).
	    parser.add_argument(
	        "--timestep_bias_strategy",
	        type=str,
	        default="none",
	        choices=["earlier", "later", "range", "none"],
	        help=(
	            "The timestep bias strategy, which may help direct the model toward learning low or high frequency details."
	            " Choices: ['earlier', 'later', 'range', 'none']."
	            " The default is 'none', which means no bias is applied, and training proceeds normally."
	            " The value of 'later' will increase the frequency of the model's final training timesteps."
	        ),
	    )
	    parser.add_argument(
	        "--timestep_bias_multiplier",
	        type=float,
	        default=1.0,
	        help=(
	            "The multiplier for the bias. Defaults to 1.0, which means no bias is applied."
	            " A value of 2.0 will double the weight of the bias, and a value of 0.5 will halve it."
	        ),
	    )
	    parser.add_argument(
	        "--timestep_bias_begin",
	        type=int,
	        default=0,
	        help=(
	            "When using `--timestep_bias_strategy=range`, the beginning (inclusive) timestep to bias."
	            " Defaults to zero, which equates to having no specific bias."
	        ),
	    )
	    parser.add_argument(
	        "--timestep_bias_end",
	        type=int,
	        default=1000,
	        help=(
	            "When using `--timestep_bias_strategy=range`, the final timestep (inclusive) to bias."
	            " Defaults to 1000, which is the number of timesteps that Stable Diffusion is trained on."
	        ),
	    )
	    parser.add_argument(
	        "--timestep_bias_portion",
	        type=float,
	        default=0.25,
	        help=(
	            # "%%" renders as a literal "%"; the original bare "25% of" was parsed by
	            # argparse as a "% o" format directive and broke help rendering.
	            "The portion of timesteps to bias. Defaults to 0.25, which 25%% of timesteps will be biased."
	            " A value of 0.5 will bias one half of the timesteps. The value provided for `--timestep_bias_strategy` determines"
	            " whether the biased portions are in the earlier or later timesteps."
	        ),
	    )
	    parser.add_argument(
	        "--snr_gamma",
	        type=float,
	        default=None,
	        help="SNR weighting gamma to be used if rebalancing the loss. Recommended value is 5.0. "
	        "More details here: https://arxiv.org/abs/2303.09556.",
	    )

	    # EMA, hardware and optimizer hyper-parameters.
	    parser.add_argument("--use_ema", action="store_true", help="Whether to use EMA model.")
	    parser.add_argument(
	        "--allow_tf32",
	        action="store_true",
	        help=(
	            "Whether or not to allow TF32 on Ampere GPUs. Can be used to speed up training. For more information, see"
	            " https://pytorch.org/docs/stable/notes/cuda.html#tensorfloat-32-tf32-on-ampere-devices"
	        ),
	    )
	    parser.add_argument(
	        "--dataloader_num_workers",
	        type=int,
	        default=10,
	        help=(
	            "Number of subprocesses to use for data loading. 0 means that the data will be loaded in the main process."
	        ),
	    )
	    parser.add_argument(
	        "--use_8bit_adam", action="store_true", help="Whether or not to use 8-bit Adam from bitsandbytes."
	    )
	    parser.add_argument("--adam_beta1", type=float, default=0.9, help="The beta1 parameter for the Adam optimizer.")
	    parser.add_argument("--adam_beta2", type=float, default=0.999, help="The beta2 parameter for the Adam optimizer.")
	    parser.add_argument("--adam_weight_decay", type=float, default=1e-2, help="Weight decay to use.")
	    parser.add_argument("--adam_epsilon", type=float, default=1e-08, help="Epsilon value for the Adam optimizer")
	    parser.add_argument("--max_grad_norm", default=1.0, type=float, help="Max gradient norm.")

	    # Hub, logging and precision.
	    parser.add_argument("--push_to_hub", action="store_true", help="Whether or not to push the model to the Hub.")
	    parser.add_argument("--hub_token", type=str, default=None, help="The token to use to push to the Model Hub.")
	    parser.add_argument(
	        "--prediction_type",
	        type=str,
	        default=None,
	        help="The prediction_type that shall be used for training. Choose between 'epsilon' or 'v_prediction' or leave `None`. If left to `None` the default prediction type of the scheduler: `noise_scheduler.config.prediciton_type` is chosen.",
	    )
	    parser.add_argument(
	        "--hub_model_id",
	        type=str,
	        default=None,
	        help="The name of the repository to keep in sync with the local `output_dir`.",
	    )
	    parser.add_argument(
	        "--logging_dir",
	        type=str,
	        default="logs",
	        help=(
	            "[TensorBoard](https://www.tensorflow.org/tensorboard) log directory. Will default to"
	            " output_dir/runs/CURRENT_DATETIME_HOSTNAME**."
	        ),
	    )
	    parser.add_argument(
	        "--report_to",
	        type=str,
	        default="tensorboard",
	        help=(
	            'The integration to report the results and logs to. Supported platforms are `"tensorboard"`'
	            ' (default), `"wandb"` and `"comet_ml"`. Use `"all"` to report to all integrations.'
	        ),
	    )
	    parser.add_argument(
	        "--mixed_precision",
	        type=str,
	        default=None,
	        choices=["no", "fp16", "bf16"],
	        help=(
	            "Whether to use mixed precision. Choose between fp16 and bf16 (bfloat16). Bf16 requires PyTorch >="
	            " 1.10.and an Nvidia Ampere GPU. Default to the value of accelerate config of the current system or the"
	            " flag passed with the `accelerate.launch` command. Use this argument to override the accelerate config."
	        ),
	    )
	    parser.add_argument("--local_rank", type=int, default=-1, help="For distributed training: local_rank")
	    parser.add_argument("--noise_offset", type=float, default=0, help="The scale of noise offset.")

	    args = parser.parse_args()
	    main(args)