num_frames = 16
frame_interval = 3
fps = 24
image_size = (240, 426)
multi_resolution = "STDiT2"

# Define model
model = dict(
    type="STDiT2-XL/2",
    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
    input_sq_size=512,
    qk_norm=True,
    qk_norm_legacy=True,
    enable_flash_attn=True,
    enable_layernorm_kernel=True,
)
vae = dict(
    type="VideoAutoencoderKL",
    from_pretrained="stabilityai/sd-vae-ft-ema",
    cache_dir=None,  # "/mnt/hdd/cached_models"
    micro_batch_size=4,
)
text_encoder = dict(
    type="t5",
    from_pretrained="DeepFloyd/t5-v1_1-xxl",
    cache_dir=None,  # "/mnt/hdd/cached_models"
    model_max_length=200,
)
scheduler = dict(
    type="iddpm",
    num_sampling_steps=100,
    cfg_scale=7.0,
    cfg_channel=3,  # or None
)
dtype = "bf16"

# Condition
prompt_path = "./assets/texts/t2v_samples.txt"
prompt = None  # prompt has higher priority than prompt_path

# Others
batch_size = 1
seed = 42
save_dir = "./samples/samples/"
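
# Example usage (a sketch, not part of the original config): with a standard
# Open-Sora 1.1 checkout, a config like this is typically passed to the
# inference entry point. The script path and the config's location below are
# assumptions and may differ in your checkout.
#
#   torchrun --standalone --nproc_per_node 1 scripts/inference.py \
#       configs/opensora-v1-1/inference/sample.py
#
# Settings given on the command line (if supported by your version) override
# the values defined in this file, e.g. `prompt` takes priority over
# `prompt_path` as noted above.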