frankleeeee committed on
Commit 011cc63
1 Parent(s): f58f053

updated to opensora v1.1

Files changed (35)
  1. README copy.md +0 -13
  2. app.py +313 -61
  3. configs/dit/inference/16x256x256.py +2 -2
  4. configs/dit/inference/1x256x256-class.py +2 -2
  5. configs/dit/inference/1x256x256.py +2 -2
  6. configs/dit/train/16x256x256.py +9 -9
  7. configs/dit/train/1x256x256.py +9 -8
  8. configs/latte/inference/16x256x256-class.py +2 -2
  9. configs/latte/inference/16x256x256.py +2 -2
  10. configs/latte/train/16x256x256.py +8 -8
  11. configs/opensora-v1-1/inference/sample-ref.py +62 -0
  12. configs/opensora-v1-1/inference/sample.py +43 -0
  13. configs/opensora-v1-1/train/benchmark.py +101 -0
  14. configs/opensora-v1-1/train/image.py +65 -0
  15. configs/opensora-v1-1/train/stage1.py +77 -0
  16. configs/opensora-v1-1/train/stage2.py +79 -0
  17. configs/opensora-v1-1/train/stage3.py +79 -0
  18. configs/opensora-v1-1/train/video.py +67 -0
  19. configs/opensora/inference/16x256x256.py +11 -6
  20. configs/opensora/inference/16x512x512.py +6 -6
  21. configs/opensora/inference/64x512x512.py +7 -7
  22. configs/opensora/train/16x256x256-mask.py +60 -0
  23. configs/opensora/train/16x256x256-spee.py +60 -0
  24. configs/opensora/train/16x256x256.py +9 -9
  25. configs/opensora/train/16x512x512.py +10 -10
  26. configs/opensora/train/360x512x512.py +14 -8
  27. configs/opensora/train/64x512x512-sp.py +10 -10
  28. configs/opensora/train/64x512x512.py +9 -9
  29. configs/pixart/inference/16x256x256.py +3 -3
  30. configs/pixart/inference/1x1024MS.py +4 -4
  31. configs/pixart/inference/1x256x256.py +3 -3
  32. configs/pixart/inference/1x512x512.py +10 -4
  33. configs/pixart/train/16x256x256.py +10 -10
  34. configs/pixart/train/1x512x512.py +9 -9
  35. configs/pixart/train/64x512x512.py +10 -9
README copy.md DELETED
@@ -1,13 +0,0 @@
- ---
- title: Open Sora
- emoji: 📚
- colorFrom: yellow
- colorTo: indigo
- sdk: gradio
- sdk_version: 4.21.0
- app_file: app.py
- pinned: false
- license: apache-2.0
- ---
-
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -11,24 +11,146 @@ import importlib
11
  import os
12
  import subprocess
13
  import sys
 
 
 
 
14
  import spaces
15
- import gradio as gr
16
  import torch
17
 
 
18
 
19
 
20
- MODEL_TYPES = ["v1-16x256x256", "v1-HQ-16x256x256", "v1-HQ-16x512x512"]
21
  CONFIG_MAP = {
22
- "v1-16x256x256": "configs/opensora/inference/16x256x256.py",
23
- "v1-HQ-16x256x256": "configs/opensora/inference/16x256x256.py",
24
- "v1-HQ-16x512x512": "configs/opensora/inference/16x512x512.py",
25
  }
26
  HF_STDIT_MAP = {
27
- "v1-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-16x256x256",
28
- "v1-HQ-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",
29
- "v1-HQ-16x512x512": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x512x512",
30
  }
32
  def install_dependencies(enable_optimization=False):
33
  """
34
  Install the required dependencies for the demo if they are not already installed.
@@ -70,14 +192,20 @@ def install_dependencies(enable_optimization=False):
70
  shell=True,
71
  )
72
 
 
 
 
 
73
  def read_config(config_path):
74
  """
75
  Read the configuration file.
76
  """
77
  from mmengine.config import Config
 
78
  return Config.fromfile(config_path)
79
 
80
- def build_models(model_type, config):
 
81
  """
82
  Build the models for the given model type and configuration.
83
  """
@@ -87,7 +215,7 @@ def build_models(model_type, config):
87
  vae = build_module(config.vae, MODELS).cuda()
88
 
89
  # build text encoder
90
- text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32
91
  text_encoder.t5.model = text_encoder.t5.model.cuda()
92
 
93
  # build stdit
@@ -96,9 +224,8 @@ def build_models(model_type, config):
96
  from transformers import AutoModel
97
 
98
  stdit = AutoModel.from_pretrained(
99
- HF_STDIT_MAP[model_type],
100
- enable_flash_attn=False,
101
- enable_layernorm_kernel=False,
102
  trust_remote_code=True,
103
  ).cuda()
104
 
@@ -111,23 +238,20 @@ def build_models(model_type, config):
111
  text_encoder.y_embedder = stdit.y_embedder
112
 
113
  # move modelst to device
114
- vae = vae.to(torch.float16).eval()
115
  text_encoder.t5.model = text_encoder.t5.model.eval() # t5 must be in fp32
116
- stdit = stdit.to(torch.float16).eval()
117
- return vae, text_encoder, stdit, scheduler
118
-
119
 
120
- def get_latent_size(config, vae):
121
- input_size = (config.num_frames, *config.image_size)
122
- latent_size = vae.get_latent_size(input_size)
123
- return latent_size
124
 
125
 
126
  def parse_args():
127
  parser = argparse.ArgumentParser()
128
  parser.add_argument(
129
  "--model-type",
130
- default="v1-HQ-16x256x256",
131
  choices=MODEL_TYPES,
132
  help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
133
  )
@@ -135,7 +259,11 @@ def parse_args():
135
  parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
136
  parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio App on.")
137
  parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
138
- parser.add_argument("--enable-optimization", action="store_true", help="Whether to enable optimization such as flash attention and fused layernorm")
 
 
 
 
139
  return parser.parse_args()
140
 
141
 
@@ -160,26 +288,130 @@ torch.jit._state.disable()
160
  # set up
161
  install_dependencies(enable_optimization=args.enable_optimization)
162
 
 
 
 
 
 
 
 
 
163
  # build model
164
- vae, text_encoder, stdit, scheduler = build_models(args.model_type, config)
 
165
 
166
  @spaces.GPU(duration=200)
167
- def run_inference(prompt_text):
168
- from opensora.datasets import save_sample
169
-
170
- latent_size = get_latent_size(config, vae)
171
- samples = scheduler.sample(
172
- stdit,
173
- text_encoder,
174
- z_size=(vae.out_channels, *latent_size),
175
- prompts=[prompt_text],
176
- device="cuda",
177
- )
178
-
179
- samples = vae.decode(samples.to(torch.float16))
180
- filename = f"{args.output}/sample"
181
- saved_path = save_sample(samples[0], fps=config.fps, save_path=filename)
182
- return saved_path
183
 
184
  def main():
185
  # create demo
@@ -208,32 +440,52 @@ def main():
208
 
209
  with gr.Row():
210
  with gr.Column():
211
- prompt_text = gr.Textbox(show_label=False, placeholder="Describe your video here", lines=4)
212
- submit_button = gr.Button("Generate video")
213
 
 
 
 
 
214
  with gr.Column():
215
- output_video = gr.Video()
216
-
217
- submit_button.click(fn=run_inference, inputs=[prompt_text], outputs=output_video)
218
-
219
- gr.Examples(
220
- examples=[
221
- [
222
- "The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.",
223
- ],
224
- ],
225
- fn=run_inference,
226
- inputs=[
227
- prompt_text,
228
- ],
229
- outputs=[output_video],
230
- cache_examples=True,
231
- )
232
 
233
  # launch
234
  demo.launch(server_port=args.port, server_name=args.host, share=args.share)
235
 
236
 
237
- if __name__ == '__main__':
238
  main()
239
-
 
11
  import os
12
  import subprocess
13
  import sys
14
+ import re
15
+ import json
16
+ import math
17
+
18
  import spaces
 
19
  import torch
20
 
21
+ import gradio as gr
22
 
23
 
24
+ MODEL_TYPES = ["v1.1"]
25
  CONFIG_MAP = {
26
+ "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
27
+ "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
 
28
  }
29
  HF_STDIT_MAP = {
30
+ "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2",
31
+ "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
 
32
  }
33
+ RESOLUTION_MAP = {
34
+ "360p": (360, 480),
35
+ "480p": (480, 858),
36
+ "720p": (720, 1280),
37
+ "1080p": (1080, 1920)
38
+ }
39
+
40
 
41
+ # ============================
42
+ # Utils
43
+ # ============================
44
+ def collect_references_batch(reference_paths, vae, image_size):
45
+ from opensora.datasets.utils import read_from_path
46
+
47
+ refs_x = []
48
+ for reference_path in reference_paths:
49
+ if reference_path is None:
50
+ refs_x.append([])
51
+ continue
52
+ ref_path = reference_path.split(";")
53
+ ref = []
54
+ for r_path in ref_path:
55
+ r = read_from_path(r_path, image_size, transform_name="resize_crop")
56
+ r_x = vae.encode(r.unsqueeze(0).to(vae.device, vae.dtype))
57
+ r_x = r_x.squeeze(0)
58
+ ref.append(r_x)
59
+ refs_x.append(ref)
60
+ # refs_x: [batch, ref_num, C, T, H, W]
61
+ return refs_x
62
+
63
+
64
+ def process_mask_strategy(mask_strategy):
65
+ mask_batch = []
66
+ mask_strategy = mask_strategy.split(";")
67
+ for mask in mask_strategy:
68
+ mask_group = mask.split(",")
69
+ assert len(mask_group) >= 1 and len(mask_group) <= 6, f"Invalid mask strategy: {mask}"
70
+ if len(mask_group) == 1:
71
+ mask_group.extend(["0", "0", "0", "1", "0"])
72
+ elif len(mask_group) == 2:
73
+ mask_group.extend(["0", "0", "1", "0"])
74
+ elif len(mask_group) == 3:
75
+ mask_group.extend(["0", "1", "0"])
76
+ elif len(mask_group) == 4:
77
+ mask_group.extend(["1", "0"])
78
+ elif len(mask_group) == 5:
79
+ mask_group.append("0")
80
+ mask_batch.append(mask_group)
81
+ return mask_batch
82
+
83
+
84
+ def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
85
+ masks = []
86
+ for i, mask_strategy in enumerate(mask_strategys):
87
+ mask = torch.ones(z.shape[2], dtype=torch.float, device=z.device)
88
+ if mask_strategy is None:
89
+ masks.append(mask)
90
+ continue
91
+ mask_strategy = process_mask_strategy(mask_strategy)
92
+ for mst in mask_strategy:
93
+ loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = mst
94
+ loop_id = int(loop_id)
95
+ if loop_id != loop_i:
96
+ continue
97
+ m_id = int(m_id)
98
+ m_ref_start = int(m_ref_start)
99
+ m_length = int(m_length)
100
+ m_target_start = int(m_target_start)
101
+ edit_ratio = float(edit_ratio)
102
+ ref = refs_x[i][m_id] # [C, T, H, W]
103
+ if m_ref_start < 0:
104
+ m_ref_start = ref.shape[1] + m_ref_start
105
+ if m_target_start < 0:
106
+ # z: [B, C, T, H, W]
107
+ m_target_start = z.shape[2] + m_target_start
108
+ z[i, :, m_target_start : m_target_start + m_length] = ref[:, m_ref_start : m_ref_start + m_length]
109
+ mask[m_target_start : m_target_start + m_length] = edit_ratio
110
+ masks.append(mask)
111
+ masks = torch.stack(masks)
112
+ return masks
113
+
114
+
115
+ def process_prompts(prompts, num_loop):
116
+ from opensora.models.text_encoder.t5 import text_preprocessing
117
+
118
+ ret_prompts = []
119
+ for prompt in prompts:
120
+ if prompt.startswith("|0|"):
121
+ prompt_list = prompt.split("|")[1:]
122
+ text_list = []
123
+ for i in range(0, len(prompt_list), 2):
124
+ start_loop = int(prompt_list[i])
125
+ text = prompt_list[i + 1]
126
+ text = text_preprocessing(text)
127
+ end_loop = int(prompt_list[i + 2]) if i + 2 < len(prompt_list) else num_loop
128
+ text_list.extend([text] * (end_loop - start_loop))
129
+ assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
130
+ ret_prompts.append(text_list)
131
+ else:
132
+ prompt = text_preprocessing(prompt)
133
+ ret_prompts.append([prompt] * num_loop)
134
+ return ret_prompts
135
+
136
+
137
+ def extract_json_from_prompts(prompts):
138
+ additional_infos = []
139
+ ret_prompts = []
140
+ for prompt in prompts:
141
+ parts = re.split(r"(?=[{\[])", prompt)
142
+ assert len(parts) <= 2, f"Invalid prompt: {prompt}"
143
+ ret_prompts.append(parts[0])
144
+ if len(parts) == 1:
145
+ additional_infos.append({})
146
+ else:
147
+ additional_infos.append(json.loads(parts[1]))
148
+ return ret_prompts, additional_infos
149
+
150
+
151
+ # ============================
152
+ # Runtime Environment
153
+ # ============================
154
  def install_dependencies(enable_optimization=False):
155
  """
156
  Install the required dependencies for the demo if they are not already installed.
 
192
  shell=True,
193
  )
194
 
195
+
196
+ # ============================
197
+ # Model-related
198
+ # ============================
199
  def read_config(config_path):
200
  """
201
  Read the configuration file.
202
  """
203
  from mmengine.config import Config
204
+
205
  return Config.fromfile(config_path)
206
 
207
+
208
+ def build_models(model_type, config, enable_optimization=False):
209
  """
210
  Build the models for the given model type and configuration.
211
  """
 
215
  vae = build_module(config.vae, MODELS).cuda()
216
 
217
  # build text encoder
218
+ text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32
219
  text_encoder.t5.model = text_encoder.t5.model.cuda()
220
 
221
  # build stdit
 
224
  from transformers import AutoModel
225
 
226
  stdit = AutoModel.from_pretrained(
227
+ HF_STDIT_MAP[model_type],
228
+ enable_flash_attn=enable_optimization,
 
229
  trust_remote_code=True,
230
  ).cuda()
231
 
 
238
  text_encoder.y_embedder = stdit.y_embedder
239
 
240
  # move modelst to device
241
+ vae = vae.to(torch.bfloat16).eval()
242
  text_encoder.t5.model = text_encoder.t5.model.eval() # t5 must be in fp32
243
+ stdit = stdit.to(torch.bfloat16).eval()
 
 
244
 
245
+ # clear cuda
246
+ torch.cuda.empty_cache()
247
+ return vae, text_encoder, stdit, scheduler
 
248
 
249
 
250
  def parse_args():
251
  parser = argparse.ArgumentParser()
252
  parser.add_argument(
253
  "--model-type",
254
+ default="v1.1-stage3",
255
  choices=MODEL_TYPES,
256
  help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
257
  )
 
259
  parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
260
  parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio App on.")
261
  parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
262
+ parser.add_argument(
263
+ "--enable-optimization",
264
+ action="store_true",
265
+ help="Whether to enable optimization such as flash attention and fused layernorm",
266
+ )
267
  return parser.parse_args()
268
 
269
 
 
288
  # set up
289
  install_dependencies(enable_optimization=args.enable_optimization)
290
 
291
+ # import after installation
292
+ from opensora.datasets import IMG_FPS, save_sample
293
+ from opensora.utils.misc import to_torch_dtype
294
+
295
+ # some global variables
296
+ dtype = to_torch_dtype(config.dtype)
297
+ device = torch.device("cuda")
298
+
299
  # build model
300
+ vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
301
+
302
 
303
  @spaces.GPU(duration=200)
304
+ def run_inference(mode, prompt_text, resolution, length, reference_image):
305
+ with torch.inference_mode():
306
+ # ======================
307
+ # 1. Preparation
308
+ # ======================
309
+ # parse the inputs
310
+ resolution = RESOLUTION_MAP[resolution]
311
+
312
+ # compute number of loops
313
+ num_seconds = int(length.rstrip('s'))
314
+ total_number_of_frames = num_seconds * config.fps / config.frame_interval
315
+ num_loop = math.ceil(total_number_of_frames / config.num_frames)
316
+
317
+ # prepare model args
318
+ model_args = dict()
319
+ height = torch.tensor([resolution[0]], device=device, dtype=dtype)
320
+ width = torch.tensor([resolution[1]], device=device, dtype=dtype)
321
+ num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
322
+ ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
323
+ if config.num_frames == 1:
324
+ config.fps = IMG_FPS
325
+ fps = torch.tensor([config.fps], device=device, dtype=dtype)
326
+ model_args["height"] = height
327
+ model_args["width"] = width
328
+ model_args["num_frames"] = num_frames
329
+ model_args["ar"] = ar
330
+ model_args["fps"] = fps
331
+
332
+ # compute latent size
333
+ input_size = (config.num_frames, *resolution)
334
+ latent_size = vae.get_latent_size(input_size)
335
+
336
+ # process prompt
337
+ prompt_raw = [prompt_text]
338
+ prompt_raw, _ = extract_json_from_prompts(prompt_raw)
339
+ prompt_loops = process_prompts(prompt_raw, num_loop)
340
+ video_clips = []
341
+
342
+ # prepare mask strategy
343
+ if mode == "Text2Video":
344
+ mask_strategy = [None]
345
+ elif mode == "Image2Video":
346
+ mask_strategy = ['0']
347
+ else:
348
+ raise ValueError(f"Invalid mode: {mode}")
349
+
350
+ # =========================
351
+ # 2. Load reference images
352
+ # =========================
353
+ if mode == "Text2Video":
354
+ refs_x = collect_references_batch([None], vae, resolution)
355
+ elif mode == "Image2Video":
356
+ # save image to disk
357
+ from PIL import Image
358
+ im = Image.fromarray(reference_image)
359
+ im.save("test.jpg")
360
+ refs_x = collect_references_batch(["test.jpg"], vae, resolution)
361
+ else:
362
+ raise ValueError(f"Invalid mode: {mode}")
363
+
364
+ # 4.3. long video generation
365
+ for loop_i in range(num_loop):
366
+ # 4.4 sample in hidden space
367
+ batch_prompts = [prompt[loop_i] for prompt in prompt_loops]
368
+ z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
369
+
370
+ # 4.5. apply mask strategy
371
+ masks = None
372
+
373
+ # if cfg.reference_path is not None:
374
+ if loop_i > 0:
375
+ ref_x = vae.encode(video_clips[-1])
376
+ for j, refs in enumerate(refs_x):
377
+ if refs is None:
378
+ refs_x[j] = [ref_x[j]]
379
+ else:
380
+ refs.append(ref_x[j])
381
+ if mask_strategy[j] is None:
382
+ mask_strategy[j] = ""
383
+ else:
384
+ mask_strategy[j] += ";"
385
+ mask_strategy[
386
+ j
387
+ ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
388
+
389
+ masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
390
+
391
+ # 4.6. diffusion sampling
392
+ samples = scheduler.sample(
393
+ stdit,
394
+ text_encoder,
395
+ z=z,
396
+ prompts=batch_prompts,
397
+ device=device,
398
+ additional_args=model_args,
399
+ mask=masks, # scheduler must support mask
400
+ )
401
+ samples = vae.decode(samples.to(dtype))
402
+ video_clips.append(samples)
403
+
404
+ # 4.7. save video
405
+ if loop_i == num_loop - 1:
406
+ video_clips_list = [
407
+ video_clips[0][0]] + [video_clips[i][0][:, config.condition_frame_length :]
408
+ for i in range(1, num_loop)
409
+ ]
410
+ video = torch.cat(video_clips_list, dim=1)
411
+ save_path = f"{args.output}/sample"
412
+ saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
413
+ return saved_path
414
+
415
 
416
  def main():
417
  # create demo
 
440
 
441
  with gr.Row():
442
  with gr.Column():
443
+ mode = gr.Radio(
444
+ choices=["Text2Video", "Image2Video"],
445
+ value="Text2Video",
446
+ label="Usage",
447
+ info="Choose your usage scenario",
448
+ )
449
+ prompt_text = gr.Textbox(
450
+ label="Prompt",
451
+ placeholder="Describe your video here",
452
+ lines=4,
453
+ )
454
+ resolution = gr.Radio(
455
+ choices=["360p", "480p", "720p", "1080p"],
456
+ value="360p",
457
+ label="Resolution",
458
+ )
459
+ length = gr.Radio(
460
+ choices=["2s", "4s", "8s"],
461
+ value="2s",
462
+ label="Video Length",
463
+ info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
464
+ )
465
 
466
+ reference_image = gr.Image(
467
+ label="Reference Image (only used for Image2Video)",
468
+ )
469
+
470
  with gr.Column():
471
+ output_video = gr.Video(
472
+ label="Output Video",
473
+ height="100%"
474
+ )
475
+
476
+ with gr.Row():
477
+ submit_button = gr.Button("Generate video")
478
+
479
+
480
+ submit_button.click(
481
+ fn=run_inference,
482
+ inputs=[mode, prompt_text, resolution, length, reference_image],
483
+ outputs=output_video
484
+ )
 
 
 
485
 
486
  # launch
487
  demo.launch(server_port=args.port, server_name=args.host, share=args.share)
488
 
489
 
490
+ if __name__ == "__main__":
491
  main()
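Note: the reworked `run_inference` above derives the number of generation loops from the requested clip length and stitches the loops together, dropping the first `condition_frame_length` frames of every loop after the first. A minimal sketch of that arithmetic, assuming the defaults from `configs/opensora-v1-1/inference/sample-ref.py` (`fps=24`, `frame_interval=3`, `num_frames=16`, `condition_frame_length=4`):

```python
import math

# Assumed defaults taken from sample-ref.py; adjust to match the loaded config.
fps, frame_interval, num_frames = 24, 3, 16
condition_frame_length = 4  # frames re-used to condition the next loop

def loops_for(length: str) -> int:
    num_seconds = int(length.rstrip("s"))
    total_frames = num_seconds * fps / frame_interval  # frames after temporal subsampling
    return math.ceil(total_frames / num_frames)

for length in ("2s", "4s", "8s"):
    n = loops_for(length)
    # loop 0 is kept whole; each later loop drops its first condition_frame_length frames
    stitched = num_frames + (n - 1) * (num_frames - condition_frame_length)
    print(length, "->", n, "loop(s),", stitched, "frames")
```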
 
configs/dit/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256-class.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/imagenet_id.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/imagenet_id.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256.py CHANGED
@@ -23,10 +23,10 @@ scheduler = dict(
23
  num_sampling_steps=20,
24
  cfg_scale=4.0,
25
  )
26
- dtype = "fp16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/imagenet_labels.txt"
32
- save_dir = "./outputs/samples/"
 
23
  num_sampling_steps=20,
24
  cfg_scale=4.0,
25
  )
26
+ dtype = "bf16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/imagenet_labels.txt"
32
+ save_dir = "./samples/samples/"
configs/dit/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
configs/dit/train/1x256x256.py CHANGED
@@ -1,14 +1,15 @@
1
- num_frames = 1
2
- frame_interval = 1
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = True
9
- num_workers = 4
 
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = False
14
  plugin = "zero2"
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=1,
6
+ frame_interval=1,
7
+ image_size=(256, 256),
8
+ transform_name="center",
9
+ )
10
 
11
  # Define acceleration
12
+ num_workers = 4
13
  dtype = "bf16"
14
  grad_checkpoint = False
15
  plugin = "zero2"
configs/latte/inference/16x256x256-class.py CHANGED
@@ -21,10 +21,10 @@ scheduler = dict(
21
  num_sampling_steps=20,
22
  cfg_scale=4.0,
23
  )
24
- dtype = "fp16"
25
 
26
  # Others
27
  batch_size = 2
28
  seed = 42
29
  prompt_path = "./assets/texts/ucf101_id.txt"
30
- save_dir = "./outputs/samples/"
 
21
  num_sampling_steps=20,
22
  cfg_scale=4.0,
23
  )
24
+ dtype = "bf16"
25
 
26
  # Others
27
  batch_size = 2
28
  seed = 42
29
  prompt_path = "./assets/texts/ucf101_id.txt"
30
+ save_dir = "./samples/samples/"
configs/latte/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/latte/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
configs/opensora-v1-1/inference/sample-ref.py ADDED
@@ -0,0 +1,62 @@
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Condition
8
+ prompt_path = None
9
+ prompt = [
10
+ "A car driving on the ocean.",
11
+ 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
12
+ "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
13
+ ]
14
+
15
+ loop = 2
16
+ condition_frame_length = 4
17
+ reference_path = [
18
+ "https://cdn.openai.com/tmp/s/interp/d0.mp4",
19
+ None,
20
+ "assets/images/condition/wave.png",
21
+ ]
22
+ # valid when reference_path is not None
23
+ # (loop id, ref id, ref start, length, target start)
24
+ mask_strategy = [
25
+ "0,0,0,0,8,0.3",
26
+ None,
27
+ "0",
28
+ ]
29
+
30
+ # Define model
31
+ model = dict(
32
+ type="STDiT2-XL/2",
33
+ from_pretrained=None,
34
+ input_sq_size=512,
35
+ qk_norm=True,
36
+ enable_flashattn=True,
37
+ enable_layernorm_kernel=True,
38
+ )
39
+ vae = dict(
40
+ type="VideoAutoencoderKL",
41
+ from_pretrained="stabilityai/sd-vae-ft-ema",
42
+ cache_dir=None, # "/mnt/hdd/cached_models",
43
+ micro_batch_size=4,
44
+ )
45
+ text_encoder = dict(
46
+ type="t5",
47
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
48
+ cache_dir=None, # "/mnt/hdd/cached_models",
49
+ model_max_length=200,
50
+ )
51
+ scheduler = dict(
52
+ type="iddpm",
53
+ num_sampling_steps=100,
54
+ cfg_scale=7.0,
55
+ cfg_channel=3, # or None
56
+ )
57
+ dtype = "bf16"
58
+
59
+ # Others
60
+ batch_size = 1
61
+ seed = 42
62
+ save_dir = "./samples/samples/"
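Note: the `mask_strategy` strings above are compact; `process_mask_strategy` in the updated `app.py` pads each `;`-separated group out to six fields, which `apply_mask_strategy` then reads as (loop id, ref id, ref start, target start, length, edit ratio). A minimal sketch of that expansion — the helper name `expand_mask_strategy` is illustrative, not part of the repo:

```python
def expand_mask_strategy(mask_strategy: str):
    # Default values mirror the padding in process_mask_strategy.
    defaults = ["0", "0", "0", "0", "1", "0"]
    groups = []
    for group in mask_strategy.split(";"):
        fields = group.split(",")
        assert 1 <= len(fields) <= 6, f"Invalid mask strategy: {group}"
        groups.append(fields + defaults[len(fields):])
    return groups

print(expand_mask_strategy("0"))              # [['0', '0', '0', '0', '1', '0']]
print(expand_mask_strategy("0,0,0,0,8,0.3"))  # [['0', '0', '0', '0', '8', '0.3']]
```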
configs/opensora-v1-1/inference/sample.py ADDED
@@ -0,0 +1,43 @@
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Define model
8
+ model = dict(
9
+ type="STDiT2-XL/2",
10
+ from_pretrained=None,
11
+ input_sq_size=512,
12
+ qk_norm=True,
13
+ enable_flashattn=True,
14
+ enable_layernorm_kernel=True,
15
+ )
16
+ vae = dict(
17
+ type="VideoAutoencoderKL",
18
+ from_pretrained="stabilityai/sd-vae-ft-ema",
19
+ cache_dir=None, # "/mnt/hdd/cached_models",
20
+ micro_batch_size=4,
21
+ )
22
+ text_encoder = dict(
23
+ type="t5",
24
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
25
+ cache_dir=None, # "/mnt/hdd/cached_models",
26
+ model_max_length=200,
27
+ )
28
+ scheduler = dict(
29
+ type="iddpm",
30
+ num_sampling_steps=100,
31
+ cfg_scale=7.0,
32
+ cfg_channel=3, # or None
33
+ )
34
+ dtype = "bf16"
35
+
36
+ # Condition
37
+ prompt_path = "./assets/texts/t2v_samples.txt"
38
+ prompt = None # prompt has higher priority than prompt_path
39
+
40
+ # Others
41
+ batch_size = 1
42
+ seed = 42
43
+ save_dir = "./samples/samples/"
configs/opensora-v1-1/train/benchmark.py ADDED
@@ -0,0 +1,101 @@
1
+ # this file is only for batch size search and is not used for training
2
+
3
+ # Define dataset
4
+ dataset = dict(
5
+ type="VariableVideoTextDataset",
6
+ data_path=None,
7
+ num_frames=None,
8
+ frame_interval=3,
9
+ image_size=(None, None),
10
+ transform_name="resize_crop",
11
+ )
12
+
13
+ # bucket config format:
14
+ # 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
15
+ # 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
16
+ # 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
17
+ # 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
18
+ # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
19
+
20
+ bucket_config = {
21
+ # == manual search ==
22
+ # "240p": {128: (1.0, 2)}, # 4.28s/it
23
+ # "240p": {64: (1.0, 4)},
24
+ # "240p": {32: (1.0, 8)}, # 4.6s/it
25
+ # "240p": {16: (1.0, 16)}, # 4.6s/it
26
+ # "480p": {16: (1.0, 4)}, # 4.6s/it
27
+ # "720p": {16: (1.0, 2)}, # 5.89s/it
28
+ # "256": {1: (1.0, 256)}, # 4.5s/it
29
+ # "512": {1: (1.0, 96)}, # 4.7s/it
30
+ # "512": {1: (1.0, 128)}, # 6.3s/it
31
+ # "480p": {1: (1.0, 50)}, # 4.0s/it
32
+ # "1024": {1: (1.0, 32)}, # 6.8s/it
33
+ # "1024": {1: (1.0, 20)}, # 4.3s/it
34
+ # "1080p": {1: (1.0, 16)}, # 8.6s/it
35
+ # "1080p": {1: (1.0, 8)}, # 4.4s/it
36
+ # == stage 2 ==
37
+ # "240p": {
38
+ # 16: (1.0, (2, 32)),
39
+ # 32: (1.0, (2, 16)),
40
+ # 64: (1.0, (2, 8)),
41
+ # 128: (1.0, (2, 6)),
42
+ # },
43
+ # "256": {1: (1.0, (128, 300))},
44
+ # "512": {1: (0.5, (64, 128))},
45
+ # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
46
+ # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
47
+ # "1024": {1: (0.3, (8, 64))},
48
+ # "1080p": {1: (0.3, (2, 32))},
49
+ # == stage 3 ==
50
+ "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
51
+ }
52
+
53
+
54
+ # Define acceleration
55
+ num_workers = 4
56
+ num_bucket_build_workers = 16
57
+ dtype = "bf16"
58
+ grad_checkpoint = True
59
+ plugin = "zero2"
60
+ sp_size = 1
61
+
62
+ # Define model
63
+ model = dict(
64
+ type="STDiT2-XL/2",
65
+ from_pretrained=None,
66
+ input_sq_size=512, # pretrained model is trained on 512x512
67
+ qk_norm=True,
68
+ enable_flashattn=True,
69
+ enable_layernorm_kernel=True,
70
+ )
71
+ vae = dict(
72
+ type="VideoAutoencoderKL",
73
+ from_pretrained="stabilityai/sd-vae-ft-ema",
74
+ micro_batch_size=4,
75
+ local_files_only=True,
76
+ )
77
+ text_encoder = dict(
78
+ type="t5",
79
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
80
+ model_max_length=200,
81
+ shardformer=True,
82
+ local_files_only=True,
83
+ )
84
+ scheduler = dict(
85
+ type="iddpm",
86
+ timestep_respacing="",
87
+ )
88
+
89
+ # Others
90
+ seed = 42
91
+ outputs = "outputs"
92
+ wandb = False
93
+
94
+ epochs = 1000
95
+ log_every = 10
96
+ ckpt_every = 1000
97
+ load = None
98
+
99
+ batch_size = None
100
+ lr = 2e-5
101
+ grad_clip = 1.0
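Note: the bucket-config comments above describe five accepted forms for the batch-size spec. The sketch below shows one way those forms could be interpreted as candidate batch sizes during the search; `candidate_batch_sizes` and `batch_size_start` are illustrative stand-ins, not functions from the repo:

```python
def candidate_batch_sizes(spec, batch_size_start=1):
    prob, bs = spec
    if bs is None:            # form 5: (0.0, None), bucket not used
        return []
    if isinstance(bs, int):   # form 1: fixed batch size, nothing to search
        return [bs]
    if len(bs) == 1:          # form 2: search [batch_size_start, max_batch_size)
        return list(range(batch_size_start, bs[0]))
    return list(range(*bs))   # forms 3/4: [min, max) with an optional grid-search step

print(candidate_batch_sizes((1.0, (2, 32))))     # [2, 3, ..., 31]
print(candidate_batch_sizes((0.5, (2, 16, 4))))  # [2, 6, 10, 14]
print(candidate_batch_sizes((0.0, None)))        # []
```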
configs/opensora-v1-1/train/image.py ADDED
@@ -0,0 +1,65 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 6s/it
11
+ "256": {1: (1.0, 256)},
12
+ "512": {1: (1.0, 80)},
13
+ "480p": {1: (1.0, 52)},
14
+ "1024": {1: (1.0, 20)},
15
+ "1080p": {1: (1.0, 8)},
16
+ }
17
+
18
+ # Define acceleration
19
+ num_workers = 4
20
+ num_bucket_build_workers = 16
21
+ dtype = "bf16"
22
+ grad_checkpoint = True
23
+ plugin = "zero2"
24
+ sp_size = 1
25
+
26
+ # Define model
27
+ model = dict(
28
+ type="STDiT2-XL/2",
29
+ from_pretrained=None,
30
+ input_sq_size=512, # pretrained model is trained on 512x512
31
+ qk_norm=True,
32
+ enable_flashattn=True,
33
+ enable_layernorm_kernel=True,
34
+ )
35
+ vae = dict(
36
+ type="VideoAutoencoderKL",
37
+ from_pretrained="stabilityai/sd-vae-ft-ema",
38
+ micro_batch_size=4,
39
+ local_files_only=True,
40
+ )
41
+ text_encoder = dict(
42
+ type="t5",
43
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
44
+ model_max_length=200,
45
+ shardformer=True,
46
+ local_files_only=True,
47
+ )
48
+ scheduler = dict(
49
+ type="iddpm",
50
+ timestep_respacing="",
51
+ )
52
+
53
+ # Others
54
+ seed = 42
55
+ outputs = "outputs"
56
+ wandb = False
57
+
58
+ epochs = 1000
59
+ log_every = 10
60
+ ckpt_every = 500
61
+ load = None
62
+
63
+ batch_size = 10 # only for logging
64
+ lr = 2e-5
65
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage1.py ADDED
@@ -0,0 +1,77 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ # IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
11
+ bucket_config = { # 1s/it
12
+ "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
13
+ "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
14
+ "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
15
+ "512": {1: (0.4, 12)},
16
+ "1024": {1: (0.3, 3)},
17
+ }
18
+ mask_ratios = {
19
+ "mask_no": 0.75,
20
+ "mask_quarter_random": 0.025,
21
+ "mask_quarter_head": 0.025,
22
+ "mask_quarter_tail": 0.025,
23
+ "mask_quarter_head_tail": 0.05,
24
+ "mask_image_random": 0.025,
25
+ "mask_image_head": 0.025,
26
+ "mask_image_tail": 0.025,
27
+ "mask_image_head_tail": 0.05,
28
+ }
29
+
30
+ # Define acceleration
31
+ num_workers = 8
32
+ num_bucket_build_workers = 16
33
+ dtype = "bf16"
34
+ grad_checkpoint = False
35
+ plugin = "zero2"
36
+ sp_size = 1
37
+
38
+ # Define model
39
+ model = dict(
40
+ type="STDiT2-XL/2",
41
+ from_pretrained=None,
42
+ input_sq_size=512, # pretrained model is trained on 512x512
43
+ qk_norm=True,
44
+ enable_flashattn=True,
45
+ enable_layernorm_kernel=True,
46
+ )
47
+ vae = dict(
48
+ type="VideoAutoencoderKL",
49
+ from_pretrained="stabilityai/sd-vae-ft-ema",
50
+ micro_batch_size=4,
51
+ local_files_only=True,
52
+ )
53
+ text_encoder = dict(
54
+ type="t5",
55
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
56
+ model_max_length=200,
57
+ shardformer=True,
58
+ local_files_only=True,
59
+ )
60
+ scheduler = dict(
61
+ type="iddpm",
62
+ timestep_respacing="",
63
+ )
64
+
65
+ # Others
66
+ seed = 42
67
+ outputs = "outputs"
68
+ wandb = False
69
+
70
+ epochs = 1000
71
+ log_every = 10
72
+ ckpt_every = 500
73
+ load = None
74
+
75
+ batch_size = None
76
+ lr = 2e-5
77
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage2.py ADDED
@@ -0,0 +1,79 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 7s/it
11
+ "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
12
+ "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
13
+ "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
14
+ "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
15
+ "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
16
+ "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
17
+ "1024": {1: (0.3, 20)},
18
+ "1080p": {1: (0.4, 8)},
19
+ }
20
+ mask_ratios = {
21
+ "mask_no": 0.75,
22
+ "mask_quarter_random": 0.025,
23
+ "mask_quarter_head": 0.025,
24
+ "mask_quarter_tail": 0.025,
25
+ "mask_quarter_head_tail": 0.05,
26
+ "mask_image_random": 0.025,
27
+ "mask_image_head": 0.025,
28
+ "mask_image_tail": 0.025,
29
+ "mask_image_head_tail": 0.05,
30
+ }
31
+
32
+ # Define acceleration
33
+ num_workers = 8
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ grad_checkpoint = True
37
+ plugin = "zero2"
38
+ sp_size = 1
39
+
40
+ # Define model
41
+ model = dict(
42
+ type="STDiT2-XL/2",
43
+ from_pretrained=None,
44
+ input_sq_size=512, # pretrained model is trained on 512x512
45
+ qk_norm=True,
46
+ enable_flashattn=True,
47
+ enable_layernorm_kernel=True,
48
+ )
49
+ vae = dict(
50
+ type="VideoAutoencoderKL",
51
+ from_pretrained="stabilityai/sd-vae-ft-ema",
52
+ micro_batch_size=4,
53
+ local_files_only=True,
54
+ )
55
+ text_encoder = dict(
56
+ type="t5",
57
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
58
+ model_max_length=200,
59
+ shardformer=True,
60
+ local_files_only=True,
61
+ )
62
+ scheduler = dict(
63
+ type="iddpm",
64
+ timestep_respacing="",
65
+ )
66
+
67
+ # Others
68
+ seed = 42
69
+ outputs = "outputs"
70
+ wandb = False
71
+
72
+ epochs = 1000
73
+ log_every = 10
74
+ ckpt_every = 500
75
+ load = None
76
+
77
+ batch_size = None
78
+ lr = 2e-5
79
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage3.py ADDED
@@ -0,0 +1,79 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 13s/it
11
+ "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
12
+ "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
13
+ "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
14
+ "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
15
+ "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
16
+ "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
17
+ "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
18
+ "1024": {1: (0.3, 40)},
19
+ }
20
+ mask_ratios = {
21
+ "mask_no": 0.75,
22
+ "mask_quarter_random": 0.025,
23
+ "mask_quarter_head": 0.025,
24
+ "mask_quarter_tail": 0.025,
25
+ "mask_quarter_head_tail": 0.05,
26
+ "mask_image_random": 0.025,
27
+ "mask_image_head": 0.025,
28
+ "mask_image_tail": 0.025,
29
+ "mask_image_head_tail": 0.05,
30
+ }
31
+
32
+ # Define acceleration
33
+ num_workers = 8
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ grad_checkpoint = True
37
+ plugin = "zero2"
38
+ sp_size = 1
39
+
40
+ # Define model
41
+ model = dict(
42
+ type="STDiT2-XL/2",
43
+ from_pretrained=None,
44
+ input_sq_size=512, # pretrained model is trained on 512x512
45
+ qk_norm=True,
46
+ enable_flashattn=True,
47
+ enable_layernorm_kernel=True,
48
+ )
49
+ vae = dict(
50
+ type="VideoAutoencoderKL",
51
+ from_pretrained="stabilityai/sd-vae-ft-ema",
52
+ micro_batch_size=4,
53
+ local_files_only=True,
54
+ )
55
+ text_encoder = dict(
56
+ type="t5",
57
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
58
+ model_max_length=200,
59
+ shardformer=True,
60
+ local_files_only=True,
61
+ )
62
+ scheduler = dict(
63
+ type="iddpm",
64
+ timestep_respacing="",
65
+ )
66
+
67
+ # Others
68
+ seed = 42
69
+ outputs = "outputs"
70
+ wandb = False
71
+
72
+ epochs = 1000
73
+ log_every = 10
74
+ ckpt_every = 500
75
+ load = None
76
+
77
+ batch_size = None
78
+ lr = 2e-5
79
+ grad_clip = 1.0
configs/opensora-v1-1/train/video.py ADDED
@@ -0,0 +1,67 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 6s/it
11
+ "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
12
+ "256": {1: (1.0, 256)},
13
+ "512": {1: (0.5, 80)},
14
+ "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
15
+ "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now
16
+ "1024": {1: (0.3, 20)},
17
+ "1080p": {1: (0.3, 8)},
18
+ }
19
+
20
+ # Define acceleration
21
+ num_workers = 4
22
+ num_bucket_build_workers = 16
23
+ dtype = "bf16"
24
+ grad_checkpoint = True
25
+ plugin = "zero2"
26
+ sp_size = 1
27
+
28
+ # Define model
29
+ model = dict(
30
+ type="STDiT2-XL/2",
31
+ from_pretrained=None,
32
+ input_sq_size=512, # pretrained model is trained on 512x512
33
+ qk_norm=True,
34
+ enable_flashattn=True,
35
+ enable_layernorm_kernel=True,
36
+ )
37
+ vae = dict(
38
+ type="VideoAutoencoderKL",
39
+ from_pretrained="stabilityai/sd-vae-ft-ema",
40
+ micro_batch_size=4,
41
+ local_files_only=True,
42
+ )
43
+ text_encoder = dict(
44
+ type="t5",
45
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
46
+ model_max_length=200,
47
+ shardformer=True,
48
+ local_files_only=True,
49
+ )
50
+ scheduler = dict(
51
+ type="iddpm",
52
+ timestep_respacing="",
53
+ )
54
+
55
+ # Others
56
+ seed = 42
57
+ outputs = "outputs"
58
+ wandb = False
59
+
60
+ epochs = 1000
61
+ log_every = 10
62
+ ckpt_every = 500
63
+ load = None
64
+
65
+ batch_size = 10 # only for logging
66
+ lr = 2e-5
67
+ grad_clip = 1.0
configs/opensora/inference/16x256x256.py CHANGED
@@ -7,13 +7,14 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
- enable_flashattn=False,
11
- enable_layernorm_kernel=False,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
 
17
  )
18
  text_encoder = dict(
19
  type="t5",
@@ -24,11 +25,15 @@ scheduler = dict(
24
  type="iddpm",
25
  num_sampling_steps=100,
26
  cfg_scale=7.0,
 
27
  )
28
- dtype = "fp16"
 
 
 
 
29
 
30
  # Others
31
- batch_size = 2
32
  seed = 42
33
- prompt_path = "./assets/texts/t2v_samples.txt"
34
- save_dir = "./outputs/samples/"
 
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
+ enable_flashattn=True,
11
+ enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
+ micro_batch_size=4,
18
  )
19
  text_encoder = dict(
20
  type="t5",
 
25
  type="iddpm",
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
+ cfg_channel=3, # or None
29
  )
30
+ dtype = "bf16"
31
+
32
+ # Condition
33
+ prompt_path = "./assets/texts/t2v_samples.txt"
34
+ prompt = None # prompt has higher priority than prompt_path
35
 
36
  # Others
37
+ batch_size = 1
38
  seed = 42
39
+ save_dir = "./samples/samples/"
 
configs/opensora/inference/16x512x512.py CHANGED
@@ -7,14 +7,14 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
- enable_flashattn=False,
11
- enable_layernorm_kernel=False,
12
- from_pretrained="PRETRAINED_MODEL"
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
- micro_batch_size=128,
18
  )
19
  text_encoder = dict(
20
  type="t5",
@@ -26,10 +26,10 @@ scheduler = dict(
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
- dtype = "fp16"
30
 
31
  # Others
32
  batch_size = 2
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
- save_dir = "./outputs/samples/"
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
+ enable_flashattn=True,
11
+ enable_layernorm_kernel=True,
12
+ from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
+ micro_batch_size=2,
18
  )
19
  text_encoder = dict(
20
  type="t5",
 
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
+ dtype = "bf16"
30
 
31
  # Others
32
  batch_size = 2
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
+ save_dir = "./samples/samples/"
configs/opensora/inference/64x512x512.py CHANGED
@@ -1,5 +1,5 @@
1
- num_frames = 16
2
- fps = 24 //4
3
  image_size = (512, 512)
4
 
5
  # Define model
@@ -7,8 +7,8 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
- enable_flashattn=False,
11
- enable_layernorm_kernel=False,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
@@ -23,13 +23,13 @@ text_encoder = dict(
23
  )
24
  scheduler = dict(
25
  type="iddpm",
26
- num_sampling_steps=50,
27
  cfg_scale=7.0,
28
  )
29
- dtype = "fp16"
30
 
31
  # Others
32
  batch_size = 1
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
- save_dir = "./outputs/samples/"
 
1
+ num_frames = 64
2
+ fps = 24 // 2
3
  image_size = (512, 512)
4
 
5
  # Define model
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
+ enable_flashattn=True,
11
+ enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
 
23
  )
24
  scheduler = dict(
25
  type="iddpm",
26
+ num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
+ dtype = "bf16"
30
 
31
  # Others
32
  batch_size = 1
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
+ save_dir = "./samples/samples/"
configs/opensora/train/16x256x256-mask.py ADDED
@@ -0,0 +1,60 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flashattn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+ mask_ratios = {
27
+ "mask_no": 0.7,
28
+ "mask_random": 0.15,
29
+ "mask_head": 0.05,
30
+ "mask_tail": 0.05,
31
+ "mask_head_tail": 0.05,
32
+ }
33
+ vae = dict(
34
+ type="VideoAutoencoderKL",
35
+ from_pretrained="stabilityai/sd-vae-ft-ema",
36
+ )
37
+ text_encoder = dict(
38
+ type="t5",
39
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
40
+ model_max_length=120,
41
+ shardformer=True,
42
+ )
43
+ scheduler = dict(
44
+ type="iddpm",
45
+ timestep_respacing="",
46
+ )
47
+
48
+ # Others
49
+ seed = 42
50
+ outputs = "outputs"
51
+ wandb = False
52
+
53
+ epochs = 1000
54
+ log_every = 10
55
+ ckpt_every = 1000
56
+ load = None
57
+
58
+ batch_size = 8
59
+ lr = 2e-5
60
+ grad_clip = 1.0
configs/opensora/train/16x256x256-spee.py ADDED
@@ -0,0 +1,60 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flashattn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+ mask_ratios = {
27
+ "mask_no": 0.5,
28
+ "mask_random": 0.29,
29
+ "mask_head": 0.07,
30
+ "mask_tail": 0.07,
31
+ "mask_head_tail": 0.07,
32
+ }
33
+ vae = dict(
34
+ type="VideoAutoencoderKL",
35
+ from_pretrained="stabilityai/sd-vae-ft-ema",
36
+ )
37
+ text_encoder = dict(
38
+ type="t5",
39
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
40
+ model_max_length=120,
41
+ shardformer=True,
42
+ )
43
+ scheduler = dict(
44
+ type="iddpm-speed",
45
+ timestep_respacing="",
46
+ )
47
+
48
+ # Others
49
+ seed = 42
50
+ outputs = "outputs"
51
+ wandb = False
52
+
53
+ epochs = 1000
54
+ log_every = 10
55
+ ckpt_every = 1000
56
+ load = None
57
+
58
+ batch_size = 8
59
+ lr = 2e-5
60
+ grad_clip = 1.0
configs/opensora/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -29,7 +29,7 @@ vae = dict(
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
- from_pretrained="./pretrained_models/t5_ckpts",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
configs/opensora/train/16x512x512.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/opensora/train/360x512x512.py CHANGED
@@ -1,12 +1,18 @@
1
- num_frames = 360
2
- frame_interval = 1
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
 
 
 
 
 
 
9
  num_workers = 4
 
 
 
 
10
 
11
  # Define acceleration
12
  dtype = "bf16"
@@ -31,7 +37,7 @@ vae = dict(
31
  )
32
  text_encoder = dict(
33
  type="t5",
34
- from_pretrained="./pretrained_models/t5_ckpts",
35
  model_max_length=120,
36
  shardformer=True,
37
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=360,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
+
10
+ # Define acceleration
11
  num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
 
17
  # Define acceleration
18
  dtype = "bf16"
 
37
  )
38
  text_encoder = dict(
39
  type="t5",
40
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
41
  model_max_length=120,
42
  shardformer=True,
43
  )
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -1,17 +1,17 @@
1
- num_frames = 64
2
- frame_interval = 2
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
- plugin = "zero2-seq"
15
  sp_size = 2
16
 
17
  # Define model
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
+ plugin = "zero2"
15
  sp_size = 2
16
 
17
  # Define model
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/opensora/train/64x512x512.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 64
2
- frame_interval = 2
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=64,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/pixart/inference/16x256x256.py CHANGED
@@ -15,7 +15,7 @@ vae = dict(
15
  )
16
  text_encoder = dict(
17
  type="t5",
18
- from_pretrained="./pretrained_models/t5_ckpts",
19
  model_max_length=120,
20
  )
21
  scheduler = dict(
@@ -23,10 +23,10 @@ scheduler = dict(
23
  num_sampling_steps=20,
24
  cfg_scale=7.0,
25
  )
26
- dtype = "fp16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/t2v_samples.txt"
32
- save_dir = "./outputs/samples/"
 
15
  )
16
  text_encoder = dict(
17
  type="t5",
18
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
19
  model_max_length=120,
20
  )
21
  scheduler = dict(
 
23
  num_sampling_steps=20,
24
  cfg_scale=7.0,
25
  )
26
+ dtype = "bf16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/t2v_samples.txt"
32
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x1024MS.py CHANGED
@@ -1,7 +1,7 @@
1
  num_frames = 1
2
  fps = 1
3
  image_size = (1920, 512)
4
- multi_resolution = True
5
 
6
  # Define model
7
  model = dict(
@@ -17,7 +17,7 @@ vae = dict(
17
  )
18
  text_encoder = dict(
19
  type="t5",
20
- from_pretrained="./pretrained_models/t5_ckpts",
21
  model_max_length=120,
22
  )
23
  scheduler = dict(
@@ -25,10 +25,10 @@ scheduler = dict(
25
  num_sampling_steps=20,
26
  cfg_scale=7.0,
27
  )
28
- dtype = "fp16"
29
 
30
  # Others
31
  batch_size = 2
32
  seed = 42
33
  prompt_path = "./assets/texts/t2i_samples.txt"
34
- save_dir = "./outputs/samples/"
 
1
  num_frames = 1
2
  fps = 1
3
  image_size = (1920, 512)
4
+ multi_resolution = "PixArtMS"
5
 
6
  # Define model
7
  model = dict(
 
17
  )
18
  text_encoder = dict(
19
  type="t5",
20
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
21
  model_max_length=120,
22
  )
23
  scheduler = dict(
 
25
  num_sampling_steps=20,
26
  cfg_scale=7.0,
27
  )
28
+ dtype = "bf16"
29
 
30
  # Others
31
  batch_size = 2
32
  seed = 42
33
  prompt_path = "./assets/texts/t2i_samples.txt"
34
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x256x256.py CHANGED
@@ -16,7 +16,7 @@ vae = dict(
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
- from_pretrained="./pretrained_models/t5_ckpts",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
@@ -24,10 +24,10 @@ scheduler = dict(
24
  num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
- dtype = "fp16"
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
  prompt_path = "./assets/texts/t2i_samples.txt"
33
- save_dir = "./outputs/samples/"
 
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
 
24
  num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
+ dtype = "bf16"
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
  prompt_path = "./assets/texts/t2i_samples.txt"
33
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x512x512.py CHANGED
@@ -16,7 +16,7 @@ vae = dict(
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
- from_pretrained="./pretrained_models/t5_ckpts",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
@@ -24,10 +24,16 @@ scheduler = dict(
24
  num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
- dtype = "fp16"
 
 
 
 
 
 
 
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
- prompt_path = "./assets/texts/t2i_samples.txt"
33
- save_dir = "./outputs/samples/"
 
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
 
24
  num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
+ dtype = "bf16"
28
+
29
+ # prompt_path = "./assets/texts/t2i_samples.txt"
30
+ prompt = [
31
+ "Pirate ship trapped in a cosmic maelstrom nebula.",
32
+ "A small cactus with a happy face in the Sahara desert.",
33
+ "A small cactus with a sad face in the Sahara desert.",
34
+ ]
35
 
36
  # Others
37
  batch_size = 2
38
  seed = 42
39
+ save_dir = "./samples/samples/"
 
configs/pixart/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
@@ -29,7 +29,7 @@ vae = dict(
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
- from_pretrained="./pretrained_models/t5_ckpts",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
configs/pixart/train/1x512x512.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 1
2
- frame_interval = 1
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = True
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=1,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/pixart/train/64x512x512.py CHANGED
@@ -1,19 +1,20 @@
1
- num_frames = 64
2
- frame_interval = 2
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
17
  # Define model
18
  model = dict(
19
  type="PixArt-XL/2",
@@ -30,7 +31,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=64,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
17
+
18
  # Define model
19
  model = dict(
20
  type="PixArt-XL/2",
 
31
  )
32
  text_encoder = dict(
33
  type="t5",
34
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
35
  model_max_length=120,
36
  shardformer=True,
37
  )