frankleeeee committed
Commit 68404e4
1 Parent(s): 3f75658

uploaded app

app.py ADDED
@@ -0,0 +1,248 @@
+#!/usr/bin/env python
+"""
+This script runs a Gradio App for the Open-Sora model.
+
+Usage:
+    python app.py [--model-type <model-type>]
+"""
+
+import argparse
+import importlib
+import os
+import subprocess
+import sys
+
+import spaces
+import torch
+
+import gradio as gr
+
+MODEL_TYPES = ["v1-16x256x256", "v1-HQ-16x256x256", "v1-HQ-16x512x512"]
+CONFIG_MAP = {
+    "v1-16x256x256": "configs/opensora/inference/16x256x256.py",
+    "v1-HQ-16x256x256": "configs/opensora/inference/16x256x256.py",
+    "v1-HQ-16x512x512": "configs/opensora/inference/16x512x512.py",
+}
+HF_STDIT_MAP = {
+    "v1-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-16x256x256",
+    "v1-HQ-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",
+    "v1-HQ-16x512x512": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x512x512",
+}
+
+
+def install_dependencies(enable_optimization=False):
+    """
+    Install the required dependencies for the demo if they are not already installed.
+    """
+
+    def _is_package_available(name) -> bool:
+        try:
+            importlib.import_module(name)
+            return True
+        except (ImportError, ModuleNotFoundError):
+            return False
+
+    # flash attention is required whether or not optimization is enabled,
+    # because Hugging Face transformers detects flash_attn as a dependency of STDiT;
+    # thus, we need to install it in all cases
+    if not _is_package_available("flash_attn"):
+        subprocess.run(
+            f"{sys.executable} -m pip install flash-attn --no-build-isolation",
+            env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
+            shell=True,
+        )
+
+    if enable_optimization:
+        # install apex for fused layernorm
+        if not _is_package_available("apex"):
+            subprocess.run(
+                f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git',
+                shell=True,
+            )
+
+        # install ninja
+        if not _is_package_available("ninja"):
+            subprocess.run(f"{sys.executable} -m pip install ninja", shell=True)
+
+        # install xformers
+        if not _is_package_available("xformers"):
+            subprocess.run(
+                f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
+                shell=True,
+            )
+
+
+def read_config(config_path):
+    """
+    Read the configuration file.
+    """
+    from mmengine.config import Config
+
+    return Config.fromfile(config_path)
+
+
+def build_models(model_type, config):
+    """
+    Build the models for the given model type and configuration.
+    """
+    # build vae
+    from opensora.registry import MODELS, build_module
+
+    vae = build_module(config.vae, MODELS).cuda()
+
+    # build text encoder
+    text_encoder = build_module(config.text_encoder, MODELS)  # T5 must be fp32
+    text_encoder.t5.model = text_encoder.t5.model.cuda()
+
+    # build stdit
+    # we load the model from Hugging Face directly so that we don't need to
+    # handle model download logic in the Hugging Face Space
+    from transformers import AutoModel
+
+    stdit = AutoModel.from_pretrained(
+        HF_STDIT_MAP[model_type],
+        enable_flash_attn=False,
+        enable_layernorm_kernel=False,
+        trust_remote_code=True,
+    ).cuda()
+
+    # build scheduler
+    from opensora.registry import SCHEDULERS
+
+    scheduler = build_module(config.scheduler, SCHEDULERS)
+
+    # hack for classifier-free guidance
+    text_encoder.y_embedder = stdit.y_embedder
+
+    # move models to device
+    vae = vae.to(torch.float16).eval()
+    text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must stay in fp32
+    stdit = stdit.to(torch.float16).eval()
+    return vae, text_encoder, stdit, scheduler
+
+
+def get_latent_size(config, vae):
+    input_size = (config.num_frames, *config.image_size)
+    latent_size = vae.get_latent_size(input_size)
+    return latent_size
+
+
+def parse_args():
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--model-type",
+        default="v1-HQ-16x256x256",
+        choices=MODEL_TYPES,
+        help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
+    )
+    parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
+    parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
+    parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio App on.")
+    parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
+    parser.add_argument(
+        "--enable-optimization",
+        action="store_true",
+        help="Whether to enable optimization such as flash attention and fused layernorm",
+    )
+    return parser.parse_args()
+
+
+# ============================
+# Main Gradio Script
+# ============================
+# since `run_inference` needs to be wrapped by `spaces.GPU` and its only input can be the prompt text,
+# we can't pass the models to `run_inference` as arguments;
+# instead, we define them globally so that they are accessible inside `run_inference`
+
+# read config
+args = parse_args()
+config = read_config(CONFIG_MAP[args.model_type])
+
+# make outputs dir
+os.makedirs(args.output, exist_ok=True)
+
+# disable torch JIT as it can cause failures in the Gradio SDK
+# (the Gradio SDK uses torch built with CUDA 11.3)
+torch.jit._state.disable()
+
+# set up
+install_dependencies(enable_optimization=args.enable_optimization)
+
+# build model
+vae, text_encoder, stdit, scheduler = build_models(args.model_type, config)
+
+
+@spaces.GPU(duration=200)
+def run_inference(prompt_text):
+    from opensora.datasets import save_sample
+
+    latent_size = get_latent_size(config, vae)
+    samples = scheduler.sample(
+        stdit,
+        text_encoder,
+        z_size=(vae.out_channels, *latent_size),
+        prompts=[prompt_text],
+        device="cuda",
+    )
+
+    samples = vae.decode(samples.to(torch.float16))
+    filename = f"{args.output}/sample"
+    saved_path = save_sample(samples[0], fps=config.fps, save_path=filename)
+    return saved_path
+
+
+def main():
+    # create demo
+    with gr.Blocks() as demo:
+        with gr.Row():
+            with gr.Column():
+                gr.HTML(
+                    """
+                    <div style='text-align: center;'>
+                        <p align="center">
+                            <img src="https://github.com/hpcaitech/Open-Sora/raw/main/assets/readme/icon.png" width="250"/>
+                        </p>
+                        <div style="display: flex; gap: 10px; justify-content: center;">
+                            <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
+                            <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
+                            <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
+                            <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
+                            <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
+                            <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
+                            <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
+                        </div>
+                        <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
+                    </div>
+                    """
+                )
+
+        with gr.Row():
+            with gr.Column():
+                prompt_text = gr.Textbox(show_label=False, placeholder="Describe your video here", lines=4)
+                submit_button = gr.Button("Generate video")
+
+            with gr.Column():
+                output_video = gr.Video()
+
+        submit_button.click(fn=run_inference, inputs=[prompt_text], outputs=output_video)
+
+        gr.Examples(
+            examples=[
+                [
+                    "The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.",
+                ],
+            ],
+            fn=run_inference,
+            inputs=[
+                prompt_text,
+            ],
+            outputs=[output_video],
+            cache_examples=True,
+        )
+
+    # launch
+    demo.launch(server_port=args.port, server_name=args.host, share=args.share)
+
+
+if __name__ == "__main__":
+    main()
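
The config files added below are plain Python modules. app.py turns a model-type choice into one of them via CONFIG_MAP, reads it with mmengine (read_config), and instantiates the pieces through the Open-Sora registry (build_models). A minimal sketch of that flow, assuming the opensora package from this repository is installed; the printed values come from configs/opensora/inference/16x256x256.py below:

    from mmengine.config import Config
    from opensora.registry import MODELS, SCHEDULERS, build_module

    cfg = Config.fromfile("configs/opensora/inference/16x256x256.py")
    print(cfg.num_frames, cfg.image_size)        # 16 (256, 256)
    vae = build_module(cfg.vae, MODELS)          # VideoAutoencoderKL wrapping stabilityai/sd-vae-ft-ema
    text_encoder = build_module(cfg.text_encoder, MODELS)
    scheduler = build_module(cfg.scheduler, SCHEDULERS)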
configs/dit/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./outputs/samples/"
configs/dit/inference/1x256x256-class.py ADDED
@@ -0,0 +1,31 @@
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="label_1000",
+    from_pretrained="DiT-XL-2-256x256.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=1000,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_id.txt"
+save_dir = "./outputs/samples/"
configs/dit/inference/1x256x256.py ADDED
@@ -0,0 +1,32 @@
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/imagenet_labels.txt"
+save_dir = "./outputs/samples/"
configs/dit/train/16x256x256.py ADDED
@@ -0,0 +1,50 @@
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    from_pretrained="DiT-XL-2-256x256.pt",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/dit/train/1x256x256.py ADDED
@@ -0,0 +1,50 @@
+num_frames = 1
+frame_interval = 1
+image_size = (256, 256)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = True
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="DiT-XL/2",
+    no_temporal_pos_emb=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 128
+lr = 1e-4  # according to DiT repo
+grad_clip = 1.0
configs/latte/inference/16x256x256-class.py ADDED
@@ -0,0 +1,30 @@
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="Latte-XL/2",
+    condition="label_101",
+    from_pretrained="Latte-XL-2-256x256-ucf101.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="classes",
+    num_classes=101,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_id.txt"
+save_dir = "./outputs/samples/"
configs/latte/inference/16x256x256.py ADDED
@@ -0,0 +1,31 @@
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="Latte-XL/2",
+    condition="text",
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=4.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/ucf101_labels.txt"
+save_dir = "./outputs/samples/"
configs/latte/train/16x256x256.py ADDED
@@ -0,0 +1,49 @@
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="Latte-XL/2",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="clip",
+    from_pretrained="openai/clip-vit-base-patch32",
+    model_max_length=77,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/inference/16x256x256.py ADDED
@@ -0,0 +1,36 @@
+num_frames = 16
+fps = 24 // 3
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3,  # or None
+)
+dtype = "fp16"
+
+# Others
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./outputs/samples/"
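
Both the "v1-16x256x256" and "v1-HQ-16x256x256" entries in app.py's CONFIG_MAP point at this file, but the STDiT weights are not taken from the from_pretrained placeholder here: the Space pulls them from the Hugging Face Hub instead. A sketch of that step, mirroring build_models in app.py above (repository names come from HF_STDIT_MAP):

    import torch
    from transformers import AutoModel

    stdit = AutoModel.from_pretrained(
        "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",  # HF_STDIT_MAP["v1-HQ-16x256x256"]
        enable_flash_attn=False,
        enable_layernorm_kernel=False,
        trust_remote_code=True,  # the STDiT class is defined in the Hub repo, not in transformers
    )
    stdit = stdit.to(torch.float16).eval()  # app.py additionally calls .cuda() before this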
configs/opensora/inference/16x512x512.py ADDED
@@ -0,0 +1,35 @@
+num_frames = 16
+fps = 24 // 3
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=2,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./outputs/samples/"
configs/opensora/inference/64x512x512.py ADDED
@@ -0,0 +1,35 @@
+num_frames = 64
+fps = 24 // 2
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=128,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./outputs/samples/"
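
The fps value in these inference configs keeps the division visible: assuming 24 fps source clips, sampling every Nth frame (the frame_interval used in the train configs further below) gives a playback rate of 24 // N. For this 64-frame config the arithmetic works out as follows (the 24 fps source rate is an assumption inferred from the paired configs, not stated in this commit):

    # assumption: source clips at 24 fps, sampled every 2 frames (frame_interval = 2 in the 64x512x512 train configs)
    fps = 24 // 2                      # 12 frames per second at playback
    num_frames = 64
    clip_seconds = num_frames / fps    # about 5.3 seconds of video per sample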
configs/opensora/train/16x256x256.py ADDED
@@ -0,0 +1,53 @@
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/16x512x512.py ADDED
@@ -0,0 +1,54 @@
+num_frames = 16
+frame_interval = 3
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    from_pretrained=None,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=128,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/360x512x512.py ADDED
@@ -0,0 +1,55 @@
+num_frames = 360
+frame_interval = 1
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2-seq"
+sp_size = 2
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained=None,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    enable_sequence_parallelism=True,  # enable sequence parallelism here
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=128,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 250
+load = None
+
+batch_size = 1
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/64x512x512-sp.py ADDED
@@ -0,0 +1,54 @@
+num_frames = 64
+frame_interval = 2
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2-seq"
+sp_size = 2
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained=None,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+    enable_sequence_parallelism=True,  # enable sequence parallelism here
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 1
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/64x512x512.py ADDED
@@ -0,0 +1,54 @@
+num_frames = 64
+frame_interval = 2
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained=None,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=64,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 250
+load = None
+
+batch_size = 4
+lr = 2e-5
+grad_clip = 1.0
configs/pixart/inference/16x256x256.py ADDED
@@ -0,0 +1,32 @@
+num_frames = 16
+fps = 8
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="outputs/098-F16S3-PixArt-XL-2/epoch7-global_step30000/model_ckpt.pt",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./outputs/samples/"
configs/pixart/inference/1x1024MS.py ADDED
@@ -0,0 +1,34 @@
+num_frames = 1
+fps = 1
+image_size = (1920, 512)
+multi_resolution = True
+
+# Define model
+model = dict(
+    type="PixArtMS-XL/2",
+    space_scale=2.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-1024-MS.pth",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./outputs/samples/"
configs/pixart/inference/1x256x256.py ADDED
@@ -0,0 +1,33 @@
+num_frames = 1
+fps = 1
+image_size = (256, 256)
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-256x256.pth",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./outputs/samples/"
configs/pixart/inference/1x512x512.py ADDED
@@ -0,0 +1,33 @@
+num_frames = 1
+fps = 1
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "fp16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2i_samples.txt"
+save_dir = "./outputs/samples/"
configs/pixart/train/16x256x256.py ADDED
@@ -0,0 +1,53 @@
+num_frames = 16
+frame_interval = 3
+image_size = (256, 256)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/pixart/train/1x512x512.py ADDED
@@ -0,0 +1,54 @@
+num_frames = 1
+frame_interval = 1
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = True
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 32
+lr = 2e-5
+grad_clip = 1.0
configs/pixart/train/64x512x512.py ADDED
@@ -0,0 +1,54 @@
+num_frames = 64
+frame_interval = 2
+image_size = (512, 512)
+
+# Define dataset
+root = None
+data_path = "CSV_PATH"
+use_image_transform = False
+num_workers = 4
+
+# Define acceleration
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=2 / 3,
+    from_pretrained=None,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=128,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 250
+load = None
+
+batch_size = 4
+lr = 2e-5
+grad_clip = 1.0
requirements.txt ADDED
@@ -0,0 +1,3 @@
+xformers
+git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
+transformers