frankleeeee committed
Commit 5613724
1 Parent(s): 68404e4

updated to v1.1

Files changed (35)
  1. app.py +299 -54
  2. configs/dit/inference/16x256x256.py +2 -2
  3. configs/dit/inference/1x256x256-class.py +2 -2
  4. configs/dit/inference/1x256x256.py +2 -2
  5. configs/dit/train/16x256x256.py +9 -9
  6. configs/dit/train/1x256x256.py +9 -8
  7. configs/latte/inference/16x256x256-class.py +2 -2
  8. configs/latte/inference/16x256x256.py +2 -2
  9. configs/latte/train/16x256x256.py +8 -8
  10. configs/opensora-v1-1/inference/sample-ref.py +62 -0
  11. configs/opensora-v1-1/inference/sample.py +43 -0
  12. configs/opensora-v1-1/train/benchmark.py +101 -0
  13. configs/opensora-v1-1/train/image.py +65 -0
  14. configs/opensora-v1-1/train/stage1.py +77 -0
  15. configs/opensora-v1-1/train/stage2.py +79 -0
  16. configs/opensora-v1-1/train/stage3.py +79 -0
  17. configs/opensora-v1-1/train/video.py +67 -0
  18. configs/opensora/inference/16x256x256.py +7 -4
  19. configs/opensora/inference/16x512x512.py +3 -3
  20. configs/opensora/inference/64x512x512.py +2 -2
  21. configs/opensora/train/16x256x256-mask.py +60 -0
  22. configs/opensora/train/16x256x256-spee.py +60 -0
  23. configs/opensora/train/16x256x256.py +8 -8
  24. configs/opensora/train/16x512x512.py +9 -9
  25. configs/opensora/train/360x512x512.py +13 -7
  26. configs/opensora/train/64x512x512-sp.py +9 -9
  27. configs/opensora/train/64x512x512.py +8 -8
  28. configs/pixart/inference/16x256x256.py +2 -2
  29. configs/pixart/inference/1x1024MS.py +3 -3
  30. configs/pixart/inference/1x256x256.py +2 -2
  31. configs/pixart/inference/1x512x512.py +9 -3
  32. configs/pixart/train/16x256x256.py +9 -9
  33. configs/pixart/train/1x512x512.py +8 -8
  34. configs/pixart/train/64x512x512.py +9 -8
  35. requirements.txt +1 -1
app.py CHANGED
@@ -11,25 +11,148 @@ import importlib
 import os
 import subprocess
 import sys
+import re
+import json
+import math
 
 import spaces
 import torch
 
 import gradio as gr
 
-MODEL_TYPES = ["v1-16x256x256", "v1-HQ-16x256x256", "v1-HQ-16x512x512"]
+
+MODEL_TYPES = ["v1.1"]
 CONFIG_MAP = {
-    "v1-16x256x256": "configs/opensora/inference/16x256x256.py",
-    "v1-HQ-16x256x256": "configs/opensora/inference/16x256x256.py",
-    "v1-HQ-16x512x512": "configs/opensora/inference/16x512x512.py",
+    "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
+    "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
 }
 HF_STDIT_MAP = {
-    "v1-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-16x256x256",
-    "v1-HQ-16x256x256": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x256x256",
-    "v1-HQ-16x512x512": "hpcai-tech/OpenSora-STDiT-v1-HQ-16x512x512",
+    "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2",
+    "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
+}
+RESOLUTION_MAP = {
+    "144p": (144, 256),
+    "240p": (240, 426),
+    "360p": (360, 480),
+    "480p": (480, 858),
+    "720p": (720, 1280),
+    "1080p": (1080, 1920)
 }
 
 
+# ============================
+# Utils
+# ============================
+def collect_references_batch(reference_paths, vae, image_size):
+    from opensora.datasets.utils import read_from_path
+
+    refs_x = []
+    for reference_path in reference_paths:
+        if reference_path is None:
+            refs_x.append([])
+            continue
+        ref_path = reference_path.split(";")
+        ref = []
+        for r_path in ref_path:
+            r = read_from_path(r_path, image_size, transform_name="resize_crop")
+            r_x = vae.encode(r.unsqueeze(0).to(vae.device, vae.dtype))
+            r_x = r_x.squeeze(0)
+            ref.append(r_x)
+        refs_x.append(ref)
+    # refs_x: [batch, ref_num, C, T, H, W]
+    return refs_x
+
+
+def process_mask_strategy(mask_strategy):
+    mask_batch = []
+    mask_strategy = mask_strategy.split(";")
+    for mask in mask_strategy:
+        mask_group = mask.split(",")
+        assert len(mask_group) >= 1 and len(mask_group) <= 6, f"Invalid mask strategy: {mask}"
+        if len(mask_group) == 1:
+            mask_group.extend(["0", "0", "0", "1", "0"])
+        elif len(mask_group) == 2:
+            mask_group.extend(["0", "0", "1", "0"])
+        elif len(mask_group) == 3:
+            mask_group.extend(["0", "1", "0"])
+        elif len(mask_group) == 4:
+            mask_group.extend(["1", "0"])
+        elif len(mask_group) == 5:
+            mask_group.append("0")
+        mask_batch.append(mask_group)
+    return mask_batch
+
+
+def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
+    masks = []
+    for i, mask_strategy in enumerate(mask_strategys):
+        mask = torch.ones(z.shape[2], dtype=torch.float, device=z.device)
+        if mask_strategy is None:
+            masks.append(mask)
+            continue
+        mask_strategy = process_mask_strategy(mask_strategy)
+        for mst in mask_strategy:
+            loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = mst
+            loop_id = int(loop_id)
+            if loop_id != loop_i:
+                continue
+            m_id = int(m_id)
+            m_ref_start = int(m_ref_start)
+            m_length = int(m_length)
+            m_target_start = int(m_target_start)
+            edit_ratio = float(edit_ratio)
+            ref = refs_x[i][m_id]  # [C, T, H, W]
+            if m_ref_start < 0:
+                m_ref_start = ref.shape[1] + m_ref_start
+            if m_target_start < 0:
+                # z: [B, C, T, H, W]
+                m_target_start = z.shape[2] + m_target_start
+            z[i, :, m_target_start : m_target_start + m_length] = ref[:, m_ref_start : m_ref_start + m_length]
+            mask[m_target_start : m_target_start + m_length] = edit_ratio
+        masks.append(mask)
+    masks = torch.stack(masks)
+    return masks
+
+
+def process_prompts(prompts, num_loop):
+    from opensora.models.text_encoder.t5 import text_preprocessing
+
+    ret_prompts = []
+    for prompt in prompts:
+        if prompt.startswith("|0|"):
+            prompt_list = prompt.split("|")[1:]
+            text_list = []
+            for i in range(0, len(prompt_list), 2):
+                start_loop = int(prompt_list[i])
+                text = prompt_list[i + 1]
+                text = text_preprocessing(text)
+                end_loop = int(prompt_list[i + 2]) if i + 2 < len(prompt_list) else num_loop
+                text_list.extend([text] * (end_loop - start_loop))
+            assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
+            ret_prompts.append(text_list)
+        else:
+            prompt = text_preprocessing(prompt)
+            ret_prompts.append([prompt] * num_loop)
+    return ret_prompts
+
+
+def extract_json_from_prompts(prompts):
+    additional_infos = []
+    ret_prompts = []
+    for prompt in prompts:
+        parts = re.split(r"(?=[{\[])", prompt)
+        assert len(parts) <= 2, f"Invalid prompt: {prompt}"
+        ret_prompts.append(parts[0])
+        if len(parts) == 1:
+            additional_infos.append({})
+        else:
+            additional_infos.append(json.loads(parts[1]))
+    return ret_prompts, additional_infos
+
+
+# ============================
+# Runtime Environment
+# ============================
 def install_dependencies(enable_optimization=False):
     """
     Install the required dependencies for the demo if they are not already installed.
@@ -72,6 +195,9 @@ def install_dependencies(enable_optimization=False):
        )
 
 
+# ============================
+# Model-related
+# ============================
 def read_config(config_path):
     """
     Read the configuration file.
@@ -81,7 +207,7 @@ def read_config(config_path):
     return Config.fromfile(config_path)
 
 
-def build_models(model_type, config):
+def build_models(model_type, config, enable_optimization=False):
     """
     Build the models for the given model type and configuration.
     """
@@ -101,8 +227,7 @@ def build_models(model_type, config):
 
     stdit = AutoModel.from_pretrained(
        HF_STDIT_MAP[model_type],
-        enable_flash_attn=False,
-        enable_layernorm_kernel=False,
+        enable_flash_attn=enable_optimization,
        trust_remote_code=True,
    ).cuda()
 
@@ -115,23 +240,20 @@ def build_models(model_type, config):
     text_encoder.y_embedder = stdit.y_embedder
 
     # move modelst to device
-    vae = vae.to(torch.float16).eval()
+    vae = vae.to(torch.bfloat16).eval()
     text_encoder.t5.model = text_encoder.t5.model.eval()  # t5 must be in fp32
-    stdit = stdit.to(torch.float16).eval()
-    return vae, text_encoder, stdit, scheduler
-
+    stdit = stdit.to(torch.bfloat16).eval()
 
-def get_latent_size(config, vae):
-    input_size = (config.num_frames, *config.image_size)
-    latent_size = vae.get_latent_size(input_size)
-    return latent_size
+    # clear cuda
+    torch.cuda.empty_cache()
+    return vae, text_encoder, stdit, scheduler
 
 
 def parse_args():
     parser = argparse.ArgumentParser()
     parser.add_argument(
         "--model-type",
-        default="v1-HQ-16x256x256",
+        default="v1.1-stage3",
         choices=MODEL_TYPES,
         help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
     )
@@ -168,27 +290,129 @@ torch.jit._state.disable()
 # set up
 install_dependencies(enable_optimization=args.enable_optimization)
 
+# import after installation
+from opensora.datasets import IMG_FPS, save_sample
+from opensora.utils.misc import to_torch_dtype
+
+# some global variables
+dtype = to_torch_dtype(config.dtype)
+device = torch.device("cuda")
+
 # build model
-vae, text_encoder, stdit, scheduler = build_models(args.model_type, config)
+vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
 
 
 @spaces.GPU(duration=200)
-def run_inference(prompt_text):
-    from opensora.datasets import save_sample
-
-    latent_size = get_latent_size(config, vae)
-    samples = scheduler.sample(
-        stdit,
-        text_encoder,
-        z_size=(vae.out_channels, *latent_size),
-        prompts=[prompt_text],
-        device="cuda",
-    )
-
-    samples = vae.decode(samples.to(torch.float16))
-    filename = f"{args.output}/sample"
-    saved_path = save_sample(samples[0], fps=config.fps, save_path=filename)
-    return saved_path
+def run_inference(mode, prompt_text, resolution, length, reference_image):
+    with torch.inference_mode():
+        # ======================
+        # 1. Preparation
+        # ======================
+        # parse the inputs
+        resolution = RESOLUTION_MAP[resolution]
+
+        # compute number of loops
+        num_seconds = int(length.rstrip('s'))
+        total_number_of_frames = num_seconds * config.fps / config.frame_interval
+        num_loop = math.ceil(total_number_of_frames / config.num_frames)
+
+        # prepare model args
+        model_args = dict()
+        height = torch.tensor([resolution[0]], device=device, dtype=dtype)
+        width = torch.tensor([resolution[1]], device=device, dtype=dtype)
+        num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
+        ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
+        if config.num_frames == 1:
+            config.fps = IMG_FPS
+        fps = torch.tensor([config.fps], device=device, dtype=dtype)
+        model_args["height"] = height
+        model_args["width"] = width
+        model_args["num_frames"] = num_frames
+        model_args["ar"] = ar
+        model_args["fps"] = fps
+
+        # compute latent size
+        input_size = (config.num_frames, *resolution)
+        latent_size = vae.get_latent_size(input_size)
+
+        # process prompt
+        prompt_raw = [prompt_text]
+        prompt_raw, _ = extract_json_from_prompts(prompt_raw)
+        prompt_loops = process_prompts(prompt_raw, num_loop)
+        video_clips = []
+
+        # prepare mask strategy
+        if mode == "Text2Video":
+            mask_strategy = [None]
+        elif mode == "Image2Video":
+            mask_strategy = ['0']
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        # =========================
+        # 2. Load reference images
+        # =========================
+        if mode == "Text2Video":
+            refs_x = collect_references_batch([None], vae, resolution)
+        elif mode == "Image2Video":
+            # save image to disk
+            from PIL import Image
+            im = Image.fromarray(reference_image)
+            im.save("test.jpg")
+            refs_x = collect_references_batch(["test.jpg"], vae, resolution)
+        else:
+            raise ValueError(f"Invalid mode: {mode}")
+
+        # 4.3. long video generation
+        for loop_i in range(num_loop):
+            # 4.4 sample in hidden space
+            batch_prompts = [prompt[loop_i] for prompt in prompt_loops]
+            z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
+
+            # 4.5. apply mask strategy
+            masks = None
+
+            # if cfg.reference_path is not None:
+            if loop_i > 0:
+                ref_x = vae.encode(video_clips[-1])
+                for j, refs in enumerate(refs_x):
+                    if refs is None:
+                        refs_x[j] = [ref_x[j]]
+                    else:
+                        refs.append(ref_x[j])
+                    if mask_strategy[j] is None:
+                        mask_strategy[j] = ""
+                    else:
+                        mask_strategy[j] += ";"
+                    mask_strategy[
+                        j
+                    ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
+
+            masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
+
+            # 4.6. diffusion sampling
+            samples = scheduler.sample(
+                stdit,
+                text_encoder,
+                z=z,
+                prompts=batch_prompts,
+                device=device,
+                additional_args=model_args,
+                mask=masks,  # scheduler must support mask
+            )
+            samples = vae.decode(samples.to(dtype))
+            video_clips.append(samples)
+
+            # 4.7. save video
+            if loop_i == num_loop - 1:
+                video_clips_list = [
+                    video_clips[0][0]] + [video_clips[i][0][:, config.condition_frame_length :]
+                    for i in range(1, num_loop)
+                ]
+                video = torch.cat(video_clips_list, dim=1)
+                save_path = f"{args.output}/sample"
+                saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
+                return saved_path
 
 
 def main():
@@ -218,27 +442,48 @@ def main():
 
         with gr.Row():
             with gr.Column():
-                prompt_text = gr.Textbox(show_label=False, placeholder="Describe your video here", lines=4)
-                submit_button = gr.Button("Generate video")
+                mode = gr.Radio(
+                    choices=["Text2Video", "Image2Video"],
+                    value="Text2Video",
+                    label="Usage",
+                    info="Choose your usage scenario",
+                )
+                prompt_text = gr.Textbox(
+                    label="Prompt",
+                    placeholder="Describe your video here",
+                    lines=4,
+                )
+                resolution = gr.Radio(
+                    choices=["144p", "240p", "360p", "480p", "720p", "1080p"],
+                    value="144p",
+                    label="Resolution",
+                )
+                length = gr.Radio(
+                    choices=["2s", "4s", "8s"],
+                    value="2s",
+                    label="Video Length",
+                    info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
+                )
 
+                reference_image = gr.Image(
+                    label="Reference Image (only used for Image2Video)",
+                )
+
             with gr.Column():
-                output_video = gr.Video()
-
-        submit_button.click(fn=run_inference, inputs=[prompt_text], outputs=output_video)
-
-        gr.Examples(
-            examples=[
-                [
-                    "The video captures the majestic beauty of a waterfall cascading down a cliff into a serene lake. The waterfall, with its powerful flow, is the central focus of the video. The surrounding landscape is lush and green, with trees and foliage adding to the natural beauty of the scene. The camera angle provides a bird's eye view of the waterfall, allowing viewers to appreciate the full height and grandeur of the waterfall. The video is a stunning representation of nature's power and beauty.",
-                ],
-            ],
-            fn=run_inference,
-            inputs=[
-                prompt_text,
-            ],
-            outputs=[output_video],
-            cache_examples=True,
-        )
+                output_video = gr.Video(
+                    label="Output Video",
+                    height="100%"
+                )
+
+        with gr.Row():
+            submit_button = gr.Button("Generate video")
+
+
+        submit_button.click(
+            fn=run_inference,
+            inputs=[mode, prompt_text, resolution, length, reference_image],
+            outputs=output_video
+        )
 
     # launch
     demo.launch(server_port=args.port, server_name=args.host, share=args.share)
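
Note on the new demo logic above: run_inference() turns the requested clip length into a number of generation loops and stitches the loops together through a "mask strategy" over latent frames. The snippet below is a standalone, illustrative re-statement of that arithmetic and of the padding done by process_mask_strategy(); pad_mask_group is a hypothetical helper name, the numbers are examples, and nothing here imports opensora.

import math

# Loop count for a "4s" request with the config defaults shown above
# (fps=24, frame_interval=3, num_frames=16 per generated clip).
num_seconds, fps, frame_interval, num_frames = 4, 24, 3, 16
total_frames = num_seconds * fps / frame_interval   # 32.0 frames to cover
num_loop = math.ceil(total_frames / num_frames)     # -> 2 generation loops

# Each ";"-separated mask-strategy group is padded to six fields, which
# apply_mask_strategy() above unpacks as
# (loop id, ref id, ref start, target start, length, edit ratio).
def pad_mask_group(group):
    fields = group.split(",")
    defaults = ["0", "0", "0", "0", "1", "0"]
    return fields + defaults[len(fields):]

print(num_loop)                      # 2
print(pad_mask_group("0"))           # ['0', '0', '0', '0', '1', '0']
print(pad_mask_group("0,0,-4,0,4"))  # ['0', '0', '-4', '0', '4', '0']
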
configs/dit/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/dit/inference/1x256x256-class.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_id.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/dit/inference/1x256x256.py CHANGED
@@ -23,10 +23,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/imagenet_labels.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/dit/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
-grad_checkpoint = False
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
configs/dit/train/1x256x256.py CHANGED
@@ -1,14 +1,15 @@
-num_frames = 1
-frame_interval = 1
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = True
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=1,
+    image_size=(256, 256),
+    transform_name="center",
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = False
 plugin = "zero2"
configs/latte/inference/16x256x256-class.py CHANGED
@@ -21,10 +21,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_id.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/latte/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=4.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/ucf101_labels.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/latte/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
configs/opensora-v1-1/inference/sample-ref.py ADDED
@@ -0,0 +1,62 @@
+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+
+# Condition
+prompt_path = None
+prompt = [
+    "A car driving on the ocean.",
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
+    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
+]
+
+loop = 2
+condition_frame_length = 4
+reference_path = [
+    "https://cdn.openai.com/tmp/s/interp/d0.mp4",
+    None,
+    "assets/images/condition/wave.png",
+]
+# valid when reference_path is not None
+# (loop id, ref id, ref start, length, target start)
+mask_strategy = [
+    "0,0,0,0,8,0.3",
+    None,
+    "0",
+]
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None, # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None, # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3, # or None
+)
+dtype = "bf16"
+
+# Others
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"
configs/opensora-v1-1/inference/sample.py ADDED
@@ -0,0 +1,43 @@
+num_frames = 16
+frame_interval = 3
+fps = 24
+image_size = (240, 426)
+multi_resolution = "STDiT2"
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512,
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    cache_dir=None, # "/mnt/hdd/cached_models",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    cache_dir=None, # "/mnt/hdd/cached_models",
+    model_max_length=200,
+)
+scheduler = dict(
+    type="iddpm",
+    num_sampling_steps=100,
+    cfg_scale=7.0,
+    cfg_channel=3, # or None
+)
+dtype = "bf16"
+
+# Condition
+prompt_path = "./assets/texts/t2v_samples.txt"
+prompt = None # prompt has higher priority than prompt_path
+
+# Others
+batch_size = 1
+seed = 42
+save_dir = "./samples/samples/"
configs/opensora-v1-1/train/benchmark.py ADDED
@@ -0,0 +1,101 @@
+# this file is only for batch size search and is not used for training
+
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+
+# bucket config format:
+# 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
+# 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
+# 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
+# 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
+# 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
+
+bucket_config = {
+    # == manual search ==
+    # "240p": {128: (1.0, 2)}, # 4.28s/it
+    # "240p": {64: (1.0, 4)},
+    # "240p": {32: (1.0, 8)}, # 4.6s/it
+    # "240p": {16: (1.0, 16)}, # 4.6s/it
+    # "480p": {16: (1.0, 4)}, # 4.6s/it
+    # "720p": {16: (1.0, 2)}, # 5.89s/it
+    # "256": {1: (1.0, 256)}, # 4.5s/it
+    # "512": {1: (1.0, 96)}, # 4.7s/it
+    # "512": {1: (1.0, 128)}, # 6.3s/it
+    # "480p": {1: (1.0, 50)}, # 4.0s/it
+    # "1024": {1: (1.0, 32)}, # 6.8s/it
+    # "1024": {1: (1.0, 20)}, # 4.3s/it
+    # "1080p": {1: (1.0, 16)}, # 8.6s/it
+    # "1080p": {1: (1.0, 8)}, # 4.4s/it
+    # == stage 2 ==
+    # "240p": {
+    #     16: (1.0, (2, 32)),
+    #     32: (1.0, (2, 16)),
+    #     64: (1.0, (2, 8)),
+    #     128: (1.0, (2, 6)),
+    # },
+    # "256": {1: (1.0, (128, 300))},
+    # "512": {1: (0.5, (64, 128))},
+    # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
+    # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
+    # "1024": {1: (0.3, (8, 64))},
+    # "1080p": {1: (0.3, (2, 32))},
+    # == stage 3 ==
+    "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
+}
+
+
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0
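
For reference, a minimal illustration of the five accepted value shapes described in the comments at the top of this file (resolution -> {num_frames: value}); the keys and numbers below are invented for illustration, only the shapes matter:

bucket_config_example = {
    "240p": {16: (1.0, 16)},          # 1. (prob, batch_size); batch_size ignored when searching
    "256": {1: (1.0, (128,))},        # 2. (prob, (max_batch_size,)); range start set via CLI
    "480p": {16: (0.4, (2, 32))},     # 3. (prob, (min_batch_size, max_batch_size))
    "512": {1: (0.5, (64, 128, 8))},  # 4. (prob, (min, max, step)); grid search
    "720p": {32: (0.0, None)},        # 5. (0.0, None); bucket not used
}
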
configs/opensora-v1-1/train/image.py ADDED
@@ -0,0 +1,65 @@
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = { # 6s/it
+    "256": {1: (1.0, 256)},
+    "512": {1: (1.0, 80)},
+    "480p": {1: (1.0, 52)},
+    "1024": {1: (1.0, 20)},
+    "1080p": {1: (1.0, 8)},
+}
+
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = 10 # only for logging
+lr = 2e-5
+grad_clip = 1.0
configs/opensora-v1-1/train/stage1.py ADDED
@@ -0,0 +1,77 @@
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+# IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
+bucket_config = { # 1s/it
+    "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
+    "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
+    "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
+    "512": {1: (0.4, 12)},
+    "1024": {1: (0.3, 3)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = False
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0
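
As a quick sanity check, the mask_ratios above form a probability distribution over mask types (a standalone check, not part of the commit):

mask_ratios = {
    "mask_no": 0.75,
    "mask_quarter_random": 0.025,
    "mask_quarter_head": 0.025,
    "mask_quarter_tail": 0.025,
    "mask_quarter_head_tail": 0.05,
    "mask_image_random": 0.025,
    "mask_image_head": 0.025,
    "mask_image_tail": 0.025,
    "mask_image_head_tail": 0.05,
}
assert abs(sum(mask_ratios.values()) - 1.0) < 1e-9  # 0.75 + 2 * (3 * 0.025 + 0.05) = 1.0
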
configs/opensora-v1-1/train/stage2.py ADDED
@@ -0,0 +1,79 @@
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = { # 7s/it
+    "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
+    "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
+    "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
+    "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
+    "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
+    "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
+    "1024": {1: (0.3, 20)},
+    "1080p": {1: (0.4, 8)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0
configs/opensora-v1-1/train/stage3.py ADDED
@@ -0,0 +1,79 @@
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = { # 13s/it
+    "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
+    "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
+    "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
+    "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
+    "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
+    "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
+    "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
+    "1024": {1: (0.3, 40)},
+}
+mask_ratios = {
+    "mask_no": 0.75,
+    "mask_quarter_random": 0.025,
+    "mask_quarter_head": 0.025,
+    "mask_quarter_tail": 0.025,
+    "mask_quarter_head_tail": 0.05,
+    "mask_image_random": 0.025,
+    "mask_image_head": 0.025,
+    "mask_image_tail": 0.025,
+    "mask_image_head_tail": 0.05,
+}
+
+# Define acceleration
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = None
+lr = 2e-5
+grad_clip = 1.0
configs/opensora-v1-1/train/video.py ADDED
@@ -0,0 +1,67 @@
+# Define dataset
+dataset = dict(
+    type="VariableVideoTextDataset",
+    data_path=None,
+    num_frames=None,
+    frame_interval=3,
+    image_size=(None, None),
+    transform_name="resize_crop",
+)
+bucket_config = { # 6s/it
+    "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
+    "256": {1: (1.0, 256)},
+    "512": {1: (0.5, 80)},
+    "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
+    "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now
+    "1024": {1: (0.3, 20)},
+    "1080p": {1: (0.3, 8)},
+}
+
+# Define acceleration
+num_workers = 4
+num_bucket_build_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT2-XL/2",
+    from_pretrained=None,
+    input_sq_size=512, # pretrained model is trained on 512x512
+    qk_norm=True,
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = 10 # only for logging
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/inference/16x256x256.py CHANGED
@@ -25,12 +25,15 @@ scheduler = dict(
     type="iddpm",
     num_sampling_steps=100,
     cfg_scale=7.0,
-    cfg_channel=3, # or None
+    cfg_channel=3, # or None
 )
-dtype = "fp16"
+dtype = "bf16"
+
+# Condition
+prompt_path = "./assets/texts/t2v_samples.txt"
+prompt = None # prompt has higher priority than prompt_path
 
 # Others
 batch_size = 1
 seed = 42
-prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/opensora/inference/16x512x512.py CHANGED
@@ -9,7 +9,7 @@ model = dict(
     time_scale=1.0,
     enable_flashattn=True,
     enable_layernorm_kernel=True,
-    from_pretrained="PRETRAINED_MODEL"
+    from_pretrained="PRETRAINED_MODEL",
 )
 vae = dict(
     type="VideoAutoencoderKL",
@@ -26,10 +26,10 @@
     num_sampling_steps=100,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/opensora/inference/64x512x512.py CHANGED
@@ -26,10 +26,10 @@ scheduler = dict(
     num_sampling_steps=100,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 1
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/opensora/train/16x256x256-mask.py ADDED
@@ -0,0 +1,60 @@
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+mask_ratios = {
+    "mask_no": 0.7,
+    "mask_random": 0.15,
+    "mask_head": 0.05,
+    "mask_tail": 0.05,
+    "mask_head_tail": 0.05,
+}
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/16x256x256-spee.py ADDED
@@ -0,0 +1,60 @@
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
+    enable_layernorm_kernel=True,
+)
+mask_ratios = {
+    "mask_no": 0.5,
+    "mask_random": 0.29,
+    "mask_head": 0.07,
+    "mask_tail": 0.07,
+    "mask_head_tail": 0.07,
+}
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="iddpm-speed",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 8
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
configs/opensora/train/16x512x512.py CHANGED
@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
-grad_checkpoint = False
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
configs/opensora/train/360x512x512.py CHANGED
@@ -1,12 +1,18 @@
-num_frames = 360
-frame_interval = 1
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=360,
+    frame_interval=3,
+    image_size=(512, 512),
+)
+
+# Define acceleration
 num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
 
 # Define acceleration
 dtype = "bf16"
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -1,17 +1,17 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
-plugin = "zero2-seq"
+plugin = "zero2"
 sp_size = 2
 
 # Define model
configs/opensora/train/64x512x512.py CHANGED
@@ -1,14 +1,14 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=64,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
configs/pixart/inference/16x256x256.py CHANGED
@@ -23,10 +23,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2v_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/pixart/inference/1x1024MS.py CHANGED
@@ -1,7 +1,7 @@
 num_frames = 1
 fps = 1
 image_size = (1920, 512)
-multi_resolution = True
+multi_resolution = "PixArtMS"
 
 # Define model
 model = dict(
@@ -25,10 +25,10 @@
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/pixart/inference/1x256x256.py CHANGED
@@ -24,10 +24,10 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
 
 # Others
 batch_size = 2
 seed = 42
 prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/pixart/inference/1x512x512.py CHANGED
@@ -24,10 +24,16 @@ scheduler = dict(
     num_sampling_steps=20,
     cfg_scale=7.0,
 )
-dtype = "fp16"
+dtype = "bf16"
+
+# prompt_path = "./assets/texts/t2i_samples.txt"
+prompt = [
+    "Pirate ship trapped in a cosmic maelstrom nebula.",
+    "A small cactus with a happy face in the Sahara desert.",
+    "A small cactus with a sad face in the Sahara desert.",
+]
 
 # Others
 batch_size = 2
 seed = 42
-prompt_path = "./assets/texts/t2i_samples.txt"
-save_dir = "./outputs/samples/"
+save_dir = "./samples/samples/"
configs/pixart/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
-num_frames = 16
-frame_interval = 3
-image_size = (256, 256)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
-grad_checkpoint = False
+grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
configs/pixart/train/1x512x512.py CHANGED
@@ -1,14 +1,14 @@
-num_frames = 1
-frame_interval = 1
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = True
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=3,
+    image_size=(512, 512),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
configs/pixart/train/64x512x512.py CHANGED
@@ -1,19 +1,20 @@
-num_frames = 64
-frame_interval = 2
-image_size = (512, 512)
-
 # Define dataset
-root = None
-data_path = "CSV_PATH"
-use_image_transform = False
-num_workers = 4
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=64,
+    frame_interval=3,
+    image_size=(256, 256),
+)
 
 # Define acceleration
+num_workers = 4
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
 sp_size = 1
 
+
 # Define model
 model = dict(
     type="PixArt-XL/2",
requirements.txt CHANGED
@@ -1,3 +1,3 @@
 xformers
-git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
 transformers
+git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora