frankleeeee committed on
Commit
9670e85
1 Parent(s): 1cf10f7
Files changed (49)
  1. app.py +393 -240
  2. configs/dit/train/16x256x256.py +1 -1
  3. configs/dit/train/1x256x256.py +1 -1
  4. configs/latte/train/16x256x256.py +1 -1
  5. configs/opensora-v1-1/inference/sample-ref.py +19 -17
  6. configs/opensora-v1-1/inference/sample.py +3 -2
  7. configs/opensora-v1-1/train/benchmark.py +2 -1
  8. configs/opensora-v1-1/train/image.py +2 -1
  9. configs/opensora-v1-1/train/image_rflow.py +88 -0
  10. configs/opensora-v1-1/train/stage1.py +11 -10
  11. configs/opensora-v1-1/train/stage2.py +11 -10
  12. configs/opensora-v1-1/train/stage3.py +11 -10
  13. configs/opensora-v1-1/train/video.py +2 -1
  14. configs/opensora-v1-2/inference/sample.py +43 -0
  15. configs/opensora-v1-2/misc/bs.py +117 -0
  16. configs/opensora-v1-2/misc/eval_loss.py +49 -0
  17. configs/opensora-v1-2/misc/extract.py +62 -0
  18. configs/opensora-v1-2/misc/feat.py +94 -0
  19. configs/opensora-v1-2/train/adapt.py +84 -0
  20. configs/opensora-v1-2/train/stage1.py +111 -0
  21. configs/opensora-v1-2/train/stage1_feat.py +59 -0
  22. configs/opensora-v1-2/train/stage2.py +91 -0
  23. configs/opensora-v1-2/train/stage3.py +92 -0
  24. configs/opensora/inference/16x256x256.py +1 -1
  25. configs/opensora/inference/16x512x512-rflow.py +35 -0
  26. configs/opensora/inference/16x512x512.py +1 -1
  27. configs/opensora/inference/64x512x512.py +1 -1
  28. configs/opensora/train/16x256x256-mask.py +3 -3
  29. configs/opensora/train/16x256x256-spee-rflow.py +64 -0
  30. configs/opensora/train/16x256x256-spee.py +3 -3
  31. configs/opensora/train/16x256x256.py +2 -2
  32. configs/opensora/train/16x512x512.py +1 -1
  33. configs/opensora/train/360x512x512.py +1 -1
  34. configs/opensora/train/64x512x512-sp.py +1 -1
  35. configs/opensora/train/64x512x512.py +1 -1
  36. configs/pixart/inference/1x20481B.py +36 -0
  37. configs/pixart/inference/1x2048MS.py +36 -0
  38. configs/pixart/inference/1x512x512-rflow.py +39 -0
  39. configs/pixart/train/16x256x256.py +1 -1
  40. configs/pixart/train/1x2048x2048.py +54 -0
  41. configs/pixart/train/1x512x512-rflow.py +55 -0
  42. configs/pixart/train/1x512x512.py +1 -1
  43. configs/pixart/train/64x512x512.py +1 -1
  44. configs/vae/inference/image.py +32 -0
  45. configs/vae/inference/video.py +32 -0
  46. configs/vae/train/image.py +58 -0
  47. configs/vae/train/video.py +58 -0
  48. configs/vae/train/video_disc.py +75 -0
  49. requirements.txt +1 -1
app.py CHANGED
@@ -11,147 +11,25 @@ import importlib
11
  import os
12
  import subprocess
13
  import sys
14
- import re
15
- import json
16
- import math
17
-
18
  import spaces
19
  import torch
20
 
21
  import gradio as gr
 
 
 
22
 
23
 
24
- MODEL_TYPES = ["v1.1"]
25
  CONFIG_MAP = {
26
- "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
27
- "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
28
  }
29
  HF_STDIT_MAP = {
30
- "v1.1-stage2": "hpcai-tech/OpenSora-STDiT-v2-stage2",
31
- "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
32
- }
33
- RESOLUTION_MAP = {
34
- "144p": (144, 256),
35
- "240p": (240, 426),
36
- "360p": (360, 480),
37
- "480p": (480, 858),
38
- "720p": (720, 1280),
39
- "1080p": (1080, 1920)
40
  }
41
 
42
-
43
- # ============================
44
- # Utils
45
  # ============================
46
- def collect_references_batch(reference_paths, vae, image_size):
47
- from opensora.datasets.utils import read_from_path
48
-
49
- refs_x = []
50
- for reference_path in reference_paths:
51
- if reference_path is None:
52
- refs_x.append([])
53
- continue
54
- ref_path = reference_path.split(";")
55
- ref = []
56
- for r_path in ref_path:
57
- r = read_from_path(r_path, image_size, transform_name="resize_crop")
58
- r_x = vae.encode(r.unsqueeze(0).to(vae.device, vae.dtype))
59
- r_x = r_x.squeeze(0)
60
- ref.append(r_x)
61
- refs_x.append(ref)
62
- # refs_x: [batch, ref_num, C, T, H, W]
63
- return refs_x
64
-
65
-
66
- def process_mask_strategy(mask_strategy):
67
- mask_batch = []
68
- mask_strategy = mask_strategy.split(";")
69
- for mask in mask_strategy:
70
- mask_group = mask.split(",")
71
- assert len(mask_group) >= 1 and len(mask_group) <= 6, f"Invalid mask strategy: {mask}"
72
- if len(mask_group) == 1:
73
- mask_group.extend(["0", "0", "0", "1", "0"])
74
- elif len(mask_group) == 2:
75
- mask_group.extend(["0", "0", "1", "0"])
76
- elif len(mask_group) == 3:
77
- mask_group.extend(["0", "1", "0"])
78
- elif len(mask_group) == 4:
79
- mask_group.extend(["1", "0"])
80
- elif len(mask_group) == 5:
81
- mask_group.append("0")
82
- mask_batch.append(mask_group)
83
- return mask_batch
84
-
85
-
86
- def apply_mask_strategy(z, refs_x, mask_strategys, loop_i):
87
- masks = []
88
- for i, mask_strategy in enumerate(mask_strategys):
89
- mask = torch.ones(z.shape[2], dtype=torch.float, device=z.device)
90
- if mask_strategy is None:
91
- masks.append(mask)
92
- continue
93
- mask_strategy = process_mask_strategy(mask_strategy)
94
- for mst in mask_strategy:
95
- loop_id, m_id, m_ref_start, m_target_start, m_length, edit_ratio = mst
96
- loop_id = int(loop_id)
97
- if loop_id != loop_i:
98
- continue
99
- m_id = int(m_id)
100
- m_ref_start = int(m_ref_start)
101
- m_length = int(m_length)
102
- m_target_start = int(m_target_start)
103
- edit_ratio = float(edit_ratio)
104
- ref = refs_x[i][m_id] # [C, T, H, W]
105
- if m_ref_start < 0:
106
- m_ref_start = ref.shape[1] + m_ref_start
107
- if m_target_start < 0:
108
- # z: [B, C, T, H, W]
109
- m_target_start = z.shape[2] + m_target_start
110
- z[i, :, m_target_start : m_target_start + m_length] = ref[:, m_ref_start : m_ref_start + m_length]
111
- mask[m_target_start : m_target_start + m_length] = edit_ratio
112
- masks.append(mask)
113
- masks = torch.stack(masks)
114
- return masks
115
-
116
-
117
- def process_prompts(prompts, num_loop):
118
- from opensora.models.text_encoder.t5 import text_preprocessing
119
-
120
- ret_prompts = []
121
- for prompt in prompts:
122
- if prompt.startswith("|0|"):
123
- prompt_list = prompt.split("|")[1:]
124
- text_list = []
125
- for i in range(0, len(prompt_list), 2):
126
- start_loop = int(prompt_list[i])
127
- text = prompt_list[i + 1]
128
- text = text_preprocessing(text)
129
- end_loop = int(prompt_list[i + 2]) if i + 2 < len(prompt_list) else num_loop
130
- text_list.extend([text] * (end_loop - start_loop))
131
- assert len(text_list) == num_loop, f"Prompt loop mismatch: {len(text_list)} != {num_loop}"
132
- ret_prompts.append(text_list)
133
- else:
134
- prompt = text_preprocessing(prompt)
135
- ret_prompts.append([prompt] * num_loop)
136
- return ret_prompts
137
-
138
-
139
- def extract_json_from_prompts(prompts):
140
- additional_infos = []
141
- ret_prompts = []
142
- for prompt in prompts:
143
- parts = re.split(r"(?=[{\[])", prompt)
144
- assert len(parts) <= 2, f"Invalid prompt: {prompt}"
145
- ret_prompts.append(parts[0])
146
- if len(parts) == 1:
147
- additional_infos.append({})
148
- else:
149
- additional_infos.append(json.loads(parts[1]))
150
- return ret_prompts, additional_infos
151
-
152
-
153
- # ============================
154
- # Runtime Environment
155
  # ============================
156
  def install_dependencies(enable_optimization=False):
157
  """
@@ -223,13 +101,9 @@ def build_models(model_type, config, enable_optimization=False):
223
  # build stdit
224
  # we load model from HuggingFace directly so that we don't need to
225
  # handle model download logic in HuggingFace Space
226
- from transformers import AutoModel
227
-
228
- stdit = AutoModel.from_pretrained(
229
- HF_STDIT_MAP[model_type],
230
- enable_flash_attn=enable_optimization,
231
- trust_remote_code=True,
232
- ).cuda()
233
 
234
  # build scheduler
235
  from opensora.registry import SCHEDULERS
@@ -253,13 +127,13 @@ def parse_args():
253
  parser = argparse.ArgumentParser()
254
  parser.add_argument(
255
  "--model-type",
256
- default="v1.1-stage3",
257
  choices=MODEL_TYPES,
258
  help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
259
  )
260
  parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
261
  parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
262
- parser.add_argument("--host", default=None, type=str, help="The host to run the Gradio App on.")
263
  parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
264
  parser.add_argument(
265
  "--enable-optimization",
@@ -279,6 +153,8 @@ def parse_args():
279
  # read config
280
  args = parse_args()
281
  config = read_config(CONFIG_MAP[args.model_type])
 
 
282
 
283
  # make outputs dir
284
  os.makedirs(args.output, exist_ok=True)
@@ -293,6 +169,24 @@ install_dependencies(enable_optimization=args.enable_optimization)
293
  # import after installation
294
  from opensora.datasets import IMG_FPS, save_sample
295
  from opensora.utils.misc import to_torch_dtype
 
 
296
 
297
  # some global variables
298
  dtype = to_torch_dtype(config.dtype)
@@ -302,117 +196,278 @@ device = torch.device("cuda")
302
  vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
303
 
304
 
305
- @spaces.GPU(duration=200)
306
- def run_inference(mode, prompt_text, resolution, length, reference_image):
 
 
 
 
307
  with torch.inference_mode():
308
  # ======================
309
- # 1. Preparation
310
  # ======================
311
  # parse the inputs
312
- resolution = RESOLUTION_MAP[resolution]
 
 
313
 
314
- # compute number of loops
315
- num_seconds = int(length.rstrip('s'))
316
- total_number_of_frames = num_seconds * config.fps / config.frame_interval
317
- num_loop = math.ceil(total_number_of_frames / config.num_frames)
318
-
319
- # prepare model args
320
- model_args = dict()
321
- height = torch.tensor([resolution[0]], device=device, dtype=dtype)
322
- width = torch.tensor([resolution[1]], device=device, dtype=dtype)
323
- num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
324
- ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
325
- if config.num_frames == 1:
326
- config.fps = IMG_FPS
327
- fps = torch.tensor([config.fps], device=device, dtype=dtype)
328
- model_args["height"] = height
329
- model_args["width"] = width
330
- model_args["num_frames"] = num_frames
331
- model_args["ar"] = ar
332
- model_args["fps"] = fps
333
-
334
- # compute latent size
335
- input_size = (config.num_frames, *resolution)
336
  latent_size = vae.get_latent_size(input_size)
337
-
338
- # process prompt
339
- prompt_raw = [prompt_text]
340
- prompt_raw, _ = extract_json_from_prompts(prompt_raw)
341
- prompt_loops = process_prompts(prompt_raw, num_loop)
342
- video_clips = []
343
-
344
- # prepare mask strategy
345
- if mode == "Text2Video":
346
  mask_strategy = [None]
347
- elif mode == "Image2Video":
348
- mask_strategy = ['0']
 
 
 
349
  else:
350
  raise ValueError(f"Invalid mode: {mode}")
351
-
352
- # =========================
353
- # 2. Load reference images
354
- # =========================
355
- if mode == "Text2Video":
356
- refs_x = collect_references_batch([None], vae, resolution)
357
- elif mode == "Image2Video":
358
- # save image to disk
359
- from PIL import Image
360
- im = Image.fromarray(reference_image)
361
- im.save("test.jpg")
362
- refs_x = collect_references_batch(["test.jpg"], vae, resolution)
 
 
363
  else:
364
  raise ValueError(f"Invalid mode: {mode}")
 
 
 
 
365
 
366
- # 4.3. long video generation
367
- for loop_i in range(num_loop):
368
- # 4.4 sample in hidden space
369
- batch_prompts = [prompt[loop_i] for prompt in prompt_loops]
370
- z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
371
 
372
- # 4.5. apply mask strategy
373
- masks = None
 
 
374
 
375
- # if cfg.reference_path is not None:
376
- if loop_i > 0:
377
- ref_x = vae.encode(video_clips[-1])
378
- for j, refs in enumerate(refs_x):
379
- if refs is None:
380
- refs_x[j] = [ref_x[j]]
381
- else:
382
- refs.append(ref_x[j])
383
- if mask_strategy[j] is None:
384
- mask_strategy[j] = ""
385
- else:
386
- mask_strategy[j] += ";"
387
- mask_strategy[
388
- j
389
- ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
390
-
391
- masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
392
 
 
393
  # 4.6. diffusion sampling
 
 
394
  samples = scheduler.sample(
395
  stdit,
396
  text_encoder,
397
  z=z,
398
- prompts=batch_prompts,
399
  device=device,
400
  additional_args=model_args,
401
- mask=masks, # scheduler must support mask
 
402
  )
403
- samples = vae.decode(samples.to(dtype))
404
  video_clips.append(samples)
405
-
406
- # 4.7. save video
407
- if loop_i == num_loop - 1:
408
- video_clips_list = [
409
- video_clips[0][0]] + [video_clips[i][0][:, config.condition_frame_length :]
410
- for i in range(1, num_loop)
411
- ]
412
- video = torch.cat(video_clips_list, dim=1)
413
- save_path = f"{args.output}/sample"
414
- saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
 
 
415
  return saved_path
 
 
416
 
417
 
418
  def main():
@@ -442,31 +497,119 @@ def main():
442
 
443
  with gr.Row():
444
  with gr.Column():
445
- mode = gr.Radio(
446
- choices=["Text2Video", "Image2Video"],
447
- value="Text2Video",
448
- label="Usage",
449
- info="Choose your usage scenario",
450
- )
451
  prompt_text = gr.Textbox(
452
  label="Prompt",
453
  placeholder="Describe your video here",
454
- lines=4,
455
  )
 
 
 
 
456
  resolution = gr.Radio(
457
- choices=["144p", "240p", "360p", "480p", "720p", "1080p"],
458
- value="144p",
459
  label="Resolution",
460
  )
 
 
 
 
 
461
  length = gr.Radio(
462
- choices=["2s", "4s", "8s"],
463
  value="2s",
464
  label="Video Length",
465
- info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
466
  )
467
 
 
468
  reference_image = gr.Image(
469
- label="Reference Image (only used for Image2Video)",
 
470
  )
471
 
472
  with gr.Column():
@@ -476,14 +619,24 @@ def main():
476
  )
477
 
478
  with gr.Row():
479
- submit_button = gr.Button("Generate video")
 
480
 
481
 
482
- submit_button.click(
483
- fn=run_inference,
484
- inputs=[mode, prompt_text, resolution, length, reference_image],
 
 
 
 
 
485
  outputs=output_video
486
  )
 
 
 
 
487
 
488
  # launch
489
  demo.launch(server_port=args.port, server_name=args.host, share=args.share)
 
11
  import os
12
  import subprocess
13
  import sys
 
 
 
 
14
  import spaces
15
  import torch
16
 
17
  import gradio as gr
18
+ from tempfile import NamedTemporaryFile
19
+ import datetime
20
+
21
 
22
 
23
+ MODEL_TYPES = ["v1.2-stage3"]
24
  CONFIG_MAP = {
25
+ "v1.2-stage3": "configs/opensora-v1-2/inference/sample.py",
 
26
  }
27
  HF_STDIT_MAP = {
28
+ "v1.2-stage3": "hpcai-tech/OpenSora-STDiT-v3"
 
 
29
  }
30
 
 
 
 
31
  # ============================
32
+ # Prepare Runtime Environment
 
 
33
  # ============================
34
  def install_dependencies(enable_optimization=False):
35
  """
 
101
  # build stdit
102
  # we load model from HuggingFace directly so that we don't need to
103
  # handle model download logic in HuggingFace Space
104
+ from opensora.models.stdit.stdit3 import STDiT3
105
+ stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type])
106
+ stdit = stdit.cuda()
 
 
 
 
107
 
108
  # build scheduler
109
  from opensora.registry import SCHEDULERS
 
127
  parser = argparse.ArgumentParser()
128
  parser.add_argument(
129
  "--model-type",
130
+ default="v1.2-stage3",
131
  choices=MODEL_TYPES,
132
  help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
133
  )
134
  parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
135
  parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
136
+ parser.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.")
137
  parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
138
  parser.add_argument(
139
  "--enable-optimization",
 
153
  # read config
154
  args = parse_args()
155
  config = read_config(CONFIG_MAP[args.model_type])
156
+ torch.backends.cuda.matmul.allow_tf32 = True
157
+ torch.backends.cudnn.allow_tf32 = True
158
 
159
  # make outputs dir
160
  os.makedirs(args.output, exist_ok=True)
 
169
  # import after installation
170
  from opensora.datasets import IMG_FPS, save_sample
171
  from opensora.utils.misc import to_torch_dtype
172
+ from opensora.utils.inference_utils import (
173
+ append_generated,
174
+ apply_mask_strategy,
175
+ collect_references_batch,
176
+ extract_json_from_prompts,
177
+ extract_prompts_loop,
178
+ prepare_multi_resolution_info,
179
+ dframe_to_frame,
180
+ append_score_to_prompts,
181
+ has_openai_key,
182
+ refine_prompts_by_openai,
183
+ add_watermark,
184
+ get_random_prompt_by_openai,
185
+ split_prompt,
186
+ merge_prompt
187
+ )
188
+ from opensora.models.text_encoder.t5 import text_preprocessing
189
+ from opensora.datasets.aspect import get_image_size, get_num_frames
190
 
191
  # some global variables
192
  dtype = to_torch_dtype(config.dtype)
 
196
  vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
197
 
198
 
199
+ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale):
200
+ if prompt_text is None or prompt_text == "":
201
+ gr.Warning("Your prompt is empty, please enter a valid prompt")
202
+ return None
203
+
204
+ torch.manual_seed(seed)
205
  with torch.inference_mode():
206
  # ======================
207
+ # 1. Preparation arguments
208
  # ======================
209
  # parse the inputs
210
+ # frame_interval must be 1 so we ignore it here
211
+ image_size = get_image_size(resolution, aspect_ratio)
212
+
213
+ # compute generation parameters
214
+ if mode == "Text2Image":
215
+ num_frames = 1
216
+ fps = IMG_FPS
217
+ else:
218
+ num_frames = config.num_frames
219
+ num_frames = get_num_frames(length)
220
+
221
+ condition_frame_length = int(num_frames / 17 * 5 / 3)
222
+ condition_frame_edit = 0.0
223
 
224
+ input_size = (num_frames, *image_size)
 
 
225
  latent_size = vae.get_latent_size(input_size)
226
+ multi_resolution = "OpenSora"
227
+ align = 5
228
+
229
+ # == prepare mask strategy ==
230
+ if mode == "Text2Image":
 
 
 
 
231
  mask_strategy = [None]
232
+ elif mode == "Text2Video":
233
+ if reference_image is not None:
234
+ mask_strategy = ['0']
235
+ else:
236
+ mask_strategy = [None]
237
  else:
238
  raise ValueError(f"Invalid mode: {mode}")
239
+
240
+ # == prepare reference ==
241
+ if mode == "Text2Image":
242
+ refs = [""]
243
+ elif mode == "Text2Video":
244
+ if reference_image is not None:
245
+ # save image to disk
246
+ from PIL import Image
247
+ im = Image.fromarray(reference_image)
248
+ temp_file = NamedTemporaryFile(suffix=".png")
249
+ im.save(temp_file.name)
250
+ refs = [temp_file.name]
251
+ else:
252
+ refs = [""]
253
  else:
254
  raise ValueError(f"Invalid mode: {mode}")
255
+
256
+ # == get json from prompts ==
257
+ batch_prompts = [prompt_text]
258
+ batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy)
259
 
260
+ # == get reference for condition ==
261
+ refs = collect_references_batch(refs, vae, image_size)
 
 
 
262
 
263
+ # == multi-resolution info ==
264
+ model_args = prepare_multi_resolution_info(
265
+ multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
266
+ )
267
+
268
+ # == process prompts step by step ==
269
+ # 0. split prompt
270
+ # each element in the list is [prompt_segment_list, loop_idx_list]
271
+ batched_prompt_segment_list = []
272
+ batched_loop_idx_list = []
273
+ for prompt in batch_prompts:
274
+ prompt_segment_list, loop_idx_list = split_prompt(prompt)
275
+ batched_prompt_segment_list.append(prompt_segment_list)
276
+ batched_loop_idx_list.append(loop_idx_list)
277
+
278
+ # 1. refine prompt by openai
279
+ if refine_prompt:
280
+ # check if openai key is provided
281
+ if not has_openai_key():
282
+ gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.")
283
+ else:
284
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
285
+ batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list)
286
+
287
+ # process scores
288
+ aesthetic_score = aesthetic_score if use_aesthetic_score else None
289
+ motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None
290
+ camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion
291
+ # 2. append score
292
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
293
+ batched_prompt_segment_list[idx] = append_score_to_prompts(
294
+ prompt_segment_list,
295
+ aes=aesthetic_score,
296
+ flow=motion_strength,
297
+ camera_motion=camera_motion,
298
+ )
299
 
300
+ # 3. clean prompt with T5
301
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
302
+ batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list]
303
+
304
+ # 4. merge to obtain the final prompt
305
+ batch_prompts = []
306
+ for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list):
307
+ batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list))
308
+
 
 
 
 
 
 
 
 
309
 
310
+ # =========================
311
+ # Generate image/video
312
+ # =========================
313
+ video_clips = []
314
+
315
+ for loop_i in range(num_loop):
316
+ # 4.4 sample in hidden space
317
+ batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)
318
+
319
+ # == loop ==
320
+ if loop_i > 0:
321
+ refs, mask_strategy = append_generated(
322
+ vae,
323
+ video_clips[-1],
324
+ refs,
325
+ mask_strategy,
326
+ loop_i,
327
+ condition_frame_length,
328
+ condition_frame_edit
329
+ )
330
+
331
+ # == sampling ==
332
+ z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
333
+ masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align)
334
+
335
  # 4.6. diffusion sampling
336
+ # hack to update num_sampling_steps and cfg_scale
337
+ scheduler_kwargs = config.scheduler.copy()
338
+ scheduler_kwargs.pop('type')
339
+ scheduler_kwargs['num_sampling_steps'] = sampling_steps
340
+ scheduler_kwargs['cfg_scale'] = cfg_scale
341
+
342
+ scheduler.__init__(
343
+ **scheduler_kwargs
344
+ )
345
  samples = scheduler.sample(
346
  stdit,
347
  text_encoder,
348
  z=z,
349
+ prompts=batch_prompts_loop,
350
  device=device,
351
  additional_args=model_args,
352
+ progress=True,
353
+ mask=masks,
354
  )
355
+ samples = vae.decode(samples.to(dtype), num_frames=num_frames)
356
  video_clips.append(samples)
357
+
358
+ # =========================
359
+ # Save output
360
+ # =========================
361
+ video_clips = [val[0] for val in video_clips]
362
+ for i in range(1, num_loop):
363
+ video_clips[i] = video_clips[i][:, dframe_to_frame(condition_frame_length) :]
364
+ video = torch.cat(video_clips, dim=1)
365
+ current_datetime = datetime.datetime.now()
366
+ timestamp = current_datetime.timestamp()
367
+ save_path = os.path.join(args.output, f"output_{timestamp}")
368
+ saved_path = save_sample(video, save_path=save_path, fps=24)
369
+ torch.cuda.empty_cache()
370
+
371
+ # add watermark
372
+ # all watermarked videos should have a _watermarked suffix
373
+ if mode != "Text2Image":
374
+ watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4")
375
+ success = add_watermark(saved_path, "./assets/images/watermark/watermark.png", watermarked_path)
376
+ if success:
377
+ return watermarked_path
378
+ else:
379
  return saved_path
380
+ else:
381
+ return saved_path
382
+
383
+
384
+ @spaces.GPU(duration=200)
385
+ def run_image_inference(
386
+ prompt_text,
387
+ resolution,
388
+ aspect_ratio,
389
+ length,
390
+ motion_strength,
391
+ aesthetic_score,
392
+ use_motion_strength,
393
+ use_aesthetic_score,
394
+ camera_motion,
395
+ reference_image,
396
+ refine_prompt,
397
+ fps,
398
+ num_loop,
399
+ seed,
400
+ sampling_steps,
401
+ cfg_scale):
402
+ return run_inference(
403
+ "Text2Image",
404
+ prompt_text,
405
+ resolution,
406
+ aspect_ratio,
407
+ length,
408
+ motion_strength,
409
+ aesthetic_score,
410
+ use_motion_strength,
411
+ use_aesthetic_score,
412
+ camera_motion,
413
+ reference_image,
414
+ refine_prompt,
415
+ fps,
416
+ num_loop,
417
+ seed,
418
+ sampling_steps,
419
+ cfg_scale)
420
+
421
+ @spaces.GPU(duration=200)
422
+ def run_video_inference(
423
+ prompt_text,
424
+ resolution,
425
+ aspect_ratio,
426
+ length,
427
+ motion_strength,
428
+ aesthetic_score,
429
+ use_motion_strength,
430
+ use_aesthetic_score,
431
+ camera_motion,
432
+ reference_image,
433
+ refine_prompt,
434
+ fps,
435
+ num_loop,
436
+ seed,
437
+ sampling_steps,
438
+ cfg_scale):
439
+ # if (resolution == "480p" and length == "16s") or \
440
+ # (resolution == "720p" and length in ["8s", "16s"]):
441
+ # gr.Warning("Generation is interrupted as the combination of 480p and 16s will lead to CUDA out of memory")
442
+ # else:
443
+ return run_inference(
444
+ "Text2Video",
445
+ prompt_text,
446
+ resolution,
447
+ aspect_ratio,
448
+ length,
449
+ motion_strength,
450
+ aesthetic_score,
451
+ use_motion_strength,
452
+ use_aesthetic_score,
453
+ camera_motion,
454
+ reference_image,
455
+ refine_prompt,
456
+ fps,
457
+ num_loop,
458
+ seed,
459
+ sampling_steps,
460
+ cfg_scale
461
+ )
462
+
463
+
464
+ def generate_random_prompt():
465
+ if "OPENAI_API_KEY" not in os.environ:
466
+ gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt")
467
+ return None
468
+ else:
469
+ prompt_text = get_random_prompt_by_openai()
470
+ return prompt_text
471
 
472
 
473
  def main():
 
497
 
498
  with gr.Row():
499
  with gr.Column():
 
 
 
 
 
 
500
  prompt_text = gr.Textbox(
501
  label="Prompt",
502
  placeholder="Describe your video here",
503
+ lines=4
504
  )
505
+ refine_prompt = gr.Checkbox(value=True, label="Refine prompt with GPT4o")
506
+ random_prompt_btn = gr.Button("Random Prompt By GPT4o")
507
+
508
+ gr.Markdown("## Basic Settings")
509
  resolution = gr.Radio(
510
+ choices=["144p", "240p", "360p", "480p", "720p"],
511
+ value="480p",
512
  label="Resolution",
513
  )
514
+ aspect_ratio = gr.Radio(
515
+ choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
516
+ value="9:16",
517
+ label="Aspect Ratio (H:W)",
518
+ )
519
  length = gr.Radio(
520
+ choices=["2s", "4s", "8s", "16s"],
521
  value="2s",
522
  label="Video Length",
523
+ info="only effective for video generation, 8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
524
  )
525
 
526
+ with gr.Row():
527
+ seed = gr.Slider(
528
+ value=1024,
529
+ minimum=1,
530
+ maximum=2048,
531
+ step=1,
532
+ label="Seed"
533
+ )
534
+
535
+ sampling_steps = gr.Slider(
536
+ value=30,
537
+ minimum=1,
538
+ maximum=200,
539
+ step=1,
540
+ label="Sampling steps"
541
+ )
542
+ cfg_scale = gr.Slider(
543
+ value=7.0,
544
+ minimum=0.0,
545
+ maximum=10.0,
546
+ step=0.1,
547
+ label="CFG Scale"
548
+ )
549
+
550
+ with gr.Row():
551
+ with gr.Column():
552
+ motion_strength = gr.Slider(
553
+ value=5,
554
+ minimum=0,
555
+ maximum=100,
556
+ step=1,
557
+ label="Motion Strength",
558
+ info="only effective for video generation"
559
+ )
560
+ use_motion_strength = gr.Checkbox(value=False, label="Enable")
561
+
562
+ with gr.Column():
563
+ aesthetic_score = gr.Slider(
564
+ value=6.5,
565
+ minimum=4,
566
+ maximum=7,
567
+ step=0.1,
568
+ label="Aesthetic",
569
+ info="effective for text & video generation"
570
+ )
571
+ use_aesthetic_score = gr.Checkbox(value=True, label="Enable")
572
+
573
+ camera_motion = gr.Radio(
574
+ value="none",
575
+ label="Camera Motion",
576
+ choices=[
577
+ "none",
578
+ "pan right",
579
+ "pan left",
580
+ "tilt up",
581
+ "tilt down",
582
+ "zoom in",
583
+ "zoom out",
584
+ "static"
585
+ ],
586
+ interactive=True
587
+ )
588
+
589
+ gr.Markdown("## Advanced Settings")
590
+ with gr.Row():
591
+ fps = gr.Slider(
592
+ value=24,
593
+ minimum=1,
594
+ maximum=60,
595
+ step=1,
596
+ label="FPS",
597
+ info="This is the frames per seconds for video generation, keep it to 24 if you are not sure"
598
+ )
599
+ num_loop = gr.Slider(
600
+ value=1,
601
+ minimum=1,
602
+ maximum=20,
603
+ step=1,
604
+ label="Number of Loops",
605
+ info="This will change the length of the generated video, keep it to 1 if you are not sure"
606
+ )
607
+
608
+
609
+ gr.Markdown("## Reference Image")
610
  reference_image = gr.Image(
611
+ label="Image (optional)",
612
+ show_download_button=True
613
  )
614
 
615
  with gr.Column():
 
619
  )
620
 
621
  with gr.Row():
622
+ image_gen_button = gr.Button("Generate image")
623
+ video_gen_button = gr.Button("Generate video")
624
 
625
 
626
+ image_gen_button.click(
627
+ fn=run_image_inference,
628
+ inputs=[prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale],
629
+ outputs=reference_image
630
+ )
631
+ video_gen_button.click(
632
+ fn=run_video_inference,
633
+ inputs=[prompt_text, resolution, aspect_ratio, length, motion_strength, aesthetic_score, use_motion_strength, use_aesthetic_score, camera_motion, reference_image, refine_prompt, fps, num_loop, seed, sampling_steps, cfg_scale],
634
  outputs=output_video
635
  )
636
+ random_prompt_btn.click(
637
+ fn=generate_random_prompt,
638
+ outputs=prompt_text
639
+ )
640
 
641
  # launch
642
  demo.launch(server_port=args.port, server_name=args.host, share=args.share)
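
A note on the sampling controls wired up above: run_inference overrides num_sampling_steps and cfg_scale per request by copying config.scheduler, popping the registry "type" key, and calling scheduler.__init__ with the updated kwargs (the "hack" comment in the diff). Below is a minimal, self-contained sketch of that pattern that assumes nothing about the real scheduler class: DummyScheduler and rebuild_scheduler are illustrative stand-ins, not part of the opensora API, and the default kwargs mirror the scheduler dict in configs/opensora-v1-2/inference/sample.py.

# Sketch of the per-request override pattern used in run_inference above.
# DummyScheduler is a placeholder for whatever SCHEDULERS builds from the
# "rflow" entry; only the kwargs handling is the point here.
class DummyScheduler:
    def __init__(self, num_sampling_steps=30, cfg_scale=7.0, use_timestep_transform=True):
        self.num_sampling_steps = num_sampling_steps
        self.cfg_scale = cfg_scale
        self.use_timestep_transform = use_timestep_transform

config_scheduler = dict(type="rflow", use_timestep_transform=True, num_sampling_steps=30, cfg_scale=7.0)

def rebuild_scheduler(scheduler, scheduler_cfg, sampling_steps, cfg_scale):
    kwargs = scheduler_cfg.copy()   # keep the shared config untouched between requests
    kwargs.pop("type")              # registry key, not a constructor argument
    kwargs["num_sampling_steps"] = sampling_steps
    kwargs["cfg_scale"] = cfg_scale
    scheduler.__init__(**kwargs)    # same in-place re-init as the diff's "hack"
    return scheduler

scheduler = rebuild_scheduler(DummyScheduler(), config_scheduler, sampling_steps=50, cfg_scale=6.0)
assert scheduler.num_sampling_steps == 50 and scheduler.cfg_scale == 6.0

For reference, with the 51-frame default from that config, the new condition-frame formula int(num_frames / 17 * 5 / 3) evaluates to int(51 / 17 * 5 / 3) = 5, which matches condition_frame_length = 5 there.
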
configs/dit/train/16x256x256.py CHANGED
@@ -18,7 +18,7 @@ sp_size = 1
18
  model = dict(
19
  type="DiT-XL/2",
20
  from_pretrained="DiT-XL-2-256x256.pt",
21
- enable_flashattn=True,
22
  enable_layernorm_kernel=True,
23
  )
24
  vae = dict(
 
18
  model = dict(
19
  type="DiT-XL/2",
20
  from_pretrained="DiT-XL-2-256x256.pt",
21
+ enable_flash_attn=True,
22
  enable_layernorm_kernel=True,
23
  )
24
  vae = dict(
configs/dit/train/1x256x256.py CHANGED
@@ -19,7 +19,7 @@ sp_size = 1
19
  model = dict(
20
  type="DiT-XL/2",
21
  no_temporal_pos_emb=True,
22
- enable_flashattn=True,
23
  enable_layernorm_kernel=True,
24
  )
25
  vae = dict(
 
19
  model = dict(
20
  type="DiT-XL/2",
21
  no_temporal_pos_emb=True,
22
+ enable_flash_attn=True,
23
  enable_layernorm_kernel=True,
24
  )
25
  vae = dict(
configs/latte/train/16x256x256.py CHANGED
@@ -17,7 +17,7 @@ sp_size = 1
17
  # Define model
18
  model = dict(
19
  type="Latte-XL/2",
20
- enable_flashattn=True,
21
  enable_layernorm_kernel=True,
22
  )
23
  vae = dict(
 
17
  # Define model
18
  model = dict(
19
  type="Latte-XL/2",
20
+ enable_flash_attn=True,
21
  enable_layernorm_kernel=True,
22
  )
23
  vae = dict(
configs/opensora-v1-1/inference/sample-ref.py CHANGED
@@ -7,33 +7,35 @@ multi_resolution = "STDiT2"
7
  # Condition
8
  prompt_path = None
9
  prompt = [
10
- "A car driving on the ocean.",
11
- 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
12
- "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
 
 
 
13
  ]
14
 
15
  loop = 2
16
  condition_frame_length = 4
17
- reference_path = [
18
- "https://cdn.openai.com/tmp/s/interp/d0.mp4",
19
- None,
20
- "assets/images/condition/wave.png",
21
- ]
22
- # valid when reference_path is not None
23
- # (loop id, ref id, ref start, length, target start)
24
- mask_strategy = [
25
- "0,0,0,0,8,0.3",
26
- None,
27
- "0",
28
- ]
29
 
30
  # Define model
31
  model = dict(
32
  type="STDiT2-XL/2",
33
- from_pretrained=None,
34
  input_sq_size=512,
35
  qk_norm=True,
36
- enable_flashattn=True,
 
37
  enable_layernorm_kernel=True,
38
  )
39
  vae = dict(
 
7
  # Condition
8
  prompt_path = None
9
  prompt = [
10
+ 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
11
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
12
+ 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
13
+ 'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
14
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
15
+ '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
16
  ]
17
 
18
  loop = 2
19
  condition_frame_length = 4
20
+ # (
21
+ # loop id, [the loop index of the condition image or video]
22
+ # reference id, [the index of the condition image or video in the reference_path]
23
+ # reference start, [the start frame of the condition image or video]
24
+ # target start, [the location to insert]
25
+ # length, [the number of frames to insert]
26
+ # edit_ratio [the edit rate of the condition image or video]
27
+ # )
28
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
29
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
 
 
30
 
31
  # Define model
32
  model = dict(
33
  type="STDiT2-XL/2",
34
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
35
  input_sq_size=512,
36
  qk_norm=True,
37
+ qk_norm_legacy=True,
38
+ enable_flash_attn=True,
39
  enable_layernorm_kernel=True,
40
  )
41
  vae = dict(
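
The mask_strategy comment above packs six comma-separated fields into each semicolon-separated group. The sketch below unpacks such strings the way the process_mask_strategy helper removed from app.py in this commit did, including its defaults for missing trailing fields (reference 0, reference start 0, target start 0, length 1, edit ratio 0); parse_mask_strategy is an illustrative name, and the new app.py imports the real implementation from opensora.utils.inference_utils, so treat this as documentation of the string format rather than of the current code path.

# Field order per the comment above: loop, ref_id, ref_start, target_start, length, edit_ratio
DEFAULTS = ["0", "0", "0", "1", "0"]  # padding used by the removed process_mask_strategy

def parse_mask_strategy(mask_strategy: str):
    groups = []
    for group in mask_strategy.split(";"):
        fields = group.split(",")
        assert 1 <= len(fields) <= 6, f"Invalid mask strategy: {group}"
        fields = fields + DEFAULTS[len(fields) - 1 :]
        loop_id, ref_id, ref_start, target_start, length = map(int, fields[:5])
        groups.append((loop_id, ref_id, ref_start, target_start, length, float(fields[5])))
    return groups

# "0,0,-8,0,8" (paired with the d0.mp4 reference in the prompts above): in loop 0,
# take the last 8 frames of reference 0 and place them at frame 0 of the clip.
print(parse_mask_strategy("0,0,-8,0,8"))    # [(0, 0, -8, 0, 8, 0.0)]
# "0;0,1,0,-1,1" (two references): condition on reference 0 at the start and put
# the first frame of reference 1 at the last frame of the clip.
print(parse_mask_strategy("0;0,1,0,-1,1"))  # [(0, 0, 0, 0, 1, 0.0), (0, 1, 0, -1, 1, 0.0)]
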
configs/opensora-v1-1/inference/sample.py CHANGED
@@ -7,10 +7,11 @@ multi_resolution = "STDiT2"
7
  # Define model
8
  model = dict(
9
  type="STDiT2-XL/2",
10
- from_pretrained=None,
11
  input_sq_size=512,
12
  qk_norm=True,
13
- enable_flashattn=True,
 
14
  enable_layernorm_kernel=True,
15
  )
16
  vae = dict(
 
7
  # Define model
8
  model = dict(
9
  type="STDiT2-XL/2",
10
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
11
  input_sq_size=512,
12
  qk_norm=True,
13
+ qk_norm_legacy=True,
14
+ enable_flash_attn=True,
15
  enable_layernorm_kernel=True,
16
  )
17
  vae = dict(
configs/opensora-v1-1/train/benchmark.py CHANGED
@@ -65,7 +65,8 @@ model = dict(
65
  from_pretrained=None,
66
  input_sq_size=512, # pretrained model is trained on 512x512
67
  qk_norm=True,
68
- enable_flashattn=True,
 
69
  enable_layernorm_kernel=True,
70
  )
71
  vae = dict(
 
65
  from_pretrained=None,
66
  input_sq_size=512, # pretrained model is trained on 512x512
67
  qk_norm=True,
68
+ qk_norm_legacy=True,
69
+ enable_flash_attn=True,
70
  enable_layernorm_kernel=True,
71
  )
72
  vae = dict(
configs/opensora-v1-1/train/image.py CHANGED
@@ -29,7 +29,8 @@ model = dict(
29
  from_pretrained=None,
30
  input_sq_size=512, # pretrained model is trained on 512x512
31
  qk_norm=True,
32
- enable_flashattn=True,
 
33
  enable_layernorm_kernel=True,
34
  )
35
  vae = dict(
 
29
  from_pretrained=None,
30
  input_sq_size=512, # pretrained model is trained on 512x512
31
  qk_norm=True,
32
+ qk_norm_legacy=True,
33
+ enable_flash_attn=True,
34
  enable_layernorm_kernel=True,
35
  )
36
  vae = dict(
configs/opensora-v1-1/train/image_rflow.py ADDED
@@ -0,0 +1,88 @@
 
 
1
+ # Define dataset
2
+ # dataset = dict(
3
+ # type="VariableVideoTextDataset",
4
+ # data_path=None,
5
+ # num_frames=None,
6
+ # frame_interval=3,
7
+ # image_size=(None, None),
8
+ # transform_name="resize_crop",
9
+ # )
10
+ dataset = dict(
11
+ type="VideoTextDataset",
12
+ data_path=None,
13
+ num_frames=1,
14
+ frame_interval=1,
15
+ image_size=(256, 256),
16
+ transform_name="center",
17
+ )
18
+ bucket_config = { # 6s/it
19
+ "256": {1: (1.0, 256)},
20
+ "512": {1: (1.0, 80)},
21
+ "480p": {1: (1.0, 52)},
22
+ "1024": {1: (1.0, 20)},
23
+ "1080p": {1: (1.0, 8)},
24
+ }
25
+
26
+ # Define acceleration
27
+ num_workers = 16
28
+ dtype = "bf16"
29
+ grad_checkpoint = True
30
+ plugin = "zero2"
31
+ sp_size = 1
32
+
33
+ # Define model
34
+ # model = dict(
35
+ # type="DiT-XL/2",
36
+ # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
37
+ # # input_sq_size=512, # pretrained model is trained on 512x512
38
+ # enable_flash_attn=True,
39
+ # enable_layernorm_kernel=True,
40
+ # )
41
+ model = dict(
42
+ type="PixArt-XL/2",
43
+ space_scale=1.0,
44
+ time_scale=1.0,
45
+ no_temporal_pos_emb=True,
46
+ from_pretrained="PixArt-XL-2-512x512.pth",
47
+ enable_flash_attn=True,
48
+ enable_layernorm_kernel=True,
49
+ )
50
+ # model = dict(
51
+ # type="DiT-XL/2",
52
+ # # space_scale=1.0,
53
+ # # time_scale=1.0,
54
+ # no_temporal_pos_emb=True,
55
+ # # from_pretrained="PixArt-XL-2-512x512.pth",
56
+ # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
57
+ # enable_flash_attn=True,
58
+ # enable_layernorm_kernel=True,
59
+ # )
60
+ vae = dict(
61
+ type="VideoAutoencoderKL",
62
+ from_pretrained="stabilityai/sd-vae-ft-ema",
63
+ micro_batch_size=4,
64
+ )
65
+ text_encoder = dict(
66
+ type="t5",
67
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
68
+ model_max_length=200,
69
+ shardformer=True,
70
+ )
71
+ scheduler = dict(
72
+ type="rflow",
73
+ # timestep_respacing="",
74
+ )
75
+
76
+ # Others
77
+ seed = 42
78
+ outputs = "outputs"
79
+ wandb = False
80
+
81
+ epochs = 10
82
+ log_every = 10
83
+ ckpt_every = 500
84
+ load = None
85
+
86
+ batch_size = 100 # only for logging
87
+ lr = 2e-5
88
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage1.py CHANGED
@@ -16,15 +16,15 @@ bucket_config = { # 1s/it
16
  "1024": {1: (0.3, 3)},
17
  }
18
  mask_ratios = {
19
- "mask_no": 0.75,
20
- "mask_quarter_random": 0.025,
21
- "mask_quarter_head": 0.025,
22
- "mask_quarter_tail": 0.025,
23
- "mask_quarter_head_tail": 0.05,
24
- "mask_image_random": 0.025,
25
- "mask_image_head": 0.025,
26
- "mask_image_tail": 0.025,
27
- "mask_image_head_tail": 0.05,
28
  }
29
 
30
  # Define acceleration
@@ -41,7 +41,8 @@ model = dict(
41
  from_pretrained=None,
42
  input_sq_size=512, # pretrained model is trained on 512x512
43
  qk_norm=True,
44
- enable_flashattn=True,
 
45
  enable_layernorm_kernel=True,
46
  )
47
  vae = dict(
 
16
  "1024": {1: (0.3, 3)},
17
  }
18
  mask_ratios = {
19
+ "identity": 0.75,
20
+ "quarter_random": 0.025,
21
+ "quarter_head": 0.025,
22
+ "quarter_tail": 0.025,
23
+ "quarter_head_tail": 0.05,
24
+ "image_random": 0.025,
25
+ "image_head": 0.025,
26
+ "image_tail": 0.025,
27
+ "image_head_tail": 0.05,
28
  }
29
 
30
  # Define acceleration
 
41
  from_pretrained=None,
42
  input_sq_size=512, # pretrained model is trained on 512x512
43
  qk_norm=True,
44
+ qk_norm_legacy=True,
45
+ enable_flash_attn=True,
46
  enable_layernorm_kernel=True,
47
  )
48
  vae = dict(
configs/opensora-v1-1/train/stage2.py CHANGED
@@ -18,15 +18,15 @@ bucket_config = { # 7s/it
18
  "1080p": {1: (0.4, 8)},
19
  }
20
  mask_ratios = {
21
- "mask_no": 0.75,
22
- "mask_quarter_random": 0.025,
23
- "mask_quarter_head": 0.025,
24
- "mask_quarter_tail": 0.025,
25
- "mask_quarter_head_tail": 0.05,
26
- "mask_image_random": 0.025,
27
- "mask_image_head": 0.025,
28
- "mask_image_tail": 0.025,
29
- "mask_image_head_tail": 0.05,
30
  }
31
 
32
  # Define acceleration
@@ -43,7 +43,8 @@ model = dict(
43
  from_pretrained=None,
44
  input_sq_size=512, # pretrained model is trained on 512x512
45
  qk_norm=True,
46
- enable_flashattn=True,
 
47
  enable_layernorm_kernel=True,
48
  )
49
  vae = dict(
 
18
  "1080p": {1: (0.4, 8)},
19
  }
20
  mask_ratios = {
21
+ "identity": 0.75,
22
+ "quarter_random": 0.025,
23
+ "quarter_head": 0.025,
24
+ "quarter_tail": 0.025,
25
+ "quarter_head_tail": 0.05,
26
+ "image_random": 0.025,
27
+ "image_head": 0.025,
28
+ "image_tail": 0.025,
29
+ "image_head_tail": 0.05,
30
  }
31
 
32
  # Define acceleration
 
43
  from_pretrained=None,
44
  input_sq_size=512, # pretrained model is trained on 512x512
45
  qk_norm=True,
46
+ qk_norm_legacy=True,
47
+ enable_flash_attn=True,
48
  enable_layernorm_kernel=True,
49
  )
50
  vae = dict(
configs/opensora-v1-1/train/stage3.py CHANGED
@@ -18,15 +18,15 @@ bucket_config = { # 13s/it
18
  "1024": {1: (0.3, 40)},
19
  }
20
  mask_ratios = {
21
- "mask_no": 0.75,
22
- "mask_quarter_random": 0.025,
23
- "mask_quarter_head": 0.025,
24
- "mask_quarter_tail": 0.025,
25
- "mask_quarter_head_tail": 0.05,
26
- "mask_image_random": 0.025,
27
- "mask_image_head": 0.025,
28
- "mask_image_tail": 0.025,
29
- "mask_image_head_tail": 0.05,
30
  }
31
 
32
  # Define acceleration
@@ -43,7 +43,8 @@ model = dict(
43
  from_pretrained=None,
44
  input_sq_size=512, # pretrained model is trained on 512x512
45
  qk_norm=True,
46
- enable_flashattn=True,
 
47
  enable_layernorm_kernel=True,
48
  )
49
  vae = dict(
 
18
  "1024": {1: (0.3, 40)},
19
  }
20
  mask_ratios = {
21
+ "identity": 0.75,
22
+ "quarter_random": 0.025,
23
+ "quarter_head": 0.025,
24
+ "quarter_tail": 0.025,
25
+ "quarter_head_tail": 0.05,
26
+ "image_random": 0.025,
27
+ "image_head": 0.025,
28
+ "image_tail": 0.025,
29
+ "image_head_tail": 0.05,
30
  }
31
 
32
  # Define acceleration
 
43
  from_pretrained=None,
44
  input_sq_size=512, # pretrained model is trained on 512x512
45
  qk_norm=True,
46
+ qk_norm_legacy=True,
47
+ enable_flash_attn=True,
48
  enable_layernorm_kernel=True,
49
  )
50
  vae = dict(
configs/opensora-v1-1/train/video.py CHANGED
@@ -31,7 +31,8 @@ model = dict(
31
  from_pretrained=None,
32
  input_sq_size=512, # pretrained model is trained on 512x512
33
  qk_norm=True,
34
- enable_flashattn=True,
 
35
  enable_layernorm_kernel=True,
36
  )
37
  vae = dict(
 
31
  from_pretrained=None,
32
  input_sq_size=512, # pretrained model is trained on 512x512
33
  qk_norm=True,
34
+ qk_norm_legacy=True,
35
+ enable_flash_attn=True,
36
  enable_layernorm_kernel=True,
37
  )
38
  vae = dict(
configs/opensora-v1-2/inference/sample.py ADDED
@@ -0,0 +1,43 @@
 
 
1
+ resolution = "240p"
2
+ aspect_ratio = "9:16"
3
+ num_frames = 51
4
+ fps = 24
5
+ frame_interval = 1
6
+ save_fps = 24
7
+
8
+ save_dir = "./samples/samples/"
9
+ seed = 42
10
+ batch_size = 1
11
+ multi_resolution = "STDiT2"
12
+ dtype = "bf16"
13
+ condition_frame_length = 5
14
+ align = 5
15
+
16
+ model = dict(
17
+ type="STDiT3-XL/2",
18
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
19
+ qk_norm=True,
20
+ enable_flash_attn=True,
21
+ enable_layernorm_kernel=True,
22
+ )
23
+ vae = dict(
24
+ type="OpenSoraVAE_V1_2",
25
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
26
+ micro_frame_size=17,
27
+ micro_batch_size=4,
28
+ )
29
+ text_encoder = dict(
30
+ type="t5",
31
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
32
+ model_max_length=300,
33
+ local_files_only=True,
34
+ )
35
+ scheduler = dict(
36
+ type="rflow",
37
+ use_timestep_transform=True,
38
+ num_sampling_steps=30,
39
+ cfg_scale=7.0,
40
+ )
41
+
42
+ aes = 6.5
43
+ flow = None
configs/opensora-v1-2/misc/bs.py ADDED
@@ -0,0 +1,117 @@
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # == Config 1: Webvid ==
8
+ # base: (512, 408), 12s/it
9
+ grad_checkpoint = True
10
+ base = ("512", "408")
11
+ base_step_time = 12
12
+ bucket_config = {
13
+ "144p": {
14
+ 1: (475, 0),
15
+ 51: (51, 0),
16
+ 102: (27, 0),
17
+ 204: (13, 0),
18
+ 408: (6, 0),
19
+ },
20
+ # ---
21
+ "240p": {
22
+ 1: (297, 200), # 8.25
23
+ 51: (20, 0),
24
+ 102: (10, 0),
25
+ 204: (5, 0),
26
+ 408: (2, 0),
27
+ },
28
+ # ---
29
+ "512": {
30
+ 1: (141, 0),
31
+ 51: (8, 0),
32
+ 102: (4, 0),
33
+ 204: (2, 0),
34
+ 408: (1, 0),
35
+ },
36
+ # ---
37
+ "480p": {
38
+ 1: (89, 0),
39
+ 51: (5, 0),
40
+ 102: (2, 0),
41
+ 204: (1, 0),
42
+ },
43
+ # ---
44
+ "1024": {
45
+ 1: (36, 0),
46
+ 51: (1, 0),
47
+ },
48
+ # ---
49
+ "1080p": {1: (5, 0)},
50
+ # ---
51
+ "2048": {1: (5, 0)},
52
+ }
53
+
54
+ # == Config 1 ==
55
+ # base: (512, 408), 16s/it
56
+
57
+ # Acceleration settings
58
+ num_workers = 8
59
+ num_bucket_build_workers = 16
60
+ dtype = "bf16"
61
+ plugin = "zero2"
62
+
63
+ # Model settings
64
+ model = dict(
65
+ type="STDiT3-XL/2",
66
+ from_pretrained=None,
67
+ qk_norm=True,
68
+ enable_flash_attn=True,
69
+ enable_layernorm_kernel=True,
70
+ )
71
+ vae = dict(
72
+ type="OpenSoraVAE_V1_2",
73
+ from_pretrained="pretrained_models/vae-pipeline",
74
+ micro_frame_size=17,
75
+ micro_batch_size=4,
76
+ )
77
+ text_encoder = dict(
78
+ type="t5",
79
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
80
+ model_max_length=300,
81
+ shardformer=True,
82
+ local_files_only=True,
83
+ )
84
+ scheduler = dict(
85
+ type="rflow",
86
+ use_timestep_transform=True,
87
+ sample_method="logit-normal",
88
+ )
89
+
90
+ # Mask settings
91
+ mask_ratios = {
92
+ "random": 0.2,
93
+ "intepolate": 0.01,
94
+ "quarter_random": 0.01,
95
+ "quarter_head": 0.01,
96
+ "quarter_tail": 0.01,
97
+ "quarter_head_tail": 0.01,
98
+ "image_random": 0.05,
99
+ "image_head": 0.1,
100
+ "image_tail": 0.05,
101
+ "image_head_tail": 0.05,
102
+ }
103
+
104
+ # Log settings
105
+ seed = 42
106
+ outputs = "outputs"
107
+ wandb = False
108
+ epochs = 1000
109
+ log_every = 10
110
+ ckpt_every = 500
111
+
112
+ # optimization settings
113
+ load = None
114
+ grad_clip = 1.0
115
+ lr = 2e-4
116
+ ema_decay = 0.99
117
+ adam_eps = 1e-15
configs/opensora-v1-2/misc/eval_loss.py ADDED
@@ -0,0 +1,49 @@
 
 
1
+ num_workers = 8
2
+ dtype = "bf16"
3
+ seed = 42
4
+ num_eval_timesteps = 10
5
+
6
+ # Dataset settings
7
+ dataset = dict(
8
+ type="VariableVideoTextDataset",
9
+ transform_name="resize_crop",
10
+ )
11
+
12
+ bucket_config = {
13
+ "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)},
14
+ # ---
15
+ "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)},
16
+ # ---
17
+ "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)},
18
+ # ---
19
+ "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)},
20
+ # ---
21
+ "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)},
22
+ # ---
23
+ "1080p": {1: (None, 10)},
24
+ # ---
25
+ "2048": {1: (None, 5)},
26
+ }
27
+
28
+ # Model settings
29
+ model = dict(
30
+ type="STDiT3-XL/2",
31
+ from_pretrained=None,
32
+ qk_norm=True,
33
+ enable_flash_attn=True,
34
+ enable_layernorm_kernel=True,
35
+ )
36
+ vae = dict(
37
+ type="OpenSoraVAE_V1_2",
38
+ from_pretrained="pretrained_models/vae-pipeline",
39
+ micro_frame_size=17,
40
+ micro_batch_size=4,
41
+ local_files_only=True,
42
+ )
43
+ text_encoder = dict(
44
+ type="t5",
45
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
46
+ model_max_length=300,
47
+ local_files_only=True,
48
+ )
49
+ scheduler = dict(type="rflow")
configs/opensora-v1-2/misc/extract.py ADDED
@@ -0,0 +1,62 @@
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 12s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
10
+ # ---
11
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
12
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
13
+ # ---
14
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
15
+ "512": {1: (0.1, 141)},
16
+ # ---
17
+ "480p": {1: (0.1, 89)},
18
+ # ---
19
+ "720p": {1: (0.05, 36)},
20
+ "1024": {1: (0.05, 36)},
21
+ # ---
22
+ "1080p": {1: (0.1, 5)},
23
+ # ---
24
+ "2048": {1: (0.1, 5)},
25
+ }
26
+
27
+ # Acceleration settings
28
+ num_workers = 8
29
+ num_bucket_build_workers = 16
30
+ dtype = "bf16"
31
+ seed = 42
32
+ outputs = "outputs"
33
+ wandb = False
34
+
35
+
36
+ # Model settings
37
+ model = dict(
38
+ type="STDiT3-XL/2",
39
+ from_pretrained="/mnt/nfs-206/zangwei/opensora/outputs/1091-STDiT3-XL-2/epoch0-global_step8500",
40
+ qk_norm=True,
41
+ enable_flash_attn=True,
42
+ enable_layernorm_kernel=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="pretrained_models/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=32,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+
58
+ # feature extraction settings
59
+ save_text_features = True
60
+ save_compressed_text_features = True
61
+ bin_size = 250 # 1GB, 4195 bins
62
+ log_time = False
configs/opensora-v1-2/misc/feat.py ADDED
@@ -0,0 +1,94 @@
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ dummy_text_feature=True,
6
+ )
7
+
8
+ # webvid
9
+ bucket_config = { # 12s/it
10
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
11
+ # ---
12
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
13
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
14
+ # ---
15
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
16
+ "512": {1: (0.1, 141)},
17
+ # ---
18
+ "480p": {1: (0.1, 89)},
19
+ # ---
20
+ "720p": {1: (0.05, 36)},
21
+ "1024": {1: (0.05, 36)},
22
+ # ---
23
+ "1080p": {1: (0.1, 5)},
24
+ # ---
25
+ "2048": {1: (0.1, 5)},
26
+ }
27
+
28
+ grad_checkpoint = True
29
+
30
+ load_text_features = True
31
+
32
+ # Acceleration settings
33
+ num_workers = 0
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ plugin = "zero2"
37
+
38
+ # Model settings
39
+ model = dict(
40
+ type="STDiT3-XL/2",
41
+ from_pretrained=None,
42
+ qk_norm=True,
43
+ enable_flash_attn=True,
44
+ enable_layernorm_kernel=True,
45
+ freeze_y_embedder=True,
46
+ skip_y_embedder=True,
47
+ )
48
+ vae = dict(
49
+ type="OpenSoraVAE_V1_2",
50
+ from_pretrained="pretrained_models/vae-pipeline",
51
+ micro_frame_size=17,
52
+ micro_batch_size=4,
53
+ )
54
+ text_encoder = dict(
55
+ type="t5",
56
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
57
+ model_max_length=300,
58
+ shardformer=True,
59
+ local_files_only=True,
60
+ )
61
+ scheduler = dict(
62
+ type="rflow",
63
+ use_timestep_transform=True,
64
+ sample_method="logit-normal",
65
+ )
66
+
67
+ # Mask settings
68
+ mask_ratios = {
69
+ "random": 0.2,
70
+ "intepolate": 0.01,
71
+ "quarter_random": 0.01,
72
+ "quarter_head": 0.01,
73
+ "quarter_tail": 0.01,
74
+ "quarter_head_tail": 0.01,
75
+ "image_random": 0.05,
76
+ "image_head": 0.1,
77
+ "image_tail": 0.05,
78
+ "image_head_tail": 0.05,
79
+ }
80
+
81
+ # Log settings
82
+ seed = 42
83
+ outputs = "outputs"
84
+ wandb = False
85
+ epochs = 1000
86
+ log_every = 10
87
+ ckpt_every = 1
88
+
89
+ # optimization settings
90
+ load = None
91
+ grad_clip = 1.0
92
+ lr = 2e-4
93
+ ema_decay = 0.99
94
+ adam_eps = 1e-15
configs/opensora-v1-2/train/adapt.py ADDED
@@ -0,0 +1,84 @@
 
 
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+ bucket_config = { # 2s/it
7
+ "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)},
8
+ # ---
9
+ "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
10
+ "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
11
+ # ---
12
+ "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
13
+ "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
14
+ # ---
15
+ "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)},
16
+ # ---
17
+ "720p": {1: (0.1, 2)},
18
+ "1024": {1: (0.1, 2)},
19
+ # ---
20
+ "1080p": {1: (0.1, 1)},
21
+ }
22
+ grad_checkpoint = False
23
+
24
+ # Acceleration settings
25
+ num_workers = 8
26
+ num_bucket_build_workers = 16
27
+ dtype = "bf16"
28
+ plugin = "zero2"
29
+
30
+ # Model settings
31
+ model = dict(
32
+ type="STDiT3-XL/2",
33
+ from_pretrained=None,
34
+ qk_norm=True,
35
+ enable_flash_attn=True,
36
+ enable_layernorm_kernel=True,
37
+ )
38
+ vae = dict(
39
+ type="OpenSoraVAE_V1_2",
40
+ from_pretrained="pretrained_models/vae-pipeline",
41
+ micro_frame_size=17,
42
+ micro_batch_size=4,
43
+ )
44
+ text_encoder = dict(
45
+ type="t5",
46
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
47
+ model_max_length=300,
48
+ shardformer=True,
49
+ local_files_only=True,
50
+ )
51
+ scheduler = dict(
52
+ type="rflow",
53
+ use_timestep_transform=True,
54
+ sample_method="logit-normal",
55
+ )
56
+
57
+ # Mask settings
58
+ mask_ratios = {
59
+ "random": 0.2,
60
+ "intepolate": 0.01,
61
+ "quarter_random": 0.01,
62
+ "quarter_head": 0.01,
63
+ "quarter_tail": 0.01,
64
+ "quarter_head_tail": 0.01,
65
+ "image_random": 0.05,
66
+ "image_head": 0.1,
67
+ "image_tail": 0.05,
68
+ "image_head_tail": 0.05,
69
+ }
70
+
71
+ # Log settings
72
+ seed = 42
73
+ outputs = "outputs"
74
+ wandb = False
75
+ epochs = 1000
76
+ log_every = 10
77
+ ckpt_every = 500
78
+
79
+ # optimization settings
80
+ load = None
81
+ grad_clip = 1.0
82
+ lr = 1e-4
83
+ ema_decay = 0.99
84
+ adam_eps = 1e-15
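A note on how these plain-Python config modules are consumed: every top-level assignment above becomes a config field once the file is parsed. The snippet below is a minimal sketch assuming an mmengine-style Config loader (an assumption here, not something shown in this diff); the path is just this file.

from mmengine.config import Config  # assumption: an mmengine-style loader parses these .py configs

cfg = Config.fromfile("configs/opensora-v1-2/train/adapt.py")
# Every module-level variable becomes an attribute on the config object.
print(cfg.model["type"], cfg.dtype, cfg.lr)  # e.g. "STDiT3-XL/2" "bf16" 0.0001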
configs/opensora-v1-2/train/stage1.py ADDED
@@ -0,0 +1,111 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # backup
8
+ # bucket_config = { # 20s/it
9
+ # "144p": {1: (1.0, 100), 51: (1.0, 30), 102: (1.0, 20), 204: (1.0, 8), 408: (1.0, 4)},
10
+ # # ---
11
+ # "256": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
12
+ # "240p": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
13
+ # # ---
14
+ # "360p": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
15
+ # "512": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
16
+ # # ---
17
+ # "480p": {1: (0.5, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)},
18
+ # # ---
19
+ # "720p": {1: (0.2, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
20
+ # "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
21
+ # # ---
22
+ # "1080p": {1: (0.1, 10)},
23
+ # # ---
24
+ # "2048": {1: (0.1, 5)},
25
+ # }
26
+
27
+ # webvid
28
+ bucket_config = { # 12s/it
29
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
30
+ # ---
31
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
32
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
33
+ # ---
34
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
35
+ "512": {1: (0.1, 141)},
36
+ # ---
37
+ "480p": {1: (0.1, 89)},
38
+ # ---
39
+ "720p": {1: (0.05, 36)},
40
+ "1024": {1: (0.05, 36)},
41
+ # ---
42
+ "1080p": {1: (0.1, 5)},
43
+ # ---
44
+ "2048": {1: (0.1, 5)},
45
+ }
46
+
47
+ grad_checkpoint = True
48
+
49
+ # Acceleration settings
50
+ num_workers = 8
51
+ num_bucket_build_workers = 16
52
+ dtype = "bf16"
53
+ plugin = "zero2"
54
+
55
+ # Model settings
56
+ model = dict(
57
+ type="STDiT3-XL/2",
58
+ from_pretrained=None,
59
+ qk_norm=True,
60
+ enable_flash_attn=True,
61
+ enable_layernorm_kernel=True,
62
+ freeze_y_embedder=True,
63
+ )
64
+ vae = dict(
65
+ type="OpenSoraVAE_V1_2",
66
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
67
+ micro_frame_size=17,
68
+ micro_batch_size=4,
69
+ )
70
+ text_encoder = dict(
71
+ type="t5",
72
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
73
+ model_max_length=300,
74
+ shardformer=True,
75
+ local_files_only=True,
76
+ )
77
+ scheduler = dict(
78
+ type="rflow",
79
+ use_timestep_transform=True,
80
+ sample_method="logit-normal",
81
+ )
82
+
83
+ # Mask settings
84
+ mask_ratios = {
85
+ "random": 0.05,
86
+ "intepolate": 0.005,
87
+ "quarter_random": 0.005,
88
+ "quarter_head": 0.005,
89
+ "quarter_tail": 0.005,
90
+ "quarter_head_tail": 0.005,
91
+ "image_random": 0.025,
92
+ "image_head": 0.05,
93
+ "image_tail": 0.025,
94
+ "image_head_tail": 0.025,
95
+ }
96
+
97
+ # Log settings
98
+ seed = 42
99
+ outputs = "outputs"
100
+ wandb = False
101
+ epochs = 1000
102
+ log_every = 10
103
+ ckpt_every = 200
104
+
105
+ # optimization settings
106
+ load = None
107
+ grad_clip = 1.0
108
+ lr = 1e-4
109
+ ema_decay = 0.99
110
+ adam_eps = 1e-15
111
+ warmup_steps = 1000
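The bucket_config tables above map a resolution bucket to {num_frames: (sampling probability, batch size)}; a (0.0, None) entry disables that bucket, and the tuple-valued probabilities such as (1.0, 0.33) are resolved by Open-Sora's bucket sampler, whose exact semantics are not reproduced here. As a rough illustration only, a lookup under that reading could be sketched like this (the helper name is hypothetical):

def lookup_bucket(bucket_config, resolution, num_frames):
    """Hypothetical helper: return (probability, batch_size) for a bucket, or None if disabled."""
    frames_table = bucket_config.get(resolution)
    if frames_table is None or num_frames not in frames_table:
        return None
    prob, batch_size = frames_table[num_frames]
    if batch_size is None:  # (0.0, None) entries mark a disabled bucket
        return None
    return prob, batch_size

# With the webvid table above: 51-frame clips at 240p are kept with probability 0.4, batch size 20.
# lookup_bucket(bucket_config, "240p", 51) -> (0.4, 20)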
configs/opensora-v1-2/train/stage1_feat.py ADDED
@@ -0,0 +1,59 @@
1
+ # Dataset settings
2
+ dataset = dict(type="BatchFeatureDataset")
3
+ grad_checkpoint = True
4
+ num_workers = 4
5
+
6
+ # Acceleration settings
7
+ dtype = "bf16"
8
+ plugin = "zero2"
9
+
10
+ # Model settings
11
+ model = dict(
12
+ type="STDiT3-XL/2",
13
+ from_pretrained=None,
14
+ qk_norm=True,
15
+ enable_flash_attn=True,
16
+ enable_layernorm_kernel=True,
17
+ freeze_y_embedder=True,
18
+ skip_y_embedder=True,
19
+ )
20
+ scheduler = dict(
21
+ type="rflow",
22
+ use_timestep_transform=True,
23
+ sample_method="logit-normal",
24
+ )
25
+
26
+ vae_out_channels = 4
27
+ model_max_length = 300
28
+ text_encoder_output_dim = 4096
29
+ load_video_features = True
30
+ load_text_features = True
31
+
32
+ # Mask settings
33
+ mask_ratios = {
34
+ "random": 0.2,
35
+ "intepolate": 0.01,
36
+ "quarter_random": 0.01,
37
+ "quarter_head": 0.01,
38
+ "quarter_tail": 0.01,
39
+ "quarter_head_tail": 0.01,
40
+ "image_random": 0.05,
41
+ "image_head": 0.1,
42
+ "image_tail": 0.05,
43
+ "image_head_tail": 0.05,
44
+ }
45
+
46
+ # Log settings
47
+ seed = 42
48
+ outputs = "outputs"
49
+ wandb = False
50
+ epochs = 1000
51
+ log_every = 10
52
+ ckpt_every = 500
53
+
54
+ # optimization settings
55
+ load = None
56
+ grad_clip = 1.0
57
+ lr = 2e-4
58
+ ema_decay = 0.99
59
+ adam_eps = 1e-15
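The mask_ratios dict assigns each masking strategy a fraction of training samples; whatever is left over is trained without masking. A quick sanity check for a table like the one above (a sketch, not part of the repo):

mask_ratios = {
    "random": 0.2, "intepolate": 0.01, "quarter_random": 0.01, "quarter_head": 0.01,
    "quarter_tail": 0.01, "quarter_head_tail": 0.01, "image_random": 0.05,
    "image_head": 0.1, "image_tail": 0.05, "image_head_tail": 0.05,
}
masked = sum(mask_ratios.values())
assert masked <= 1.0, "mask ratios must not exceed 1"
print(f"masked: {masked:.2f}, unmasked remainder: {1 - masked:.2f}")  # masked: 0.50, unmasked remainder: 0.50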
configs/opensora-v1-2/train/stage2.py ADDED
@@ -0,0 +1,91 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 12s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
10
+ # ---
11
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)},
12
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)},
13
+ # ---
14
+ "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)},
15
+ "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)},
16
+ # ---
17
+ "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)},
18
+ # ---
19
+ "720p": {1: (0.1, 36), 51: (0.03, 1)},
20
+ "1024": {1: (0.1, 36), 51: (0.02, 1)},
21
+ # ---
22
+ "1080p": {1: (0.01, 5)},
23
+ # ---
24
+ "2048": {1: (0.01, 5)},
25
+ }
26
+
27
+ grad_checkpoint = True
28
+
29
+ # Acceleration settings
30
+ num_workers = 8
31
+ num_bucket_build_workers = 16
32
+ dtype = "bf16"
33
+ plugin = "zero2"
34
+
35
+ # Model settings
36
+ model = dict(
37
+ type="STDiT3-XL/2",
38
+ from_pretrained=None,
39
+ qk_norm=True,
40
+ enable_flash_attn=True,
41
+ enable_layernorm_kernel=True,
42
+ freeze_y_embedder=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=4,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+ scheduler = dict(
58
+ type="rflow",
59
+ use_timestep_transform=True,
60
+ sample_method="logit-normal",
61
+ )
62
+
63
+ # Mask settings
64
+ mask_ratios = {
65
+ "random": 0.05,
66
+ "intepolate": 0.005,
67
+ "quarter_random": 0.005,
68
+ "quarter_head": 0.005,
69
+ "quarter_tail": 0.005,
70
+ "quarter_head_tail": 0.005,
71
+ "image_random": 0.025,
72
+ "image_head": 0.05,
73
+ "image_tail": 0.025,
74
+ "image_head_tail": 0.025,
75
+ }
76
+
77
+ # Log settings
78
+ seed = 42
79
+ outputs = "outputs"
80
+ wandb = False
81
+ epochs = 1000
82
+ log_every = 10
83
+ ckpt_every = 200
84
+
85
+ # optimization settings
86
+ load = None
87
+ grad_clip = 1.0
88
+ lr = 1e-4
89
+ ema_decay = 0.99
90
+ adam_eps = 1e-15
91
+
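On sample_method="logit-normal" in the rflow scheduler: a common reading of this option (not verified against the scheduler code in this diff) is that training timesteps are drawn as the sigmoid of a standard normal, which concentrates samples around the middle of the trajectory. A minimal sketch under that assumption:

import torch

def sample_logit_normal_timesteps(batch_size: int) -> torch.Tensor:
    # t = sigmoid(z), z ~ N(0, 1): values in (0, 1), denser near t = 0.5 than uniform sampling.
    return torch.sigmoid(torch.randn(batch_size))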
configs/opensora-v1-2/train/stage3.py ADDED
@@ -0,0 +1,92 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 20s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)},
10
+ # ---
11
+ "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)},
12
+ "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)},
13
+ # ---
14
+ "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)},
15
+ "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)},
16
+ # ---
17
+ "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)},
18
+ # ---
19
+ "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)},
20
+ "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)},
21
+ # ---
22
+ "1080p": {1: (0.1, 5)},
23
+ # ---
24
+ "2048": {1: (0.05, 5)},
25
+ }
26
+
27
+ grad_checkpoint = True
28
+
29
+ # Acceleration settings
30
+ num_workers = 8
31
+ num_bucket_build_workers = 16
32
+ dtype = "bf16"
33
+ plugin = "zero2"
34
+
35
+ # Model settings
36
+ model = dict(
37
+ type="STDiT3-XL/2",
38
+ from_pretrained=None,
39
+ qk_norm=True,
40
+ enable_flash_attn=True,
41
+ enable_layernorm_kernel=True,
42
+ freeze_y_embedder=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=4,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+ scheduler = dict(
58
+ type="rflow",
59
+ use_timestep_transform=True,
60
+ sample_method="logit-normal",
61
+ )
62
+
63
+ # Mask settings
64
+ # mask ratios below sum to 0.25, i.e. 25% of training samples are masked
65
+ mask_ratios = {
66
+ "random": 0.01,
67
+ "intepolate": 0.002,
68
+ "quarter_random": 0.002,
69
+ "quarter_head": 0.002,
70
+ "quarter_tail": 0.002,
71
+ "quarter_head_tail": 0.002,
72
+ "image_random": 0.0,
73
+ "image_head": 0.22,
74
+ "image_tail": 0.005,
75
+ "image_head_tail": 0.005,
76
+ }
77
+
78
+ # Log settings
79
+ seed = 42
80
+ outputs = "outputs"
81
+ wandb = False
82
+ epochs = 1000
83
+ log_every = 10
84
+ ckpt_every = 200
85
+
86
+ # optimization settings
87
+ load = None
88
+ grad_clip = 1.0
89
+ lr = 1e-4
90
+ ema_decay = 0.99
91
+ adam_eps = 1e-15
92
+ warmup_steps = 1000
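warmup_steps = 1000 pairs with lr = 1e-4 above; the learning-rate scheduler used by the training script is not part of this diff, but a plain linear warmup to the target rate can be sketched with stock PyTorch as follows (illustrative only):

import torch

model = torch.nn.Linear(8, 8)  # stand-in module for illustration
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-4, eps=1e-15)
warmup_steps = 1000
scheduler = torch.optim.lr_scheduler.LambdaLR(
    optimizer,
    lr_lambda=lambda step: min(1.0, (step + 1) / warmup_steps),  # linear ramp up to lr over warmup_steps
)
# Call scheduler.step() once per optimizer step during training.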
configs/opensora/inference/16x256x256.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
- enable_flashattn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
 
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
+ enable_flash_attn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
configs/opensora/inference/16x512x512-rflow.py ADDED
@@ -0,0 +1,35 @@
1
+ num_frames = 16
2
+ fps = 24 // 3
3
+ image_size = (512, 512)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="STDiT-XL/2",
8
+ space_scale=1.0,
9
+ time_scale=1.0,
10
+ enable_flash_attn=True,
11
+ enable_layernorm_kernel=True,
12
+ from_pretrained="PRETRAINED_MODEL",
13
+ )
14
+ vae = dict(
15
+ type="VideoAutoencoderKL",
16
+ from_pretrained="stabilityai/sd-vae-ft-ema",
17
+ micro_batch_size=2,
18
+ )
19
+ text_encoder = dict(
20
+ type="t5",
21
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
22
+ model_max_length=120,
23
+ )
24
+ scheduler = dict(
25
+ type="rflow",
26
+ num_sampling_steps=10,
27
+ cfg_scale=7.0,
28
+ )
29
+ dtype = "bf16"
30
+
31
+ # Others
32
+ batch_size = 2
33
+ seed = 42
34
+ prompt_path = "./assets/texts/t2v_samples.txt"
35
+ save_dir = "./outputs/samples/"
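For the 16x512x512 rflow sampling config above, VideoAutoencoderKL wraps the 2D Stable Diffusion VAE, which downsamples 8x spatially into 4 latent channels; applying it frame by frame with no temporal compression is my assumption about this wrapper. A quick shape check under that assumption:

def latent_shape(num_frames: int, height: int, width: int, channels: int = 4, downsample: int = 8):
    # Assumed layout: (C, T, H/8, W/8) for a per-frame 2D VAE with no temporal compression.
    return (channels, num_frames, height // downsample, width // downsample)

print(latent_shape(16, 512, 512))  # (4, 16, 64, 64)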
configs/opensora/inference/16x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
- enable_flashattn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
+ enable_flash_attn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
configs/opensora/inference/64x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
- enable_flashattn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
+ enable_flash_attn=True,
11
  enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
configs/opensora/train/16x256x256-mask.py CHANGED
@@ -20,12 +20,12 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  mask_ratios = {
27
- "mask_no": 0.7,
28
- "mask_random": 0.15,
29
  "mask_head": 0.05,
30
  "mask_tail": 0.05,
31
  "mask_head_tail": 0.05,
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  mask_ratios = {
27
+ "identity": 0.7,
28
+ "random": 0.15,
29
  "mask_head": 0.05,
30
  "mask_tail": 0.05,
31
  "mask_head_tail": 0.05,
configs/opensora/train/16x256x256-spee-rflow.py ADDED
@@ -0,0 +1,64 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ # from_pretrained="PixArt-XL-2-512x512.pth",
23
+ # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth",
24
+ # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth",
25
+ from_pretrained="PRETRAINED_MODEL",
26
+ enable_flash_attn=True,
27
+ enable_layernorm_kernel=True,
28
+ )
29
+ # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
30
+ # mask_ratios = {
31
+ # "identity": 0.9,
32
+ # "random": 0.06,
33
+ # "mask_head": 0.01,
34
+ # "mask_tail": 0.01,
35
+ # "mask_head_tail": 0.02,
36
+ # }
37
+ vae = dict(
38
+ type="VideoAutoencoderKL",
39
+ from_pretrained="stabilityai/sd-vae-ft-ema",
40
+ )
41
+ text_encoder = dict(
42
+ type="t5",
43
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
44
+ model_max_length=120,
45
+ shardformer=True,
46
+ )
47
+ scheduler = dict(
48
+ type="rflow",
49
+ # timestep_respacing="",
50
+ )
51
+
52
+ # Others
53
+ seed = 42
54
+ outputs = "outputs"
55
+ wandb = True
56
+
57
+ epochs = 1
58
+ log_every = 10
59
+ ckpt_every = 1000
60
+ load = None
61
+
62
+ batch_size = 16
63
+ lr = 2e-5
64
+ grad_clip = 1.0
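grad_clip = 1.0 in these training configs corresponds to clipping the global gradient norm before the optimizer step; how the repo applies it under the zero2 plugin is not shown in this diff, so the snippet below is only a plain-PyTorch illustration of the same setting:

import torch

model = torch.nn.Linear(8, 8)  # stand-in module
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
loss = model(torch.randn(4, 8)).pow(2).mean()
loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # grad_clip = 1.0
optimizer.step()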
configs/opensora/train/16x256x256-spee.py CHANGED
@@ -20,12 +20,12 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  mask_ratios = {
27
- "mask_no": 0.5,
28
- "mask_random": 0.29,
29
  "mask_head": 0.07,
30
  "mask_tail": 0.07,
31
  "mask_head_tail": 0.07,
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  mask_ratios = {
27
+ "identity": 0.5,
28
+ "random": 0.29,
29
  "mask_head": 0.07,
30
  "mask_tail": 0.07,
31
  "mask_head_tail": 0.07,
configs/opensora/train/16x256x256.py CHANGED
@@ -8,7 +8,7 @@ dataset = dict(
8
  )
9
 
10
  # Define acceleration
11
- num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
8
  )
9
 
10
  # Define acceleration
11
+ num_workers = 0
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
configs/opensora/train/16x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=1.0,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
20
  space_scale=1.0,
21
  time_scale=1.0,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
configs/opensora/train/360x512x512.py CHANGED
@@ -26,7 +26,7 @@ model = dict(
26
  space_scale=1.0,
27
  time_scale=2 / 3,
28
  from_pretrained=None,
29
- enable_flashattn=True,
30
  enable_layernorm_kernel=True,
31
  enable_sequence_parallelism=True, # enable sq here
32
  )
 
26
  space_scale=1.0,
27
  time_scale=2 / 3,
28
  from_pretrained=None,
29
+ enable_flash_attn=True,
30
  enable_layernorm_kernel=True,
31
  enable_sequence_parallelism=True, # enable sq here
32
  )
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  enable_sequence_parallelism=True, # enable sq here
26
  )
 
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  enable_sequence_parallelism=True, # enable sq here
26
  )
configs/opensora/train/64x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
configs/pixart/inference/1x20481B.py ADDED
@@ -0,0 +1,36 @@
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (2560, 1536)
4
+ # image_size = (2048, 2048)
5
+
6
+ model = dict(
7
+ type="PixArt-1B/2",
8
+ from_pretrained="PixArt-1B-2.pth",
9
+ space_scale=4,
10
+ no_temporal_pos_emb=True,
11
+ enable_flash_attn=True,
12
+ enable_layernorm_kernel=True,
13
+ base_size=2048 // 8,
14
+ )
15
+ vae = dict(
16
+ type="VideoAutoencoderKL",
17
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
18
+ subfolder="vae",
19
+ )
20
+ text_encoder = dict(
21
+ type="t5",
22
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
23
+ model_max_length=300,
24
+ )
25
+ scheduler = dict(
26
+ type="dpm-solver",
27
+ num_sampling_steps=14,
28
+ cfg_scale=4.5,
29
+ )
30
+ dtype = "bf16"
31
+
32
+ # Others
33
+ batch_size = 1
34
+ seed = 42
35
+ prompt_path = "./assets/texts/t2i_sigma.txt"
36
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x2048MS.py ADDED
@@ -0,0 +1,36 @@
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (2560, 1536)
4
+ # image_size = (2048, 2048)
5
+
6
+ model = dict(
7
+ type="PixArt-XL/2",
8
+ from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth",
9
+ space_scale=4,
10
+ no_temporal_pos_emb=True,
11
+ enable_flash_attn=True,
12
+ enable_layernorm_kernel=True,
13
+ base_size=2048 // 8,
14
+ )
15
+ vae = dict(
16
+ type="VideoAutoencoderKL",
17
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
18
+ subfolder="vae",
19
+ )
20
+ text_encoder = dict(
21
+ type="t5",
22
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
23
+ model_max_length=300,
24
+ )
25
+ scheduler = dict(
26
+ type="dpm-solver",
27
+ num_sampling_steps=14,
28
+ cfg_scale=4.5,
29
+ )
30
+ dtype = "bf16"
31
+
32
+ # Others
33
+ batch_size = 1
34
+ seed = 42
35
+ prompt_path = "./assets/texts/t2i_sigma.txt"
36
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x512x512-rflow.py ADDED
@@ -0,0 +1,39 @@
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (512, 512)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="PixArt-XL/2",
8
+ space_scale=1.0,
9
+ time_scale=1.0,
10
+ no_temporal_pos_emb=True,
11
+ from_pretrained="PRETRAINED_MODEL",
12
+ )
13
+ vae = dict(
14
+ type="VideoAutoencoderKL",
15
+ from_pretrained="stabilityai/sd-vae-ft-ema",
16
+ )
17
+ text_encoder = dict(
18
+ type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
+ model_max_length=120,
21
+ )
22
+ scheduler = dict(
23
+ type="rflow",
24
+ num_sampling_steps=20,
25
+ cfg_scale=7.0,
26
+ )
27
+ dtype = "bf16"
28
+
29
+ # prompt_path = "./assets/texts/t2i_samples.txt"
30
+ prompt = [
31
+ "Pirate ship trapped in a cosmic maelstrom nebula.",
32
+ "A small cactus with a happy face in the Sahara desert.",
33
+ "A small cactus with a sad face in the Sahara desert.",
34
+ ]
35
+
36
+ # Others
37
+ batch_size = 2
38
+ seed = 42
39
+ save_dir = "./outputs/samples2/"
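cfg_scale = 7.0 above is the classifier-free guidance weight. The usual combination rule, shown here as a generic sketch rather than the repo's exact sampler code:

import torch

def classifier_free_guidance(cond_pred: torch.Tensor, uncond_pred: torch.Tensor, cfg_scale: float = 7.0) -> torch.Tensor:
    # Guided prediction = unconditional + scale * (conditional - unconditional).
    return uncond_pred + cfg_scale * (cond_pred - uncond_pred)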
configs/pixart/train/16x256x256.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
configs/pixart/train/1x2048x2048.py ADDED
@@ -0,0 +1,54 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv",
5
+ num_frames=1,
6
+ frame_interval=3,
7
+ image_size=(2048, 2048),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="PixArt-1B/2",
20
+ space_scale=4.0,
21
+ no_temporal_pos_emb=True,
22
+ from_pretrained="PixArt-1B-2.pth",
23
+ enable_flash_attn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+
27
+ vae = dict(
28
+ type="VideoAutoencoderKL",
29
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
30
+ subfolder="vae",
31
+ )
32
+ text_encoder = dict(
33
+ type="t5",
34
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
35
+ model_max_length=300,
36
+ )
37
+ scheduler = dict(
38
+ type="iddpm",
39
+ timestep_respacing="",
40
+ )
41
+
42
+ # Others
43
+ seed = 42
44
+ outputs = "outputs"
45
+ wandb = False
46
+
47
+ epochs = 1000
48
+ log_every = 10
49
+ ckpt_every = 1000
50
+ load = None
51
+
52
+ batch_size = 4
53
+ lr = 2e-5
54
+ grad_clip = 1.0
configs/pixart/train/1x512x512-rflow.py ADDED
@@ -0,0 +1,55 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=1,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="PixArt-XL/2",
20
+ space_scale=1.0,
21
+ time_scale=1.0,
22
+ no_temporal_pos_emb=True,
23
+ # from_pretrained="PixArt-XL-2-512x512.pth",
24
+ from_pretrained="PRETRAINED_MODEL",
25
+ enable_flash_attn=True,
26
+ enable_layernorm_kernel=True,
27
+ )
28
+ vae = dict(
29
+ type="VideoAutoencoderKL",
30
+ from_pretrained="stabilityai/sd-vae-ft-ema",
31
+ )
32
+ text_encoder = dict(
33
+ type="t5",
34
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
35
+ model_max_length=120,
36
+ shardformer=True,
37
+ )
38
+ scheduler = dict(
39
+ type="rflow",
40
+ # timestep_respacing="",
41
+ )
42
+
43
+ # Others
44
+ seed = 42
45
+ outputs = "outputs"
46
+ wandb = True
47
+
48
+ epochs = 2
49
+ log_every = 10
50
+ ckpt_every = 1000
51
+ load = None
52
+
53
+ batch_size = 64
54
+ lr = 2e-5
55
+ grad_clip = 1.0
configs/pixart/train/1x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
21
  time_scale=1.0,
22
  no_temporal_pos_emb=True,
23
  from_pretrained="PixArt-XL-2-512x512.pth",
24
- enable_flashattn=True,
25
  enable_layernorm_kernel=True,
26
  )
27
  vae = dict(
 
21
  time_scale=1.0,
22
  no_temporal_pos_emb=True,
23
  from_pretrained="PixArt-XL-2-512x512.pth",
24
+ enable_flash_attn=True,
25
  enable_layernorm_kernel=True,
26
  )
27
  vae = dict(
configs/pixart/train/64x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
21
  space_scale=1.0,
22
  time_scale=2 / 3,
23
  from_pretrained=None,
24
- enable_flashattn=True,
25
  enable_layernorm_kernel=True,
26
  )
27
  vae = dict(
 
21
  space_scale=1.0,
22
  time_scale=2 / 3,
23
  from_pretrained=None,
24
+ enable_flash_attn=True,
25
  enable_layernorm_kernel=True,
26
  )
27
  vae = dict(
configs/vae/inference/image.py ADDED
@@ -0,0 +1,32 @@
1
+ image_size = (256, 256)
2
+ num_frames = 1
3
+
4
+ dtype = "bf16"
5
+ batch_size = 1
6
+ seed = 42
7
+ save_dir = "samples/vae_video"
8
+ cal_stats = True
9
+ log_stats_every = 100
10
+
11
+ # Define dataset
12
+ dataset = dict(
13
+ type="VideoTextDataset",
14
+ data_path=None,
15
+ num_frames=num_frames,
16
+ image_size=image_size,
17
+ )
18
+ num_samples = 100
19
+ num_workers = 4
20
+
21
+ # Define model
22
+ model = dict(
23
+ type="OpenSoraVAE_V1_2",
24
+ from_pretrained="pretrained_models/vae-pipeline",
25
+ micro_frame_size=None,
26
+ micro_batch_size=4,
27
+ cal_loss=True,
28
+ )
29
+
30
+ # loss weights
31
+ perceptual_loss_weight = 0.1 # applied only when the VGG perceptual model is not None and this weight is > 0
32
+ kl_loss_weight = 1e-6
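The two weights at the bottom of this config combine into the usual VAE objective: reconstruction plus a small KL term, plus an optional perceptual (VGG-based) term when its weight is positive. A schematic combination, not the repo's exact loss code:

def vae_loss(recon_loss, kl_loss, perceptual_loss=None,
             perceptual_loss_weight=0.1, kl_loss_weight=1e-6):
    # Weighted sum of the loss terms; the perceptual term is skipped when unused.
    total = recon_loss + kl_loss_weight * kl_loss
    if perceptual_loss is not None and perceptual_loss_weight > 0:
        total = total + perceptual_loss_weight * perceptual_loss
    return total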
configs/vae/inference/video.py ADDED
@@ -0,0 +1,32 @@
1
+ image_size = (256, 256)
2
+ num_frames = 17
3
+
4
+ dtype = "bf16"
5
+ batch_size = 1
6
+ seed = 42
7
+ save_dir = "samples/vae_video"
8
+ cal_stats = True
9
+ log_stats_every = 100
10
+
11
+ # Define dataset
12
+ dataset = dict(
13
+ type="VideoTextDataset",
14
+ data_path=None,
15
+ num_frames=num_frames,
16
+ image_size=image_size,
17
+ )
18
+ num_samples = 100
19
+ num_workers = 4
20
+
21
+ # Define model
22
+ model = dict(
23
+ type="OpenSoraVAE_V1_2",
24
+ from_pretrained="pretrained_models/vae-pipeline",
25
+ micro_frame_size=None,
26
+ micro_batch_size=4,
27
+ cal_loss=True,
28
+ )
29
+
30
+ # loss weights
31
+ perceptual_loss_weight = 0.1 # applied only when the VGG perceptual model is not None and this weight is > 0
32
+ kl_loss_weight = 1e-6
configs/vae/train/image.py ADDED
@@ -0,0 +1,58 @@
1
+ num_frames = 1
2
+ image_size = (256, 256)
3
+
4
+ # Define dataset
5
+ dataset = dict(
6
+ type="VideoTextDataset",
7
+ data_path=None,
8
+ num_frames=num_frames,
9
+ frame_interval=1,
10
+ image_size=image_size,
11
+ )
12
+
13
+ # Define acceleration
14
+ num_workers = 16
15
+ dtype = "bf16"
16
+ grad_checkpoint = True
17
+ plugin = "zero2"
18
+
19
+ # Define model
20
+ model = dict(
21
+ type="VideoAutoencoderPipeline",
22
+ freeze_vae_2d=True,
23
+ from_pretrained=None,
24
+ cal_loss=True,
25
+ vae_2d=dict(
26
+ type="VideoAutoencoderKL",
27
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
28
+ subfolder="vae",
29
+ local_files_only=True,
30
+ ),
31
+ vae_temporal=dict(
32
+ type="VAE_Temporal_SD",
33
+ from_pretrained=None,
34
+ ),
35
+ )
36
+
37
+ # loss weights
38
+ perceptual_loss_weight = 0.0 # applied only when the VGG perceptual model is not None and this weight is > 0
39
+ kl_loss_weight = 1e-6
40
+
41
+ mixed_image_ratio = 0.1
42
+ use_real_rec_loss = False
43
+ use_z_rec_loss = True
44
+ use_image_identity_loss = True
45
+
46
+ # Others
47
+ seed = 42
48
+ outputs = "outputs"
49
+ wandb = False
50
+
51
+ epochs = 100
52
+ log_every = 1
53
+ ckpt_every = 1000
54
+ load = None
55
+
56
+ batch_size = 1
57
+ lr = 1e-5
58
+ grad_clip = 1.0
configs/vae/train/video.py ADDED
@@ -0,0 +1,58 @@
1
+ num_frames = 33
2
+ image_size = (256, 256)
3
+
4
+ # Define dataset
5
+ dataset = dict(
6
+ type="VideoTextDataset",
7
+ data_path=None,
8
+ num_frames=num_frames,
9
+ frame_interval=1,
10
+ image_size=image_size,
11
+ )
12
+
13
+ # Define acceleration
14
+ num_workers = 16
15
+ dtype = "bf16"
16
+ grad_checkpoint = True
17
+ plugin = "zero2"
18
+
19
+ # Define model
20
+ model = dict(
21
+ type="VideoAutoencoderPipeline",
22
+ freeze_vae_2d=False,
23
+ from_pretrained=None,
24
+ cal_loss=True,
25
+ vae_2d=dict(
26
+ type="VideoAutoencoderKL",
27
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
28
+ subfolder="vae",
29
+ local_files_only=True,
30
+ ),
31
+ vae_temporal=dict(
32
+ type="VAE_Temporal_SD",
33
+ from_pretrained=None,
34
+ ),
35
+ )
36
+
37
+ # loss weights
38
+ perceptual_loss_weight = 0.1 # applied only when the VGG perceptual model is not None and this weight is > 0
39
+ kl_loss_weight = 1e-6
40
+
41
+ mixed_image_ratio = 0.2
42
+ use_real_rec_loss = True
43
+ use_z_rec_loss = False
44
+ use_image_identity_loss = False
45
+
46
+ # Others
47
+ seed = 42
48
+ outputs = "outputs"
49
+ wandb = False
50
+
51
+ epochs = 100
52
+ log_every = 1
53
+ ckpt_every = 1000
54
+ load = None
55
+
56
+ batch_size = 1
57
+ lr = 1e-5
58
+ grad_clip = 1.0
configs/vae/train/video_disc.py ADDED
@@ -0,0 +1,75 @@
1
+ num_frames = 17
2
+ image_size = (256, 256)
3
+
4
+ # Define dataset
5
+ dataset = dict(
6
+ type="VideoTextDataset",
7
+ data_path=None,
8
+ num_frames=num_frames,
9
+ frame_interval=1,
10
+ image_size=image_size,
11
+ )
12
+
13
+ # Define acceleration
14
+ num_workers = 16
15
+ dtype = "bf16"
16
+ grad_checkpoint = True
17
+ plugin = "zero2"
18
+
19
+ # Define model
20
+ model = dict(
21
+ type="VideoAutoencoderPipeline",
22
+ freeze_vae_2d=False,
23
+ from_pretrained=None,
24
+ cal_loss=True,
25
+ vae_2d=dict(
26
+ type="VideoAutoencoderKL",
27
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
28
+ subfolder="vae",
29
+ local_files_only=True,
30
+ ),
31
+ vae_temporal=dict(
32
+ type="VAE_Temporal_SD",
33
+ from_pretrained=None,
34
+ ),
35
+ )
36
+
37
+ discriminator = dict(
38
+ type="NLayerDiscriminator",
39
+ from_pretrained="/home/shenchenhui/opensoraplan-v1.0.0-discriminator.pt",
40
+ input_nc=3,
41
+ n_layers=3,
42
+ use_actnorm=False,
43
+ )
44
+
45
+ # discriminator hyper-parameters (TODO)
46
+ discriminator_factor = 1
47
+ discriminator_start = -1
48
+ generator_factor = 0.5
49
+ generator_loss_type = "hinge"
50
+ discriminator_loss_type = "hinge"
51
+ lecam_loss_weight = None
52
+ gradient_penalty_loss_weight = None
53
+
54
+ # loss weights
55
+ perceptual_loss_weight = 0.1 # applied only when the VGG perceptual model is not None and this weight is > 0
56
+ kl_loss_weight = 1e-6
57
+
58
+ mixed_image_ratio = 0.2
59
+ use_real_rec_loss = True
60
+ use_z_rec_loss = False
61
+ use_image_identity_loss = False
62
+
63
+ # Others
64
+ seed = 42
65
+ outputs = "outputs"
66
+ wandb = False
67
+
68
+ epochs = 100
69
+ log_every = 1
70
+ ckpt_every = 1000
71
+ load = None
72
+
73
+ batch_size = 1
74
+ lr = 1e-5
75
+ grad_clip = 1.0
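generator_loss_type and discriminator_loss_type are both "hinge" in this config. The standard hinge formulation is shown below as a generic sketch; the repo's discriminator training code is not part of this diff:

import torch
import torch.nn.functional as F

def hinge_d_loss(real_logits: torch.Tensor, fake_logits: torch.Tensor) -> torch.Tensor:
    # Discriminator: push real logits above +1 and fake logits below -1.
    return 0.5 * (F.relu(1.0 - real_logits).mean() + F.relu(1.0 + fake_logits).mean())

def hinge_g_loss(fake_logits: torch.Tensor) -> torch.Tensor:
    # Generator: raise the discriminator's score on generated samples.
    return -fake_logits.mean()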
requirements.txt CHANGED
@@ -1,4 +1,4 @@
1
  xformers
2
  transformers
3
  pandarallel
4
- git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
 
1
  xformers
2
  transformers
3
  pandarallel
4
+ git+https://github.com/FrankLeeeee/test-space.git