frankleeeee committed on
Commit
e6d2ce0
1 Parent(s): 6099edb
Files changed (47)
  1. configs/dit/train/16x256x256.py +1 -1
  2. configs/dit/train/1x256x256.py +1 -1
  3. configs/latte/train/16x256x256.py +1 -1
  4. configs/opensora-v1-1/inference/sample-ref.py +19 -17
  5. configs/opensora-v1-1/inference/sample.py +3 -2
  6. configs/opensora-v1-1/train/benchmark.py +2 -1
  7. configs/opensora-v1-1/train/image.py +2 -1
  8. configs/opensora-v1-1/train/image_rflow.py +88 -0
  9. configs/opensora-v1-1/train/stage1.py +11 -10
  10. configs/opensora-v1-1/train/stage2.py +11 -10
  11. configs/opensora-v1-1/train/stage3.py +11 -10
  12. configs/opensora-v1-1/train/video.py +2 -1
  13. configs/opensora-v1-2/inference/sample.py +42 -0
  14. configs/opensora-v1-2/misc/bs.py +117 -0
  15. configs/opensora-v1-2/misc/eval_loss.py +49 -0
  16. configs/opensora-v1-2/misc/extract.py +62 -0
  17. configs/opensora-v1-2/misc/feat.py +94 -0
  18. configs/opensora-v1-2/train/adapt.py +84 -0
  19. configs/opensora-v1-2/train/stage1.py +111 -0
  20. configs/opensora-v1-2/train/stage1_feat.py +59 -0
  21. configs/opensora-v1-2/train/stage2.py +90 -0
  22. configs/opensora-v1-2/train/stage3.py +92 -0
  23. configs/opensora/inference/16x256x256.py +1 -1
  24. configs/opensora/inference/16x512x512-rflow.py +35 -0
  25. configs/opensora/inference/16x512x512.py +1 -1
  26. configs/opensora/inference/64x512x512.py +1 -1
  27. configs/opensora/train/16x256x256-mask.py +3 -3
  28. configs/opensora/train/16x256x256-spee-rflow.py +64 -0
  29. configs/opensora/train/16x256x256-spee.py +3 -3
  30. configs/opensora/train/16x256x256.py +2 -2
  31. configs/opensora/train/16x512x512.py +1 -1
  32. configs/opensora/train/360x512x512.py +1 -1
  33. configs/opensora/train/64x512x512-sp.py +1 -1
  34. configs/opensora/train/64x512x512.py +1 -1
  35. configs/pixart/inference/1x20481B.py +36 -0
  36. configs/pixart/inference/1x2048MS.py +36 -0
  37. configs/pixart/inference/1x512x512-rflow.py +39 -0
  38. configs/pixart/train/16x256x256.py +1 -1
  39. configs/pixart/train/1x2048x2048.py +54 -0
  40. configs/pixart/train/1x512x512-rflow.py +55 -0
  41. configs/pixart/train/1x512x512.py +1 -1
  42. configs/pixart/train/64x512x512.py +1 -1
  43. configs/vae/inference/image.py +32 -0
  44. configs/vae/inference/video.py +32 -0
  45. configs/vae/train/stage1.py +59 -0
  46. configs/vae/train/stage2.py +59 -0
  47. configs/vae/train/stage3.py +58 -0
configs/dit/train/16x256x256.py CHANGED
@@ -18,7 +18,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     from_pretrained="DiT-XL-2-256x256.pt",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/dit/train/1x256x256.py CHANGED
@@ -19,7 +19,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     no_temporal_pos_emb=True,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/latte/train/16x256x256.py CHANGED
@@ -17,7 +17,7 @@ sp_size = 1
 # Define model
 model = dict(
     type="Latte-XL/2",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/inference/sample-ref.py CHANGED
@@ -7,33 +7,35 @@ multi_resolution = "STDiT2"
 # Condition
 prompt_path = None
 prompt = [
-    "A car driving on the ocean.",
-    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. The crashing blue waters create white-tipped waves, while the golden light of the setting sun illuminates the rocky shore. A small island with a lighthouse sits in the distance, and green shrubbery covers the cliff\'s edge. The steep drop from the road down to the beach is a dramatic feat, with the cliff\'s edges jutting out over the sea. This is a view that captures the raw beauty of the coast and the rugged landscape of the Pacific Coast Highway.{"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
-    "In an ornate, historical hall, a massive tidal wave peaks and begins to crash. Two surfers, seizing the moment, skillfully navigate the face of the wave.",
+    'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
+    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
+    'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
+    'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
+    'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
+    '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
 ]
 
 loop = 2
 condition_frame_length = 4
-reference_path = [
-    "https://cdn.openai.com/tmp/s/interp/d0.mp4",
-    None,
-    "assets/images/condition/wave.png",
-]
-# valid when reference_path is not None
-# (loop id, ref id, ref start, length, target start)
-mask_strategy = [
-    "0,0,0,0,8,0.3",
-    None,
-    "0",
-]
+# (
+#     loop id, [the loop index of the condition image or video]
+#     reference id, [the index of the condition image or video in the reference_path]
+#     reference start, [the start frame of the condition image or video]
+#     target start, [the location to insert]
+#     length, [the number of frames to insert]
+#     edit_ratio [the edit rate of the condition image or video]
+# )
+# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
+# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
 
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained=None,
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
     input_sq_size=512,
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
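
Note: a `mask_strategy` string is a semicolon-separated list of comma-separated tuples following the six-field layout documented in the comment block above. The parser below is a minimal, hypothetical sketch of that format (the `MaskSpec` class, field defaults, and integer coercion are illustrative assumptions, not code from the repository):

# Hypothetical sketch: parse a mask_strategy string such as "0;0,1,0,-1,1".
# Field order follows the comment block above; defaults are assumptions.
from dataclasses import dataclass

@dataclass
class MaskSpec:
    loop_id: int = 0         # loop index the condition applies to
    ref_id: int = 0          # index of the condition in reference_path
    ref_start: int = 0       # start frame within the reference
    target_start: int = 0    # where to insert in the generated clip
    length: int = 1          # number of frames to insert
    edit_ratio: float = 0.0  # edit rate of the condition

def parse_mask_strategy(strategy: str) -> list[MaskSpec]:
    specs = []
    for part in strategy.split(";"):
        fields = [float(x) for x in part.split(",")] if part else []
        spec = MaskSpec(*[int(v) for v in fields[:5]])
        if len(fields) == 6:
            spec.edit_ratio = fields[5]
        specs.append(spec)
    return specs

print(parse_mask_strategy("0;0,1,0,-1,1"))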
configs/opensora-v1-1/inference/sample.py CHANGED
@@ -7,10 +7,11 @@ multi_resolution = "STDiT2"
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained=None,
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
     input_sq_size=512,
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/benchmark.py CHANGED
@@ -65,7 +65,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/image.py CHANGED
@@ -29,7 +29,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/image_rflow.py ADDED
@@ -0,0 +1,88 @@
+# Define dataset
+# dataset = dict(
+#     type="VariableVideoTextDataset",
+#     data_path=None,
+#     num_frames=None,
+#     frame_interval=3,
+#     image_size=(None, None),
+#     transform_name="resize_crop",
+# )
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=1,
+    image_size=(256, 256),
+    transform_name="center",
+)
+bucket_config = {  # 6s/it
+    "256": {1: (1.0, 256)},
+    "512": {1: (1.0, 80)},
+    "480p": {1: (1.0, 52)},
+    "1024": {1: (1.0, 20)},
+    "1080p": {1: (1.0, 8)},
+}
+
+# Define acceleration
+num_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+# model = dict(
+#     type="DiT-XL/2",
+#     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
+#     # input_sq_size=512,  # pretrained model is trained on 512x512
+#     enable_flash_attn=True,
+#     enable_layernorm_kernel=True,
+# )
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+# model = dict(
+#     type="DiT-XL/2",
+#     # space_scale=1.0,
+#     # time_scale=1.0,
+#     no_temporal_pos_emb=True,
+#     # from_pretrained="PixArt-XL-2-512x512.pth",
+#     from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
+#     enable_flash_attn=True,
+#     enable_layernorm_kernel=True,
+# )
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=200,
+    shardformer=True,
+)
+scheduler = dict(
+    type="rflow",
+    # timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 10
+log_every = 10
+ckpt_every = 500
+load = None
+
+batch_size = 100  # only for logging
+lr = 2e-5
+grad_clip = 1.0
configs/opensora-v1-1/train/stage1.py CHANGED
@@ -16,15 +16,15 @@ bucket_config = { # 1s/it
     "1024": {1: (0.3, 3)},
 }
 mask_ratios = {
-    "mask_no": 0.75,
-    "mask_quarter_random": 0.025,
-    "mask_quarter_head": 0.025,
-    "mask_quarter_tail": 0.025,
-    "mask_quarter_head_tail": 0.05,
-    "mask_image_random": 0.025,
-    "mask_image_head": 0.025,
-    "mask_image_tail": 0.025,
-    "mask_image_head_tail": 0.05,
+    "identity": 0.75,
+    "quarter_random": 0.025,
+    "quarter_head": 0.025,
+    "quarter_tail": 0.025,
+    "quarter_head_tail": 0.05,
+    "image_random": 0.025,
+    "image_head": 0.025,
+    "image_tail": 0.025,
+    "image_head_tail": 0.05,
 }
 
 # Define acceleration
@@ -41,7 +41,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
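
Note: after the rename, the `mask_ratios` values here sum to exactly 1.0, so every training sample draws one mask type. A minimal sketch of how such a table can be consumed (weighted sampling; an illustration, not the repository's actual sampler):

# Sketch: draw one mask type per sample according to mask_ratios.
import random

mask_ratios = {
    "identity": 0.75,
    "quarter_random": 0.025,
    "quarter_head": 0.025,
    "quarter_tail": 0.025,
    "quarter_head_tail": 0.05,
    "image_random": 0.025,
    "image_head": 0.025,
    "image_tail": 0.025,
    "image_head_tail": 0.05,
}
assert abs(sum(mask_ratios.values()) - 1.0) < 1e-8

def sample_mask_type(ratios: dict[str, float]) -> str:
    return random.choices(list(ratios), weights=list(ratios.values()), k=1)[0]

print(sample_mask_type(mask_ratios))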
configs/opensora-v1-1/train/stage2.py CHANGED
@@ -18,15 +18,15 @@ bucket_config = { # 7s/it
     "1080p": {1: (0.4, 8)},
 }
 mask_ratios = {
-    "mask_no": 0.75,
-    "mask_quarter_random": 0.025,
-    "mask_quarter_head": 0.025,
-    "mask_quarter_tail": 0.025,
-    "mask_quarter_head_tail": 0.05,
-    "mask_image_random": 0.025,
-    "mask_image_head": 0.025,
-    "mask_image_tail": 0.025,
-    "mask_image_head_tail": 0.05,
+    "identity": 0.75,
+    "quarter_random": 0.025,
+    "quarter_head": 0.025,
+    "quarter_tail": 0.025,
+    "quarter_head_tail": 0.05,
+    "image_random": 0.025,
+    "image_head": 0.025,
+    "image_tail": 0.025,
+    "image_head_tail": 0.05,
 }
 
 # Define acceleration
@@ -43,7 +43,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/stage3.py CHANGED
@@ -18,15 +18,15 @@ bucket_config = { # 13s/it
     "1024": {1: (0.3, 40)},
 }
 mask_ratios = {
-    "mask_no": 0.75,
-    "mask_quarter_random": 0.025,
-    "mask_quarter_head": 0.025,
-    "mask_quarter_tail": 0.025,
-    "mask_quarter_head_tail": 0.05,
-    "mask_image_random": 0.025,
-    "mask_image_head": 0.025,
-    "mask_image_tail": 0.025,
-    "mask_image_head_tail": 0.05,
+    "identity": 0.75,
+    "quarter_random": 0.025,
+    "quarter_head": 0.025,
+    "quarter_tail": 0.025,
+    "quarter_head_tail": 0.05,
+    "image_random": 0.025,
+    "image_head": 0.025,
+    "image_tail": 0.025,
+    "image_head_tail": 0.05,
 }
 
 # Define acceleration
@@ -43,7 +43,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/video.py CHANGED
@@ -31,7 +31,8 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
    qk_norm=True,
-    enable_flashattn=True,
+    qk_norm_legacy=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-2/inference/sample.py ADDED
@@ -0,0 +1,42 @@
+resolution = "240p"
+aspect_ratio = "9:16"
+num_frames = 51
+fps = 24
+frame_interval = 1
+save_fps = 24
+
+save_dir = "./samples/samples/"
+seed = 42
+batch_size = 1
+multi_resolution = "STDiT2"
+dtype = "bf16"
+condition_frame_length = 5
+align = 5
+
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    num_sampling_steps=30,
+    cfg_scale=7.0,
+)
+
+aes = 6.5
+flow = None
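
Note: these configs are plain Python files of module-level variables. A minimal sketch of inspecting one with an mmengine-style loader (assuming `mmengine` is installed and the path exists in a repository checkout; the loader Open-Sora actually wraps may differ):

# Sketch: read a Python-variable config with mmengine's Config.
from mmengine.config import Config

cfg = Config.fromfile("configs/opensora-v1-2/inference/sample.py")
print(cfg.model["type"])           # "STDiT3-XL/2"
print(cfg.scheduler["cfg_scale"])  # 7.0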
configs/opensora-v1-2/misc/bs.py ADDED
@@ -0,0 +1,117 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# == Config 1: Webvid ==
+# base: (512, 408), 12s/it
+grad_checkpoint = True
+base = ("512", "408")
+base_step_time = 12
+bucket_config = {
+    "144p": {
+        1: (475, 0),
+        51: (51, 0),
+        102: (27, 0),
+        204: (13, 0),
+        408: (6, 0),
+    },
+    # ---
+    "240p": {
+        1: (297, 200),  # 8.25
+        51: (20, 0),
+        102: (10, 0),
+        204: (5, 0),
+        408: (2, 0),
+    },
+    # ---
+    "512": {
+        1: (141, 0),
+        51: (8, 0),
+        102: (4, 0),
+        204: (2, 0),
+        408: (1, 0),
+    },
+    # ---
+    "480p": {
+        1: (89, 0),
+        51: (5, 0),
+        102: (2, 0),
+        204: (1, 0),
+    },
+    # ---
+    "1024": {
+        1: (36, 0),
+        51: (1, 0),
+    },
+    # ---
+    "1080p": {1: (5, 0)},
+    # ---
+    "2048": {1: (5, 0)},
+}
+
+# == Config 1 ==
+# base: (512, 408), 16s/it
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+mask_ratios = {
+    "random": 0.2,
+    "intepolate": 0.01,
+    "quarter_random": 0.01,
+    "quarter_head": 0.01,
+    "quarter_tail": 0.01,
+    "quarter_head_tail": 0.01,
+    "image_random": 0.05,
+    "image_head": 0.1,
+    "image_tail": 0.05,
+    "image_head_tail": 0.05,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 2e-4
+ema_decay = 0.99
+adam_eps = 1e-15
configs/opensora-v1-2/misc/eval_loss.py ADDED
@@ -0,0 +1,49 @@
+num_workers = 8
+dtype = "bf16"
+seed = 42
+num_eval_timesteps = 10
+
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+bucket_config = {
+    "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)},
+    # ---
+    "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)},
+    # ---
+    "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)},
+    # ---
+    "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)},
+    # ---
+    "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)},
+    # ---
+    "1080p": {1: (None, 10)},
+    # ---
+    "2048": {1: (None, 5)},
+}
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+    local_files_only=True,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    local_files_only=True,
+)
+scheduler = dict(type="rflow")
configs/opensora-v1-2/misc/extract.py ADDED
@@ -0,0 +1,62 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# webvid
+bucket_config = {  # 12s/it
+    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
+    # ---
+    "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
+    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
+    # ---
+    "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
+    "512": {1: (0.1, 141)},
+    # ---
+    "480p": {1: (0.1, 89)},
+    # ---
+    "720p": {1: (0.05, 36)},
+    "1024": {1: (0.05, 36)},
+    # ---
+    "1080p": {1: (0.1, 5)},
+    # ---
+    "2048": {1: (0.1, 5)},
+}
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+seed = 42
+outputs = "outputs"
+wandb = False
+
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained="/mnt/nfs-206/zangwei/opensora/outputs/1091-STDiT3-XL-2/epoch0-global_step8500",
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=32,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+
+# feature extraction settings
+save_text_features = True
+save_compressed_text_features = True
+bin_size = 250  # 1GB, 4195 bins
+log_time = False
configs/opensora-v1-2/misc/feat.py ADDED
@@ -0,0 +1,94 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+    dummy_text_feature=True,
+)
+
+# webvid
+bucket_config = {  # 12s/it
+    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
+    # ---
+    "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
+    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
+    # ---
+    "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
+    "512": {1: (0.1, 141)},
+    # ---
+    "480p": {1: (0.1, 89)},
+    # ---
+    "720p": {1: (0.05, 36)},
+    "1024": {1: (0.05, 36)},
+    # ---
+    "1080p": {1: (0.1, 5)},
+    # ---
+    "2048": {1: (0.1, 5)},
+}
+
+grad_checkpoint = True
+
+load_text_features = True
+
+# Acceleration settings
+num_workers = 0
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+    skip_y_embedder=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+mask_ratios = {
+    "random": 0.2,
+    "intepolate": 0.01,
+    "quarter_random": 0.01,
+    "quarter_head": 0.01,
+    "quarter_tail": 0.01,
+    "quarter_head_tail": 0.01,
+    "image_random": 0.05,
+    "image_head": 0.1,
+    "image_tail": 0.05,
+    "image_head_tail": 0.05,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 1
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 2e-4
+ema_decay = 0.99
+adam_eps = 1e-15
configs/opensora-v1-2/train/adapt.py ADDED
@@ -0,0 +1,84 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+bucket_config = {  # 2s/it
+    "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)},
+    # ---
+    "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
+    "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
+    # ---
+    "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
+    "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
+    # ---
+    "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)},
+    # ---
+    "720p": {1: (0.1, 2)},
+    "1024": {1: (0.1, 2)},
+    # ---
+    "1080p": {1: (0.1, 1)},
+}
+grad_checkpoint = False
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="pretrained_models/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+mask_ratios = {
+    "random": 0.2,
+    "intepolate": 0.01,
+    "quarter_random": 0.01,
+    "quarter_head": 0.01,
+    "quarter_tail": 0.01,
+    "quarter_head_tail": 0.01,
+    "image_random": 0.05,
+    "image_head": 0.1,
+    "image_tail": 0.05,
+    "image_head_tail": 0.05,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
configs/opensora-v1-2/train/stage1.py ADDED
@@ -0,0 +1,111 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# backup
+# bucket_config = {  # 20s/it
+#     "144p": {1: (1.0, 100), 51: (1.0, 30), 102: (1.0, 20), 204: (1.0, 8), 408: (1.0, 4)},
+#     # ---
+#     "256": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
+#     "240p": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
+#     # ---
+#     "360p": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
+#     "512": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
+#     # ---
+#     "480p": {1: (0.5, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)},
+#     # ---
+#     "720p": {1: (0.2, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
+#     "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
+#     # ---
+#     "1080p": {1: (0.1, 10)},
+#     # ---
+#     "2048": {1: (0.1, 5)},
+# }
+
+# webvid
+bucket_config = {  # 12s/it
+    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
+    # ---
+    "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
+    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
+    # ---
+    "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
+    "512": {1: (0.1, 141)},
+    # ---
+    "480p": {1: (0.1, 89)},
+    # ---
+    "720p": {1: (0.05, 36)},
+    "1024": {1: (0.05, 36)},
+    # ---
+    "1080p": {1: (0.1, 5)},
+    # ---
+    "2048": {1: (0.1, 5)},
+}
+
+grad_checkpoint = True
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+mask_ratios = {
+    "random": 0.05,
+    "intepolate": 0.005,
+    "quarter_random": 0.005,
+    "quarter_head": 0.005,
+    "quarter_tail": 0.005,
+    "quarter_head_tail": 0.005,
+    "image_random": 0.025,
+    "image_head": 0.05,
+    "image_tail": 0.025,
+    "image_head_tail": 0.025,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 200
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
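
Note: each `bucket_config` entry maps a resolution bucket to `{num_frames: (keep_prob, batch_size)}`, and in these configs `keep_prob` may itself be a pair for high-cost buckets. The sketch below illustrates the keep/drop decision; treating the first element of a pair as the keep probability is an assumption for illustration, not the verified bucket-sampler logic:

# Sketch: decide whether a clip stays in a (resolution, num_frames) bucket.
import random

bucket_config = {
    "240p": {1: (0.3, 297), 51: (0.4, 20), 408: ((0.4, 0.1), 2)},
}

def keep_in_bucket(resolution: str, num_frames: int) -> tuple[bool, int]:
    prob, batch_size = bucket_config[resolution][num_frames]
    if isinstance(prob, tuple):  # assumption: first element is the keep prob
        prob = prob[0]
    return random.random() < prob, batch_size

print(keep_in_bucket("240p", 51))  # e.g. (True, 20)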
configs/opensora-v1-2/train/stage1_feat.py ADDED
@@ -0,0 +1,59 @@
+# Dataset settings
+dataset = dict(type="BatchFeatureDataset")
+grad_checkpoint = True
+num_workers = 4
+
+# Acceleration settings
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+    skip_y_embedder=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+vae_out_channels = 4
+model_max_length = 300
+text_encoder_output_dim = 4096
+load_video_features = True
+load_text_features = True
+
+# Mask settings
+mask_ratios = {
+    "random": 0.2,
+    "intepolate": 0.01,
+    "quarter_random": 0.01,
+    "quarter_head": 0.01,
+    "quarter_tail": 0.01,
+    "quarter_head_tail": 0.01,
+    "image_random": 0.05,
+    "image_head": 0.1,
+    "image_tail": 0.05,
+    "image_head_tail": 0.05,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 500
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 2e-4
+ema_decay = 0.99
+adam_eps = 1e-15
configs/opensora-v1-2/train/stage2.py ADDED
@@ -0,0 +1,90 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# webvid
+bucket_config = {  # 12s/it
+    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
+    # ---
+    "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)},
+    "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)},
+    # ---
+    "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)},
+    "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)},
+    # ---
+    "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)},
+    # ---
+    "720p": {1: (0.1, 36), 51: (0.03, 1)},
+    "1024": {1: (0.1, 36), 51: (0.02, 1)},
+    # ---
+    "1080p": {1: (0.01, 5)},
+    # ---
+    "2048": {1: (0.01, 5)},
+}
+
+grad_checkpoint = True
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+mask_ratios = {
+    "random": 0.05,
+    "intepolate": 0.005,
+    "quarter_random": 0.005,
+    "quarter_head": 0.005,
+    "quarter_tail": 0.005,
+    "quarter_head_tail": 0.005,
+    "image_random": 0.025,
+    "image_head": 0.05,
+    "image_tail": 0.025,
+    "image_head_tail": 0.025,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 200
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
configs/opensora-v1-2/train/stage3.py ADDED
@@ -0,0 +1,92 @@
+# Dataset settings
+dataset = dict(
+    type="VariableVideoTextDataset",
+    transform_name="resize_crop",
+)
+
+# webvid
+bucket_config = {  # 20s/it
+    "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)},
+    # ---
+    "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)},
+    "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)},
+    # ---
+    "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)},
+    "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)},
+    # ---
+    "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)},
+    # ---
+    "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)},
+    "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)},
+    # ---
+    "1080p": {1: (0.1, 5)},
+    # ---
+    "2048": {1: (0.05, 5)},
+}
+
+grad_checkpoint = True
+
+# Acceleration settings
+num_workers = 8
+num_bucket_build_workers = 16
+dtype = "bf16"
+plugin = "zero2"
+
+# Model settings
+model = dict(
+    type="STDiT3-XL/2",
+    from_pretrained=None,
+    qk_norm=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    freeze_y_embedder=True,
+)
+vae = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
+    micro_frame_size=17,
+    micro_batch_size=4,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+    shardformer=True,
+    local_files_only=True,
+)
+scheduler = dict(
+    type="rflow",
+    use_timestep_transform=True,
+    sample_method="logit-normal",
+)
+
+# Mask settings
+# 25%
+mask_ratios = {
+    "random": 0.01,
+    "intepolate": 0.002,
+    "quarter_random": 0.002,
+    "quarter_head": 0.002,
+    "quarter_tail": 0.002,
+    "quarter_head_tail": 0.002,
+    "image_random": 0.0,
+    "image_head": 0.22,
+    "image_tail": 0.005,
+    "image_head_tail": 0.005,
+}
+
+# Log settings
+seed = 42
+outputs = "outputs"
+wandb = False
+epochs = 1000
+log_every = 10
+ckpt_every = 200
+
+# optimization settings
+load = None
+grad_clip = 1.0
+lr = 1e-4
+ema_decay = 0.99
+adam_eps = 1e-15
+warmup_steps = 1000
configs/opensora/inference/16x256x256.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/inference/16x512x512-rflow.py ADDED
@@ -0,0 +1,35 @@
+num_frames = 16
+fps = 24 // 3
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+    micro_batch_size=2,
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="rflow",
+    num_sampling_steps=10,
+    cfg_scale=7.0,
+)
+dtype = "bf16"
+
+# Others
+batch_size = 2
+seed = 42
+prompt_path = "./assets/texts/t2v_samples.txt"
+save_dir = "./outputs/samples/"
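
Note: `type="rflow"` selects rectified-flow sampling: the model predicts a velocity field and sampling integrates from noise to data in `num_sampling_steps` Euler steps with classifier-free guidance at `cfg_scale`. A toy sketch of that loop (the `velocity` callable, time convention, and guidance form are placeholders for illustration, not Open-Sora's scheduler):

# Toy rectified-flow Euler sampler with classifier-free guidance.
import torch

def sample_rflow(velocity, shape, num_sampling_steps=10, cfg_scale=7.0):
    x = torch.randn(shape)  # start from pure noise at t = 1
    ts = torch.linspace(1.0, 0.0, num_sampling_steps + 1)
    for t0, t1 in zip(ts[:-1], ts[1:]):
        v_cond = velocity(x, t0, cond=True)     # text-conditioned velocity
        v_uncond = velocity(x, t0, cond=False)  # null-prompt velocity
        v = v_uncond + cfg_scale * (v_cond - v_uncond)
        x = x + (t1 - t0) * v  # Euler step toward t = 0 (data)
    return x

# Dummy velocity field so the sketch runs end to end.
v = lambda x, t, cond: -x
print(sample_rflow(v, (1, 4, 16, 64, 64)).shape)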
configs/opensora/inference/16x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/inference/64x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/train/16x256x256-mask.py CHANGED
@@ -20,12 +20,12 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
-    "mask_no": 0.7,
-    "mask_random": 0.15,
+    "identity": 0.7,
+    "random": 0.15,
     "mask_head": 0.05,
     "mask_tail": 0.05,
     "mask_head_tail": 0.05,
configs/opensora/train/16x256x256-spee-rflow.py ADDED
@@ -0,0 +1,64 @@
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=16,
+    frame_interval=3,
+    image_size=(256, 256),
+)
+
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="STDiT-XL/2",
+    space_scale=0.5,
+    time_scale=1.0,
+    # from_pretrained="PixArt-XL-2-512x512.pth",
+    # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth",
+    # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth",
+    from_pretrained="PRETRAINED_MODEL",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+# mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
+# mask_ratios = {
+#     "identity": 0.9,
+#     "random": 0.06,
+#     "mask_head": 0.01,
+#     "mask_tail": 0.01,
+#     "mask_head_tail": 0.02,
+# }
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="rflow",
+    # timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = True
+
+epochs = 1
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 16
+lr = 2e-5
+grad_clip = 1.0
configs/opensora/train/16x256x256-spee.py CHANGED
@@ -20,12 +20,12 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
-    "mask_no": 0.5,
-    "mask_random": 0.29,
+    "identity": 0.5,
+    "random": 0.29,
     "mask_head": 0.07,
     "mask_tail": 0.07,
     "mask_head_tail": 0.07,
configs/opensora/train/16x256x256.py CHANGED
@@ -8,7 +8,7 @@ dataset = dict(
 )
 
 # Define acceleration
-num_workers = 4
+num_workers = 0
 dtype = "bf16"
 grad_checkpoint = True
 plugin = "zero2"
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora/train/16x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=1.0,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora/train/360x512x512.py CHANGED
@@ -26,7 +26,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
configs/opensora/train/64x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/inference/1x20481B.py ADDED
@@ -0,0 +1,36 @@
+num_frames = 1
+fps = 1
+image_size = (2560, 1536)
+# image_size = (2048, 2048)
+
+model = dict(
+    type="PixArt-1B/2",
+    from_pretrained="PixArt-1B-2.pth",
+    space_scale=4,
+    no_temporal_pos_emb=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    base_size=2048 // 8,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+    subfolder="vae",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=14,
+    cfg_scale=4.5,
+)
+dtype = "bf16"
+
+# Others
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2i_sigma.txt"
+save_dir = "./samples/samples/"
configs/pixart/inference/1x2048MS.py ADDED
@@ -0,0 +1,36 @@
+num_frames = 1
+fps = 1
+image_size = (2560, 1536)
+# image_size = (2048, 2048)
+
+model = dict(
+    type="PixArt-XL/2",
+    from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth",
+    space_scale=4,
+    no_temporal_pos_emb=True,
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+    base_size=2048 // 8,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+    subfolder="vae",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="dpm-solver",
+    num_sampling_steps=14,
+    cfg_scale=4.5,
+)
+dtype = "bf16"
+
+# Others
+batch_size = 1
+seed = 42
+prompt_path = "./assets/texts/t2i_sigma.txt"
+save_dir = "./samples/samples/"
configs/pixart/inference/1x512x512-rflow.py ADDED
@@ -0,0 +1,39 @@
+num_frames = 1
+fps = 1
+image_size = (512, 512)
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PRETRAINED_MODEL",
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+)
+scheduler = dict(
+    type="rflow",
+    num_sampling_steps=20,
+    cfg_scale=7.0,
+)
+dtype = "bf16"
+
+# prompt_path = "./assets/texts/t2i_samples.txt"
+prompt = [
+    "Pirate ship trapped in a cosmic maelstrom nebula.",
+    "A small cactus with a happy face in the Sahara desert.",
+    "A small cactus with a sad face in the Sahara desert.",
+]
+
+# Others
+batch_size = 2
+seed = 42
+save_dir = "./outputs/samples2/"
configs/pixart/train/16x256x256.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/train/1x2048x2048.py ADDED
@@ -0,0 +1,54 @@
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv",
+    num_frames=1,
+    frame_interval=3,
+    image_size=(2048, 2048),
+)
+
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="PixArt-1B/2",
+    space_scale=4.0,
+    no_temporal_pos_emb=True,
+    from_pretrained="PixArt-1B-2.pth",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+    subfolder="vae",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=300,
+)
+scheduler = dict(
+    type="iddpm",
+    timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = False
+
+epochs = 1000
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 4
+lr = 2e-5
+grad_clip = 1.0
configs/pixart/train/1x512x512-rflow.py ADDED
@@ -0,0 +1,55 @@
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=1,
+    frame_interval=3,
+    image_size=(512, 512),
+)
+
+# Define acceleration
+num_workers = 4
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+sp_size = 1
+
+# Define model
+model = dict(
+    type="PixArt-XL/2",
+    space_scale=1.0,
+    time_scale=1.0,
+    no_temporal_pos_emb=True,
+    # from_pretrained="PixArt-XL-2-512x512.pth",
+    from_pretrained="PRETRAINED_MODEL",
+    enable_flash_attn=True,
+    enable_layernorm_kernel=True,
+)
+vae = dict(
+    type="VideoAutoencoderKL",
+    from_pretrained="stabilityai/sd-vae-ft-ema",
+)
+text_encoder = dict(
+    type="t5",
+    from_pretrained="DeepFloyd/t5-v1_1-xxl",
+    model_max_length=120,
+    shardformer=True,
+)
+scheduler = dict(
+    type="rflow",
+    # timestep_respacing="",
+)
+
+# Others
+seed = 42
+outputs = "outputs"
+wandb = True
+
+epochs = 2
+log_every = 10
+ckpt_every = 1000
+load = None
+
+batch_size = 64
+lr = 2e-5
+grad_clip = 1.0
configs/pixart/train/1x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/train/64x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flashattn=True,
+    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/vae/inference/image.py ADDED
@@ -0,0 +1,32 @@
+image_size = (256, 256)
+num_frames = 1
+
+dtype = "bf16"
+batch_size = 1
+seed = 42
+save_dir = "samples/vae_video"
+cal_stats = True
+log_stats_every = 100
+
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=num_frames,
+    image_size=image_size,
+)
+num_samples = 100
+num_workers = 4
+
+# Define model
+model = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=None,
+    micro_batch_size=4,
+    cal_loss=True,
+)
+
+# loss weights
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
+kl_loss_weight = 1e-6
configs/vae/inference/video.py ADDED
@@ -0,0 +1,32 @@
+image_size = (256, 256)
+num_frames = 17
+
+dtype = "bf16"
+batch_size = 1
+seed = 42
+save_dir = "samples/vae_video"
+cal_stats = True
+log_stats_every = 100
+
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=num_frames,
+    image_size=image_size,
+)
+num_samples = 100
+num_workers = 4
+
+# Define model
+model = dict(
+    type="OpenSoraVAE_V1_2",
+    from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+    micro_frame_size=None,
+    micro_batch_size=4,
+    cal_loss=True,
+)
+
+# loss weights
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
+kl_loss_weight = 1e-6
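
Note: the two weights at the bottom control how the reconstruction objective is assembled when `cal_loss=True`. A hedged sketch of a standard composition (the `lpips` callable stands in for whatever perceptual network is actually used; the exact loss in Open-Sora may differ):

# Sketch: combine reconstruction, perceptual, and KL terms with these weights.
import torch
import torch.nn.functional as F

perceptual_loss_weight = 0.1
kl_loss_weight = 1e-6

def vae_loss(x, x_rec, mean, logvar, lpips=None):
    rec = F.l1_loss(x_rec, x)
    kl = -0.5 * torch.mean(1 + logvar - mean.pow(2) - logvar.exp())
    total = rec + kl_loss_weight * kl
    if lpips is not None and perceptual_loss_weight > 0:
        total = total + perceptual_loss_weight * lpips(x_rec, x).mean()
    return total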
configs/vae/train/stage1.py ADDED
@@ -0,0 +1,59 @@
+num_frames = 17
+image_size = (256, 256)
+
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=num_frames,
+    frame_interval=1,
+    image_size=image_size,
+)
+
+# Define acceleration
+num_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+
+# Define model
+model = dict(
+    type="VideoAutoencoderPipeline",
+    freeze_vae_2d=True,
+    from_pretrained=None,
+    cal_loss=True,
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+
+# loss weights
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
+kl_loss_weight = 1e-6
+
+mixed_strategy = "mixed_video_image"
+mixed_image_ratio = 0.2
+use_real_rec_loss = False
+use_z_rec_loss = True
+use_image_identity_loss = True
+
+# Others
+seed = 42
+outputs = "outputs/vae_stage1"
+wandb = False
+
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
+log_every = 1
+ckpt_every = 1000
+load = None
+
+batch_size = 1
+lr = 1e-5
+grad_clip = 1.0
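
Note: `mixed_strategy = "mixed_video_image"` with `mixed_image_ratio = 0.2` suggests that roughly one batch in five is collapsed to a single frame so the 3D VAE retains its image reconstruction ability. The sketch below encodes that reading; the actual semantics of the strategy string are an assumption, not verified against the implementation:

# Sketch: with probability mixed_image_ratio, train on the first frame only.
# The exact meaning of "mixed_video_image" is assumed, not verified.
import random
import torch

mixed_image_ratio = 0.2

def maybe_collapse_to_image(video: torch.Tensor) -> torch.Tensor:
    # video: (B, C, T, H, W)
    if random.random() < mixed_image_ratio:
        return video[:, :, :1]  # single-frame (image) batch
    return video

print(maybe_collapse_to_image(torch.randn(2, 3, 17, 64, 64)).shape)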
configs/vae/train/stage2.py ADDED
@@ -0,0 +1,59 @@
+num_frames = 17
+image_size = (256, 256)
+
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=num_frames,
+    frame_interval=1,
+    image_size=image_size,
+)
+
+# Define acceleration
+num_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+
+# Define model
+model = dict(
+    type="VideoAutoencoderPipeline",
+    freeze_vae_2d=False,
+    from_pretrained="outputs/vae_stage1",
+    cal_loss=True,
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+
+# loss weights
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
+kl_loss_weight = 1e-6
+
+mixed_strategy = "mixed_video_image"
+mixed_image_ratio = 0.2
+use_real_rec_loss = False
+use_z_rec_loss = True
+use_image_identity_loss = False
+
+# Others
+seed = 42
+outputs = "outputs/vae_stage2"
+wandb = False
+
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
+log_every = 1
+ckpt_every = 1000
+load = None
+
+batch_size = 1
+lr = 1e-5
+grad_clip = 1.0
configs/vae/train/stage3.py ADDED
@@ -0,0 +1,58 @@
+num_frames = 33
+image_size = (256, 256)
+
+# Define dataset
+dataset = dict(
+    type="VideoTextDataset",
+    data_path=None,
+    num_frames=num_frames,
+    frame_interval=1,
+    image_size=image_size,
+)
+
+# Define acceleration
+num_workers = 16
+dtype = "bf16"
+grad_checkpoint = True
+plugin = "zero2"
+
+# Define model
+model = dict(
+    type="VideoAutoencoderPipeline",
+    freeze_vae_2d=False,
+    from_pretrained="outputs/vae_stage2",
+    cal_loss=True,
+    vae_2d=dict(
+        type="VideoAutoencoderKL",
+        from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+        subfolder="vae",
+        local_files_only=True,
+    ),
+    vae_temporal=dict(
+        type="VAE_Temporal_SD",
+        from_pretrained=None,
+    ),
+)
+
+# loss weights
+perceptual_loss_weight = 0.1  # use vgg is not None and more than 0
+kl_loss_weight = 1e-6
+
+mixed_strategy = "mixed_video_random"
+use_real_rec_loss = True
+use_z_rec_loss = False
+use_image_identity_loss = False
+
+# Others
+seed = 42
+outputs = "outputs/vae_stage3"
+wandb = False
+
+epochs = 100  # NOTE: adjust accordingly w.r.t dataset size
+log_every = 1
+ckpt_every = 1000
+load = None
+
+batch_size = 1
+lr = 1e-5
+grad_clip = 1.0