kadirnar commited on
Commit
fd2cf14
1 Parent(s): 0b92ae1

Upload 59 files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. README.md +8 -8
  2. app.py +630 -288
  3. configs/dit/inference/16x256x256.py +2 -2
  4. configs/dit/inference/1x256x256-class.py +2 -2
  5. configs/dit/inference/1x256x256.py +2 -2
  6. configs/dit/train/16x256x256.py +10 -10
  7. configs/dit/train/1x256x256.py +10 -9
  8. configs/latte/inference/16x256x256-class.py +2 -2
  9. configs/latte/inference/16x256x256.py +2 -2
  10. configs/latte/train/16x256x256.py +9 -9
  11. configs/opensora-v1-1/inference/sample-ref.py +64 -0
  12. configs/opensora-v1-1/inference/sample.py +44 -0
  13. configs/opensora-v1-1/train/benchmark.py +102 -0
  14. configs/opensora-v1-1/train/image.py +66 -0
  15. configs/opensora-v1-1/train/image_rflow.py +88 -0
  16. configs/opensora-v1-1/train/stage1.py +78 -0
  17. configs/opensora-v1-1/train/stage2.py +80 -0
  18. configs/opensora-v1-1/train/stage3.py +80 -0
  19. configs/opensora-v1-1/train/video.py +68 -0
  20. configs/opensora-v1-2/inference/sample.py +42 -0
  21. configs/opensora-v1-2/misc/bs.py +117 -0
  22. configs/opensora-v1-2/misc/eval_loss.py +49 -0
  23. configs/opensora-v1-2/misc/extract.py +62 -0
  24. configs/opensora-v1-2/misc/feat.py +94 -0
  25. configs/opensora-v1-2/train/adapt.py +84 -0
  26. configs/opensora-v1-2/train/stage1.py +111 -0
  27. configs/opensora-v1-2/train/stage1_feat.py +59 -0
  28. configs/opensora-v1-2/train/stage2.py +90 -0
  29. configs/opensora-v1-2/train/stage3.py +92 -0
  30. configs/opensora/inference/16x256x256.py +13 -8
  31. configs/opensora/inference/16x512x512-rflow.py +35 -0
  32. configs/opensora/inference/16x512x512.py +7 -7
  33. configs/opensora/inference/64x512x512.py +5 -5
  34. configs/opensora/train/16x256x256-mask.py +60 -0
  35. configs/opensora/train/16x256x256-spee-rflow.py +64 -0
  36. configs/opensora/train/16x256x256-spee.py +60 -0
  37. configs/opensora/train/16x256x256.py +10 -10
  38. configs/opensora/train/16x512x512.py +11 -11
  39. configs/opensora/train/360x512x512.py +15 -9
  40. configs/opensora/train/64x512x512-sp.py +11 -11
  41. configs/opensora/train/64x512x512.py +10 -10
  42. configs/pixart/inference/16x256x256.py +5 -5
  43. configs/pixart/inference/1x1024MS.py +4 -4
  44. configs/pixart/inference/1x20481B.py +36 -0
  45. configs/pixart/inference/1x2048MS.py +36 -0
  46. configs/pixart/inference/1x256x256.py +4 -4
  47. configs/pixart/inference/1x512x512-rflow.py +39 -0
  48. configs/pixart/inference/1x512x512.py +11 -5
  49. configs/pixart/train/16x256x256.py +11 -11
  50. configs/pixart/train/1x2048x2048.py +54 -0
README.md CHANGED
@@ -1,13 +1,13 @@
1
  ---
2
- title: ComfyUI-A10
3
- emoji: 👕👔👚
4
- colorFrom: yellow
5
- colorTo: red
6
- sdk: docker
7
- sdk_version: 4.36.1
8
  app_file: app.py
9
  pinned: false
10
- license: cc-by-nc-sa-4.0
11
  ---
12
 
13
- Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
 
1
  ---
2
+ title: Open Sora
3
+ emoji:
4
+ colorFrom: blue
5
+ colorTo: purple
6
+ sdk: gradio
7
+ sdk_version: 4.25.0
8
  app_file: app.py
9
  pinned: false
10
+ license: apache-2.0
11
  ---
12
 
13
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -1,311 +1,653 @@
1
- import gradio as gr
2
- from PIL import Image
3
- from src.tryon_pipeline import StableDiffusionXLInpaintPipeline as TryonPipeline
4
- from src.unet_hacked_garmnet import UNet2DConditionModel as UNet2DConditionModel_ref
5
- from src.unet_hacked_tryon import UNet2DConditionModel
6
- from transformers import (
7
- CLIPImageProcessor,
8
- CLIPVisionModelWithProjection,
9
- CLIPTextModel,
10
- CLIPTextModelWithProjection,
11
- )
12
- from diffusers import DDPMScheduler,AutoencoderKL
13
- from typing import List
14
 
15
- import torch
 
 
 
 
 
 
16
  import os
17
- from transformers import AutoTokenizer
18
- import numpy as np
19
- from utils_mask import get_mask_location
20
- from torchvision import transforms
21
- import apply_net
22
- from preprocess.humanparsing.run_parsing import Parsing
23
- from preprocess.openpose.run_openpose import OpenPose
24
- from detectron2.data.detection_utils import convert_PIL_to_numpy,_apply_exif_orientation
25
- from torchvision.transforms.functional import to_pil_image
26
-
27
-
28
- def pil_to_binary_mask(pil_image, threshold=0):
29
- np_image = np.array(pil_image)
30
- grayscale_image = Image.fromarray(np_image).convert("L")
31
- binary_mask = np.array(grayscale_image) > threshold
32
- mask = np.zeros(binary_mask.shape, dtype=np.uint8)
33
- for i in range(binary_mask.shape[0]):
34
- for j in range(binary_mask.shape[1]):
35
- if binary_mask[i,j] == True :
36
- mask[i,j] = 1
37
- mask = (mask*255).astype(np.uint8)
38
- output_mask = Image.fromarray(mask)
39
- return output_mask
40
-
41
-
42
- base_path = 'yisol/IDM-VTON'
43
- example_path = os.path.join(os.path.dirname(__file__), 'example')
44
-
45
- unet = UNet2DConditionModel.from_pretrained(
46
- base_path,
47
- subfolder="unet",
48
- torch_dtype=torch.float16,
49
- )
50
- unet.requires_grad_(False)
51
- tokenizer_one = AutoTokenizer.from_pretrained(
52
- base_path,
53
- subfolder="tokenizer",
54
- revision=None,
55
- use_fast=False,
56
- )
57
- tokenizer_two = AutoTokenizer.from_pretrained(
58
- base_path,
59
- subfolder="tokenizer_2",
60
- revision=None,
61
- use_fast=False,
62
- )
63
- noise_scheduler = DDPMScheduler.from_pretrained(base_path, subfolder="scheduler")
64
 
65
- text_encoder_one = CLIPTextModel.from_pretrained(
66
- base_path,
67
- subfolder="text_encoder",
68
- torch_dtype=torch.float16,
69
- )
70
- text_encoder_two = CLIPTextModelWithProjection.from_pretrained(
71
- base_path,
72
- subfolder="text_encoder_2",
73
- torch_dtype=torch.float16,
74
- )
75
- image_encoder = CLIPVisionModelWithProjection.from_pretrained(
76
- base_path,
77
- subfolder="image_encoder",
78
- torch_dtype=torch.float16,
79
- )
80
- vae = AutoencoderKL.from_pretrained(base_path,
81
- subfolder="vae",
82
- torch_dtype=torch.float16,
83
- )
84
 
85
- # "stabilityai/stable-diffusion-xl-base-1.0",
86
- UNet_Encoder = UNet2DConditionModel_ref.from_pretrained(
87
- base_path,
88
- subfolder="unet_encoder",
89
- torch_dtype=torch.float16,
90
- )
91
 
92
- parsing_model = Parsing(0)
93
- openpose_model = OpenPose(0)
94
-
95
- UNet_Encoder.requires_grad_(False)
96
- image_encoder.requires_grad_(False)
97
- vae.requires_grad_(False)
98
- unet.requires_grad_(False)
99
- text_encoder_one.requires_grad_(False)
100
- text_encoder_two.requires_grad_(False)
101
- tensor_transfrom = transforms.Compose(
102
- [
103
- transforms.ToTensor(),
104
- transforms.Normalize([0.5], [0.5]),
105
- ]
106
- )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
107
 
108
- pipe = TryonPipeline.from_pretrained(
109
- base_path,
110
- unet=unet,
111
- vae=vae,
112
- feature_extractor= CLIPImageProcessor(),
113
- text_encoder = text_encoder_one,
114
- text_encoder_2 = text_encoder_two,
115
- tokenizer = tokenizer_one,
116
- tokenizer_2 = tokenizer_two,
117
- scheduler = noise_scheduler,
118
- image_encoder=image_encoder,
119
- torch_dtype=torch.float16,
120
- )
121
- pipe.unet_encoder = UNet_Encoder
122
-
123
- def start_tryon(dict,garm_img,garment_des,is_checked,is_checked_crop,denoise_steps,seed):
124
- device = "cuda"
125
-
126
- openpose_model.preprocessor.body_estimation.model.to(device)
127
- pipe.to(device)
128
- pipe.unet_encoder.to(device)
129
-
130
- garm_img= garm_img.convert("RGB").resize((768,1024))
131
- human_img_orig = dict["background"].convert("RGB")
132
-
133
- if is_checked_crop:
134
- width, height = human_img_orig.size
135
- target_width = int(min(width, height * (3 / 4)))
136
- target_height = int(min(height, width * (4 / 3)))
137
- left = (width - target_width) / 2
138
- top = (height - target_height) / 2
139
- right = (width + target_width) / 2
140
- bottom = (height + target_height) / 2
141
- cropped_img = human_img_orig.crop((left, top, right, bottom))
142
- crop_size = cropped_img.size
143
- human_img = cropped_img.resize((768,1024))
144
- else:
145
- human_img = human_img_orig.resize((768,1024))
146
 
 
 
 
 
 
 
147
 
148
- if is_checked:
149
- keypoints = openpose_model(human_img.resize((384,512)))
150
- model_parse, _ = parsing_model(human_img.resize((384,512)))
151
- mask, mask_gray = get_mask_location('hd', "upper_body", model_parse, keypoints)
152
- mask = mask.resize((768,1024))
153
- else:
154
- mask = pil_to_binary_mask(dict['layers'][0].convert("RGB").resize((768, 1024)))
155
- # mask = transforms.ToTensor()(mask)
156
- # mask = mask.unsqueeze(0)
157
- mask_gray = (1-transforms.ToTensor()(mask)) * tensor_transfrom(human_img)
158
- mask_gray = to_pil_image((mask_gray+1.0)/2.0)
159
-
160
-
161
- human_img_arg = _apply_exif_orientation(human_img.resize((384,512)))
162
- human_img_arg = convert_PIL_to_numpy(human_img_arg, format="BGR")
163
-
164
-
165
-
166
- args = apply_net.create_argument_parser().parse_args(('show', './configs/densepose_rcnn_R_50_FPN_s1x.yaml', './ckpt/densepose/model_final_162be9.pkl', 'dp_segm', '-v', '--opts', 'MODEL.DEVICE', 'cuda'))
167
- # verbosity = getattr(args, "verbosity", None)
168
- pose_img = args.func(args,human_img_arg)
169
- pose_img = pose_img[:,:,::-1]
170
- pose_img = Image.fromarray(pose_img).resize((768,1024))
171
-
172
- with torch.no_grad():
173
- # Extract the images
174
- with torch.cuda.amp.autocast():
175
- with torch.no_grad():
176
- prompt = "model is wearing " + garment_des
177
- negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
178
- with torch.inference_mode():
179
- (
180
- prompt_embeds,
181
- negative_prompt_embeds,
182
- pooled_prompt_embeds,
183
- negative_pooled_prompt_embeds,
184
- ) = pipe.encode_prompt(
185
- prompt,
186
- num_images_per_prompt=1,
187
- do_classifier_free_guidance=True,
188
- negative_prompt=negative_prompt,
189
- )
190
-
191
- prompt = "a photo of " + garment_des
192
- negative_prompt = "monochrome, lowres, bad anatomy, worst quality, low quality"
193
- if not isinstance(prompt, List):
194
- prompt = [prompt] * 1
195
- if not isinstance(negative_prompt, List):
196
- negative_prompt = [negative_prompt] * 1
197
- with torch.inference_mode():
198
- (
199
- prompt_embeds_c,
200
- _,
201
- _,
202
- _,
203
- ) = pipe.encode_prompt(
204
- prompt,
205
- num_images_per_prompt=1,
206
- do_classifier_free_guidance=False,
207
- negative_prompt=negative_prompt,
208
- )
209
 
 
 
 
 
 
 
 
 
210
 
 
211
 
212
- pose_img = tensor_transfrom(pose_img).unsqueeze(0).to(device,torch.float16)
213
- garm_tensor = tensor_transfrom(garm_img).unsqueeze(0).to(device,torch.float16)
214
- generator = torch.Generator(device).manual_seed(seed) if seed is not None else None
215
- pipe.enable_vae_slicing()
216
- pipe.enable_xformers_memory_efficient_attention()
217
- pipe.enable_vae_tiling()
218
- images = pipe(
219
- prompt_embeds=prompt_embeds.to(device,torch.float16),
220
- negative_prompt_embeds=negative_prompt_embeds.to(device,torch.float16),
221
- pooled_prompt_embeds=pooled_prompt_embeds.to(device,torch.float16),
222
- negative_pooled_prompt_embeds=negative_pooled_prompt_embeds.to(device,torch.float16),
223
- num_inference_steps=denoise_steps,
224
- generator=generator,
225
- strength = 1.0,
226
- pose_img = pose_img.to(device,torch.float16),
227
- text_embeds_cloth=prompt_embeds_c.to(device,torch.float16),
228
- cloth = garm_tensor.to(device,torch.float16),
229
- mask_image=mask,
230
- image=human_img,
231
- height=1024,
232
- width=768,
233
- ip_adapter_image = garm_img.resize((768,1024)),
234
- guidance_scale=2.0,
235
- )[0]
236
-
237
- if is_checked_crop:
238
- out_img = images[0].resize(crop_size)
239
- human_img_orig.paste(out_img, (int(left), int(top)))
240
- return human_img_orig, mask_gray
241
- else:
242
- return images[0], mask_gray
243
- # return images[0], mask_gray
244
-
245
- garm_list = os.listdir(os.path.join(example_path,"cloth"))
246
- garm_list_path = [os.path.join(example_path,"cloth",garm) for garm in garm_list]
247
-
248
- human_list = os.listdir(os.path.join(example_path,"human"))
249
- human_list_path = [os.path.join(example_path,"human",human) for human in human_list]
250
-
251
- human_ex_list = []
252
- for ex_human in human_list_path:
253
- ex_dict= {}
254
- ex_dict['background'] = ex_human
255
- ex_dict['layers'] = None
256
- ex_dict['composite'] = None
257
- human_ex_list.append(ex_dict)
258
-
259
- ##default human
260
-
261
- image_blocks = gr.Blocks().queue()
262
- with image_blocks as demo:
263
- with gr.Row():
264
- with gr.Column():
265
- imgs = gr.ImageEditor(sources='upload', type="pil", label='Human. Mask with pen or use auto-masking', interactive=True)
266
- with gr.Row():
267
- is_checked = gr.Checkbox(label="Yes", info="Use auto-generated mask (Takes 5 seconds)",value=True)
268
- with gr.Row():
269
- is_checked_crop = gr.Checkbox(label="Yes", info="Use auto-crop & resizing",value=False)
270
-
271
- example = gr.Examples(
272
- inputs=imgs,
273
- examples_per_page=10,
274
- examples=human_ex_list
275
- )
276
 
277
- with gr.Column():
278
- garm_img = gr.Image(label="Garment", sources='upload', type="pil")
279
- with gr.Row(elem_id="prompt-container"):
280
- with gr.Row():
281
- prompt = gr.Textbox(placeholder="Description of garment ex) Short Sleeve Round Neck T-shirts", show_label=False, elem_id="prompt")
282
- example = gr.Examples(
283
- inputs=garm_img,
284
- examples_per_page=8,
285
- examples=garm_list_path)
286
- with gr.Column():
287
- # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
288
- masked_img = gr.Image(label="Masked image output", elem_id="masked-img",show_share_button=False)
289
- with gr.Column():
290
- # image_out = gr.Image(label="Output", elem_id="output-img", height=400)
291
- image_out = gr.Image(label="Output", elem_id="output-img",show_share_button=False)
292
 
 
293
 
 
 
 
294
 
 
 
 
 
295
 
296
- with gr.Column():
297
- try_button = gr.Button(value="Try-on")
298
- with gr.Accordion(label="Advanced Settings", open=False):
299
- with gr.Row():
300
- denoise_steps = gr.Number(label="Denoising Steps", minimum=20, maximum=40, value=30, step=1)
301
- seed = gr.Number(label="Seed", minimum=-1, maximum=2147483647, step=1, value=42)
302
 
 
 
303
 
 
304
 
305
- try_button.click(fn=start_tryon, inputs=[imgs, garm_img, prompt, is_checked,is_checked_crop, denoise_steps, seed], outputs=[image_out,masked_img], api_name='tryon')
 
306
 
307
-
 
 
 
 
 
 
 
308
 
309
 
310
- image_blocks.launch()
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #!/usr/bin/env python
2
+ """
3
+ This script runs a Gradio App for the Open-Sora model.
 
 
 
 
 
 
 
 
 
 
4
 
5
+ Usage:
6
+ python demo.py <config-path>
7
+ """
8
+
9
+ import argparse
10
+ import datetime
11
+ import importlib
12
  import os
13
+ import subprocess
14
+ import sys
15
+ from tempfile import NamedTemporaryFile
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
16
 
17
+ import spaces
18
+ import torch
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
19
 
20
+ import gradio as gr
 
 
 
 
 
21
 
22
+ MODEL_TYPES = ["v1.2-stage3"]
23
+ WATERMARK_PATH = "./assets/images/watermark/watermark.png"
24
+ CONFIG_MAP = {
25
+ "v1.2-stage3": "configs/opensora-v1-2/inference/sample.py",
26
+ }
27
+ HF_STDIT_MAP = {"v1.2-stage3": "hpcai-tech/OpenSora-STDiT-v3"}
28
+
29
+
30
+ # ============================
31
+ # Prepare Runtime Environment
32
+ # ============================
33
+ def install_dependencies(enable_optimization=False):
34
+ """
35
+ Install the required dependencies for the demo if they are not already installed.
36
+ """
37
+
38
+ def _is_package_available(name) -> bool:
39
+ try:
40
+ importlib.import_module(name)
41
+ return True
42
+ except (ImportError, ModuleNotFoundError):
43
+ return False
44
+
45
+ if enable_optimization:
46
+ # install flash attention
47
+ if not _is_package_available("flash_attn"):
48
+ subprocess.run(
49
+ f"{sys.executable} -m pip install flash-attn --no-build-isolation",
50
+ env={"FLASH_ATTENTION_SKIP_CUDA_BUILD": "TRUE"},
51
+ shell=True,
52
+ )
53
+
54
+ # install apex for fused layernorm
55
+ if not _is_package_available("apex"):
56
+ subprocess.run(
57
+ f'{sys.executable} -m pip install -v --disable-pip-version-check --no-cache-dir --no-build-isolation --config-settings "--build-option=--cpp_ext" --config-settings "--build-option=--cuda_ext" git+https://github.com/NVIDIA/apex.git',
58
+ shell=True,
59
+ )
60
 
61
+ # install ninja
62
+ if not _is_package_available("ninja"):
63
+ subprocess.run(f"{sys.executable} -m pip install ninja", shell=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
64
 
65
+ # install xformers
66
+ if not _is_package_available("xformers"):
67
+ subprocess.run(
68
+ f"{sys.executable} -m pip install -v -U git+https://github.com/facebookresearch/xformers.git@main#egg=xformers",
69
+ shell=True,
70
+ )
71
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
 
73
+ # ============================
74
+ # Model-related
75
+ # ============================
76
+ def read_config(config_path):
77
+ """
78
+ Read the configuration file.
79
+ """
80
+ from mmengine.config import Config
81
 
82
+ return Config.fromfile(config_path)
83
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
84
 
85
+ def build_models(model_type, config, enable_optimization=False):
86
+ """
87
+ Build the models for the given model type and configuration.
88
+ """
89
+ # build vae
90
+ from opensora.registry import MODELS, build_module
 
 
 
 
 
 
 
 
 
91
 
92
+ vae = build_module(config.vae, MODELS).cuda()
93
 
94
+ # build text encoder
95
+ text_encoder = build_module(config.text_encoder, MODELS) # T5 must be fp32
96
+ text_encoder.t5.model = text_encoder.t5.model.cuda()
97
 
98
+ # build stdit
99
+ # we load model from HuggingFace directly so that we don't need to
100
+ # handle model download logic in HuggingFace Space
101
+ from opensora.models.stdit.stdit3 import STDiT3
102
 
103
+ stdit = STDiT3.from_pretrained(HF_STDIT_MAP[model_type])
104
+ stdit = stdit.cuda()
 
 
 
 
105
 
106
+ # build scheduler
107
+ from opensora.registry import SCHEDULERS
108
 
109
+ scheduler = build_module(config.scheduler, SCHEDULERS)
110
 
111
+ # hack for classifier-free guidance
112
+ text_encoder.y_embedder = stdit.y_embedder
113
 
114
+ # move modelst to device
115
+ vae = vae.to(torch.bfloat16).eval()
116
+ text_encoder.t5.model = text_encoder.t5.model.eval() # t5 must be in fp32
117
+ stdit = stdit.to(torch.bfloat16).eval()
118
+
119
+ # clear cuda
120
+ torch.cuda.empty_cache()
121
+ return vae, text_encoder, stdit, scheduler
122
 
123
 
124
+ def parse_args():
125
+ parser = argparse.ArgumentParser()
126
+ parser.add_argument(
127
+ "--model-type",
128
+ default="v1.2-stage3",
129
+ choices=MODEL_TYPES,
130
+ help=f"The type of model to run for the Gradio App, can only be {MODEL_TYPES}",
131
+ )
132
+ parser.add_argument("--output", default="./outputs", type=str, help="The path to the output folder")
133
+ parser.add_argument("--port", default=None, type=int, help="The port to run the Gradio App on.")
134
+ parser.add_argument("--host", default="0.0.0.0", type=str, help="The host to run the Gradio App on.")
135
+ parser.add_argument("--share", action="store_true", help="Whether to share this gradio demo.")
136
+ parser.add_argument(
137
+ "--enable-optimization",
138
+ action="store_true",
139
+ help="Whether to enable optimization such as flash attention and fused layernorm",
140
+ )
141
+ return parser.parse_args()
142
+
143
+
144
+ # ============================
145
+ # Main Gradio Script
146
+ # ============================
147
+ # as `run_inference` needs to be wrapped by `spaces.GPU` and the input can only be the prompt text
148
+ # so we can't pass the models to `run_inference` as arguments.
149
+ # instead, we need to define them globally so that we can access these models inside `run_inference`
150
+
151
+ # read config
152
+ args = parse_args()
153
+ config = read_config(CONFIG_MAP[args.model_type])
154
+ torch.backends.cuda.matmul.allow_tf32 = True
155
+ torch.backends.cudnn.allow_tf32 = True
156
+
157
+ # make outputs dir
158
+ os.makedirs(args.output, exist_ok=True)
159
+
160
+ # disable torch jit as it can cause failure in gradio SDK
161
+ # gradio sdk uses torch with cuda 11.3
162
+ torch.jit._state.disable()
163
+
164
+ # set up
165
+ install_dependencies(enable_optimization=args.enable_optimization)
166
+
167
+ # import after installation
168
+ from opensora.datasets import IMG_FPS, save_sample
169
+ from opensora.datasets.aspect import get_image_size, get_num_frames
170
+ from opensora.models.text_encoder.t5 import text_preprocessing
171
+ from opensora.utils.inference_utils import (
172
+ add_watermark,
173
+ append_generated,
174
+ append_score_to_prompts,
175
+ apply_mask_strategy,
176
+ collect_references_batch,
177
+ dframe_to_frame,
178
+ extract_json_from_prompts,
179
+ extract_prompts_loop,
180
+ get_random_prompt_by_openai,
181
+ has_openai_key,
182
+ merge_prompt,
183
+ prepare_multi_resolution_info,
184
+ refine_prompts_by_openai,
185
+ split_prompt,
186
+ has_openai_key
187
+ )
188
+ from opensora.utils.misc import to_torch_dtype
189
+
190
+ # some global variables
191
+ dtype = to_torch_dtype(config.dtype)
192
+ device = torch.device("cuda")
193
+
194
+ # build model
195
+ vae, text_encoder, stdit, scheduler = build_models(
196
+ args.model_type, config, enable_optimization=args.enable_optimization
197
+ )
198
+
199
+
200
+ def run_inference(
201
+ mode,
202
+ prompt_text,
203
+ resolution,
204
+ aspect_ratio,
205
+ length,
206
+ motion_strength,
207
+ aesthetic_score,
208
+ use_motion_strength,
209
+ use_aesthetic_score,
210
+ camera_motion,
211
+ reference_image,
212
+ refine_prompt,
213
+ fps,
214
+ num_loop,
215
+ seed,
216
+ sampling_steps,
217
+ cfg_scale,
218
+ ):
219
+ if prompt_text is None or prompt_text == "":
220
+ gr.Warning("Your prompt is empty, please enter a valid prompt")
221
+ return None
222
+
223
+ torch.manual_seed(seed)
224
+ with torch.inference_mode():
225
+ # ======================
226
+ # 1. Preparation arguments
227
+ # ======================
228
+ # parse the inputs
229
+ # frame_interval must be 1 so we ignore it here
230
+ image_size = get_image_size(resolution, aspect_ratio)
231
+
232
+ # compute generation parameters
233
+ if mode == "Text2Image":
234
+ num_frames = 1
235
+ fps = IMG_FPS
236
+ else:
237
+ num_frames = config.num_frames
238
+ num_frames = get_num_frames(length)
239
+
240
+ condition_frame_length = int(num_frames / 17 * 5 / 3)
241
+ condition_frame_edit = 0.0
242
+
243
+ input_size = (num_frames, *image_size)
244
+ latent_size = vae.get_latent_size(input_size)
245
+ multi_resolution = "OpenSora"
246
+ align = 5
247
+
248
+ # == prepare mask strategy ==
249
+ if mode == "Text2Image":
250
+ mask_strategy = [None]
251
+ elif mode == "Text2Video":
252
+ if reference_image is not None:
253
+ mask_strategy = ["0"]
254
+ else:
255
+ mask_strategy = [None]
256
+ else:
257
+ raise ValueError(f"Invalid mode: {mode}")
258
+
259
+ # == prepare reference ==
260
+ if mode == "Text2Image":
261
+ refs = [""]
262
+ elif mode == "Text2Video":
263
+ if reference_image is not None:
264
+ # save image to disk
265
+ from PIL import Image
266
+
267
+ im = Image.fromarray(reference_image)
268
+ temp_file = NamedTemporaryFile(suffix=".png")
269
+ im.save(temp_file.name)
270
+ refs = [temp_file.name]
271
+ else:
272
+ refs = [""]
273
+ else:
274
+ raise ValueError(f"Invalid mode: {mode}")
275
+
276
+ # == get json from prompts ==
277
+ batch_prompts = [prompt_text]
278
+ batch_prompts, refs, mask_strategy = extract_json_from_prompts(batch_prompts, refs, mask_strategy)
279
+
280
+ # == get reference for condition ==
281
+ refs = collect_references_batch(refs, vae, image_size)
282
+
283
+ # == multi-resolution info ==
284
+ model_args = prepare_multi_resolution_info(
285
+ multi_resolution, len(batch_prompts), image_size, num_frames, fps, device, dtype
286
+ )
287
+
288
+ # == process prompts step by step ==
289
+ # 0. split prompt
290
+ # each element in the list is [prompt_segment_list, loop_idx_list]
291
+ batched_prompt_segment_list = []
292
+ batched_loop_idx_list = []
293
+ for prompt in batch_prompts:
294
+ prompt_segment_list, loop_idx_list = split_prompt(prompt)
295
+ batched_prompt_segment_list.append(prompt_segment_list)
296
+ batched_loop_idx_list.append(loop_idx_list)
297
+
298
+ # 1. refine prompt by openai
299
+ if refine_prompt:
300
+ # check if openai key is provided
301
+ if not has_openai_key():
302
+ gr.Warning("OpenAI API key is not provided, the prompt will not be enhanced.")
303
+ else:
304
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
305
+ batched_prompt_segment_list[idx] = refine_prompts_by_openai(prompt_segment_list)
306
+
307
+ # process scores
308
+ aesthetic_score = aesthetic_score if use_aesthetic_score else None
309
+ motion_strength = motion_strength if use_motion_strength and mode != "Text2Image" else None
310
+ camera_motion = None if camera_motion == "none" or mode == "Text2Image" else camera_motion
311
+ # 2. append score
312
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
313
+ batched_prompt_segment_list[idx] = append_score_to_prompts(
314
+ prompt_segment_list,
315
+ aes=aesthetic_score,
316
+ flow=motion_strength,
317
+ camera_motion=camera_motion,
318
+ )
319
+
320
+ # 3. clean prompt with T5
321
+ for idx, prompt_segment_list in enumerate(batched_prompt_segment_list):
322
+ batched_prompt_segment_list[idx] = [text_preprocessing(prompt) for prompt in prompt_segment_list]
323
+
324
+ # 4. merge to obtain the final prompt
325
+ batch_prompts = []
326
+ for prompt_segment_list, loop_idx_list in zip(batched_prompt_segment_list, batched_loop_idx_list):
327
+ batch_prompts.append(merge_prompt(prompt_segment_list, loop_idx_list))
328
+
329
+ # =========================
330
+ # Generate image/video
331
+ # =========================
332
+ video_clips = []
333
+
334
+ for loop_i in range(num_loop):
335
+ # 4.4 sample in hidden space
336
+ batch_prompts_loop = extract_prompts_loop(batch_prompts, loop_i)
337
+
338
+ # == loop ==
339
+ if loop_i > 0:
340
+ refs, mask_strategy = append_generated(
341
+ vae, video_clips[-1], refs, mask_strategy, loop_i, condition_frame_length, condition_frame_edit
342
+ )
343
+
344
+ # == sampling ==
345
+ z = torch.randn(len(batch_prompts), vae.out_channels, *latent_size, device=device, dtype=dtype)
346
+ masks = apply_mask_strategy(z, refs, mask_strategy, loop_i, align=align)
347
+
348
+ # 4.6. diffusion sampling
349
+ # hack to update num_sampling_steps and cfg_scale
350
+ scheduler_kwargs = config.scheduler.copy()
351
+ scheduler_kwargs.pop("type")
352
+ scheduler_kwargs["num_sampling_steps"] = sampling_steps
353
+ scheduler_kwargs["cfg_scale"] = cfg_scale
354
+
355
+ scheduler.__init__(**scheduler_kwargs)
356
+ samples = scheduler.sample(
357
+ stdit,
358
+ text_encoder,
359
+ z=z,
360
+ prompts=batch_prompts_loop,
361
+ device=device,
362
+ additional_args=model_args,
363
+ progress=True,
364
+ mask=masks,
365
+ )
366
+ samples = vae.decode(samples.to(dtype), num_frames=num_frames)
367
+ video_clips.append(samples)
368
+
369
+ # =========================
370
+ # Save output
371
+ # =========================
372
+ video_clips = [val[0] for val in video_clips]
373
+ for i in range(1, num_loop):
374
+ video_clips[i] = video_clips[i][:, dframe_to_frame(condition_frame_length) :]
375
+ video = torch.cat(video_clips, dim=1)
376
+ current_datetime = datetime.datetime.now()
377
+ timestamp = current_datetime.timestamp()
378
+ save_path = os.path.join(args.output, f"output_{timestamp}")
379
+ saved_path = save_sample(video, save_path=save_path, fps=24)
380
+ torch.cuda.empty_cache()
381
+
382
+ # add watermark
383
+ # all watermarked videos should have a _watermarked suffix
384
+ if mode != "Text2Image" and os.path.exists(WATERMARK_PATH):
385
+ watermarked_path = saved_path.replace(".mp4", "_watermarked.mp4")
386
+ success = add_watermark(saved_path, WATERMARK_PATH, watermarked_path)
387
+ if success:
388
+ return watermarked_path
389
+ else:
390
+ return saved_path
391
+ else:
392
+ return saved_path
393
+
394
+
395
+ @spaces.GPU()
396
+ def run_image_inference(
397
+ prompt_text,
398
+ resolution,
399
+ aspect_ratio,
400
+ length,
401
+ motion_strength,
402
+ aesthetic_score,
403
+ use_motion_strength,
404
+ use_aesthetic_score,
405
+ camera_motion,
406
+ reference_image,
407
+ refine_prompt,
408
+ fps,
409
+ num_loop,
410
+ seed,
411
+ sampling_steps,
412
+ cfg_scale,
413
+ ):
414
+ return run_inference(
415
+ "Text2Image",
416
+ prompt_text,
417
+ resolution,
418
+ aspect_ratio,
419
+ length,
420
+ motion_strength,
421
+ aesthetic_score,
422
+ use_motion_strength,
423
+ use_aesthetic_score,
424
+ camera_motion,
425
+ reference_image,
426
+ refine_prompt,
427
+ fps,
428
+ num_loop,
429
+ seed,
430
+ sampling_steps,
431
+ cfg_scale,
432
+ )
433
+
434
+
435
+ @spaces.GPU(duration=200)
436
+ def run_video_inference(
437
+ prompt_text,
438
+ resolution,
439
+ aspect_ratio,
440
+ length,
441
+ motion_strength,
442
+ aesthetic_score,
443
+ use_motion_strength,
444
+ use_aesthetic_score,
445
+ camera_motion,
446
+ reference_image,
447
+ refine_prompt,
448
+ fps,
449
+ num_loop,
450
+ seed,
451
+ sampling_steps,
452
+ cfg_scale,
453
+ ):
454
+ # if (resolution == "480p" and length == "16s") or \
455
+ # (resolution == "720p" and length in ["8s", "16s"]):
456
+ # gr.Warning("Generation is interrupted as the combination of 480p and 16s will lead to CUDA out of memory")
457
+ # else:
458
+ return run_inference(
459
+ "Text2Video",
460
+ prompt_text,
461
+ resolution,
462
+ aspect_ratio,
463
+ length,
464
+ motion_strength,
465
+ aesthetic_score,
466
+ use_motion_strength,
467
+ use_aesthetic_score,
468
+ camera_motion,
469
+ reference_image,
470
+ refine_prompt,
471
+ fps,
472
+ num_loop,
473
+ seed,
474
+ sampling_steps,
475
+ cfg_scale,
476
+ )
477
+
478
+
479
+ def generate_random_prompt():
480
+ if "OPENAI_API_KEY" not in os.environ:
481
+ gr.Warning("Your prompt is empty and the OpenAI API key is not provided, please enter a valid prompt")
482
+ return None
483
+ else:
484
+ prompt_text = get_random_prompt_by_openai()
485
+ return prompt_text
486
+
487
+
488
+ def main():
489
+ # create demo
490
+ with gr.Blocks() as demo:
491
+ with gr.Row():
492
+ with gr.Column():
493
+ gr.HTML(
494
+ """
495
+ <div style='text-align: center;'>
496
+ <p align="center">
497
+ <img src="https://github.com/hpcaitech/Open-Sora/raw/main/assets/readme/icon.png" width="250"/>
498
+ </p>
499
+ <div style="display: flex; gap: 10px; justify-content: center;">
500
+ <a href="https://github.com/hpcaitech/Open-Sora/stargazers"><img src="https://img.shields.io/github/stars/hpcaitech/Open-Sora?style=social"></a>
501
+ <a href="https://hpcaitech.github.io/Open-Sora/"><img src="https://img.shields.io/badge/Gallery-View-orange?logo=&amp"></a>
502
+ <a href="https://discord.gg/kZakZzrSUT"><img src="https://img.shields.io/badge/Discord-join-blueviolet?logo=discord&amp"></a>
503
+ <a href="https://join.slack.com/t/colossalaiworkspace/shared_invite/zt-247ipg9fk-KRRYmUl~u2ll2637WRURVA"><img src="https://img.shields.io/badge/Slack-ColossalAI-blueviolet?logo=slack&amp"></a>
504
+ <a href="https://twitter.com/yangyou1991/status/1769411544083996787?s=61&t=jT0Dsx2d-MS5vS9rNM5e5g"><img src="https://img.shields.io/badge/Twitter-Discuss-blue?logo=twitter&amp"></a>
505
+ <a href="https://raw.githubusercontent.com/hpcaitech/public_assets/main/colossalai/img/WeChat.png"><img src="https://img.shields.io/badge/微信-小助手加群-green?logo=wechat&amp"></a>
506
+ <a href="https://hpc-ai.com/blog/open-sora-v1.0"><img src="https://img.shields.io/badge/Open_Sora-Blog-blue"></a>
507
+ </div>
508
+ <h1 style='margin-top: 5px;'>Open-Sora: Democratizing Efficient Video Production for All</h1>
509
+ </div>
510
+ """
511
+ )
512
+
513
+ with gr.Row():
514
+ with gr.Column():
515
+ prompt_text = gr.Textbox(label="Prompt", placeholder="Describe your video here", lines=4)
516
+ refine_prompt = gr.Checkbox(value=has_openai_key(), label="Refine prompt with GPT4o", interactive=has_openai_key())
517
+ random_prompt_btn = gr.Button("Random Prompt By GPT4o", interactive=has_openai_key())
518
+
519
+ gr.Markdown("## Basic Settings")
520
+ resolution = gr.Radio(
521
+ choices=["144p", "240p", "360p", "480p", "720p"],
522
+ value="240p",
523
+ label="Resolution",
524
+ )
525
+ aspect_ratio = gr.Radio(
526
+ choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
527
+ value="9:16",
528
+ label="Aspect Ratio (H:W)",
529
+ )
530
+ length = gr.Radio(
531
+ choices=["2s", "4s", "8s", "16s"],
532
+ value="2s",
533
+ label="Video Length",
534
+ info="only effective for video generation, 8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time.",
535
+ )
536
+
537
+ with gr.Row():
538
+ seed = gr.Slider(value=1024, minimum=1, maximum=2048, step=1, label="Seed")
539
+
540
+ sampling_steps = gr.Slider(value=30, minimum=1, maximum=200, step=1, label="Sampling steps")
541
+ cfg_scale = gr.Slider(value=7.0, minimum=0.0, maximum=10.0, step=0.1, label="CFG Scale")
542
+
543
+ with gr.Row():
544
+ with gr.Column():
545
+ motion_strength = gr.Slider(
546
+ value=5,
547
+ minimum=0,
548
+ maximum=100,
549
+ step=1,
550
+ label="Motion Strength",
551
+ info="only effective for video generation",
552
+ )
553
+ use_motion_strength = gr.Checkbox(value=False, label="Enable")
554
+
555
+ with gr.Column():
556
+ aesthetic_score = gr.Slider(
557
+ value=6.5,
558
+ minimum=4,
559
+ maximum=7,
560
+ step=0.1,
561
+ label="Aesthetic",
562
+ info="effective for text & video generation",
563
+ )
564
+ use_aesthetic_score = gr.Checkbox(value=True, label="Enable")
565
+
566
+ camera_motion = gr.Radio(
567
+ value="none",
568
+ label="Camera Motion",
569
+ choices=["none", "pan right", "pan left", "tilt up", "tilt down", "zoom in", "zoom out", "static"],
570
+ interactive=True,
571
+ )
572
+
573
+ gr.Markdown("## Advanced Settings")
574
+ with gr.Row():
575
+ fps = gr.Slider(
576
+ value=24,
577
+ minimum=1,
578
+ maximum=60,
579
+ step=1,
580
+ label="FPS",
581
+ info="This is the frames per seconds for video generation, keep it to 24 if you are not sure",
582
+ )
583
+ num_loop = gr.Slider(
584
+ value=1,
585
+ minimum=1,
586
+ maximum=20,
587
+ step=1,
588
+ label="Number of Loops",
589
+ info="This will change the length of the generated video, keep it to 1 if you are not sure",
590
+ )
591
 
592
+ gr.Markdown("## Reference Image")
593
+ reference_image = gr.Image(label="Image (optional)", show_download_button=True)
594
+
595
+ with gr.Column():
596
+ output_video = gr.Video(label="Output Video", height="100%")
597
+
598
+ with gr.Row():
599
+ image_gen_button = gr.Button("Generate image")
600
+ video_gen_button = gr.Button("Generate video")
601
+
602
+ image_gen_button.click(
603
+ fn=run_image_inference,
604
+ inputs=[
605
+ prompt_text,
606
+ resolution,
607
+ aspect_ratio,
608
+ length,
609
+ motion_strength,
610
+ aesthetic_score,
611
+ use_motion_strength,
612
+ use_aesthetic_score,
613
+ camera_motion,
614
+ reference_image,
615
+ refine_prompt,
616
+ fps,
617
+ num_loop,
618
+ seed,
619
+ sampling_steps,
620
+ cfg_scale,
621
+ ],
622
+ outputs=reference_image,
623
+ )
624
+ video_gen_button.click(
625
+ fn=run_video_inference,
626
+ inputs=[
627
+ prompt_text,
628
+ resolution,
629
+ aspect_ratio,
630
+ length,
631
+ motion_strength,
632
+ aesthetic_score,
633
+ use_motion_strength,
634
+ use_aesthetic_score,
635
+ camera_motion,
636
+ reference_image,
637
+ refine_prompt,
638
+ fps,
639
+ num_loop,
640
+ seed,
641
+ sampling_steps,
642
+ cfg_scale,
643
+ ],
644
+ outputs=output_video,
645
+ )
646
+ random_prompt_btn.click(fn=generate_random_prompt, outputs=prompt_text)
647
+
648
+ # launch
649
+ demo.launch(server_port=args.port, server_name=args.host, share=args.share)
650
+
651
+
652
+ if __name__ == "__main__":
653
+ main()
configs/dit/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256-class.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/imagenet_id.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/imagenet_id.txt"
31
+ save_dir = "./samples/samples/"
configs/dit/inference/1x256x256.py CHANGED
@@ -23,10 +23,10 @@ scheduler = dict(
23
  num_sampling_steps=20,
24
  cfg_scale=4.0,
25
  )
26
- dtype = "fp16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/imagenet_labels.txt"
32
- save_dir = "./outputs/samples/"
 
23
  num_sampling_steps=20,
24
  cfg_scale=4.0,
25
  )
26
+ dtype = "bf16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/imagenet_labels.txt"
32
+ save_dir = "./samples/samples/"
configs/dit/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
@@ -18,7 +18,7 @@ sp_size = 1
18
  model = dict(
19
  type="DiT-XL/2",
20
  from_pretrained="DiT-XL-2-256x256.pt",
21
- enable_flashattn=True,
22
  enable_layernorm_kernel=True,
23
  )
24
  vae = dict(
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
18
  model = dict(
19
  type="DiT-XL/2",
20
  from_pretrained="DiT-XL-2-256x256.pt",
21
+ enable_flash_attn=True,
22
  enable_layernorm_kernel=True,
23
  )
24
  vae = dict(
configs/dit/train/1x256x256.py CHANGED
@@ -1,14 +1,15 @@
1
- num_frames = 1
2
- frame_interval = 1
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = True
9
- num_workers = 4
 
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = False
14
  plugin = "zero2"
@@ -18,7 +19,7 @@ sp_size = 1
18
  model = dict(
19
  type="DiT-XL/2",
20
  no_temporal_pos_emb=True,
21
- enable_flashattn=True,
22
  enable_layernorm_kernel=True,
23
  )
24
  vae = dict(
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=1,
6
+ frame_interval=1,
7
+ image_size=(256, 256),
8
+ transform_name="center",
9
+ )
10
 
11
  # Define acceleration
12
+ num_workers = 4
13
  dtype = "bf16"
14
  grad_checkpoint = False
15
  plugin = "zero2"
 
19
  model = dict(
20
  type="DiT-XL/2",
21
  no_temporal_pos_emb=True,
22
+ enable_flash_attn=True,
23
  enable_layernorm_kernel=True,
24
  )
25
  vae = dict(
configs/latte/inference/16x256x256-class.py CHANGED
@@ -21,10 +21,10 @@ scheduler = dict(
21
  num_sampling_steps=20,
22
  cfg_scale=4.0,
23
  )
24
- dtype = "fp16"
25
 
26
  # Others
27
  batch_size = 2
28
  seed = 42
29
  prompt_path = "./assets/texts/ucf101_id.txt"
30
- save_dir = "./outputs/samples/"
 
21
  num_sampling_steps=20,
22
  cfg_scale=4.0,
23
  )
24
+ dtype = "bf16"
25
 
26
  # Others
27
  batch_size = 2
28
  seed = 42
29
  prompt_path = "./assets/texts/ucf101_id.txt"
30
+ save_dir = "./samples/samples/"
configs/latte/inference/16x256x256.py CHANGED
@@ -22,10 +22,10 @@ scheduler = dict(
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
- dtype = "fp16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
- save_dir = "./outputs/samples/"
 
22
  num_sampling_steps=20,
23
  cfg_scale=4.0,
24
  )
25
+ dtype = "bf16"
26
 
27
  # Others
28
  batch_size = 2
29
  seed = 42
30
  prompt_path = "./assets/texts/ucf101_labels.txt"
31
+ save_dir = "./samples/samples/"
configs/latte/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -17,7 +17,7 @@ sp_size = 1
17
  # Define model
18
  model = dict(
19
  type="Latte-XL/2",
20
- enable_flashattn=True,
21
  enable_layernorm_kernel=True,
22
  )
23
  vae = dict(
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
17
  # Define model
18
  model = dict(
19
  type="Latte-XL/2",
20
+ enable_flash_attn=True,
21
  enable_layernorm_kernel=True,
22
  )
23
  vae = dict(
configs/opensora-v1-1/inference/sample-ref.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Condition
8
+ prompt_path = None
9
+ prompt = [
10
+ 'Drone view of waves crashing against the rugged cliffs along Big Sur\'s garay point beach. {"reference_path": "assets/images/condition/cliff.png", "mask_strategy": "0"}',
11
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png","mask_strategy": "0"}',
12
+ 'A car driving on the ocean.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4","mask_strategy": "0,0,-8,0,8"}',
13
+ 'A snowy forest.{"reference_path": "https://cdn.pixabay.com/video/2021/04/25/72171-542991404_large.mp4","mask_strategy": "0,0,0,0,15,0.8"}',
14
+ 'A breathtaking sunrise scene.{"reference_path": "assets/images/condition/sunset1.png;assets/images/condition/sunset2.png","mask_strategy": "0;0,1,0,-1,1"}',
15
+ '|0|a white jeep equipped with a roof rack driving on a dirt road in a coniferous forest.|2|a white jeep equipped with a roof rack driving on a dirt road in the desert.|4|a white jeep equipped with a roof rack driving on a dirt road in a mountain.|6|A white jeep equipped with a roof rack driving on a dirt road in a city.|8|a white jeep equipped with a roof rack driving on a dirt road on the surface of a river.|10|a white jeep equipped with a roof rack driving on a dirt road under the lake.|12|a white jeep equipped with a roof rack flying into the sky.|14|a white jeep equipped with a roof rack driving in the universe. Earth is the background.{"reference_path": "https://cdn.openai.com/tmp/s/interp/d0.mp4", "mask_strategy": "0,0,0,0,15"}',
16
+ ]
17
+
18
+ loop = 2
19
+ condition_frame_length = 4
20
+ # (
21
+ # loop id, [the loop index of the condition image or video]
22
+ # reference id, [the index of the condition image or video in the reference_path]
23
+ # reference start, [the start frame of the condition image or video]
24
+ # target start, [the location to insert]
25
+ # length, [the number of frames to insert]
26
+ # edit_ratio [the edit rate of the condition image or video]
27
+ # )
28
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
29
+ # See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
30
+
31
+ # Define model
32
+ model = dict(
33
+ type="STDiT2-XL/2",
34
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
35
+ input_sq_size=512,
36
+ qk_norm=True,
37
+ qk_norm_legacy=True,
38
+ enable_flash_attn=True,
39
+ enable_layernorm_kernel=True,
40
+ )
41
+ vae = dict(
42
+ type="VideoAutoencoderKL",
43
+ from_pretrained="stabilityai/sd-vae-ft-ema",
44
+ cache_dir=None, # "/mnt/hdd/cached_models",
45
+ micro_batch_size=4,
46
+ )
47
+ text_encoder = dict(
48
+ type="t5",
49
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
50
+ cache_dir=None, # "/mnt/hdd/cached_models",
51
+ model_max_length=200,
52
+ )
53
+ scheduler = dict(
54
+ type="iddpm",
55
+ num_sampling_steps=100,
56
+ cfg_scale=7.0,
57
+ cfg_channel=3, # or None
58
+ )
59
+ dtype = "bf16"
60
+
61
+ # Others
62
+ batch_size = 1
63
+ seed = 42
64
+ save_dir = "./samples/samples/"
configs/opensora-v1-1/inference/sample.py ADDED
@@ -0,0 +1,44 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ num_frames = 16
2
+ frame_interval = 3
3
+ fps = 24
4
+ image_size = (240, 426)
5
+ multi_resolution = "STDiT2"
6
+
7
+ # Define model
8
+ model = dict(
9
+ type="STDiT2-XL/2",
10
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
11
+ input_sq_size=512,
12
+ qk_norm=True,
13
+ qk_norm_legacy=True,
14
+ enable_flash_attn=True,
15
+ enable_layernorm_kernel=True,
16
+ )
17
+ vae = dict(
18
+ type="VideoAutoencoderKL",
19
+ from_pretrained="stabilityai/sd-vae-ft-ema",
20
+ cache_dir=None, # "/mnt/hdd/cached_models",
21
+ micro_batch_size=4,
22
+ )
23
+ text_encoder = dict(
24
+ type="t5",
25
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
26
+ cache_dir=None, # "/mnt/hdd/cached_models",
27
+ model_max_length=200,
28
+ )
29
+ scheduler = dict(
30
+ type="iddpm",
31
+ num_sampling_steps=100,
32
+ cfg_scale=7.0,
33
+ cfg_channel=3, # or None
34
+ )
35
+ dtype = "bf16"
36
+
37
+ # Condition
38
+ prompt_path = "./assets/texts/t2v_samples.txt"
39
+ prompt = None # prompt has higher priority than prompt_path
40
+
41
+ # Others
42
+ batch_size = 1
43
+ seed = 42
44
+ save_dir = "./samples/samples/"
configs/opensora-v1-1/train/benchmark.py ADDED
@@ -0,0 +1,102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # this file is only for batch size search and is not used for training
2
+
3
+ # Define dataset
4
+ dataset = dict(
5
+ type="VariableVideoTextDataset",
6
+ data_path=None,
7
+ num_frames=None,
8
+ frame_interval=3,
9
+ image_size=(None, None),
10
+ transform_name="resize_crop",
11
+ )
12
+
13
+ # bucket config format:
14
+ # 1. { resolution: {num_frames: (prob, batch_size)} }, in this case batch_size is ignored when searching
15
+ # 2. { resolution: {num_frames: (prob, (max_batch_size, ))} }, batch_size is searched in the range [batch_size_start, max_batch_size), batch_size_start is configured via CLI
16
+ # 3. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size)
17
+ # 4. { resolution: {num_frames: (prob, (min_batch_size, max_batch_size, step_size))} }, batch_size is searched in the range [min_batch_size, max_batch_size) with step_size (grid search)
18
+ # 5. { resolution: {num_frames: (0.0, None)} }, this bucket will not be used
19
+
20
+ bucket_config = {
21
+ # == manual search ==
22
+ # "240p": {128: (1.0, 2)}, # 4.28s/it
23
+ # "240p": {64: (1.0, 4)},
24
+ # "240p": {32: (1.0, 8)}, # 4.6s/it
25
+ # "240p": {16: (1.0, 16)}, # 4.6s/it
26
+ # "480p": {16: (1.0, 4)}, # 4.6s/it
27
+ # "720p": {16: (1.0, 2)}, # 5.89s/it
28
+ # "256": {1: (1.0, 256)}, # 4.5s/it
29
+ # "512": {1: (1.0, 96)}, # 4.7s/it
30
+ # "512": {1: (1.0, 128)}, # 6.3s/it
31
+ # "480p": {1: (1.0, 50)}, # 4.0s/it
32
+ # "1024": {1: (1.0, 32)}, # 6.8s/it
33
+ # "1024": {1: (1.0, 20)}, # 4.3s/it
34
+ # "1080p": {1: (1.0, 16)}, # 8.6s/it
35
+ # "1080p": {1: (1.0, 8)}, # 4.4s/it
36
+ # == stage 2 ==
37
+ # "240p": {
38
+ # 16: (1.0, (2, 32)),
39
+ # 32: (1.0, (2, 16)),
40
+ # 64: (1.0, (2, 8)),
41
+ # 128: (1.0, (2, 6)),
42
+ # },
43
+ # "256": {1: (1.0, (128, 300))},
44
+ # "512": {1: (0.5, (64, 128))},
45
+ # "480p": {1: (0.4, (32, 128)), 16: (0.4, (2, 32)), 32: (0.0, None)},
46
+ # "720p": {16: (0.1, (2, 16)), 32: (0.0, None)}, # No examples now
47
+ # "1024": {1: (0.3, (8, 64))},
48
+ # "1080p": {1: (0.3, (2, 32))},
49
+ # == stage 3 ==
50
+ "720p": {1: (20, 40), 32: (0.5, (2, 4)), 64: (0.5, (1, 1))},
51
+ }
52
+
53
+
54
+ # Define acceleration
55
+ num_workers = 4
56
+ num_bucket_build_workers = 16
57
+ dtype = "bf16"
58
+ grad_checkpoint = True
59
+ plugin = "zero2"
60
+ sp_size = 1
61
+
62
+ # Define model
63
+ model = dict(
64
+ type="STDiT2-XL/2",
65
+ from_pretrained=None,
66
+ input_sq_size=512, # pretrained model is trained on 512x512
67
+ qk_norm=True,
68
+ qk_norm_legacy=True,
69
+ enable_flash_attn=True,
70
+ enable_layernorm_kernel=True,
71
+ )
72
+ vae = dict(
73
+ type="VideoAutoencoderKL",
74
+ from_pretrained="stabilityai/sd-vae-ft-ema",
75
+ micro_batch_size=4,
76
+ local_files_only=True,
77
+ )
78
+ text_encoder = dict(
79
+ type="t5",
80
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
81
+ model_max_length=200,
82
+ shardformer=True,
83
+ local_files_only=True,
84
+ )
85
+ scheduler = dict(
86
+ type="iddpm",
87
+ timestep_respacing="",
88
+ )
89
+
90
+ # Others
91
+ seed = 42
92
+ outputs = "outputs"
93
+ wandb = False
94
+
95
+ epochs = 1000
96
+ log_every = 10
97
+ ckpt_every = 1000
98
+ load = None
99
+
100
+ batch_size = None
101
+ lr = 2e-5
102
+ grad_clip = 1.0
configs/opensora-v1-1/train/image.py ADDED
@@ -0,0 +1,66 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 6s/it
11
+ "256": {1: (1.0, 256)},
12
+ "512": {1: (1.0, 80)},
13
+ "480p": {1: (1.0, 52)},
14
+ "1024": {1: (1.0, 20)},
15
+ "1080p": {1: (1.0, 8)},
16
+ }
17
+
18
+ # Define acceleration
19
+ num_workers = 4
20
+ num_bucket_build_workers = 16
21
+ dtype = "bf16"
22
+ grad_checkpoint = True
23
+ plugin = "zero2"
24
+ sp_size = 1
25
+
26
+ # Define model
27
+ model = dict(
28
+ type="STDiT2-XL/2",
29
+ from_pretrained=None,
30
+ input_sq_size=512, # pretrained model is trained on 512x512
31
+ qk_norm=True,
32
+ qk_norm_legacy=True,
33
+ enable_flash_attn=True,
34
+ enable_layernorm_kernel=True,
35
+ )
36
+ vae = dict(
37
+ type="VideoAutoencoderKL",
38
+ from_pretrained="stabilityai/sd-vae-ft-ema",
39
+ micro_batch_size=4,
40
+ local_files_only=True,
41
+ )
42
+ text_encoder = dict(
43
+ type="t5",
44
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
45
+ model_max_length=200,
46
+ shardformer=True,
47
+ local_files_only=True,
48
+ )
49
+ scheduler = dict(
50
+ type="iddpm",
51
+ timestep_respacing="",
52
+ )
53
+
54
+ # Others
55
+ seed = 42
56
+ outputs = "outputs"
57
+ wandb = False
58
+
59
+ epochs = 1000
60
+ log_every = 10
61
+ ckpt_every = 500
62
+ load = None
63
+
64
+ batch_size = 10 # only for logging
65
+ lr = 2e-5
66
+ grad_clip = 1.0
configs/opensora-v1-1/train/image_rflow.py ADDED
@@ -0,0 +1,88 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ # dataset = dict(
3
+ # type="VariableVideoTextDataset",
4
+ # data_path=None,
5
+ # num_frames=None,
6
+ # frame_interval=3,
7
+ # image_size=(None, None),
8
+ # transform_name="resize_crop",
9
+ # )
10
+ dataset = dict(
11
+ type="VideoTextDataset",
12
+ data_path=None,
13
+ num_frames=1,
14
+ frame_interval=1,
15
+ image_size=(256, 256),
16
+ transform_name="center",
17
+ )
18
+ bucket_config = { # 6s/it
19
+ "256": {1: (1.0, 256)},
20
+ "512": {1: (1.0, 80)},
21
+ "480p": {1: (1.0, 52)},
22
+ "1024": {1: (1.0, 20)},
23
+ "1080p": {1: (1.0, 8)},
24
+ }
25
+
26
+ # Define acceleration
27
+ num_workers = 16
28
+ dtype = "bf16"
29
+ grad_checkpoint = True
30
+ plugin = "zero2"
31
+ sp_size = 1
32
+
33
+ # Define model
34
+ # model = dict(
35
+ # type="DiT-XL/2",
36
+ # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
37
+ # # input_sq_size=512, # pretrained model is trained on 512x512
38
+ # enable_flash_attn=True,
39
+ # enable_layernorm_kernel=True,
40
+ # )
41
+ model = dict(
42
+ type="PixArt-XL/2",
43
+ space_scale=1.0,
44
+ time_scale=1.0,
45
+ no_temporal_pos_emb=True,
46
+ from_pretrained="PixArt-XL-2-512x512.pth",
47
+ enable_flash_attn=True,
48
+ enable_layernorm_kernel=True,
49
+ )
50
+ # model = dict(
51
+ # type="DiT-XL/2",
52
+ # # space_scale=1.0,
53
+ # # time_scale=1.0,
54
+ # no_temporal_pos_emb=True,
55
+ # # from_pretrained="PixArt-XL-2-512x512.pth",
56
+ # from_pretrained="/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/PixArt-XL-2-512x512.pth",
57
+ # enable_flash_attn=True,
58
+ # enable_layernorm_kernel=True,
59
+ # )
60
+ vae = dict(
61
+ type="VideoAutoencoderKL",
62
+ from_pretrained="stabilityai/sd-vae-ft-ema",
63
+ micro_batch_size=4,
64
+ )
65
+ text_encoder = dict(
66
+ type="t5",
67
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
68
+ model_max_length=200,
69
+ shardformer=True,
70
+ )
71
+ scheduler = dict(
72
+ type="rflow",
73
+ # timestep_respacing="",
74
+ )
75
+
76
+ # Others
77
+ seed = 42
78
+ outputs = "outputs"
79
+ wandb = False
80
+
81
+ epochs = 10
82
+ log_every = 10
83
+ ckpt_every = 500
84
+ load = None
85
+
86
+ batch_size = 100 # only for logging
87
+ lr = 2e-5
88
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage1.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ # IMG: 1024 (20%) 512 (30%) 256 (50%) drop (50%)
11
+ bucket_config = { # 1s/it
12
+ "144p": {1: (0.5, 48), 16: (1.0, 6), 32: (1.0, 3), 96: (1.0, 1)},
13
+ "256": {1: (0.5, 24), 16: (0.5, 3), 48: (0.5, 1), 64: (0.0, None)},
14
+ "240p": {16: (0.3, 2), 32: (0.3, 1), 64: (0.0, None)},
15
+ "512": {1: (0.4, 12)},
16
+ "1024": {1: (0.3, 3)},
17
+ }
18
+ mask_ratios = {
19
+ "identity": 0.75,
20
+ "quarter_random": 0.025,
21
+ "quarter_head": 0.025,
22
+ "quarter_tail": 0.025,
23
+ "quarter_head_tail": 0.05,
24
+ "image_random": 0.025,
25
+ "image_head": 0.025,
26
+ "image_tail": 0.025,
27
+ "image_head_tail": 0.05,
28
+ }
29
+
30
+ # Define acceleration
31
+ num_workers = 8
32
+ num_bucket_build_workers = 16
33
+ dtype = "bf16"
34
+ grad_checkpoint = False
35
+ plugin = "zero2"
36
+ sp_size = 1
37
+
38
+ # Define model
39
+ model = dict(
40
+ type="STDiT2-XL/2",
41
+ from_pretrained=None,
42
+ input_sq_size=512, # pretrained model is trained on 512x512
43
+ qk_norm=True,
44
+ qk_norm_legacy=True,
45
+ enable_flash_attn=True,
46
+ enable_layernorm_kernel=True,
47
+ )
48
+ vae = dict(
49
+ type="VideoAutoencoderKL",
50
+ from_pretrained="stabilityai/sd-vae-ft-ema",
51
+ micro_batch_size=4,
52
+ local_files_only=True,
53
+ )
54
+ text_encoder = dict(
55
+ type="t5",
56
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
57
+ model_max_length=200,
58
+ shardformer=True,
59
+ local_files_only=True,
60
+ )
61
+ scheduler = dict(
62
+ type="iddpm",
63
+ timestep_respacing="",
64
+ )
65
+
66
+ # Others
67
+ seed = 42
68
+ outputs = "outputs"
69
+ wandb = False
70
+
71
+ epochs = 1000
72
+ log_every = 10
73
+ ckpt_every = 500
74
+ load = None
75
+
76
+ batch_size = None
77
+ lr = 2e-5
78
+ grad_clip = 1.0
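The mask_ratios table above sums to 1.0 and reads naturally as a categorical distribution over temporal-masking strategies; in the later v1.2 configs the entries sum to less than 1, where the leftover mass presumably means no masking. A minimal sketch of that assumed sampling, not the repository's implementation:

    import random

    def sample_mask_strategy(mask_ratios: dict) -> str:
        """Draw one masking strategy; leftover probability mass falls back to 'identity' (no mask)."""
        r, acc = random.random(), 0.0
        for name, prob in mask_ratios.items():
            acc += prob
            if r < acc:
                return name
        return "identity"

    print(sample_mask_strategy({"identity": 0.75, "quarter_head_tail": 0.05, "image_head_tail": 0.05, "quarter_random": 0.15}))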
configs/opensora-v1-1/train/stage2.py ADDED
@@ -0,0 +1,80 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 7s/it
11
+ "144p": {1: (1.0, 48), 16: (1.0, 17), 32: (1.0, 9), 64: (1.0, 4), 128: (1.0, 1)},
12
+ "256": {1: (0.8, 254), 16: (0.5, 17), 32: (0.5, 9), 64: (0.5, 4), 128: (0.5, 1)},
13
+ "240p": {1: (0.1, 20), 16: (0.9, 17), 32: (0.8, 9), 64: (0.8, 4), 128: (0.8, 2)},
14
+ "512": {1: (0.5, 86), 16: (0.2, 4), 32: (0.2, 2), 64: (0.2, 1), 128: (0.0, None)},
15
+ "480p": {1: (0.4, 54), 16: (0.4, 4), 32: (0.0, None)},
16
+ "720p": {1: (0.1, 20), 16: (0.1, 2), 32: (0.0, None)},
17
+ "1024": {1: (0.3, 20)},
18
+ "1080p": {1: (0.4, 8)},
19
+ }
20
+ mask_ratios = {
21
+ "identity": 0.75,
22
+ "quarter_random": 0.025,
23
+ "quarter_head": 0.025,
24
+ "quarter_tail": 0.025,
25
+ "quarter_head_tail": 0.05,
26
+ "image_random": 0.025,
27
+ "image_head": 0.025,
28
+ "image_tail": 0.025,
29
+ "image_head_tail": 0.05,
30
+ }
31
+
32
+ # Define acceleration
33
+ num_workers = 8
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ grad_checkpoint = True
37
+ plugin = "zero2"
38
+ sp_size = 1
39
+
40
+ # Define model
41
+ model = dict(
42
+ type="STDiT2-XL/2",
43
+ from_pretrained=None,
44
+ input_sq_size=512, # pretrained model is trained on 512x512
45
+ qk_norm=True,
46
+ qk_norm_legacy=True,
47
+ enable_flash_attn=True,
48
+ enable_layernorm_kernel=True,
49
+ )
50
+ vae = dict(
51
+ type="VideoAutoencoderKL",
52
+ from_pretrained="stabilityai/sd-vae-ft-ema",
53
+ micro_batch_size=4,
54
+ local_files_only=True,
55
+ )
56
+ text_encoder = dict(
57
+ type="t5",
58
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
59
+ model_max_length=200,
60
+ shardformer=True,
61
+ local_files_only=True,
62
+ )
63
+ scheduler = dict(
64
+ type="iddpm",
65
+ timestep_respacing="",
66
+ )
67
+
68
+ # Others
69
+ seed = 42
70
+ outputs = "outputs"
71
+ wandb = False
72
+
73
+ epochs = 1000
74
+ log_every = 10
75
+ ckpt_every = 500
76
+ load = None
77
+
78
+ batch_size = None
79
+ lr = 2e-5
80
+ grad_clip = 1.0
configs/opensora-v1-1/train/stage3.py ADDED
@@ -0,0 +1,80 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 13s/it
11
+ "144p": {1: (1.0, 200), 16: (1.0, 36), 32: (1.0, 18), 64: (1.0, 9), 128: (1.0, 4)},
12
+ "256": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 11), 64: (0.5, 6), 128: (0.8, 4)},
13
+ "240p": {1: (0.8, 200), 16: (0.5, 22), 32: (0.5, 10), 64: (0.5, 6), 128: (0.5, 3)},
14
+ "360p": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.5, 1)},
15
+ "512": {1: (0.5, 120), 16: (0.5, 9), 32: (0.5, 4), 64: (0.5, 2), 128: (0.8, 1)},
16
+ "480p": {1: (0.4, 80), 16: (0.6, 6), 32: (0.6, 3), 64: (0.6, 1), 128: (0.0, None)},
17
+ "720p": {1: (0.4, 40), 16: (0.6, 3), 32: (0.6, 1), 96: (0.0, None)},
18
+ "1024": {1: (0.3, 40)},
19
+ }
20
+ mask_ratios = {
21
+ "identity": 0.75,
22
+ "quarter_random": 0.025,
23
+ "quarter_head": 0.025,
24
+ "quarter_tail": 0.025,
25
+ "quarter_head_tail": 0.05,
26
+ "image_random": 0.025,
27
+ "image_head": 0.025,
28
+ "image_tail": 0.025,
29
+ "image_head_tail": 0.05,
30
+ }
31
+
32
+ # Define acceleration
33
+ num_workers = 8
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ grad_checkpoint = True
37
+ plugin = "zero2"
38
+ sp_size = 1
39
+
40
+ # Define model
41
+ model = dict(
42
+ type="STDiT2-XL/2",
43
+ from_pretrained=None,
44
+ input_sq_size=512, # pretrained model is trained on 512x512
45
+ qk_norm=True,
46
+ qk_norm_legacy=True,
47
+ enable_flash_attn=True,
48
+ enable_layernorm_kernel=True,
49
+ )
50
+ vae = dict(
51
+ type="VideoAutoencoderKL",
52
+ from_pretrained="stabilityai/sd-vae-ft-ema",
53
+ micro_batch_size=4,
54
+ local_files_only=True,
55
+ )
56
+ text_encoder = dict(
57
+ type="t5",
58
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
59
+ model_max_length=200,
60
+ shardformer=True,
61
+ local_files_only=True,
62
+ )
63
+ scheduler = dict(
64
+ type="iddpm",
65
+ timestep_respacing="",
66
+ )
67
+
68
+ # Others
69
+ seed = 42
70
+ outputs = "outputs"
71
+ wandb = False
72
+
73
+ epochs = 1000
74
+ log_every = 10
75
+ ckpt_every = 500
76
+ load = None
77
+
78
+ batch_size = None
79
+ lr = 2e-5
80
+ grad_clip = 1.0
configs/opensora-v1-1/train/video.py ADDED
@@ -0,0 +1,68 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ data_path=None,
5
+ num_frames=None,
6
+ frame_interval=3,
7
+ image_size=(None, None),
8
+ transform_name="resize_crop",
9
+ )
10
+ bucket_config = { # 6s/it
11
+ "240p": {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)},
12
+ "256": {1: (1.0, 256)},
13
+ "512": {1: (0.5, 80)},
14
+ "480p": {1: (0.4, 52), 16: (0.4, 4), 32: (0.0, None)},
15
+ "720p": {16: (0.1, 2), 32: (0.0, None)}, # No examples now
16
+ "1024": {1: (0.3, 20)},
17
+ "1080p": {1: (0.3, 8)},
18
+ }
19
+
20
+ # Define acceleration
21
+ num_workers = 4
22
+ num_bucket_build_workers = 16
23
+ dtype = "bf16"
24
+ grad_checkpoint = True
25
+ plugin = "zero2"
26
+ sp_size = 1
27
+
28
+ # Define model
29
+ model = dict(
30
+ type="STDiT2-XL/2",
31
+ from_pretrained=None,
32
+ input_sq_size=512, # pretrained model is trained on 512x512
33
+ qk_norm=True,
34
+ qk_norm_legacy=True,
35
+ enable_flash_attn=True,
36
+ enable_layernorm_kernel=True,
37
+ )
38
+ vae = dict(
39
+ type="VideoAutoencoderKL",
40
+ from_pretrained="stabilityai/sd-vae-ft-ema",
41
+ micro_batch_size=4,
42
+ local_files_only=True,
43
+ )
44
+ text_encoder = dict(
45
+ type="t5",
46
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
47
+ model_max_length=200,
48
+ shardformer=True,
49
+ local_files_only=True,
50
+ )
51
+ scheduler = dict(
52
+ type="iddpm",
53
+ timestep_respacing="",
54
+ )
55
+
56
+ # Others
57
+ seed = 42
58
+ outputs = "outputs"
59
+ wandb = False
60
+
61
+ epochs = 1000
62
+ log_every = 10
63
+ ckpt_every = 500
64
+ load = None
65
+
66
+ batch_size = 10 # only for logging
67
+ lr = 2e-5
68
+ grad_clip = 1.0
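These config files are plain Python modules of top-level assignments, so they can be inspected without any framework. A small standard-library sketch (the path is just one of the files in this commit):

    import importlib.util

    def load_config(path: str) -> dict:
        """Execute a config .py file and return its top-level settings as a dict."""
        spec = importlib.util.spec_from_file_location("opensora_config", path)
        module = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(module)
        return {k: v for k, v in vars(module).items() if not k.startswith("_")}

    cfg = load_config("configs/opensora-v1-1/train/video.py")
    print(cfg["bucket_config"]["240p"])  # {16: (1.0, 16), 32: (1.0, 8), 64: (1.0, 4), 128: (1.0, 2)}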
configs/opensora-v1-2/inference/sample.py ADDED
@@ -0,0 +1,42 @@
+ resolution = "240p"
+ aspect_ratio = "9:16"
+ num_frames = 51
+ fps = 24
+ frame_interval = 1
+ save_fps = 24
+
+ save_dir = "./samples/samples/"
+ seed = 42
+ batch_size = 1
+ multi_resolution = "STDiT2"
+ dtype = "bf16"
+ condition_frame_length = 5
+ align = 5
+
+ model = dict(
+ type="STDiT3-XL/2",
+ from_pretrained="hpcai-tech/OpenSora-STDiT-v3",
+ qk_norm=True,
+ enable_flash_attn=True,
+ enable_layernorm_kernel=True,
+ )
+ vae = dict(
+ type="OpenSoraVAE_V1_2",
+ from_pretrained="hpcai-tech/OpenSora-VAE-v1.2",
+ micro_frame_size=17,
+ micro_batch_size=4,
+ )
+ text_encoder = dict(
+ type="t5",
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
+ model_max_length=300,
+ )
+ scheduler = dict(
+ type="rflow",
+ use_timestep_transform=True,
+ num_sampling_steps=30,
+ cfg_scale=7.0,
+ )
+
+ aes = 6.5
+ flow = None
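Two quick consistency checks on the sampling config above: 51 frames at save_fps = 24 is roughly a 2-second clip, and micro_frame_size = 17 divides 51 exactly, which suggests the VAE handles the clip in three temporal chunks (the chunking interpretation is an assumption).

    num_frames, save_fps, micro_frame_size = 51, 24, 17

    print(f"clip length ~ {num_frames / save_fps:.2f} s")  # ~2.12 s
    chunks = [min(micro_frame_size, num_frames - start)
              for start in range(0, num_frames, micro_frame_size)]
    print(chunks)  # [17, 17, 17]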
configs/opensora-v1-2/misc/bs.py ADDED
@@ -0,0 +1,117 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # == Config 1: Webvid ==
8
+ # base: (512, 408), 12s/it
9
+ grad_checkpoint = True
10
+ base = ("512", "408")
11
+ base_step_time = 12
12
+ bucket_config = {
13
+ "144p": {
14
+ 1: (475, 0),
15
+ 51: (51, 0),
16
+ 102: (27, 0),
17
+ 204: (13, 0),
18
+ 408: (6, 0),
19
+ },
20
+ # ---
21
+ "240p": {
22
+ 1: (297, 200), # 8.25
23
+ 51: (20, 0),
24
+ 102: (10, 0),
25
+ 204: (5, 0),
26
+ 408: (2, 0),
27
+ },
28
+ # ---
29
+ "512": {
30
+ 1: (141, 0),
31
+ 51: (8, 0),
32
+ 102: (4, 0),
33
+ 204: (2, 0),
34
+ 408: (1, 0),
35
+ },
36
+ # ---
37
+ "480p": {
38
+ 1: (89, 0),
39
+ 51: (5, 0),
40
+ 102: (2, 0),
41
+ 204: (1, 0),
42
+ },
43
+ # ---
44
+ "1024": {
45
+ 1: (36, 0),
46
+ 51: (1, 0),
47
+ },
48
+ # ---
49
+ "1080p": {1: (5, 0)},
50
+ # ---
51
+ "2048": {1: (5, 0)},
52
+ }
53
+
54
+ # == Config 1 ==
55
+ # base: (512, 408), 16s/it
56
+
57
+ # Acceleration settings
58
+ num_workers = 8
59
+ num_bucket_build_workers = 16
60
+ dtype = "bf16"
61
+ plugin = "zero2"
62
+
63
+ # Model settings
64
+ model = dict(
65
+ type="STDiT3-XL/2",
66
+ from_pretrained=None,
67
+ qk_norm=True,
68
+ enable_flash_attn=True,
69
+ enable_layernorm_kernel=True,
70
+ )
71
+ vae = dict(
72
+ type="OpenSoraVAE_V1_2",
73
+ from_pretrained="pretrained_models/vae-pipeline",
74
+ micro_frame_size=17,
75
+ micro_batch_size=4,
76
+ )
77
+ text_encoder = dict(
78
+ type="t5",
79
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
80
+ model_max_length=300,
81
+ shardformer=True,
82
+ local_files_only=True,
83
+ )
84
+ scheduler = dict(
85
+ type="rflow",
86
+ use_timestep_transform=True,
87
+ sample_method="logit-normal",
88
+ )
89
+
90
+ # Mask settings
91
+ mask_ratios = {
92
+ "random": 0.2,
93
+ "intepolate": 0.01,
94
+ "quarter_random": 0.01,
95
+ "quarter_head": 0.01,
96
+ "quarter_tail": 0.01,
97
+ "quarter_head_tail": 0.01,
98
+ "image_random": 0.05,
99
+ "image_head": 0.1,
100
+ "image_tail": 0.05,
101
+ "image_head_tail": 0.05,
102
+ }
103
+
104
+ # Log settings
105
+ seed = 42
106
+ outputs = "outputs"
107
+ wandb = False
108
+ epochs = 1000
109
+ log_every = 10
110
+ ckpt_every = 500
111
+
112
+ # optimization settings
113
+ load = None
114
+ grad_clip = 1.0
115
+ lr = 2e-4
116
+ ema_decay = 0.99
117
+ adam_eps = 1e-15
configs/opensora-v1-2/misc/eval_loss.py ADDED
@@ -0,0 +1,49 @@
1
+ num_workers = 8
2
+ dtype = "bf16"
3
+ seed = 42
4
+ num_eval_timesteps = 10
5
+
6
+ # Dataset settings
7
+ dataset = dict(
8
+ type="VariableVideoTextDataset",
9
+ transform_name="resize_crop",
10
+ )
11
+
12
+ bucket_config = {
13
+ "144p": {1: (None, 100), 51: (None, 30), 102: (None, 20), 204: (None, 8), 408: (None, 4)},
14
+ # ---
15
+ "240p": {1: (None, 100), 51: (None, 24), 102: (None, 12), 204: (None, 4), 408: (None, 2)},
16
+ # ---
17
+ "360p": {1: (None, 60), 51: (None, 12), 102: (None, 6), 204: (None, 2), 408: (None, 1)},
18
+ # ---
19
+ "480p": {1: (None, 40), 51: (None, 6), 102: (None, 3), 204: (None, 1)},
20
+ # ---
21
+ "720p": {1: (None, 20), 51: (None, 2), 102: (None, 1)},
22
+ # ---
23
+ "1080p": {1: (None, 10)},
24
+ # ---
25
+ "2048": {1: (None, 5)},
26
+ }
27
+
28
+ # Model settings
29
+ model = dict(
30
+ type="STDiT3-XL/2",
31
+ from_pretrained=None,
32
+ qk_norm=True,
33
+ enable_flash_attn=True,
34
+ enable_layernorm_kernel=True,
35
+ )
36
+ vae = dict(
37
+ type="OpenSoraVAE_V1_2",
38
+ from_pretrained="pretrained_models/vae-pipeline",
39
+ micro_frame_size=17,
40
+ micro_batch_size=4,
41
+ local_files_only=True,
42
+ )
43
+ text_encoder = dict(
44
+ type="t5",
45
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
46
+ model_max_length=300,
47
+ local_files_only=True,
48
+ )
49
+ scheduler = dict(type="rflow")
configs/opensora-v1-2/misc/extract.py ADDED
@@ -0,0 +1,62 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 12s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
10
+ # ---
11
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
12
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
13
+ # ---
14
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
15
+ "512": {1: (0.1, 141)},
16
+ # ---
17
+ "480p": {1: (0.1, 89)},
18
+ # ---
19
+ "720p": {1: (0.05, 36)},
20
+ "1024": {1: (0.05, 36)},
21
+ # ---
22
+ "1080p": {1: (0.1, 5)},
23
+ # ---
24
+ "2048": {1: (0.1, 5)},
25
+ }
26
+
27
+ # Acceleration settings
28
+ num_workers = 8
29
+ num_bucket_build_workers = 16
30
+ dtype = "bf16"
31
+ seed = 42
32
+ outputs = "outputs"
33
+ wandb = False
34
+
35
+
36
+ # Model settings
37
+ model = dict(
38
+ type="STDiT3-XL/2",
39
+ from_pretrained="/mnt/nfs-206/zangwei/opensora/outputs/1091-STDiT3-XL-2/epoch0-global_step8500",
40
+ qk_norm=True,
41
+ enable_flash_attn=True,
42
+ enable_layernorm_kernel=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="pretrained_models/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=32,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+
58
+ # feature extraction settings
59
+ save_text_features = True
60
+ save_compressed_text_features = True
61
+ bin_size = 250 # 1GB, 4195 bins
62
+ log_time = False
configs/opensora-v1-2/misc/feat.py ADDED
@@ -0,0 +1,94 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ dummy_text_feature=True,
6
+ )
7
+
8
+ # webvid
9
+ bucket_config = { # 12s/it
10
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
11
+ # ---
12
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
13
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
14
+ # ---
15
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
16
+ "512": {1: (0.1, 141)},
17
+ # ---
18
+ "480p": {1: (0.1, 89)},
19
+ # ---
20
+ "720p": {1: (0.05, 36)},
21
+ "1024": {1: (0.05, 36)},
22
+ # ---
23
+ "1080p": {1: (0.1, 5)},
24
+ # ---
25
+ "2048": {1: (0.1, 5)},
26
+ }
27
+
28
+ grad_checkpoint = True
29
+
30
+ load_text_features = True
31
+
32
+ # Acceleration settings
33
+ num_workers = 0
34
+ num_bucket_build_workers = 16
35
+ dtype = "bf16"
36
+ plugin = "zero2"
37
+
38
+ # Model settings
39
+ model = dict(
40
+ type="STDiT3-XL/2",
41
+ from_pretrained=None,
42
+ qk_norm=True,
43
+ enable_flash_attn=True,
44
+ enable_layernorm_kernel=True,
45
+ freeze_y_embedder=True,
46
+ skip_y_embedder=True,
47
+ )
48
+ vae = dict(
49
+ type="OpenSoraVAE_V1_2",
50
+ from_pretrained="pretrained_models/vae-pipeline",
51
+ micro_frame_size=17,
52
+ micro_batch_size=4,
53
+ )
54
+ text_encoder = dict(
55
+ type="t5",
56
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
57
+ model_max_length=300,
58
+ shardformer=True,
59
+ local_files_only=True,
60
+ )
61
+ scheduler = dict(
62
+ type="rflow",
63
+ use_timestep_transform=True,
64
+ sample_method="logit-normal",
65
+ )
66
+
67
+ # Mask settings
68
+ mask_ratios = {
69
+ "random": 0.2,
70
+ "intepolate": 0.01,
71
+ "quarter_random": 0.01,
72
+ "quarter_head": 0.01,
73
+ "quarter_tail": 0.01,
74
+ "quarter_head_tail": 0.01,
75
+ "image_random": 0.05,
76
+ "image_head": 0.1,
77
+ "image_tail": 0.05,
78
+ "image_head_tail": 0.05,
79
+ }
80
+
81
+ # Log settings
82
+ seed = 42
83
+ outputs = "outputs"
84
+ wandb = False
85
+ epochs = 1000
86
+ log_every = 10
87
+ ckpt_every = 1
88
+
89
+ # optimization settings
90
+ load = None
91
+ grad_clip = 1.0
92
+ lr = 2e-4
93
+ ema_decay = 0.99
94
+ adam_eps = 1e-15
configs/opensora-v1-2/train/adapt.py ADDED
@@ -0,0 +1,84 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+ bucket_config = { # 2s/it
7
+ "144p": {1: (0.5, 48), 34: (1.0, 2), 51: (1.0, 4), 102: (1.0, 2), 204: (1.0, 1)},
8
+ # ---
9
+ "256": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
10
+ "240p": {1: (0.6, 20), 34: (0.5, 2), 51: (0.5, 1), 68: (0.5, 1), 136: (0.0, None)},
11
+ # ---
12
+ "360p": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
13
+ "512": {1: (0.5, 8), 34: (0.2, 1), 102: (0.0, None)},
14
+ # ---
15
+ "480p": {1: (0.2, 4), 17: (0.3, 1), 68: (0.0, None)},
16
+ # ---
17
+ "720p": {1: (0.1, 2)},
18
+ "1024": {1: (0.1, 2)},
19
+ # ---
20
+ "1080p": {1: (0.1, 1)},
21
+ }
22
+ grad_checkpoint = False
23
+
24
+ # Acceleration settings
25
+ num_workers = 8
26
+ num_bucket_build_workers = 16
27
+ dtype = "bf16"
28
+ plugin = "zero2"
29
+
30
+ # Model settings
31
+ model = dict(
32
+ type="STDiT3-XL/2",
33
+ from_pretrained=None,
34
+ qk_norm=True,
35
+ enable_flash_attn=True,
36
+ enable_layernorm_kernel=True,
37
+ )
38
+ vae = dict(
39
+ type="OpenSoraVAE_V1_2",
40
+ from_pretrained="pretrained_models/vae-pipeline",
41
+ micro_frame_size=17,
42
+ micro_batch_size=4,
43
+ )
44
+ text_encoder = dict(
45
+ type="t5",
46
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
47
+ model_max_length=300,
48
+ shardformer=True,
49
+ local_files_only=True,
50
+ )
51
+ scheduler = dict(
52
+ type="rflow",
53
+ use_timestep_transform=True,
54
+ sample_method="logit-normal",
55
+ )
56
+
57
+ # Mask settings
58
+ mask_ratios = {
59
+ "random": 0.2,
60
+ "intepolate": 0.01,
61
+ "quarter_random": 0.01,
62
+ "quarter_head": 0.01,
63
+ "quarter_tail": 0.01,
64
+ "quarter_head_tail": 0.01,
65
+ "image_random": 0.05,
66
+ "image_head": 0.1,
67
+ "image_tail": 0.05,
68
+ "image_head_tail": 0.05,
69
+ }
70
+
71
+ # Log settings
72
+ seed = 42
73
+ outputs = "outputs"
74
+ wandb = False
75
+ epochs = 1000
76
+ log_every = 10
77
+ ckpt_every = 500
78
+
79
+ # optimization settings
80
+ load = None
81
+ grad_clip = 1.0
82
+ lr = 1e-4
83
+ ema_decay = 0.99
84
+ adam_eps = 1e-15
configs/opensora-v1-2/train/stage1.py ADDED
@@ -0,0 +1,111 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # backup
8
+ # bucket_config = { # 20s/it
9
+ # "144p": {1: (1.0, 100), 51: (1.0, 30), 102: (1.0, 20), 204: (1.0, 8), 408: (1.0, 4)},
10
+ # # ---
11
+ # "256": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
12
+ # "240p": {1: (0.5, 100), 51: (0.3, 24), 102: (0.3, 12), 204: (0.3, 4), 408: (0.3, 2)},
13
+ # # ---
14
+ # "360p": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
15
+ # "512": {1: (0.5, 60), 51: (0.3, 12), 102: (0.3, 6), 204: (0.3, 2), 408: (0.3, 1)},
16
+ # # ---
17
+ # "480p": {1: (0.5, 40), 51: (0.3, 6), 102: (0.3, 3), 204: (0.3, 1), 408: (0.0, None)},
18
+ # # ---
19
+ # "720p": {1: (0.2, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
20
+ # "1024": {1: (0.1, 20), 51: (0.3, 2), 102: (0.3, 1), 204: (0.0, None)},
21
+ # # ---
22
+ # "1080p": {1: (0.1, 10)},
23
+ # # ---
24
+ # "2048": {1: (0.1, 5)},
25
+ # }
26
+
27
+ # webvid
28
+ bucket_config = { # 12s/it
29
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
30
+ # ---
31
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 0.1), 5), 408: ((0.5, 0.1), 2)},
32
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 0.1), 5), 408: ((0.4, 0.1), 2)},
33
+ # ---
34
+ "360p": {1: (0.2, 141), 51: (0.15, 8), 102: ((0.15, 0.33), 4), 204: ((0.15, 0.1), 2), 408: ((0.15, 0.1), 1)},
35
+ "512": {1: (0.1, 141)},
36
+ # ---
37
+ "480p": {1: (0.1, 89)},
38
+ # ---
39
+ "720p": {1: (0.05, 36)},
40
+ "1024": {1: (0.05, 36)},
41
+ # ---
42
+ "1080p": {1: (0.1, 5)},
43
+ # ---
44
+ "2048": {1: (0.1, 5)},
45
+ }
46
+
47
+ grad_checkpoint = True
48
+
49
+ # Acceleration settings
50
+ num_workers = 8
51
+ num_bucket_build_workers = 16
52
+ dtype = "bf16"
53
+ plugin = "zero2"
54
+
55
+ # Model settings
56
+ model = dict(
57
+ type="STDiT3-XL/2",
58
+ from_pretrained=None,
59
+ qk_norm=True,
60
+ enable_flash_attn=True,
61
+ enable_layernorm_kernel=True,
62
+ freeze_y_embedder=True,
63
+ )
64
+ vae = dict(
65
+ type="OpenSoraVAE_V1_2",
66
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
67
+ micro_frame_size=17,
68
+ micro_batch_size=4,
69
+ )
70
+ text_encoder = dict(
71
+ type="t5",
72
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
73
+ model_max_length=300,
74
+ shardformer=True,
75
+ local_files_only=True,
76
+ )
77
+ scheduler = dict(
78
+ type="rflow",
79
+ use_timestep_transform=True,
80
+ sample_method="logit-normal",
81
+ )
82
+
83
+ # Mask settings
84
+ mask_ratios = {
85
+ "random": 0.05,
86
+ "intepolate": 0.005,
87
+ "quarter_random": 0.005,
88
+ "quarter_head": 0.005,
89
+ "quarter_tail": 0.005,
90
+ "quarter_head_tail": 0.005,
91
+ "image_random": 0.025,
92
+ "image_head": 0.05,
93
+ "image_tail": 0.025,
94
+ "image_head_tail": 0.025,
95
+ }
96
+
97
+ # Log settings
98
+ seed = 42
99
+ outputs = "outputs"
100
+ wandb = False
101
+ epochs = 1000
102
+ log_every = 10
103
+ ckpt_every = 200
104
+
105
+ # optimization settings
106
+ load = None
107
+ grad_clip = 1.0
108
+ lr = 1e-4
109
+ ema_decay = 0.99
110
+ adam_eps = 1e-15
111
+ warmup_steps = 1000
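Some bucket entries in the v1.2 configs carry a probability pair such as ((1.0, 0.33), 27) instead of a single float. The meaning of the second value is not documented in this diff, so the helper below only extracts the first probability and is purely illustrative:

    from typing import Tuple, Union

    BucketProb = Union[float, Tuple[float, float]]

    def primary_keep_prob(p: BucketProb) -> float:
        # Entries may be a float or a (p1, p2) pair; only p1 is interpreted here.
        return p[0] if isinstance(p, tuple) else float(p)

    print(primary_keep_prob((1.0, 0.33)))  # 1.0
    print(primary_keep_prob(0.4))          # 0.4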
configs/opensora-v1-2/train/stage1_feat.py ADDED
@@ -0,0 +1,59 @@
1
+ # Dataset settings
2
+ dataset = dict(type="BatchFeatureDataset")
3
+ grad_checkpoint = True
4
+ num_workers = 4
5
+
6
+ # Acceleration settings
7
+ dtype = "bf16"
8
+ plugin = "zero2"
9
+
10
+ # Model settings
11
+ model = dict(
12
+ type="STDiT3-XL/2",
13
+ from_pretrained=None,
14
+ qk_norm=True,
15
+ enable_flash_attn=True,
16
+ enable_layernorm_kernel=True,
17
+ freeze_y_embedder=True,
18
+ skip_y_embedder=True,
19
+ )
20
+ scheduler = dict(
21
+ type="rflow",
22
+ use_timestep_transform=True,
23
+ sample_method="logit-normal",
24
+ )
25
+
26
+ vae_out_channels = 4
27
+ model_max_length = 300
28
+ text_encoder_output_dim = 4096
29
+ load_video_features = True
30
+ load_text_features = True
31
+
32
+ # Mask settings
33
+ mask_ratios = {
34
+ "random": 0.2,
35
+ "intepolate": 0.01,
36
+ "quarter_random": 0.01,
37
+ "quarter_head": 0.01,
38
+ "quarter_tail": 0.01,
39
+ "quarter_head_tail": 0.01,
40
+ "image_random": 0.05,
41
+ "image_head": 0.1,
42
+ "image_tail": 0.05,
43
+ "image_head_tail": 0.05,
44
+ }
45
+
46
+ # Log settings
47
+ seed = 42
48
+ outputs = "outputs"
49
+ wandb = False
50
+ epochs = 1000
51
+ log_every = 10
52
+ ckpt_every = 500
53
+
54
+ # optimization settings
55
+ load = None
56
+ grad_clip = 1.0
57
+ lr = 2e-4
58
+ ema_decay = 0.99
59
+ adam_eps = 1e-15
configs/opensora-v1-2/train/stage2.py ADDED
@@ -0,0 +1,90 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 12s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: ((1.0, 0.33), 27), 204: ((1.0, 0.1), 13), 408: ((1.0, 0.1), 6)},
10
+ # ---
11
+ "256": {1: (0.4, 297), 51: (0.5, 20), 102: ((0.5, 0.33), 10), 204: ((0.5, 1.0), 5), 408: ((0.5, 1.0), 2)},
12
+ "240p": {1: (0.3, 297), 51: (0.4, 20), 102: ((0.4, 0.33), 10), 204: ((0.4, 1.0), 5), 408: ((0.4, 1.0), 2)},
13
+ # ---
14
+ "360p": {1: (0.5, 141), 51: (0.15, 8), 102: ((0.3, 0.5), 4), 204: ((0.3, 1.0), 2), 408: ((0.5, 0.5), 1)},
15
+ "512": {1: (0.4, 141), 51: (0.15, 8), 102: ((0.2, 0.4), 4), 204: ((0.2, 1.0), 2), 408: ((0.4, 0.5), 1)},
16
+ # ---
17
+ "480p": {1: (0.5, 89), 51: (0.2, 5), 102: (0.2, 2), 204: (0.1, 1)},
18
+ # ---
19
+ "720p": {1: (0.1, 36), 51: (0.03, 1)},
20
+ "1024": {1: (0.1, 36), 51: (0.02, 1)},
21
+ # ---
22
+ "1080p": {1: (0.01, 5)},
23
+ # ---
24
+ "2048": {1: (0.01, 5)},
25
+ }
26
+
27
+ grad_checkpoint = True
28
+
29
+ # Acceleration settings
30
+ num_workers = 8
31
+ num_bucket_build_workers = 16
32
+ dtype = "bf16"
33
+ plugin = "zero2"
34
+
35
+ # Model settings
36
+ model = dict(
37
+ type="STDiT3-XL/2",
38
+ from_pretrained=None,
39
+ qk_norm=True,
40
+ enable_flash_attn=True,
41
+ enable_layernorm_kernel=True,
42
+ freeze_y_embedder=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=4,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+ scheduler = dict(
58
+ type="rflow",
59
+ use_timestep_transform=True,
60
+ sample_method="logit-normal",
61
+ )
62
+
63
+ # Mask settings
64
+ mask_ratios = {
65
+ "random": 0.05,
66
+ "intepolate": 0.005,
67
+ "quarter_random": 0.005,
68
+ "quarter_head": 0.005,
69
+ "quarter_tail": 0.005,
70
+ "quarter_head_tail": 0.005,
71
+ "image_random": 0.025,
72
+ "image_head": 0.05,
73
+ "image_tail": 0.025,
74
+ "image_head_tail": 0.025,
75
+ }
76
+
77
+ # Log settings
78
+ seed = 42
79
+ outputs = "outputs"
80
+ wandb = False
81
+ epochs = 1000
82
+ log_every = 10
83
+ ckpt_every = 200
84
+
85
+ # optimization settings
86
+ load = None
87
+ grad_clip = 1.0
88
+ lr = 1e-4
89
+ ema_decay = 0.99
90
+ adam_eps = 1e-15
configs/opensora-v1-2/train/stage3.py ADDED
@@ -0,0 +1,92 @@
1
+ # Dataset settings
2
+ dataset = dict(
3
+ type="VariableVideoTextDataset",
4
+ transform_name="resize_crop",
5
+ )
6
+
7
+ # webvid
8
+ bucket_config = { # 20s/it
9
+ "144p": {1: (1.0, 475), 51: (1.0, 51), 102: (1.0, 27), 204: (1.0, 13), 408: (1.0, 6)},
10
+ # ---
11
+ "256": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.5), 2)},
12
+ "240p": {1: (1.0, 297), 51: (0.5, 20), 102: (0.5, 10), 204: (0.5, 5), 408: ((0.5, 0.4), 2)},
13
+ # ---
14
+ "360p": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.3), 1)},
15
+ "512": {1: (1.0, 141), 51: (0.5, 8), 102: (0.5, 4), 204: (0.5, 2), 408: ((0.5, 0.2), 1)},
16
+ # ---
17
+ "480p": {1: (1.0, 89), 51: (0.5, 5), 102: (0.5, 3), 204: ((0.5, 0.5), 1), 408: (0.0, None)},
18
+ # ---
19
+ "720p": {1: (0.3, 36), 51: (0.2, 2), 102: (0.1, 1), 204: (0.0, None)},
20
+ "1024": {1: (0.3, 36), 51: (0.1, 2), 102: (0.1, 1), 204: (0.0, None)},
21
+ # ---
22
+ "1080p": {1: (0.1, 5)},
23
+ # ---
24
+ "2048": {1: (0.05, 5)},
25
+ }
26
+
27
+ grad_checkpoint = True
28
+
29
+ # Acceleration settings
30
+ num_workers = 8
31
+ num_bucket_build_workers = 16
32
+ dtype = "bf16"
33
+ plugin = "zero2"
34
+
35
+ # Model settings
36
+ model = dict(
37
+ type="STDiT3-XL/2",
38
+ from_pretrained=None,
39
+ qk_norm=True,
40
+ enable_flash_attn=True,
41
+ enable_layernorm_kernel=True,
42
+ freeze_y_embedder=True,
43
+ )
44
+ vae = dict(
45
+ type="OpenSoraVAE_V1_2",
46
+ from_pretrained="/mnt/jfs/sora_checkpoints/vae-pipeline",
47
+ micro_frame_size=17,
48
+ micro_batch_size=4,
49
+ )
50
+ text_encoder = dict(
51
+ type="t5",
52
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
53
+ model_max_length=300,
54
+ shardformer=True,
55
+ local_files_only=True,
56
+ )
57
+ scheduler = dict(
58
+ type="rflow",
59
+ use_timestep_transform=True,
60
+ sample_method="logit-normal",
61
+ )
62
+
63
+ # Mask settings
64
+ # 25%
65
+ mask_ratios = {
66
+ "random": 0.01,
67
+ "intepolate": 0.002,
68
+ "quarter_random": 0.002,
69
+ "quarter_head": 0.002,
70
+ "quarter_tail": 0.002,
71
+ "quarter_head_tail": 0.002,
72
+ "image_random": 0.0,
73
+ "image_head": 0.22,
74
+ "image_tail": 0.005,
75
+ "image_head_tail": 0.005,
76
+ }
77
+
78
+ # Log settings
79
+ seed = 42
80
+ outputs = "outputs"
81
+ wandb = False
82
+ epochs = 1000
83
+ log_every = 10
84
+ ckpt_every = 200
85
+
86
+ # optimization settings
87
+ load = None
88
+ grad_clip = 1.0
89
+ lr = 1e-4
90
+ ema_decay = 0.99
91
+ adam_eps = 1e-15
92
+ warmup_steps = 1000
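The optimization block above pairs lr = 1e-4 with warmup_steps = 1000 and keeps an EMA of the weights with ema_decay = 0.99. A minimal sketch of the standard form of both pieces (linear warmup, exponential moving average); the exact schedules used by the training script may differ.

    def warmup_lr(step: int, base_lr: float = 1e-4, warmup_steps: int = 1000) -> float:
        # Linear warmup from 0 to base_lr over the first warmup_steps, then constant.
        return base_lr * min(1.0, (step + 1) / warmup_steps)

    def ema_update(ema: dict, model: dict, decay: float = 0.99) -> None:
        # ema[k] <- decay * ema[k] + (1 - decay) * model[k]
        for k, v in model.items():
            ema[k] = decay * ema[k] + (1.0 - decay) * v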
configs/opensora/inference/16x256x256.py CHANGED
@@ -7,28 +7,33 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
- enable_flashattn=False,
11
- enable_layernorm_kernel=False,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
 
17
  )
18
  text_encoder = dict(
19
  type="t5",
20
- from_pretrained="./pretrained_models/t5_ckpts",
21
  model_max_length=120,
22
  )
23
  scheduler = dict(
24
  type="iddpm",
25
  num_sampling_steps=100,
26
- cfg_scale=5.0,
 
27
  )
28
- dtype = "fp16"
 
 
 
 
29
 
30
  # Others
31
- batch_size = 2
32
  seed = 42
33
- prompt_path = "./assets/texts/t2v_samples.txt"
34
- save_dir = "./outputs/samples/"
 
7
  type="STDiT-XL/2",
8
  space_scale=0.5,
9
  time_scale=1.0,
10
+ enable_flash_attn=True,
11
+ enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
+ micro_batch_size=4,
18
  )
19
  text_encoder = dict(
20
  type="t5",
21
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
22
  model_max_length=120,
23
  )
24
  scheduler = dict(
25
  type="iddpm",
26
  num_sampling_steps=100,
27
+ cfg_scale=7.0,
28
+ cfg_channel=3, # or None
29
  )
30
+ dtype = "bf16"
31
+
32
+ # Condition
33
+ prompt_path = "./assets/texts/t2v_samples.txt"
34
+ prompt = None # prompt has higher priority than prompt_path
35
 
36
  # Others
37
+ batch_size = 1
38
  seed = 42
39
+ save_dir = "./samples/samples/"
 
configs/opensora/inference/16x512x512-rflow.py ADDED
@@ -0,0 +1,35 @@
+ num_frames = 16
+ fps = 24 // 3
+ image_size = (512, 512)
+
+ # Define model
+ model = dict(
+ type="STDiT-XL/2",
+ space_scale=1.0,
+ time_scale=1.0,
+ enable_flash_attn=True,
+ enable_layernorm_kernel=True,
+ from_pretrained="PRETRAINED_MODEL",
+ )
+ vae = dict(
+ type="VideoAutoencoderKL",
+ from_pretrained="stabilityai/sd-vae-ft-ema",
+ micro_batch_size=2,
+ )
+ text_encoder = dict(
+ type="t5",
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
+ model_max_length=120,
+ )
+ scheduler = dict(
+ type="rflow",
+ num_sampling_steps=10,
+ cfg_scale=7.0,
+ )
+ dtype = "bf16"
+
+ # Others
+ batch_size = 2
+ seed = 42
+ prompt_path = "./assets/texts/t2v_samples.txt"
+ save_dir = "./outputs/samples/"
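The scheduler in this file is type="rflow" with only 10 sampling steps. Rectified-flow sampling is usually a plain Euler integration of a learned velocity field from noise toward data; a schematic sketch under that assumption (the velocity_fn call is a stand-in, not the repository's API):

    import torch

    @torch.no_grad()
    def rflow_sample(velocity_fn, x: torch.Tensor, num_steps: int = 10) -> torch.Tensor:
        """Euler-integrate dx/dt = v(x, t) from t = 1 (noise) down to t = 0 (sample)."""
        dt = 1.0 / num_steps
        for i in range(num_steps):
            t = 1.0 - i * dt
            x = x - velocity_fn(x, t) * dt
        return x

    sample = rflow_sample(lambda x, t: x, torch.randn(1, 4, 8, 8))  # toy velocity field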
configs/opensora/inference/16x512x512.py CHANGED
@@ -7,18 +7,18 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
- enable_flashattn=True,
11
- enable_layernorm_kernel=False,
12
- from_pretrained="PRETRAINED_MODEL"
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
- micro_batch_size=128,
18
  )
19
  text_encoder = dict(
20
  type="t5",
21
- from_pretrained="./pretrained_models/t5_ckpts",
22
  model_max_length=120,
23
  )
24
  scheduler = dict(
@@ -26,10 +26,10 @@ scheduler = dict(
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
- dtype = "fp16"
30
 
31
  # Others
32
  batch_size = 2
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
- save_dir = "./outputs/samples/"
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=1.0,
10
+ enable_flash_attn=True,
11
+ enable_layernorm_kernel=True,
12
+ from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
15
  type="VideoAutoencoderKL",
16
  from_pretrained="stabilityai/sd-vae-ft-ema",
17
+ micro_batch_size=2,
18
  )
19
  text_encoder = dict(
20
  type="t5",
21
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
22
  model_max_length=120,
23
  )
24
  scheduler = dict(
 
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
+ dtype = "bf16"
30
 
31
  # Others
32
  batch_size = 2
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
+ save_dir = "./samples/samples/"
configs/opensora/inference/64x512x512.py CHANGED
@@ -7,8 +7,8 @@ model = dict(
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
- enable_flashattn=True,
11
- enable_layernorm_kernel=False,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
@@ -18,7 +18,7 @@ vae = dict(
18
  )
19
  text_encoder = dict(
20
  type="t5",
21
- from_pretrained="./pretrained_models/t5_ckpts",
22
  model_max_length=120,
23
  )
24
  scheduler = dict(
@@ -26,10 +26,10 @@ scheduler = dict(
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
- dtype = "fp16"
30
 
31
  # Others
32
  batch_size = 1
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
- save_dir = "./outputs/samples/"
 
7
  type="STDiT-XL/2",
8
  space_scale=1.0,
9
  time_scale=2 / 3,
10
+ enable_flash_attn=True,
11
+ enable_layernorm_kernel=True,
12
  from_pretrained="PRETRAINED_MODEL",
13
  )
14
  vae = dict(
 
18
  )
19
  text_encoder = dict(
20
  type="t5",
21
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
22
  model_max_length=120,
23
  )
24
  scheduler = dict(
 
26
  num_sampling_steps=100,
27
  cfg_scale=7.0,
28
  )
29
+ dtype = "bf16"
30
 
31
  # Others
32
  batch_size = 1
33
  seed = 42
34
  prompt_path = "./assets/texts/t2v_samples.txt"
35
+ save_dir = "./samples/samples/"
configs/opensora/train/16x256x256-mask.py ADDED
@@ -0,0 +1,60 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+ mask_ratios = {
27
+ "identity": 0.7,
28
+ "random": 0.15,
29
+ "mask_head": 0.05,
30
+ "mask_tail": 0.05,
31
+ "mask_head_tail": 0.05,
32
+ }
33
+ vae = dict(
34
+ type="VideoAutoencoderKL",
35
+ from_pretrained="stabilityai/sd-vae-ft-ema",
36
+ )
37
+ text_encoder = dict(
38
+ type="t5",
39
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
40
+ model_max_length=120,
41
+ shardformer=True,
42
+ )
43
+ scheduler = dict(
44
+ type="iddpm",
45
+ timestep_respacing="",
46
+ )
47
+
48
+ # Others
49
+ seed = 42
50
+ outputs = "outputs"
51
+ wandb = False
52
+
53
+ epochs = 1000
54
+ log_every = 10
55
+ ckpt_every = 1000
56
+ load = None
57
+
58
+ batch_size = 8
59
+ lr = 2e-5
60
+ grad_clip = 1.0
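Hand-edited configs like this one are easy to break (a mistyped ratio, totals drifting above 1). A small optional sanity check over a mask_ratios dict; it only assumes the values are probabilities whose total should not exceed 1:

    def check_mask_ratios(mask_ratios: dict) -> None:
        """Optional sanity check: each ratio in [0, 1] and the total at most 1."""
        assert all(0.0 <= p <= 1.0 for p in mask_ratios.values()), "ratio out of range"
        total = sum(mask_ratios.values())
        assert total <= 1.0 + 1e-6, f"mask_ratios sum to {total} > 1"

    check_mask_ratios({"identity": 0.7, "random": 0.15, "mask_head": 0.05,
                       "mask_tail": 0.05, "mask_head_tail": 0.05})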
configs/opensora/train/16x256x256-spee-rflow.py ADDED
@@ -0,0 +1,64 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ # from_pretrained="PixArt-XL-2-512x512.pth",
23
+ # from_pretrained = "/home/zhaowangbo/wangbo/PixArt-alpha/pretrained_models/OpenSora-v1-HQ-16x512x512.pth",
24
+ # from_pretrained = "OpenSora-v1-HQ-16x512x512.pth",
25
+ from_pretrained="PRETRAINED_MODEL",
26
+ enable_flash_attn=True,
27
+ enable_layernorm_kernel=True,
28
+ )
29
+ # mask_ratios = [0.5, 0.29, 0.07, 0.07, 0.07]
30
+ # mask_ratios = {
31
+ # "identity": 0.9,
32
+ # "random": 0.06,
33
+ # "mask_head": 0.01,
34
+ # "mask_tail": 0.01,
35
+ # "mask_head_tail": 0.02,
36
+ # }
37
+ vae = dict(
38
+ type="VideoAutoencoderKL",
39
+ from_pretrained="stabilityai/sd-vae-ft-ema",
40
+ )
41
+ text_encoder = dict(
42
+ type="t5",
43
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
44
+ model_max_length=120,
45
+ shardformer=True,
46
+ )
47
+ scheduler = dict(
48
+ type="rflow",
49
+ # timestep_respacing="",
50
+ )
51
+
52
+ # Others
53
+ seed = 42
54
+ outputs = "outputs"
55
+ wandb = True
56
+
57
+ epochs = 1
58
+ log_every = 10
59
+ ckpt_every = 1000
60
+ load = None
61
+
62
+ batch_size = 16
63
+ lr = 2e-5
64
+ grad_clip = 1.0
configs/opensora/train/16x256x256-spee.py ADDED
@@ -0,0 +1,60 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="STDiT-XL/2",
20
+ space_scale=0.5,
21
+ time_scale=1.0,
22
+ from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+ mask_ratios = {
27
+ "identity": 0.5,
28
+ "random": 0.29,
29
+ "mask_head": 0.07,
30
+ "mask_tail": 0.07,
31
+ "mask_head_tail": 0.07,
32
+ }
33
+ vae = dict(
34
+ type="VideoAutoencoderKL",
35
+ from_pretrained="stabilityai/sd-vae-ft-ema",
36
+ )
37
+ text_encoder = dict(
38
+ type="t5",
39
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
40
+ model_max_length=120,
41
+ shardformer=True,
42
+ )
43
+ scheduler = dict(
44
+ type="iddpm-speed",
45
+ timestep_respacing="",
46
+ )
47
+
48
+ # Others
49
+ seed = 42
50
+ outputs = "outputs"
51
+ wandb = False
52
+
53
+ epochs = 1000
54
+ log_every = 10
55
+ ckpt_every = 1000
56
+ load = None
57
+
58
+ batch_size = 8
59
+ lr = 2e-5
60
+ grad_clip = 1.0
configs/opensora/train/16x256x256.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
@@ -29,7 +29,7 @@ vae = dict(
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
- from_pretrained="./pretrained_models/t5_ckpts",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 0
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
configs/opensora/train/16x512x512.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=1.0,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
20
  space_scale=1.0,
21
  time_scale=1.0,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/opensora/train/360x512x512.py CHANGED
@@ -1,12 +1,18 @@
1
- num_frames = 360
2
- frame_interval = 1
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
 
 
 
 
 
 
9
  num_workers = 4
 
 
 
 
10
 
11
  # Define acceleration
12
  dtype = "bf16"
@@ -20,7 +26,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  enable_sequence_parallelism=True, # enable sq here
26
  )
@@ -31,7 +37,7 @@ vae = dict(
31
  )
32
  text_encoder = dict(
33
  type="t5",
34
- from_pretrained="./pretrained_models/t5_ckpts",
35
  model_max_length=120,
36
  shardformer=True,
37
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=360,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
+
10
+ # Define acceleration
11
  num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
 
17
  # Define acceleration
18
  dtype = "bf16"
 
26
  space_scale=1.0,
27
  time_scale=2 / 3,
28
  from_pretrained=None,
29
+ enable_flash_attn=True,
30
  enable_layernorm_kernel=True,
31
  enable_sequence_parallelism=True, # enable sq here
32
  )
 
37
  )
38
  text_encoder = dict(
39
  type="t5",
40
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
41
  model_max_length=120,
42
  shardformer=True,
43
  )
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -1,17 +1,17 @@
1
- num_frames = 64
2
- frame_interval = 2
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
- plugin = "zero2-seq"
15
  sp_size = 2
16
 
17
  # Define model
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  enable_sequence_parallelism=True, # enable sq here
26
  )
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
+ plugin = "zero2"
15
  sp_size = 2
16
 
17
  # Define model
 
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  enable_sequence_parallelism=True, # enable sq here
26
  )
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/opensora/train/64x512x512.py CHANGED
@@ -1,14 +1,14 @@
1
- num_frames = 64
2
- frame_interval = 2
3
- image_size = (512, 512)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
@@ -30,7 +30,7 @@ vae = dict(
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
- from_pretrained="./pretrained_models/t5_ckpts",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=64,
6
+ frame_interval=3,
7
+ image_size=(512, 512),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
  grad_checkpoint = True
14
  plugin = "zero2"
 
20
  space_scale=1.0,
21
  time_scale=2 / 3,
22
  from_pretrained=None,
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
30
  )
31
  text_encoder = dict(
32
  type="t5",
33
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
34
  model_max_length=120,
35
  shardformer=True,
36
  )
configs/pixart/inference/16x256x256.py CHANGED
@@ -15,18 +15,18 @@ vae = dict(
15
  )
16
  text_encoder = dict(
17
  type="t5",
18
- from_pretrained="./pretrained_models/t5_ckpts",
19
  model_max_length=120,
20
  )
21
  scheduler = dict(
22
  type="dpm-solver",
23
- num_sampling_steps=50,
24
- cfg_scale=5.0,
25
  )
26
- dtype = "fp16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/t2v_samples.txt"
32
- save_dir = "./outputs/samples/"
 
15
  )
16
  text_encoder = dict(
17
  type="t5",
18
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
19
  model_max_length=120,
20
  )
21
  scheduler = dict(
22
  type="dpm-solver",
23
+ num_sampling_steps=20,
24
+ cfg_scale=7.0,
25
  )
26
+ dtype = "bf16"
27
 
28
  # Others
29
  batch_size = 2
30
  seed = 42
31
  prompt_path = "./assets/texts/t2v_samples.txt"
32
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x1024MS.py CHANGED
@@ -1,7 +1,7 @@
1
  num_frames = 1
2
  fps = 1
3
  image_size = (1920, 512)
4
- multi_resolution = True
5
 
6
  # Define model
7
  model = dict(
@@ -17,7 +17,7 @@ vae = dict(
17
  )
18
  text_encoder = dict(
19
  type="t5",
20
- from_pretrained="./pretrained_models/t5_ckpts",
21
  model_max_length=120,
22
  )
23
  scheduler = dict(
@@ -25,10 +25,10 @@ scheduler = dict(
25
  num_sampling_steps=20,
26
  cfg_scale=7.0,
27
  )
28
- dtype = "fp16"
29
 
30
  # Others
31
  batch_size = 2
32
  seed = 42
33
  prompt_path = "./assets/texts/t2i_samples.txt"
34
- save_dir = "./outputs/samples/"
 
1
  num_frames = 1
2
  fps = 1
3
  image_size = (1920, 512)
4
+ multi_resolution = "PixArtMS"
5
 
6
  # Define model
7
  model = dict(
 
17
  )
18
  text_encoder = dict(
19
  type="t5",
20
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
21
  model_max_length=120,
22
  )
23
  scheduler = dict(
 
25
  num_sampling_steps=20,
26
  cfg_scale=7.0,
27
  )
28
+ dtype = "bf16"
29
 
30
  # Others
31
  batch_size = 2
32
  seed = 42
33
  prompt_path = "./assets/texts/t2i_samples.txt"
34
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x20481B.py ADDED
@@ -0,0 +1,36 @@
+ num_frames = 1
+ fps = 1
+ image_size = (2560, 1536)
+ # image_size = (2048, 2048)
+
+ model = dict(
+ type="PixArt-1B/2",
+ from_pretrained="PixArt-1B-2.pth",
+ space_scale=4,
+ no_temporal_pos_emb=True,
+ enable_flash_attn=True,
+ enable_layernorm_kernel=True,
+ base_size=2048 // 8,
+ )
+ vae = dict(
+ type="VideoAutoencoderKL",
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
+ subfolder="vae",
+ )
+ text_encoder = dict(
+ type="t5",
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
+ model_max_length=300,
+ )
+ scheduler = dict(
+ type="dpm-solver",
+ num_sampling_steps=14,
+ cfg_scale=4.5,
+ )
+ dtype = "bf16"
+
+ # Others
+ batch_size = 1
+ seed = 42
+ prompt_path = "./assets/texts/t2i_sigma.txt"
+ save_dir = "./samples/samples/"
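cfg_scale here is the usual classifier-free guidance weight: the final prediction extrapolates from the unconditional output toward the text-conditioned one. A generic sketch of that combination (tensor shapes and inputs are placeholders):

    import torch

    def apply_cfg(cond_out: torch.Tensor, uncond_out: torch.Tensor, cfg_scale: float = 4.5) -> torch.Tensor:
        # Classifier-free guidance: uncond + scale * (cond - uncond).
        return uncond_out + cfg_scale * (cond_out - uncond_out)

    cond, uncond = torch.randn(1, 4, 32, 32), torch.randn(1, 4, 32, 32)
    guided = apply_cfg(cond, uncond)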
configs/pixart/inference/1x2048MS.py ADDED
@@ -0,0 +1,36 @@
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (2560, 1536)
4
+ # image_size = (2048, 2048)
5
+
6
+ model = dict(
7
+ type="PixArt-XL/2",
8
+ from_pretrained="PixArt-Sigma-XL-2-2K-MS.pth",
9
+ space_scale=4,
10
+ no_temporal_pos_emb=True,
11
+ enable_flash_attn=True,
12
+ enable_layernorm_kernel=True,
13
+ base_size=2048 // 8,
14
+ )
15
+ vae = dict(
16
+ type="VideoAutoencoderKL",
17
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
18
+ subfolder="vae",
19
+ )
20
+ text_encoder = dict(
21
+ type="t5",
22
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
23
+ model_max_length=300,
24
+ )
25
+ scheduler = dict(
26
+ type="dpm-solver",
27
+ num_sampling_steps=14,
28
+ cfg_scale=4.5,
29
+ )
30
+ dtype = "bf16"
31
+
32
+ # Others
33
+ batch_size = 1
34
+ seed = 42
35
+ prompt_path = "./assets/texts/t2i_sigma.txt"
36
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x256x256.py CHANGED
@@ -16,18 +16,18 @@ vae = dict(
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
- from_pretrained="./pretrained_models/t5_ckpts",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
23
  type="dpm-solver",
24
- num_sampling_steps=30,
25
  cfg_scale=7.0,
26
  )
27
- dtype = "fp16"
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
  prompt_path = "./assets/texts/t2i_samples.txt"
33
- save_dir = "./outputs/samples/"
 
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
23
  type="dpm-solver",
24
+ num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
+ dtype = "bf16"
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
  prompt_path = "./assets/texts/t2i_samples.txt"
33
+ save_dir = "./samples/samples/"
configs/pixart/inference/1x512x512-rflow.py ADDED
@@ -0,0 +1,39 @@
1
+ num_frames = 1
2
+ fps = 1
3
+ image_size = (512, 512)
4
+
5
+ # Define model
6
+ model = dict(
7
+ type="PixArt-XL/2",
8
+ space_scale=1.0,
9
+ time_scale=1.0,
10
+ no_temporal_pos_emb=True,
11
+ from_pretrained="PRETRAINED_MODEL",
12
+ )
13
+ vae = dict(
14
+ type="VideoAutoencoderKL",
15
+ from_pretrained="stabilityai/sd-vae-ft-ema",
16
+ )
17
+ text_encoder = dict(
18
+ type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
+ model_max_length=120,
21
+ )
22
+ scheduler = dict(
23
+ type="rflow",
24
+ num_sampling_steps=20,
25
+ cfg_scale=7.0,
26
+ )
27
+ dtype = "bf16"
28
+
29
+ # prompt_path = "./assets/texts/t2i_samples.txt"
30
+ prompt = [
31
+ "Pirate ship trapped in a cosmic maelstrom nebula.",
32
+ "A small cactus with a happy face in the Sahara desert.",
33
+ "A small cactus with a sad face in the Sahara desert.",
34
+ ]
35
+
36
+ # Others
37
+ batch_size = 2
38
+ seed = 42
39
+ save_dir = "./outputs/samples2/"
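As the 16x256x256 inference config above notes, an inline prompt list takes priority over prompt_path when both are set. A trivial sketch of that selection logic:

    from typing import List, Optional

    def resolve_prompts(prompt: Optional[List[str]], prompt_path: Optional[str]) -> List[str]:
        # Inline prompts win; otherwise read one prompt per line from prompt_path.
        if prompt:
            return list(prompt)
        with open(prompt_path, encoding="utf-8") as f:
            return [line.strip() for line in f if line.strip()]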
configs/pixart/inference/1x512x512.py CHANGED
@@ -16,18 +16,24 @@ vae = dict(
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
- from_pretrained="./pretrained_models/t5_ckpts",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
23
  type="dpm-solver",
24
- num_sampling_steps=35,
25
  cfg_scale=7.0,
26
  )
27
- dtype = "fp16"
 
 
 
 
 
 
 
28
 
29
  # Others
30
  batch_size = 2
31
  seed = 42
32
- prompt_path = "./assets/texts/t2i_samples.txt"
33
- save_dir = "./outputs/samples/"
 
16
  )
17
  text_encoder = dict(
18
  type="t5",
19
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
20
  model_max_length=120,
21
  )
22
  scheduler = dict(
23
  type="dpm-solver",
24
+ num_sampling_steps=20,
25
  cfg_scale=7.0,
26
  )
27
+ dtype = "bf16"
28
+
29
+ # prompt_path = "./assets/texts/t2i_samples.txt"
30
+ prompt = [
31
+ "Pirate ship trapped in a cosmic maelstrom nebula.",
32
+ "A small cactus with a happy face in the Sahara desert.",
33
+ "A small cactus with a sad face in the Sahara desert.",
34
+ ]
35
 
36
  # Others
37
  batch_size = 2
38
  seed = 42
39
+ save_dir = "./samples/samples/"
 
configs/pixart/train/16x256x256.py CHANGED
@@ -1,16 +1,16 @@
1
- num_frames = 16
2
- frame_interval = 3
3
- image_size = (256, 256)
4
-
5
  # Define dataset
6
- root = None
7
- data_path = "CSV_PATH"
8
- use_image_transform = False
9
- num_workers = 4
 
 
 
10
 
11
  # Define acceleration
 
12
  dtype = "bf16"
13
- grad_checkpoint = False
14
  plugin = "zero2"
15
  sp_size = 1
16
 
@@ -20,7 +20,7 @@ model = dict(
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
- enable_flashattn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
@@ -29,7 +29,7 @@ vae = dict(
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
- from_pretrained="./pretrained_models/t5_ckpts",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
 
 
 
 
 
1
  # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path=None,
5
+ num_frames=16,
6
+ frame_interval=3,
7
+ image_size=(256, 256),
8
+ )
9
 
10
  # Define acceleration
11
+ num_workers = 4
12
  dtype = "bf16"
13
+ grad_checkpoint = True
14
  plugin = "zero2"
15
  sp_size = 1
16
 
 
20
  space_scale=0.5,
21
  time_scale=1.0,
22
  from_pretrained="PixArt-XL-2-512x512.pth",
23
+ enable_flash_attn=True,
24
  enable_layernorm_kernel=True,
25
  )
26
  vae = dict(
 
29
  )
30
  text_encoder = dict(
31
  type="t5",
32
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
33
  model_max_length=120,
34
  shardformer=True,
35
  )
configs/pixart/train/1x2048x2048.py ADDED
@@ -0,0 +1,54 @@
1
+ # Define dataset
2
+ dataset = dict(
3
+ type="VideoTextDataset",
4
+ data_path="/home/zhaowangbo/data/csv/image-v1_1_ext_noempty_rcp_clean_info.csv",
5
+ num_frames=1,
6
+ frame_interval=3,
7
+ image_size=(2048, 2048),
8
+ )
9
+
10
+ # Define acceleration
11
+ num_workers = 4
12
+ dtype = "bf16"
13
+ grad_checkpoint = True
14
+ plugin = "zero2"
15
+ sp_size = 1
16
+
17
+ # Define model
18
+ model = dict(
19
+ type="PixArt-1B/2",
20
+ space_scale=4.0,
21
+ no_temporal_pos_emb=True,
22
+ from_pretrained="PixArt-1B-2.pth",
23
+ enable_flash_attn=True,
24
+ enable_layernorm_kernel=True,
25
+ )
26
+
27
+ vae = dict(
28
+ type="VideoAutoencoderKL",
29
+ from_pretrained="PixArt-alpha/pixart_sigma_sdxlvae_T5_diffusers",
30
+ subfolder="vae",
31
+ )
32
+ text_encoder = dict(
33
+ type="t5",
34
+ from_pretrained="DeepFloyd/t5-v1_1-xxl",
35
+ model_max_length=300,
36
+ )
37
+ scheduler = dict(
38
+ type="iddpm",
39
+ timestep_respacing="",
40
+ )
41
+
42
+ # Others
43
+ seed = 42
44
+ outputs = "outputs"
45
+ wandb = False
46
+
47
+ epochs = 1000
48
+ log_every = 10
49
+ ckpt_every = 1000
50
+ load = None
51
+
52
+ batch_size = 4
53
+ lr = 2e-5
54
+ grad_clip = 1.0
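grad_clip = 1.0 in these train configs is most naturally read as a global gradient-norm threshold. A minimal PyTorch sketch of how such a value is typically applied each step; the model, optimizer and loss are placeholders, not the repository's training loop:

    import torch

    model = torch.nn.Linear(8, 8)  # placeholder model
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5)
    grad_clip = 1.0

    loss = model(torch.randn(4, 8)).pow(2).mean()
    loss.backward()
    torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)  # clip global grad norm
    optimizer.step()
    optimizer.zero_grad()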