吴吴大庸 committed
Commit c098bbc · Parent: 0a7f9b3

used open sora official space to replace our local repo

.DS_Store ADDED
Binary file (6.15 kB).
 
README.md CHANGED
@@ -1,12 +1,13 @@
 ---
-title: Sora
-emoji: 🦀
-colorFrom: gray
-colorTo: gray
+title: Open Sora
+emoji:
+colorFrom: blue
+colorTo: purple
 sdk: gradio
-sdk_version: 4.36.1
+sdk_version: 4.25.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py CHANGED
@@ -19,12 +19,9 @@ import spaces
 import torch
 
 import gradio as gr
-from tempfile import NamedTemporaryFile
-import datetime
 
 
-
-MODEL_TYPES = ["v1.1-stage2", "v1.1-stage3"]
+MODEL_TYPES = ["v1.1"]
 CONFIG_MAP = {
     "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
     "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
@@ -34,41 +31,12 @@ HF_STDIT_MAP = {
     "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
 }
 RESOLUTION_MAP = {
-    "144p": {
-        "16:9": (256, 144),
-        "9:16": (144, 256),
-        "4:3": (221, 165),
-        "3:4": (165, 221),
-        "1:1": (192, 192),
-    },
-    "240p": {
-        "16:9": (426, 240),
-        "9:16": (240, 426),
-        "4:3": (370, 278),
-        "3:4": (278, 370),
-        "1:1": (320, 320),
-    },
-    "360p": {
-        "16:9": (640, 360),
-        "9:16": (360, 640),
-        "4:3": (554, 416),
-        "3:4": (416, 554),
-        "1:1": (480, 480),
-    },
-    "480p": {
-        "16:9": (854, 480),
-        "9:16": (480, 854),
-        "4:3": (740, 555),
-        "3:4": (555, 740),
-        "1:1": (640, 640),
-    },
-    "720p": {
-        "16:9": (1280, 720),
-        "9:16": (720, 1280),
-        "4:3": (1108, 832),
-        "3:4": (832, 1110),
-        "1:1": (960, 960),
-    },
+    "144p": (144, 256),
+    "240p": (240, 426),
+    "360p": (360, 480),
+    "480p": (480, 858),
+    "720p": (720, 1280),
+    "1080p": (1080, 1920)
 }
 
 
@@ -255,9 +223,9 @@ def build_models(model_type, config, enable_optimization=False):
     # build stdit
     # we load model from HuggingFace directly so that we don't need to
     # handle model download logic in HuggingFace Space
-    from opensora.models.stdit.stdit2 import STDiT2
+    from transformers import AutoModel
 
-    stdit = STDiT2.from_pretrained(
+    stdit = AutoModel.from_pretrained(
         HF_STDIT_MAP[model_type],
         enable_flash_attn=enable_optimization,
         trust_remote_code=True,
@@ -334,53 +302,37 @@ device = torch.device("cuda")
 vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
 
 
-def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    torch.manual_seed(seed)
+@spaces.GPU(duration=200)
+def run_inference(mode, prompt_text, resolution, length, reference_image):
     with torch.inference_mode():
         # ======================
         # 1. Preparation
         # ======================
         # parse the inputs
-        resolution = RESOLUTION_MAP[resolution][aspect_ratio]
-
-        # gather args from config
-        num_frames = config.num_frames
-        frame_interval = config.frame_interval
-        fps = config.fps
-        condition_frame_length = config.condition_frame_length
-
+        resolution = RESOLUTION_MAP[resolution]
+
         # compute number of loops
-        if mode == "Text2Image":
-            num_frames = 1
-            num_loop = 1
-        else:
-            num_seconds = int(length.rstrip('s'))
-            if num_seconds <= 16:
-                num_frames = num_seconds * fps // frame_interval
-                num_loop = 1
-            else:
-                config.num_frames = 16
-                total_number_of_frames = num_seconds * fps / frame_interval
-                num_loop = math.ceil((total_number_of_frames - condition_frame_length) / (num_frames - condition_frame_length))
+        num_seconds = int(length.rstrip('s'))
+        total_number_of_frames = num_seconds * config.fps / config.frame_interval
+        num_loop = math.ceil(total_number_of_frames / config.num_frames)
 
         # prepare model args
-        if config.num_frames == 1:
-            fps = IMG_FPS
-
         model_args = dict()
-        height_tensor = torch.tensor([resolution[0]], device=device, dtype=dtype)
-        width_tensor = torch.tensor([resolution[1]], device=device, dtype=dtype)
-        num_frames_tensor = torch.tensor([num_frames], device=device, dtype=dtype)
-        ar_tensor = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
-        fps_tensor = torch.tensor([fps], device=device, dtype=dtype)
-        model_args["height"] = height_tensor
-        model_args["width"] = width_tensor
-        model_args["num_frames"] = num_frames_tensor
-        model_args["ar"] = ar_tensor
-        model_args["fps"] = fps_tensor
+        height = torch.tensor([resolution[0]], device=device, dtype=dtype)
+        width = torch.tensor([resolution[1]], device=device, dtype=dtype)
+        num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
+        ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
+        if config.num_frames == 1:
+            config.fps = IMG_FPS
+        fps = torch.tensor([config.fps], device=device, dtype=dtype)
+        model_args["height"] = height
+        model_args["width"] = width
+        model_args["num_frames"] = num_frames
+        model_args["ar"] = ar
+        model_args["fps"] = fps
 
         # compute latent size
-        input_size = (num_frames, *resolution)
+        input_size = (config.num_frames, *resolution)
         latent_size = vae.get_latent_size(input_size)
 
         # process prompt
@@ -390,32 +342,24 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
         video_clips = []
 
         # prepare mask strategy
-        if mode == "Text2Image":
+        if mode == "Text2Video":
            mask_strategy = [None]
-        elif mode == "Text2Video":
-            if reference_image is not None:
-                mask_strategy = ['0']
-            else:
-                mask_strategy = [None]
+        elif mode == "Image2Video":
+            mask_strategy = ['0']
         else:
             raise ValueError(f"Invalid mode: {mode}")
 
         # =========================
         # 2. Load reference images
         # =========================
-        if mode == "Text2Image":
+        if mode == "Text2Video":
             refs_x = collect_references_batch([None], vae, resolution)
-        elif mode == "Text2Video":
-            if reference_image is not None:
-                # save image to disk
-                from PIL import Image
-                im = Image.fromarray(reference_image)
-
-                with NamedTemporaryFile(suffix=".jpg") as temp_file:
-                    im.save(temp_file.name)
-                    refs_x = collect_references_batch([temp_file.name], vae, resolution)
-            else:
-                refs_x = collect_references_batch([None], vae, resolution)
+        elif mode == "Image2Video":
+            # save image to disk
+            from PIL import Image
+            im = Image.fromarray(reference_image)
+            im.save("test.jpg")
+            refs_x = collect_references_batch(["test.jpg"], vae, resolution)
         else:
             raise ValueError(f"Invalid mode: {mode}")
 
@@ -442,20 +386,11 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
                     mask_strategy[j] += ";"
                 mask_strategy[
                     j
-                ] += f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length}"
+                ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
 
             masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
 
             # 4.6. diffusion sampling
-            # hack to update num_sampling_steps and cfg_scale
-            scheduler_kwargs = config.scheduler.copy()
-            scheduler_kwargs.pop('type')
-            scheduler_kwargs['num_sampling_steps'] = sampling_steps
-            scheduler_kwargs['cfg_scale'] = cfg_scale
-
-            scheduler.__init__(
-                **scheduler_kwargs
-            )
             samples = scheduler.sample(
                 stdit,
                 text_encoder,
@@ -475,20 +410,10 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
             for i in range(1, num_loop)
         ]
         video = torch.cat(video_clips_list, dim=1)
-        current_datetime = datetime.datetime.now()
-        timestamp = current_datetime.timestamp()
-        save_path = os.path.join(args.output, f"output_{timestamp}")
-        saved_path = save_sample(video, save_path=save_path, fps=config.fps // config.frame_interval)
+        save_path = f"{args.output}/sample"
+        saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
         return saved_path
 
-@spaces.GPU(duration=200)
-def run_image_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    return run_inference("Text2Image", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
-
-@spaces.GPU(duration=200)
-def run_video_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    return run_inference("Text2Video", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
-
 
 def main():
     # create demo
@@ -517,54 +442,31 @@ def main():
 
         with gr.Row():
             with gr.Column():
+                mode = gr.Radio(
+                    choices=["Text2Video", "Image2Video"],
+                    value="Text2Video",
+                    label="Usage",
+                    info="Choose your usage scenario",
+                )
                 prompt_text = gr.Textbox(
                     label="Prompt",
                     placeholder="Describe your video here",
                     lines=4,
                 )
                 resolution = gr.Radio(
-                    choices=["144p", "240p", "360p", "480p", "720p"],
-                    value="240p",
+                    choices=["144p", "240p", "360p", "480p", "720p", "1080p"],
+                    value="144p",
                     label="Resolution",
                 )
-                aspect_ratio = gr.Radio(
-                    choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
-                    value="9:16",
-                    label="Aspect Ratio (H:W)",
-                )
                 length = gr.Radio(
-                    choices=["2s", "4s", "8s", "16s"],
+                    choices=["2s", "4s", "8s"],
                     value="2s",
-                    label="Video Length (only effective for video generation)",
+                    label="Video Length",
                     info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
                 )
 
-                with gr.Row():
-                    seed = gr.Slider(
-                        value=1024,
-                        minimum=1,
-                        maximum=2048,
-                        step=1,
-                        label="Seed"
-                    )
-
-                    sampling_steps = gr.Slider(
-                        value=100,
-                        minimum=1,
-                        maximum=200,
-                        step=1,
-                        label="Sampling steps"
-                    )
-                    cfg_scale = gr.Slider(
-                        value=7.0,
-                        minimum=0.0,
-                        maximum=10.0,
-                        step=0.1,
-                        label="CFG Scale"
-                    )
-
                 reference_image = gr.Image(
-                    label="Reference Image (Optional)",
+                    label="Reference Image (only used for Image2Video)",
                 )
 
             with gr.Column():
@@ -574,18 +476,12 @@ def main():
                 )
 
         with gr.Row():
-            image_gen_button = gr.Button("Generate image")
-            video_gen_button = gr.Button("Generate video")
+            submit_button = gr.Button("Generate video")
 
 
-        image_gen_button.click(
-            fn=run_image_inference,
-            inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
-            outputs=reference_image
-        )
-        video_gen_button.click(
-            fn=run_video_inference,
-            inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
+        submit_button.click(
+            fn=run_inference,
+            inputs=[mode, prompt_text, resolution, length, reference_image],
             outputs=output_video
         )
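
Note on the reworked run_inference above: the new code turns the requested clip length into a total frame count and splits generation into loops of config.num_frames frames each. A minimal sketch of that arithmetic, with fps, frame_interval, and num_frames as assumed placeholder values (the real values come from the config file the Space loads):

import math

# assumed placeholder values; in the Space these come from the loaded config
fps = 24
frame_interval = 3
num_frames = 16  # frames generated per loop

length = "4s"  # value of the "Video Length" radio button
num_seconds = int(length.rstrip("s"))
total_number_of_frames = num_seconds * fps / frame_interval  # 4 * 24 / 3 = 32.0
num_loop = math.ceil(total_number_of_frames / num_frames)    # ceil(32 / 16) = 2
print(num_loop)  # 2 sampling loops for a 4-second request under these assumptions
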
configs/.DS_Store ADDED
Binary file (6.15 kB).
 
configs/dit/train/16x256x256.py CHANGED
@@ -18,7 +18,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     from_pretrained="DiT-XL-2-256x256.pt",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/dit/train/1x256x256.py CHANGED
@@ -19,7 +19,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     no_temporal_pos_emb=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/latte/train/16x256x256.py CHANGED
@@ -17,7 +17,7 @@ sp_size = 1
 # Define model
 model = dict(
     type="Latte-XL/2",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/.DS_Store ADDED
Binary file (6.15 kB).
 
configs/opensora-v1-1/inference/sample-ref.py CHANGED
@@ -14,34 +14,26 @@ prompt = [
 
 loop = 2
 condition_frame_length = 4
-# (
-#   loop id, [the loop index of the condition image or video]
-#   reference id, [the index of the condition image or video in the reference_path]
-#   reference start, [the start frame of the condition image or video]
-#   target start, [the location to insert]
-#   length, [the number of frames to insert]
-#   edit_ratio [the edit rate of the condition image or video]
-# )
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
-mask_strategy = [
-    "0,0,0,0,8,0.3",
-    None,
-    "0",
-]
 reference_path = [
     "https://cdn.openai.com/tmp/s/interp/d0.mp4",
     None,
     "assets/images/condition/wave.png",
 ]
+# valid when reference_path is not None
+# (loop id, ref id, ref start, length, target start)
+mask_strategy = [
+    "0,0,0,0,8,0.3",
+    None,
+    "0",
+]
 
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
+    from_pretrained=None,
     input_sq_size=512,
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
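
As a reading aid for the mask_strategy entries above: the comment block removed in this commit described each entry as six comma-separated fields (loop id, reference id, reference start, target start, length, edit ratio). A small illustrative snippet that labels the example entry from this config with those field names (names follow the removed comment and are used here only for readability, not as an official API):

# illustrative only: label the fields of one mask_strategy entry
entry = "0,0,0,0,8,0.3"  # example entry from this config
fields = ["loop_id", "reference_id", "reference_start", "target_start", "length", "edit_ratio"]
parsed = dict(zip(fields, entry.split(",")))
print(parsed)
# {'loop_id': '0', 'reference_id': '0', 'reference_start': '0',
#  'target_start': '0', 'length': '8', 'edit_ratio': '0.3'}
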
configs/opensora-v1-1/inference/sample.py CHANGED
@@ -7,10 +7,10 @@ multi_resolution = "STDiT2"
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
+    from_pretrained=None,
     input_sq_size=512,
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/benchmark.py CHANGED
@@ -65,7 +65,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/image.py CHANGED
@@ -29,7 +29,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/stage1.py CHANGED
@@ -41,7 +41,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/stage2.py CHANGED
@@ -43,7 +43,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/stage3.py CHANGED
@@ -43,7 +43,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora-v1-1/train/video.py CHANGED
@@ -31,7 +31,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora/inference/16x256x256.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/inference/16x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/inference/64x512x512.py CHANGED
@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )
configs/opensora/train/16x256x256-mask.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
configs/opensora/train/16x256x256-spee.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {
configs/opensora/train/16x256x256.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora/train/16x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=1.0,
     from_pretrained=None,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/opensora/train/360x512x512.py CHANGED
@@ -26,7 +26,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
configs/opensora/train/64x512x512-sp.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )
configs/opensora/train/64x512x512.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/train/16x256x256.py CHANGED
@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/train/1x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
configs/pixart/train/64x512x512.py CHANGED
@@ -21,7 +21,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(
requirements.txt CHANGED
@@ -1,3 +1,3 @@
-xformers
 transformers
+xformers
 git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora
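
These dependencies are installed automatically when the Space builds. To reproduce the environment locally, a standard pip invocation should be enough (assuming Python and git are available; this command is not part of the commit itself):

pip install -r requirements.txt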