Spaces:

BobLLM
/

Sora

Runtime error

App Files Community

吴吴大庸 committed on Jun 13, 2024

Commit

c098bbc

1 Parent(s): 0a7f9b3

Used the official Open-Sora Space to replace our local repo

Browse files

Files changed (30) hide show

.DS_Store +0 -0
README.md +6 -5
app.py +57 -161
configs/.DS_Store +0 -0
configs/dit/train/16x256x256.py +1 -1
configs/dit/train/1x256x256.py +1 -1
configs/latte/train/16x256x256.py +1 -1
configs/opensora-v1-1/.DS_Store +0 -0
configs/opensora-v1-1/inference/sample-ref.py +9 -17
configs/opensora-v1-1/inference/sample.py +2 -2
configs/opensora-v1-1/train/benchmark.py +1 -1
configs/opensora-v1-1/train/image.py +1 -1
configs/opensora-v1-1/train/stage1.py +1 -1
configs/opensora-v1-1/train/stage2.py +1 -1
configs/opensora-v1-1/train/stage3.py +1 -1
configs/opensora-v1-1/train/video.py +1 -1
configs/opensora/inference/16x256x256.py +1 -1
configs/opensora/inference/16x512x512.py +1 -1
configs/opensora/inference/64x512x512.py +1 -1
configs/opensora/train/16x256x256-mask.py +1 -1
configs/opensora/train/16x256x256-spee.py +1 -1
configs/opensora/train/16x256x256.py +1 -1
configs/opensora/train/16x512x512.py +1 -1
configs/opensora/train/360x512x512.py +1 -1
configs/opensora/train/64x512x512-sp.py +1 -1
configs/opensora/train/64x512x512.py +1 -1
configs/pixart/train/16x256x256.py +1 -1
configs/pixart/train/1x512x512.py +1 -1
configs/pixart/train/64x512x512.py +1 -1
requirements.txt +1 -1

.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

README.md CHANGED Viewed

@@ -1,12 +1,13 @@
 ---
-title: Sora
-emoji: 🦀
-colorFrom: gray
-colorTo: gray
 sdk: gradio
-sdk_version: 4.36.1
 app_file: app.py
 pinned: false
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

 ---
+title: Open Sora
+emoji: ⚡
+colorFrom: blue
+colorTo: purple
 sdk: gradio
+sdk_version: 4.25.0
 app_file: app.py
 pinned: false
+license: apache-2.0
 ---
 Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py CHANGED Viewed

@@ -19,12 +19,9 @@ import spaces
 import torch
 import gradio as gr
-from tempfile import NamedTemporaryFile
-import datetime
-MODEL_TYPES = ["v1.1-stage2", "v1.1-stage3"]
 CONFIG_MAP = {
     "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
     "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
@@ -34,41 +31,12 @@ HF_STDIT_MAP = {
     "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
 }
 RESOLUTION_MAP = {
-    "144p": {
-        "16:9": (256, 144),
-        "9:16": (144, 256),
-        "4:3": (221, 165),
-        "3:4": (165, 221),
-        "1:1": (192, 192),
-    },
-    "240p": {
-        "16:9": (426, 240),
-        "9:16": (240, 426),
-        "4:3": (370, 278),
-        "3:4": (278, 370),
-        "1:1": (320, 320),
-    },
-    "360p": {
-        "16:9": (640, 360),
-        "9:16": (360, 640),
-        "4:3": (554, 416),
-        "3:4": (416, 554),
-        "1:1": (480, 480),
-    },
-    "480p": {
-        "16:9": (854, 480),
-        "9:16": (480, 854),
-        "4:3": (740, 555),
-        "3:4": (555, 740),
-        "1:1": (640, 640),
-    },
-    "720p": {
-        "16:9": (1280, 720),
-        "9:16": (720, 1280),
-        "4:3": (1108, 832),
-        "3:4": (832, 1110),
-        "1:1": (960, 960),
-    },
 }
@@ -255,9 +223,9 @@ def build_models(model_type, config, enable_optimization=False):
     # build stdit
     # we load model from HuggingFace directly so that we don't need to
     # handle model download logic in HuggingFace Space
-    from opensora.models.stdit.stdit2 import STDiT2
-    stdit = STDiT2.from_pretrained(
         HF_STDIT_MAP[model_type],
         enable_flash_attn=enable_optimization,
         trust_remote_code=True,
@@ -334,53 +302,37 @@ device = torch.device("cuda")
 vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
-def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    torch.manual_seed(seed)
     with torch.inference_mode():
         # ======================
         # 1. Preparation
         # ======================
         # parse the inputs
-        resolution = RESOLUTION_MAP[resolution][aspect_ratio]
-        # gather args from config
-        num_frames = config.num_frames
-        frame_interval = config.frame_interval
-        fps = config.fps
-        condition_frame_length = config.condition_frame_length
         # compute number of loops
-        if mode == "Text2Image":
-            num_frames = 1
-            num_loop = 1
-        else:
-            num_seconds = int(length.rstrip('s'))
-            if num_seconds <= 16:
-                num_frames = num_seconds * fps // frame_interval
-                num_loop = 1
-            else:
-                config.num_frames = 16
-                total_number_of_frames = num_seconds * fps / frame_interval
-                num_loop = math.ceil((total_number_of_frames - condition_frame_length) / (num_frames - condition_frame_length))
         # prepare model args
-        if config.num_frames == 1:
-            fps = IMG_FPS
         model_args = dict()
-        height_tensor = torch.tensor([resolution[0]], device=device, dtype=dtype)
-        width_tensor = torch.tensor([resolution[1]], device=device, dtype=dtype)
-        num_frames_tensor = torch.tensor([num_frames], device=device, dtype=dtype)
-        ar_tensor = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
-        fps_tensor = torch.tensor([fps], device=device, dtype=dtype)
-        model_args["height"] = height_tensor
-        model_args["width"] = width_tensor
-        model_args["num_frames"] = num_frames_tensor
-        model_args["ar"] = ar_tensor
-        model_args["fps"] = fps_tensor
         # compute latent size
-        input_size = (num_frames, *resolution)
         latent_size = vae.get_latent_size(input_size)
         # process prompt
@@ -390,32 +342,24 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
         video_clips = []
         # prepare mask strategy
-        if mode == "Text2Image":
             mask_strategy = [None]
-        elif mode == "Text2Video":
-            if reference_image is not None:
-                mask_strategy = ['0']
-            else:
-                mask_strategy = [None]
         else:
             raise ValueError(f"Invalid mode: {mode}")
         # =========================
         # 2. Load reference images
         # =========================
-        if mode == "Text2Image":
             refs_x = collect_references_batch([None], vae, resolution)
-        elif mode == "Text2Video":
-            if reference_image is not None:
-                # save image to disk
-                from PIL import Image
-                im = Image.fromarray(reference_image)
-                with NamedTemporaryFile(suffix=".jpg") as temp_file:
-                    im.save(temp_file.name)
-                    refs_x = collect_references_batch([temp_file.name], vae, resolution)
-            else:
-                refs_x = collect_references_batch([None], vae, resolution)
         else:
             raise ValueError(f"Invalid mode: {mode}")
@@ -442,20 +386,11 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
                         mask_strategy[j] += ";"
                     mask_strategy[
                         j
-                    ] += f"{loop_i},{len(refs)-1},-{condition_frame_length},0,{condition_frame_length}"
             masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
             # 4.6. diffusion sampling
-            # hack to update num_sampling_steps and cfg_scale
-            scheduler_kwargs = config.scheduler.copy()
-            scheduler_kwargs.pop('type')
-            scheduler_kwargs['num_sampling_steps'] = sampling_steps
-            scheduler_kwargs['cfg_scale'] = cfg_scale
-            scheduler.__init__(
-                **scheduler_kwargs
-            )
             samples = scheduler.sample(
                 stdit,
                 text_encoder,
@@ -475,20 +410,10 @@ def run_inference(mode, prompt_text, resolution, aspect_ratio, length, reference
                     for i in range(1, num_loop)
                 ]
                 video = torch.cat(video_clips_list, dim=1)
-                current_datetime = datetime.datetime.now()
-                timestamp = current_datetime.timestamp()
-                save_path = os.path.join(args.output, f"output_{timestamp}")
-                saved_path = save_sample(video, save_path=save_path, fps=config.fps // config.frame_interval)
                 return saved_path
-@spaces.GPU(duration=200)
-def run_image_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    return run_inference("Text2Image", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
-@spaces.GPU(duration=200)
-def run_video_inference(prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale):
-    return run_inference("Text2Video", prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale)
 def main():
     # create demo
@@ -517,54 +442,31 @@ def main():
         with gr.Row():
             with gr.Column():
                 prompt_text = gr.Textbox(
                     label="Prompt",
                     placeholder="Describe your video here",
                     lines=4,
                 )
                 resolution = gr.Radio(
-                     choices=["144p", "240p", "360p", "480p", "720p"],
-                     value="240p",
                     label="Resolution",
                 )
-                aspect_ratio = gr.Radio(
-                     choices=["9:16", "16:9", "3:4", "4:3", "1:1"],
-                     value="9:16",
-                    label="Aspect Ratio (H:W)",
-                )
                 length = gr.Radio(
-                    choices=["2s", "4s", "8s", "16s"],
                     value="2s",
-                    label="Video Length (only effective for video generation)",
                     info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
                 )
-                with gr.Row():
-                    seed = gr.Slider(
-                        value=1024,
-                        minimum=1,
-                        maximum=2048,
-                        step=1,
-                        label="Seed"
-                    )
-                    sampling_steps = gr.Slider(
-                        value=100,
-                        minimum=1,
-                        maximum=200,
-                        step=1,
-                        label="Sampling steps"
-                    )
-                    cfg_scale = gr.Slider(
-                        value=7.0,
-                        minimum=0.0,
-                        maximum=10.0,
-                        step=0.1,
-                        label="CFG Scale"
-                    )
                 reference_image = gr.Image(
-                    label="Reference Image (Optional)",
                 )
             with gr.Column():
@@ -574,18 +476,12 @@ def main():
                 )
         with gr.Row():
-             image_gen_button = gr.Button("Generate image")
-             video_gen_button = gr.Button("Generate video")
-        image_gen_button.click(
-             fn=run_image_inference,
-             inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
-             outputs=reference_image
-             )
-        video_gen_button.click(
-             fn=run_video_inference,
-             inputs=[prompt_text, resolution, aspect_ratio, length, reference_image, seed, sampling_steps, cfg_scale],
              outputs=output_video
              )

 import torch
 import gradio as gr
+MODEL_TYPES = ["v1.1"]
 CONFIG_MAP = {
     "v1.1-stage2": "configs/opensora-v1-1/inference/sample-ref.py",
     "v1.1-stage3": "configs/opensora-v1-1/inference/sample-ref.py",
     "v1.1-stage3": "hpcai-tech/OpenSora-STDiT-v2-stage3",
 }
 RESOLUTION_MAP = {
+    "144p": (144, 256),
+    "240p": (240, 426),
+    "360p": (360, 480),
+    "480p": (480, 858),
+    "720p": (720, 1280),
+    "1080p": (1080, 1920)
 }
     # build stdit
     # we load model from HuggingFace directly so that we don't need to
     # handle model download logic in HuggingFace Space
+    from transformers import AutoModel
+    stdit = AutoModel.from_pretrained(
         HF_STDIT_MAP[model_type],
         enable_flash_attn=enable_optimization,
         trust_remote_code=True,
 vae, text_encoder, stdit, scheduler = build_models(args.model_type, config, enable_optimization=args.enable_optimization)
+@spaces.GPU(duration=200)
+def run_inference(mode, prompt_text, resolution, length, reference_image):
     with torch.inference_mode():
         # ======================
         # 1. Preparation
         # ======================
         # parse the inputs
+        resolution = RESOLUTION_MAP[resolution]
         # compute number of loops
+        num_seconds = int(length.rstrip('s'))
+        total_number_of_frames = num_seconds * config.fps / config.frame_interval
+        num_loop = math.ceil(total_number_of_frames / config.num_frames)
         # prepare model args
         model_args = dict()
+        height = torch.tensor([resolution[0]], device=device, dtype=dtype)
+        width = torch.tensor([resolution[1]], device=device, dtype=dtype)
+        num_frames = torch.tensor([config.num_frames], device=device, dtype=dtype)
+        ar = torch.tensor([resolution[0] / resolution[1]], device=device, dtype=dtype)
+        if config.num_frames == 1:
+            config.fps = IMG_FPS
+        fps = torch.tensor([config.fps], device=device, dtype=dtype)
+        model_args["height"] = height
+        model_args["width"] = width
+        model_args["num_frames"] = num_frames
+        model_args["ar"] = ar
+        model_args["fps"] = fps
         # compute latent size
+        input_size = (config.num_frames, *resolution)
         latent_size = vae.get_latent_size(input_size)
         # process prompt
         video_clips = []
         # prepare mask strategy
+        if mode == "Text2Video":
             mask_strategy = [None]
+        elif mode == "Image2Video":
+            mask_strategy = ['0']
         else:
             raise ValueError(f"Invalid mode: {mode}")
         # =========================
         # 2. Load reference images
         # =========================
+        if mode == "Text2Video":
             refs_x = collect_references_batch([None], vae, resolution)
+        elif mode == "Image2Video":
+            # save image to disk
+            from PIL import Image
+            im = Image.fromarray(reference_image)
+            im.save("test.jpg")
+            refs_x = collect_references_batch(["test.jpg"], vae, resolution)
         else:
             raise ValueError(f"Invalid mode: {mode}")
                         mask_strategy[j] += ";"
                     mask_strategy[
                         j
+                    ] += f"{loop_i},{len(refs)-1},-{config.condition_frame_length},0,{config.condition_frame_length}"
             masks = apply_mask_strategy(z, refs_x, mask_strategy, loop_i)
             # 4.6. diffusion sampling
             samples = scheduler.sample(
                 stdit,
                 text_encoder,
                     for i in range(1, num_loop)
                 ]
                 video = torch.cat(video_clips_list, dim=1)
+                save_path = f"{args.output}/sample"
+                saved_path = save_sample(video, fps=config.fps // config.frame_interval, save_path=save_path, force_video=True)
                 return saved_path
 def main():
     # create demo
         with gr.Row():
             with gr.Column():
+                mode = gr.Radio(
+                    choices=["Text2Video", "Image2Video"],
+                    value="Text2Video",
+                    label="Usage",
+                    info="Choose your usage scenario",
+                )
                 prompt_text = gr.Textbox(
                     label="Prompt",
                     placeholder="Describe your video here",
                     lines=4,
                 )
                 resolution = gr.Radio(
+                     choices=["144p", "240p", "360p", "480p", "720p", "1080p"],
+                     value="144p",
                     label="Resolution",
                 )
                 length = gr.Radio(
+                    choices=["2s", "4s", "8s"],
                     value="2s",
+                    label="Video Length",
                     info="8s may fail as Hugging Face ZeroGPU has the limitation of max 200 seconds inference time."
                 )
                 reference_image = gr.Image(
+                    label="Reference Image (only used for Image2Video)",
                 )
             with gr.Column():
                 )
         with gr.Row():
+             submit_button = gr.Button("Generate video")
+        submit_button.click(
+             fn=run_inference,
+             inputs=[mode, prompt_text, resolution, length, reference_image],
              outputs=output_video
              )

configs/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

configs/dit/train/16x256x256.py CHANGED Viewed

@@ -18,7 +18,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     from_pretrained="DiT-XL-2-256x256.pt",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

 model = dict(
     type="DiT-XL/2",
     from_pretrained="DiT-XL-2-256x256.pt",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/dit/train/1x256x256.py CHANGED Viewed

@@ -19,7 +19,7 @@ sp_size = 1
 model = dict(
     type="DiT-XL/2",
     no_temporal_pos_emb=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

 model = dict(
     type="DiT-XL/2",
     no_temporal_pos_emb=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/latte/train/16x256x256.py CHANGED Viewed

@@ -17,7 +17,7 @@ sp_size = 1
 # Define model
 model = dict(
     type="Latte-XL/2",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

 # Define model
 model = dict(
     type="Latte-XL/2",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/.DS_Store ADDED Viewed

Binary file (6.15 kB). View file

configs/opensora-v1-1/inference/sample-ref.py CHANGED Viewed

@@ -14,34 +14,26 @@ prompt = [
 loop = 2
 condition_frame_length = 4
-# (
-#   loop id, [the loop index of the condition image or video]
-#   reference id, [the index of the condition image or video in the reference_path]
-#   reference start, [the start frame of the condition image or video]
-#   target start, [the location to insert]
-#   length, [the number of frames to insert]
-#   edit_ratio [the edit rate of the condition image or video]
-# )
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/config.md#advanced-inference-config for more details
-# See https://github.com/hpcaitech/Open-Sora/blob/main/docs/commands.md#inference-with-open-sora-11 for more examples
-mask_strategy = [
-    "0,0,0,0,8,0.3",
-    None,
-    "0",
-]
 reference_path = [
     "https://cdn.openai.com/tmp/s/interp/d0.mp4",
     None,
     "assets/images/condition/wave.png",
 ]
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
     input_sq_size=512,
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

 loop = 2
 condition_frame_length = 4
 reference_path = [
     "https://cdn.openai.com/tmp/s/interp/d0.mp4",
     None,
     "assets/images/condition/wave.png",
 ]
+# valid when reference_path is not None
+# (loop id, ref id, ref start, target start, length, edit ratio)
+mask_strategy = [
+    "0,0,0,0,8,0.3",
+    None,
+    "0",
+]
 # Define model
 model = dict(
     type="STDiT2-XL/2",
+    from_pretrained=None,
     input_sq_size=512,
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/inference/sample.py CHANGED Viewed

@@ -7,10 +7,10 @@ multi_resolution = "STDiT2"
 # Define model
 model = dict(
     type="STDiT2-XL/2",
-    from_pretrained="hpcai-tech/OpenSora-STDiT-v2-stage3",
     input_sq_size=512,
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

 # Define model
 model = dict(
     type="STDiT2-XL/2",
+    from_pretrained=None,
     input_sq_size=512,
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/benchmark.py CHANGED Viewed

@@ -65,7 +65,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/image.py CHANGED Viewed

@@ -29,7 +29,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/stage1.py CHANGED Viewed

@@ -41,7 +41,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/stage2.py CHANGED Viewed

@@ -43,7 +43,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/stage3.py CHANGED Viewed

@@ -43,7 +43,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora-v1-1/train/video.py CHANGED Viewed

@@ -31,7 +31,7 @@ model = dict(
     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     from_pretrained=None,
     input_sq_size=512,  # pretrained model is trained on 512x512
     qk_norm=True,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora/inference/16x256x256.py CHANGED Viewed

@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

     type="STDiT-XL/2",
     space_scale=0.5,
     time_scale=1.0,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

configs/opensora/inference/16x512x512.py CHANGED Viewed

@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=1.0,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

configs/opensora/inference/64x512x512.py CHANGED Viewed

@@ -7,7 +7,7 @@ model = dict(
     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

     type="STDiT-XL/2",
     space_scale=1.0,
     time_scale=2 / 3,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     from_pretrained="PRETRAINED_MODEL",
 )

configs/opensora/train/16x256x256-mask.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {

     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {

configs/opensora/train/16x256x256-spee.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {

     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 mask_ratios = {

configs/opensora/train/16x256x256.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora/train/16x512x512.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=1.0,
     from_pretrained=None,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     space_scale=1.0,
     time_scale=1.0,
     from_pretrained=None,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/opensora/train/360x512x512.py CHANGED Viewed

@@ -26,7 +26,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )

     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )

configs/opensora/train/64x512x512-sp.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )

     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
     enable_sequence_parallelism=True,  # enable sq here
 )

configs/opensora/train/64x512x512.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/pixart/train/16x256x256.py CHANGED Viewed

@@ -20,7 +20,7 @@ model = dict(
     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     space_scale=0.5,
     time_scale=1.0,
     from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/pixart/train/1x512x512.py CHANGED Viewed

@@ -21,7 +21,7 @@ model = dict(
     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     time_scale=1.0,
     no_temporal_pos_emb=True,
     from_pretrained="PixArt-XL-2-512x512.pth",
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

configs/pixart/train/64x512x512.py CHANGED Viewed

@@ -21,7 +21,7 @@ model = dict(
     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
-    enable_flash_attn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

     space_scale=1.0,
     time_scale=2 / 3,
     from_pretrained=None,
+    enable_flashattn=True,
     enable_layernorm_kernel=True,
 )
 vae = dict(

requirements.txt CHANGED Viewed

@@ -1,3 +1,3 @@
-xformers
 transformers
 git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora

 transformers
+xformers
 git+https://github.com/hpcaitech/Open-Sora.git#egg=opensora