Video-Diffusion-WebUI

Paused

App Files Files Community

wrdias

kadirnar committed on Jul 20, 2023

Commit

2cc4443

•

0 Parent(s):

Duplicate from ArtGAN/Video-Diffusion-WebUI

Browse files

Co-authored-by: Kadir Nar <kadirnar@users.noreply.huggingface.co>

Files changed (29) hide show

.gitattributes +34 -0
README.md +16 -0
app.py +50 -0
requirements.txt +12 -0
video_diffusion/__init__.py +1 -0
video_diffusion/damo/damo_text2_video.py +126 -0
video_diffusion/inpaint_zoom/__init__.py +0 -0
video_diffusion/inpaint_zoom/utils/__init__.py +0 -0
video_diffusion/inpaint_zoom/utils/zoom_in_utils.py +75 -0
video_diffusion/inpaint_zoom/utils/zoom_out_utils.py +47 -0
video_diffusion/inpaint_zoom/zoom_in_app.py +186 -0
video_diffusion/inpaint_zoom/zoom_out_app.py +140 -0
video_diffusion/stable_diffusion_video/__init__.py +0 -0
video_diffusion/stable_diffusion_video/image_generation.py +363 -0
video_diffusion/stable_diffusion_video/stable_diffusion_pipeline.py +848 -0
video_diffusion/stable_diffusion_video/stable_video_text2video.py +158 -0
video_diffusion/stable_diffusion_video/upsampling.py +104 -0
video_diffusion/stable_diffusion_video/utils.py +135 -0
video_diffusion/tuneavideo/models/attention.py +322 -0
video_diffusion/tuneavideo/models/resnet.py +208 -0
video_diffusion/tuneavideo/models/unet.py +437 -0
video_diffusion/tuneavideo/models/unet_blocks.py +588 -0
video_diffusion/tuneavideo/pipelines/pipeline_tuneavideo.py +411 -0
video_diffusion/tuneavideo/tuneavideo_text2video.py +153 -0
video_diffusion/tuneavideo/util.py +93 -0
video_diffusion/utils/__init__.py +0 -0
video_diffusion/utils/model_list.py +6 -0
video_diffusion/utils/scheduler_list.py +32 -0
video_diffusion/zero_shot/zero_shot_text2video.py +164 -0

.gitattributes ADDED Viewed

	@@ -0,0 +1,34 @@

+*.7z filter=lfs diff=lfs merge=lfs -text
+*.arrow filter=lfs diff=lfs merge=lfs -text
+*.bin filter=lfs diff=lfs merge=lfs -text
+*.bz2 filter=lfs diff=lfs merge=lfs -text
+*.ckpt filter=lfs diff=lfs merge=lfs -text
+*.ftz filter=lfs diff=lfs merge=lfs -text
+*.gz filter=lfs diff=lfs merge=lfs -text
+*.h5 filter=lfs diff=lfs merge=lfs -text
+*.joblib filter=lfs diff=lfs merge=lfs -text
+*.lfs.* filter=lfs diff=lfs merge=lfs -text
+*.mlmodel filter=lfs diff=lfs merge=lfs -text
+*.model filter=lfs diff=lfs merge=lfs -text
+*.msgpack filter=lfs diff=lfs merge=lfs -text
+*.npy filter=lfs diff=lfs merge=lfs -text
+*.npz filter=lfs diff=lfs merge=lfs -text
+*.onnx filter=lfs diff=lfs merge=lfs -text
+*.ot filter=lfs diff=lfs merge=lfs -text
+*.parquet filter=lfs diff=lfs merge=lfs -text
+*.pb filter=lfs diff=lfs merge=lfs -text
+*.pickle filter=lfs diff=lfs merge=lfs -text
+*.pkl filter=lfs diff=lfs merge=lfs -text
+*.pt filter=lfs diff=lfs merge=lfs -text
+*.pth filter=lfs diff=lfs merge=lfs -text
+*.rar filter=lfs diff=lfs merge=lfs -text
+*.safetensors filter=lfs diff=lfs merge=lfs -text
+saved_model/**/* filter=lfs diff=lfs merge=lfs -text
+*.tar.* filter=lfs diff=lfs merge=lfs -text
+*.tflite filter=lfs diff=lfs merge=lfs -text
+*.tgz filter=lfs diff=lfs merge=lfs -text
+*.wasm filter=lfs diff=lfs merge=lfs -text
+*.xz filter=lfs diff=lfs merge=lfs -text
+*.zip filter=lfs diff=lfs merge=lfs -text
+*.zst filter=lfs diff=lfs merge=lfs -text
+*tfevents* filter=lfs diff=lfs merge=lfs -text

README.md ADDED Viewed

	@@ -0,0 +1,16 @@

+---
+title: Video Diffusion WebUI
+emoji: 🏃
+colorFrom: gray
+colorTo: yellow
+sdk: gradio
+sdk_version: 3.19.0
+app_file: app.py
+pinned: false
+license: apache-2.0
+tags:
+- making-demos
+duplicated_from: ArtGAN/Video-Diffusion-WebUI
+---
+Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference

app.py ADDED Viewed

	@@ -0,0 +1,50 @@

+import gradio as gr
+from video_diffusion.damo.damo_text2_video import DamoText2VideoGenerator
+from video_diffusion.inpaint_zoom.zoom_in_app import StableDiffusionZoomIn
+from video_diffusion.inpaint_zoom.zoom_out_app import StableDiffusionZoomOut
+from video_diffusion.stable_diffusion_video.stable_video_text2video import StableDiffusionText2VideoGenerator
+from video_diffusion.tuneavideo.tuneavideo_text2video import TunaVideoText2VideoGenerator
+from video_diffusion.zero_shot.zero_shot_text2video import ZeroShotText2VideoGenerator
+def diffusion_app():
+    """Assemble the tabbed Gradio interface and launch it.
+    Each generator class contributes its own controls through its ``app()``
+    callable; this function only lays out the header and the tabs.
+    """
+    with gr.Blocks() as demo:
+        # Page title.
+        gr.HTML(
+        """
+        <h1 style='text-align: center'>
+       Video Diffusion WebUI
+        </h1>
+        """
+        )
+        # Author links shown below the title.
+        gr.HTML(
+            """
+            <h3 style='text-align: center'>
+            Follow me for more!
+            <a href='https://twitter.com/kadirnar_ai' target='_blank'>Twitter</a> | <a href='https://github.com/kadirnar' target='_blank'>Github</a> | <a href='https://www.linkedin.com/in/kadir-nar/' target='_blank'>Linkedin</a>
+            </h3>
+            """
+        )
+        # A single row/column holding one tab per generator.
+        with gr.Row(), gr.Column():
+            with gr.Tab("Stable Diffusion Video"):
+                StableDiffusionText2VideoGenerator.app()
+            with gr.Tab("Tune-a-Video"):
+                TunaVideoText2VideoGenerator.app()
+            # The zoom tab nests two sub-tabs, one per direction.
+            with gr.Tab("Stable Infinite Zoom"):
+                with gr.Tab("Zoom In"):
+                    StableDiffusionZoomIn.app()
+                with gr.Tab("Zoom Out"):
+                    StableDiffusionZoomOut.app()
+            with gr.Tab("Damo Text2Video"):
+                DamoText2VideoGenerator.app()
+            with gr.Tab("Zero Shot Text2Video"):
+                ZeroShotText2VideoGenerator.app()
+    # Enable the request queue (concurrency_count=1) before starting the server.
+    demo.queue(concurrency_count=1)
+    demo.launch(debug=True, enable_queue=True)
+if __name__ == "__main__":
+    diffusion_app()

requirements.txt ADDED Viewed

	@@ -0,0 +1,12 @@

+torch==2.0.0
+git+https://github.com/huggingface/diffusers
+transformers
+accelerate
+opencv-python
+realesrgan==0.2.5.0
+librosa
+xformers
+einops
+av<10.0.0
+imageio==2.9.0
+imageio-ffmpeg==0.4.2

video_diffusion/__init__.py ADDED Viewed

	@@ -0,0 +1 @@


+__version__ = "0.0.1"

video_diffusion/damo/damo_text2_video.py ADDED Viewed

	@@ -0,0 +1,126 @@

+import gradio as gr
+import torch
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+from diffusers.utils import export_to_video
+from video_diffusion.utils.scheduler_list import diff_scheduler_list, get_scheduler_list
+# Hub model ids offered in the "Model List" dropdown.
+stable_model_list = ["damo-vilab/text-to-video-ms-1.7b", "cerspense/zeroscope_v2_576w"]
+class DamoText2VideoGenerator:
+    """Text-to-video generation tab backed by a diffusers ``DiffusionPipeline``."""
+    def __init__(self):
+        self.pipe = None
+        # (model id, scheduler name) the cached pipeline was built for.
+        self._loaded_config = None
+    def load_model(self, stable_model, scheduler):
+        """Return a CUDA pipeline for ``stable_model`` configured with ``scheduler``.
+        The pipeline is cached, but rebuilt whenever the requested model or
+        scheduler differs from the cached one. Previously the first pipeline
+        was reused forever, so later dropdown changes were silently ignored.
+        :param stable_model: Hub model id to load
+        :param scheduler: scheduler name understood by ``get_scheduler_list``
+        :return: the ready-to-call pipeline
+        """
+        requested = (stable_model, scheduler)
+        if self.pipe is None or getattr(self, "_loaded_config", None) != requested:
+            # Release the old pipeline before allocating the new one to keep peak GPU memory down.
+            self.pipe = None
+            torch.cuda.empty_cache()
+            pipe = DiffusionPipeline.from_pretrained(
+                stable_model, torch_dtype=torch.float16, variant="fp16"
+            )
+            pipe = get_scheduler_list(pipe=pipe, scheduler=scheduler)
+            pipe.to("cuda")
+            pipe.enable_xformers_memory_efficient_attention()
+            self.pipe = pipe
+            self._loaded_config = requested
+        return self.pipe
+    def generate_video(
+        self,
+        prompt: str,
+        negative_prompt: str,
+        stable_model: str,
+        num_frames: int,
+        num_inference_steps: int,
+        guidance_scale: float,
+        height: int,
+        width: int,
+        scheduler: str,
+    ):
+        """Generate a clip for ``prompt`` and return the path of the exported video.
+        :param prompt: text describing the desired video
+        :param negative_prompt: text describing what to avoid
+        :param stable_model: Hub model id to use
+        :param num_frames: number of frames to generate
+        :param num_inference_steps: number of denoising steps
+        :param guidance_scale: guidance scale forwarded to the pipeline
+        :param height: frame height in pixels
+        :param width: frame width in pixels
+        :param scheduler: scheduler name
+        :return: path returned by ``export_to_video``
+        """
+        pipe = self.load_model(stable_model=stable_model, scheduler=scheduler)
+        # UI sliders may deliver numeric values as floats; the pipeline needs integer counts and sizes.
+        video = pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            num_frames=int(num_frames),
+            height=int(height),
+            width=int(width),
+            num_inference_steps=int(num_inference_steps),
+            guidance_scale=guidance_scale,
+        ).frames
+        video_path = export_to_video(video)
+        return video_path
+    @staticmethod
+    def app():
+        """Render this tab's controls and wire the button to ``generate_video``.
+        Must be called inside an active ``gr.Blocks`` context.
+        """
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    damo_text2video_prompt = gr.Textbox(lines=1, placeholder="Prompt", show_label=False)
+                    damo_text2video_negative_prompt = gr.Textbox(
+                        lines=1, placeholder="Negative Prompt", show_label=False
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            damo_text2video_model_list = gr.Dropdown(
+                                choices=stable_model_list,
+                                label="Model List",
+                                value=stable_model_list[0],
+                            )
+                            damo_text2video_num_inference_steps = gr.Slider(
+                                minimum=1,
+                                maximum=100,
+                                value=50,
+                                step=1,
+                                label="Inference Steps",
+                            )
+                            damo_text2video_guidance_scale = gr.Slider(
+                                minimum=1,
+                                maximum=15,
+                                value=7,
+                                step=1,
+                                label="Guidance Scale",
+                            )
+                            damo_text2video_num_frames = gr.Slider(
+                                minimum=1,
+                                maximum=50,
+                                value=16,
+                                step=1,
+                                label="Number of Frames",
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                damo_text2video_height = gr.Slider(
+                                    minimum=128,
+                                    maximum=1280,
+                                    value=512,
+                                    step=32,
+                                    label="Height",
+                                )
+                                damo_text2video_width = gr.Slider(
+                                    minimum=128,
+                                    maximum=1280,
+                                    value=512,
+                                    step=32,
+                                    label="Width",
+                                )
+                                damo_text2video_scheduler = gr.Dropdown(
+                                    choices=diff_scheduler_list,
+                                    label="Scheduler",
+                                    value=diff_scheduler_list[6],
+                                )
+                    damo_text2video_generate = gr.Button(value="Generator")
+                with gr.Column():
+                    damo_output = gr.Video(label="Output")
+            # Inputs are passed positionally, so the order must match generate_video's signature.
+            damo_text2video_generate.click(
+                fn=DamoText2VideoGenerator().generate_video,
+                inputs=[
+                    damo_text2video_prompt,
+                    damo_text2video_negative_prompt,
+                    damo_text2video_model_list,
+                    damo_text2video_num_frames,
+                    damo_text2video_num_inference_steps,
+                    damo_text2video_guidance_scale,
+                    damo_text2video_height,
+                    damo_text2video_width,
+                    damo_text2video_scheduler,
+                ],
+                outputs=damo_output,
+            )

video_diffusion/inpaint_zoom/__init__.py ADDED Viewed

File without changes

video_diffusion/inpaint_zoom/utils/__init__.py ADDED Viewed

File without changes

video_diffusion/inpaint_zoom/utils/zoom_in_utils.py ADDED Viewed

	@@ -0,0 +1,75 @@

+import os
+import cv2
+import numpy as np
+from PIL import Image
+# Runs at import time: sets CUDA_VISIBLE_DEVICES to "0" for the whole process.
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+def write_video(file_path, frames, fps, reversed=True):
+    """
+    Writes frames to an mp4 video file
+    :param file_path: Path to output video, must end with .mp4
+    :param frames: List of PIL.Image objects (left unmodified)
+    :param fps: Desired frame rate
+    :param reversed: if order of images to be reversed (default = True)
+    """
+    # Work on a copy: reversing in place would silently reorder the caller's list.
+    ordered_frames = list(frames)
+    if reversed:
+        ordered_frames.reverse()
+    # The first frame fixes the output size.
+    w, h = ordered_frames[0].size
+    fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
+    # fourcc = cv2.VideoWriter_fourcc(*'avc1')
+    writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h))
+    try:
+        for frame in ordered_frames:
+            # OpenCV expects BGR channel order; PIL delivers RGB.
+            np_frame = np.array(frame.convert("RGB"))
+            cv_frame = cv2.cvtColor(np_frame, cv2.COLOR_RGB2BGR)
+            writer.write(cv_frame)
+    finally:
+        # Always release the writer, even if a frame fails to convert.
+        writer.release()
+def image_grid(imgs, rows, cols):
+    """
+    Pastes images into a single rows x cols grid, filled row by row
+    :param imgs: List of PIL.Image objects; cell size is taken from the first image
+    :param rows: Number of grid rows
+    :param cols: Number of grid columns
+    :return: New RGB PIL.Image containing the grid
+    :raises ValueError: if the number of images is not rows * cols
+    """
+    # Explicit check instead of assert, which is stripped under ``python -O``.
+    if len(imgs) != rows * cols:
+        raise ValueError(f"Expected {rows * cols} images for a {rows}x{cols} grid, got {len(imgs)}.")
+    w, h = imgs[0].size
+    grid = Image.new("RGB", size=(cols * w, rows * h))
+    for i, img in enumerate(imgs):
+        # Column index selects x, row index selects y.
+        grid.paste(img, box=(i % cols * w, i // cols * h))
+    return grid
+def shrink_and_paste_on_blank(current_image, mask_width):
+    """
+    Decreases size of current_image by mask_width pixels from each side,
+    then adds a mask_width width transparent frame,
+    so that the image the function returns is the same size as the input.
+    :param current_image: input image to transform
+    :param mask_width: width in pixels to shrink from each side
+    :return: RGBA PIL.Image of the same size as current_image
+    """
+    width, height = current_image.size
+    # PIL's resize takes (width, height); the previous (height, width) order
+    # only worked for square images.
+    inner_size = (width - 2 * mask_width, height - 2 * mask_width)
+    shrunk = np.array(current_image.resize(inner_size).convert("RGBA"))
+    # Blank canvas in numpy's (rows, cols, channels) layout. The frame gets
+    # alpha 1 rather than 0; callers in zoom_in_app build the inpainting mask
+    # as 255 - alpha and also use the result as a paste mask.
+    canvas = np.zeros((height, width, 4), dtype=np.uint8)
+    canvas[:, :, 3] = 1
+    # Paste the shrunk image into the centre of the canvas.
+    canvas[mask_width : height - mask_width, mask_width : width - mask_width, :] = shrunk
+    return Image.fromarray(canvas)
+def dummy(images, **kwargs):
+    """Return the images untouched together with a constant ``False`` flag.
+    zoom_in_app assigns this function to ``pipe.safety_checker``; any keyword
+    arguments it receives are ignored.
+    """
+    flagged = False
+    return images, flagged

video_diffusion/inpaint_zoom/utils/zoom_out_utils.py ADDED Viewed

	@@ -0,0 +1,47 @@

+import cv2
+import numpy as np
+from PIL import Image
+def write_video(file_path, frames, fps):
+    """
+    Writes frames to an mp4 video file
+    :param file_path: Path to output video, must end with .mp4
+    :param frames: List of PIL.Image objects
+    :param fps: Desired frame rate
+    """
+    # The first frame fixes the output size.
+    w, h = frames[0].size
+    fourcc = cv2.VideoWriter_fourcc("m", "p", "4", "v")
+    writer = cv2.VideoWriter(file_path, fourcc, fps, (w, h))
+    try:
+        for frame in frames:
+            # OpenCV expects BGR channel order; PIL delivers RGB.
+            np_frame = np.array(frame.convert("RGB"))
+            cv_frame = cv2.cvtColor(np_frame, cv2.COLOR_RGB2BGR)
+            writer.write(cv_frame)
+    finally:
+        # Always release the writer, even if a frame fails to convert.
+        writer.release()
+def dummy(images, **kwargs):
+    """Return the images untouched together with a constant ``False`` flag.
+    zoom_out_app assigns this function to ``pipe.safety_checker``; any keyword
+    arguments it receives are ignored.
+    """
+    flagged = False
+    return images, flagged
+def preprocess_image(current_image, steps, image_size):
+    """Shrink ``current_image`` by ``steps`` pixels per side and centre it on a blank RGBA canvas.
+    :param current_image: PIL image to shrink
+    :param steps: border width in pixels left around the shrunk image
+    :param image_size: edge length used for both the resize target and the paste window
+    :return: RGBA PIL.Image whose border pixels have alpha 1
+    """
+    inner_edge = image_size - 2 * steps
+    # Zeroed canvas with the same shape and dtype as the RGBA input; alpha is set to 1.
+    canvas = np.zeros_like(np.array(current_image.convert("RGBA")))
+    canvas[:, :, 3] = 1
+    shrunk = np.array(current_image.resize((inner_edge, inner_edge)).convert("RGBA"))
+    lo, hi = steps, image_size - steps
+    canvas[lo:hi, lo:hi, :] = shrunk
+    return Image.fromarray(canvas)
+def preprocess_mask_image(current_image):
+    """Split an image with an alpha channel into an RGB image and an RGB mask.
+    The mask is the inverted alpha channel (255 - alpha).
+    :param current_image: PIL image; assumed to carry an alpha channel as its 4th band
+    :return: tuple ``(rgb_image, mask_image)``
+    """
+    # assume image has alpha mask (use .mode to check for "RGBA")
+    alpha = np.array(current_image)[:, :, 3]
+    inverted_alpha = Image.fromarray(255 - alpha)
+    return current_image.convert("RGB"), inverted_alpha.convert("RGB")

video_diffusion/inpaint_zoom/zoom_in_app.py ADDED Viewed

	@@ -0,0 +1,186 @@

+import os
+import gradio as gr
+import numpy as np
+import torch
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+from PIL import Image
+from video_diffusion.inpaint_zoom.utils.zoom_in_utils import dummy, image_grid, shrink_and_paste_on_blank, write_video
+# Runs at import time: sets CUDA_VISIBLE_DEVICES to "0" for the whole process.
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# Inpainting model ids offered in the model dropdown.
+stable_paint_model_list = ["stabilityai/stable-diffusion-2-inpainting", "runwayml/stable-diffusion-inpainting"]
+# Example prompts; the first entry is the default value of the prompt box.
+stable_paint_prompt_list = [
+    "children running in the forest , sunny, bright, by studio ghibli painting, superior quality, masterpiece,  traditional Japanese colors, by Grzegorz Rutkowski, concept art",
+    "A beautiful landscape of a mountain range with a lake in the foreground",
+]
+# Default negative prompt ("lurry" was a typo for "blurry").
+stable_paint_negative_prompt_list = [
+    "blurry, bad art, blurred, text, watermark",
+]
+class StableDiffusionZoomIn:
+    """Infinite zoom-in video tab built on a Stable Diffusion inpainting pipeline."""
+    def __init__(self):
+        self.pipe = None
+        # Model id the cached pipeline was loaded from.
+        self._loaded_model_id = None
+    def load_model(self, model_id):
+        """Return a CUDA inpainting pipeline for ``model_id``.
+        The pipeline is cached and rebuilt only when a different ``model_id``
+        is requested. Previously the first model was reused forever, so
+        switching the dropdown had no effect.
+        :param model_id: Hub model id to load
+        :return: the ready-to-call pipeline
+        """
+        if self.pipe is None or getattr(self, "_loaded_model_id", None) != model_id:
+            # Release the old pipeline before allocating the new one to keep peak GPU memory down.
+            self.pipe = None
+            torch.cuda.empty_cache()
+            pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16, revision="fp16")
+            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+            pipe = pipe.to("cuda")
+            pipe.safety_checker = dummy
+            pipe.enable_attention_slicing()
+            pipe.enable_xformers_memory_efficient_attention()
+            self.g_cuda = torch.Generator(device="cuda")
+            self.pipe = pipe
+            self._loaded_model_id = model_id
+        return self.pipe
+    def generate_video(
+        self,
+        model_id,
+        prompt,
+        negative_prompt,
+        guidance_scale,
+        num_inference_steps,
+    ):
+        """Generate a zoom video for ``prompt`` and return the path of the mp4 file.
+        An initial image is generated, then repeatedly shrunk and outpainted;
+        interpolated crops between consecutive outpainted images form the frames.
+        :param model_id: Hub id of the inpainting model
+        :param prompt: text describing the scene
+        :param negative_prompt: text describing what to avoid
+        :param guidance_scale: guidance scale forwarded to the pipeline
+        :param num_inference_steps: denoising steps per generated image
+        :return: path of the written video file
+        """
+        pipe = self.load_model(model_id)
+        # UI sliders may deliver numeric values as floats; the pipeline needs an integer step count.
+        num_inference_steps = int(num_inference_steps)
+        num_init_images = 2
+        seed = 42
+        height = 512
+        width = height
+        # PIL sizes are (width, height).
+        current_image = Image.new(mode="RGBA", size=(width, height))
+        # A fully transparent start image yields an all-white mask: inpaint everything.
+        mask_image = np.array(current_image)[:, :, 3]
+        mask_image = Image.fromarray(255 - mask_image).convert("RGB")
+        current_image = current_image.convert("RGB")
+        init_images = pipe(
+            prompt=[prompt] * num_init_images,
+            negative_prompt=[negative_prompt] * num_init_images,
+            image=current_image,
+            guidance_scale=guidance_scale,
+            height=height,
+            width=width,
+            generator=self.g_cuda.manual_seed(seed),
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+        )[0]
+        # The first candidate is used as the starting frame, as before.
+        init_image_selected = 0
+        num_outpainting_steps = 20  # @param
+        mask_width = 128  # @param
+        num_interpol_frames = 30  # @param
+        current_image = init_images[init_image_selected]
+        all_frames = [current_image]
+        for i in range(num_outpainting_steps):
+            print(f"Generating image: {i + 1} / {num_outpainting_steps}")
+            prev_image_fix = current_image
+            prev_image = shrink_and_paste_on_blank(current_image, mask_width)
+            current_image = prev_image
+            # create mask (black image with white mask_width width edges)
+            mask_image = np.array(current_image)[:, :, 3]
+            mask_image = Image.fromarray(255 - mask_image).convert("RGB")
+            # inpainting step
+            current_image = current_image.convert("RGB")
+            images = pipe(
+                prompt=prompt,
+                negative_prompt=negative_prompt,
+                image=current_image,
+                guidance_scale=guidance_scale,
+                height=height,
+                width=width,
+                # this can make the whole thing deterministic but the output less exciting
+                # generator = g_cuda.manual_seed(seed),
+                mask_image=mask_image,
+                num_inference_steps=num_inference_steps,
+            )[0]
+            current_image = images[0]
+            # Restore the untouched centre on top of the inpainted result.
+            current_image.paste(prev_image, mask=prev_image)
+            # interpolation steps between 2 inpainted images (=sequential zoom and crop)
+            for j in range(num_interpol_frames - 1):
+                interpol_image = current_image
+                interpol_width = round(
+                    (1 - (1 - 2 * mask_width / height) ** (1 - (j + 1) / num_interpol_frames)) * height / 2
+                )
+                interpol_image = interpol_image.crop(
+                    (interpol_width, interpol_width, width - interpol_width, height - interpol_width)
+                )
+                # PIL sizes are (width, height).
+                interpol_image = interpol_image.resize((width, height))
+                # paste the higher resolution previous image in the middle to avoid drop in quality caused by zooming
+                interpol_width2 = round((1 - (height - 2 * mask_width) / (height - 2 * interpol_width)) / 2 * height)
+                prev_image_fix_crop = shrink_and_paste_on_blank(prev_image_fix, interpol_width2)
+                interpol_image.paste(prev_image_fix_crop, mask=prev_image_fix_crop)
+                all_frames.append(interpol_image)
+            all_frames.append(current_image)
+        video_file_name = "infinite_zoom_out"
+        fps = 30
+        save_path = video_file_name + ".mp4"
+        # write_video's ``reversed`` parameter defaults to True, so the frames are written in reverse order.
+        write_video(save_path, all_frames, fps)
+        return save_path
+    @staticmethod
+    def app():
+        """Render this tab's controls and wire the button to ``generate_video``.
+        Must be called inside an active ``gr.Blocks`` context.
+        """
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    text2image_in_model_path = gr.Dropdown(
+                        choices=stable_paint_model_list, value=stable_paint_model_list[0], label="Text-Image Model Id"
+                    )
+                    text2image_in_prompt = gr.Textbox(lines=2, value=stable_paint_prompt_list[0], label="Prompt")
+                    text2image_in_negative_prompt = gr.Textbox(
+                        lines=1, value=stable_paint_negative_prompt_list[0], label="Negative Prompt"
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            text2image_in_guidance_scale = gr.Slider(
+                                minimum=0.1, maximum=15, step=0.1, value=7.5, label="Guidance Scale"
+                            )
+                            text2image_in_num_inference_step = gr.Slider(
+                                minimum=1, maximum=100, step=1, value=50, label="Num Inference Step"
+                            )
+                    text2image_in_predict = gr.Button(value="Generator")
+                with gr.Column():
+                    output_image = gr.Video(label="Output")
+            # Inputs are passed positionally, so the order must match generate_video's signature.
+            text2image_in_predict.click(
+                fn=StableDiffusionZoomIn().generate_video,
+                inputs=[
+                    text2image_in_model_path,
+                    text2image_in_prompt,
+                    text2image_in_negative_prompt,
+                    text2image_in_guidance_scale,
+                    text2image_in_num_inference_step,
+                ],
+                outputs=output_image,
+            )

video_diffusion/inpaint_zoom/zoom_out_app.py ADDED Viewed

	@@ -0,0 +1,140 @@

+import os
+import gradio as gr
+import torch
+from diffusers import DiffusionPipeline, DPMSolverMultistepScheduler
+from PIL import Image
+from video_diffusion.inpaint_zoom.utils.zoom_out_utils import (
+    dummy,
+    preprocess_image,
+    preprocess_mask_image,
+    write_video,
+)
+# Runs at import time: sets CUDA_VISIBLE_DEVICES to "0" for the whole process.
+os.environ["CUDA_VISIBLE_DEVICES"] = "0"
+# Inpainting model ids offered in the model dropdown.
+stable_paint_model_list = ["stabilityai/stable-diffusion-2-inpainting", "runwayml/stable-diffusion-inpainting"]
+# Example prompts; the first entry is the default value of the prompt box.
+stable_paint_prompt_list = [
+    "children running in the forest , sunny, bright, by studio ghibli painting, superior quality, masterpiece,  traditional Japanese colors, by Grzegorz Rutkowski, concept art",
+    "A beautiful landscape of a mountain range with a lake in the foreground",
+]
+# Default negative prompt ("lurry" was a typo for "blurry").
+stable_paint_negative_prompt_list = [
+    "blurry, bad art, blurred, text, watermark",
+]
+class StableDiffusionZoomOut:
+    """Infinite zoom-out video tab built on a Stable Diffusion inpainting pipeline."""
+    def __init__(self):
+        self.pipe = None
+        # Model id the cached pipeline was loaded from.
+        self._loaded_model_id = None
+    def load_model(self, model_id):
+        """Return a CUDA inpainting pipeline for ``model_id``.
+        The pipeline is cached and rebuilt only when a different ``model_id``
+        is requested. Previously the first model was reused forever, so
+        switching the dropdown had no effect.
+        :param model_id: Hub model id to load
+        :return: the ready-to-call pipeline
+        """
+        if self.pipe is None or getattr(self, "_loaded_model_id", None) != model_id:
+            # Release the old pipeline before allocating the new one to keep peak GPU memory down.
+            self.pipe = None
+            torch.cuda.empty_cache()
+            pipe = DiffusionPipeline.from_pretrained(model_id, torch_dtype=torch.float16)
+            pipe.set_use_memory_efficient_attention_xformers(True)
+            pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+            pipe = pipe.to("cuda")
+            pipe.safety_checker = dummy
+            self.g_cuda = torch.Generator(device="cuda")
+            self.pipe = pipe
+            self._loaded_model_id = model_id
+        return self.pipe
+    def generate_video(
+        self,
+        model_id,
+        prompt,
+        negative_prompt,
+        guidance_scale,
+        num_inference_steps,
+        num_frames,
+        step_size,
+    ):
+        """Generate a zoom-out video for ``prompt`` and return the path of the mp4 file.
+        An initial 512x512 image is generated; each further frame shrinks the
+        previous one by ``step_size`` pixels per side and inpaints the border.
+        :param model_id: Hub id of the inpainting model
+        :param prompt: text describing the scene
+        :param negative_prompt: text describing what to avoid
+        :param guidance_scale: guidance scale forwarded to every pipeline call
+        :param num_inference_steps: denoising steps per generated image
+        :param num_frames: number of outpainted frames after the initial image
+        :param step_size: border width in pixels added per frame
+        :return: path of the written video file
+        """
+        pipe = self.load_model(model_id)
+        # UI sliders may deliver numeric values as floats; range() and slicing need ints.
+        num_inference_steps = int(num_inference_steps)
+        num_frames = int(num_frames)
+        step_size = int(step_size)
+        image_size = 512
+        # A fully transparent start image yields an all-white mask: inpaint everything.
+        new_image = Image.new(mode="RGBA", size=(image_size, image_size))
+        current_image, mask_image = preprocess_mask_image(new_image)
+        current_image = pipe(
+            prompt=[prompt],
+            negative_prompt=[negative_prompt],
+            image=current_image,
+            mask_image=mask_image,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+        ).images[0]
+        all_frames = [current_image]
+        for _ in range(num_frames):
+            prev_image = preprocess_image(current_image, step_size, image_size)
+            current_image, mask_image = preprocess_mask_image(prev_image)
+            current_image = pipe(
+                prompt=[prompt],
+                negative_prompt=[negative_prompt],
+                image=current_image,
+                mask_image=mask_image,
+                num_inference_steps=num_inference_steps,
+                # Previously omitted here, so the user's guidance scale only affected the first image.
+                guidance_scale=guidance_scale,
+            ).images[0]
+            # Restore the untouched centre on top of the inpainted result.
+            current_image.paste(prev_image, mask=prev_image)
+            all_frames.append(current_image)
+        save_path = "output.mp4"
+        write_video(save_path, all_frames, fps=30)
+        return save_path
+    @staticmethod
+    def app():
+        """Render this tab's controls and wire the button to ``generate_video``.
+        Must be called inside an active ``gr.Blocks`` context.
+        """
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    text2image_out_model_path = gr.Dropdown(
+                        choices=stable_paint_model_list, value=stable_paint_model_list[0], label="Text-Image Model Id"
+                    )
+                    text2image_out_prompt = gr.Textbox(lines=2, value=stable_paint_prompt_list[0], label="Prompt")
+                    text2image_out_negative_prompt = gr.Textbox(
+                        lines=1, value=stable_paint_negative_prompt_list[0], label="Negative Prompt"
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            text2image_out_guidance_scale = gr.Slider(
+                                minimum=0.1, maximum=15, step=0.1, value=7.5, label="Guidance Scale"
+                            )
+                            text2image_out_num_inference_step = gr.Slider(
+                                minimum=1, maximum=100, step=1, value=50, label="Num Inference Step"
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                text2image_out_step_size = gr.Slider(
+                                    minimum=1, maximum=100, step=1, value=10, label="Step Size"
+                                )
+                                text2image_out_num_frames = gr.Slider(
+                                    minimum=1, maximum=100, step=1, value=10, label="Frames"
+                                )
+                    text2image_out_predict = gr.Button(value="Generator")
+                with gr.Column():
+                    output_image = gr.Video(label="Output")
+            # Inputs are passed positionally, so the order must match generate_video's
+            # signature: num_frames comes before step_size (they were swapped before).
+            text2image_out_predict.click(
+                fn=StableDiffusionZoomOut().generate_video,
+                inputs=[
+                    text2image_out_model_path,
+                    text2image_out_prompt,
+                    text2image_out_negative_prompt,
+                    text2image_out_guidance_scale,
+                    text2image_out_num_inference_step,
+                    text2image_out_num_frames,
+                    text2image_out_step_size,
+                ],
+                outputs=output_image,
+            )

video_diffusion/stable_diffusion_video/__init__.py ADDED Viewed

File without changes

video_diffusion/stable_diffusion_video/image_generation.py ADDED Viewed

	@@ -0,0 +1,363 @@

+import json
+import math
+import random
+import time
+from pathlib import Path
+from uuid import uuid4
+import torch
+from diffusers import __version__ as diffusers_version
+from huggingface_hub import CommitOperationAdd, create_commit, create_repo
+from .upsampling import RealESRGANModel
+from .utils import pad_along_axis
+def get_all_files(root: Path):
+    """Yield every file found under ``root``, descending into sub-directories.
+    Traversal uses an explicit stack instead of recursion; the order in which
+    files are yielded follows ``Path.iterdir`` and is therefore unspecified.
+    """
+    pending = [root]
+    while pending:
+        current = pending.pop()
+        for entry in current.iterdir():
+            if entry.is_dir():
+                # Visit this directory later.
+                pending.append(entry)
+            elif entry.is_file():
+                yield entry
+def get_groups_of_n(n: int, iterator):
+    """Yield the items of ``iterator`` as consecutive lists of at most ``n`` items.
+    Every list has exactly ``n`` items except possibly the last one; nothing
+    is yielded for an empty input.
+    Args:
+        n (int): Maximum group size, must be at least 1.
+        iterator: Any iterable of items to group.
+    Raises:
+        ValueError: If ``n`` is smaller than 1 (raised when iteration starts,
+            because this is a generator).
+    """
+    # Explicit check instead of ``assert n > 1``: the assert rejected the valid
+    # size 1 and is stripped under ``python -O``.
+    if n < 1:
+        raise ValueError(f"Group size must be at least 1, got {n}.")
+    group = []
+    for elt in iterator:
+        group.append(elt)
+        if len(group) == n:
+            yield group
+            group = []
+    # Emit the final, partially filled group.
+    if group:
+        yield group
+def upload_folder_chunked(
+    repo_id: str,
+    upload_dir: Path,
+    n: int = 100,
+    private: bool = False,
+    create_pr: bool = False,
+):
+    """Upload a folder to the Hugging Face Hub in chunks of n files at a time.
+    Args:
+        repo_id (str): The repo id to upload to.
+        upload_dir (Path): The directory to upload.
+        n (int, *optional*, defaults to 100): The number of files to upload at a time.
+        private (bool, *optional*): Whether to upload the repo as private.
+        create_pr (bool, *optional*): Whether to create a PR after uploading instead of committing directly.
+    Raises:
+        ValueError: If ``upload_dir`` does not exist.
+    """
+    # Validate the local directory first so a bad path does not leave an
+    # empty dataset repo behind on the Hub.
+    root = Path(upload_dir)
+    if not root.exists():
+        raise ValueError(f"Upload directory {root} does not exist.")
+    url = create_repo(repo_id, exist_ok=True, private=private, repo_type="dataset")
+    print(f"Uploading files to: {url}")
+    # One commit per group of at most n files.
+    for i, file_paths in enumerate(get_groups_of_n(n, get_all_files(root))):
+        print(f"Committing {file_paths}")
+        operations = [
+            CommitOperationAdd(
+                # Only the immediate parent directory name is kept in the repo path.
+                path_in_repo=f"{file_path.parent.name}/{file_path.name}",
+                path_or_fileobj=str(file_path),
+            )
+            for file_path in file_paths
+        ]
+        create_commit(
+            repo_id=repo_id,
+            operations=operations,
+            commit_message=f"Upload part {i}",
+            repo_type="dataset",
+            create_pr=create_pr,
+        )
+def generate_input_batches(pipeline, prompts, seeds, batch_size, height, width):
+    """Yield batched text embeddings and seeded noise for the given prompts.
+    Args:
+        pipeline: Object providing ``embed_text``, ``unet.in_channels`` and ``device``.
+        prompts: Prompts to embed, one per seed.
+        seeds: Seeds for the noise generator, one per prompt.
+        batch_size: Number of prompts per yielded batch; the last batch may be smaller.
+        height: Image height in pixels; the noise uses ``height // 8``.
+        width: Image width in pixels; the noise uses ``width // 8``.
+    Yields:
+        Tuples ``(batch_idx, embeds_batch, noise_batch)``.
+    Raises:
+        ValueError: If ``prompts`` and ``seeds`` differ in length.
+    """
+    if len(prompts) != len(seeds):
+        raise ValueError("Number of prompts and seeds must be equal.")
+    embeds_batch, noise_batch = None, None
+    batch_idx = 0
+    for i, (prompt, seed) in enumerate(zip(prompts, seeds)):
+        embeds = pipeline.embed_text(prompt)
+        # One noise sample per prompt, drawn from a generator seeded with this prompt's seed.
+        # The generator lives on the CPU when the pipeline device type is "mps".
+        noise = torch.randn(
+            (1, pipeline.unet.in_channels, height // 8, width // 8),
+            device=pipeline.device,
+            generator=torch.Generator(device="cpu" if pipeline.device.type == "mps" else pipeline.device).manual_seed(
+                seed
+            ),
+        )
+        # Append to the running batch along the first dimension.
+        embeds_batch = embeds if embeds_batch is None else torch.cat([embeds_batch, embeds])
+        noise_batch = noise if noise_batch is None else torch.cat([noise_batch, noise])
+        # Emit once the batch is full or the prompts are exhausted.
+        batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == len(prompts)
+        if not batch_is_ready:
+            continue
+        # NOTE(review): the cast targets torch.cuda.HalfTensor regardless of pipeline.device — confirm this is intended.
+        yield batch_idx, embeds_batch.type(torch.cuda.HalfTensor), noise_batch.type(torch.cuda.HalfTensor)
+        batch_idx += 1
+        # Drop the references and clear the CUDA cache before building the next batch.
+        del embeds_batch, noise_batch
+        torch.cuda.empty_cache()
+        embeds_batch, noise_batch = None, None
+def generate_images(
+    pipeline,
+    prompt,
+    batch_size=1,
+    num_batches=1,
+    seeds=None,
+    num_inference_steps=50,
+    guidance_scale=7.5,
+    output_dir="./images",
+    image_file_ext=".jpg",
+    upsample=False,
+    height=512,
+    width=512,
+    eta=0.0,
+    push_to_hub=False,
+    repo_id=None,
+    private=False,
+    create_pr=False,
+    name=None,
+):
+    """Generate images using the StableDiffusion pipeline.
+    Args:
+        pipeline (StableDiffusionWalkPipeline): The StableDiffusion pipeline instance.
+        prompt (str): The prompt to use for the image generation.
+        batch_size (int, *optional*, defaults to 1): The batch size to use for image generation.
+        num_batches (int, *optional*, defaults to 1): The number of batches to generate.
+        seeds (list[int], *optional*): The seeds to use for the image generation.
+        num_inference_steps (int, *optional*, defaults to 50): The number of inference steps to take.
+        guidance_scale (float, *optional*, defaults to 7.5): The guidance scale to use for image generation.
+        output_dir (str, *optional*, defaults to "./images"): The output directory to save the images to.
+        image_file_ext (str, *optional*, defaults to '.jpg'): The image file extension to use.
+        upsample (bool, *optional*, defaults to False): Whether to upsample the images.
+        height (int, *optional*, defaults to 512): The height of the images to generate.
+        width (int, *optional*, defaults to 512): The width of the images to generate.
+        eta (float, *optional*, defaults to 0.0): The eta parameter to use for image generation.
+        push_to_hub (bool, *optional*, defaults to False): Whether to push the generated images to the Hugging Face Hub.
+        repo_id (str, *optional*): The repo id to push the images to.
+        private (bool, *optional*): Whether to push the repo as private.
+        create_pr (bool, *optional*): Whether to create a PR after pushing instead of commiting directly.
+        name (str, *optional*, defaults to current timestamp str): The name of the sub-directory of
+            output_dir to save the images to.
+    """
+    if push_to_hub:
+        if repo_id is None:
+            raise ValueError("Must provide repo_id if push_to_hub is True.")
+    name = name or time.strftime("%Y%m%d-%H%M%S")
+    save_path = Path(output_dir) / name
+    save_path.mkdir(exist_ok=False, parents=True)
+    prompt_config_path = save_path / "prompt_config.json"
+    num_images = batch_size * num_batches
+    seeds = seeds or [random.choice(list(range(0, 9999999))) for _ in range(num_images)]
+    if len(seeds) != num_images:
+        raise ValueError("Number of seeds must be equal to batch_size * num_batches.")
+    if upsample:
+        if getattr(pipeline, "upsampler", None) is None:
+            pipeline.upsampler = RealESRGANModel.from_pretrained("nateraw/real-esrgan")
+        pipeline.upsampler.to(pipeline.device)
+    cfg = dict(
+        prompt=prompt,
+        guidance_scale=guidance_scale,
+        eta=eta,
+        num_inference_steps=num_inference_steps,
+        upsample=upsample,
+        height=height,
+        width=width,
+        scheduler=dict(pipeline.scheduler.config),
+        tiled=pipeline.tiled,
+        diffusers_version=diffusers_version,
+        device_name=torch.cuda.get_device_name(0) if torch.cuda.is_available() else "unknown",
+    )
+    prompt_config_path.write_text(json.dumps(cfg, indent=2, sort_keys=False))
+    frame_index = 0
+    frame_filepaths = []
+    for batch_idx, embeds, noise in generate_input_batches(
+        pipeline, [prompt] * num_images, seeds, batch_size, height, width
+    ):
+        print(f"Generating batch {batch_idx}")
+        outputs = pipeline(
+            text_embeddings=embeds,
+            latents=noise,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+            eta=eta,
+            height=height,
+            width=width,
+            output_type="pil" if not upsample else "numpy",
+        )["images"]
+        if upsample:
+            images = []
+            for output in outputs:
+                images.append(pipeline.upsampler(output))
+        else:
+            images = outputs
+        for image in images:
+            frame_filepath = save_path / f"{seeds[frame_index]}{image_file_ext}"
+            image.save(frame_filepath)
+            frame_filepaths.append(str(frame_filepath))
+            frame_index += 1
+    return frame_filepaths
+    if push_to_hub:
+        upload_folder_chunked(repo_id, save_path, private=private, create_pr=create_pr)
+def generate_images_flax(
+    pipeline,
+    params,
+    prompt,
+    batch_size=1,
+    num_batches=1,
+    seeds=None,
+    num_inference_steps=50,
+    guidance_scale=7.5,
+    output_dir="./images",
+    image_file_ext=".jpg",
+    upsample=False,
+    height=512,
+    width=512,
+    push_to_hub=False,
+    repo_id=None,
+    private=False,
+    create_pr=False,
+    name=None,
+):
+    import jax
+    from flax.training.common_utils import shard
+    """Generate images using the StableDiffusion pipeline.
+    Args:
+        pipeline (StableDiffusionWalkPipeline): The StableDiffusion pipeline instance.
+        params (`Union[Dict, FrozenDict]`): The model parameters.
+        prompt (str): The prompt to use for the image generation.
+        batch_size (int, *optional*, defaults to 1): The batch size to use for image generation.
+        num_batches (int, *optional*, defaults to 1): The number of batches to generate.
+        seeds (int, *optional*): The seed to use for the image generation.
+        num_inference_steps (int, *optional*, defaults to 50): The number of inference steps to take.
+        guidance_scale (float, *optional*, defaults to 7.5): The guidance scale to use for image generation.
+        output_dir (str, *optional*, defaults to "./images"): The output directory to save the images to.
+        image_file_ext (str, *optional*, defaults to '.jpg'): The image file extension to use.
+        upsample (bool, *optional*, defaults to False): Whether to upsample the images.
+        height (int, *optional*, defaults to 512): The height of the images to generate.
+        width (int, *optional*, defaults to 512): The width of the images to generate.
+        push_to_hub (bool, *optional*, defaults to False): Whether to push the generated images to the Hugging Face Hub.
+        repo_id (str, *optional*): The repo id to push the images to.
+        private (bool, *optional*): Whether to push the repo as private.
+        create_pr (bool, *optional*): Whether to create a PR after pushing instead of commiting directly.
+        name (str, *optional*, defaults to current timestamp str): The name of the sub-directory of
+            output_dir to save the images to.
+    """
+    if push_to_hub:
+        if repo_id is None:
+            raise ValueError("Must provide repo_id if push_to_hub is True.")
+    name = name or time.strftime("%Y%m%d-%H%M%S")
+    save_path = Path(output_dir) / name
+    save_path.mkdir(exist_ok=False, parents=True)
+    prompt_config_path = save_path / "prompt_config.json"
+    num_images = batch_size * num_batches
+    seeds = seeds or random.choice(list(range(0, 9999999)))
+    prng_seed = jax.random.PRNGKey(seeds)
+    if upsample:
+        if getattr(pipeline, "upsampler", None) is None:
+            pipeline.upsampler = RealESRGANModel.from_pretrained("nateraw/real-esrgan")
+            if not torch.cuda.is_available():
+                print("Upsampling is recommended to be done on a GPU, as it is very slow on CPU")
+            else:
+                pipeline.upsampler = pipeline.upsampler.cuda()
+    cfg = dict(
+        prompt=prompt,
+        guidance_scale=guidance_scale,
+        num_inference_steps=num_inference_steps,
+        upsample=upsample,
+        height=height,
+        width=width,
+        scheduler=dict(pipeline.scheduler.config),
+        # tiled=pipeline.tiled,
+        diffusers_version=diffusers_version,
+        device_name=torch.cuda.get_device_name(0) if torch.cuda.is_available() else "unknown",
+    )
+    prompt_config_path.write_text(json.dumps(cfg, indent=2, sort_keys=False))
+    NUM_TPU_CORES = jax.device_count()
+    jit = True  # force jit, assume params are already sharded
+    batch_size_total = NUM_TPU_CORES * batch_size if jit else batch_size
+    def generate_input_batches(prompts, batch_size):
+        prompt_batch = None
+        for batch_idx in range(math.ceil(len(prompts) / batch_size)):
+            prompt_batch = prompts[batch_idx * batch_size : (batch_idx + 1) * batch_size]
+            yield batch_idx, prompt_batch
+    frame_index = 0
+    frame_filepaths = []
+    for batch_idx, prompt_batch in generate_input_batches([prompt] * num_images, batch_size_total):
+        # This batch size correspond to each TPU core, so we are generating batch_size * NUM_TPU_CORES images
+        print(f"Generating batches: {batch_idx*NUM_TPU_CORES} - {min((batch_idx+1)*NUM_TPU_CORES, num_batches)}")
+        prompt_ids_batch = pipeline.prepare_inputs(prompt_batch)
+        prng_seed_batch = prng_seed
+        if jit:
+            padded = False
+            # Check if len of prompt_batch is multiple of NUM_TPU_CORES, if not pad its ids
+            if len(prompt_batch) % NUM_TPU_CORES != 0:
+                padded = True
+                pad_size = NUM_TPU_CORES - (len(prompt_batch) % NUM_TPU_CORES)
+                # Pad embeds_batch and noise_batch with zeros in batch dimension
+                prompt_ids_batch = pad_along_axis(prompt_ids_batch, pad_size, axis=0)
+            prompt_ids_batch = shard(prompt_ids_batch)
+            prng_seed_batch = jax.random.split(prng_seed, jax.device_count())
+        outputs = pipeline(
+            params,
+            prng_seed=prng_seed_batch,
+            prompt_ids=prompt_ids_batch,
+            height=height,
+            width=width,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_steps,
+            output_type="pil" if not upsample else "numpy",
+            jit=jit,
+        )["images"]
+        if jit:
+            # check if we padded and remove that padding from outputs
+            if padded:
+                outputs = outputs[:-pad_size]
+        if upsample:
+            images = []
+            for output in outputs:
+                images.append(pipeline.upsampler(output))
+        else:
+            images = outputs
+        for image in images:
+            uuid = str(uuid4())
+            frame_filepath = save_path / f"{uuid}{image_file_ext}"
+            image.save(frame_filepath)
+            frame_filepaths.append(str(frame_filepath))
+            frame_index += 1
+    return frame_filepaths
+    if push_to_hub:
+        upload_folder_chunked(repo_id, save_path, private=private, create_pr=create_pr)

video_diffusion/stable_diffusion_video/stable_diffusion_pipeline.py ADDED Viewed

	@@ -0,0 +1,848 @@

+import inspect
+import json
+import math
+import time
+from pathlib import Path
+from typing import Callable, List, Optional, Tuple, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL, UNet2DConditionModel
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.pipelines.stable_diffusion import StableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import deprecate, logging
+from packaging import version
+from torch import nn
+from transformers import CLIPFeatureExtractor, CLIPTextModel, CLIPTokenizer
+from .upsampling import RealESRGANModel
+from .utils import get_timesteps_arr, make_video_pyav, slerp
+logging.set_verbosity_info()  # `logging` is the helper imported from diffusers.utils; this runs at import time
+logger = logging.get_logger(__name__)  # module-level logger, used for the warning emitted in the pipeline's __init__
+class StableDiffusionWalkPipeline(DiffusionPipeline):
+    r"""
+    Pipeline for generating videos by interpolating  Stable Diffusion's latent space.
+    This model inherits from [`DiffusionPipeline`]. Check the superclass documentation for the generic methods the
+    library implements for all the pipelines (such as downloading or saving, running on a particular device, etc.)
+    Args:
+        vae ([`AutoencoderKL`]):
+            Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
+        text_encoder ([`CLIPTextModel`]):
+            Frozen text-encoder. Stable Diffusion uses the text portion of
+            [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
+            the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
+        tokenizer (`CLIPTokenizer`):
+            Tokenizer of class
+            [CLIPTokenizer](https://huggingface.co/docs/transformers/v4.21.0/en/model_doc/clip#transformers.CLIPTokenizer).
+        unet ([`UNet2DConditionModel`]): Conditional U-Net architecture to denoise the encoded image latents.
+        scheduler ([`SchedulerMixin`]):
+            A scheduler to be used in combination with `unet` to denoise the encoded image latens. Can be one of
+            [`DDIMScheduler`], [`LMSDiscreteScheduler`], or [`PNDMScheduler`].
+        safety_checker ([`StableDiffusionSafetyChecker`]):
+            Classification module that estimates whether generated images could be considered offensive or harmful.
+            Please, refer to the [model card](https://huggingface.co/CompVis/stable-diffusion-v1-4) for details.
+        feature_extractor ([`CLIPFeatureExtractor`]):
+            Model that extracts features from generated images to be used as inputs for the `safety_checker`.
+    """
+    _optional_components = ["safety_checker", "feature_extractor"]
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet2DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+        safety_checker: StableDiffusionSafetyChecker,
+        feature_extractor: CLIPFeatureExtractor,
+        requires_safety_checker: bool = True,
+    ):
+        super().__init__()
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+        if safety_checker is None and requires_safety_checker:
+            logger.warning(
+                f"You have disabled the safety checker for {self.__class__} by passing `safety_checker=None`. Ensure"
+                " that you abide to the conditions of the Stable Diffusion license and do not expose unfiltered"
+                " results in services or applications open to the public. Both the diffusers team and Hugging Face"
+                " strongly recommend to keep the safety filter enabled in all public facing circumstances, disabling"
+                " it only for use-cases that involve analyzing network behavior or auditing its results. For more"
+                " information, please have a look at https://github.com/huggingface/diffusers/pull/254 ."
+            )
+        if safety_checker is not None and feature_extractor is None:
+            raise ValueError(
+                "Make sure to define a feature extractor when loading {self.__class__} if you want to use the safety"
+                " checker. If you do not want to use the safety checker, you can pass `'safety_checker=None'` instead."
+            )
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely .If you're checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+            safety_checker=safety_checker,
+            feature_extractor=feature_extractor,
+        )
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+        self.register_to_config(requires_safety_checker=requires_safety_checker)
+    def enable_attention_slicing(self, slice_size: Optional[Union[str, int]] = "auto"):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                a number is provided, uses as many slices as `attention_head_dim // slice_size`. In this case,
+                `attention_head_dim` must be a multiple of `slice_size`.
+        """
+        if slice_size == "auto":
+            if isinstance(self.unet.config.attention_head_dim, int):
+                # half the attention head size is usually a good trade-off between
+                # speed and memory
+                slice_size = self.unet.config.attention_head_dim // 2
+            else:
+                # if `attention_head_dim` is a list, take the smallest head size
+                slice_size = min(self.unet.config.attention_head_dim)
+        self.unet.set_attention_slice(slice_size)
+    def disable_attention_slicing(self):
+        r"""
+        Disable sliced attention computation. If `enable_attention_slicing` was previously invoked, this method will go
+        back to computing attention in one step.
+        """
+        # set slice_size = `None` to disable `attention slicing`
+        self.enable_attention_slicing(None)
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Optional[Union[str, List[str]]] = None,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_images_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[torch.Generator] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "pil",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        text_embeddings: Optional[torch.FloatTensor] = None,
+        **kwargs,
+    ):
+        r"""
+        Function invoked when calling the pipeline for generation.
+        Args:
+            prompt (`str` or `List[str]`, *optional*, defaults to `None`):
+                The prompt or prompts to guide the image generation. If not provided, `text_embeddings` is required.
+            height (`int`, *optional*, defaults to 512):
+                The height in pixels of the generated image.
+            width (`int`, *optional*, defaults to 512):
+                The width in pixels of the generated image.
+            num_inference_steps (`int`, *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (`float`, *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            negative_prompt (`str` or `List[str]`, *optional*):
+                The prompt or prompts not to guide the image generation. Ignored when not using guidance (i.e., ignored
+                if `guidance_scale` is less than `1`).
+            num_images_per_prompt (`int`, *optional*, defaults to 1):
+                The number of images to generate per prompt.
+            eta (`float`, *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            generator (`torch.Generator`, *optional*):
+                A [torch generator](https://pytorch.org/docs/stable/generated/torch.Generator.html) to make generation
+                deterministic.
+            latents (`torch.FloatTensor`, *optional*):
+                Pre-generated noisy latents, sampled from a Gaussian distribution, to be used as inputs for image
+                generation. Can be used to tweak the same generation with different prompts. If not provided, a latents
+                tensor will ge generated by sampling using the supplied random `generator`.
+            output_type (`str`, *optional*, defaults to `"pil"`):
+                The output format of the generate image. Choose between
+                [PIL](https://pillow.readthedocs.io/en/stable/): `PIL.Image.Image` or `np.array`.
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] instead of a
+                plain tuple.
+            callback (`Callable`, *optional*):
+                A function that will be called every `callback_steps` steps during inference. The function will be
+                called with the following arguments: `callback(step: int, timestep: int, latents: torch.FloatTensor)`.
+            callback_steps (`int`, *optional*, defaults to 1):
+                The frequency at which the `callback` function will be called. If not specified, the callback will be
+                called at every step.
+            text_embeddings (`torch.FloatTensor`, *optional*, defaults to `None`):
+                Pre-generated text embeddings to be used as inputs for image generation. Can be used in place of
+                `prompt` to avoid re-computing the embeddings. If not provided, the embeddings will be generated from
+                the supplied `prompt`.
+        Returns:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] or `tuple`:
+            [`~pipelines.stable_diffusion.StableDiffusionPipelineOutput`] if `return_dict` is True, otherwise a `tuple.
+            When returning a tuple, the first element is a list with the generated images, and the second element is a
+            list of `bool`s denoting whether the corresponding generated image likely represents "not-safe-for-work"
+            (nsfw) content, according to the `safety_checker`.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+        if text_embeddings is None:
+            if isinstance(prompt, str):
+                batch_size = 1
+            elif isinstance(prompt, list):
+                batch_size = len(prompt)
+            else:
+                raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+            # get prompt text embeddings
+            text_inputs = self.tokenizer(
+                prompt,
+                padding="max_length",
+                max_length=self.tokenizer.model_max_length,
+                return_tensors="pt",
+            )
+            text_input_ids = text_inputs.input_ids
+            if text_input_ids.shape[-1] > self.tokenizer.model_max_length:
+                removed_text = self.tokenizer.batch_decode(text_input_ids[:, self.tokenizer.model_max_length :])
+                print(
+                    "The following part of your input was truncated because CLIP can only handle sequences up to"
+                    f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+                )
+                text_input_ids = text_input_ids[:, : self.tokenizer.model_max_length]
+            text_embeddings = self.text_encoder(text_input_ids.to(self.device))[0]
+        else:
+            batch_size = text_embeddings.shape[0]
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_images_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_images_per_prompt, seq_len, -1)
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""]
+            elif text_embeddings is None and type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            max_length = self.tokenizer.model_max_length
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            uncond_embeddings = self.text_encoder(uncond_input.input_ids.to(self.device))[0]
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(batch_size, num_images_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_images_per_prompt, seq_len, -1)
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        # get the initial random noise unless the user supplied it
+        # Unlike in other pipelines, latents need to be generated in the target device
+        # for 1-to-1 results reproducibility with the CompVis implementation.
+        # However this currently doesn't work in `mps`.
+        latents_shape = (
+            batch_size * num_images_per_prompt,
+            self.unet.in_channels,
+            height // 8,
+            width // 8,
+        )
+        latents_dtype = text_embeddings.dtype
+        if latents is None:
+            if self.device.type == "mps":
+                # randn does not exist on mps
+                latents = torch.randn(
+                    latents_shape,
+                    generator=generator,
+                    device="cpu",
+                    dtype=latents_dtype,
+                ).to(self.device)
+            else:
+                latents = torch.randn(
+                    latents_shape,
+                    generator=generator,
+                    device=self.device,
+                    dtype=latents_dtype,
+                )
+        else:
+            if latents.shape != latents_shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {latents_shape}")
+            latents = latents.to(self.device)
+        # set timesteps
+        self.scheduler.set_timesteps(num_inference_steps)
+        # Some schedulers like PNDM have timesteps as arrays
+        # It's more optimized to move all timesteps to correct device beforehand
+        timesteps_tensor = self.scheduler.timesteps.to(self.device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        for i, t in enumerate(self.progress_bar(timesteps_tensor)):
+            # expand the latents if we are doing classifier free guidance
+            latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+            latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+            # predict the noise residual
+            noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample
+            # perform guidance
+            if do_classifier_free_guidance:
+                noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+            # compute the previous noisy sample x_t -> x_t-1
+            latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+            # call the callback, if provided
+            if callback is not None and i % callback_steps == 0:
+                callback(i, t, latents)
+        latents = 1 / 0.18215 * latents
+        image = self.vae.decode(latents).sample
+        image = (image / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloa16
+        image = image.cpu().permute(0, 2, 3, 1).float().numpy()
+        if self.safety_checker is not None:
+            safety_checker_input = self.feature_extractor(self.numpy_to_pil(image), return_tensors="pt").to(self.device)
+            image, has_nsfw_concept = self.safety_checker(
+                images=image,
+                clip_input=safety_checker_input.pixel_values.to(text_embeddings.dtype),
+            )
+        else:
+            has_nsfw_concept = None
+        if output_type == "pil":
+            image = self.numpy_to_pil(image)
+        if not return_dict:
+            return (image, has_nsfw_concept)
+        return StableDiffusionPipelineOutput(images=image, nsfw_content_detected=has_nsfw_concept)
+    def generate_inputs(self, prompt_a, prompt_b, seed_a, seed_b, noise_shape, T, batch_size):
+        embeds_a = self.embed_text(prompt_a)
+        embeds_b = self.embed_text(prompt_b)
+        latents_dtype = embeds_a.dtype
+        latents_a = self.init_noise(seed_a, noise_shape, latents_dtype)
+        latents_b = self.init_noise(seed_b, noise_shape, latents_dtype)
+        batch_idx = 0
+        embeds_batch, noise_batch = None, None
+        for i, t in enumerate(T):
+            embeds = torch.lerp(embeds_a, embeds_b, t)
+            noise = slerp(float(t), latents_a, latents_b)
+            embeds_batch = embeds if embeds_batch is None else torch.cat([embeds_batch, embeds])
+            noise_batch = noise if noise_batch is None else torch.cat([noise_batch, noise])
+            batch_is_ready = embeds_batch.shape[0] == batch_size or i + 1 == T.shape[0]
+            if not batch_is_ready:
+                continue
+            yield batch_idx, embeds_batch, noise_batch
+            batch_idx += 1
+            del embeds_batch, noise_batch
+            torch.cuda.empty_cache()
+            embeds_batch, noise_batch = None, None
+    def make_clip_frames(
+        self,
+        prompt_a: str,
+        prompt_b: str,
+        seed_a: int,
+        seed_b: int,
+        num_interpolation_steps: int = 5,
+        save_path: Union[str, Path] = "outputs/",
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        eta: float = 0.0,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        upsample: bool = False,
+        batch_size: int = 1,
+        image_file_ext: str = ".png",
+        T: np.ndarray = None,
+        skip: int = 0,
+        negative_prompt: str = None,
+        step: Optional[Tuple[int, int]] = None,
+    ):
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        save_path = Path(save_path)
+        save_path.mkdir(parents=True, exist_ok=True)
+        T = T if T is not None else np.linspace(0.0, 1.0, num_interpolation_steps)
+        if T.shape[0] != num_interpolation_steps:
+            raise ValueError(f"Unexpected T shape, got {T.shape}, expected dim 0 to be {num_interpolation_steps}")
+        if upsample:
+            if getattr(self, "upsampler", None) is None:
+                self.upsampler = RealESRGANModel.from_pretrained("nateraw/real-esrgan")
+            self.upsampler.to(self.device)
+        batch_generator = self.generate_inputs(
+            prompt_a,
+            prompt_b,
+            seed_a,
+            seed_b,
+            (1, self.unet.in_channels, height // 8, width // 8),
+            T[skip:],
+            batch_size,
+        )
+        num_batches = math.ceil(num_interpolation_steps / batch_size)
+        log_prefix = "" if step is None else f"[{step[0]}/{step[1]}] "
+        frame_index = skip
+        for batch_idx, embeds_batch, noise_batch in batch_generator:
+            if batch_size == 1:
+                msg = f"Generating frame {frame_index}"
+            else:
+                msg = f"Generating frames {frame_index}-{frame_index+embeds_batch.shape[0]-1}"
+            logger.info(f"{log_prefix}[{batch_idx}/{num_batches}] {msg}")
+            outputs = self(
+                latents=noise_batch,
+                text_embeddings=embeds_batch,
+                height=height,
+                width=width,
+                guidance_scale=guidance_scale,
+                eta=eta,
+                num_inference_steps=num_inference_steps,
+                output_type="pil" if not upsample else "numpy",
+                negative_prompt=negative_prompt,
+            )["images"]
+            for image in outputs:
+                frame_filepath = save_path / (f"frame%06d{image_file_ext}" % frame_index)
+                image = image if not upsample else self.upsampler(image)
+                image.save(frame_filepath)
+                frame_index += 1
+    def walk(
+        self,
+        prompts: Optional[List[str]] = None,
+        seeds: Optional[List[int]] = None,
+        num_interpolation_steps: Optional[Union[int, List[int]]] = 5,  # int or list of int
+        output_dir: Optional[str] = "./dreams",
+        name: Optional[str] = None,
+        image_file_ext: Optional[str] = ".png",
+        fps: Optional[int] = 30,
+        num_inference_steps: Optional[int] = 50,
+        guidance_scale: Optional[float] = 7.5,
+        eta: Optional[float] = 0.0,
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        upsample: Optional[bool] = False,
+        batch_size: Optional[int] = 1,
+        resume: Optional[bool] = False,
+        audio_filepath: str = None,
+        audio_start_sec: Optional[Union[int, float]] = None,
+        margin: Optional[float] = 1.0,
+        smooth: Optional[float] = 0.0,
+        negative_prompt: Optional[str] = None,
+        make_video: Optional[bool] = True,
+    ):
+        """Generate a video from a sequence of prompts and seeds. Optionally, add audio to the
+        video to interpolate to the intensity of the audio.
+        Args:
+            prompts (Optional[List[str]], optional):
+                list of text prompts. Defaults to None.
+            seeds (Optional[List[int]], optional):
+                list of random seeds corresponding to prompts. Defaults to None.
+            num_interpolation_steps (Union[int, List[int]], *optional*):
+                How many interpolation steps between each prompt. Defaults to None.
+            output_dir (Optional[str], optional):
+                Where to save the video. Defaults to './dreams'.
+            name (Optional[str], optional):
+                Name of the subdirectory of output_dir. Defaults to None.
+            image_file_ext (Optional[str], *optional*, defaults to '.png'):
+                The extension to use when writing video frames.
+            fps (Optional[int], *optional*, defaults to 30):
+                The frames per second in the resulting output videos.
+            num_inference_steps (Optional[int], *optional*, defaults to 50):
+                The number of denoising steps. More denoising steps usually lead to a higher quality image at the
+                expense of slower inference.
+            guidance_scale (Optional[float], *optional*, defaults to 7.5):
+                Guidance scale as defined in [Classifier-Free Diffusion Guidance](https://arxiv.org/abs/2207.12598).
+                `guidance_scale` is defined as `w` of equation 2. of [Imagen
+                Paper](https://arxiv.org/pdf/2205.11487.pdf). Guidance scale is enabled by setting `guidance_scale >
+                1`. Higher guidance scale encourages to generate images that are closely linked to the text `prompt`,
+                usually at the expense of lower image quality.
+            eta (Optional[float], *optional*, defaults to 0.0):
+                Corresponds to parameter eta (η) in the DDIM paper: https://arxiv.org/abs/2010.02502. Only applies to
+                [`schedulers.DDIMScheduler`], will be ignored for others.
+            height (Optional[int], *optional*, defaults to None):
+                height of the images to generate.
+            width (Optional[int], *optional*, defaults to None):
+                width of the images to generate.
+            upsample (Optional[bool], *optional*, defaults to False):
+                When True, upsamples images with realesrgan.
+            batch_size (Optional[int], *optional*, defaults to 1):
+                Number of images to generate at once.
+            resume (Optional[bool], *optional*, defaults to False):
+                When True, resumes from the last frame in the output directory based
+                on available prompt config. Requires you to provide the `name` argument.
+            audio_filepath (str, *optional*, defaults to None):
+                Optional path to an audio file to influence the interpolation rate.
+            audio_start_sec (Optional[Union[int, float]], *optional*, defaults to 0):
+                Global start time of the provided audio_filepath.
+            margin (Optional[float], *optional*, defaults to 1.0):
+                Margin from librosa hpss to use for audio interpolation.
+            smooth (Optional[float], *optional*, defaults to 0.0):
+                Smoothness of the audio interpolation. 1.0 means linear interpolation.
+            negative_prompt (Optional[str], *optional*, defaults to None):
+                Optional negative prompt to use. Same across all prompts.
+            make_video (Optional[bool], *optional*, defaults to True):
+                When True, makes a video from the generated frames. If False, only
+                generates the frames.
+        This function will create sub directories for each prompt and seed pair.
+        For example, if you provide the following prompts and seeds:
+        ```
+        prompts = ['a dog', 'a cat', 'a bird']
+        seeds = [1, 2, 3]
+        num_interpolation_steps = 5
+        output_dir = 'output_dir'
+        name = 'name'
+        fps = 5
+        ```
+        Then the following directories will be created:
+        ```
+        output_dir
+        ├── name
+        │   ├── name_000000
+        │   │   ├── frame000000.png
+        │   │   ├── ...
+        │   │   ├── frame000004.png
+        │   │   ├── name_000000.mp4
+        │   ├── name_000001
+        │   │   ├── frame000000.png
+        │   │   ├── ...
+        │   │   ├── frame000004.png
+        │   │   ├── name_000001.mp4
+        │   ├── ...
+        │   ├── name.mp4
+        |   |── prompt_config.json
+        ```
+        Returns:
+            str: The resulting video filepath. This video includes all sub directories' video clips.
+        """
+        # 0. Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        output_path = Path(output_dir)
+        name = name or time.strftime("%Y%m%d-%H%M%S")
+        save_path_root = output_path / name
+        save_path_root.mkdir(parents=True, exist_ok=True)
+        # Where the final video of all the clips combined will be saved
+        output_filepath = save_path_root / f"{name}.mp4"
+        # If using same number of interpolation steps between, we turn into list
+        if not resume and isinstance(num_interpolation_steps, int):
+            num_interpolation_steps = [num_interpolation_steps] * (len(prompts) - 1)
+        if not resume:
+            audio_start_sec = audio_start_sec or 0
+        # Save/reload prompt config
+        prompt_config_path = save_path_root / "prompt_config.json"
+        if not resume:
+            prompt_config_path.write_text(
+                json.dumps(
+                    dict(
+                        prompts=prompts,
+                        seeds=seeds,
+                        num_interpolation_steps=num_interpolation_steps,
+                        fps=fps,
+                        num_inference_steps=num_inference_steps,
+                        guidance_scale=guidance_scale,
+                        eta=eta,
+                        upsample=upsample,
+                        height=height,
+                        width=width,
+                        audio_filepath=audio_filepath,
+                        audio_start_sec=audio_start_sec,
+                        negative_prompt=negative_prompt,
+                    ),
+                    indent=2,
+                    sort_keys=False,
+                )
+            )
+        else:
+            data = json.load(open(prompt_config_path))
+            prompts = data["prompts"]
+            seeds = data["seeds"]
+            num_interpolation_steps = data["num_interpolation_steps"]
+            fps = data["fps"]
+            num_inference_steps = data["num_inference_steps"]
+            guidance_scale = data["guidance_scale"]
+            eta = data["eta"]
+            upsample = data["upsample"]
+            height = data["height"]
+            width = data["width"]
+            audio_filepath = data["audio_filepath"]
+            audio_start_sec = data["audio_start_sec"]
+            negative_prompt = data.get("negative_prompt", None)
+        for i, (prompt_a, prompt_b, seed_a, seed_b, num_step) in enumerate(
+            zip(prompts, prompts[1:], seeds, seeds[1:], num_interpolation_steps)
+        ):
+            # {name}_000000 / {name}_000001 / ...
+            save_path = save_path_root / f"{name}_{i:06d}"
+            # Where the individual clips will be saved
+            step_output_filepath = save_path / f"{name}_{i:06d}.mp4"
+            # Determine if we need to resume from a previous run
+            skip = 0
+            if resume:
+                if step_output_filepath.exists():
+                    print(f"Skipping {save_path} because frames already exist")
+                    continue
+                existing_frames = sorted(save_path.glob(f"*{image_file_ext}"))
+                if existing_frames:
+                    skip = int(existing_frames[-1].stem[-6:]) + 1
+                    if skip + 1 >= num_step:
+                        print(f"Skipping {save_path} because frames already exist")
+                        continue
+                    print(f"Resuming {save_path.name} from frame {skip}")
+            audio_offset = audio_start_sec + sum(num_interpolation_steps[:i]) / fps
+            audio_duration = num_step / fps
+            self.make_clip_frames(
+                prompt_a,
+                prompt_b,
+                seed_a,
+                seed_b,
+                num_interpolation_steps=num_step,
+                save_path=save_path,
+                num_inference_steps=num_inference_steps,
+                guidance_scale=guidance_scale,
+                eta=eta,
+                height=height,
+                width=width,
+                upsample=upsample,
+                batch_size=batch_size,
+                T=get_timesteps_arr(
+                    audio_filepath,
+                    offset=audio_offset,
+                    duration=audio_duration,
+                    fps=fps,
+                    margin=margin,
+                    smooth=smooth,
+                )
+                if audio_filepath
+                else None,
+                skip=skip,
+                negative_prompt=negative_prompt,
+                step=(i, len(prompts) - 1),
+            )
+            if make_video:
+                make_video_pyav(
+                    save_path,
+                    audio_filepath=audio_filepath,
+                    fps=fps,
+                    output_filepath=step_output_filepath,
+                    glob_pattern=f"*{image_file_ext}",
+                    audio_offset=audio_offset,
+                    audio_duration=audio_duration,
+                    sr=44100,
+                )
+        if make_video:
+            return make_video_pyav(
+                save_path_root,
+                audio_filepath=audio_filepath,
+                fps=fps,
+                audio_offset=audio_start_sec,
+                audio_duration=sum(num_interpolation_steps) / fps,
+                output_filepath=output_filepath,
+                glob_pattern=f"**/*{image_file_ext}",
+                sr=44100,
+            )
+    def embed_text(self, text, negative_prompt=None):
+        """Helper to embed some text"""
+        text_input = self.tokenizer(
+            text,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        with torch.no_grad():
+            embed = self.text_encoder(text_input.input_ids.to(self.device))[0]
+        return embed
+    def init_noise(self, seed, noise_shape, dtype):
+        """Helper to initialize noise"""
+        # randn does not exist on mps, so we create noise on CPU here and move it to the device after initialization
+        if self.device.type == "mps":
+            noise = torch.randn(
+                noise_shape,
+                device="cpu",
+                generator=torch.Generator(device="cpu").manual_seed(seed),
+            ).to(self.device)
+        else:
+            noise = torch.randn(
+                noise_shape,
+                device=self.device,
+                generator=torch.Generator(device=self.device).manual_seed(seed),
+                dtype=dtype,
+            )
+        return noise
+    @classmethod
+    def from_pretrained(cls, *args, tiled=False, **kwargs):
+        """Same as diffusers `from_pretrained` but with tiled option, which makes images tilable"""
+        if tiled:
+            def patch_conv(**patch):
+                cls = nn.Conv2d
+                init = cls.__init__
+                def __init__(self, *args, **kwargs):
+                    return init(self, *args, **kwargs, **patch)
+                cls.__init__ = __init__
+            patch_conv(padding_mode="circular")
+        pipeline = super().from_pretrained(*args, **kwargs)
+        pipeline.tiled = tiled
+        return pipeline

video_diffusion/stable_diffusion_video/stable_video_text2video.py ADDED Viewed

	@@ -0,0 +1,158 @@

+import gradio as gr
+import numpy as np
+import torch
+from video_diffusion.stable_diffusion_video.stable_diffusion_pipeline import StableDiffusionWalkPipeline
+from video_diffusion.utils.model_list import stable_model_list
+class StableDiffusionText2VideoGenerator:
+    def __init__(self):
+        self.pipe = None
+    def load_model(
+        self,
+        model_path,
+    ):
+        if self.pipe is None:
+            self.pipe = StableDiffusionWalkPipeline.from_pretrained(
+                model_path,
+                torch_dtype=torch.float16,
+                revision="fp16",
+            )
+            self.pipe.to("cuda")
+            self.pipe.enable_xformers_memory_efficient_attention()
+            self.pipe.enable_attention_slicing()
+        return self.pipe
+    def generate_video(
+        self,
+        model_path: str,
+        first_prompts: str,
+        second_prompts: str,
+        negative_prompt: str,
+        num_interpolation_steps: int,
+        guidance_scale: int,
+        num_inference_step: int,
+        height: int,
+        width: int,
+        upsample: bool,
+        fps=int,
+    ):
+        first_seed = np.random.randint(0, 100000)
+        second_seed = np.random.randint(0, 100000)
+        seeds = [first_seed, second_seed]
+        prompts = [first_prompts, second_prompts]
+        pipe = self.load_model(model_path=model_path)
+        output_video = pipe.walk(
+            prompts=prompts,
+            num_interpolation_steps=int(num_interpolation_steps),
+            height=height,
+            width=width,
+            guidance_scale=guidance_scale,
+            num_inference_steps=num_inference_step,
+            negative_prompt=negative_prompt,
+            seeds=seeds,
+            upsample=upsample,
+            fps=fps,
+        )
+        return output_video
+    def app():
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    stable_text2video_first_prompt = gr.Textbox(
+                        lines=1,
+                        placeholder="First Prompt",
+                        show_label=False,
+                    )
+                    stable_text2video_second_prompt = gr.Textbox(
+                        lines=1,
+                        placeholder="Second Prompt",
+                        show_label=False,
+                    )
+                    stable_text2video_negative_prompt = gr.Textbox(
+                        lines=1,
+                        placeholder="Negative Prompt ",
+                        show_label=False,
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            stable_text2video_model_path = gr.Dropdown(
+                                choices=stable_model_list,
+                                label="Stable Model List",
+                                value=stable_model_list[0],
+                            )
+                            stable_text2video_guidance_scale = gr.Slider(
+                                minimum=0,
+                                maximum=15,
+                                step=1,
+                                value=8.5,
+                                label="Guidance Scale",
+                            )
+                            stable_text2video_num_inference_steps = gr.Slider(
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=30,
+                                label="Number of Inference Steps",
+                            )
+                            stable_text2video_fps = gr.Slider(
+                                minimum=1,
+                                maximum=60,
+                                step=1,
+                                value=10,
+                                label="Fps",
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                stable_text2video_num_interpolation_steps = gr.Number(
+                                    value=10,
+                                    label="Number of Interpolation Steps",
+                                )
+                                stable_text2video_height = gr.Slider(
+                                    minimum=1,
+                                    maximum=1000,
+                                    step=1,
+                                    value=512,
+                                    label="Height",
+                                )
+                                stable_text2video_width = gr.Slider(
+                                    minimum=1,
+                                    maximum=1000,
+                                    step=1,
+                                    value=512,
+                                    label="Width",
+                                )
+                                stable_text2video_upsample = gr.Checkbox(
+                                    label="Upsample",
+                                    default=False,
+                                )
+                    text2video_generate = gr.Button(value="Generator")
+                with gr.Column():
+                    text2video_output = gr.Video(label="Output")
+            text2video_generate.click(
+                fn=StableDiffusionText2VideoGenerator().generate_video,
+                inputs=[
+                    stable_text2video_model_path,
+                    stable_text2video_first_prompt,
+                    stable_text2video_second_prompt,
+                    stable_text2video_negative_prompt,
+                    stable_text2video_num_interpolation_steps,
+                    stable_text2video_guidance_scale,
+                    stable_text2video_num_inference_steps,
+                    stable_text2video_height,
+                    stable_text2video_width,
+                    stable_text2video_upsample,
+                    stable_text2video_fps,
+                ],
+                outputs=text2video_output,
+            )

video_diffusion/stable_diffusion_video/upsampling.py ADDED Viewed

	@@ -0,0 +1,104 @@

+from pathlib import Path
+import cv2
+from diffusers.utils import logging
+from huggingface_hub import hf_hub_download
+from PIL import Image
+from torch import nn
+try:
+    from basicsr.archs.rrdbnet_arch import RRDBNet
+    from realesrgan import RealESRGANer
+except ImportError as e:
+    raise ImportError(
+        "You tried to import realesrgan without having it installed properly. To install Real-ESRGAN, run:\n\n"
+        "pip install realesrgan"
+    )
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+class RealESRGANModel(nn.Module):
+    def __init__(self, model_path, tile=0, tile_pad=10, pre_pad=0, fp32=False):
+        super().__init__()
+        try:
+            from basicsr.archs.rrdbnet_arch import RRDBNet
+            from realesrgan import RealESRGANer
+        except ImportError as e:
+            raise ImportError(
+                "You tried to import realesrgan without having it installed properly. To install Real-ESRGAN, run:\n\n"
+                "pip install realesrgan"
+            )
+        model = RRDBNet(num_in_ch=3, num_out_ch=3, num_feat=64, num_block=23, num_grow_ch=32, scale=4)
+        self.upsampler = RealESRGANer(
+            scale=4, model_path=model_path, model=model, tile=tile, tile_pad=tile_pad, pre_pad=pre_pad, half=not fp32
+        )
+    def forward(self, image, outscale=4, convert_to_pil=True):
+        """Upsample an image array or path.
+        Args:
+            image (Union[np.ndarray, str]): Either a np array or an image path. np array is assumed to be in RGB format,
+                and we convert it to BGR.
+            outscale (int, optional): Amount to upscale the image. Defaults to 4.
+            convert_to_pil (bool, optional): If True, return PIL image. Otherwise, return numpy array (BGR). Defaults to True.
+        Returns:
+            Union[np.ndarray, PIL.Image.Image]: An upsampled version of the input image.
+        """
+        if isinstance(image, (str, Path)):
+            img = cv2.imread(image, cv2.IMREAD_UNCHANGED)
+        else:
+            img = image
+            img = (img * 255).round().astype("uint8")
+            img = img[:, :, ::-1]
+        image, _ = self.upsampler.enhance(img, outscale=outscale)
+        if convert_to_pil:
+            image = Image.fromarray(image[:, :, ::-1])
+        return image
+    @classmethod
+    def from_pretrained(cls, model_name_or_path="nateraw/real-esrgan"):
+        """Initialize a pretrained Real-ESRGAN upsampler.
+        Example:
+            ```python
+            >>> from stable_diffusion_videos import PipelineRealESRGAN
+            >>> pipe = PipelineRealESRGAN.from_pretrained('nateraw/real-esrgan')
+            >>> im_out = pipe('input_img.jpg')
+            ```
+        Args:
+            model_name_or_path (str, optional): The Hugging Face repo ID or path to local model. Defaults to 'nateraw/real-esrgan'.
+        Returns:
+            stable_diffusion_videos.PipelineRealESRGAN: An instance of `PipelineRealESRGAN` instantiated from pretrained model.
+        """
+        # reuploaded form official ones mentioned here:
+        # https://github.com/xinntao/Real-ESRGAN
+        if Path(model_name_or_path).exists():
+            file = model_name_or_path
+        else:
+            file = hf_hub_download(model_name_or_path, "RealESRGAN_x4plus.pth")
+        return cls(file)
+    def upsample_imagefolder(self, in_dir, out_dir, suffix="out", outfile_ext=".png", recursive=False, force=False):
+        in_dir, out_dir = Path(in_dir), Path(out_dir)
+        if not in_dir.exists():
+            raise FileNotFoundError(f"Provided input directory {in_dir} does not exist")
+        out_dir.mkdir(exist_ok=True, parents=True)
+        generator = in_dir.rglob("*") if recursive else in_dir.glob("*")
+        image_paths = [x for x in generator if x.suffix.lower() in [".png", ".jpg", ".jpeg"]]
+        n_img = len(image_paths)
+        for i, image in enumerate(image_paths):
+            out_filepath = out_dir / (str(image.relative_to(in_dir).with_suffix("")) + suffix + outfile_ext)
+            if not force and out_filepath.exists():
+                logger.info(
+                    f"[{i}/{n_img}] {out_filepath} already exists, skipping. To avoid skipping, pass force=True."
+                )
+                continue
+            logger.info(f"[{i}/{n_img}] upscaling {image}")
+            im = self(str(image))
+            out_filepath.parent.mkdir(parents=True, exist_ok=True)
+            im.save(out_filepath)

video_diffusion/stable_diffusion_video/utils.py ADDED Viewed

	@@ -0,0 +1,135 @@

+from pathlib import Path
+from typing import Union
+import librosa
+import numpy as np
+import torch
+from PIL import Image
+from torchvision.io import write_video
+from torchvision.transforms.functional import pil_to_tensor
+def get_timesteps_arr(audio_filepath, offset, duration, fps=30, margin=1.0, smooth=0.0):
+    """Build per-frame interpolation weights that follow the percussive energy of an audio clip.
+    Args:
+        audio_filepath: Audio file passed to ``librosa.load``.
+        offset: Start position passed to ``librosa.load`` as ``offset``.
+        duration: Clip length passed to ``librosa.load``; together with ``fps`` it fixes the output length.
+        fps (int, optional): Frames per unit of ``duration``. Defaults to 30.
+        margin (float, optional): ``margin`` forwarded to ``librosa.decompose.hpss``. Defaults to 1.0.
+        smooth (float, optional): Blend factor towards a linear 0..1 ramp; 0 keeps the audio-driven curve and
+            1 returns the pure ramp. Defaults to 0.0.
+    Returns:
+        np.ndarray: 1-D array with ``int(duration * fps)`` values derived from the normalised cumulative
+        percussive energy, blended with a linear ramp according to ``smooth``.
+    """
+    y, sr = librosa.load(audio_filepath, offset=offset, duration=duration)
+    # librosa.stft hardcoded defaults...
+    # n_fft defaults to 2048
+    # hop length is win_length // 4
+    # win_length defaults to n_fft
+    D = librosa.stft(y, n_fft=2048, hop_length=2048 // 4, win_length=2048)
+    # Extract percussive elements (the harmonic component is not used)
+    _, D_percussive = librosa.decompose.hpss(D, margin=margin)
+    y_percussive = librosa.istft(D_percussive, length=len(y))
+    # Get normalized melspectrogram
+    spec_raw = librosa.feature.melspectrogram(y=y_percussive, sr=sr)
+    spec_max = np.amax(spec_raw, axis=0)
+    spec_range = np.ptp(spec_max)
+    if spec_range > 0:
+        spec_norm = (spec_max - np.min(spec_max)) / spec_range
+    else:
+        # Flat envelope (e.g. digital silence): dividing by a zero range would turn every value into NaN,
+        # so weight all spectrogram frames equally, which produces a uniform ramp below.
+        spec_norm = np.ones_like(spec_max)
+    # Resize cumsum of spec norm to our desired number of interpolation frames
+    x_norm = np.linspace(0, spec_norm.shape[-1], spec_norm.shape[-1])
+    y_norm = np.cumsum(spec_norm)
+    y_norm /= y_norm[-1]
+    x_resize = np.linspace(0, y_norm.shape[-1], int(duration * fps))
+    T = np.interp(x_resize, x_norm, y_norm)
+    # Apply smoothing
+    return T * (1 - smooth) + np.linspace(0.0, 1.0, T.shape[0]) * smooth
+def slerp(t, v0, v1, DOT_THRESHOLD=0.9995):
+    """Spherically interpolate between two arrays ``v0`` and ``v1``.
+    Args:
+        t: Interpolation position; 0 yields ``v0`` and 1 yields ``v1``.
+        v0: Start array (``np.ndarray`` or ``torch.Tensor``).
+        v1: End array of the same kind as ``v0``.
+        DOT_THRESHOLD (float, optional): When the absolute cosine between the inputs exceeds this value the
+            inputs are treated as nearly (anti-)parallel and plain linear interpolation is used instead.
+            Defaults to 0.9995.
+    Returns:
+        The interpolated array. Torch inputs produce a tensor on the device and with the dtype of ``v0``;
+        NumPy inputs produce a NumPy result.
+    """
+    inputs_are_torch = isinstance(v0, torch.Tensor)
+    if inputs_are_torch:
+        input_device = v0.device
+        input_dtype = v0.dtype
+        # detach() lets tensors that require grad be converted; the math below runs in NumPy anyway.
+        v0 = v0.detach().cpu().numpy()
+        v1 = v1.detach().cpu().numpy()
+    dot = np.sum(v0 * v1 / (np.linalg.norm(v0) * np.linalg.norm(v1)))
+    if np.abs(dot) > DOT_THRESHOLD:
+        # Nearly collinear inputs: sin(theta_0) would be close to zero, so fall back to a linear blend.
+        v2 = (1 - t) * v0 + t * v1
+    else:
+        theta_0 = np.arccos(dot)
+        sin_theta_0 = np.sin(theta_0)
+        theta_t = theta_0 * t
+        sin_theta_t = np.sin(theta_t)
+        s0 = np.sin(theta_0 - theta_t) / sin_theta_0
+        s1 = sin_theta_t / sin_theta_0
+        v2 = s0 * v0 + s1 * v1
+    if inputs_are_torch:
+        # Restore the input dtype as well: a float64 ``t`` can promote the NumPy result to float64.
+        v2 = torch.from_numpy(v2).to(device=input_device, dtype=input_dtype)
+    return v2
+def make_video_pyav(
+    frames_or_frame_dir: Union[str, Path, torch.Tensor],
+    audio_filepath: Union[str, Path] = None,
+    fps: int = 30,
+    audio_offset: int = 0,
+    audio_duration: int = 2,
+    sr: int = 22050,
+    output_filepath: Union[str, Path] = "output.mp4",
+    glob_pattern: str = "*.png",
+):
+    """Write frames, optionally together with an audio clip, to a video file via torchvision's ``write_video``.
+    Args:
+        frames_or_frame_dir (Union[str, Path, torch.Tensor]):
+            Either a directory of images, or a tensor of shape (T, C, H, W) in range [0, 255].
+        audio_filepath (Union[str, Path], optional): Audio file to add to the video; no audio is written
+            when this is falsy. Defaults to None.
+        fps (int, optional): Frame rate passed to ``write_video``. Defaults to 30.
+        audio_offset (int, optional): ``offset`` passed to ``librosa.load``. Defaults to 0.
+        audio_duration (int, optional): ``duration`` passed to ``librosa.load``. Defaults to 2.
+        sr (int, optional): Sample rate used to load the audio and to write it. Defaults to 22050.
+        output_filepath (Union[str, Path], optional): Destination file. Defaults to "output.mp4".
+        glob_pattern (str, optional): Pattern selecting the frame images when a directory is given; the
+            matches are used in sorted order. Defaults to "*.png".
+    Returns:
+        str: ``output_filepath`` converted to a string.
+    Raises:
+        ValueError: If a directory is given and nothing in it matches ``glob_pattern``.
+    """
+    # Torchvision write_video doesn't support pathlib paths
+    output_filepath = str(output_filepath)
+    if isinstance(frames_or_frame_dir, (str, Path)):
+        frame_list = []
+        for img_path in sorted(Path(frames_or_frame_dir).glob(glob_pattern)):
+            # pil_to_tensor reads the pixel data, after which the image file can be closed again.
+            with Image.open(img_path) as img:
+                frame_list.append(pil_to_tensor(img))
+        if not frame_list:
+            raise ValueError(f"No frames matching {glob_pattern!r} found in {frames_or_frame_dir}")
+        # Stack once at the end; concatenating inside the loop would re-copy all earlier frames every time.
+        frames = torch.stack(frame_list)
+    else:
+        frames = frames_or_frame_dir
+    # TCHW -> THWC
+    frames = frames.permute(0, 2, 3, 1)
+    write_kwargs = {"fps": fps, "options": {"crf": "10", "pix_fmt": "yuv420p"}}
+    if audio_filepath:
+        # Read audio, convert to tensor
+        audio, sr = librosa.load(
+            audio_filepath,
+            sr=sr,
+            mono=True,
+            offset=audio_offset,
+            duration=audio_duration,
+        )
+        # The mono signal gets a leading axis of size 1 before it is handed to write_video.
+        write_kwargs.update(audio_array=torch.tensor(audio).unsqueeze(0), audio_fps=sr, audio_codec="aac")
+    write_video(output_filepath, frames, **write_kwargs)
+    return output_filepath
+def pad_along_axis(array: np.ndarray, pad_size: int, axis: int = 0) -> np.ndarray:
+    """Append ``pad_size`` zeros to the end of ``array`` along ``axis``.
+    The input is returned unchanged (same object) when ``pad_size`` is zero or negative.
+    """
+    if pad_size > 0:
+        # No padding on any axis except the requested one, which grows at its end only.
+        widths = [(0, 0) for _ in range(array.ndim)]
+        widths[axis] = (0, pad_size)
+        return np.pad(array, pad_width=widths, mode="constant", constant_values=0)
+    return array

video_diffusion/tuneavideo/models/attention.py ADDED Viewed

	@@ -0,0 +1,322 @@

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention.py
+from dataclasses import dataclass
+from typing import Optional
+import torch
+import torch.nn.functional as F
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.attention import AdaLayerNorm, FeedForward
+from diffusers.models.cross_attention import CrossAttention
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput
+from diffusers.utils.import_utils import is_xformers_available
+from einops import rearrange, repeat
+from torch import nn
+@dataclass
+class Transformer3DModelOutput(BaseOutput):
+    """Output container returned by ``Transformer3DModel.forward`` when ``return_dict`` is true."""
+    # Transformer result, rearranged back to the "b c f h w" layout before it is returned.
+    sample: torch.FloatTensor
+if is_xformers_available():
+    import xformers
+    import xformers.ops
+else:
+    xformers = None
+class Transformer3DModel(ModelMixin, ConfigMixin):
+    """Transformer over 5D video features.
+    The frames of the input are folded into the batch axis, group-normalised, projected to the transformer
+    width, passed through ``num_layers`` ``BasicTransformerBlock`` instances, projected back and added to the
+    input as a residual.
+    """
+    @register_to_config
+    def __init__(
+        self,
+        num_attention_heads: int = 16,
+        attention_head_dim: int = 88,
+        in_channels: Optional[int] = None,
+        num_layers: int = 1,
+        dropout: float = 0.0,
+        norm_num_groups: int = 32,
+        cross_attention_dim: Optional[int] = None,
+        attention_bias: bool = False,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        use_linear_projection: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+    ):
+        """Create the normalisation, the input/output projections and the transformer blocks.
+        The transformer width is ``num_attention_heads * attention_head_dim``. ``use_linear_projection``
+        selects ``nn.Linear`` projections instead of 1x1 ``nn.Conv2d`` ones; the remaining arguments are
+        forwarded to every ``BasicTransformerBlock``.
+        """
+        super().__init__()
+        self.use_linear_projection = use_linear_projection
+        self.num_attention_heads = num_attention_heads
+        self.attention_head_dim = attention_head_dim
+        inner_dim = num_attention_heads * attention_head_dim
+        # Define input layers
+        self.in_channels = in_channels
+        self.norm = torch.nn.GroupNorm(num_groups=norm_num_groups, num_channels=in_channels, eps=1e-6, affine=True)
+        if use_linear_projection:
+            self.proj_in = nn.Linear(in_channels, inner_dim)
+        else:
+            self.proj_in = nn.Conv2d(in_channels, inner_dim, kernel_size=1, stride=1, padding=0)
+        # Define transformers blocks
+        self.transformer_blocks = nn.ModuleList(
+            [
+                BasicTransformerBlock(
+                    inner_dim,
+                    num_attention_heads,
+                    attention_head_dim,
+                    dropout=dropout,
+                    cross_attention_dim=cross_attention_dim,
+                    activation_fn=activation_fn,
+                    num_embeds_ada_norm=num_embeds_ada_norm,
+                    attention_bias=attention_bias,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+                for _ in range(num_layers)
+            ]
+        )
+        # Define output layers
+        if use_linear_projection:
+            # Mirror of proj_in: map the transformer width back to the input channels. The weight has the
+            # same shape as before whenever inner_dim == in_channels, the only case the old code could run.
+            self.proj_out = nn.Linear(inner_dim, in_channels)
+        else:
+            self.proj_out = nn.Conv2d(inner_dim, in_channels, kernel_size=1, stride=1, padding=0)
+    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, return_dict: bool = True):
+        """Run the transformer on a video feature tensor.
+        Args:
+            hidden_states: 5D tensor laid out as "b c f h w".
+            encoder_hidden_states: Optional conditioning laid out as "b n c"; it is repeated once per frame.
+            timestep: Forwarded to the transformer blocks.
+            return_dict (bool, optional): Return a ``Transformer3DModelOutput`` instead of a tuple.
+        Returns:
+            ``Transformer3DModelOutput`` (or a 1-tuple when ``return_dict`` is false) holding a tensor laid
+            out as "b c f h w".
+        """
+        # Input
+        assert hidden_states.dim() == 5, f"Expected hidden_states to have ndim=5, but got ndim={hidden_states.dim()}."
+        video_length = hidden_states.shape[2]
+        # Fold the frame axis into the batch axis so each frame is handled like an image.
+        hidden_states = rearrange(hidden_states, "b c f h w -> (b f) c h w")
+        if encoder_hidden_states is not None:
+            # Match the folded batch axis; skipped for the default None, which einops cannot repeat.
+            encoder_hidden_states = repeat(encoder_hidden_states, "b n c -> (b f) n c", f=video_length)
+        batch, _, height, width = hidden_states.shape
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        if not self.use_linear_projection:
+            hidden_states = self.proj_in(hidden_states)
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+        else:
+            # In this branch inner_dim is the input channel count, because it is read before proj_in.
+            inner_dim = hidden_states.shape[1]
+            hidden_states = hidden_states.permute(0, 2, 3, 1).reshape(batch, height * width, inner_dim)
+            hidden_states = self.proj_in(hidden_states)
+        # Blocks
+        for block in self.transformer_blocks:
+            hidden_states = block(
+                hidden_states, encoder_hidden_states=encoder_hidden_states, timestep=timestep, video_length=video_length
+            )
+        # Output
+        if not self.use_linear_projection:
+            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+            hidden_states = self.proj_out(hidden_states)
+        else:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height, width, inner_dim).permute(0, 3, 1, 2).contiguous()
+        output = hidden_states + residual
+        # Unfold the frames out of the batch axis again.
+        output = rearrange(output, "(b f) c h w -> b c f h w", f=video_length)
+        if not return_dict:
+            return (output,)
+        return Transformer3DModelOutput(sample=output)
+class BasicTransformerBlock(nn.Module):
+    """Transformer block: sparse-causal attention, optional cross-attention, feed-forward, temporal attention.
+    Each sub-layer runs on a normalised copy of the hidden states and its result is added back residually.
+    """
+    def __init__(
+        self,
+        dim: int,
+        num_attention_heads: int,
+        attention_head_dim: int,
+        dropout=0.0,
+        cross_attention_dim: Optional[int] = None,
+        activation_fn: str = "geglu",
+        num_embeds_ada_norm: Optional[int] = None,
+        attention_bias: bool = False,
+        only_cross_attention: bool = False,
+        upcast_attention: bool = False,
+    ):
+        """Build the attention, normalisation and feed-forward sub-layers.
+        ``AdaLayerNorm`` replaces ``nn.LayerNorm`` for the attention norms when ``num_embeds_ada_norm`` is
+        given. The cross-attention layer and its norm exist only when ``cross_attention_dim`` is given.
+        """
+        super().__init__()
+        self.only_cross_attention = only_cross_attention
+        self.use_ada_layer_norm = num_embeds_ada_norm is not None
+        # SC-Attn
+        self.attn1 = SparseCausalAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            cross_attention_dim=cross_attention_dim if only_cross_attention else None,
+            upcast_attention=upcast_attention,
+        )
+        self.norm1 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
+        # Cross-Attn
+        if cross_attention_dim is not None:
+            self.attn2 = CrossAttention(
+                query_dim=dim,
+                cross_attention_dim=cross_attention_dim,
+                heads=num_attention_heads,
+                dim_head=attention_head_dim,
+                dropout=dropout,
+                bias=attention_bias,
+                upcast_attention=upcast_attention,
+            )
+        else:
+            self.attn2 = None
+        if cross_attention_dim is not None:
+            self.norm2 = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
+        else:
+            self.norm2 = None
+        # Feed-forward
+        self.ff = FeedForward(dim, dropout=dropout, activation_fn=activation_fn)
+        self.norm3 = nn.LayerNorm(dim)
+        # Temp-Attn
+        self.attn_temp = CrossAttention(
+            query_dim=dim,
+            heads=num_attention_heads,
+            dim_head=attention_head_dim,
+            dropout=dropout,
+            bias=attention_bias,
+            upcast_attention=upcast_attention,
+        )
+        # Zero-initialise the weight of the temporal attention's output projection.
+        nn.init.zeros_(self.attn_temp.to_out[0].weight.data)
+        self.norm_temp = AdaLayerNorm(dim, num_embeds_ada_norm) if self.use_ada_layer_norm else nn.LayerNorm(dim)
+    def set_use_memory_efficient_attention_xformers(self, use_memory_efficient_attention_xformers: bool):
+        """Switch xformers memory-efficient attention on or off for ``attn1`` and ``attn2``.
+        The environment is validated only when the feature is being enabled, so it can always be disabled.
+        Raises:
+            ModuleNotFoundError: When enabling and xformers is not installed.
+            ValueError: When enabling and CUDA is not available.
+        """
+        if use_memory_efficient_attention_xformers:
+            if not is_xformers_available():
+                raise ModuleNotFoundError(
+                    "Refer to https://github.com/facebookresearch/xformers for more information on how to install"
+                    " xformers",
+                    name="xformers",
+                )
+            if not torch.cuda.is_available():
+                raise ValueError(
+                    "torch.cuda.is_available() should be True but is False. xformers' memory efficient attention is only"
+                    " available for GPU "
+                )
+            # Make sure we can run the memory efficient attention; any failure propagates to the caller.
+            _ = xformers.ops.memory_efficient_attention(
+                torch.randn((1, 2, 40), device="cuda"),
+                torch.randn((1, 2, 40), device="cuda"),
+                torch.randn((1, 2, 40), device="cuda"),
+            )
+        self.attn1._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+        if self.attn2 is not None:
+            self.attn2._use_memory_efficient_attention_xformers = use_memory_efficient_attention_xformers
+        # attn_temp is left untouched here, as before (its assignment was commented out in the original code).
+    def forward(self, hidden_states, encoder_hidden_states=None, timestep=None, attention_mask=None, video_length=None):
+        """Apply the block to hidden states laid out as "(b f) d c".
+        Args:
+            hidden_states: Token features with the frames folded into the batch axis.
+            encoder_hidden_states: Optional conditioning for the cross-attention layers.
+            timestep: Passed to the ``AdaLayerNorm`` layers when they are in use.
+            attention_mask: Optional mask forwarded to ``attn1`` and ``attn2``.
+            video_length: Number of frames folded into the batch axis.
+        Returns:
+            Tensor with the same "(b f) d c" layout as the input.
+        """
+        # SparseCausal-Attention
+        norm_hidden_states = (
+            self.norm1(hidden_states, timestep) if self.use_ada_layer_norm else self.norm1(hidden_states)
+        )
+        if self.only_cross_attention:
+            # video_length is required by SparseCausalAttention in this branch too.
+            hidden_states = (
+                self.attn1(
+                    norm_hidden_states, encoder_hidden_states, attention_mask=attention_mask, video_length=video_length
+                )
+                + hidden_states
+            )
+        else:
+            hidden_states = (
+                self.attn1(norm_hidden_states, attention_mask=attention_mask, video_length=video_length) + hidden_states
+            )
+        if self.attn2 is not None:
+            # Cross-Attention
+            norm_hidden_states = (
+                self.norm2(hidden_states, timestep) if self.use_ada_layer_norm else self.norm2(hidden_states)
+            )
+            hidden_states = (
+                self.attn2(
+                    norm_hidden_states, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask
+                )
+                + hidden_states
+            )
+        # Feed-forward
+        hidden_states = self.ff(self.norm3(hidden_states)) + hidden_states
+        # Temporal-Attention: regroup so the sequence axis runs over the frames of each token position.
+        d = hidden_states.shape[1]
+        hidden_states = rearrange(hidden_states, "(b f) d c -> (b d) f c", f=video_length)
+        norm_hidden_states = (
+            self.norm_temp(hidden_states, timestep) if self.use_ada_layer_norm else self.norm_temp(hidden_states)
+        )
+        hidden_states = self.attn_temp(norm_hidden_states) + hidden_states
+        hidden_states = rearrange(hidden_states, "(b d) f c -> (b f) d c", d=d)
+        return hidden_states
+class SparseCausalAttention(CrossAttention):
+    """Attention in which every frame attends to the tokens of the first frame and of the preceding frame.
+    For frame 0 the "preceding" frame is frame 0 itself.
+    """
+    def forward(self, hidden_states, encoder_hidden_states=None, attention_mask=None, video_length=None):
+        """Compute sparse-causal attention for inputs laid out as "(b f) d c" with ``f == video_length``."""
+        _, sequence_length, _ = hidden_states.shape
+        if self.group_norm is not None:
+            hidden_states = self.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
+        query = self.to_q(hidden_states)
+        dim = query.shape[-1]
+        query = self.reshape_heads_to_batch_dim(query)
+        if self.added_kv_proj_dim is not None:
+            raise NotImplementedError
+        # Without explicit conditioning the layer attends over its own hidden states.
+        context = hidden_states if encoder_hidden_states is None else encoder_hidden_states
+        projected_key = self.to_k(context)
+        projected_value = self.to_v(context)
+        # Index of the preceding frame for every frame; frame 0 points at itself.
+        previous_frame_index = torch.arange(video_length) - 1
+        previous_frame_index[0] = 0
+        first_frame_index = [0] * video_length
+        def gather_frames(projected):
+            # Concatenate, along the token axis, the first frame's and the preceding frame's tokens.
+            per_frame = rearrange(projected, "(b f) d c -> b f d c", f=video_length)
+            paired = torch.cat([per_frame[:, first_frame_index], per_frame[:, previous_frame_index]], dim=2)
+            return self.reshape_heads_to_batch_dim(rearrange(paired, "b f d c -> (b f) d c"))
+        key = gather_frames(projected_key)
+        value = gather_frames(projected_value)
+        if attention_mask is not None and attention_mask.shape[-1] != query.shape[1]:
+            target_length = query.shape[1]
+            attention_mask = F.pad(attention_mask, (0, target_length), value=0.0)
+            attention_mask = attention_mask.repeat_interleave(self.heads, dim=0)
+        # attention, what we cannot get enough of
+        if self._use_memory_efficient_attention_xformers:
+            attended = self._memory_efficient_attention_xformers(query, key, value, attention_mask)
+            # Some versions of xformers return output in fp32, cast it back to the dtype of the input
+            attended = attended.to(query.dtype)
+        elif self._slice_size is None or query.shape[0] // self._slice_size == 1:
+            attended = self._attention(query, key, value, attention_mask)
+        else:
+            attended = self._sliced_attention(query, key, value, sequence_length, dim, attention_mask)
+        # linear proj followed by dropout
+        linear_proj, dropout = self.to_out[0], self.to_out[1]
+        return dropout(linear_proj(attended))

video_diffusion/tuneavideo/models/resnet.py ADDED Viewed

	@@ -0,0 +1,208 @@

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/resnet.py
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from einops import rearrange
+class InflatedConv3d(nn.Conv2d):
+    """``nn.Conv2d`` applied to every frame of a 5D tensor laid out as "b c f h w"."""
+    def forward(self, x):
+        """Fold the frames into the batch axis, run the 2D convolution, then unfold the frames again."""
+        num_frames = x.shape[2]
+        frames_as_batch = rearrange(x, "b c f h w -> (b f) c h w")
+        convolved = super().forward(frames_as_batch)
+        return rearrange(convolved, "(b f) c h w -> b c f h w", f=num_frames)
+class Upsample3D(nn.Module):
+    """Nearest-neighbour upsampling for 5D tensors, optionally followed by an ``InflatedConv3d``.
+    Without an explicit ``output_size`` the last two axes are doubled while the third axis keeps its length.
+    The convolution (or ``None``) is stored as ``conv`` when ``name == "conv"`` and as ``Conv2d_0`` otherwise.
+    """
+    def __init__(self, channels, use_conv=False, use_conv_transpose=False, out_channels=None, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.use_conv_transpose = use_conv_transpose
+        self.name = name
+        if use_conv_transpose:
+            raise NotImplementedError
+        conv = InflatedConv3d(self.channels, self.out_channels, 3, padding=1) if use_conv else None
+        setattr(self, "conv" if name == "conv" else "Conv2d_0", conv)
+    def forward(self, hidden_states, output_size=None):
+        """Upsample ``hidden_states``; ``output_size``, when given, replaces the fixed scale factor."""
+        assert hidden_states.shape[1] == self.channels
+        if self.use_conv_transpose:
+            raise NotImplementedError
+        # Cast to float32 as the 'upsample_nearest2d_out_frame' op does not support bfloat16
+        original_dtype = hidden_states.dtype
+        is_bfloat16 = original_dtype == torch.bfloat16
+        if is_bfloat16:
+            hidden_states = hidden_states.to(torch.float32)
+        # upsample_nearest_nhwc fails with large batch sizes. see https://github.com/huggingface/diffusers/issues/984
+        if hidden_states.shape[0] >= 64:
+            hidden_states = hidden_states.contiguous()
+        # if `output_size` is passed we force the interpolation output
+        # size and do not make use of `scale_factor=2`
+        target = {"scale_factor": [1.0, 2.0, 2.0]} if output_size is None else {"size": output_size}
+        hidden_states = F.interpolate(hidden_states, mode="nearest", **target)
+        # If the input is bfloat16, we cast back to bfloat16
+        if is_bfloat16:
+            hidden_states = hidden_states.to(original_dtype)
+        if not self.use_conv:
+            return hidden_states
+        conv = self.conv if self.name == "conv" else self.Conv2d_0
+        return conv(hidden_states)
+class Downsample3D(nn.Module):
+    """Downsampling layer built from a stride-2 ``InflatedConv3d``; only ``use_conv=True`` is implemented."""
+    def __init__(self, channels, use_conv=False, out_channels=None, padding=1, name="conv"):
+        super().__init__()
+        self.channels = channels
+        self.out_channels = out_channels or channels
+        self.use_conv = use_conv
+        self.padding = padding
+        self.name = name
+        if not use_conv:
+            raise NotImplementedError
+        downsampler = InflatedConv3d(self.channels, self.out_channels, 3, stride=2, padding=padding)
+        if name == "conv":
+            # With the default name the same layer is also reachable as ``Conv2d_0``.
+            self.Conv2d_0 = downsampler
+        self.conv = downsampler
+    def forward(self, hidden_states):
+        """Apply the strided convolution; ``padding == 0`` is not implemented."""
+        assert hidden_states.shape[1] == self.channels
+        if self.use_conv and self.padding == 0:
+            raise NotImplementedError
+        return self.conv(hidden_states)
+class ResnetBlock3D(nn.Module):
+    """Residual block made of two ``InflatedConv3d`` layers with optional timestep conditioning.
+    ``time_embedding_norm`` selects how the projected timestep embedding is used: "default" adds it after the
+    first convolution, "scale_shift" applies it as a scale and a shift after the second normalisation.
+    """
+    def __init__(
+        self,
+        *,
+        in_channels,
+        out_channels=None,
+        conv_shortcut=False,
+        dropout=0.0,
+        temb_channels=512,
+        groups=32,
+        groups_out=None,
+        pre_norm=True,
+        eps=1e-6,
+        non_linearity="swish",
+        time_embedding_norm="default",
+        output_scale_factor=1.0,
+        use_in_shortcut=None,
+    ):
+        super().__init__()
+        # The ``pre_norm`` argument is accepted, but the attribute always ends up True.
+        self.pre_norm = True
+        self.in_channels = in_channels
+        if out_channels is None:
+            out_channels = in_channels
+        self.out_channels = out_channels
+        self.use_conv_shortcut = conv_shortcut
+        self.time_embedding_norm = time_embedding_norm
+        self.output_scale_factor = output_scale_factor
+        groups_out = groups if groups_out is None else groups_out
+        self.norm1 = torch.nn.GroupNorm(num_groups=groups, num_channels=in_channels, eps=eps, affine=True)
+        self.conv1 = InflatedConv3d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        if temb_channels is None:
+            self.time_emb_proj = None
+        else:
+            if self.time_embedding_norm == "default":
+                projection_width = out_channels
+            elif self.time_embedding_norm == "scale_shift":
+                # Twice the width: forward() splits the projection into a scale half and a shift half.
+                projection_width = out_channels * 2
+            else:
+                raise ValueError(f"unknown time_embedding_norm : {self.time_embedding_norm} ")
+            self.time_emb_proj = torch.nn.Linear(temb_channels, projection_width)
+        self.norm2 = torch.nn.GroupNorm(num_groups=groups_out, num_channels=out_channels, eps=eps, affine=True)
+        self.dropout = torch.nn.Dropout(dropout)
+        self.conv2 = InflatedConv3d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
+        # Any other ``non_linearity`` value leaves ``self.nonlinearity`` unset.
+        if non_linearity == "swish":
+            self.nonlinearity = F.silu
+        elif non_linearity == "mish":
+            self.nonlinearity = Mish()
+        elif non_linearity == "silu":
+            self.nonlinearity = nn.SiLU()
+        if use_in_shortcut is None:
+            # By default a projection is used on the skip path only when the channel count changes.
+            use_in_shortcut = self.in_channels != self.out_channels
+        self.use_in_shortcut = use_in_shortcut
+        self.conv_shortcut = (
+            InflatedConv3d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
+            if self.use_in_shortcut
+            else None
+        )
+    def forward(self, input_tensor, temb):
+        """Return ``(shortcut(input_tensor) + residual_branch(input_tensor, temb)) / output_scale_factor``."""
+        hidden_states = self.conv1(self.nonlinearity(self.norm1(input_tensor)))
+        has_temb = temb is not None
+        if has_temb:
+            # Three trailing singleton axes let the embedding broadcast over the 5D feature tensor.
+            temb = self.time_emb_proj(self.nonlinearity(temb))[:, :, None, None, None]
+            if self.time_embedding_norm == "default":
+                hidden_states = hidden_states + temb
+        hidden_states = self.norm2(hidden_states)
+        if has_temb and self.time_embedding_norm == "scale_shift":
+            scale, shift = torch.chunk(temb, 2, dim=1)
+            hidden_states = hidden_states * (1 + scale) + shift
+        hidden_states = self.conv2(self.dropout(self.nonlinearity(hidden_states)))
+        shortcut = input_tensor if self.conv_shortcut is None else self.conv_shortcut(input_tensor)
+        return (shortcut + hidden_states) / self.output_scale_factor
+class Mish(torch.nn.Module):
+    """Mish activation: ``x * tanh(softplus(x))``."""
+    def forward(self, hidden_states):
+        gate = torch.tanh(F.softplus(hidden_states))
+        return hidden_states * gate

video_diffusion/tuneavideo/models/unet.py ADDED Viewed

	@@ -0,0 +1,437 @@

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_condition.py
+import json
+import os
+from dataclasses import dataclass
+from typing import List, Optional, Tuple, Union
+import torch
+import torch.nn as nn
+import torch.utils.checkpoint
+from diffusers.configuration_utils import ConfigMixin, register_to_config
+from diffusers.models.embeddings import TimestepEmbedding, Timesteps
+from diffusers.models.modeling_utils import ModelMixin
+from diffusers.utils import BaseOutput, logging
+from .resnet import InflatedConv3d
+from .unet_blocks import (
+    CrossAttnDownBlock3D,
+    CrossAttnUpBlock3D,
+    DownBlock3D,
+    UNetMidBlock3DCrossAttn,
+    UpBlock3D,
+    get_down_block,
+    get_up_block,
+)
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class UNet3DConditionOutput(BaseOutput):
+    """Output container with a single ``sample`` tensor; presumably what ``UNet3DConditionModel`` returns (TODO confirm)."""
+    sample: torch.FloatTensor
+class UNet3DConditionModel(ModelMixin, ConfigMixin):
+    _supports_gradient_checkpointing = True
+    @register_to_config
+    def __init__(
+        self,
+        sample_size: Optional[int] = None,
+        in_channels: int = 4,
+        out_channels: int = 4,
+        center_input_sample: bool = False,
+        flip_sin_to_cos: bool = True,
+        freq_shift: int = 0,
+        down_block_types: Tuple[str] = (
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ),
+        mid_block_type: str = "UNetMidBlock3DCrossAttn",
+        up_block_types: Tuple[str] = ("UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"),
+        only_cross_attention: Union[bool, Tuple[bool]] = False,
+        block_out_channels: Tuple[int] = (320, 640, 1280, 1280),
+        layers_per_block: int = 2,
+        downsample_padding: int = 1,
+        mid_block_scale_factor: float = 1,
+        act_fn: str = "silu",
+        norm_num_groups: int = 32,
+        norm_eps: float = 1e-5,
+        cross_attention_dim: int = 1280,
+        attention_head_dim: Union[int, Tuple[int]] = 8,
+        dual_cross_attention: bool = False,
+        use_linear_projection: bool = False,
+        class_embed_type: Optional[str] = None,
+        num_class_embeds: Optional[int] = None,
+        upcast_attention: bool = False,
+        resnet_time_scale_shift: str = "default",
+    ):
+        super().__init__()
+        self.sample_size = sample_size
+        time_embed_dim = block_out_channels[0] * 4
+        # input
+        self.conv_in = InflatedConv3d(in_channels, block_out_channels[0], kernel_size=3, padding=(1, 1))
+        # time
+        self.time_proj = Timesteps(block_out_channels[0], flip_sin_to_cos, freq_shift)
+        timestep_input_dim = block_out_channels[0]
+        self.time_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        # class embedding
+        if class_embed_type is None and num_class_embeds is not None:
+            self.class_embedding = nn.Embedding(num_class_embeds, time_embed_dim)
+        elif class_embed_type == "timestep":
+            self.class_embedding = TimestepEmbedding(timestep_input_dim, time_embed_dim)
+        elif class_embed_type == "identity":
+            self.class_embedding = nn.Identity(time_embed_dim, time_embed_dim)
+        else:
+            self.class_embedding = None
+        self.down_blocks = nn.ModuleList([])
+        self.mid_block = None
+        self.up_blocks = nn.ModuleList([])
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = [only_cross_attention] * len(down_block_types)
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(down_block_types)
+        # down
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            is_final_block = i == len(block_out_channels) - 1
+            down_block = get_down_block(
+                down_block_type,
+                num_layers=layers_per_block,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                temb_channels=time_embed_dim,
+                add_downsample=not is_final_block,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[i],
+                downsample_padding=downsample_padding,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.down_blocks.append(down_block)
+        # mid
+        if mid_block_type == "UNetMidBlock3DCrossAttn":
+            self.mid_block = UNetMidBlock3DCrossAttn(
+                in_channels=block_out_channels[-1],
+                temb_channels=time_embed_dim,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                output_scale_factor=mid_block_scale_factor,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=attention_head_dim[-1],
+                resnet_groups=norm_num_groups,
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                upcast_attention=upcast_attention,
+            )
+        else:
+            raise ValueError(f"unknown mid_block_type : {mid_block_type}")
+        # count how many layers upsample the videos
+        self.num_upsamplers = 0
+        # up
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(up_block_types):
+            is_final_block = i == len(block_out_channels) - 1
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # add upsample block for all BUT final layer
+            if not is_final_block:
+                add_upsample = True
+                self.num_upsamplers += 1
+            else:
+                add_upsample = False
+            up_block = get_up_block(
+                up_block_type,
+                num_layers=layers_per_block + 1,
+                in_channels=input_channel,
+                out_channels=output_channel,
+                prev_output_channel=prev_output_channel,
+                temb_channels=time_embed_dim,
+                add_upsample=add_upsample,
+                resnet_eps=norm_eps,
+                resnet_act_fn=act_fn,
+                resnet_groups=norm_num_groups,
+                cross_attention_dim=cross_attention_dim,
+                attn_num_head_channels=reversed_attention_head_dim[i],
+                dual_cross_attention=dual_cross_attention,
+                use_linear_projection=use_linear_projection,
+                only_cross_attention=only_cross_attention[i],
+                upcast_attention=upcast_attention,
+                resnet_time_scale_shift=resnet_time_scale_shift,
+            )
+            self.up_blocks.append(up_block)
+            prev_output_channel = output_channel
+        # out
+        self.conv_norm_out = nn.GroupNorm(num_channels=block_out_channels[0], num_groups=norm_num_groups, eps=norm_eps)
+        self.conv_act = nn.SiLU()
+        self.conv_out = InflatedConv3d(block_out_channels[0], out_channels, kernel_size=3, padding=1)
+    def set_attention_slice(self, slice_size):
+        r"""
+        Enable sliced attention computation.
+        When this option is enabled, the attention module will split the input tensor in slices, to compute attention
+        in several steps. This is useful to save some memory in exchange for a small speed decrease.
+        Args:
+            slice_size (`str` or `int` or `list(int)`, *optional*, defaults to `"auto"`):
+                When `"auto"`, halves the input to the attention heads, so attention will be computed in two steps. If
+                `"max"`, maxium amount of memory will be saved by running only one slice at a time. If a number is
+                provided, uses as many slices as `attention_head_dim // slice_size`. In this case, `attention_head_dim`
+                must be a multiple of `slice_size`.
+        """
+        sliceable_head_dims = []
+        def fn_recursive_retrieve_slicable_dims(module: torch.nn.Module):
+            if hasattr(module, "set_attention_slice"):
+                sliceable_head_dims.append(module.sliceable_head_dim)
+            for child in module.children():
+                fn_recursive_retrieve_slicable_dims(child)
+        # retrieve number of attention layers
+        for module in self.children():
+            fn_recursive_retrieve_slicable_dims(module)
+        num_slicable_layers = len(sliceable_head_dims)
+        if slice_size == "auto":
+            # half the attention head size is usually a good trade-off between
+            # speed and memory
+            slice_size = [dim // 2 for dim in sliceable_head_dims]
+        elif slice_size == "max":
+            # make smallest slice possible
+            slice_size = num_slicable_layers * [1]
+        slice_size = num_slicable_layers * [slice_size] if not isinstance(slice_size, list) else slice_size
+        if len(slice_size) != len(sliceable_head_dims):
+            raise ValueError(
+                f"You have provided {len(slice_size)}, but {self.config} has {len(sliceable_head_dims)} different"
+                f" attention layers. Make sure to match `len(slice_size)` to be {len(sliceable_head_dims)}."
+            )
+        for i in range(len(slice_size)):
+            size = slice_size[i]
+            dim = sliceable_head_dims[i]
+            if size is not None and size > dim:
+                raise ValueError(f"size {size} has to be smaller or equal to {dim}.")
+        # Recursively walk through all the children.
+        # Any children which exposes the set_attention_slice method
+        # gets the message
+        def fn_recursive_set_attention_slice(module: torch.nn.Module, slice_size: List[int]):
+            if hasattr(module, "set_attention_slice"):
+                module.set_attention_slice(slice_size.pop())
+            for child in module.children():
+                fn_recursive_set_attention_slice(child, slice_size)
+        reversed_slice_size = list(reversed(slice_size))
+        for module in self.children():
+            fn_recursive_set_attention_slice(module, reversed_slice_size)
+    def _set_gradient_checkpointing(self, module, value=False):
+        if isinstance(module, (CrossAttnDownBlock3D, DownBlock3D, CrossAttnUpBlock3D, UpBlock3D)):
+            module.gradient_checkpointing = value
+    def forward(
+        self,
+        sample: torch.FloatTensor,
+        timestep: Union[torch.Tensor, float, int],
+        encoder_hidden_states: torch.Tensor,
+        class_labels: Optional[torch.Tensor] = None,
+        attention_mask: Optional[torch.Tensor] = None,
+        return_dict: bool = True,
+    ) -> Union[UNet3DConditionOutput, Tuple]:
+        r"""
+        Args:
+            sample (`torch.FloatTensor`): (batch, channel, height, width) noisy inputs tensor
+            timestep (`torch.FloatTensor` or `float` or `int`): (batch) timesteps
+            encoder_hidden_states (`torch.FloatTensor`): (batch, sequence_length, feature_dim) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition.UNet2DConditionOutput`] instead of a plain tuple.
+        Returns:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition.UNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`. When
+            returning a tuple, the first element is the sample tensor.
+        """
+        # By default samples have to be AT least a multiple of the overall upsampling factor.
+        # The overall upsampling factor is equal to 2 ** (# num of upsampling layears).
+        # However, the upsampling interpolation output size can be forced to fit any upsampling size
+        # on the fly if necessary.
+        default_overall_up_factor = 2**self.num_upsamplers
+        # upsample size should be forwarded when sample is not a multiple of `default_overall_up_factor`
+        forward_upsample_size = False
+        upsample_size = None
+        if any(s % default_overall_up_factor != 0 for s in sample.shape[-2:]):
+            logger.info("Forward upsample size to force interpolation output size.")
+            forward_upsample_size = True
+        # prepare attention_mask
+        if attention_mask is not None:
+            attention_mask = (1 - attention_mask.to(sample.dtype)) * -10000.0
+            attention_mask = attention_mask.unsqueeze(1)
+        # center input if necessary
+        if self.config.center_input_sample:
+            sample = 2 * sample - 1.0
+        # time
+        timesteps = timestep
+        if not torch.is_tensor(timesteps):
+            # This would be a good case for the `match` statement (Python 3.10+)
+            is_mps = sample.device.type == "mps"
+            if isinstance(timestep, float):
+                dtype = torch.float32 if is_mps else torch.float64
+            else:
+                dtype = torch.int32 if is_mps else torch.int64
+            timesteps = torch.tensor([timesteps], dtype=dtype, device=sample.device)
+        elif len(timesteps.shape) == 0:
+            timesteps = timesteps[None].to(sample.device)
+        # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
+        timesteps = timesteps.expand(sample.shape[0])
+        t_emb = self.time_proj(timesteps)
+        # timesteps does not contain any weights and will always return f32 tensors
+        # but time_embedding might actually be running in fp16. so we need to cast here.
+        # there might be better ways to encapsulate this.
+        t_emb = t_emb.to(dtype=self.dtype)
+        emb = self.time_embedding(t_emb)
+        if self.class_embedding is not None:
+            if class_labels is None:
+                raise ValueError("class_labels should be provided when num_class_embeds > 0")
+            if self.config.class_embed_type == "timestep":
+                class_labels = self.time_proj(class_labels)
+            class_emb = self.class_embedding(class_labels).to(dtype=self.dtype)
+            emb = emb + class_emb
+        # pre-process
+        sample = self.conv_in(sample)
+        # down
+        down_block_res_samples = (sample,)
+        for downsample_block in self.down_blocks:
+            if hasattr(downsample_block, "has_cross_attention") and downsample_block.has_cross_attention:
+                sample, res_samples = downsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    encoder_hidden_states=encoder_hidden_states,
+                    attention_mask=attention_mask,
+                )
+            else:
+                sample, res_samples = downsample_block(hidden_states=sample, temb=emb)
+            down_block_res_samples += res_samples
+        # mid
+        sample = self.mid_block(sample, emb, encoder_hidden_states=encoder_hidden_states, attention_mask=attention_mask)
+        # up
+        for i, upsample_block in enumerate(self.up_blocks):
+            is_final_block = i == len(self.up_blocks) - 1
+            res_samples = down_block_res_samples[-len(upsample_block.resnets) :]
+            down_block_res_samples = down_block_res_samples[: -len(upsample_block.resnets)]
+            # if we have not reached the final block and need to forward the
+            # upsample size, we do it here
+            if not is_final_block and forward_upsample_size:
+                upsample_size = down_block_res_samples[-1].shape[2:]
+            if hasattr(upsample_block, "has_cross_attention") and upsample_block.has_cross_attention:
+                sample = upsample_block(
+                    hidden_states=sample,
+                    temb=emb,
+                    res_hidden_states_tuple=res_samples,
+                    encoder_hidden_states=encoder_hidden_states,
+                    upsample_size=upsample_size,
+                    attention_mask=attention_mask,
+                )
+            else:
+                sample = upsample_block(
+                    hidden_states=sample, temb=emb, res_hidden_states_tuple=res_samples, upsample_size=upsample_size
+                )
+        # post-process
+        sample = self.conv_norm_out(sample)
+        sample = self.conv_act(sample)
+        sample = self.conv_out(sample)
+        if not return_dict:
+            return (sample,)
+        return UNet3DConditionOutput(sample=sample)
+    @classmethod
+    def from_pretrained_2d(cls, pretrained_model_path, subfolder=None):
+        if subfolder is not None:
+            pretrained_model_path = os.path.join(pretrained_model_path, subfolder)
+        config_file = os.path.join(pretrained_model_path, "config.json")
+        if not os.path.isfile(config_file):
+            raise RuntimeError(f"{config_file} does not exist")
+        with open(config_file, "r") as f:
+            config = json.load(f)
+        config["_class_name"] = cls.__name__
+        config["down_block_types"] = [
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "CrossAttnDownBlock3D",
+            "DownBlock3D",
+        ]
+        config["up_block_types"] = ["UpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D", "CrossAttnUpBlock3D"]
+        from diffusers.utils import WEIGHTS_NAME
+        model = cls.from_config(config)
+        model_file = os.path.join(pretrained_model_path, WEIGHTS_NAME)
+        if not os.path.isfile(model_file):
+            raise RuntimeError(f"{model_file} does not exist")
+        state_dict = torch.load(model_file, map_location="cpu")
+        for k, v in model.state_dict().items():
+            if "_temp." in k:
+                state_dict.update({k: v})
+        model.load_state_dict(state_dict)
+        return model

video_diffusion/tuneavideo/models/unet_blocks.py ADDED Viewed

	@@ -0,0 +1,588 @@

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/unet_2d_blocks.py
+import torch
+from torch import nn
+from .attention import Transformer3DModel
+from .resnet import Downsample3D, ResnetBlock3D, Upsample3D
+def get_down_block(
+    down_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    temb_channels,
+    add_downsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    downsample_padding=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    down_block_type = down_block_type[7:] if down_block_type.startswith("UNetRes") else down_block_type
+    if down_block_type == "DownBlock3D":
+        return DownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif down_block_type == "CrossAttnDownBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnDownBlock3D")
+        return CrossAttnDownBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            temb_channels=temb_channels,
+            add_downsample=add_downsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            downsample_padding=downsample_padding,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{down_block_type} does not exist.")
+def get_up_block(
+    up_block_type,
+    num_layers,
+    in_channels,
+    out_channels,
+    prev_output_channel,
+    temb_channels,
+    add_upsample,
+    resnet_eps,
+    resnet_act_fn,
+    attn_num_head_channels,
+    resnet_groups=None,
+    cross_attention_dim=None,
+    dual_cross_attention=False,
+    use_linear_projection=False,
+    only_cross_attention=False,
+    upcast_attention=False,
+    resnet_time_scale_shift="default",
+):
+    up_block_type = up_block_type[7:] if up_block_type.startswith("UNetRes") else up_block_type
+    if up_block_type == "UpBlock3D":
+        return UpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    elif up_block_type == "CrossAttnUpBlock3D":
+        if cross_attention_dim is None:
+            raise ValueError("cross_attention_dim must be specified for CrossAttnUpBlock3D")
+        return CrossAttnUpBlock3D(
+            num_layers=num_layers,
+            in_channels=in_channels,
+            out_channels=out_channels,
+            prev_output_channel=prev_output_channel,
+            temb_channels=temb_channels,
+            add_upsample=add_upsample,
+            resnet_eps=resnet_eps,
+            resnet_act_fn=resnet_act_fn,
+            resnet_groups=resnet_groups,
+            cross_attention_dim=cross_attention_dim,
+            attn_num_head_channels=attn_num_head_channels,
+            dual_cross_attention=dual_cross_attention,
+            use_linear_projection=use_linear_projection,
+            only_cross_attention=only_cross_attention,
+            upcast_attention=upcast_attention,
+            resnet_time_scale_shift=resnet_time_scale_shift,
+        )
+    raise ValueError(f"{up_block_type} does not exist.")
+class UNetMidBlock3DCrossAttn(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        output_scale_factor=1.0,
+        cross_attention_dim=1280,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        resnet_groups = resnet_groups if resnet_groups is not None else min(in_channels // 4, 32)
+        # there is always at least one resnet
+        resnets = [
+            ResnetBlock3D(
+                in_channels=in_channels,
+                out_channels=in_channels,
+                temb_channels=temb_channels,
+                eps=resnet_eps,
+                groups=resnet_groups,
+                dropout=dropout,
+                time_embedding_norm=resnet_time_scale_shift,
+                non_linearity=resnet_act_fn,
+                output_scale_factor=output_scale_factor,
+                pre_norm=resnet_pre_norm,
+            )
+        ]
+        attentions = []
+        for _ in range(num_layers):
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    in_channels // attn_num_head_channels,
+                    in_channels=in_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    upcast_attention=upcast_attention,
+                )
+            )
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=in_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+            hidden_states = resnet(hidden_states, temb)
+        return hidden_states
+class CrossAttnDownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        downsample_padding=1,
+        add_downsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample3D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None, encoder_hidden_states=None, attention_mask=None):
+        output_states = ()
+        for resnet, attn in zip(self.resnets, self.attentions):
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class DownBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_downsample=True,
+        downsample_padding=1,
+    ):
+        super().__init__()
+        resnets = []
+        for i in range(num_layers):
+            in_channels = in_channels if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=in_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        if add_downsample:
+            self.downsamplers = nn.ModuleList(
+                [
+                    Downsample3D(
+                        out_channels, use_conv=True, out_channels=out_channels, padding=downsample_padding, name="op"
+                    )
+                ]
+            )
+        else:
+            self.downsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, temb=None):
+        output_states = ()
+        for resnet in self.resnets:
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)
+            output_states += (hidden_states,)
+        if self.downsamplers is not None:
+            for downsampler in self.downsamplers:
+                hidden_states = downsampler(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
+class CrossAttnUpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        out_channels: int,
+        prev_output_channel: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        attn_num_head_channels=1,
+        cross_attention_dim=1280,
+        output_scale_factor=1.0,
+        add_upsample=True,
+        dual_cross_attention=False,
+        use_linear_projection=False,
+        only_cross_attention=False,
+        upcast_attention=False,
+    ):
+        super().__init__()
+        resnets = []
+        attentions = []
+        self.has_cross_attention = True
+        self.attn_num_head_channels = attn_num_head_channels
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+            if dual_cross_attention:
+                raise NotImplementedError
+            attentions.append(
+                Transformer3DModel(
+                    attn_num_head_channels,
+                    out_channels // attn_num_head_channels,
+                    in_channels=out_channels,
+                    num_layers=1,
+                    cross_attention_dim=cross_attention_dim,
+                    norm_num_groups=resnet_groups,
+                    use_linear_projection=use_linear_projection,
+                    only_cross_attention=only_cross_attention,
+                    upcast_attention=upcast_attention,
+                )
+            )
+        self.attentions = nn.ModuleList(attentions)
+        self.resnets = nn.ModuleList(resnets)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(
+        self,
+        hidden_states,
+        res_hidden_states_tuple,
+        temb=None,
+        encoder_hidden_states=None,
+        upsample_size=None,
+        attention_mask=None,
+    ):
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module, return_dict=None):
+                    def custom_forward(*inputs):
+                        if return_dict is not None:
+                            return module(*inputs, return_dict=return_dict)
+                        else:
+                            return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+                hidden_states = torch.utils.checkpoint.checkpoint(
+                    create_custom_forward(attn, return_dict=False),
+                    hidden_states,
+                    encoder_hidden_states,
+                )[0]
+            else:
+                hidden_states = resnet(hidden_states, temb)
+                hidden_states = attn(hidden_states, encoder_hidden_states=encoder_hidden_states).sample
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states
+class UpBlock3D(nn.Module):
+    def __init__(
+        self,
+        in_channels: int,
+        prev_output_channel: int,
+        out_channels: int,
+        temb_channels: int,
+        dropout: float = 0.0,
+        num_layers: int = 1,
+        resnet_eps: float = 1e-6,
+        resnet_time_scale_shift: str = "default",
+        resnet_act_fn: str = "swish",
+        resnet_groups: int = 32,
+        resnet_pre_norm: bool = True,
+        output_scale_factor=1.0,
+        add_upsample=True,
+    ):
+        super().__init__()
+        resnets = []
+        for i in range(num_layers):
+            res_skip_channels = in_channels if (i == num_layers - 1) else out_channels
+            resnet_in_channels = prev_output_channel if i == 0 else out_channels
+            resnets.append(
+                ResnetBlock3D(
+                    in_channels=resnet_in_channels + res_skip_channels,
+                    out_channels=out_channels,
+                    temb_channels=temb_channels,
+                    eps=resnet_eps,
+                    groups=resnet_groups,
+                    dropout=dropout,
+                    time_embedding_norm=resnet_time_scale_shift,
+                    non_linearity=resnet_act_fn,
+                    output_scale_factor=output_scale_factor,
+                    pre_norm=resnet_pre_norm,
+                )
+            )
+        self.resnets = nn.ModuleList(resnets)
+        if add_upsample:
+            self.upsamplers = nn.ModuleList([Upsample3D(out_channels, use_conv=True, out_channels=out_channels)])
+        else:
+            self.upsamplers = None
+        self.gradient_checkpointing = False
+    def forward(self, hidden_states, res_hidden_states_tuple, temb=None, upsample_size=None):
+        for resnet in self.resnets:
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = torch.cat([hidden_states, res_hidden_states], dim=1)
+            if self.training and self.gradient_checkpointing:
+                def create_custom_forward(module):
+                    def custom_forward(*inputs):
+                        return module(*inputs)
+                    return custom_forward
+                hidden_states = torch.utils.checkpoint.checkpoint(create_custom_forward(resnet), hidden_states, temb)
+            else:
+                hidden_states = resnet(hidden_states, temb)
+        if self.upsamplers is not None:
+            for upsampler in self.upsamplers:
+                hidden_states = upsampler(hidden_states, upsample_size)
+        return hidden_states

video_diffusion/tuneavideo/pipelines/pipeline_tuneavideo.py ADDED Viewed

	@@ -0,0 +1,411 @@

+# Adapted from https://github.com/huggingface/diffusers/blob/main/src/diffusers/pipelines/stable_diffusion/pipeline_stable_diffusion.py
+import inspect
+from dataclasses import dataclass
+from typing import Callable, List, Optional, Union
+import numpy as np
+import torch
+from diffusers.configuration_utils import FrozenDict
+from diffusers.models import AutoencoderKL
+from diffusers.pipeline_utils import DiffusionPipeline
+from diffusers.schedulers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    LMSDiscreteScheduler,
+    PNDMScheduler,
+)
+from diffusers.utils import BaseOutput, deprecate, is_accelerate_available, logging
+from einops import rearrange
+from packaging import version
+from transformers import CLIPTextModel, CLIPTokenizer
+from ..models.unet import UNet3DConditionModel
+logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
+@dataclass
+class TuneAVideoPipelineOutput(BaseOutput):
+    """Result object returned by ``TuneAVideoPipeline.__call__`` when ``return_dict`` is true."""
+    # Decoded clips laid out as (batch, channels, frames, height, width) with values
+    # clamped to [0, 1]; a torch tensor when ``output_type == "tensor"``, else a numpy array.
+    videos: Union[torch.Tensor, np.ndarray]
+class TuneAVideoPipeline(DiffusionPipeline):
+    _optional_components = []
+    def __init__(
+        self,
+        vae: AutoencoderKL,
+        text_encoder: CLIPTextModel,
+        tokenizer: CLIPTokenizer,
+        unet: UNet3DConditionModel,
+        scheduler: Union[
+            DDIMScheduler,
+            PNDMScheduler,
+            LMSDiscreteScheduler,
+            EulerDiscreteScheduler,
+            EulerAncestralDiscreteScheduler,
+            DPMSolverMultistepScheduler,
+        ],
+    ):
+        """Register the pipeline components after patching outdated configs.
+
+        Three legacy configurations are rewritten in place, each with a
+        deprecation notice: scheduler ``steps_offset`` is forced to 1, scheduler
+        ``clip_sample`` is forced to False, and the UNet ``sample_size`` is
+        forced to 64 when the UNet config predates diffusers 0.9.0 and declares
+        a sample size below 64.
+        """
+        super().__init__()
+        # Legacy scheduler configs: steps_offset must be 1.
+        if hasattr(scheduler.config, "steps_offset") and scheduler.config.steps_offset != 1:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} is outdated. `steps_offset`"
+                f" should be set to 1 instead of {scheduler.config.steps_offset}. Please make sure "
+                "to update the config accordingly as leaving `steps_offset` might led to incorrect results"
+                " in future versions. If you have downloaded this checkpoint from the Hugging Face Hub,"
+                " it would be very nice if you could open a Pull request for the `scheduler/scheduler_config.json`"
+                " file"
+            )
+            deprecate("steps_offset!=1", "1.0.0", deprecation_message, standard_warn=False)
+            # the config is frozen, so build a patched copy and swap it in
+            new_config = dict(scheduler.config)
+            new_config["steps_offset"] = 1
+            scheduler._internal_dict = FrozenDict(new_config)
+        # Legacy scheduler configs: clip_sample must be False.
+        if hasattr(scheduler.config, "clip_sample") and scheduler.config.clip_sample is True:
+            deprecation_message = (
+                f"The configuration file of this scheduler: {scheduler} has not set the configuration `clip_sample`."
+                " `clip_sample` should be set to False in the configuration file. Please make sure to update the"
+                " config accordingly as not setting `clip_sample` in the config might lead to incorrect results in"
+                " future versions. If you have downloaded this checkpoint from the Hugging Face Hub, it would be very"
+                " nice if you could open a Pull request for the `scheduler/scheduler_config.json` file"
+            )
+            deprecate("clip_sample not set", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(scheduler.config)
+            new_config["clip_sample"] = False
+            scheduler._internal_dict = FrozenDict(new_config)
+        # Compare only the base version so dev/rc suffixes do not affect the check.
+        is_unet_version_less_0_9_0 = hasattr(unet.config, "_diffusers_version") and version.parse(
+            version.parse(unet.config._diffusers_version).base_version
+        ) < version.parse("0.9.0.dev0")
+        is_unet_sample_size_less_64 = hasattr(unet.config, "sample_size") and unet.config.sample_size < 64
+        # Old UNet configs with a too-small sample_size are bumped to 64.
+        if is_unet_version_less_0_9_0 and is_unet_sample_size_less_64:
+            deprecation_message = (
+                "The configuration file of the unet has set the default `sample_size` to smaller than"
+                " 64 which seems highly unlikely. If your checkpoint is a fine-tuned version of any of the"
+                " following: \n- CompVis/stable-diffusion-v1-4 \n- CompVis/stable-diffusion-v1-3 \n-"
+                " CompVis/stable-diffusion-v1-2 \n- CompVis/stable-diffusion-v1-1 \n- runwayml/stable-diffusion-v1-5"
+                " \n- runwayml/stable-diffusion-inpainting \n you should change 'sample_size' to 64 in the"
+                " configuration file. Please make sure to update the config accordingly as leaving `sample_size=32`"
+                " in the config might lead to incorrect results in future versions. If you have downloaded this"
+                " checkpoint from the Hugging Face Hub, it would be very nice if you could open a Pull request for"
+                " the `unet/config.json` file"
+            )
+            deprecate("sample_size<64", "1.0.0", deprecation_message, standard_warn=False)
+            new_config = dict(unet.config)
+            new_config["sample_size"] = 64
+            unet._internal_dict = FrozenDict(new_config)
+        self.register_modules(
+            vae=vae,
+            text_encoder=text_encoder,
+            tokenizer=tokenizer,
+            unet=unet,
+            scheduler=scheduler,
+        )
+        # pixel-to-latent ratio: a factor of 2 per VAE block after the first
+        self.vae_scale_factor = 2 ** (len(self.vae.config.block_out_channels) - 1)
+    def enable_vae_slicing(self):
+        """Switch on the VAE's slicing mode (delegates to ``self.vae.enable_slicing()``)."""
+        self.vae.enable_slicing()
+    def disable_vae_slicing(self):
+        """Switch off the VAE's slicing mode (delegates to ``self.vae.disable_slicing()``)."""
+        self.vae.disable_slicing()
+    def enable_sequential_cpu_offload(self, gpu_id=0):
+        if is_accelerate_available():
+            from accelerate import cpu_offload
+        else:
+            raise ImportError("Please install accelerate via `pip install accelerate`")
+        device = torch.device(f"cuda:{gpu_id}")
+        for cpu_offloaded_model in [self.unet, self.text_encoder, self.vae]:
+            if cpu_offloaded_model is not None:
+                cpu_offload(cpu_offloaded_model, device)
+    @property
+    def _execution_device(self):
+        if self.device != torch.device("meta") or not hasattr(self.unet, "_hf_hook"):
+            return self.device
+        for module in self.unet.modules():
+            if (
+                hasattr(module, "_hf_hook")
+                and hasattr(module._hf_hook, "execution_device")
+                and module._hf_hook.execution_device is not None
+            ):
+                return torch.device(module._hf_hook.execution_device)
+        return self.device
+    def _encode_prompt(self, prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt):
+        """Encode ``prompt`` (and optionally the negative prompt) with the text encoder.
+
+        Args:
+            prompt: a string or a list of strings.
+            device: device the token ids (and attention masks) are moved to.
+            num_videos_per_prompt: how many times each prompt embedding is repeated.
+            do_classifier_free_guidance: when true, unconditional embeddings are
+                computed and concatenated in front of the text embeddings.
+            negative_prompt: ``None`` (empty strings are used), or a value of the
+                same type as ``prompt``; a list must match the prompt batch size.
+
+        Returns:
+            The text embeddings; with guidance enabled the first half of the batch
+            holds the unconditional embeddings and the second half the prompt ones.
+
+        Raises:
+            TypeError: if ``negative_prompt`` and ``prompt`` differ in type.
+            ValueError: if a ``negative_prompt`` list has a different batch size.
+        """
+        batch_size = len(prompt) if isinstance(prompt, list) else 1
+        text_inputs = self.tokenizer(
+            prompt,
+            padding="max_length",
+            max_length=self.tokenizer.model_max_length,
+            truncation=True,
+            return_tensors="pt",
+        )
+        text_input_ids = text_inputs.input_ids
+        # tokenize again without truncation only to detect (and report) what was cut off
+        untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
+        if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
+            removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer.model_max_length - 1 : -1])
+            logger.warning(
+                "The following part of your input was truncated because CLIP can only handle sequences up to"
+                f" {self.tokenizer.model_max_length} tokens: {removed_text}"
+            )
+        # pass an attention mask only when the text encoder's config asks for one
+        if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+            attention_mask = text_inputs.attention_mask.to(device)
+        else:
+            attention_mask = None
+        text_embeddings = self.text_encoder(
+            text_input_ids.to(device),
+            attention_mask=attention_mask,
+        )
+        text_embeddings = text_embeddings[0]
+        # duplicate text embeddings for each generation per prompt, using mps friendly method
+        bs_embed, seq_len, _ = text_embeddings.shape
+        text_embeddings = text_embeddings.repeat(1, num_videos_per_prompt, 1)
+        text_embeddings = text_embeddings.view(bs_embed * num_videos_per_prompt, seq_len, -1)
+        # get unconditional embeddings for classifier free guidance
+        if do_classifier_free_guidance:
+            uncond_tokens: List[str]
+            if negative_prompt is None:
+                uncond_tokens = [""] * batch_size
+            elif type(prompt) is not type(negative_prompt):
+                raise TypeError(
+                    f"`negative_prompt` should be the same type to `prompt`, but got {type(negative_prompt)} !="
+                    f" {type(prompt)}."
+                )
+            elif isinstance(negative_prompt, str):
+                uncond_tokens = [negative_prompt]
+            elif batch_size != len(negative_prompt):
+                raise ValueError(
+                    f"`negative_prompt`: {negative_prompt} has batch size {len(negative_prompt)}, but `prompt`:"
+                    f" {prompt} has batch size {batch_size}. Please make sure that passed `negative_prompt` matches"
+                    " the batch size of `prompt`."
+                )
+            else:
+                uncond_tokens = negative_prompt
+            # pad the unconditional tokens to the same length as the prompt tokens
+            max_length = text_input_ids.shape[-1]
+            uncond_input = self.tokenizer(
+                uncond_tokens,
+                padding="max_length",
+                max_length=max_length,
+                truncation=True,
+                return_tensors="pt",
+            )
+            if hasattr(self.text_encoder.config, "use_attention_mask") and self.text_encoder.config.use_attention_mask:
+                attention_mask = uncond_input.attention_mask.to(device)
+            else:
+                attention_mask = None
+            uncond_embeddings = self.text_encoder(
+                uncond_input.input_ids.to(device),
+                attention_mask=attention_mask,
+            )
+            uncond_embeddings = uncond_embeddings[0]
+            # duplicate unconditional embeddings for each generation per prompt, using mps friendly method
+            seq_len = uncond_embeddings.shape[1]
+            uncond_embeddings = uncond_embeddings.repeat(1, num_videos_per_prompt, 1)
+            uncond_embeddings = uncond_embeddings.view(batch_size * num_videos_per_prompt, seq_len, -1)
+            # For classifier free guidance, we need to do two forward passes.
+            # Here we concatenate the unconditional and text embeddings into a single batch
+            # to avoid doing two forward passes
+            text_embeddings = torch.cat([uncond_embeddings, text_embeddings])
+        return text_embeddings
+    def decode_latents(self, latents):
+        """Decode latents frame by frame with the VAE and return a float32 numpy video.
+
+        ``latents`` is indexed as (batch, channels, frames, height, width); frames
+        are folded into the batch axis for decoding and unfolded afterwards. The
+        result is mapped from [-1, 1] to [0, 1] and clamped.
+        """
+        video_length = latents.shape[2]
+        # undo the latent scaling; NOTE(review): 0.18215 is presumably the Stable Diffusion VAE scaling factor — confirm
+        latents = 1 / 0.18215 * latents
+        latents = rearrange(latents, "b c f h w -> (b f) c h w")
+        video = self.vae.decode(latents).sample
+        video = rearrange(video, "(b f) c h w -> b c f h w", f=video_length)
+        video = (video / 2 + 0.5).clamp(0, 1)
+        # we always cast to float32 as this does not cause significant overhead and is compatible with bfloat16
+        video = video.cpu().float().numpy()
+        return video
+    def prepare_extra_step_kwargs(self, generator, eta):
+        # prepare extra kwargs for the scheduler step, since not all schedulers have the same signature
+        # eta (η) is only used with the DDIMScheduler, it will be ignored for other schedulers.
+        # eta corresponds to η in DDIM paper: https://arxiv.org/abs/2010.02502
+        # and should be between [0, 1]
+        accepts_eta = "eta" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        extra_step_kwargs = {}
+        if accepts_eta:
+            extra_step_kwargs["eta"] = eta
+        # check if the scheduler accepts generator
+        accepts_generator = "generator" in set(inspect.signature(self.scheduler.step).parameters.keys())
+        if accepts_generator:
+            extra_step_kwargs["generator"] = generator
+        return extra_step_kwargs
+    def check_inputs(self, prompt, height, width, callback_steps):
+        if not isinstance(prompt, str) and not isinstance(prompt, list):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
+        if height % 8 != 0 or width % 8 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
+        if (callback_steps is None) or (
+            callback_steps is not None and (not isinstance(callback_steps, int) or callback_steps <= 0)
+        ):
+            raise ValueError(
+                f"`callback_steps` has to be a positive integer but is {callback_steps} of type"
+                f" {type(callback_steps)}."
+            )
+    def prepare_latents(
+        self, batch_size, num_channels_latents, video_length, height, width, dtype, device, generator, latents=None
+    ):
+        shape = (
+            batch_size,
+            num_channels_latents,
+            video_length,
+            height // self.vae_scale_factor,
+            width // self.vae_scale_factor,
+        )
+        if isinstance(generator, list) and len(generator) != batch_size:
+            raise ValueError(
+                f"You have passed a list of generators of length {len(generator)}, but requested an effective batch"
+                f" size of {batch_size}. Make sure the batch size matches the length of the generators."
+            )
+        if latents is None:
+            rand_device = "cpu" if device.type == "mps" else device
+            if isinstance(generator, list):
+                shape = (1,) + shape[1:]
+                latents = [
+                    torch.randn(shape, generator=generator[i], device=rand_device, dtype=dtype)
+                    for i in range(batch_size)
+                ]
+                latents = torch.cat(latents, dim=0).to(device)
+            else:
+                latents = torch.randn(shape, generator=generator, device=rand_device, dtype=dtype).to(device)
+        else:
+            if latents.shape != shape:
+                raise ValueError(f"Unexpected latents shape, got {latents.shape}, expected {shape}")
+            latents = latents.to(device)
+        # scale the initial noise by the standard deviation required by the scheduler
+        latents = latents * self.scheduler.init_noise_sigma
+        return latents
+    @torch.no_grad()
+    def __call__(
+        self,
+        prompt: Union[str, List[str]],
+        video_length: Optional[int],
+        height: Optional[int] = None,
+        width: Optional[int] = None,
+        num_inference_steps: int = 50,
+        guidance_scale: float = 7.5,
+        negative_prompt: Optional[Union[str, List[str]]] = None,
+        num_videos_per_prompt: Optional[int] = 1,
+        eta: float = 0.0,
+        generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
+        latents: Optional[torch.FloatTensor] = None,
+        output_type: Optional[str] = "tensor",
+        return_dict: bool = True,
+        callback: Optional[Callable[[int, int, torch.FloatTensor], None]] = None,
+        callback_steps: Optional[int] = 1,
+        **kwargs,
+    ):
+        """Generate videos for ``prompt`` by iteratively denoising random latents.
+
+        Args:
+            prompt: text prompt or list of prompts.
+            video_length: number of frames in the latent video.
+            height, width: output size in pixels; default to
+                ``unet.config.sample_size * vae_scale_factor`` and must be multiples of 8.
+            num_inference_steps: number of scheduler timesteps.
+            guidance_scale: classifier-free guidance weight; guidance is active only above 1.0.
+            negative_prompt: optional prompt(s) used for the unconditional branch.
+            num_videos_per_prompt: videos generated per prompt.
+            eta: forwarded to the scheduler step when it accepts ``eta``.
+            generator: forwarded to noise sampling and, when accepted, the scheduler step.
+            latents: optional pre-made initial latents.
+            output_type: ``"tensor"`` converts the decoded numpy video to a torch tensor.
+            return_dict: when false the raw video is returned instead of the output object.
+            callback: called as ``callback(i, t, latents)`` on progress-bar updates
+                where ``i % callback_steps == 0``.
+            callback_steps: positive integer callback interval.
+            **kwargs: accepted but not used in this method.
+
+        Returns:
+            ``TuneAVideoPipelineOutput`` or, if ``return_dict`` is false, the video itself.
+        """
+        # Default height and width to unet
+        height = height or self.unet.config.sample_size * self.vae_scale_factor
+        width = width or self.unet.config.sample_size * self.vae_scale_factor
+        # Check inputs. Raise error if not correct
+        self.check_inputs(prompt, height, width, callback_steps)
+        # Define call parameters
+        batch_size = 1 if isinstance(prompt, str) else len(prompt)
+        device = self._execution_device
+        # here `guidance_scale` is defined analog to the guidance weight `w` of equation (2)
+        # of the Imagen paper: https://arxiv.org/pdf/2205.11487.pdf . `guidance_scale = 1`
+        # corresponds to doing no classifier free guidance.
+        do_classifier_free_guidance = guidance_scale > 1.0
+        # Encode input prompt
+        text_embeddings = self._encode_prompt(
+            prompt, device, num_videos_per_prompt, do_classifier_free_guidance, negative_prompt
+        )
+        # Prepare timesteps
+        self.scheduler.set_timesteps(num_inference_steps, device=device)
+        timesteps = self.scheduler.timesteps
+        # Prepare latent variables
+        num_channels_latents = self.unet.in_channels
+        latents = self.prepare_latents(
+            batch_size * num_videos_per_prompt,
+            num_channels_latents,
+            video_length,
+            height,
+            width,
+            text_embeddings.dtype,
+            device,
+            generator,
+            latents,
+        )
+        latents_dtype = latents.dtype
+        # Prepare extra step kwargs.
+        extra_step_kwargs = self.prepare_extra_step_kwargs(generator, eta)
+        # Denoising loop
+        num_warmup_steps = len(timesteps) - num_inference_steps * self.scheduler.order
+        with self.progress_bar(total=num_inference_steps) as progress_bar:
+            for i, t in enumerate(timesteps):
+                # expand the latents if we are doing classifier free guidance
+                latent_model_input = torch.cat([latents] * 2) if do_classifier_free_guidance else latents
+                latent_model_input = self.scheduler.scale_model_input(latent_model_input, t)
+                # predict the noise residual
+                noise_pred = self.unet(latent_model_input, t, encoder_hidden_states=text_embeddings).sample.to(
+                    dtype=latents_dtype
+                )
+                # perform guidance
+                if do_classifier_free_guidance:
+                    # first half of the batch is unconditional, second half is text-conditioned
+                    noise_pred_uncond, noise_pred_text = noise_pred.chunk(2)
+                    noise_pred = noise_pred_uncond + guidance_scale * (noise_pred_text - noise_pred_uncond)
+                # compute the previous noisy sample x_t -> x_t-1
+                latents = self.scheduler.step(noise_pred, t, latents, **extra_step_kwargs).prev_sample
+                # call the callback, if provided
+                if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
+                    progress_bar.update()
+                    if callback is not None and i % callback_steps == 0:
+                        callback(i, t, latents)
+        # Post-processing
+        video = self.decode_latents(latents)
+        # Convert to tensor
+        if output_type == "tensor":
+            video = torch.from_numpy(video)
+        if not return_dict:
+            return video
+        return TuneAVideoPipelineOutput(videos=video)

video_diffusion/tuneavideo/tuneavideo_text2video.py ADDED Viewed

	@@ -0,0 +1,153 @@

+import gradio as gr
+import torch
+from video_diffusion.tuneavideo.models.unet import UNet3DConditionModel
+from video_diffusion.tuneavideo.pipelines.pipeline_tuneavideo import TuneAVideoPipeline
+from video_diffusion.tuneavideo.util import save_videos_grid
+from video_diffusion.utils.model_list import stable_model_list
+video_diffusion_model_list = [
+    "Tune-A-Video-library/a-man-is-surfing",
+    "Tune-A-Video-library/mo-di-bear-guitar",
+    "Tune-A-Video-library/redshift-man-skiing",
+]
+class TunaVideoText2VideoGenerator:
+    def __init__(self):
+        self.pipe = None
+        self.unet = None
+    def load_model(self, video_diffusion_model_list, stable_model_list):
+        if self.pipe is None:
+            if self.unet is None:
+                self.unet = UNet3DConditionModel.from_pretrained(
+                    video_diffusion_model_list, subfolder="unet", torch_dtype=torch.float16
+                ).to("cuda")
+            self.pipe = TuneAVideoPipeline.from_pretrained(
+                stable_model_list, unet=self.unet, torch_dtype=torch.float16
+            )
+            self.pipe.to("cuda")
+            self.pipe.enable_xformers_memory_efficient_attention()
+        return self.pipe
+    def generate_video(
+        self,
+        video_diffusion_model: str,
+        stable_model_list: str,
+        prompt: str,
+        negative_prompt: str,
+        video_length: int,
+        height: int,
+        width: int,
+        num_inference_steps: int,
+        guidance_scale: int,
+        fps: int,
+    ):
+        pipe = self.load_model(video_diffusion_model, stable_model_list)
+        video = pipe(
+            prompt,
+            negative_prompt=negative_prompt,
+            video_length=video_length,
+            height=height,
+            width=width,
+            num_inference_steps=num_inference_steps,
+            guidance_scale=guidance_scale,
+        ).videos
+        save_videos_grid(videos=video, path="output.gif", fps=fps)
+        return "output.gif"
+    def app():
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    tunevideo_video_diffusion_model_list = gr.Dropdown(
+                        choices=video_diffusion_model_list,
+                        label="Video Diffusion Model",
+                        value=video_diffusion_model_list[0],
+                    )
+                    tunevideo_stable_model_list = gr.Dropdown(
+                        choices=stable_model_list,
+                        label="Stable Model List",
+                        value=stable_model_list[0],
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            tunevideo_prompt = gr.Textbox(
+                                lines=1,
+                                placeholder="Prompt",
+                                show_label=False,
+                            )
+                            tunevideo_video_length = gr.Slider(
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=10,
+                                label="Video Length",
+                            )
+                            tunevideo_num_inference_steps = gr.Slider(
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=50,
+                                label="Num Inference Steps",
+                            )
+                            tunevideo_fps = gr.Slider(
+                                minimum=1,
+                                maximum=60,
+                                step=1,
+                                value=5,
+                                label="Fps",
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                tunevideo_negative_prompt = gr.Textbox(
+                                    lines=1,
+                                    placeholder="Negative Prompt",
+                                    show_label=False,
+                                )
+                                tunevideo_guidance_scale = gr.Slider(
+                                    minimum=1,
+                                    maximum=15,
+                                    step=1,
+                                    value=7.5,
+                                    label="Guidance Scale",
+                                )
+                                tunevideo_height = gr.Slider(
+                                    minimum=1,
+                                    maximum=1280,
+                                    step=32,
+                                    value=512,
+                                    label="Height",
+                                )
+                                tunevideo_width = gr.Slider(
+                                    minimum=1,
+                                    maximum=1280,
+                                    step=32,
+                                    value=512,
+                                    label="Width",
+                                )
+                    tunevideo_generate = gr.Button(value="Generator")
+                with gr.Column():
+                    tunevideo_output = gr.Video(label="Output")
+            tunevideo_generate.click(
+                fn=TunaVideoText2VideoGenerator().generate_video,
+                inputs=[
+                    tunevideo_video_diffusion_model_list,
+                    tunevideo_stable_model_list,
+                    tunevideo_prompt,
+                    tunevideo_negative_prompt,
+                    tunevideo_video_length,
+                    tunevideo_height,
+                    tunevideo_width,
+                    tunevideo_num_inference_steps,
+                    tunevideo_guidance_scale,
+                    tunevideo_fps,
+                ],
+                outputs=tunevideo_output,
+            )

video_diffusion/tuneavideo/util.py ADDED Viewed

	@@ -0,0 +1,93 @@

+import os
+from typing import Union
+import imageio
+import numpy as np
+import torch
+import torchvision
+from einops import rearrange
+from tqdm import tqdm
+def save_videos_grid(
+    videos: torch.Tensor, save_path: str = "output", path: str = "output.gif", rescale=False, n_rows=4, fps=3
+):
+    videos = rearrange(videos, "b c t h w -> t b c h w")
+    outputs = []
+    for x in videos:
+        x = torchvision.utils.make_grid(x, nrow=n_rows)
+        x = x.transpose(0, 1).transpose(1, 2).squeeze(-1)
+        if rescale:
+            x = (x + 1.0) / 2.0  # -1,1 -> 0,1
+        x = (x * 255).numpy().astype(np.uint8)
+        outputs.append(x)
+    if not os.path.exists(save_path):
+        os.makedirs(save_path)
+    imageio.mimsave(os.path.join(save_path, path), outputs, fps=fps)
+    return os.path.join(save_path, path)
+# DDIM Inversion
+@torch.no_grad()
+def init_prompt(prompt, pipeline):
+    uncond_input = pipeline.tokenizer(
+        [""], padding="max_length", max_length=pipeline.tokenizer.model_max_length, return_tensors="pt"
+    )
+    uncond_embeddings = pipeline.text_encoder(uncond_input.input_ids.to(pipeline.device))[0]
+    text_input = pipeline.tokenizer(
+        [prompt],
+        padding="max_length",
+        max_length=pipeline.tokenizer.model_max_length,
+        truncation=True,
+        return_tensors="pt",
+    )
+    text_embeddings = pipeline.text_encoder(text_input.input_ids.to(pipeline.device))[0]
+    context = torch.cat([uncond_embeddings, text_embeddings])
+    return context
+def next_step(
+    model_output: Union[torch.FloatTensor, np.ndarray],
+    timestep: int,
+    sample: Union[torch.FloatTensor, np.ndarray],
+    ddim_scheduler,
+):
+    timestep, next_timestep = (
+        min(timestep - ddim_scheduler.config.num_train_timesteps // ddim_scheduler.num_inference_steps, 999),
+        timestep,
+    )
+    alpha_prod_t = ddim_scheduler.alphas_cumprod[timestep] if timestep >= 0 else ddim_scheduler.final_alpha_cumprod
+    alpha_prod_t_next = ddim_scheduler.alphas_cumprod[next_timestep]
+    beta_prod_t = 1 - alpha_prod_t
+    next_original_sample = (sample - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
+    next_sample_direction = (1 - alpha_prod_t_next) ** 0.5 * model_output
+    next_sample = alpha_prod_t_next**0.5 * next_original_sample + next_sample_direction
+    return next_sample
+def get_noise_pred_single(latents, t, context, unet):
+    noise_pred = unet(latents, t, encoder_hidden_states=context)["sample"]
+    return noise_pred
+@torch.no_grad()
+def ddim_loop(pipeline, ddim_scheduler, latent, num_inv_steps, prompt):
+    context = init_prompt(prompt, pipeline)
+    uncond_embeddings, cond_embeddings = context.chunk(2)
+    all_latent = [latent]
+    latent = latent.clone().detach()
+    for i in tqdm(range(num_inv_steps)):
+        t = ddim_scheduler.timesteps[len(ddim_scheduler.timesteps) - i - 1]
+        noise_pred = get_noise_pred_single(latent, t, cond_embeddings, pipeline.unet)
+        latent = next_step(noise_pred, t, latent, ddim_scheduler)
+        all_latent.append(latent)
+    return all_latent
+@torch.no_grad()
+def ddim_inversion(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt=""):
+    ddim_latents = ddim_loop(pipeline, ddim_scheduler, video_latent, num_inv_steps, prompt)
+    return ddim_latents

video_diffusion/utils/__init__.py ADDED Viewed

File without changes

video_diffusion/utils/model_list.py ADDED Viewed

	@@ -0,0 +1,6 @@

+# Base-model ids offered in the UI model dropdowns; the first entry is the default choice.
+# NOTE(review): "cerspense/zeroscope_v2_576w" looks like a text-to-video checkpoint rather
+# than a Stable Diffusion image model — confirm it loads with the pipelines that use this list.
+stable_model_list = [
+    "runwayml/stable-diffusion-v1-5",
+    "stabilityai/stable-diffusion-2-1",
+    # "prompthero/openjourney-v4",
+    "cerspense/zeroscope_v2_576w"
+]

video_diffusion/utils/scheduler_list.py ADDED Viewed

	@@ -0,0 +1,32 @@

+from diffusers import (
+    DDIMScheduler,
+    DPMSolverMultistepScheduler,
+    EulerAncestralDiscreteScheduler,
+    EulerDiscreteScheduler,
+    HeunDiscreteScheduler,
+    LMSDiscreteScheduler,
+)
+diff_scheduler_list = ["DDIM", "EulerA", "Euler", "LMS", "Heun", "UniPC", "DPMSolver"]
+def get_scheduler_list(pipe, scheduler):
+    if scheduler == diff_scheduler_list[0]:
+        pipe.scheduler = DDIMScheduler.from_config(pipe.scheduler.config)
+    elif scheduler == diff_scheduler_list[1]:
+        pipe.scheduler = EulerAncestralDiscreteScheduler.from_config(pipe.scheduler.config)
+    elif scheduler == diff_scheduler_list[2]:
+        pipe.scheduler = EulerDiscreteScheduler.from_config(pipe.scheduler.config)
+    elif scheduler == diff_scheduler_list[3]:
+        pipe.scheduler = LMSDiscreteScheduler.from_config(pipe.scheduler.config)
+    elif scheduler == diff_scheduler_list[4]:
+        pipe.scheduler = HeunDiscreteScheduler.from_config(pipe.scheduler.config)
+    elif scheduler == diff_scheduler_list[5]:
+        pipe.scheduler = DPMSolverMultistepScheduler.from_config(pipe.scheduler.config)
+    return pipe

video_diffusion/zero_shot/zero_shot_text2video.py ADDED Viewed

	@@ -0,0 +1,164 @@

+import gradio as gr
+import imageio
+import torch
+from diffusers import TextToVideoZeroPipeline
+from video_diffusion.tuneavideo.util import save_videos_grid
+from video_diffusion.utils.model_list import stable_model_list
+class ZeroShotText2VideoGenerator:
+    def __init__(self):
+        self.pipe = None
+    def load_model(self, model_id):
+        if self.pipe is None:
+            self.pipe = TextToVideoZeroPipeline.from_pretrained(model_id, torch_dtype=torch.float16).to("cuda")
+            self.pipe.to("cuda")
+            self.pipe.enable_xformers_memory_efficient_attention()
+            self.pipe.enable_attention_slicing()
+        return self.pipe
+    def generate_video(
+        self,
+        prompt,
+        negative_prompt,
+        model_id,
+        height,
+        width,
+        video_length,
+        guidance_scale,
+        fps,
+        t0,
+        t1,
+        motion_field_strength_x,
+        motion_field_strength_y,
+    ):
+        pipe = self.load_model(model_id)
+        result = pipe(
+            prompt=prompt,
+            negative_prompt=negative_prompt,
+            height=height,
+            width=width,
+            video_length=video_length,
+            guidance_scale=guidance_scale,
+            t0=t0,
+            t1=t1,
+            motion_field_strength_x=motion_field_strength_x,
+            motion_field_strength_y=motion_field_strength_y,
+        ).images
+        result = [(r * 255).astype("uint8") for r in result]
+        imageio.mimsave("video.mp4", result, fps=fps)
+        return "video.mp4"
+    def app():
+        with gr.Blocks():
+            with gr.Row():
+                with gr.Column():
+                    zero_shot_text2video_prompt = gr.Textbox(
+                        lines=1,
+                        placeholder="Prompt",
+                        show_label=False,
+                    )
+                    zero_shot_text2video_negative_prompt = gr.Textbox(
+                        lines=1,
+                        placeholder="Negative Prompt",
+                        show_label=False,
+                    )
+                    zero_shot_text2video_model_id = gr.Dropdown(
+                        choices=stable_model_list,
+                        label="Stable Model List",
+                        value=stable_model_list[0],
+                    )
+                    with gr.Row():
+                        with gr.Column():
+                            zero_shot_text2video_guidance_scale = gr.Slider(
+                                label="Guidance Scale",
+                                minimum=1,
+                                maximum=15,
+                                step=1,
+                                value=7.5,
+                            )
+                            zero_shot_text2video_video_length = gr.Slider(
+                                label="Video Length",
+                                minimum=1,
+                                maximum=100,
+                                step=1,
+                                value=10,
+                            )
+                            zero_shot_text2video_t0 = gr.Slider(
+                                label="Timestep T0",
+                                minimum=0,
+                                maximum=100,
+                                step=1,
+                                value=44,
+                            )
+                            zero_shot_text2video_motion_field_strength_x = gr.Slider(
+                                label="Motion Field Strength X",
+                                minimum=0,
+                                maximum=100,
+                                step=1,
+                                value=12,
+                            )
+                            zero_shot_text2video_fps = gr.Slider(
+                                label="Fps",
+                                minimum=1,
+                                maximum=60,
+                                step=1,
+                                value=10,
+                            )
+                        with gr.Row():
+                            with gr.Column():
+                                zero_shot_text2video_height = gr.Slider(
+                                    label="Height",
+                                    minimum=128,
+                                    maximum=1280,
+                                    step=32,
+                                    value=512,
+                                )
+                                zero_shot_text2video_width = gr.Slider(
+                                    label="Width",
+                                    minimum=128,
+                                    maximum=1280,
+                                    step=32,
+                                    value=512,
+                                )
+                                zero_shot_text2video_t1 = gr.Slider(
+                                    label="Timestep T1",
+                                    minimum=0,
+                                    maximum=100,
+                                    step=1,
+                                    value=47,
+                                )
+                                zero_shot_text2video_motion_field_strength_y = gr.Slider(
+                                    label="Motion Field Strength Y",
+                                    minimum=0,
+                                    maximum=100,
+                                    step=1,
+                                    value=12,
+                                )
+                    zero_shot_text2video_button = gr.Button(value="Generator")
+                with gr.Column():
+                    zero_shot_text2video_output = gr.Video(label="Output")
+            zero_shot_text2video_button.click(
+                fn=ZeroShotText2VideoGenerator().generate_video,
+                inputs=[
+                    zero_shot_text2video_prompt,
+                    zero_shot_text2video_negative_prompt,
+                    zero_shot_text2video_model_id,
+                    zero_shot_text2video_height,
+                    zero_shot_text2video_width,
+                    zero_shot_text2video_video_length,
+                    zero_shot_text2video_guidance_scale,
+                    zero_shot_text2video_fps,
+                    zero_shot_text2video_t0,
+                    zero_shot_text2video_t1,
+                    zero_shot_text2video_motion_field_strength_x,
+                    zero_shot_text2video_motion_field_strength_y,
+                ],
+                outputs=zero_shot_text2video_output,
+            )