Spaces:

Pie31415
/

control-animation

Build error

App Files Files Community

Pie31415 committed on May 8, 2023

Commit

71e9a42

1 Parent(s): f5e4df7

gigant merge

Browse files

Files changed (19) hide show

__assets__/run.gif +0 -0
__assets__/run.mp4 +0 -0
__assets__/walk_01.gif +0 -0
__assets__/walk_01.mp4 +0 -0
__assets__/walk_02.gif +0 -0
__assets__/walk_02.mp4 +0 -0
__assets__/walk_03.gif +0 -0
__assets__/walk_03.mp4 +0 -0
__assets__/walk_04.gif +0 -0
__assets__/walk_04.mp4 +0 -0
app.py +1 -1
text_to_animation/model.py +84 -39
text_to_animation/models/controlnet_flax.py +12 -29
text_to_animation/models/cross_frame_attention_flax.py +336 -1
text_to_animation/models/unet_2d_blocks_flax.py +237 -2
text_to_animation/models/unet_2d_condition_flax.py +325 -31
text_to_animation/pipelines/text_to_video_pipeline_flax.py +267 -635
utils/gradio_utils.py +5 -1
webui/app_control_animation.py +77 -45

__assets__/run.gif ADDED Viewed

__assets__/run.mp4 ADDED Viewed

Binary file (13.2 kB). View file

__assets__/walk_01.gif ADDED Viewed

__assets__/walk_01.mp4 ADDED Viewed

Binary file (33.2 kB). View file

__assets__/walk_02.gif ADDED Viewed

__assets__/walk_02.mp4 ADDED Viewed

Binary file (47.6 kB). View file

__assets__/walk_03.gif ADDED Viewed

__assets__/walk_03.mp4 ADDED Viewed

Binary file (43.2 kB). View file

__assets__/walk_04.gif ADDED Viewed

__assets__/walk_04.mp4 ADDED Viewed

Binary file (64.9 kB). View file

app.py CHANGED Viewed

@@ -11,7 +11,7 @@ import jax.numpy as jnp
 huggingspace_name = os.environ.get("SPACE_AUTHOR_NAME")
 on_huggingspace = huggingspace_name if huggingspace_name is not None else False
-model = ControlAnimationModel(device="cuda", dtype=jnp.float16)
 parser = argparse.ArgumentParser()
 parser.add_argument(

 huggingspace_name = os.environ.get("SPACE_AUTHOR_NAME")
 on_huggingspace = huggingspace_name if huggingspace_name is not None else False
+model = ControlAnimationModel(dtype=jnp.float16)
 parser = argparse.ArgumentParser()
 parser.add_argument(

text_to_animation/model.py CHANGED Viewed

@@ -19,10 +19,10 @@ from diffusers import (
     FlaxAutoencoderKL,
     FlaxStableDiffusionControlNetPipeline,
     StableDiffusionPipeline,
-    FlaxUNet2DConditionModel,
 )
 from text_to_animation.models.unet_2d_condition_flax import (
-    FlaxUNet2DConditionModel as CustomFlaxUNet2DConditionModel,
 )
 from diffusers import FlaxControlNetModel
@@ -82,10 +82,10 @@ class ControlAnimationModel:
         feature_extractor = CLIPFeatureExtractor.from_pretrained(
             model_id, subfolder="feature_extractor"
         )
-        unet, unet_params = CustomFlaxUNet2DConditionModel.from_pretrained(
             model_id, subfolder="unet", from_pt=True, dtype=self.dtype
         )
-        unet_vanilla, _ = FlaxUNet2DConditionModel.from_pretrained(
             model_id, subfolder="unet", from_pt=True, dtype=self.dtype
         )
         vae, vae_params = FlaxAutoencoderKL.from_pretrained(
@@ -141,8 +141,9 @@ class ControlAnimationModel:
         seeds = [seed for seed in jax.random.randint(self.rng, [num_imgs], 0, 65536)]
         prngs = [jax.random.PRNGKey(seed) for seed in seeds]
         images = self.pipe.generate_starting_frames(
-            params=self.params,
             prngs=prngs,
             controlnet_image=control,
             prompt=prompts,
@@ -153,30 +154,66 @@ class ControlAnimationModel:
         return images
     def generate_animation(
         self,
-        prompt: str,
-        initial_frame_index: int,
-        input_video_path: str,
-        model_link: str = "dreamlike-art/dreamlike-photoreal-2.0",
-        motion_field_strength_x: int = 12,
-        motion_field_strength_y: int = 12,
-        t0: int = 44,
-        t1: int = 47,
-        n_prompt: str = "",
-        chunk_size: int = 8,
-        video_length: int = 8,
-        merging_ratio: float = 0.0,
-        seed: int = 0,
-        resolution: int = 512,
-        fps: int = 2,
-        use_cf_attn: bool = True,
-        use_motion_field: bool = True,
-        smooth_bg: bool = False,
-        smooth_bg_strength: float = 0.4,
-        path: str = None,
     ):
-        video_path = gradio_utils.motion_to_video_path(video_path)
         # added_prompt = 'best quality, HD, clay stop-motion, claymation, HQ, masterpiece, art, smooth'
         # added_prompt = 'high quality, anatomically correct, clay stop-motion, aardman, claymation, smooth'
@@ -187,18 +224,26 @@ class ControlAnimationModel:
             video_path, resolution, None, self.dtype, False, output_fps=4
         )
         control = utils.pre_process_pose(video, apply_pose_detect=False)
-        f, _, h, w = video.shape
         prng_seed = jax.random.PRNGKey(seed)
-        vid = self.pipe.generate_video(
-            prompt,
-            image=control,
-            params=self.params,
-            prng_seed=prng_seed,
-            neg_prompt="",
-            controlnet_conditioning_scale=1.0,
-            motion_field_strength_x=3,
-            motion_field_strength_y=4,
-            jit=True,
-        ).image
         return utils.create_gif(np.array(vid), 4, path=None, watermark=None)

     FlaxAutoencoderKL,
     FlaxStableDiffusionControlNetPipeline,
     StableDiffusionPipeline,
+    FlaxUNet2DConditionModel as VanillaFlaxUNet2DConditionModel,
 )
 from text_to_animation.models.unet_2d_condition_flax import (
+    FlaxUNet2DConditionModel
 )
 from diffusers import FlaxControlNetModel
         feature_extractor = CLIPFeatureExtractor.from_pretrained(
             model_id, subfolder="feature_extractor"
         )
+        unet, unet_params = FlaxUNet2DConditionModel.from_pretrained(
             model_id, subfolder="unet", from_pt=True, dtype=self.dtype
         )
+        unet_vanilla = VanillaFlaxUNet2DConditionModel.from_config(
             model_id, subfolder="unet", from_pt=True, dtype=self.dtype
         )
         vae, vae_params = FlaxAutoencoderKL.from_pretrained(
         seeds = [seed for seed in jax.random.randint(self.rng, [num_imgs], 0, 65536)]
         prngs = [jax.random.PRNGKey(seed) for seed in seeds]
+        print(seeds)
         images = self.pipe.generate_starting_frames(
+            params=self.p_params,
             prngs=prngs,
             controlnet_image=control,
             prompt=prompts,
         return images
+    def generate_video_from_frame(self, controlnet_video, prompt, seed, neg_prompt=""):
+        # generate a video using the seed provided
+        prng_seed = jax.random.PRNGKey(seed)
+        len_vid = controlnet_video.shape[0]
+        # print(f"Generating video from prompt {'<aardman> style '+ prompt}, with {controlnet_video.shape[0]} frames and prng seed {seed}")
+        added_prompt = "high quality, best quality, HD, clay stop-motion, claymation, HQ, masterpiece, art, smooth"
+        prompts = added_prompt + ", " + prompt
+        added_n_prompt = "longbody, lowres, bad anatomy, bad hands, missing fingers, extra digit, fewer difits, cropped, worst quality, low quality, deformed body, bloated, ugly"
+        negative_prompts = added_n_prompt + ", " + neg_prompt
+        # prompt_ids = self.pipe.prepare_text_inputs(["aardman style "+ prompt]*len_vid)
+        # n_prompt_ids = self.pipe.prepare_text_inputs([neg_prompt]*len_vid)
+        prompt_ids = self.pipe.prepare_text_inputs([prompts]*len_vid)
+        n_prompt_ids = self.pipe.prepare_text_inputs([negative_prompts]*len_vid)
+        prng = replicate_devices(prng_seed) #jax.random.split(prng, jax.device_count())
+        image = replicate_devices(controlnet_video)
+        prompt_ids = replicate_devices(prompt_ids)
+        n_prompt_ids = replicate_devices(n_prompt_ids)
+        motion_field_strength_x = replicate_devices(jnp.array(3))
+        motion_field_strength_y = replicate_devices(jnp.array(4))
+        smooth_bg_strength = replicate_devices(jnp.array(0.8))
+        vid = (self.pipe(image=image,
+                        prompt_ids=prompt_ids,
+                        neg_prompt_ids=n_prompt_ids,
+                        params=self.p_params,
+                        prng_seed=prng,
+                        jit = True,
+                        smooth_bg_strength=smooth_bg_strength,
+                        motion_field_strength_x=motion_field_strength_x,
+                        motion_field_strength_y=motion_field_strength_y,
+                        ).images)[0]
+        return utils.create_gif(np.array(vid), 4, path=None, watermark=None)
     def generate_animation(
         self,
+        prompt, #: str,
+        initial_frame_index, #: int,
+        input_video_path, #: str,
+        model_link = None,#: str = "dreamlike-art/dreamlike-photoreal-2.0",
+        motion_field_strength_x = 12,#: int = 12,
+        motion_field_strength_y= 12,#: int = 12,
+        t0= 44,#: int = 44,
+        t1= 47,#: int = 47,
+        n_prompt= "",#: str = "",
+        chunk_size= 8, #: int = 8,
+        video_length = 8, #: int = 8,
+        merging_ratio = 0., #: float = 0.0,
+        seed= 0,#: int = 0,
+        resolution=512,#: int = 512,
+        fps=2,#: int = 2,
+        use_cf_attn=True,#: bool = True,
+        use_motion_field=True,#: bool = True,
+        smooth_bg=False,#: bool = False,
+        smooth_bg_strength=0.4,#: float = 0.4,
+        path=None,#: str = None,
     ):
+        video_path = gradio_utils.motion_to_video_path(input_video_path)
         # added_prompt = 'best quality, HD, clay stop-motion, claymation, HQ, masterpiece, art, smooth'
         # added_prompt = 'high quality, anatomically correct, clay stop-motion, aardman, claymation, smooth'
             video_path, resolution, None, self.dtype, False, output_fps=4
         )
         control = utils.pre_process_pose(video, apply_pose_detect=False)
+        len_vid, _, h, w = video.shape
         prng_seed = jax.random.PRNGKey(seed)
+        prompts = prompt
+        prompt_ids = self.pipe.prepare_text_inputs([prompts]*len_vid)
+        n_prompt_ids = self.pipe.prepare_text_inputs([negative_prompts]*len_vid)
+        prng = replicate_devices(prng_seed) #jax.random.split(prng, jax.device_count())
+        image = replicate_devices(control)
+        prompt_ids = replicate_devices(prompt_ids)
+        n_prompt_ids = replicate_devices(n_prompt_ids)
+        motion_field_strength_x = replicate_devices(jnp.array(motion_field_strength_x))
+        motion_field_strength_y = replicate_devices(jnp.array(motion_field_strength_y))
+        smooth_bg_strength = replicate_devices(jnp.array(smooth_bg_strength))
+        vid = (self.pipe(image=image,
+                        prompt_ids=prompt_ids,
+                        neg_prompt_ids=n_prompt_ids,
+                        params=self.p_params,
+                        prng_seed=prng,
+                        jit = True,
+                        smooth_bg_strength=smooth_bg_strength,
+                        motion_field_strength_x=motion_field_strength_x,
+                        motion_field_strength_y=motion_field_strength_y,
+                        ).images)[0]
         return utils.create_gif(np.array(vid), 4, path=None, watermark=None)

text_to_animation/models/controlnet_flax.py CHANGED Viewed

@@ -23,12 +23,10 @@ from diffusers.configuration_utils import ConfigMixin, flax_register_to_config
 from diffusers.utils import BaseOutput
 from diffusers.models.embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
 from diffusers.models.modeling_flax_utils import FlaxModelMixin
-from diffusers.models.unet_2d_blocks_flax import (
     FlaxCrossAttnDownBlock2D,
-    FlaxCrossAttnUpBlock2D,
     FlaxDownBlock2D,
-    FlaxUNetMidBlock2DCrossAttn,
-    FlaxUpBlock2D,
 )
@@ -171,18 +169,14 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
         sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
         sample = jnp.zeros(sample_shape, dtype=jnp.float32)
         timesteps = jnp.ones((1,), dtype=jnp.int32)
-        encoder_hidden_states = jnp.zeros(
-            (1, 1, self.cross_attention_dim), dtype=jnp.float32
-        )
         controlnet_cond_shape = (1, 3, self.sample_size * 8, self.sample_size * 8)
         controlnet_cond = jnp.zeros(controlnet_cond_shape, dtype=jnp.float32)
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
-        return self.init(
-            rngs, sample, timesteps, encoder_hidden_states, controlnet_cond
-        )["params"]
     def setup(self):
         block_out_channels = self.block_out_channels
@@ -199,9 +193,7 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
         # time
         self.time_proj = FlaxTimesteps(
-            block_out_channels[0],
-            flip_sin_to_cos=self.flip_sin_to_cos,
-            freq_shift=self.config.freq_shift,
         )
         self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
@@ -290,7 +282,7 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
         # mid
         mid_block_channel = block_out_channels[-1]
-        self.mid_block = FlaxUNetMidBlock2DCrossAttn(
             in_channels=mid_block_channel,
             dropout=self.dropout,
             attn_num_head_channels=attention_head_dim[-1],
@@ -361,23 +353,17 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
         down_block_res_samples = (sample,)
         for down_block in self.down_blocks:
             if isinstance(down_block, FlaxCrossAttnDownBlock2D):
-                sample, res_samples = down_block(
-                    sample, t_emb, encoder_hidden_states, deterministic=not train
-                )
             else:
                 sample, res_samples = down_block(sample, t_emb, deterministic=not train)
             down_block_res_samples += res_samples
         # 4. mid
-        sample = self.mid_block(
-            sample, t_emb, encoder_hidden_states, deterministic=not train
-        )
         # 5. controlnet blocks
         controlnet_down_block_res_samples = ()
-        for down_block_res_sample, controlnet_block in zip(
-            down_block_res_samples, self.controlnet_down_blocks
-        ):
             down_block_res_sample = controlnet_block(down_block_res_sample)
             controlnet_down_block_res_samples += (down_block_res_sample,)
@@ -386,15 +372,12 @@ class FlaxControlNetModel(nn.Module, FlaxModelMixin, ConfigMixin):
         mid_block_res_sample = self.controlnet_mid_block(sample)
         # 6. scaling
-        down_block_res_samples = [
-            sample * conditioning_scale for sample in down_block_res_samples
-        ]
         mid_block_res_sample *= conditioning_scale
         if not return_dict:
             return (down_block_res_samples, mid_block_res_sample)
         return FlaxControlNetOutput(
-            down_block_res_samples=down_block_res_samples,
-            mid_block_res_sample=mid_block_res_sample,
-        )

 from diffusers.utils import BaseOutput
 from diffusers.models.embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
 from diffusers.models.modeling_flax_utils import FlaxModelMixin
+from .unet_2d_blocks_flax import (
     FlaxCrossAttnDownBlock2D,
     FlaxDownBlock2D,
+    FlaxUNetCrossAttnMidBlock2D,
 )
         sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
         sample = jnp.zeros(sample_shape, dtype=jnp.float32)
         timesteps = jnp.ones((1,), dtype=jnp.int32)
+        encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=jnp.float32)
         controlnet_cond_shape = (1, 3, self.sample_size * 8, self.sample_size * 8)
         controlnet_cond = jnp.zeros(controlnet_cond_shape, dtype=jnp.float32)
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
+        return self.init(rngs, sample, timesteps, encoder_hidden_states, controlnet_cond)["params"]
     def setup(self):
         block_out_channels = self.block_out_channels
         # time
         self.time_proj = FlaxTimesteps(
+            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
         )
         self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
         # mid
         mid_block_channel = block_out_channels[-1]
+        self.mid_block = FlaxUNetCrossAttnMidBlock2D(
             in_channels=mid_block_channel,
             dropout=self.dropout,
             attn_num_head_channels=attention_head_dim[-1],
         down_block_res_samples = (sample,)
         for down_block in self.down_blocks:
             if isinstance(down_block, FlaxCrossAttnDownBlock2D):
+                sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
             else:
                 sample, res_samples = down_block(sample, t_emb, deterministic=not train)
             down_block_res_samples += res_samples
         # 4. mid
+        sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
         # 5. controlnet blocks
         controlnet_down_block_res_samples = ()
+        for down_block_res_sample, controlnet_block in zip(down_block_res_samples, self.controlnet_down_blocks):
             down_block_res_sample = controlnet_block(down_block_res_sample)
             controlnet_down_block_res_samples += (down_block_res_sample,)
         mid_block_res_sample = self.controlnet_mid_block(sample)
         # 6. scaling
+        down_block_res_samples = [sample * conditioning_scale for sample in down_block_res_samples]
         mid_block_res_sample *= conditioning_scale
         if not return_dict:
             return (down_block_res_samples, mid_block_res_sample)
         return FlaxControlNetOutput(
+            down_block_res_samples=down_block_res_samples, mid_block_res_sample=mid_block_res_sample
+        )

text_to_animation/models/cross_frame_attention_flax.py CHANGED Viewed

@@ -19,6 +19,8 @@ import flax.linen as nn
 import jax
 import jax.numpy as jnp
 # from diffusers.models.attention_flax import FlaxBasicTransformerBlock
 from diffusers.models.attention_flax import FlaxFeedForward, jax_memory_efficient_attention
@@ -32,7 +34,7 @@ def rearrange_4(array):
 class FlaxCrossFrameAttention(nn.Module):
     r"""
-    A Flax multi-head attention module, with cross-frame attention as described in: https://arxiv.org/abs/2303.13439
     Parameters:
         query_dim (:obj:`int`):
@@ -50,6 +52,7 @@ class FlaxCrossFrameAttention(nn.Module):
         batch_size: The number that represents actual batch size, other than the frames.
             For example, when calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
             equal to 2, due to classifier-free guidance.
     """
     query_dim: int
     heads: int = 8
@@ -152,6 +155,173 @@ class FlaxCrossFrameAttention(nn.Module):
         hidden_states = self.proj_attn(hidden_states)
         return hidden_states
 class FlaxBasicTransformerBlock(nn.Module):
     r"""
     A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
@@ -222,6 +392,76 @@ class FlaxBasicTransformerBlock(nn.Module):
         return hidden_states
 class FlaxCrossFrameTransformer2DModel(nn.Module):
     r"""
@@ -320,4 +560,99 @@ class FlaxCrossFrameTransformer2DModel(nn.Module):
         hidden_states = hidden_states + residual
         return hidden_states

 import jax
 import jax.numpy as jnp
+from einops import repeat
 # from diffusers.models.attention_flax import FlaxBasicTransformerBlock
 from diffusers.models.attention_flax import FlaxFeedForward, jax_memory_efficient_attention
 class FlaxCrossFrameAttention(nn.Module):
     r"""
+    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
     Parameters:
         query_dim (:obj:`int`):
         batch_size: The number that represents actual batch size, other than the frames.
             For example, when calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
             equal to 2, due to classifier-free guidance.
     """
     query_dim: int
     heads: int = 8
         hidden_states = self.proj_attn(hidden_states)
         return hidden_states
+class FlaxLoRALinearLayer(nn.Module):
+    out_features: int
+    dtype: jnp.dtype = jnp.float32
+    rank: int=4
+    def setup(self):
+        self.down = nn.Dense(self.rank, use_bias=False, kernel_init=nn.initializers.normal(stddev=1 / self.rank), dtype=self.dtype, name="down_lora")
+        self.up = nn.Dense(self.out_features, use_bias=False, kernel_init=nn.initializers.zeros, dtype=self.dtype, name="up_lora")
+    def __call__(self, hidden_states):
+        down_hidden_states = self.down(hidden_states)
+        up_hidden_states = self.up(down_hidden_states)
+        return up_hidden_states
+class LoRAPositionalEncoding(nn.Module):
+    d_model : int         # Hidden dimensionality of the input.
+    rank: int=4
+    dtype: jnp.dtype = jnp.float32
+    max_len : int = 200  # Maximum length of a sequence to expect.
+    def setup(self):
+        # Create matrix of [SeqLen, HiddenDim] representing the positional encoding for max_len inputs
+        pe = jnp.zeros((self.max_len, self.d_model), dtype=self.dtype)
+        position = jnp.arange(0, self.max_len, dtype=self.dtype)[:,None]
+        div_term = jnp.exp(jnp.arange(0, self.d_model, 2) * (-jnp.log(10000.0) / self.d_model))
+        pe = pe.at[:, 0::2].set(jnp.sin(position * div_term))
+        pe = pe.at[:, 1::2].set(jnp.cos(position * div_term))
+        self.pe = pe
+        self.lora_pe = FlaxLoRALinearLayer(self.d_model, rank=self.rank, dtype=self.dtype)
+    def __call__(self, x):
+        #x is (F // f, f, D, C)
+        b, f, d, c = x.shape
+        pe = repeat(self.lora_pe(self.pe[:f]), 'f c -> b f d c', b=b, d=d)
+        return x + pe
+class FlaxLoRACrossFrameAttention(nn.Module):
+    r"""
+    A Flax multi-head attention module as described in: https://arxiv.org/abs/1706.03762
+    Parameters:
+        query_dim (:obj:`int`):
+            Input hidden states dimension
+        heads (:obj:`int`, *optional*, defaults to 8):
+            Number of heads
+        dim_head (:obj:`int`, *optional*, defaults to 64):
+            Hidden states dimension inside each head
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+        batch_size: The number that represents actual batch size, other than the frames.
+            For example, when calling unet with a single prompt and num_images_per_prompt=1, batch_size should be
+            equal to 2, due to classifier-free guidance.
+    """
+    query_dim: int
+    heads: int = 8
+    dim_head: int = 64
+    dropout: float = 0.0
+    use_memory_efficient_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    batch_size : int = 2
+    rank: int=4
+    def setup(self):
+        inner_dim = self.dim_head * self.heads
+        self.scale = self.dim_head**-0.5
+        # Weights were exported with old names {to_q, to_k, to_v, to_out}
+        self.query = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_q")
+        self.key = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_k")
+        self.value = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype, name="to_v")
+        self.add_k_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
+        self.add_v_proj = nn.Dense(inner_dim, use_bias=False, dtype=self.dtype)
+        self.proj_attn = nn.Dense(self.query_dim, dtype=self.dtype, name="to_out_0")
+        self.to_q_lora = FlaxLoRALinearLayer(inner_dim, rank=self.rank, dtype=self.dtype)
+        self.to_k_lora = FlaxLoRALinearLayer(inner_dim, rank=self.rank, dtype=self.dtype)
+        self.to_v_lora = FlaxLoRALinearLayer(inner_dim, rank=self.rank, dtype=self.dtype)
+        self.to_out_lora = FlaxLoRALinearLayer(inner_dim, rank=self.rank, dtype=self.dtype)
+    def reshape_heads_to_batch_dim(self, tensor):
+        batch_size, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = tensor.reshape(batch_size, seq_len, head_size, dim // head_size)
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        tensor = tensor.reshape(batch_size * head_size, seq_len, dim // head_size)
+        return tensor
+    def reshape_batch_dim_to_heads(self, tensor):
+        batch_size, seq_len, dim = tensor.shape
+        head_size = self.heads
+        tensor = tensor.reshape(batch_size // head_size, head_size, seq_len, dim)
+        tensor = jnp.transpose(tensor, (0, 2, 1, 3))
+        tensor = tensor.reshape(batch_size // head_size, seq_len, dim * head_size)
+        return tensor
+    def __call__(self, hidden_states, context=None, deterministic=True, scale=1.):
+        is_cross_attention = context is not None
+        context = hidden_states if context is None else context
+        query_proj = self.query(hidden_states) + scale * self.to_q_lora(hidden_states)
+        key_proj = self.key(context) + scale * self.to_k_lora(context)
+        value_proj = self.value(context) + scale * self.to_v_lora(context)
+        # Sparse Attention
+        if not is_cross_attention:
+            video_length = 1 if key_proj.shape[0] < self.batch_size else key_proj.shape[0] // self.batch_size
+            first_frame_index = [0] * video_length
+            #first frame ==> previous frame
+            previous_frame_index = jnp.array([0] + list(range(video_length - 1)))
+            # rearrange keys to have batch and frames in the 1st and 2nd dims respectively
+            key_proj = rearrange_3(key_proj, video_length)
+            key_proj = key_proj[:, first_frame_index]
+            # rearrange values to have batch and frames in the 1st and 2nd dims respectively
+            value_proj = rearrange_3(value_proj, video_length)
+            value_proj = value_proj[:, first_frame_index]
+            # rearrange back to original shape
+            key_proj = rearrange_4(key_proj)
+            value_proj = rearrange_4(value_proj)
+        query_states = self.reshape_heads_to_batch_dim(query_proj)
+        key_states = self.reshape_heads_to_batch_dim(key_proj)
+        value_states = self.reshape_heads_to_batch_dim(value_proj)
+        if self.use_memory_efficient_attention:
+            query_states = query_states.transpose(1, 0, 2)
+            key_states = key_states.transpose(1, 0, 2)
+            value_states = value_states.transpose(1, 0, 2)
+            # this if statement creates a chunk size for each layer of the unet
+            # the chunk size is equal to the query_length dimension of the deepest layer of the unet
+            flatten_latent_dim = query_states.shape[-3]
+            if flatten_latent_dim % 64 == 0:
+                query_chunk_size = int(flatten_latent_dim / 64)
+            elif flatten_latent_dim % 16 == 0:
+                query_chunk_size = int(flatten_latent_dim / 16)
+            elif flatten_latent_dim % 4 == 0:
+                query_chunk_size = int(flatten_latent_dim / 4)
+            else:
+                query_chunk_size = int(flatten_latent_dim)
+            hidden_states = jax_memory_efficient_attention(
+                query_states, key_states, value_states, query_chunk_size=query_chunk_size, key_chunk_size=4096 * 4
+            )
+            hidden_states = hidden_states.transpose(1, 0, 2)
+        else:
+            # compute attentions
+            attention_scores = jnp.einsum("b i d, b j d->b i j", query_states, key_states)
+            attention_scores = attention_scores * self.scale
+            attention_probs = nn.softmax(attention_scores, axis=2)
+            # attend to values
+            hidden_states = jnp.einsum("b i j, b j d -> b i d", attention_probs, value_states)
+        hidden_states = self.reshape_batch_dim_to_heads(hidden_states)
+        hidden_states = self.proj_attn(hidden_states) + scale * self.to_out_lora(hidden_states)
+        return hidden_states
 class FlaxBasicTransformerBlock(nn.Module):
     r"""
     A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
         return hidden_states
+class FlaxLoRABasicTransformerBlock(nn.Module):
+    r"""
+    A Flax transformer block layer with `GLU` (Gated Linear Unit) activation function as described in:
+    https://arxiv.org/abs/1706.03762
+    Parameters:
+        dim (:obj:`int`):
+            Inner hidden states dimension
+        n_heads (:obj:`int`):
+            Number of heads
+        d_head (:obj:`int`):
+            Hidden states dimension inside each head
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        only_cross_attention (`bool`, defaults to `False`):
+            Whether to only apply cross attention.
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+    """
+    dim: int
+    n_heads: int
+    d_head: int
+    dropout: float = 0.0
+    only_cross_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    use_memory_efficient_attention: bool = False
+    def setup(self):
+        # self attention (or cross_attention if only_cross_attention is True)
+        self.attn1 = FlaxLoRACrossFrameAttention(
+                                        self.dim, self.n_heads, self.d_head, self.dropout, self.use_memory_efficient_attention, dtype=self.dtype,
+                                        )
+        # cross attention
+        self.attn2 = FlaxLoRACrossFrameAttention(
+                                        self.dim, self.n_heads, self.d_head, self.dropout, self.use_memory_efficient_attention, dtype=self.dtype,
+                                        )
+        self.ff = FlaxFeedForward(dim=self.dim, dropout=self.dropout, dtype=self.dtype)
+        self.norm1 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+        self.norm2 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+        self.norm3 = nn.LayerNorm(epsilon=1e-5, dtype=self.dtype)
+    def __call__(self, hidden_states, context, deterministic=True, scale=1.):
+        # self attention
+        residual = hidden_states
+        if self.only_cross_attention:
+            hidden_states = self.attn1(self.norm1(hidden_states), context, deterministic=deterministic, scale=scale)
+        else:
+            hidden_states = self.attn1(self.norm1(hidden_states), deterministic=deterministic, scale=scale)
+        hidden_states = hidden_states + residual
+        # cross attention
+        residual = hidden_states
+        hidden_states = self.attn2(self.norm2(hidden_states), context, deterministic=deterministic, scale=scale)
+        hidden_states = hidden_states + residual
+        # feed forward
+        residual = hidden_states
+        hidden_states = self.ff(self.norm3(hidden_states), deterministic=deterministic)
+        hidden_states = hidden_states + residual
+        return hidden_states
 class FlaxCrossFrameTransformer2DModel(nn.Module):
     r"""
         hidden_states = hidden_states + residual
         return hidden_states
+class FlaxLoRACrossFrameTransformer2DModel(nn.Module):
+    r"""
+    A Spatial Transformer layer with Gated Linear Unit (GLU) activation function as described in:
+    https://arxiv.org/pdf/1506.02025.pdf
+    Parameters:
+        in_channels (:obj:`int`):
+            Input number of channels
+        n_heads (:obj:`int`):
+            Number of heads
+        d_head (:obj:`int`):
+            Hidden states dimension inside each head
+        depth (:obj:`int`, *optional*, defaults to 1):
+            Number of transformer blocks
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        use_linear_projection (`bool`, defaults to `False`): tbd
+        only_cross_attention (`bool`, defaults to `False`): tbd
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+    """
+    in_channels: int
+    n_heads: int
+    d_head: int
+    depth: int = 1
+    dropout: float = 0.0
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    use_memory_efficient_attention: bool = False
+    def setup(self):
+        self.norm = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+        inner_dim = self.n_heads * self.d_head
+        if self.use_linear_projection:
+            self.proj_in = nn.Dense(inner_dim, dtype=self.dtype)
+        else:
+            self.proj_in = nn.Conv(
+                inner_dim,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+        self.transformer_blocks = [
+            FlaxLoRABasicTransformerBlock(
+                inner_dim,
+                self.n_heads,
+                self.d_head,
+                dropout=self.dropout,
+                only_cross_attention=self.only_cross_attention,
+                dtype=self.dtype,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+            )
+            for _ in range(self.depth)
+        ]
+        if self.use_linear_projection:
+            self.proj_out = nn.Dense(inner_dim, dtype=self.dtype)
+        else:
+            self.proj_out = nn.Conv(
+                inner_dim,
+                kernel_size=(1, 1),
+                strides=(1, 1),
+                padding="VALID",
+                dtype=self.dtype,
+            )
+    def __call__(self, hidden_states, context, deterministic=True, scale=1.0):
+        batch, height, width, channels = hidden_states.shape
+        residual = hidden_states
+        hidden_states = self.norm(hidden_states)
+        if self.use_linear_projection:
+            hidden_states = hidden_states.reshape(batch, height * width, channels)
+            hidden_states = self.proj_in(hidden_states)
+        else:
+            hidden_states = self.proj_in(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height * width, channels)
+        for transformer_block in self.transformer_blocks:
+            hidden_states = transformer_block(hidden_states, context, deterministic=deterministic, scale=scale)
+        if self.use_linear_projection:
+            hidden_states = self.proj_out(hidden_states)
+            hidden_states = hidden_states.reshape(batch, height, width, channels)
+        else:
+            hidden_states = hidden_states.reshape(batch, height, width, channels)
+            hidden_states = self.proj_out(hidden_states)
+        hidden_states = hidden_states + residual
+        return hidden_states

text_to_animation/models/unet_2d_blocks_flax.py CHANGED Viewed

@@ -17,7 +17,7 @@ import jax.numpy as jnp
 # from diffusers.models.attention_flax import FlaxTransformer2DModel
 from diffusers.models.resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D
-from .cross_frame_attention_flax import FlaxCrossFrameTransformer2DModel
 class FlaxCrossAttnDownBlock2D(nn.Module):
     r"""
@@ -100,6 +100,87 @@ class FlaxCrossAttnDownBlock2D(nn.Module):
         return hidden_states, output_states
 class FlaxDownBlock2D(nn.Module):
     r"""
     Flax 2D downsizing block
@@ -240,6 +321,90 @@ class FlaxCrossAttnUpBlock2D(nn.Module):
         return hidden_states
 class FlaxUpBlock2D(nn.Module):
     r"""
     Flax 2D upsampling block
@@ -302,7 +467,7 @@ class FlaxUpBlock2D(nn.Module):
         return hidden_states
-class FlaxUNetMidBlock2DCrossAttn(nn.Module):
     r"""
     Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104
     Parameters:
@@ -369,4 +534,74 @@ class FlaxUNetMidBlock2DCrossAttn(nn.Module):
             hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
             hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
         return hidden_states

 # from diffusers.models.attention_flax import FlaxTransformer2DModel
 from diffusers.models.resnet_flax import FlaxDownsample2D, FlaxResnetBlock2D, FlaxUpsample2D
+from .cross_frame_attention_flax import FlaxCrossFrameTransformer2DModel, FlaxLoRACrossFrameTransformer2DModel
 class FlaxCrossAttnDownBlock2D(nn.Module):
     r"""
         return hidden_states, output_states
+class FlaxLoRACrossAttnDownBlock2D(nn.Module):
+    r"""
+    Cross Attention 2D Downsizing block - original architecture from Unet transformers:
+    https://arxiv.org/abs/2103.06104
+    Each layer is a `FlaxResnetBlock2D` followed by a `FlaxLoRACrossFrameTransformer2DModel`.
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        add_downsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add downsampling layer before each final output
+        use_linear_projection (:obj:`bool`, *optional*, defaults to `False`):
+            Forwarded unchanged to every attention block of this block
+        only_cross_attention (:obj:`bool`, *optional*, defaults to `False`):
+            Forwarded unchanged to every attention block of this block
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+    in_channels: int
+    out_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    attn_num_head_channels: int = 1
+    add_downsample: bool = True
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    use_memory_efficient_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        """Build `num_layers` (resnet, attention) pairs and the optional downsampler."""
+        resnets = []
+        attentions = []
+        for i in range(self.num_layers):
+            # Only the first resnet sees the block's input width; later ones chain on out_channels.
+            in_channels = self.in_channels if i == 0 else self.out_channels
+            res_block = FlaxResnetBlock2D(
+                in_channels=in_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+            # attn_num_head_channels is used as the head count; the per-head width is derived
+            # by integer division of out_channels.
+            attn_block = FlaxLoRACrossFrameTransformer2DModel(
+                in_channels=self.out_channels,
+                n_heads=self.attn_num_head_channels,
+                d_head=self.out_channels // self.attn_num_head_channels,
+                depth=1,
+                use_linear_projection=self.use_linear_projection,
+                only_cross_attention=self.only_cross_attention,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+        self.resnets = resnets
+        self.attentions = attentions
+        if self.add_downsample:
+            self.downsamplers_0 = FlaxDownsample2D(self.out_channels, dtype=self.dtype)
+    def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True, scale=1.):
+        """
+        Apply every (resnet, attention) pair, then the optional downsampler.
+
+        `scale` is forwarded to the attention blocks only. Returns `(hidden_states, output_states)`,
+        where `output_states` is a tuple holding the hidden states after each pair and, when
+        `add_downsample` is set, after the downsampler as well.
+        """
+        output_states = ()
+        for resnet, attn in zip(self.resnets, self.attentions):
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic, scale=scale)
+            output_states += (hidden_states,)
+        if self.add_downsample:
+            hidden_states = self.downsamplers_0(hidden_states)
+            output_states += (hidden_states,)
+        return hidden_states, output_states
 class FlaxDownBlock2D(nn.Module):
     r"""
     Flax 2D downsizing block
         return hidden_states
+class FlaxLoRACrossAttnUpBlock2D(nn.Module):
+    r"""
+    Cross Attention 2D Upsampling block - original architecture from Unet transformers:
+    https://arxiv.org/abs/2103.06104
+    Each layer is a `FlaxResnetBlock2D` followed by a `FlaxLoRACrossFrameTransformer2DModel`.
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        out_channels (:obj:`int`):
+            Output channels
+        prev_output_channel (:obj:`int`):
+            Channel count added to the skip channels to size the first resnet's input
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        add_upsample (:obj:`bool`, *optional*, defaults to `True`):
+            Whether to add upsampling layer before each final output
+        use_linear_projection (:obj:`bool`, *optional*, defaults to `False`):
+            Forwarded unchanged to every attention block of this block
+        only_cross_attention (:obj:`bool`, *optional*, defaults to `False`):
+            Forwarded unchanged to every attention block of this block
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+    in_channels: int
+    out_channels: int
+    prev_output_channel: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    attn_num_head_channels: int = 1
+    add_upsample: bool = True
+    use_linear_projection: bool = False
+    only_cross_attention: bool = False
+    use_memory_efficient_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        """Build `num_layers` (resnet, attention) pairs and the optional upsampler."""
+        resnets = []
+        attentions = []
+        for i in range(self.num_layers):
+            # Each resnet consumes the running hidden states concatenated with one skip tensor:
+            # the skip width is in_channels for the last layer and out_channels otherwise, while
+            # the running width is prev_output_channel for the first layer and out_channels after.
+            res_skip_channels = self.in_channels if (i == self.num_layers - 1) else self.out_channels
+            resnet_in_channels = self.prev_output_channel if i == 0 else self.out_channels
+            res_block = FlaxResnetBlock2D(
+                in_channels=resnet_in_channels + res_skip_channels,
+                out_channels=self.out_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+            # attn_num_head_channels is used as the head count; the per-head width is derived
+            # by integer division of out_channels.
+            attn_block = FlaxLoRACrossFrameTransformer2DModel(
+                in_channels=self.out_channels,
+                n_heads=self.attn_num_head_channels,
+                d_head=self.out_channels // self.attn_num_head_channels,
+                depth=1,
+                use_linear_projection=self.use_linear_projection,
+                only_cross_attention=self.only_cross_attention,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+        self.resnets = resnets
+        self.attentions = attentions
+        if self.add_upsample:
+            self.upsamplers_0 = FlaxUpsample2D(self.out_channels, dtype=self.dtype)
+    def __call__(self, hidden_states, res_hidden_states_tuple, temb, encoder_hidden_states, deterministic=True, scale=1.):
+        """
+        Consume one skip tensor per layer (taken from the end of `res_hidden_states_tuple`),
+        concatenate it to the hidden states on the last axis, then apply resnet and attention.
+
+        `scale` is forwarded to the attention blocks only. The optional upsampler runs last.
+        """
+        for resnet, attn in zip(self.resnets, self.attentions):
+            # pop res hidden states
+            res_hidden_states = res_hidden_states_tuple[-1]
+            res_hidden_states_tuple = res_hidden_states_tuple[:-1]
+            hidden_states = jnp.concatenate((hidden_states, res_hidden_states), axis=-1)
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic, scale=scale)
+        if self.add_upsample:
+            hidden_states = self.upsamplers_0(hidden_states)
+        return hidden_states
 class FlaxUpBlock2D(nn.Module):
     r"""
     Flax 2D upsampling block
         return hidden_states
+class FlaxUNetCrossAttnMidBlock2D(nn.Module):
     r"""
     Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104
     Parameters:
             hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic)
             hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
+        return hidden_states
+class FlaxLoRAUNetCrossAttnMidBlock2D(nn.Module):
+    r"""
+    Cross Attention 2D Mid-level block - original architecture from Unet transformers: https://arxiv.org/abs/2103.06104
+    One leading `FlaxResnetBlock2D`, then `num_layers` pairs of `FlaxLoRACrossFrameTransformer2DModel`
+    and `FlaxResnetBlock2D`; the channel count stays at `in_channels` throughout.
+    Parameters:
+        in_channels (:obj:`int`):
+            Input channels
+        dropout (:obj:`float`, *optional*, defaults to 0.0):
+            Dropout rate
+        num_layers (:obj:`int`, *optional*, defaults to 1):
+            Number of attention blocks layers
+        attn_num_head_channels (:obj:`int`, *optional*, defaults to 1):
+            Number of attention heads of each spatial transformer block
+        use_linear_projection (:obj:`bool`, *optional*, defaults to `False`):
+            Forwarded unchanged to every attention block of this block
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+        dtype (:obj:`jnp.dtype`, *optional*, defaults to jnp.float32):
+            Parameters `dtype`
+    """
+    in_channels: int
+    dropout: float = 0.0
+    num_layers: int = 1
+    attn_num_head_channels: int = 1
+    use_linear_projection: bool = False
+    use_memory_efficient_attention: bool = False
+    dtype: jnp.dtype = jnp.float32
+    def setup(self):
+        """Build the leading resnet plus `num_layers` (attention, resnet) pairs."""
+        # there is always at least one resnet
+        resnets = [
+            FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+        ]
+        attentions = []
+        for _ in range(self.num_layers):
+            # attn_num_head_channels is used as the head count; the per-head width is derived
+            # by integer division of in_channels.
+            attn_block = FlaxLoRACrossFrameTransformer2DModel(
+                in_channels=self.in_channels,
+                n_heads=self.attn_num_head_channels,
+                d_head=self.in_channels // self.attn_num_head_channels,
+                depth=1,
+                use_linear_projection=self.use_linear_projection,
+                use_memory_efficient_attention=self.use_memory_efficient_attention,
+                dtype=self.dtype,
+            )
+            attentions.append(attn_block)
+            res_block = FlaxResnetBlock2D(
+                in_channels=self.in_channels,
+                out_channels=self.in_channels,
+                dropout_prob=self.dropout,
+                dtype=self.dtype,
+            )
+            resnets.append(res_block)
+        self.resnets = resnets
+        self.attentions = attentions
+    def __call__(self, hidden_states, temb, encoder_hidden_states, deterministic=True, scale=1.):
+        """
+        Apply the leading resnet, then each (attention, resnet) pair in order.
+
+        `scale` is forwarded to the attention blocks only.
+        """
+        # NOTE(review): `deterministic` is not forwarded to this first resnet, unlike the resnets
+        # in the loop below, so it runs with the resnet's own default - confirm this is intended.
+        hidden_states = self.resnets[0](hidden_states, temb)
+        for attn, resnet in zip(self.attentions, self.resnets[1:]):
+            hidden_states = attn(hidden_states, encoder_hidden_states, deterministic=deterministic, scale=scale)
+            hidden_states = resnet(hidden_states, temb, deterministic=deterministic)
         return hidden_states

text_to_animation/models/unet_2d_condition_flax.py CHANGED Viewed

@@ -26,15 +26,17 @@ from diffusers.configuration_utils import ConfigMixin, flax_register_to_config
 from diffusers.utils import BaseOutput
 from diffusers.models.embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
 from diffusers.models.modeling_flax_utils import FlaxModelMixin
-from diffusers.models.unet_2d_blocks_flax import (
     FlaxCrossAttnDownBlock2D,
     FlaxCrossAttnUpBlock2D,
     FlaxDownBlock2D,
-    FlaxUNetMidBlock2DCrossAttn,
     FlaxUpBlock2D,
 )
 @flax.struct.dataclass
 class FlaxUNet2DConditionOutput(BaseOutput):
     """
@@ -105,12 +107,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         "CrossAttnDownBlock2D",
         "DownBlock2D",
     )
-    up_block_types: Tuple[str] = (
-        "UpBlock2D",
-        "CrossAttnUpBlock2D",
-        "CrossAttnUpBlock2D",
-        "CrossAttnUpBlock2D",
-    )
     only_cross_attention: Union[bool, Tuple[bool]] = False
     block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
     layers_per_block: int = 2
@@ -118,7 +115,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
     cross_attention_dim: int = 1280
     dropout: float = 0.0
     use_linear_projection: bool = False
-    dtype: jnp.dtype = jnp.float32
     flip_sin_to_cos: bool = True
     freq_shift: int = 0
     use_memory_efficient_attention: bool = False
@@ -126,11 +123,9 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
     def init_weights(self, rng: jax.random.KeyArray) -> FrozenDict:
         # init input tensors
         sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
-        sample = jnp.zeros(sample_shape, dtype=jnp.float32)
         timesteps = jnp.ones((1,), dtype=jnp.int32)
-        encoder_hidden_states = jnp.zeros(
-            (1, 1, self.cross_attention_dim), dtype=jnp.float32
-        )
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
@@ -152,9 +147,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         # time
         self.time_proj = FlaxTimesteps(
-            block_out_channels[0],
-            flip_sin_to_cos=self.flip_sin_to_cos,
-            freq_shift=self.config.freq_shift,
         )
         self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
@@ -201,7 +194,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         self.down_blocks = down_blocks
         # mid
-        self.mid_block = FlaxUNetMidBlock2DCrossAttn(
             in_channels=block_out_channels[-1],
             dropout=self.dropout,
             attn_num_head_channels=attention_head_dim[-1],
@@ -219,9 +212,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         for i, up_block_type in enumerate(self.up_block_types):
             prev_output_channel = output_channel
             output_channel = reversed_block_out_channels[i]
-            input_channel = reversed_block_out_channels[
-                min(i + 1, len(block_out_channels) - 1)
-            ]
             is_final_block = i == len(block_out_channels) - 1
@@ -308,9 +299,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         down_block_res_samples = (sample,)
         for down_block in self.down_blocks:
             if isinstance(down_block, FlaxCrossAttnDownBlock2D):
-                sample, res_samples = down_block(
-                    sample, t_emb, encoder_hidden_states, deterministic=not train
-                )
             else:
                 sample, res_samples = down_block(sample, t_emb, deterministic=not train)
             down_block_res_samples += res_samples
@@ -327,9 +316,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
             down_block_res_samples = new_down_block_res_samples
         # 4. mid
-        sample = self.mid_block(
-            sample, t_emb, encoder_hidden_states, deterministic=not train
-        )
         if mid_block_additional_residual is not None:
             sample += mid_block_additional_residual
@@ -337,9 +324,7 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         # 5. up
         for up_block in self.up_blocks:
             res_samples = down_block_res_samples[-(self.layers_per_block + 1) :]
-            down_block_res_samples = down_block_res_samples[
-                : -(self.layers_per_block + 1)
-            ]
             if isinstance(up_block, FlaxCrossAttnUpBlock2D):
                 sample = up_block(
                     sample,
@@ -349,12 +334,321 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
                     deterministic=not train,
                 )
             else:
                 sample = up_block(
                     sample,
                     temb=t_emb,
                     res_hidden_states_tuple=res_samples,
                     deterministic=not train,
                 )
         # 6. post-process
         sample = self.conv_norm_out(sample)
@@ -365,4 +659,4 @@ class FlaxUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
         if not return_dict:
             return (sample,)
-        return FlaxUNet2DConditionOutput(sample=sample)

 from diffusers.utils import BaseOutput
 from diffusers.models.embeddings_flax import FlaxTimestepEmbedding, FlaxTimesteps
 from diffusers.models.modeling_flax_utils import FlaxModelMixin
+from .unet_2d_blocks_flax import (
     FlaxCrossAttnDownBlock2D,
     FlaxCrossAttnUpBlock2D,
+    FlaxUNetCrossAttnMidBlock2D,
+    FlaxLoRACrossAttnDownBlock2D,
+    FlaxLoRACrossAttnUpBlock2D,
+    FlaxLoRAUNetCrossAttnMidBlock2D,
     FlaxDownBlock2D,
     FlaxUpBlock2D,
 )
 @flax.struct.dataclass
 class FlaxUNet2DConditionOutput(BaseOutput):
     """
         "CrossAttnDownBlock2D",
         "DownBlock2D",
     )
+    up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
     only_cross_attention: Union[bool, Tuple[bool]] = False
     block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
     layers_per_block: int = 2
     cross_attention_dim: int = 1280
     dropout: float = 0.0
     use_linear_projection: bool = False
+    dtype: jnp.dtype = jnp.float16
     flip_sin_to_cos: bool = True
     freq_shift: int = 0
     use_memory_efficient_attention: bool = False
     def init_weights(self, rng: jax.random.KeyArray) -> FrozenDict:
         # init input tensors
         sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+        sample = jnp.zeros(sample_shape, dtype=self.dtype)
         timesteps = jnp.ones((1,), dtype=jnp.int32)
+        encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=self.dtype)
         params_rng, dropout_rng = jax.random.split(rng)
         rngs = {"params": params_rng, "dropout": dropout_rng}
         # time
         self.time_proj = FlaxTimesteps(
+            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
         )
         self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
         self.down_blocks = down_blocks
         # mid
+        self.mid_block = FlaxUNetCrossAttnMidBlock2D(
             in_channels=block_out_channels[-1],
             dropout=self.dropout,
             attn_num_head_channels=attention_head_dim[-1],
         for i, up_block_type in enumerate(self.up_block_types):
             prev_output_channel = output_channel
             output_channel = reversed_block_out_channels[i]
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
             is_final_block = i == len(block_out_channels) - 1
         down_block_res_samples = (sample,)
         for down_block in self.down_blocks:
             if isinstance(down_block, FlaxCrossAttnDownBlock2D):
+                sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
             else:
                 sample, res_samples = down_block(sample, t_emb, deterministic=not train)
             down_block_res_samples += res_samples
             down_block_res_samples = new_down_block_res_samples
         # 4. mid
+        sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train)
         if mid_block_additional_residual is not None:
             sample += mid_block_additional_residual
         # 5. up
         for up_block in self.up_blocks:
             res_samples = down_block_res_samples[-(self.layers_per_block + 1) :]
+            down_block_res_samples = down_block_res_samples[: -(self.layers_per_block + 1)]
             if isinstance(up_block, FlaxCrossAttnUpBlock2D):
                 sample = up_block(
                     sample,
                     deterministic=not train,
                 )
             else:
+                sample = up_block(sample, temb=t_emb, res_hidden_states_tuple=res_samples, deterministic=not train)
+        # 6. post-process
+        sample = self.conv_norm_out(sample)
+        sample = nn.silu(sample)
+        sample = self.conv_out(sample)
+        sample = jnp.transpose(sample, (0, 3, 1, 2))
+        if not return_dict:
+            return (sample,)
+        return FlaxUNet2DConditionOutput(sample=sample)
+@flax_register_to_config
+class FlaxLoRAUNet2DConditionModel(nn.Module, FlaxModelMixin, ConfigMixin):
+    r"""
+    FlaxLoRAUNet2DConditionModel is a custom FlaxUNet2DConditionModel with a few tweaks:
+    - Cross Attention is replaced by Cross-Frame Attention
+    - Low Rank Adaptation (LoRA) layers are added to the Cross-Frame Attention
+    - A frame positional encoding is added to the encoder_hidden_states via a LoRA linear layer
+      (currently disabled: the corresponding code is commented out)
+    FlaxUNet2DConditionModel is a conditional 2D UNet model that takes in a noisy sample, conditional state, and a
+    timestep and returns sample shaped output.
+    This model inherits from [`FlaxModelMixin`]. Check the superclass documentation for the generic methods the library
+    implements for all the models (such as downloading or saving, etc.)
+    Also, this model is a Flax Linen [flax.linen.Module](https://flax.readthedocs.io/en/latest/flax.linen.html#module)
+    subclass. Use it as a regular Flax linen Module and refer to the Flax documentation for all matter related to
+    general usage and behavior.
+    Finally, this model supports inherent JAX features such as:
+    - [Just-In-Time (JIT) compilation](https://jax.readthedocs.io/en/latest/jax.html#just-in-time-compilation-jit)
+    - [Automatic Differentiation](https://jax.readthedocs.io/en/latest/jax.html#automatic-differentiation)
+    - [Vectorization](https://jax.readthedocs.io/en/latest/jax.html#vectorization-vmap)
+    - [Parallelization](https://jax.readthedocs.io/en/latest/jax.html#parallelization-pmap)
+    Parameters:
+        sample_size (`int`, *optional*):
+            The size of the input sample.
+        in_channels (`int`, *optional*, defaults to 4):
+            The number of channels in the input sample.
+        out_channels (`int`, *optional*, defaults to 4):
+            The number of channels in the output.
+        down_block_types (`Tuple[str]`, *optional*, defaults to `("CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "CrossAttnDownBlock2D", "DownBlock2D")`):
+            The tuple of downsample blocks to use. The corresponding class names will be: "FlaxLoRACrossAttnDownBlock2D",
+            "FlaxLoRACrossAttnDownBlock2D", "FlaxLoRACrossAttnDownBlock2D", "FlaxDownBlock2D"
+        up_block_types (`Tuple[str]`, *optional*, defaults to `("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D",)`):
+            The tuple of upsample blocks to use. The corresponding class names will be: "FlaxUpBlock2D",
+            "FlaxLoRACrossAttnUpBlock2D", "FlaxLoRACrossAttnUpBlock2D", "FlaxLoRACrossAttnUpBlock2D"
+        block_out_channels (`Tuple[int]`, *optional*, defaults to `(320, 640, 1280, 1280)`):
+            The tuple of output channels for each block.
+        layers_per_block (`int`, *optional*, defaults to 2):
+            The number of layers per block.
+        attention_head_dim (`int` or `Tuple[int]`, *optional*, defaults to 8):
+            The dimension of the attention heads.
+        cross_attention_dim (`int`, *optional*, defaults to 1280):
+            The dimension of the cross attention features.
+        dropout (`float`, *optional*, defaults to 0):
+            Dropout probability for down, up and bottleneck blocks.
+        flip_sin_to_cos (`bool`, *optional*, defaults to `True`):
+            Whether to flip the sin to cos in the time embedding.
+        freq_shift (`int`, *optional*, defaults to 0): The frequency shift to apply to the time embedding.
+        use_memory_efficient_attention (`bool`, *optional*, defaults to `False`):
+            enable memory efficient attention https://arxiv.org/abs/2112.05682
+    """
+    sample_size: int = 32
+    in_channels: int = 4
+    out_channels: int = 4
+    down_block_types: Tuple[str] = (
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "CrossAttnDownBlock2D",
+        "DownBlock2D",
+    )
+    up_block_types: Tuple[str] = ("UpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D", "CrossAttnUpBlock2D")
+    only_cross_attention: Union[bool, Tuple[bool]] = False
+    block_out_channels: Tuple[int] = (320, 640, 1280, 1280)
+    layers_per_block: int = 2
+    attention_head_dim: Union[int, Tuple[int]] = 8
+    cross_attention_dim: int = 1280
+    dropout: float = 0.0
+    use_linear_projection: bool = False
+    dtype: jnp.dtype = jnp.float16
+    flip_sin_to_cos: bool = True
+    freq_shift: int = 0
+    use_memory_efficient_attention: bool = False
+    def init_weights(self, rng: jax.random.KeyArray) -> FrozenDict:
+        """
+        Initialise the module on dummy inputs and return its `"params"` collection.
+
+        The dummy sample has shape (1, in_channels, sample_size, sample_size), the dummy
+        encoder hidden states have shape (1, 1, cross_attention_dim), and both use `self.dtype`.
+        """
+        # NOTE(review): `jax.random.KeyArray` is no longer available in recent JAX releases -
+        # confirm against the JAX version this project pins.
+        # init input tensors
+        sample_shape = (1, self.in_channels, self.sample_size, self.sample_size)
+        sample = jnp.zeros(sample_shape, dtype=self.dtype)
+        timesteps = jnp.ones((1,), dtype=jnp.int32)
+        encoder_hidden_states = jnp.zeros((1, 1, self.cross_attention_dim), dtype=self.dtype)
+        # Separate RNG streams for parameter initialisation and dropout.
+        params_rng, dropout_rng = jax.random.split(rng)
+        rngs = {"params": params_rng, "dropout": dropout_rng}
+        return self.init(rngs, sample, timesteps, encoder_hidden_states)["params"]
+    def setup(self):
+        """
+        Instantiate the sub-modules: input conv, time embedding, down blocks, mid block,
+        up blocks, and the output norm/conv.
+
+        "CrossAttnDownBlock2D" / "CrossAttnUpBlock2D" entries map to the LoRA cross-attention
+        blocks; any other block type falls back to the plain `FlaxDownBlock2D` / `FlaxUpBlock2D`.
+        """
+        block_out_channels = self.block_out_channels
+        time_embed_dim = block_out_channels[0] * 4
+        # input
+        self.conv_in = nn.Conv(
+            block_out_channels[0],
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+        # time
+        # NOTE(review): freq_shift is read from self.config while flip_sin_to_cos is read from
+        # the module attribute - presumably equivalent, confirm.
+        self.time_proj = FlaxTimesteps(
+            block_out_channels[0], flip_sin_to_cos=self.flip_sin_to_cos, freq_shift=self.config.freq_shift
+        )
+        self.time_embedding = FlaxTimestepEmbedding(time_embed_dim, dtype=self.dtype)
+        # Scalar settings are broadcast to one entry per down block.
+        only_cross_attention = self.only_cross_attention
+        if isinstance(only_cross_attention, bool):
+            only_cross_attention = (only_cross_attention,) * len(self.down_block_types)
+        attention_head_dim = self.attention_head_dim
+        if isinstance(attention_head_dim, int):
+            attention_head_dim = (attention_head_dim,) * len(self.down_block_types)
+        # #frame positional embedding
+        # self.frame_pe = LoRAPositionalEncoding(self.cross_attention_dim)
+        # down
+        down_blocks = []
+        output_channel = block_out_channels[0]
+        for i, down_block_type in enumerate(self.down_block_types):
+            input_channel = output_channel
+            output_channel = block_out_channels[i]
+            # The last down block keeps its resolution (no downsampler).
+            is_final_block = i == len(block_out_channels) - 1
+            if down_block_type == "CrossAttnDownBlock2D":
+                down_block = FlaxLoRACrossAttnDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    attn_num_head_channels=attention_head_dim[i],
+                    add_downsample=not is_final_block,
+                    use_linear_projection=self.use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    use_memory_efficient_attention=self.use_memory_efficient_attention,
+                    dtype=self.dtype,
+                )
+            else:
+                down_block = FlaxDownBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    dropout=self.dropout,
+                    num_layers=self.layers_per_block,
+                    add_downsample=not is_final_block,
+                    dtype=self.dtype,
+                )
+            down_blocks.append(down_block)
+        self.down_blocks = down_blocks
+        # mid
+        self.mid_block = FlaxLoRAUNetCrossAttnMidBlock2D(
+            in_channels=block_out_channels[-1],
+            dropout=self.dropout,
+            attn_num_head_channels=attention_head_dim[-1],
+            use_linear_projection=self.use_linear_projection,
+            use_memory_efficient_attention=self.use_memory_efficient_attention,
+            dtype=self.dtype,
+        )
+        # up
+        # The up path walks the per-block settings in reverse order of the down path.
+        up_blocks = []
+        reversed_block_out_channels = list(reversed(block_out_channels))
+        reversed_attention_head_dim = list(reversed(attention_head_dim))
+        only_cross_attention = list(reversed(only_cross_attention))
+        output_channel = reversed_block_out_channels[0]
+        for i, up_block_type in enumerate(self.up_block_types):
+            prev_output_channel = output_channel
+            output_channel = reversed_block_out_channels[i]
+            # Clamped so the final up block reuses the last entry instead of indexing past the end.
+            input_channel = reversed_block_out_channels[min(i + 1, len(block_out_channels) - 1)]
+            # The last up block keeps its resolution (no upsampler).
+            is_final_block = i == len(block_out_channels) - 1
+            if up_block_type == "CrossAttnUpBlock2D":
+                up_block = FlaxLoRACrossAttnUpBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    prev_output_channel=prev_output_channel,
+                    num_layers=self.layers_per_block + 1,
+                    attn_num_head_channels=reversed_attention_head_dim[i],
+                    add_upsample=not is_final_block,
+                    dropout=self.dropout,
+                    use_linear_projection=self.use_linear_projection,
+                    only_cross_attention=only_cross_attention[i],
+                    use_memory_efficient_attention=self.use_memory_efficient_attention,
+                    dtype=self.dtype,
+                )
+            else:
+                up_block = FlaxUpBlock2D(
+                    in_channels=input_channel,
+                    out_channels=output_channel,
+                    prev_output_channel=prev_output_channel,
+                    num_layers=self.layers_per_block + 1,
+                    add_upsample=not is_final_block,
+                    dropout=self.dropout,
+                    dtype=self.dtype,
+                )
+            up_blocks.append(up_block)
+            # NOTE(review): this assignment looks redundant - the top of the loop sets
+            # prev_output_channel from output_channel again before it is next read.
+            prev_output_channel = output_channel
+        self.up_blocks = up_blocks
+        # out
+        # NOTE(review): unlike the convolutions, this GroupNorm is built without dtype=self.dtype -
+        # confirm that is intended.
+        self.conv_norm_out = nn.GroupNorm(num_groups=32, epsilon=1e-5)
+        self.conv_out = nn.Conv(
+            self.out_channels,
+            kernel_size=(3, 3),
+            strides=(1, 1),
+            padding=((1, 1), (1, 1)),
+            dtype=self.dtype,
+        )
+    def __call__(
+        self,
+        sample,
+        timesteps,
+        encoder_hidden_states,
+        down_block_additional_residuals=None,
+        mid_block_additional_residual=None,
+        return_dict: bool = True,
+        train: bool = False,
+        scale: float = 1.,
+    ) -> Union[FlaxUNet2DConditionOutput, Tuple]:
+        r"""
+        Args:
+            sample (`jnp.ndarray`): (batch, channel, height, width) noisy inputs tensor
+            timesteps (`jnp.ndarray` or `float` or `int`): timesteps
+            encoder_hidden_states (`jnp.ndarray`): (batch_size, sequence_length, hidden_size) encoder hidden states
+            return_dict (`bool`, *optional*, defaults to `True`):
+                Whether or not to return a [`models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] instead of a
+                plain tuple.
+            train (`bool`, *optional*, defaults to `False`):
+                Use deterministic functions and disable dropout when not training.
+        Returns:
+            [`~models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] or `tuple`:
+            [`~models.unet_2d_condition_flax.FlaxUNet2DConditionOutput`] if `return_dict` is True, otherwise a `tuple`.
+            When returning a tuple, the first element is the sample tensor.
+        """
+        # 1. time
+        if not isinstance(timesteps, jnp.ndarray):
+            timesteps = jnp.array([timesteps], dtype=jnp.int32)
+        elif isinstance(timesteps, jnp.ndarray) and len(timesteps.shape) == 0:
+            timesteps = timesteps.astype(dtype=jnp.float32)
+            timesteps = jnp.expand_dims(timesteps, 0)
+        t_emb = self.time_proj(timesteps)
+        t_emb = self.time_embedding(t_emb)
+        # 2. pre-process
+        sample = jnp.transpose(sample, (0, 2, 3, 1))
+        sample = self.conv_in(sample)
+        # 3. down
+        down_block_res_samples = (sample,)
+        for down_block in self.down_blocks:
+            if isinstance(down_block, FlaxLoRACrossAttnDownBlock2D):
+                sample, res_samples = down_block(sample, t_emb, encoder_hidden_states, deterministic=not train, scale=scale)
+            else:
+                sample, res_samples = down_block(sample, t_emb, deterministic=not train)
+            down_block_res_samples += res_samples
+        if down_block_additional_residuals is not None:
+            new_down_block_res_samples = ()
+            for down_block_res_sample, down_block_additional_residual in zip(
+                down_block_res_samples, down_block_additional_residuals
+            ):
+                down_block_res_sample += down_block_additional_residual
+                new_down_block_res_samples += (down_block_res_sample,)
+            down_block_res_samples = new_down_block_res_samples
+        # if encoder_hidden_states is not None:
+        #     #adding frame positional encoding
+        #     encoder_hidden_states = self.frame_pe(encoder_hidden_states, scale=scale)
+        # 4. mid
+        sample = self.mid_block(sample, t_emb, encoder_hidden_states, deterministic=not train, scale=scale)
+        if mid_block_additional_residual is not None:
+            sample += mid_block_additional_residual
+        # 5. up
+        for up_block in self.up_blocks:
+            res_samples = down_block_res_samples[-(self.layers_per_block + 1) :]
+            down_block_res_samples = down_block_res_samples[: -(self.layers_per_block + 1)]
+            if isinstance(up_block, FlaxLoRACrossAttnUpBlock2D):
                 sample = up_block(
                     sample,
                     temb=t_emb,
+                    encoder_hidden_states=encoder_hidden_states,
                     res_hidden_states_tuple=res_samples,
                     deterministic=not train,
+                    scale=scale,
                 )
+            else:
+                sample = up_block(sample, temb=t_emb, res_hidden_states_tuple=res_samples, deterministic=not train)
         # 6. post-process
         sample = self.conv_norm_out(sample)
         if not return_dict:
             return (sample,)
+        return FlaxUNet2DConditionOutput(sample=sample)

text_to_animation/pipelines/text_to_video_pipeline_flax.py CHANGED Viewed

@@ -11,11 +11,7 @@ from flax.training.common_utils import shard
 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 from einops import rearrange, repeat
-from diffusers.models import (
-    FlaxAutoencoderKL,
-    FlaxControlNetModel,
-    FlaxUNet2DConditionModel,
-)
 from diffusers.schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
@@ -25,24 +21,21 @@ from diffusers.schedulers import (
 from diffusers.utils import PIL_INTERPOLATION, logging, replace_example_docstring
 from diffusers.pipelines.pipeline_flax_utils import FlaxDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionPipelineOutput
-from diffusers.pipelines.stable_diffusion.safety_checker_flax import (
-    FlaxStableDiffusionSafetyChecker,
-)
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 """
 Text2Video-Zero:
  - Inputs: Prompt, Pose Control via mp4/gif, First Frame (?)
  - JAX implementation
  - 3DUnet to replace 2DUnetConditional
-"""
 def replicate_devices(array):
     return jnp.expand_dims(array, 0).repeat(jax.device_count(), 0)
-DEBUG = False  # Set to True to use python for loop instead of jax.fori_loop for easier debugging
 EXAMPLE_DOC_STRING = """
     Examples:
@@ -101,8 +94,6 @@ EXAMPLE_DOC_STRING = """
         >>> output_images.save("generated_image.png")
         ```
 """
 class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
     def __init__(
         self,
@@ -113,10 +104,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         unet_vanilla,
         controlnet,
         scheduler: Union[
-            FlaxDDIMScheduler,
-            FlaxPNDMScheduler,
-            FlaxLMSDiscreteScheduler,
-            FlaxDPMSolverMultistepScheduler,
         ],
         safety_checker: FlaxStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
@@ -154,50 +142,30 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         else:
             eps = jax.random.normal(prng, x0.shape, dtype=text_embeddings.dtype)
             alpha_vec = jnp.prod(params["scheduler"].common.alphas[t0:tMax])
-            xt = jnp.sqrt(alpha_vec) * x0 + jnp.sqrt(1 - alpha_vec) * eps
             return xt
-    def DDIM_backward(
-        self,
-        params,
-        num_inference_steps,
-        timesteps,
-        skip_t,
-        t0,
-        t1,
-        do_classifier_free_guidance,
-        text_embeddings,
-        latents_local,
-        guidance_scale,
-        controlnet_image=None,
-        controlnet_conditioning_scale=None,
-    ):
-        scheduler_state = self.scheduler.set_timesteps(
-            params["scheduler"], num_inference_steps
-        )
         f = latents_local.shape[2]
         latents_local = rearrange(latents_local, "b c f h w -> (b f) c h w")
         latents = latents_local.copy()
         x_t0_1 = None
         x_t1_1 = None
-        max_timestep = len(timesteps) - 1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, x_t0_1, x_t1_1, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
-            latent_model_input = (
-                jnp.concatenate([latents] * 2)
-                if do_classifier_free_guidance
-                else latents
-            )
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
-            te = jnp.stack(
-                [text_embeddings[0, :, :]] * f + [text_embeddings[-1, :, :]] * f
-            )
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
@@ -224,43 +192,32 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
-                ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
-                noise_pred = noise_pred_uncond + guidance_scale * (
-                    noise_pred_text - noise_pred_uncond
-                )
             # compute the previous noisy sample x_t -> x_t-1
-            latents, scheduler_state = self.scheduler.step(
-                scheduler_state, noise_pred, t, latents
-            ).to_tuple()
-            x_t0_1 = jax.lax.select(
-                (step < max_timestep - 1) & (timesteps[step + 1] == t0), latents, x_t0_1
-            )
-            x_t1_1 = jax.lax.select(
-                (step < max_timestep - 1) & (timesteps[step + 1] == t1), latents, x_t1_1
-            )
             return (step + 1, latents, x_t0_1, x_t1_1, scheduler_state)
         latents_shape = latents.shape
         x_t0_1, x_t1_1 = jnp.zeros(latents_shape), jnp.zeros(latents_shape)
         def cond_fun(arg):
             step, latents, x_t0_1, x_t1_1, scheduler_state = arg
             return (step < skip_t) & (step < num_inference_steps)
         if DEBUG:
             step = 0
             while cond_fun((step, latents, x_t0_1, x_t1_1)):
-                step, latents, x_t0_1, x_t1_1, scheduler_state = while_body(
-                    (step, latents, x_t0_1, x_t1_1, scheduler_state)
-                )
                 step = step + 1
         else:
-            _, latents, x_t0_1, x_t1_1, scheduler_state = jax.lax.while_loop(
-                cond_fun, while_body, (0, latents, x_t0_1, x_t1_1, scheduler_state)
-            )
         latents = rearrange(latents, "(b f) c h w -> b c f h w", f=f)
         res = {"x0": latents.copy()}
         if x_t0_1 is not None:
@@ -270,7 +227,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             x_t1_1 = rearrange(x_t1_1, "(b f) c h w -> b c f  h w", f=f)
             res["x_t1_1"] = x_t1_1.copy()
         return res
     def warp_latents_independently(self, latents, reference_flow):
         _, _, H, W = reference_flow.shape
         b, _, f, h, w = latents.shape
@@ -281,10 +238,10 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
-        coords_t0 = rearrange(coords_t0, "f c h w -> f h w c")
-        latents_0 = rearrange(latents[0], "c f h w -> f  c  h w")
         warped = grid_sample(latents_0, coords_t0, "mirror")
-        warped = rearrange(warped, "(b f) c h w -> b c f h w", f=f)
         return warped
     def warp_vid_independently(self, vid, reference_flow):
@@ -296,173 +253,75 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
-        coords_t0 = rearrange(coords_t0, "f c h w -> f h w c")
         # latents_0 = rearrange(vid, 'c f h w -> f  c  h w')
         warped = grid_sample(vid, coords_t0, "zeropad")
         # warped = rearrange(warped, 'f c h w -> b c f h w', f=f)
         return warped
-    def create_motion_field(
-        self,
-        motion_field_strength_x,
-        motion_field_strength_y,
-        frame_ids,
-        video_length,
-        latents,
-    ):
-        reference_flow = jnp.zeros((video_length - 1, 2, 512, 512), dtype=latents.dtype)
         for fr_idx, frame_id in enumerate(frame_ids):
-            reference_flow = reference_flow.at[fr_idx, 0, :, :].set(
-                motion_field_strength_x * (frame_id)
-            )
-            reference_flow = reference_flow.at[fr_idx, 1, :, :].set(
-                motion_field_strength_y * (frame_id)
-            )
         return reference_flow
-    def create_motion_field_and_warp_latents(
-        self,
-        motion_field_strength_x,
-        motion_field_strength_y,
-        frame_ids,
-        video_length,
-        latents,
-    ):
-        motion_field = self.create_motion_field(
-            motion_field_strength_x=motion_field_strength_x,
-            motion_field_strength_y=motion_field_strength_y,
-            latents=latents,
-            video_length=video_length,
-            frame_ids=frame_ids,
-        )
         for idx, latent in enumerate(latents):
-            latents = latents.at[idx].set(
-                self.warp_latents_independently(latent[None], motion_field)[0]
-            )
         return motion_field, latents
-    def text_to_video_zero(
-        self,
-        params,
-        prng,
-        text_embeddings,
-        video_length: Optional[int],
-        do_classifier_free_guidance=True,
-        height: Optional[int] = None,
-        width: Optional[int] = None,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        num_videos_per_prompt: Optional[int] = 1,
-        xT=None,
-        smooth_bg_strength: float = 0.0,
-        motion_field_strength_x: float = 12,
-        motion_field_strength_y: float = 12,
-        t0: int = 44,
-        t1: int = 47,
-        controlnet_image=None,
-        controlnet_conditioning_scale=0,
-    ):
         frame_ids = list(range(video_length))
         # Prepare timesteps
-        params["scheduler"] = self.scheduler.set_timesteps(
-            params["scheduler"], num_inference_steps
-        )
         timesteps = params["scheduler"].timesteps
         # Prepare latent variables
         num_channels_latents = self.unet.in_channels
         batch_size = 1
-        xT = prepare_latents(
-            params,
-            prng,
-            batch_size * num_videos_per_prompt,
-            num_channels_latents,
-            height,
-            width,
-            self.vae_scale_factor,
-            xT,
-        )
-        timesteps_ddpm = [
-            981,
-            961,
-            941,
-            921,
-            901,
-            881,
-            861,
-            841,
-            821,
-            801,
-            781,
-            761,
-            741,
-            721,
-            701,
-            681,
-            661,
-            641,
-            621,
-            601,
-            581,
-            561,
-            541,
-            521,
-            501,
-            481,
-            461,
-            441,
-            421,
-            401,
-            381,
-            361,
-            341,
-            321,
-            301,
-            281,
-            261,
-            241,
-            221,
-            201,
-            181,
-            161,
-            141,
-            121,
-            101,
-            81,
-            61,
-            41,
-            21,
-            1,
-        ]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         x_t1_1 = None
         # Denoising loop
-        shape = (
-            batch_size,
-            num_channels_latents,
-            1,
-            height // self.vae.scaling_factor,
-            width // self.vae.scaling_factor,
-        )
         #  perform ∆t backward steps by stable diffusion
-        ddim_res = self.DDIM_backward(
-            params,
-            num_inference_steps=num_inference_steps,
-            timesteps=timesteps,
-            skip_t=1000,
-            t0=t0,
-            t1=t1,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            text_embeddings=text_embeddings,
-            latents_local=xT,
-            guidance_scale=guidance_scale,
-            controlnet_image=jnp.stack([controlnet_image[0]] * 2),
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-        )
         x0 = ddim_res["x0"]
         # apply warping functions
@@ -470,89 +329,46 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             x_t0_1 = ddim_res["x_t0_1"]
         if "x_t1_1" in ddim_res:
             x_t1_1 = ddim_res["x_t1_1"]
-        x_t0_k = x_t0_1[:, :, :1, :, :].repeat(video_length - 1, 2)
         reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
-            motion_field_strength_x=motion_field_strength_x,
-            motion_field_strength_y=motion_field_strength_y,
-            latents=x_t0_k,
-            video_length=video_length,
-            frame_ids=frame_ids[1:],
-        )
         # assuming t0=t1=1000, if t0 = 1000
         # DDPM forward for more motion freedom
-        ddpm_fwd = partial(
-            self.DDPM_forward,
-            params=params,
-            prng=prng,
-            x0=x_t0_k,
-            t0=t0,
-            tMax=t1,
-            shape=shape,
-            text_embeddings=text_embeddings,
         )
-        x_t1_k = jax.lax.cond(t1 > t0, ddpm_fwd, lambda: x_t0_k)
         x_t1 = jnp.concatenate([x_t1_1, x_t1_k], axis=2)
         # backward steps by stable diffusion
-        # warp the controlnet image following the same flow defined for latent
         controlnet_video = controlnet_image[:video_length]
-        controlnet_video = controlnet_video.at[1:].set(
-            self.warp_vid_independently(controlnet_video[1:], reference_flow)
-        )
-        controlnet_image = jnp.concatenate([controlnet_video] * 2)
         smooth_bg = True
         if smooth_bg:
-            # latent shape: "b c f h w"
-            M_FG = repeat(
-                get_mask_pose(controlnet_video),
-                "f h w -> b c f h w",
-                c=x_t1.shape[1],
-                b=batch_size,
-            )
-            initial_bg = repeat(
-                x_t1[:, :, 0] * (1 - M_FG[:, :, 0]),
-                "b c h w -> b c f h w",
-                f=video_length - 1,
-            )
-            # warp the controlnet image following the same flow defined for latent #f c h w
-            initial_bg_warped = self.warp_latents_independently(
-                initial_bg, reference_flow
-            )
-            bgs = x_t1[:, :, 1:] * (1 - M_FG[:, :, 1:])  # initial background
-            initial_mask_warped = 1 - self.warp_latents_independently(
-                repeat(M_FG[:, :, 0], "b c h w -> b c f h w", f=video_length - 1),
-                reference_flow,
-            )
             # initial_mask_warped = 1 - warp_vid_independently(repeat(M_FG[:,:,0], "b c h w -> (b f) c h w", f = video_length-1), reference_flow)
             # initial_mask_warped = rearrange(initial_mask_warped, "(b f) c h w -> b c f h w", b=batch_size)
-            mask = (1 - M_FG[:, :, 1:]) * initial_mask_warped
-            x_t1 = x_t1.at[:, :, 1:].set(
-                (1 - mask) * x_t1[:, :, 1:]
-                + mask
-                * (
-                    initial_bg_warped * smooth_bg_strength
-                    + (1 - smooth_bg_strength) * bgs
-                )
-            )
-        ddim_res = self.DDIM_backward(
-            params,
-            num_inference_steps=num_inference_steps,
-            timesteps=timesteps,
-            skip_t=t1,
-            t0=-1,
-            t1=-1,
-            do_classifier_free_guidance=do_classifier_free_guidance,
-            text_embeddings=text_embeddings,
-            latents_local=x_t1,
-            guidance_scale=guidance_scale,
-            controlnet_image=controlnet_image,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-        )
         x0 = ddim_res["x0"]
         del ddim_res
         del x_t1
@@ -560,42 +376,25 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         del x_t1_k
         return x0
-    def denoise_latent(
-        self,
-        params,
-        num_inference_steps,
-        timesteps,
-        do_classifier_free_guidance,
-        text_embeddings,
-        latents,
-        guidance_scale,
-        controlnet_image=None,
-        controlnet_conditioning_scale=None,
-    ):
-        scheduler_state = self.scheduler.set_timesteps(
-            params["scheduler"], num_inference_steps
-        )
         # f = latents_local.shape[2]
         # latents_local = rearrange(latents_local, "b c f h w -> (b f) c h w")
-        max_timestep = len(timesteps) - 1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
-            latent_model_input = (
-                jnp.concatenate([latents] * 2)
-                if do_classifier_free_guidance
-                else latents
-            )
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
-            te = jnp.stack(
-                [text_embeddings[0, :, :]] * f + [text_embeddings[-1, :, :]] * f
-            )
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
@@ -622,215 +421,104 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
-                ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
-                noise_pred = noise_pred_uncond + guidance_scale * (
-                    noise_pred_text - noise_pred_uncond
-                )
             # compute the previous noisy sample x_t -> x_t-1
-            latents, scheduler_state = self.scheduler.step(
-                scheduler_state, noise_pred, t, latents
-            ).to_tuple()
             return (step + 1, latents, scheduler_state)
         def cond_fun(arg):
             step, latents, scheduler_state = arg
-            return step < num_inference_steps
         if DEBUG:
             step = 0
             while cond_fun((step, latents, scheduler_state)):
-                step, latents, scheduler_state = while_body(
-                    (step, latents, scheduler_state)
-                )
                 step = step + 1
         else:
-            _, latents, scheduler_state = jax.lax.while_loop(
-                cond_fun, while_body, (0, latents, scheduler_state)
-            )
         # latents = rearrange(latents, "(b f) c h w -> b c f h w", f=f)
         return latents
-    @partial(jax.jit, static_argnums=(0, 1))
-    def _generate_starting_frames(
-        self,
-        num_inference_steps,
-        params,
-        timesteps,
-        text_embeddings,
-        latents,
-        guidance_scale,
-        controlnet_image,
-        controlnet_conditioning_scale,
-    ):
-        #  perform ∆t backward steps by stable diffusion
-        # delta_t_diffusion = jax.vmap(lambda latent : self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=1000, t0=t0, t1=t1, do_classifier_free_guidance=do_classifier_free_guidance,
-        #                                     text_embeddings=text_embeddings, latents_local=latent, guidance_scale=guidance_scale,
-        #                                     controlnet_image=controlnet_image, controlnet_conditioning_scale=controlnet_conditioning_scale))
-        # ddim_res = delta_t_diffusion(latents)
-        # latents = ddim_res["x0"] #output is  i b c f h w
-        # DDPM forward for more motion freedom
-        # ddpm_fwd = jax.vmap(lambda prng, latent: self.DDPM_forward(params=params, prng=prng, x0=latent, t0=t0,
-        #                 tMax=t1, shape=shape, text_embeddings=text_embeddings))
-        # latents = ddpm_fwd(stacked_prngs, latents)
-        # main backward diffusion
-        # denoise_first_frame = lambda latent : self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=100000, t0=-1, t1=-1, do_classifier_free_guidance=do_classifier_free_guidance,
-        #                                     text_embeddings=text_embeddings, latents_local=latent, guidance_scale=guidance_scale,
-        #                                     controlnet_image=controlnet_image, controlnet_conditioning_scale=controlnet_conditioning_scale, use_vanilla=True)
-        # latents = rearrange(latents, 'i b c f h w -> (i b) c f h w')
-        # ddim_res = denoise_first_frame(latents)
-        latents = self.denoise_latent(
-            params,
-            num_inference_steps=num_inference_steps,
-            timesteps=timesteps,
-            do_classifier_free_guidance=True,
-            text_embeddings=text_embeddings,
-            latents=latents,
-            guidance_scale=guidance_scale,
-            controlnet_image=controlnet_image,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-        )
-        # latents = rearrange(ddim_res["x0"], 'i b c f h w -> (i b) c f h w') #output is  i b c f h w
-        # scale and decode the image latents with vae
-        latents = 1 / self.vae.config.scaling_factor * latents
-        # latents = rearrange(latents, "b c h w -> (b f) c h w")
-        imgs = self.vae.apply(
-            {"params": params["vae"]}, latents, method=self.vae.decode
-        ).sample
-        imgs = (imgs / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
-        return imgs
-    def generate_starting_frames(
-        self,
-        params,
-        prngs: list,  # list of prngs for each img
-        prompt,
-        neg_prompt,
-        controlnet_image,
-        do_classifier_free_guidance=True,
-        num_inference_steps: int = 50,
-        guidance_scale: float = 7.5,
-        t0: int = 44,
-        t1: int = 47,
-        controlnet_conditioning_scale=1.0,
-    ):
         height, width = controlnet_image.shape[-2:]
         if height % 64 != 0 or width % 64 != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by 64 but are {height} and {width}."
-            )
-        shape = (
-            self.unet.in_channels,
-            height // self.vae_scale_factor,
-            width // self.vae_scale_factor,
-        )  # b c h w
         # scale the initial noise by the standard deviation required by the scheduler
-        print(
-            f"Generating {len(prngs)} first frames with prompt {prompt}, for {num_inference_steps} steps. PRNG seeds are: {prngs}"
-        )
-        latents = jnp.stack(
-            [jax.random.normal(prng, shape) for prng in prngs]
-        )  # b c h w
         latents = latents * params["scheduler"].init_noise_sigma
         timesteps = params["scheduler"].timesteps
-        timesteps_ddpm = [
-            981,
-            961,
-            941,
-            921,
-            901,
-            881,
-            861,
-            841,
-            821,
-            801,
-            781,
-            761,
-            741,
-            721,
-            701,
-            681,
-            661,
-            641,
-            621,
-            601,
-            581,
-            561,
-            541,
-            521,
-            501,
-            481,
-            461,
-            441,
-            421,
-            401,
-            381,
-            361,
-            341,
-            321,
-            301,
-            281,
-            261,
-            241,
-            221,
-            201,
-            181,
-            161,
-            141,
-            121,
-            101,
-            81,
-            61,
-            41,
-            21,
-            1,
-        ]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         # get prompt text embeddings
-        prompt_ids = self.prepare_text_inputs(prompt)
-        prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
         # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
         batch_size = 1
         max_length = prompt_ids.shape[-1]
         if neg_prompt is None:
-            uncond_input = self.tokenizer(
-                [""] * batch_size,
-                padding="max_length",
-                max_length=max_length,
-                return_tensors="np",
-            ).input_ids
         else:
             neg_prompt_ids = self.prepare_text_inputs(neg_prompt)
-            uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(
-            uncond_input, params=params["text_encoder"]
-        )[0]
-        text_embeddings = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
-        controlnet_image = jnp.stack([controlnet_image[0]] * 2 * len(prngs))
-        return self._generate_starting_frames(
-            num_inference_steps,
-            params,
-            timesteps,
-            text_embeddings,
-            latents,
-            guidance_scale,
-            controlnet_image,
-            controlnet_conditioning_scale,
-        )
     def generate_video(
         self,
@@ -845,8 +533,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
-        xT=None,
-        smooth_bg_strength: float = 0.0,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
@@ -912,9 +600,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
-            controlnet_conditioning_scale = jnp.array(
-                [controlnet_conditioning_scale] * prompt_ids.shape[0]
-            )
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
@@ -928,9 +614,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                 num_inference_steps,
                 replicate_devices(guidance_scale),
                 replicate_devices(latents) if latents is not None else None,
-                replicate_devices(neg_prompt_ids)
-                if neg_prompt_ids is not None
-                else None,
                 replicate_devices(controlnet_conditioning_scale),
                 replicate_devices(xT) if xT is not None else None,
                 replicate_devices(smooth_bg_strength),
@@ -961,12 +645,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
-            images_uint8_casted = np.asarray(images_uint8_casted).reshape(
-                num_devices * batch_size, height, width, 3
-            )
-            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(
-                images_uint8_casted, safety_params, jit
-            )
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
@@ -979,15 +659,11 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
-        return FlaxStableDiffusionPipelineOutput(
-            images=images, nsfw_content_detected=has_nsfw_concept
-        )
     def prepare_text_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
-            raise ValueError(
-                f"`prompt` has to be of type `str` or `list` but is {type(prompt)}"
-            )
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
@@ -996,38 +672,27 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             return_tensors="np",
         )
         return text_input.input_ids
     def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
         if not isinstance(image, (Image.Image, list)):
-            raise ValueError(
-                f"image has to be of type `PIL.Image.Image` or list but is {type(image)}"
-            )
         if isinstance(image, Image.Image):
             image = [image]
-        processed_images = jnp.concatenate(
-            [preprocess(img, jnp.float32) for img in image]
-        )
         return processed_images
     def _get_has_nsfw_concepts(self, features, params):
         has_nsfw_concepts = self.safety_checker(features, params)
         return has_nsfw_concepts
     def _run_safety_checker(self, images, safety_model_params, jit=False):
         # safety_model_params should already be replicated when jit is True
         pil_images = [Image.fromarray(image) for image in images]
         features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
         if jit:
             features = shard(features)
-            has_nsfw_concepts = _p_get_has_nsfw_concepts(
-                self, features, safety_model_params
-            )
             has_nsfw_concepts = unshard(has_nsfw_concepts)
             safety_model_params = unreplicate(safety_model_params)
         else:
-            has_nsfw_concepts = self._get_has_nsfw_concepts(
-                features, safety_model_params
-            )
         images_was_copied = False
         for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
             if has_nsfw_concept:
@@ -1041,7 +706,6 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
                     " instead. Try again with a different prompt and/or seed."
                 )
         return images, has_nsfw_concepts
     def _generate(
         self,
         prompt_ids: jnp.array,
@@ -1053,8 +717,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         latents: Optional[jnp.array] = None,
         neg_prompt_ids: Optional[jnp.array] = None,
         controlnet_conditioning_scale: float = 1.0,
-        xT=None,
-        smooth_bg_strength: float = 0.0,
         motion_field_strength_x: float = 12,
         motion_field_strength_y: float = 12,
         t0: int = 44,
@@ -1063,9 +727,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         height, width = image.shape[-2:]
         video_length = image.shape[0]
         if height % 64 != 0 or width % 64 != 0:
-            raise ValueError(
-                f"`height` and `width` have to be divisible by 64 but are {height} and {width}."
-            )
         # get prompt text embeddings
         prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
@@ -1074,47 +736,30 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         max_length = prompt_ids.shape[-1]
         if neg_prompt_ids is None:
             uncond_input = self.tokenizer(
-                [""] * batch_size,
-                padding="max_length",
-                max_length=max_length,
-                return_tensors="np",
             ).input_ids
         else:
             uncond_input = neg_prompt_ids
-        negative_prompt_embeds = self.text_encoder(
-            uncond_input, params=params["text_encoder"]
-        )[0]
         context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
         image = jnp.concatenate([image] * 2)
         seed_t2vz, prng_seed = jax.random.split(prng_seed)
-        # get the latent following text to video zero
-        latents = self.text_to_video_zero(
-            params,
-            seed_t2vz,
-            text_embeddings=context,
-            video_length=video_length,
-            height=height,
-            width=width,
-            num_inference_steps=num_inference_steps,
-            guidance_scale=guidance_scale,
-            controlnet_image=image,
-            xT=xT,
-            smooth_bg_strength=smooth_bg_strength,
-            t0=t0,
-            t1=t1,
-            motion_field_strength_x=motion_field_strength_x,
-            motion_field_strength_y=motion_field_strength_y,
-            controlnet_conditioning_scale=controlnet_conditioning_scale,
-        )
         # scale and decode the image latents with vae
         latents = 1 / self.vae.config.scaling_factor * latents
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
-        video = self.vae.apply(
-            {"params": params["vae"]}, latents, method=self.vae.decode
-        ).sample
         video = (video / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
         return video
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
@@ -1129,8 +774,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
-        xT=None,
-        smooth_bg_strength: float = 0.0,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
@@ -1187,9 +832,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
-            controlnet_conditioning_scale = jnp.array(
-                [controlnet_conditioning_scale] * prompt_ids.shape[0]
-            )
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
@@ -1234,12 +877,8 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
-            images_uint8_casted = np.asarray(images_uint8_casted).reshape(
-                num_devices * batch_size, height, width, 3
-            )
-            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(
-                images_uint8_casted, safety_params, jit
-            )
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
@@ -1252,9 +891,7 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
-        return FlaxStableDiffusionPipelineOutput(
-            images=images, nsfw_content_detected=has_nsfw_concept
-        )
 # Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
@@ -1262,11 +899,11 @@ class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
 @partial(
     jax.pmap,
     in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0, 0, 0, 0, 0, None, None),
-    static_broadcasted_argnums=(0, 5, 14, 15),
 )
 def _p_generate(
     pipe,
-    prompt_ids,
     image,
     params,
     prng_seed,
@@ -1299,20 +936,52 @@ def _p_generate(
         t0,
         t1,
     )
 @partial(jax.pmap, static_broadcasted_argnums=(0,))
 def _p_get_has_nsfw_concepts(pipe, features, params):
     return pipe._get_has_nsfw_concepts(features, params)
 def unshard(x: jnp.ndarray):
     """Collapse the first two axes of `x` into a single leading axis.

     Equivalent to einops.rearrange(x, 'd b ... -> (d b) ...'): an input of
     shape (d, b, *rest) is returned with shape (d * b, *rest).
     """
     n_dev, per_dev = x.shape[:2]
     target_shape = (n_dev * per_dev,) + tuple(x.shape[2:])
     return x.reshape(target_shape)
 def preprocess(image, dtype):
     image = image.convert("RGB")
     w, h = image.size
@@ -1322,98 +991,61 @@ def preprocess(image, dtype):
     image = image[None].transpose(0, 3, 1, 2)
     return image
-def prepare_latents(
-    params,
-    prng,
-    batch_size,
-    num_channels_latents,
-    height,
-    width,
-    vae_scale_factor,
-    latents=None,
-):
-    shape = (
-        batch_size,
-        num_channels_latents,
-        1,
-        height // vae_scale_factor,
-        width // vae_scale_factor,
-    )  # b c f h w
     # scale the initial noise by the standard deviation required by the scheduler
     if latents is None:
         latents = jax.random.normal(prng, shape)
     latents = latents * params["scheduler"].init_noise_sigma
     return latents
 def coords_grid(batch, ht, wd):
     """Build a batched grid of integer pixel coordinates.

     Returns an array of shape (batch, 2, ht, wd). Channel 0 holds the column
     index (0..wd-1) of every pixel and channel 1 holds the row index
     (0..ht-1); the same grid is repeated `batch` times along axis 0.
     """
     rows, cols = jnp.meshgrid(jnp.arange(ht), jnp.arange(wd), indexing="ij")
     # Column (x) coordinates go first, then row (y) coordinates.
     xy = jnp.stack((cols, rows), axis=0)
     return jnp.repeat(xy[None], batch, axis=0)
 def adapt_pos_mirror(x, y, W, H):
-    # adapt the position, with mirror padding
-    x_w_mirror = ((x + W - 1) % (2 * (W - 1))) - W + 1
-    x_adapted = jnp.where(x_w_mirror > 0, x_w_mirror, -(x_w_mirror))
-    y_w_mirror = ((y + H - 1) % (2 * (H - 1))) - H + 1
-    y_adapted = jnp.where(y_w_mirror > 0, y_w_mirror, -(y_w_mirror))
-    return y_adapted, x_adapted
-def safe_get_zeropad(img, x, y, W, H):
-    return jnp.where((x < W) & (x > 0) & (y < H) & (y > 0), img[y, x], 0.0)
-def safe_get_mirror(img, x, y, W, H):
-    return img[adapt_pos_mirror(x, y, W, H)]
 @partial(jax.vmap, in_axes=(0, 0, None))
 @partial(jax.vmap, in_axes=(0, None, None))
-@partial(jax.vmap, in_axes=(None, 0, None))
 @partial(jax.vmap, in_axes=(None, 0, None))
 def grid_sample(latents, grid, method):
     # this is an alternative to torch.functional.nn.grid_sample in jax
     # this implementation is following the algorithm described @ https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html
     # but with coordinates scaled to the size of the image
     if method == "mirror":
-        return safe_get_mirror(
-            latents,
-            jnp.array(grid[0], dtype=jnp.int16),
-            jnp.array(grid[1], dtype=jnp.int16),
-            latents.shape[0],
-            latents.shape[1],
-        )
-    else:  # default is zero padding
-        return safe_get_zeropad(
-            latents,
-            jnp.array(grid[0], dtype=jnp.int16),
-            jnp.array(grid[1], dtype=jnp.int16),
-            latents.shape[0],
-            latents.shape[1],
-        )
 def bandw_vid(vid, threshold):
-    vid = jnp.max(vid, axis=1)
-    return jnp.where(vid > threshold, 1, 0)
 def mean_blur(vid, k):
-    window = jnp.ones((vid.shape[0], k, k)) / (k * k)
-    convolve = jax.vmap(
-        lambda img, kernel: jax.scipy.signal.convolve(img, kernel, mode="same")
-    )
-    smooth_vid = convolve(vid, window)
-    return smooth_vid
 def get_mask_pose(vid):
-    vid = bandw_vid(vid, 0.4)
-    l, h, w = vid.shape
-    vid = jax.image.resize(vid, (l, h // 8, w // 8), "nearest")
-    vid = bandw_vid(mean_blur(vid, 7)[:, None], threshold=0.01)
-    return vid / (jnp.max(vid) + 1e-4)
-    # return jax.image.resize(vid/(jnp.max(vid) + 1e-4), (l, h, w), "nearest")

 from PIL import Image
 from transformers import CLIPFeatureExtractor, CLIPTokenizer, FlaxCLIPTextModel
 from einops import rearrange, repeat
+from diffusers.models import FlaxAutoencoderKL, FlaxControlNetModel, FlaxUNet2DConditionModel
 from diffusers.schedulers import (
     FlaxDDIMScheduler,
     FlaxDPMSolverMultistepScheduler,
 from diffusers.utils import PIL_INTERPOLATION, logging, replace_example_docstring
 from diffusers.pipelines.pipeline_flax_utils import FlaxDiffusionPipeline
 from diffusers.pipelines.stable_diffusion import FlaxStableDiffusionPipelineOutput
+from diffusers.pipelines.stable_diffusion.safety_checker_flax import FlaxStableDiffusionSafetyChecker
 logger = logging.get_logger(__name__)  # pylint: disable=invalid-name
 """
 Text2Video-Zero:
  - Inputs: Prompt, Pose Control via mp4/gif, First Frame (?)
  - JAX implementation
  - 3DUnet to replace 2DUnetConditional
+"""
 def replicate_devices(array):
     """Prepend a new leading axis to `array` and repeat it `jax.device_count()` times.

     The result has shape (jax.device_count(), *array.shape), with every slice
     along axis 0 equal to `array`.
     """
     n_devices = jax.device_count()
     with_device_axis = jnp.expand_dims(array, 0)
     return jnp.repeat(with_device_axis, n_devices, axis=0)
+DEBUG = False # Set to True to use python for loop instead of jax.fori_loop for easier debugging
 EXAMPLE_DOC_STRING = """
     Examples:
         >>> output_images.save("generated_image.png")
         ```
 """
 class FlaxTextToVideoPipeline(FlaxDiffusionPipeline):
     def __init__(
         self,
         unet_vanilla,
         controlnet,
         scheduler: Union[
+            FlaxDDIMScheduler, FlaxPNDMScheduler, FlaxLMSDiscreteScheduler, FlaxDPMSolverMultistepScheduler
         ],
         safety_checker: FlaxStableDiffusionSafetyChecker,
         feature_extractor: CLIPFeatureExtractor,
         else:
             eps = jax.random.normal(prng, x0.shape, dtype=text_embeddings.dtype)
             alpha_vec = jnp.prod(params["scheduler"].common.alphas[t0:tMax])
+            xt = jnp.sqrt(alpha_vec) * x0 + \
+                jnp.sqrt(1-alpha_vec) * eps
             return xt
+    def DDIM_backward(self, params, num_inference_steps, timesteps, skip_t, t0, t1, do_classifier_free_guidance, text_embeddings, latents_local,
+                        guidance_scale, controlnet_image=None, controlnet_conditioning_scale=None):
+        scheduler_state = self.scheduler.set_timesteps(params["scheduler"], num_inference_steps)
         f = latents_local.shape[2]
         latents_local = rearrange(latents_local, "b c f h w -> (b f) c h w")
         latents = latents_local.copy()
         x_t0_1 = None
         x_t1_1 = None
+        max_timestep = len(timesteps)-1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, x_t0_1, x_t1_1, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            latent_model_input = jnp.concatenate(
+                [latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
+            te = jnp.stack([text_embeddings[0, :, :]]*f + [text_embeddings[-1,:,:]]*f)
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
+                    ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
+                noise_pred = noise_pred_uncond + guidance_scale * \
+                    (noise_pred_text - noise_pred_uncond)
             # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
+            x_t0_1 = jax.lax.select((step < max_timestep-1) & (timesteps[step+1] == t0), latents, x_t0_1)
+            x_t1_1 = jax.lax.select((step < max_timestep-1) & (timesteps[step+1] == t1), latents, x_t1_1)
             return (step + 1, latents, x_t0_1, x_t1_1, scheduler_state)
         latents_shape = latents.shape
         x_t0_1, x_t1_1 = jnp.zeros(latents_shape), jnp.zeros(latents_shape)
         def cond_fun(arg):
             step, latents, x_t0_1, x_t1_1, scheduler_state = arg
             return (step < skip_t) & (step < num_inference_steps)
         if DEBUG:
             step = 0
             while cond_fun((step, latents, x_t0_1, x_t1_1)):
+                step, latents, x_t0_1, x_t1_1, scheduler_state = while_body((step, latents, x_t0_1, x_t1_1, scheduler_state))
                 step = step + 1
         else:
+            _, latents, x_t0_1, x_t1_1, scheduler_state = jax.lax.while_loop(cond_fun, while_body, (0, latents, x_t0_1, x_t1_1, scheduler_state))
         latents = rearrange(latents, "(b f) c h w -> b c f h w", f=f)
         res = {"x0": latents.copy()}
         if x_t0_1 is not None:
             x_t1_1 = rearrange(x_t1_1, "(b f) c h w -> b c f  h w", f=f)
             res["x_t1_1"] = x_t1_1.copy()
         return res
     def warp_latents_independently(self, latents, reference_flow):
         _, _, H, W = reference_flow.shape
         b, _, f, h, w = latents.shape
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
+        coords_t0 = rearrange(coords_t0, 'f c h w -> f h w c')
+        latents_0 = rearrange(latents[0], 'c f h w -> f  c  h w')
         warped = grid_sample(latents_0, coords_t0, "mirror")
+        warped = rearrange(warped, '(b f) c h w -> b c f h w', f=f)
         return warped
     def warp_vid_independently(self, vid, reference_flow):
         coords_t0 = coords_t0.at[:, 1].set(coords_t0[:, 1] * h / H)
         f, c, _, _ = coords_t0.shape
         coords_t0 = jax.image.resize(coords_t0, (f, c, h, w), "linear")
+        coords_t0 = rearrange(coords_t0, 'f c h w -> f h w c')
         # latents_0 = rearrange(vid, 'c f h w -> f  c  h w')
         warped = grid_sample(vid, coords_t0, "zeropad")
         # warped = rearrange(warped, 'f c h w -> b c f h w', f=f)
         return warped
+    def create_motion_field(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, latents):
+        reference_flow = jnp.zeros(
+            (video_length-1, 2, 512, 512), dtype=latents.dtype)
         for fr_idx, frame_id in enumerate(frame_ids):
+            reference_flow = reference_flow.at[fr_idx, 0, :,
+                           :].set(motion_field_strength_x*(frame_id))
+            reference_flow = reference_flow.at[fr_idx, 1, :,
+                           :].set(motion_field_strength_y*(frame_id))
         return reference_flow
+    def create_motion_field_and_warp_latents(self, motion_field_strength_x, motion_field_strength_y, frame_ids, video_length, latents):
+        motion_field = self.create_motion_field(motion_field_strength_x=motion_field_strength_x,
+                                                motion_field_strength_y=motion_field_strength_y, latents=latents, video_length=video_length, frame_ids=frame_ids)
         for idx, latent in enumerate(latents):
+            latents = latents.at[idx].set(self.warp_latents_independently(
+                latent[None], motion_field)[0])
         return motion_field, latents
+    def text_to_video_zero(self, params,
+                           prng,
+                           text_embeddings,
+                           video_length: Optional[int],
+                           do_classifier_free_guidance = True,
+                           height: Optional[int] = None,
+                           width: Optional[int] = None,
+                           num_inference_steps: int = 50,
+                           guidance_scale: float = 7.5,
+                           num_videos_per_prompt: Optional[int] = 1,
+                           xT = None,
+                           smooth_bg_strength: float=0.,
+                           motion_field_strength_x: float = 12,
+                           motion_field_strength_y: float = 12,
+                           t0: int = 44,
+                           t1: int = 47,
+                           controlnet_image=None,
+                           controlnet_conditioning_scale=0,
+                           ):
         frame_ids = list(range(video_length))
         # Prepare timesteps
+        params["scheduler"] = self.scheduler.set_timesteps(params["scheduler"], num_inference_steps)
         timesteps = params["scheduler"].timesteps
         # Prepare latent variables
         num_channels_latents = self.unet.in_channels
         batch_size = 1
+        xT = prepare_latents(params, prng, batch_size * num_videos_per_prompt, num_channels_latents, height, width, self.vae_scale_factor, xT)
+        timesteps_ddpm = [981, 961, 941, 921, 901, 881, 861, 841, 821, 801, 781, 761, 741, 721,
+                            701, 681, 661, 641, 621, 601, 581, 561, 541, 521, 501, 481, 461, 441,
+                            421, 401, 381, 361, 341, 321, 301, 281, 261, 241, 221, 201, 181, 161,
+                            141, 121, 101,  81,  61,  41,  21,   1]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         x_t1_1 = None
         # Denoising loop
+        shape = (batch_size, num_channels_latents, 1, height //
+                self.vae.scaling_factor, width // self.vae.scaling_factor)
         #  perform ∆t backward steps by stable diffusion
+        ddim_res = self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=1000, t0=t0, t1=t1, do_classifier_free_guidance=do_classifier_free_guidance,
+                                text_embeddings=text_embeddings, latents_local=xT, guidance_scale=guidance_scale,
+                                controlnet_image=jnp.stack([controlnet_image[0]] * 2), controlnet_conditioning_scale=controlnet_conditioning_scale)
         x0 = ddim_res["x0"]
         # apply warping functions
             x_t0_1 = ddim_res["x_t0_1"]
         if "x_t1_1" in ddim_res:
             x_t1_1 = ddim_res["x_t1_1"]
+        x_t0_k = x_t0_1[:, :, :1, :, :].repeat(video_length-1, 2)
         reference_flow, x_t0_k = self.create_motion_field_and_warp_latents(
+            motion_field_strength_x=motion_field_strength_x, motion_field_strength_y=motion_field_strength_y, latents=x_t0_k, video_length=video_length, frame_ids=frame_ids[1:])
         # assuming t0=t1=1000, if t0 = 1000
         # DDPM forward for more motion freedom
+        ddpm_fwd = partial(self.DDPM_forward, params=params, prng=prng, x0=x_t0_k, t0=t0,
+                           tMax=t1, shape=shape, text_embeddings=text_embeddings)
+        x_t1_k = jax.lax.cond(t1 > t0,
+                              ddpm_fwd,
+                              lambda:x_t0_k
         )
         x_t1 = jnp.concatenate([x_t1_1, x_t1_k], axis=2)
         # backward steps by stable diffusion
+        #warp the controlnet image following the same flow defined for latent
         controlnet_video = controlnet_image[:video_length]
+        controlnet_video = controlnet_video.at[1:].set(self.warp_vid_independently(controlnet_video[1:], reference_flow))
+        controlnet_image = jnp.concatenate([controlnet_video]*2)
         smooth_bg = True
         if smooth_bg:
+            #latent shape: "b c f h w"
+            M_FG = repeat(get_mask_pose(controlnet_video), "f h w -> b c f h w", c=x_t1.shape[1], b=batch_size)
+            initial_bg = repeat(x_t1[:,:,0] * (1 - M_FG[:,:,0]), "b c h w -> b c f h w", f=video_length-1)
+            #warp the controlnet image following the same flow defined for latent #f c h w
+            initial_bg_warped = self.warp_latents_independently(initial_bg, reference_flow)
+            bgs = x_t1[:,:,1:] * (1 - M_FG[:,:,1:]) #initial background
+            initial_mask_warped = 1 - self.warp_latents_independently(repeat(M_FG[:,:,0], "b c h w -> b c f h w", f = video_length-1), reference_flow)
             # initial_mask_warped = 1 - warp_vid_independently(repeat(M_FG[:,:,0], "b c h w -> (b f) c h w", f = video_length-1), reference_flow)
             # initial_mask_warped = rearrange(initial_mask_warped, "(b f) c h w -> b c f h w", b=batch_size)
+            mask = (1 - M_FG[:,:,1:]) * initial_mask_warped
+            x_t1 = x_t1.at[:,:,1:].set( (1 - mask) * x_t1[:,:,1:] + mask * (initial_bg_warped * smooth_bg_strength + (1 - smooth_bg_strength) * bgs))
+        ddim_res = self.DDIM_backward(params, num_inference_steps=num_inference_steps, timesteps=timesteps, skip_t=t1, t0=-1, t1=-1, do_classifier_free_guidance=do_classifier_free_guidance,
+                                            text_embeddings=text_embeddings, latents_local=x_t1, guidance_scale=guidance_scale,
+                                            controlnet_image=controlnet_image, controlnet_conditioning_scale=controlnet_conditioning_scale,
+                                     )
         x0 = ddim_res["x0"]
         del ddim_res
         del x_t1
         del x_t1_k
         return x0
+    def denoise_latent(self, params, num_inference_steps, timesteps, do_classifier_free_guidance, text_embeddings, latents,
+                        guidance_scale, controlnet_image=None, controlnet_conditioning_scale=None):
+        scheduler_state = self.scheduler.set_timesteps(params["scheduler"], num_inference_steps)
         # f = latents_local.shape[2]
         # latents_local = rearrange(latents_local, "b c f h w -> (b f) c h w")
+        max_timestep = len(timesteps)-1
         timesteps = jnp.array(timesteps)
         def while_body(args):
             step, latents, scheduler_state = args
             t = jnp.array(scheduler_state.timesteps, dtype=jnp.int32)[step]
+            latent_model_input = jnp.concatenate(
+                [latents] * 2) if do_classifier_free_guidance else latents
             latent_model_input = self.scheduler.scale_model_input(
                 scheduler_state, latent_model_input, timestep=t
             )
             f = latents.shape[0]
+            te = jnp.stack([text_embeddings[0, :, :]]*f + [text_embeddings[-1,:,:]]*f)
             timestep = jnp.broadcast_to(t, latent_model_input.shape[0])
             if controlnet_image is not None:
                 down_block_res_samples, mid_block_res_sample = self.controlnet.apply(
                     jnp.array(latent_model_input),
                     jnp.array(timestep, dtype=jnp.int32),
                     encoder_hidden_states=te,
+                    ).sample
             # perform guidance
             if do_classifier_free_guidance:
                 noise_pred_uncond, noise_pred_text = jnp.split(noise_pred, 2, axis=0)
+                noise_pred = noise_pred_uncond + guidance_scale * \
+                    (noise_pred_text - noise_pred_uncond)
             # compute the previous noisy sample x_t -> x_t-1
+            latents, scheduler_state = self.scheduler.step(scheduler_state, noise_pred, t, latents).to_tuple()
             return (step + 1, latents, scheduler_state)
         def cond_fun(arg):
             step, latents, scheduler_state = arg
+            return (step < num_inference_steps)
         if DEBUG:
             step = 0
             while cond_fun((step, latents, scheduler_state)):
+                step, latents, scheduler_state = while_body((step, latents, scheduler_state))
                 step = step + 1
         else:
+            _, latents, scheduler_state = jax.lax.while_loop(cond_fun, while_body, (0, latents, scheduler_state))
         # latents = rearrange(latents, "(b f) c h w -> b c f h w", f=f)
         return latents
+    def generate_starting_frames(self,
+                                params,
+                                prngs: list, #list of prngs for each img
+                                prompt,
+                                neg_prompt,
+                                controlnet_image,
+                                do_classifier_free_guidance = True,
+                                num_inference_steps: int = 50,
+                                guidance_scale: float = 7.5,
+                                t0: int = 44,
+                                t1: int = 47,
+                                controlnet_conditioning_scale=1.,
+                                ):
         height, width = controlnet_image.shape[-2:]
         if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
+        shape = (self.unet.in_channels, height //
+        self.vae_scale_factor, width // self.vae_scale_factor) # c h w
         # scale the initial noise by the standard deviation required by the scheduler
+        # print(f"Generating {len(prngs)} first frames with prompt {prompt}, for {num_inference_steps} steps. PRNG seeds are: {prngs}")
+        latents = jnp.stack([jax.random.normal(prng, shape) for prng in prngs]) # b c h w
         latents = latents * params["scheduler"].init_noise_sigma
         timesteps = params["scheduler"].timesteps
+        timesteps_ddpm = [981, 961, 941, 921, 901, 881, 861, 841, 821, 801, 781, 761, 741, 721,
+                            701, 681, 661, 641, 621, 601, 581, 561, 541, 521, 501, 481, 461, 441,
+                            421, 401, 381, 361, 341, 321, 301, 281, 261, 241, 221, 201, 181, 161,
+                            141, 121, 101,  81,  61,  41,  21,   1]
         timesteps_ddpm.reverse()
         t0 = timesteps_ddpm[t0]
         t1 = timesteps_ddpm[t1]
         # get prompt text embeddings
+        prompt_ids = shard(self.prepare_text_inputs(prompt))
+        # prompt_embeds = jax.pmap( lambda prompt_ids, params:  )(prompt_ids, params)
+        @jax.pmap
+        def prepare_text(params, prompt_ids, uncond_input):
+            prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
+            negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
+            text_embeddings = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
+            return text_embeddings
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
         # implement this conditional `do_classifier_free_guidance = guidance_scale > 1.0`
         batch_size = 1
         max_length = prompt_ids.shape[-1]
         if neg_prompt is None:
+            uncond_input = shard(self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
+            ).input_ids)
         else:
             neg_prompt_ids = self.prepare_text_inputs(neg_prompt)
+            uncond_input = shard(neg_prompt_ids)
+        text_embeddings = prepare_text(params, prompt_ids, uncond_input)
+        controlnet_image = shard(jnp.stack([controlnet_image[0]] * len(prngs) * 2))
+        timesteps = shard(jnp.array(timesteps))
+        guidance_scale = shard(jnp.array(guidance_scale))
+        controlnet_conditioning_scale = shard(jnp.array(controlnet_conditioning_scale))
+        #latent is shape # b c h w
+        # vmap_gen_start_frame = jax.vmap(lambda latent: p_generate_starting_frames(self, num_inference_steps, params, timesteps, text_embeddings, shard(latent[None]), guidance_scale, controlnet_image, controlnet_conditioning_scale))
+        # decoded_latents = vmap_gen_start_frame(latents)
+        decoded_latents = p_generate_starting_frames(self, num_inference_steps, params, timesteps, text_embeddings, shard(latents), guidance_scale, controlnet_image, controlnet_conditioning_scale)
+        # print(f"shape output: {decoded_latents.shape}")
+        return unshard(decoded_latents)#[:, 0]
     def generate_video(
         self,
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
+        xT = None,
+        smooth_bg_strength: float=0.,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
+            controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
                 num_inference_steps,
                 replicate_devices(guidance_scale),
                 replicate_devices(latents) if latents is not None else None,
+                replicate_devices(neg_prompt_ids) if neg_prompt_ids is not None else None,
                 replicate_devices(controlnet_conditioning_scale),
                 replicate_devices(xT) if xT is not None else None,
                 replicate_devices(smooth_bg_strength),
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
     def prepare_text_inputs(self, prompt: Union[str, List[str]]):
         if not isinstance(prompt, (str, list)):
+            raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
         text_input = self.tokenizer(
             prompt,
             padding="max_length",
             return_tensors="np",
         )
         return text_input.input_ids
     def prepare_image_inputs(self, image: Union[Image.Image, List[Image.Image]]):
         if not isinstance(image, (Image.Image, list)):
+            raise ValueError(f"image has to be of type `PIL.Image.Image` or list but is {type(image)}")
         if isinstance(image, Image.Image):
             image = [image]
+        processed_images = jnp.concatenate([preprocess(img, jnp.float32) for img in image])
         return processed_images
     def _get_has_nsfw_concepts(self, features, params):
         has_nsfw_concepts = self.safety_checker(features, params)
         return has_nsfw_concepts
     def _run_safety_checker(self, images, safety_model_params, jit=False):
         # safety_model_params should already be replicated when jit is True
         pil_images = [Image.fromarray(image) for image in images]
         features = self.feature_extractor(pil_images, return_tensors="np").pixel_values
         if jit:
             features = shard(features)
+            has_nsfw_concepts = _p_get_has_nsfw_concepts(self, features, safety_model_params)
             has_nsfw_concepts = unshard(has_nsfw_concepts)
             safety_model_params = unreplicate(safety_model_params)
         else:
+            has_nsfw_concepts = self._get_has_nsfw_concepts(features, safety_model_params)
         images_was_copied = False
         for idx, has_nsfw_concept in enumerate(has_nsfw_concepts):
             if has_nsfw_concept:
                     " instead. Try again with a different prompt and/or seed."
                 )
         return images, has_nsfw_concepts
     def _generate(
         self,
         prompt_ids: jnp.array,
         latents: Optional[jnp.array] = None,
         neg_prompt_ids: Optional[jnp.array] = None,
         controlnet_conditioning_scale: float = 1.0,
+        xT = None,
+        smooth_bg_strength: float = 0.,
         motion_field_strength_x: float = 12,
         motion_field_strength_y: float = 12,
         t0: int = 44,
         height, width = image.shape[-2:]
         video_length = image.shape[0]
         if height % 64 != 0 or width % 64 != 0:
+            raise ValueError(f"`height` and `width` have to be divisible by 64 but are {height} and {width}.")
         # get prompt text embeddings
         prompt_embeds = self.text_encoder(prompt_ids, params=params["text_encoder"])[0]
         # TODO: currently it is assumed `do_classifier_free_guidance = guidance_scale > 1.0`
         max_length = prompt_ids.shape[-1]
         if neg_prompt_ids is None:
             uncond_input = self.tokenizer(
+                [""] * batch_size, padding="max_length", max_length=max_length, return_tensors="np"
             ).input_ids
         else:
             uncond_input = neg_prompt_ids
+        negative_prompt_embeds = self.text_encoder(uncond_input, params=params["text_encoder"])[0]
         context = jnp.concatenate([negative_prompt_embeds, prompt_embeds])
         image = jnp.concatenate([image] * 2)
         seed_t2vz, prng_seed = jax.random.split(prng_seed)
+        #get the latent following text to video zero
+        latents = self.text_to_video_zero(params, seed_t2vz, text_embeddings=context, video_length=video_length,
+                                          height=height, width = width, num_inference_steps=num_inference_steps,
+                                          guidance_scale=guidance_scale, controlnet_image=image,
+                                          xT=xT, smooth_bg_strength=smooth_bg_strength, t0=t0, t1=t1,
+                                          motion_field_strength_x=motion_field_strength_x,
+                                          motion_field_strength_y=motion_field_strength_y,
+                                          controlnet_conditioning_scale=controlnet_conditioning_scale
+                                          )
         # scale and decode the image latents with vae
         latents = 1 / self.vae.config.scaling_factor * latents
         latents = rearrange(latents, "b c f h w -> (b f) c h w")
+        video = self.vae.apply({"params": params["vae"]}, latents, method=self.vae.decode).sample
         video = (video / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
         return video
     @replace_example_docstring(EXAMPLE_DOC_STRING)
     def __call__(
         self,
         controlnet_conditioning_scale: Union[float, jnp.array] = 1.0,
         return_dict: bool = True,
         jit: bool = False,
+        xT = None,
+        smooth_bg_strength: float = 0.,
         motion_field_strength_x: float = 3,
         motion_field_strength_y: float = 4,
         t0: int = 44,
         if isinstance(controlnet_conditioning_scale, float):
             # Convert to a tensor so each device gets a copy. Follow the prompt_ids for
             # shape information, as they may be sharded (when `jit` is `True`), or not.
+            controlnet_conditioning_scale = jnp.array([controlnet_conditioning_scale] * prompt_ids.shape[0])
             if len(prompt_ids.shape) > 2:
                 # Assume sharded
                 controlnet_conditioning_scale = controlnet_conditioning_scale[:, None]
             safety_params = params["safety_checker"]
             images_uint8_casted = (images * 255).round().astype("uint8")
             num_devices, batch_size = images.shape[:2]
+            images_uint8_casted = np.asarray(images_uint8_casted).reshape(num_devices * batch_size, height, width, 3)
+            images_uint8_casted, has_nsfw_concept = self._run_safety_checker(images_uint8_casted, safety_params, jit)
             images = np.asarray(images)
             # block images
             if any(has_nsfw_concept):
             has_nsfw_concept = False
         if not return_dict:
             return (images, has_nsfw_concept)
+        return FlaxStableDiffusionPipelineOutput(images=images, nsfw_content_detected=has_nsfw_concept)
 # Static argnums are pipe, num_inference_steps. A change would trigger recompilation.
 @partial(
     jax.pmap,
     in_axes=(None, 0, 0, 0, 0, None, 0, 0, 0, 0, 0, 0, 0, 0, None, None),
+    static_broadcasted_argnums=(0, 5, 14, 15)
 )
 def _p_generate(
     pipe,
+    prompt_ids,
     image,
     params,
     prng_seed,
         t0,
         t1,
     )
 # pmap-parallel entry point for the safety check. Argument 0 (`pipe`) is declared
 # static/broadcast, so it is not mapped; `features` and `params` fall under
 # pmap's default `in_axes`.
 @partial(jax.pmap, static_broadcasted_argnums=(0,))
 def _p_get_has_nsfw_concepts(pipe, features, params):
     """Call ``pipe._get_has_nsfw_concepts(features, params)`` under ``jax.pmap``."""
     return pipe._get_has_nsfw_concepts(features, params)
@partial(
    jax.pmap,
    in_axes=(None, None, 0, 0, 0, 0, 0, 0, 0),
    static_broadcasted_argnums=(0, 1),
)
def p_generate_starting_frames(
    pipe,
    num_inference_steps,
    params,
    timesteps,
    text_embeddings,
    latents,
    guidance_scale,
    controlnet_image,
    controlnet_conditioning_scale,
):
    """Denoise latents and decode them to images, parallelised with ``jax.pmap``.

    ``pipe`` and ``num_inference_steps`` are static, broadcast arguments; every
    other argument is mapped over its leading axis.

    Args:
        pipe: Pipeline object providing ``denoise_latent`` and ``vae``.
        num_inference_steps: Forwarded to ``pipe.denoise_latent``.
        params: Parameter tree; ``params["vae"]`` is used for decoding.
        timesteps: Forwarded to ``pipe.denoise_latent``.
        text_embeddings: Forwarded to ``pipe.denoise_latent``.
        latents: Initial latents to denoise.
        guidance_scale: Forwarded to ``pipe.denoise_latent``.
        controlnet_image: Forwarded to ``pipe.denoise_latent``.
        controlnet_conditioning_scale: Forwarded to ``pipe.denoise_latent``.

    Returns:
        Decoded images clipped to ``[0, 1]`` with axes permuted by
        ``transpose(0, 2, 3, 1)`` (presumably channels-first -> channels-last;
        confirm against the VAE decoder's output layout).
    """
    # Classifier-free guidance is always enabled for the starting frames.
    latents = pipe.denoise_latent(
        params,
        num_inference_steps=num_inference_steps,
        timesteps=timesteps,
        do_classifier_free_guidance=True,
        text_embeddings=text_embeddings,
        latents=latents,
        guidance_scale=guidance_scale,
        controlnet_image=controlnet_image,
        controlnet_conditioning_scale=controlnet_conditioning_scale,
    )

    # scale and decode the image latents with vae
    latents = 1 / pipe.vae.config.scaling_factor * latents
    imgs = pipe.vae.apply({"params": params["vae"]}, latents, method=pipe.vae.decode).sample

    # Map the decoder output from [-1, 1] to [0, 1] before permuting the axes.
    imgs = (imgs / 2 + 0.5).clip(0, 1).transpose(0, 2, 3, 1)
    return imgs
 def unshard(x: jnp.ndarray):
     """Merge the two leading axes of ``x`` into one, keeping the rest unchanged.

     Equivalent to ``einops.rearrange(x, 'd b ... -> (d b) ...')``.
     """
     device_count, per_device, *trailing = x.shape
     return x.reshape(device_count * per_device, *trailing)
 def preprocess(image, dtype):
     image = image.convert("RGB")
     w, h = image.size
     image = image[None].transpose(0, 3, 1, 2)
     return image
def prepare_latents(params, prng, batch_size, num_channels_latents, height, width, vae_scale_factor, latents=None):
    """Return initial latents scaled by the scheduler's ``init_noise_sigma``.

    When ``latents`` is ``None``, standard-normal noise is drawn with ``prng`` in
    the layout (batch, channels, 1 frame, height // vae_scale_factor,
    width // vae_scale_factor). Supplied ``latents`` are used as given; their
    shape is not checked.
    """
    # b c f h w
    noise_shape = (
        batch_size,
        num_channels_latents,
        1,
        height // vae_scale_factor,
        width // vae_scale_factor,
    )
    initial = jax.random.normal(prng, noise_shape) if latents is None else latents

    # scale the initial noise by the standard deviation required by the scheduler
    return initial * params["scheduler"].init_noise_sigma
 def coords_grid(batch, ht, wd):
     """Build a ``(batch, 2, ht, wd)`` grid of integer pixel coordinates.

     Channel 0 holds the column (x) index and channel 1 the row (y) index; the
     same grid is repeated ``batch`` times along the leading axis.
     """
     rows, cols = jnp.meshgrid(jnp.arange(ht), jnp.arange(wd), indexing="ij")
     # x first, then y
     xy = jnp.stack((cols, rows), axis=0)
     return jnp.repeat(xy[None], batch, axis=0)
 def adapt_pos_mirror(x, y, W, H):
     """Reflect integer coordinates back into ``[0, W-1]`` x ``[0, H-1]``.

     Implements mirror padding without repeating the border pixel: ``-1`` maps
     to ``1`` and ``W`` maps to ``W - 2``.

     Returns:
         ``(y_adapted, x_adapted)``, ordered for direct use as a ``[row, col]`` index.
     """
     # Fold each coordinate into one period of length 2*(size-1), centred on 0,
     # then take the magnitude to reflect the negative half.
     x_folded = ((x + W - 1) % (2 * (W - 1))) - W + 1
     y_folded = ((y + H - 1) % (2 * (H - 1))) - H + 1
     return jnp.abs(y_folded), jnp.abs(x_folded)


 def safe_get_zeropad(img, x, y, W, H):
     """Return ``img[y, x]`` when ``(x, y)`` lies inside the image, else ``0``.

     ``W`` bounds ``x`` (columns) and ``H`` bounds ``y`` (rows). Index 0 is a
     valid position on both axes, so the lower bounds are inclusive.
     """
     inside = (x >= 0) & (x < W) & (y >= 0) & (y < H)
     return jnp.where(inside, img[y, x], 0.)


 def safe_get_mirror(img, x, y, W, H):
     """Return ``img`` sampled at ``(x, y)`` after mirror-reflecting the position."""
     return img[adapt_pos_mirror(x, y, W, H)]


 @partial(jax.vmap, in_axes=(0, 0, None))
 @partial(jax.vmap, in_axes=(0, None, None))
 @partial(jax.vmap, in_axes=(None, 0, None))
 @partial(jax.vmap, in_axes=(None, 0, None))
 def grid_sample(latents, grid, method):
     """Sample ``latents`` at the integer-truncated positions given by ``grid``.

     This is an alternative to ``torch.nn.functional.grid_sample`` in jax,
     following the algorithm described at
     https://pytorch.org/docs/stable/generated/torch.nn.functional.grid_sample.html
     but with coordinates scaled to the size of the image.

     Inside the four ``vmap`` wrappers ``latents`` is indexed as a 2-D
     ``[row, col]`` array and ``grid`` as an ``(x, y)`` pair. This implies
     callers pass ``latents`` as (N, C, H, W) and ``grid`` as (N, H_out, W_out, 2);
     TODO confirm against the call sites.

     Args:
         latents: Values to sample from.
         grid: Sampling positions; component 0 is x (column), component 1 is y (row).
         method: ``"mirror"`` for mirror padding; anything else selects zero padding.
     """
     # Axis 0 of the per-channel image is rows (height), axis 1 is columns (width).
     height, width = latents.shape[0], latents.shape[1]
     x = jnp.array(grid[0], dtype=jnp.int16)
     y = jnp.array(grid[1], dtype=jnp.int16)
     if method == "mirror":
         return safe_get_mirror(latents, x, y, width, height)
     # default is zero padding
     return safe_get_zeropad(latents, x, y, width, height)
 def bandw_vid(vid, threshold):
     """Binarize ``vid``: take the maximum over axis 1, then threshold it.

     Returns an array of 1s where the maximum exceeds ``threshold`` and 0s elsewhere.
     """
     peak = jnp.max(vid, axis=1)
     is_above = peak > threshold
     return jnp.where(is_above, 1, 0)
 def mean_blur(vid, k):
     """Apply a ``k`` x ``k`` mean (box) filter to every frame of ``vid``.

     Each frame along axis 0 is convolved with its own uniform kernel using
     ``mode='same'``, so the output has the same shape as ``vid``.
     """
     kernels = jnp.ones((vid.shape[0], k, k)) / (k * k)

     def _blur_frame(frame, kernel):
         return jax.scipy.signal.convolve(frame, kernel, mode='same')

     return jax.vmap(_blur_frame)(vid, kernels)
 def get_mask_pose(vid):
     """Derive a blurred, re-binarized mask from ``vid`` at 1/8 spatial resolution.

     Steps: binarize at 0.4 (maximum over axis 1), downsample both spatial axes
     by 8 with nearest-neighbour resizing, blur with a 7x7 mean filter,
     binarize again at 0.01, and divide by the maximum (plus ``1e-4``).
     The result is not resized back to the input resolution.
     """
     binary = bandw_vid(vid, 0.4)
     n_frames, height, width = binary.shape
     small = jax.image.resize(binary, (n_frames, height // 8, width // 8), "nearest")
     # bandw_vid reduces over axis 1, so give the blurred frames a singleton axis there.
     blurred = mean_blur(small, 7)[:, None]
     mask = bandw_vid(blurred, threshold=0.01)
     return mask / (jnp.max(mask) + 1e-4)

utils/gradio_utils.py CHANGED Viewed

@@ -3,11 +3,15 @@ import os
 # App Pose utils
 def motion_to_video_path(motion):
     videos = [
         "__assets__/dance1_corr.mp4",
         "__assets__/dance2_corr.mp4",
         "__assets__/dance3_corr.mp4",
         "__assets__/dance4_corr.mp4",
-        "__assets__/dance5_corr.mp4"
     ]
     if len(motion.split(" ")) > 1 and motion.split(" ")[1].isnumeric():
         id = int(motion.split(" ")[1]) - 1

 # App Pose utils
 def motion_to_video_path(motion):
     videos = [
+        "__assets__/walk_01.mp4",
+        "__assets__/walk_02.mp4",
+        "__assets__/walk_03.mp4",
+        "__assets__/run.mp4",
         "__assets__/dance1_corr.mp4",
         "__assets__/dance2_corr.mp4",
         "__assets__/dance3_corr.mp4",
         "__assets__/dance4_corr.mp4",
+        "__assets__/dance5_corr.mp4",
     ]
     if len(motion.split(" ")) > 1 and motion.split(" ")[1].isnumeric():
         id = int(motion.split(" ")[1]) - 1

webui/app_control_animation.py CHANGED Viewed

@@ -6,23 +6,35 @@ from utils.hf_utils import get_model_list
 huggingspace_name = os.environ.get("SPACE_AUTHOR_NAME")
 on_huggingspace = huggingspace_name if huggingspace_name is not None else False
-examples = [
-    ["an astronaut waving the arm on the moon"],
-    ["a sloth surfing on a wakeboard"],
-    ["an astronaut walking on a street"],
-    ["a cute cat walking on grass"],
-    ["a horse is galloping on a street"],
-    ["an astronaut is skiing down the hill"],
-    ["a gorilla walking alone down the street"],
-    ["a gorilla dancing on times square"],
-    ["A panda dancing dancing like crazy on Times Square"],
-]
 def on_video_path_update(evt: gr.EventData):
     return f"Selection: **{evt._data}**"
 def pose_gallery_callback(evt: gr.SelectData):
     return f"Motion {evt.index+1}"
@@ -134,31 +146,37 @@ def create_demo(model: ControlAnimationModel):
                     gallery_pose_sequence = gr.Gallery(
                         label="Pose Sequence",
                         value=[
-                            ("__assets__/dance1.gif", "Motion 1"),
-                            ("__assets__/dance2.gif", "Motion 2"),
-                            ("__assets__/dance3.gif", "Motion 3"),
-                            ("__assets__/dance4.gif", "Motion 4"),
-                            ("__assets__/dance5.gif", "Motion 5"),
                         ],
-                    ).style(grid=3, columns=3, rows=1, object_fit="contain", height="auto")
                     input_video_path = gr.Textbox(
                         label="Pose Sequence", visible=False, value="Motion 1"
                     )
                     pose_sequence_selector = gr.Markdown("Pose Sequence: **Motion 1**")
-            with gr.Column(visible=True) as frame_selection_view:
-                initial_frames = gr.Gallery(
-                    label="Initial Frames", show_label=False
-                ).style(grid=4, columns=4, rows=1, object_fit="contain", preview=True)
-                gr.Markdown("Select an initial frame to start your animation with.")
-                gen_animation_button = gr.Button(
-                    value="Select Initial Frame & Generate Animation",
-                    variant="secondary",
-                )
-            with gr.Column(visible=False) as animation_view:
-                result = gr.Video(label="Generated Video")
         with gr.Box(visible=False):
             initial_frame_index = gr.Number(
@@ -191,17 +209,17 @@ def create_demo(model: ControlAnimationModel):
             seed,
         ]
-        def submit_select(initial_frame_index: int):
-            if initial_frame_index != -1:  # More to next step
-                return {
-                    frame_selection_view: gr.update(visible=False),
-                    animation_view: gr.update(visible=True),
-                }
-            return {
-                frame_selection_view: gr.update(visible=True),
-                animation_view: gr.update(visible=False),
-            }
         gen_frames_button.click(
             fn=model.generate_initial_frames,
@@ -209,12 +227,18 @@ def create_demo(model: ControlAnimationModel):
             outputs=initial_frames,
         )
         gen_animation_button.click(
-            fn=submit_select,
-            inputs=initial_frame_index,
-            outputs=[frame_selection_view, animation_view],
-        ).then(
-            fn=None,
             inputs=animation_inputs,
             outputs=result,
         )
@@ -227,4 +251,12 @@ def create_demo(model: ControlAnimationModel):
         #             cache_examples=on_huggingspace,
         # )
     return demo

 # Name of the Space author when running on Hugging Face Spaces (read from the
 # SPACE_AUTHOR_NAME environment variable), otherwise None.
 huggingspace_name = os.environ.get("SPACE_AUTHOR_NAME")
 # Holds the author name itself (truthy) when the variable is set, else False.
 on_huggingspace = huggingspace_name if huggingspace_name is not None else False
 # One example row for `gr.Examples`; its values are matched positionally to the
 # components passed as `inputs` to that call.
+examples = [["A surfer in miami walking by the beach",
+            None,
+            "Motion 3",
+            None,
+            3,
+            0,
+            None,
+            None,
+            None,
+            None,
+            None,
+            None,
+            0],
+            ]
 # Earlier prompt-only examples, kept for reference.
+# examples = [
+#     ["an astronaut waving the arm on the moon"],
+#     ["a sloth surfing on a wakeboard"],
+#     ["an astronaut walking on a street"],
+#     ["a cute cat walking on grass"],
+#     ["a horse is galloping on a street"],
+#     ["an astronaut is skiing down the hill"],
+#     ["a gorilla walking alone down the street"],
+#     ["a gorilla dancing on times square"],
+#     ["A panda dancing dancing like crazy on Times Square"],
+# ]
 def on_video_path_update(evt: gr.EventData):
     """Format the event's payload as a bold Markdown "Selection" label."""
     # NOTE(review): reads the event's private `_data` attribute — confirm this is
     # still available in the Gradio version in use.
     return f"Selection: **{evt._data}**"
 def pose_gallery_callback(evt: gr.SelectData):
     """Return the "Motion N" label for the selected gallery item."""
     # `evt.index` is incremented by one so the result lines up with the
     # 1-based "Motion N" captions used for the gallery entries.
     return f"Motion {evt.index+1}"
                     gallery_pose_sequence = gr.Gallery(
                         label="Pose Sequence",
                         value=[
+                            ("__assets__/walk_01.gif", "Motion 1"),
+                            ("__assets__/walk_02.gif", "Motion 2"),
+                            ("__assets__/walk_03.gif", "Motion 3"),
+                            ("__assets__/run.gif", "Motion 4"),
+                            ("__assets__/dance1.gif", "Motion 5"),
+                            ("__assets__/dance2.gif", "Motion 6"),
+                            ("__assets__/dance3.gif", "Motion 7"),
+                            ("__assets__/dance4.gif", "Motion 8"),
+                            ("__assets__/dance5.gif", "Motion 9"),
                         ],
+                    ).style(columns=3)
                     input_video_path = gr.Textbox(
                         label="Pose Sequence", visible=False, value="Motion 1"
                     )
                     pose_sequence_selector = gr.Markdown("Pose Sequence: **Motion 1**")
+            with gr.Row():
+                with gr.Column(visible=True) as frame_selection_view:
+                    initial_frames = gr.Gallery(
+                        label="Initial Frames", show_label=False
+                    ).style(columns=4, rows=1, object_fit="contain", preview=True)
+                    gr.Markdown("Select an initial frame to start your animation with.")
+                    gen_animation_button = gr.Button(
+                        value="Select Initial Frame & Generate Animation",
+                        variant="secondary",
+                    )
+                with gr.Column(visible=True) as animation_view:
+                    result = gr.Image(label="Generated Video")
         with gr.Box(visible=False):
             initial_frame_index = gr.Number(
             seed,
         ]
+        # def submit_select(initial_frame_index: int):
+        #     if initial_frame_index != -1:  # More to next step
+        #         return {
+        #             frame_selection_view: gr.update(visible=False),
+        #             animation_view: gr.update(visible=True),
+        #         }
+        #     return {
+        #         frame_selection_view: gr.update(visible=True),
+        #         animation_view: gr.update(visible=False),
+        #     }
         gen_frames_button.click(
             fn=model.generate_initial_frames,
             outputs=initial_frames,
         )
+        # gen_animation_button.click(
+        #     fn=submit_select,
+        #     inputs=initial_frame_index,
+        #     outputs=[frame_selection_view, animation_view],
+        # ).then(
+        #     fn=model.generate_animation,
+        #     inputs=animation_inputs,
+        #     outputs=result,
+        # )
         gen_animation_button.click(
+            fn=model.generate_animation,
             inputs=animation_inputs,
             outputs=result,
         )
         #             cache_examples=on_huggingspace,
         # )
+        gr.Examples(examples=examples,
+                    inputs=animation_inputs,
+                    outputs=result,
+                    fn=model.generate_animation,
+                    cache_examples=on_huggingspace,
+                    run_on_click=True,
+                    )
     return demo