wjh committed on
Commit
67e6974
1 Parent(s): c464026
app.py ADDED
@@ -0,0 +1,432 @@
import os
from typing import Optional

import gradio as gr
import numpy as np
import pandas as pd
import torch
import user_history
from PIL import Image
from scipy.stats import beta as beta_distribution

from pipeline_interpolated_sdxl import InterpolationStableDiffusionXLPipeline
from pipeline_interpolated_stable_diffusion import InterpolationStableDiffusionPipeline

os.environ["TOKENIZERS_PARALLELISM"] = "false"

title = r"""
<h1 align="center">PAID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion</h1>
"""

description = r"""
<b>Official 🤗 Gradio demo</b> for <a href='https://github.com/QY-H00/attention-interpolation-diffusion/tree/public' target='_blank'><b>PAID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion</b></a>.<br>
How to use:<br>
1. Input prompt 1 and prompt 2.
2. (Optional) Input the guidance prompt and negative prompt.
3. (Optional) Change the interpolation parameters and check the Beta distribution.
4. Click the <b>Generate</b> button to begin generating images.
5. Enjoy! 😊"""

article = r"""
---
✒️ **Citation**
<br>
If you found this demo/our paper useful, please consider citing:
```bibtex
@article{he2024paid,
  title={PAID: (Prompt-guided) Attention Interpolation of Text-to-Image Diffusion},
  author={He, Qiyuan and Wang, Jinghao and Liu, Ziwei and Yao, Angela},
  journal={},
  year={2024}
}
```
📧 **Contact**
<br>
If you have any questions, please feel free to open an issue in our <a href='https://github.com/QY-H00/attention-interpolation-diffusion/tree/public' target='_blank'><b>Github Repo</b></a> or reach us directly at <b>qhe@u.nus.edu.sg</b>.
"""

MAX_SEED = np.iinfo(np.int32).max
CACHE_EXAMPLES = False
USE_TORCH_COMPILE = False
ENABLE_CPU_OFFLOAD = os.getenv("ENABLE_CPU_OFFLOAD") == "1"
PREVIEW_IMAGES = False

dtype = torch.float32
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipeline = InterpolationStableDiffusionPipeline(
    repo_name="runwayml/stable-diffusion-v1-5",
    guidance_scale=10.0,
    scheduler_name="unipc",
)
pipeline.to(device, dtype=dtype)


def change_model_fn(model_name: str) -> None:
    global pipeline
    name_mapping = {
        "SD1.4-512": "CompVis/stable-diffusion-v1-4",
        "SD1.5-512": "runwayml/stable-diffusion-v1-5",
        "SD2.1-768": "stabilityai/stable-diffusion-2-1",
        "SDXL-1024": "stabilityai/stable-diffusion-xl-base-1.0",
    }
    if "XL" not in model_name:
        pipeline = InterpolationStableDiffusionPipeline(
            repo_name=name_mapping[model_name],
            guidance_scale=10.0,
            scheduler_name="unipc",
        )
        pipeline.to(device, dtype=dtype)
    else:
        pipeline = InterpolationStableDiffusionXLPipeline.from_pretrained(
            name_mapping[model_name]
        )
        pipeline.to(device, dtype=dtype)


def save_image(img, index):
    unique_name = f"{index}.png"
    img = Image.fromarray(img)
    img.save(unique_name)
    return unique_name


def generate_beta_tensor(
    size: int, alpha: float = 3.0, beta: float = 3.0
) -> np.ndarray:
    prob_values = [i / (size - 1) for i in range(size)]
    inverse_cdf_values = beta_distribution.ppf(prob_values, alpha, beta)
    return inverse_cdf_values


def plot_gemma_fn(alpha: float, beta: float, size: int) -> pd.DataFrame:
    beta_ppf = generate_beta_tensor(size=size, alpha=int(alpha), beta=int(beta))
    return pd.DataFrame(
        {
            "interpolation index": [i for i in range(size)],
            "coefficient": beta_ppf.tolist(),
        }
    )


def get_example() -> list:
    case = [
        [
            "A photo of dog, best quality, extremely detailed",
            "A photo of car, best quality, extremely detailed",
            3,
            6,
            3,
            "A photo of a dog driving a car, logical, best quality, extremely detailed",
            "monochrome, lowres, bad anatomy, worst quality, low quality",
            "SD1.5-512",
            6.1 / 50,
            10,
            50,
            "fused_inner",
            "self",
            1002,
            True,
        ]
    ]
    return case


def dynamic_gallery_fn(interpolation_size: int):
    return gr.Gallery(
        label="Result", show_label=False, rows=1, columns=interpolation_size
    )


@torch.no_grad()
def generate(
    prompt1: str,
    prompt2: str,
    guidance_prompt: Optional[str] = None,
    negative_prompt: str = "",
    warmup_ratio: float = 0.5,
    guidance_scale: float = 10,
    early: str = "fused_outer",
    late: str = "self",
    alpha: float = 4.0,
    beta: float = 4.0,
    interpolation_size: int = 3,
    seed: int = 0,
    same_latent: bool = True,
    num_inference_steps: int = 50,
    progress=gr.Progress(),
) -> np.ndarray:
    global pipeline
    generator = (
        torch.cuda.manual_seed(seed)
        if torch.cuda.is_available()
        else torch.manual_seed(seed)
    )
    latent1 = pipeline.generate_latent(generator=generator)
    latent1 = latent1.to(device=pipeline.unet.device, dtype=pipeline.unet.dtype)
    if same_latent:
        latent2 = latent1.clone()
    else:
        latent2 = pipeline.generate_latent(generator=generator)
        latent2 = latent2.to(device=pipeline.unet.device, dtype=pipeline.unet.dtype)
    betas = generate_beta_tensor(size=interpolation_size, alpha=alpha, beta=beta)
    for i in progress.tqdm(
        range(interpolation_size - 2),
        desc=(
            f"Generating {interpolation_size - 2} images"
            if interpolation_size > 3
            else "Generating 1 image"
        ),
    ):
        it = betas[i + 1].item()
        images = pipeline.interpolate_single(
            it,
            latent_start=latent1,
            latent_end=latent2,
            prompt_start=prompt1,
            prompt_end=prompt2,
            guide_prompt=guidance_prompt,
            num_inference_steps=num_inference_steps,
            warmup_ratio=warmup_ratio,
            early=early,
            late=late,
            negative_prompt=negative_prompt,
            guidance_scale=guidance_scale,
        )
        if interpolation_size == 3:
            final_images = images
            break
        if i == 0:
            final_images = images[:2]
        elif i == interpolation_size - 3:
            final_images = np.concatenate([final_images, images[1:]], axis=0)
        else:
            final_images = np.concatenate([final_images, images[1:2]], axis=0)
    return final_images


interpolation_size = None

with gr.Blocks() as demo:
    gr.Markdown(title)
    gr.Markdown(description)
    with gr.Group():
        prompt1 = gr.Text(
            label="Prompt 1",
            max_lines=3,
            placeholder="Enter the First Prompt",
            interactive=True,
            value="A photo of dog, best quality, extremely detailed",
        )
        prompt2 = gr.Text(
            label="Prompt 2",
            max_lines=3,
            placeholder="Enter the Second Prompt",
            interactive=True,
            value="A photo of car, best quality, extremely detailed",
        )
        result = gr.Gallery(label="Result", show_label=False, rows=1, columns=3)
    generate_button = gr.Button("Generate", variant="primary")
    with gr.Accordion("Advanced options", open=True):
        with gr.Group():
            with gr.Row():
                with gr.Column():
                    interpolation_size = gr.Slider(
                        label="Interpolation Size",
                        minimum=3,
                        maximum=15,
                        step=1,
                        value=3,
                        info="Interpolation size includes the start and end images",
                    )
                    alpha = gr.Slider(
                        label="alpha",
                        minimum=1,
                        maximum=50,
                        step=0.1,
                        value=6.0,
                    )
                    beta = gr.Slider(
                        label="beta",
                        minimum=1,
                        maximum=50,
                        step=0.1,
                        value=3.0,
                    )
                gamma_plot = gr.LinePlot(
                    x="interpolation index",
                    y="coefficient",
                    title="Beta Distribution with Sampled Points",
                    height=500,
                    width=400,
                    overlay_point=True,
                    tooltip=["coefficient", "interpolation index"],
                    interactive=False,
                    show_label=False,
                )
                gamma_plot.change(
                    plot_gemma_fn,
                    inputs=[
                        alpha,
                        beta,
                        interpolation_size,
                    ],
                    outputs=gamma_plot,
                )
        with gr.Group():
            guidance_prompt = gr.Text(
                label="Guidance prompt",
                max_lines=3,
                placeholder="Enter a Guidance Prompt",
                interactive=True,
                value="A photo of a dog driving a car, logical, best quality, extremely detailed",
            )
            negative_prompt = gr.Text(
                label="Negative prompt",
                max_lines=3,
                placeholder="Enter a Negative Prompt",
                interactive=True,
                value="monochrome, lowres, bad anatomy, worst quality, low quality",
            )
        with gr.Row():
            with gr.Column():
                warmup_ratio = gr.Slider(
                    label="Warmup Ratio",
                    minimum=0.02,
                    maximum=1,
                    step=0.01,
                    value=0.122,
                    interactive=True,
                )
                guidance_scale = gr.Slider(
                    label="Guidance Scale",
                    minimum=0,
                    maximum=50,
                    step=0.1,
                    value=10,
                    interactive=True,
                )
            with gr.Column():
                early = gr.Dropdown(
                    label="Early stage attention type",
                    choices=[
                        "pure_inner",
                        "fused_inner",
                        "pure_outer",
                        "fused_outer",
                        "self",
                    ],
                    value="fused_outer",
                    type="value",
                    interactive=True,
                )
                late = gr.Dropdown(
                    label="Late stage attention type",
                    choices=[
                        "pure_inner",
                        "fused_inner",
                        "pure_outer",
                        "fused_outer",
                        "self",
                    ],
                    value="self",
                    type="value",
                    interactive=True,
                )
                num_inference_steps = gr.Slider(
                    label="Inference Steps",
                    minimum=25,
                    maximum=50,
                    step=1,
                    value=50,
                    interactive=True,
                )
        with gr.Row():
            model_choice = gr.Dropdown(
                ["SD1.4-512", "SD1.5-512", "SD2.1-768", "SDXL-1024"],
                label="Model",
                value="SD1.5-512",
                interactive=True,
            )
            with gr.Column():
                seed = gr.Slider(
                    label="Seed",
                    minimum=0,
                    maximum=MAX_SEED,
                    step=1,
                    value=1002,
                )
                same_latent = gr.Checkbox(
                    label="Same latent",
                    value=True,
                    info="Use the same latent for start and end images",
                    show_label=True,
                )

    gr.Examples(
        examples=get_example(),
        inputs=[
            prompt1,
            prompt2,
            interpolation_size,
            alpha,
            beta,
            guidance_prompt,
            negative_prompt,
            model_choice,
            warmup_ratio,
            guidance_scale,
            num_inference_steps,
            early,
            late,
            seed,
            same_latent,
        ],
        outputs=result,
        fn=generate,
        cache_examples=CACHE_EXAMPLES,
    )

    alpha.change(
        fn=plot_gemma_fn, inputs=[alpha, beta, interpolation_size], outputs=gamma_plot
    )
    beta.change(
        fn=plot_gemma_fn, inputs=[alpha, beta, interpolation_size], outputs=gamma_plot
    )
    interpolation_size.change(
        fn=plot_gemma_fn, inputs=[alpha, beta, interpolation_size], outputs=gamma_plot
    )
    model_choice.change(fn=change_model_fn, inputs=model_choice)
    inputs = [
        prompt1,
        prompt2,
        guidance_prompt,
        negative_prompt,
        warmup_ratio,
        guidance_scale,
        early,
        late,
        alpha,
        beta,
        interpolation_size,
        seed,
        same_latent,
        num_inference_steps,
    ]
    generate_button.click(
        fn=dynamic_gallery_fn,
        inputs=interpolation_size,
        outputs=result,
    ).then(
        fn=generate,
        inputs=inputs,
        outputs=result,
    )
    gr.Markdown(article)

with gr.Blocks(css="style.css") as demo_with_history:
    with gr.Tab("App"):
        demo.render()

if __name__ == "__main__":
    demo_with_history.queue(max_size=20).launch()
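A minimal headless sketch of the same flow that `app.py` wires into Gradio, assuming the modules from this commit are importable and the model weights can be downloaded; prompts and settings mirror the demo defaults above, everything else is illustrative rather than part of the committed files.

```python
# Headless sketch of app.py's generate() flow (illustrative, not in the commit).
import torch
from PIL import Image

from pipeline_interpolated_stable_diffusion import InterpolationStableDiffusionPipeline

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pipe = InterpolationStableDiffusionPipeline(
    repo_name="runwayml/stable-diffusion-v1-5",
    guidance_scale=10.0,
    scheduler_name="unipc",
)
pipe.to(device, dtype=torch.float32)

generator = torch.manual_seed(1002)
latent = pipe.generate_latent(generator=generator)
latent = latent.to(device=pipe.unet.device, dtype=pipe.unet.dtype)

# t=0.5 returns [start, middle, end] as a (3, H, W, 3) uint8 array.
images = pipe.interpolate_single(
    0.5,
    latent_start=latent,
    latent_end=latent.clone(),
    prompt_start="A photo of dog, best quality, extremely detailed",
    prompt_end="A photo of car, best quality, extremely detailed",
    guide_prompt="A photo of a dog driving a car, logical, best quality, extremely detailed",
    negative_prompt="monochrome, lowres, bad anatomy, worst quality, low quality",
    num_inference_steps=50,
    warmup_ratio=0.122,
    early="fused_outer",
    late="self",
)
for i, img in enumerate(images):
    Image.fromarray(img).save(f"interpolated_{i}.png")
```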
interpolation.py ADDED
@@ -0,0 +1,388 @@
from typing import Optional

import torch
from torch import FloatTensor, LongTensor, Size, Tensor

from prior import generate_beta_tensor


class OuterInterpolatedAttnProcessor:
    r"""
    Personalized processor for performing outer attention interpolation.

    The attention output of the interpolated image is obtained by:
        (1 - t) * Q_t * K_1 * V_1 + t * Q_t * K_m * V_m;
    If fused with self-attention:
        (1 - t) * Q_t * [K_1, K_t] * [V_1, V_t] + t * Q_t * [K_m, K_t] * [V_m, V_t];
    """

    def __init__(
        self,
        t: Optional[float] = None,
        size: int = 7,
        is_fused: bool = False,
        alpha: float = 1,
        beta: float = 1,
    ):
        """
        t: float, interpolation point between 0 and 1; if specified, size is set to 3
        """
        if t is None:
            ts = generate_beta_tensor(size, alpha=alpha, beta=beta)
            ts[0], ts[-1] = 0, 1
        else:
            assert t > 0 and t < 1, "t must be between 0 and 1"
            ts = [0, t, 1]
            ts = torch.tensor(ts)
            size = 3

        self.size = size
        self.coef = ts
        self.is_fused = is_fused

    def __call__(
        self,
        attn,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape
            if encoder_hidden_states is None
            else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(
            attention_mask, sequence_length, batch_size
        )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)
        query = attn.head_to_batch_dim(query)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(
                encoder_hidden_states
            )

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        # Specify the first and last key and value
        key_begin = key[0:1]
        key_end = key[-1:]
        value_begin = value[0:1]
        value_end = value[-1:]

        key_begin = torch.cat([key_begin] * (self.size))
        key_end = torch.cat([key_end] * (self.size))
        value_begin = torch.cat([value_begin] * (self.size))
        value_end = torch.cat([value_end] * (self.size))

        key_begin = attn.head_to_batch_dim(key_begin)
        value_begin = attn.head_to_batch_dim(value_begin)
        key_end = attn.head_to_batch_dim(key_end)
        value_end = attn.head_to_batch_dim(value_end)

        # Fused with self-attention
        if self.is_fused:
            key = attn.head_to_batch_dim(key)
            value = attn.head_to_batch_dim(value)
            key_end = torch.cat([key, key_end], dim=-2)
            value_end = torch.cat([value, value_end], dim=-2)
            key_begin = torch.cat([key, key_begin], dim=-2)
            value_begin = torch.cat([value, value_begin], dim=-2)

        attention_probs_end = attn.get_attention_scores(query, key_end, attention_mask)
        hidden_states_end = torch.bmm(attention_probs_end, value_end)
        hidden_states_end = attn.batch_to_head_dim(hidden_states_end)

        attention_probs_begin = attn.get_attention_scores(
            query, key_begin, attention_mask
        )
        hidden_states_begin = torch.bmm(attention_probs_begin, value_begin)
        hidden_states_begin = attn.batch_to_head_dim(hidden_states_begin)

        # Apply outer interpolation on attention
        coef = self.coef.reshape(-1, 1, 1)
        coef = coef.to(key.device, key.dtype)
        hidden_states = (1 - coef) * hidden_states_begin + coef * hidden_states_end

        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


class InnerInterpolatedAttnProcessor:
    r"""
    Personalized processor for performing inner attention interpolation.

    The keys and values are interpolated first and then attended by the query:
        Q_t * ((1 - t) * K_1 + t * K_m) with values (1 - t) * V_1 + t * V_m;
    If fused with self-attention:
        Q_t * [K_t, (1 - t) * K_1 + t * K_m] * [V_t, (1 - t) * V_1 + t * V_m];
    """

    def __init__(
        self,
        t: Optional[float] = None,
        size: int = 7,
        is_fused: bool = False,
        alpha: float = 1,
        beta: float = 1,
    ):
        """
        t: float, interpolation point between 0 and 1; if specified, size is set to 3
        """
        if t is None:
            ts = generate_beta_tensor(size, alpha=alpha, beta=beta)
            ts[0], ts[-1] = 0, 1
        else:
            assert t > 0 and t < 1, "t must be between 0 and 1"
            ts = [0, t, 1]
            ts = torch.tensor(ts)
            size = 3

        self.size = size
        self.coef = ts
        self.is_fused = is_fused

    def __call__(
        self,
        attn,
        hidden_states: torch.FloatTensor,
        encoder_hidden_states: Optional[torch.FloatTensor] = None,
        attention_mask: Optional[torch.FloatTensor] = None,
        temb: Optional[torch.FloatTensor] = None,
    ) -> torch.Tensor:
        residual = hidden_states

        if attn.spatial_norm is not None:
            hidden_states = attn.spatial_norm(hidden_states, temb)

        input_ndim = hidden_states.ndim

        if input_ndim == 4:
            batch_size, channel, height, width = hidden_states.shape
            hidden_states = hidden_states.view(
                batch_size, channel, height * width
            ).transpose(1, 2)

        batch_size, sequence_length, _ = (
            hidden_states.shape
            if encoder_hidden_states is None
            else encoder_hidden_states.shape
        )
        attention_mask = attn.prepare_attention_mask(
            attention_mask, sequence_length, batch_size
        )

        if attn.group_norm is not None:
            hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(
                1, 2
            )

        query = attn.to_q(hidden_states)
        query = attn.head_to_batch_dim(query)

        if encoder_hidden_states is None:
            encoder_hidden_states = hidden_states
        elif attn.norm_cross:
            encoder_hidden_states = attn.norm_encoder_hidden_states(
                encoder_hidden_states
            )

        key = attn.to_k(encoder_hidden_states)
        value = attn.to_v(encoder_hidden_states)

        # Specify the first and last key and value
        key_start = key[0:1]
        key_end = key[-1:]
        value_start = value[0:1]
        value_end = value[-1:]

        key_start = torch.cat([key_start] * (self.size))
        key_end = torch.cat([key_end] * (self.size))
        value_start = torch.cat([value_start] * (self.size))
        value_end = torch.cat([value_end] * (self.size))

        # Apply inner interpolation on attention
        coef = self.coef.reshape(-1, 1, 1)
        coef = coef.to(key.device, key.dtype)
        key_cross = (1 - coef) * key_start + coef * key_end
        value_cross = (1 - coef) * value_start + coef * value_end

        key_cross = attn.head_to_batch_dim(key_cross)
        value_cross = attn.head_to_batch_dim(value_cross)

        # Fused with self-attention
        if self.is_fused:
            key = attn.head_to_batch_dim(key)
            value = attn.head_to_batch_dim(value)
            key_cross = torch.cat([key, key_cross], dim=-2)
            value_cross = torch.cat([value, value_cross], dim=-2)

        attention_probs = attn.get_attention_scores(query, key_cross, attention_mask)

        hidden_states = torch.bmm(attention_probs, value_cross)
        hidden_states = attn.batch_to_head_dim(hidden_states)
        hidden_states = attn.to_out[0](hidden_states)
        hidden_states = attn.to_out[1](hidden_states)

        if input_ndim == 4:
            hidden_states = hidden_states.transpose(-1, -2).reshape(
                batch_size, channel, height, width
            )

        if attn.residual_connection:
            hidden_states = hidden_states + residual

        hidden_states = hidden_states / attn.rescale_output_factor

        return hidden_states


def linear_interpolation(
    l1: FloatTensor, l2: FloatTensor, ts: Optional[FloatTensor] = None, size: int = 5
) -> FloatTensor:
    """
    Linear interpolation

    Args:
        l1: Starting vector: (1, *)
        l2: Final vector: (1, *)
        ts: FloatTensor, interpolation points between 0 and 1
        size: int, number of interpolation points including l1 and l2

    Returns:
        Interpolated vectors: (size, *)
    """
    assert l1.shape == l2.shape, "shapes of l1 and l2 must match"

    res = []
    if ts is not None:
        for t in ts:
            li = torch.lerp(l1, l2, t)
            res.append(li)
    else:
        for i in range(size):
            t = i / (size - 1)
            li = torch.lerp(l1, l2, t)
            res.append(li)
    res = torch.cat(res, dim=0)
    return res


def spherical_interpolation(l1: FloatTensor, l2: FloatTensor, size=5) -> FloatTensor:
    """
    Spherical interpolation

    Args:
        l1: Starting vector: (1, *)
        l2: Final vector: (1, *)
        size: int, number of interpolation points including l1 and l2

    Returns:
        Interpolated vectors: (size, *)
    """
    assert l1.shape == l2.shape, "shapes of l1 and l2 must match"

    res = []
    for i in range(size):
        t = i / (size - 1)
        li = slerp(l1, l2, t)
        res.append(li)
    res = torch.cat(res, dim=0)
    return res


def slerp(v0: FloatTensor, v1: FloatTensor, t, threshold=0.9995):
    """
    Spherical linear interpolation

    Args:
        v0: Starting vector
        v1: Final vector
        t: Float value between 0.0 and 1.0
        threshold: Threshold for considering the two vectors as
                   colinear. Not recommended to alter this.

    Returns:
        Interpolation vector between v0 and v1
    """
    assert v0.shape == v1.shape, "shapes of v0 and v1 must match"

    # Normalize the vectors to get the directions and angles
    v0_norm: FloatTensor = torch.norm(v0, dim=-1)
    v1_norm: FloatTensor = torch.norm(v1, dim=-1)

    v0_normed: FloatTensor = v0 / v0_norm.unsqueeze(-1)
    v1_normed: FloatTensor = v1 / v1_norm.unsqueeze(-1)

    # Dot product with the normalized vectors
    dot: FloatTensor = (v0_normed * v1_normed).sum(-1)
    dot_mag: FloatTensor = dot.abs()

    # If the dot product is NaN, the v0 or v1 row was filled with 0s.
    # If its absolute value is almost 1, the vectors are ~colinear, so use torch.lerp.
    gotta_lerp: LongTensor = dot_mag.isnan() | (dot_mag > threshold)
    can_slerp: LongTensor = ~gotta_lerp

    t_batch_dim_count: int = max(0, t.dim() - v0.dim()) if isinstance(t, Tensor) else 0
    t_batch_dims: Size = (
        t.shape[:t_batch_dim_count] if isinstance(t, Tensor) else Size([])
    )
    out: FloatTensor = torch.zeros_like(v0.expand(*t_batch_dims, *[-1] * v0.dim()))

    # if no elements are lerpable, our vectors become 0-dimensional, preventing broadcasting
    if gotta_lerp.any():
        lerped: FloatTensor = torch.lerp(v0, v1, t)

        out: FloatTensor = lerped.where(gotta_lerp.unsqueeze(-1), out)

    # if no elements are slerpable, our vectors become 0-dimensional, preventing broadcasting
    if can_slerp.any():
        # Calculate initial angle between v0 and v1
        theta_0: FloatTensor = dot.arccos().unsqueeze(-1)
        sin_theta_0: FloatTensor = theta_0.sin()
        # Angle at timestep t
        theta_t: FloatTensor = theta_0 * t
        sin_theta_t: FloatTensor = theta_t.sin()
        # Finish the slerp algorithm
        s0: FloatTensor = (theta_0 - theta_t).sin() / sin_theta_0
        s1: FloatTensor = sin_theta_t / sin_theta_0
        slerped: FloatTensor = s0 * v0 + s1 * v1

        out: FloatTensor = slerped.where(can_slerp.unsqueeze(-1), out)

    return out
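A self-contained toy sketch of the two schemes implemented above: outer interpolation blends the two attention *outputs*, while inner interpolation blends keys and values before a single attention call. Plain tensors stand in for diffusers' `Attention` module; names and shapes are purely illustrative.

```python
# Toy comparison of outer vs. inner attention interpolation (illustrative only).
import torch


def softmax_attn(q, k, v):
    # Scaled dot-product attention on single (seq, dim) tensors.
    scores = q @ k.transpose(-1, -2) / (q.shape[-1] ** 0.5)
    return scores.softmax(dim=-1) @ v


seq, dim, t = 4, 8, 0.3
q_t = torch.randn(seq, dim)                             # query of the interpolated image
k1, v1 = torch.randn(seq, dim), torch.randn(seq, dim)   # keys/values of source image 1
km, vm = torch.randn(seq, dim), torch.randn(seq, dim)   # keys/values of source image m

# Outer interpolation: blend the two attention outputs.
outer = (1 - t) * softmax_attn(q_t, k1, v1) + t * softmax_attn(q_t, km, vm)

# Inner interpolation: blend keys/values first, then attend once.
k_t = (1 - t) * k1 + t * km
v_t = (1 - t) * v1 + t * vm
inner = softmax_attn(q_t, k_t, v_t)

print(outer.shape, inner.shape)  # both (4, 8); the two results generally differ
```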
pipeline_interpolated_sdxl.py ADDED
The diff for this file is too large to render. See raw diff
 
pipeline_interpolated_stable_diffusion.py ADDED
@@ -0,0 +1,584 @@
from typing import Optional

import numpy as np
import torch
from diffusers import (
    AutoencoderKL,
    DDIMScheduler,
    SchedulerMixin,
    UNet2DConditionModel,
    UniPCMultistepScheduler,
)
from diffusers.models.attention_processor import AttnProcessor, AttnProcessor2_0
from tqdm.auto import tqdm
from transformers import CLIPTextModel, CLIPTokenizer

from interpolation import (
    InnerInterpolatedAttnProcessor,
    OuterInterpolatedAttnProcessor,
    generate_beta_tensor,
    linear_interpolation,
    slerp,
    spherical_interpolation,
)


class InterpolationStableDiffusionPipeline:
    """
    Diffusion pipeline that generates interpolated images.
    """

    def __init__(
        self,
        repo_name: str = "CompVis/stable-diffusion-v1-4",
        scheduler_name: str = "ddim",
        frozen: bool = True,
        guidance_scale: float = 7.5,
        scheduler: Optional[SchedulerMixin] = None,
        cache_dir: Optional[str] = None,
    ):
        # Initialize the generator
        self.vae = AutoencoderKL.from_pretrained(
            repo_name, subfolder="vae", use_safetensors=True, cache_dir=cache_dir
        )
        self.tokenizer = CLIPTokenizer.from_pretrained(
            repo_name, subfolder="tokenizer", cache_dir=cache_dir
        )
        self.text_encoder = CLIPTextModel.from_pretrained(
            repo_name,
            subfolder="text_encoder",
            use_safetensors=True,
            cache_dir=cache_dir,
        )
        self.unet = UNet2DConditionModel.from_pretrained(
            repo_name, subfolder="unet", use_safetensors=True, cache_dir=cache_dir
        )

        # Initialize the scheduler
        if scheduler is not None:
            self.scheduler = scheduler
        elif scheduler_name == "ddim":
            self.scheduler = DDIMScheduler.from_pretrained(
                repo_name, subfolder="scheduler", cache_dir=cache_dir
            )
        elif scheduler_name == "unipc":
            self.scheduler = UniPCMultistepScheduler.from_pretrained(
                repo_name, subfolder="scheduler", cache_dir=cache_dir
            )
        else:
            raise ValueError(
                "Invalid scheduler name (expected 'ddim' or 'unipc') and no scheduler instance was provided."
            )

        self.guidance_scale = guidance_scale  # Scale for classifier-free guidance

        if frozen:
            for param in self.unet.parameters():
                param.requires_grad = False

            for param in self.text_encoder.parameters():
                param.requires_grad = False

            for param in self.vae.parameters():
                param.requires_grad = False

    def to(self, *args, **kwargs):
        self.vae.to(*args, **kwargs)
        self.text_encoder.to(*args, **kwargs)
        self.unet.to(*args, **kwargs)

    def generate_latent(
        self, generator: Optional[torch.Generator] = None, torch_device: str = "cpu"
    ) -> torch.FloatTensor:
        """
        Generates a random latent tensor.

        Args:
            generator (Optional[torch.Generator], optional): Generator for random number generation. Defaults to None.
            torch_device (str, optional): Device to store the tensor. Defaults to "cpu".

        Returns:
            torch.FloatTensor: Random latent tensor.
        """
        channel = self.unet.config.in_channels
        height = self.unet.config.sample_size
        width = self.unet.config.sample_size
        if generator is None:
            latent = torch.randn(
                (1, channel, height, width),
                device=torch_device,
            )
        else:
            latent = torch.randn(
                (1, channel, height, width),
                generator=generator,
                device=torch_device,
            )
        return latent

    @torch.no_grad()
    def prompt_to_embedding(
        self, prompt: str, negative_prompt: str = ""
    ) -> torch.FloatTensor:
        """
        Prepare the text prompt for the diffusion process.

        Args:
            prompt: str, text prompt
            negative_prompt: str, negative text prompt

        Returns:
            FloatTensor, text embeddings
        """
        text_input = self.tokenizer(
            prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )

        text_embeddings = self.text_encoder(
            text_input.input_ids.to(self.torch_device)
        )[0]

        uncond_input = self.tokenizer(
            negative_prompt,
            padding="max_length",
            max_length=self.tokenizer.model_max_length,
            truncation=True,
            return_tensors="pt",
        )
        uncond_embeddings = self.text_encoder(
            uncond_input.input_ids.to(self.torch_device)
        )[0]

        text_embeddings = torch.cat([text_embeddings, uncond_embeddings])
        return text_embeddings

    @torch.no_grad()
    def interpolate(
        self,
        latent_start: torch.FloatTensor,
        latent_end: torch.FloatTensor,
        prompt_start: str,
        prompt_end: str,
        guide_prompt: Optional[str] = None,
        negative_prompt: str = "",
        size: int = 7,
        num_inference_steps: int = 25,
        warmup_ratio: float = 0.5,
        early: str = "fused_outer",
        late: str = "self",
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        guidance_scale: Optional[float] = None,
    ) -> np.ndarray:
        """
        Interpolate between two generations.

        Args:
            latent_start: FloatTensor, latent vector of the first image
            latent_end: FloatTensor, latent vector of the second image
            prompt_start: str, text prompt of the first image
            prompt_end: str, text prompt of the second image
            guide_prompt: str, text prompt for the interpolation
            negative_prompt: str, negative text prompt
            size: int, number of interpolations including starting and ending points
            num_inference_steps: int, number of inference steps in scheduler
            warmup_ratio: float, ratio of warmup steps
            early: str, warmup interpolation method
            late: str, late interpolation method
            alpha: float, alpha parameter for the Beta distribution
            beta: float, beta parameter for the Beta distribution
            guidance_scale: Optional[float], scale for classifier-free guidance

        Returns:
            Numpy array of interpolated images, shape (size, H, W, 3)
        """
        # Specify alpha and beta
        self.torch_device = self.unet.device
        if alpha is None:
            alpha = num_inference_steps
        if beta is None:
            beta = num_inference_steps
        if guidance_scale is None:
            guidance_scale = self.guidance_scale
        self.scheduler.set_timesteps(num_inference_steps)

        # Prepare interpolated latents and embeddings
        latents = spherical_interpolation(latent_start, latent_end, size)
        embs_start = self.prompt_to_embedding(prompt_start, negative_prompt)
        emb_start = embs_start[0:1]
        uncond_emb_start = embs_start[1:2]
        embs_end = self.prompt_to_embedding(prompt_end, negative_prompt)
        emb_end = embs_end[0:1]
        uncond_emb_end = embs_end[1:2]

        # Perform prompt guidance if it is specified
        if guide_prompt is not None:
            guide_embs = self.prompt_to_embedding(guide_prompt, negative_prompt)
            guide_emb = guide_embs[0:1]
            uncond_guide_emb = guide_embs[1:2]
            embs = torch.cat([emb_start] + [guide_emb] * (size - 2) + [emb_end], dim=0)
            uncond_embs = torch.cat(
                [uncond_emb_start] + [uncond_guide_emb] * (size - 2) + [uncond_emb_end],
                dim=0,
            )
        else:
            embs = linear_interpolation(emb_start, emb_end, size=size)
            uncond_embs = linear_interpolation(
                uncond_emb_start, uncond_emb_end, size=size
            )

        # Specify the interpolation methods
        pure_inner_attn_proc = InnerInterpolatedAttnProcessor(
            size=size,
            is_fused=False,
            alpha=alpha,
            beta=beta,
        )
        fused_inner_attn_proc = InnerInterpolatedAttnProcessor(
            size=size,
            is_fused=True,
            alpha=alpha,
            beta=beta,
        )
        pure_outer_attn_proc = OuterInterpolatedAttnProcessor(
            size=size,
            is_fused=False,
            alpha=alpha,
            beta=beta,
        )
        fused_outer_attn_proc = OuterInterpolatedAttnProcessor(
            size=size,
            is_fused=True,
            alpha=alpha,
            beta=beta,
        )
        self_attn_proc = AttnProcessor2_0()
        procs_dict = {
            "pure_inner": pure_inner_attn_proc,
            "fused_inner": fused_inner_attn_proc,
            "pure_outer": pure_outer_attn_proc,
            "fused_outer": fused_outer_attn_proc,
            "self": self_attn_proc,
        }

        # Denoising process
        i = 0
        warmup_step = int(num_inference_steps * warmup_ratio)
        for t in tqdm(self.scheduler.timesteps):
            i += 1
            latent_model_input = self.scheduler.scale_model_input(latents, timestep=t)
            with torch.no_grad():
                # Change attention module
                if i < warmup_step:
                    interpolate_attn_proc = procs_dict[early]
                else:
                    interpolate_attn_proc = procs_dict[late]
                self.unet.set_attn_processor(processor=interpolate_attn_proc)

                # Predict the noise residual
                noise_pred = self.unet(
                    latent_model_input, t, encoder_hidden_states=embs
                ).sample
                attn_proc = AttnProcessor()
                self.unet.set_attn_processor(processor=attn_proc)
                noise_uncond = self.unet(
                    latent_model_input, t, encoder_hidden_states=uncond_embs
                ).sample
            # perform guidance
            noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        # Decode the images
        latents = 1 / 0.18215 * latents
        with torch.no_grad():
            image = self.vae.decode(latents).sample
        images = (image / 2 + 0.5).clamp(0, 1)
        images = (images.permute(0, 2, 3, 1) * 255).to(torch.uint8).cpu().numpy()
        return images

    @torch.no_grad()
    def interpolate_save_gpu(
        self,
        latent_start: torch.FloatTensor,
        latent_end: torch.FloatTensor,
        prompt_start: str,
        prompt_end: str,
        guide_prompt: Optional[str] = None,
        negative_prompt: str = "",
        size: int = 7,
        num_inference_steps: int = 25,
        warmup_ratio: float = 0.5,
        early: str = "fused_outer",
        late: str = "self",
        alpha: Optional[float] = None,
        beta: Optional[float] = None,
        init: str = "linear",
        guidance_scale: Optional[float] = None,
    ) -> np.ndarray:
        """
        Interpolate between two generations, one interpolated image at a time to save GPU memory.

        Args:
            latent_start: FloatTensor, latent vector of the first image
            latent_end: FloatTensor, latent vector of the second image
            prompt_start: str, text prompt of the first image
            prompt_end: str, text prompt of the second image
            guide_prompt: str, text prompt for the interpolation
            negative_prompt: str, negative text prompt
            size: int, number of interpolations including starting and ending points
            num_inference_steps: int, number of inference steps in scheduler
            warmup_ratio: float, ratio of warmup steps
            early: str, warmup interpolation method
            late: str, late interpolation method
            alpha: float, alpha parameter for the Beta distribution
            beta: float, beta parameter for the Beta distribution
            init: str, interpolation initialization method
            guidance_scale: Optional[float], scale for classifier-free guidance

        Returns:
            Numpy array of interpolated images, shape (size, H, W, 3)
        """
        self.torch_device = self.unet.device
        # Specify alpha and beta
        if alpha is None:
            alpha = num_inference_steps
        if beta is None:
            beta = num_inference_steps
        betas = generate_beta_tensor(size, alpha=alpha, beta=beta)
        final_images = None

        # Generate interpolated images one by one
        for i in range(size - 2):
            it = betas[i + 1].item()
            if init == "denoising":
                images = self.denoising_interpolate(
                    latent_start,
                    prompt_start,
                    prompt_end,
                    negative_prompt,
                    interpolated_ratio=it,
                    timesteps=num_inference_steps,
                )
            else:
                images = self.interpolate_single(
                    it,
                    latent_start,
                    latent_end,
                    prompt_start,
                    prompt_end,
                    guide_prompt=guide_prompt,
                    num_inference_steps=num_inference_steps,
                    warmup_ratio=warmup_ratio,
                    early=early,
                    late=late,
                    negative_prompt=negative_prompt,
                    init=init,
                    guidance_scale=guidance_scale,
                )
            if size == 3:
                return images
            if i == 0:
                final_images = images[:2]
            elif i == size - 3:
                final_images = np.concatenate([final_images, images[1:]], axis=0)
            else:
                final_images = np.concatenate([final_images, images[1:2]], axis=0)
        return final_images

    def interpolate_single(
        self,
        it,
        latent_start: torch.FloatTensor,
        latent_end: torch.FloatTensor,
        prompt_start: str,
        prompt_end: str,
        guide_prompt: str = None,
        negative_prompt: str = "",
        num_inference_steps: int = 25,
        warmup_ratio: float = 0.5,
        early: str = "fused_outer",
        late: str = "self",
        init="linear",
        guidance_scale: Optional[float] = None,
    ) -> np.ndarray:
        """
        Interpolates between two latent vectors and generates a sequence of images.

        Args:
            it (float): Interpolation factor between latent_start and latent_end.
            latent_start (torch.FloatTensor): Starting latent vector.
            latent_end (torch.FloatTensor): Ending latent vector.
            prompt_start (str): Starting prompt for text conditioning.
            prompt_end (str): Ending prompt for text conditioning.
            guide_prompt (str, optional): Guiding prompt for text conditioning. Defaults to None.
            negative_prompt (str, optional): Negative prompt for text conditioning. Defaults to "".
            num_inference_steps (int, optional): Number of inference steps. Defaults to 25.
            warmup_ratio (float, optional): Ratio of warm-up steps. Defaults to 0.5.
            early (str, optional): Early attention processing method. Defaults to "fused_outer".
            late (str, optional): Late attention processing method. Defaults to "self".
            init (str, optional): Initialization method for interpolation. Defaults to "linear".
            guidance_scale (Optional[float], optional): Scale for classifier-free guidance. Defaults to None.

        Returns:
            numpy.ndarray: Sequence of generated images.
        """
        self.torch_device = self.unet.device
        if guidance_scale is None:
            guidance_scale = self.guidance_scale

        # Prepare interpolated inputs
        self.scheduler.set_timesteps(num_inference_steps)

        embs_start = self.prompt_to_embedding(prompt_start, negative_prompt)
        emb_start = embs_start[0:1]
        uncond_emb_start = embs_start[1:2]
        embs_end = self.prompt_to_embedding(prompt_end, negative_prompt)
        emb_end = embs_end[0:1]
        uncond_emb_end = embs_end[1:2]

        latent_t = slerp(latent_start, latent_end, it)
        if guide_prompt is not None:
            embs_guide = self.prompt_to_embedding(guide_prompt, negative_prompt)
            emb_t = embs_guide[0:1]
        else:
            if init == "linear":
                emb_t = torch.lerp(emb_start, emb_end, it)
            else:
                emb_t = slerp(emb_start, emb_end, it)
        if init == "linear":
            uncond_emb_t = torch.lerp(uncond_emb_start, uncond_emb_end, it)
        else:
            uncond_emb_t = slerp(uncond_emb_start, uncond_emb_end, it)

        latents = torch.cat([latent_start, latent_t, latent_end], dim=0)
        embs = torch.cat([emb_start, emb_t, emb_end], dim=0)
        uncond_embs = torch.cat([uncond_emb_start, uncond_emb_t, uncond_emb_end], dim=0)

        # Specify the attention processors
        pure_inner_attn_proc = InnerInterpolatedAttnProcessor(
            t=it,
            is_fused=False,
        )
        fused_inner_attn_proc = InnerInterpolatedAttnProcessor(
            t=it,
            is_fused=True,
        )
        pure_outer_attn_proc = OuterInterpolatedAttnProcessor(
            t=it,
            is_fused=False,
        )
        fused_outer_attn_proc = OuterInterpolatedAttnProcessor(
            t=it,
            is_fused=True,
        )
        self_attn_proc = AttnProcessor()
        procs_dict = {
            "pure_inner": pure_inner_attn_proc,
            "fused_inner": fused_inner_attn_proc,
            "pure_outer": pure_outer_attn_proc,
            "fused_outer": fused_outer_attn_proc,
            "self": self_attn_proc,
        }

        i = 0
        warmup_step = int(num_inference_steps * warmup_ratio)
        for t in tqdm(self.scheduler.timesteps):
            i += 1
            latent_model_input = self.scheduler.scale_model_input(latents, timestep=t)
            # predict the noise residual
            with torch.no_grad():
                # Warmup
                if i < warmup_step:
                    interpolate_attn_proc = procs_dict[early]
                else:
                    interpolate_attn_proc = procs_dict[late]
                self.unet.set_attn_processor(processor=interpolate_attn_proc)
                # predict the noise residual
                noise_pred = self.unet(
                    latent_model_input, t, encoder_hidden_states=embs
                ).sample
                attn_proc = AttnProcessor()
                self.unet.set_attn_processor(processor=attn_proc)
                noise_uncond = self.unet(
                    latent_model_input, t, encoder_hidden_states=uncond_embs
                ).sample
            # perform guidance
            noise_pred = noise_uncond + guidance_scale * (noise_pred - noise_uncond)
            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample

        # Decode the images
        latents = 1 / 0.18215 * latents
        with torch.no_grad():
            image = self.vae.decode(latents).sample
        images = (image / 2 + 0.5).clamp(0, 1)
        images = (images.permute(0, 2, 3, 1) * 255).to(torch.uint8).cpu().numpy()
        return images

    def denoising_interpolate(
        self,
        latents: torch.FloatTensor,
        text_1: str,
        text_2: str,
        negative_prompt: str = "",
        interpolated_ratio: float = 1,
        timesteps: int = 25,
    ) -> np.ndarray:
        """
        Performs denoising interpolation on the given latents.

        Args:
            latents (torch.Tensor): The input latents.
            text_1 (str): The first text prompt.
            text_2 (str): The second text prompt.
            negative_prompt (str, optional): The negative text prompt. Defaults to "".
            interpolated_ratio (float, optional): The ratio of interpolation between text_1 and text_2. Defaults to 1.
            timesteps (int, optional): The number of timesteps for diffusion. Defaults to 25.

        Returns:
            numpy.ndarray: The interpolated images.
        """
        self.unet.set_attn_processor(processor=AttnProcessor())
        start_emb = self.prompt_to_embedding(text_1)
        end_emb = self.prompt_to_embedding(text_2)
        neg_emb = self.prompt_to_embedding(negative_prompt)
        uncond_emb = neg_emb[0:1]
        emb_1 = start_emb[0:1]
        emb_2 = end_emb[0:1]
        self.scheduler.set_timesteps(timesteps)
        i = 0
        for t in tqdm(self.scheduler.timesteps):
            i += 1
            # expand the latents if we are doing classifier-free guidance to avoid doing two forward passes.
            latent_model_input = self.scheduler.scale_model_input(latents, timestep=t)
            # predict the noise residual
            with torch.no_grad():
                if i < timesteps * interpolated_ratio:
                    noise_pred = self.unet(
                        latent_model_input, t, encoder_hidden_states=emb_1
                    ).sample
                else:
                    noise_pred = self.unet(
                        latent_model_input, t, encoder_hidden_states=emb_2
                    ).sample
                noise_uncond = self.unet(
                    latent_model_input, t, encoder_hidden_states=uncond_emb
                ).sample
            # perform guidance
            noise_pred = noise_uncond + self.guidance_scale * (
                noise_pred - noise_uncond
            )
            # compute the previous noisy sample x_t -> x_t-1
            latents = self.scheduler.step(noise_pred, t, latents).prev_sample
        latents = 1 / 0.18215 * latents
        with torch.no_grad():
            image = self.vae.decode(latents).sample
        images = (image / 2 + 0.5).clamp(0, 1)
        images = (images.permute(0, 2, 3, 1) * 255).to(torch.uint8).cpu().numpy()
        return images
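A hedged usage sketch of the batched `interpolate` method defined above, assuming this commit's modules are importable; the prompts are placeholders, and `alpha`/`beta` simply shape how the interpolation coefficients are spaced by the Beta prior.

```python
# Usage sketch for the batched interpolate() method (illustrative, not in the commit).
import torch

from pipeline_interpolated_stable_diffusion import InterpolationStableDiffusionPipeline

pipe = InterpolationStableDiffusionPipeline(
    repo_name="runwayml/stable-diffusion-v1-5", scheduler_name="unipc"
)
pipe.to("cuda" if torch.cuda.is_available() else "cpu")

g = torch.manual_seed(0)
latent_a = pipe.generate_latent(generator=g).to(pipe.unet.device, pipe.unet.dtype)
latent_b = pipe.generate_latent(generator=g).to(pipe.unet.device, pipe.unet.dtype)

frames = pipe.interpolate(
    latent_a,
    latent_b,
    prompt_start="a watercolor painting of a fox",   # placeholder prompt
    prompt_end="a watercolor painting of an owl",    # placeholder prompt
    size=5,
    num_inference_steps=25,
    warmup_ratio=0.5,
    early="fused_outer",
    late="self",
    alpha=8.0,
    beta=4.0,
)
print(frames.shape)  # (5, H, W, 3) uint8 frames from start to end
```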
prior.py ADDED
@@ -0,0 +1,168 @@
import torch
from bayes_opt import BayesianOptimization, SequentialDomainReductionTransformer
from lpips import LPIPS
from scipy.stats import beta as beta_distribution

from utils import compute_lpips, compute_smoothness_and_consistency


def bayesian_prior_selection(
    interpolation_pipe,
    latent1: torch.FloatTensor,
    latent2: torch.FloatTensor,
    prompt1: str,
    prompt2: str,
    lpips_model: LPIPS,
    guide_prompt: str | None = None,
    negative_prompt: str = "",
    size: int = 3,
    num_inference_steps: int = 25,
    warmup_ratio: float = 1,
    early: str = "vfused",
    late: str = "self",
    target_score: float = 0.9,
    n_iter: int = 15,
    p_min: float | None = None,
    p_max: float | None = None,
) -> tuple:
    """
    Select the alpha and beta parameters for the interpolation using Bayesian optimization.

    Args:
        interpolation_pipe (any): The interpolation pipeline.
        latent1 (torch.FloatTensor): The first source latent vector.
        latent2 (torch.FloatTensor): The second source latent vector.
        prompt1 (str): The first source prompt.
        prompt2 (str): The second source prompt.
        lpips_model (any): The LPIPS model used to compute perceptual distances.
        guide_prompt (str | None, optional): The guide prompt for the interpolation, if any. Defaults to None.
        negative_prompt (str, optional): The negative prompt for the interpolation. Defaults to "".
        size (int, optional): The size of the interpolation sequence. Defaults to 3.
        num_inference_steps (int, optional): The number of inference steps. Defaults to 25.
        warmup_ratio (float, optional): The warmup ratio. Defaults to 1.
        early (str, optional): The early fusion method. Defaults to "vfused".
        late (str, optional): The late fusion method. Defaults to "self".
        target_score (float, optional): The target score. Defaults to 0.9.
        n_iter (int, optional): The maximum number of iterations. Defaults to 15.
        p_min (float, optional): The minimum value of alpha and beta. Defaults to None.
        p_max (float, optional): The maximum value of alpha and beta. Defaults to None.

    Returns:
        tuple: A tuple containing the selected alpha and beta parameters.
    """

    def get_smoothness(alpha, beta):
        """
        Black-box objective function for Bayesian optimization.
        Get the smoothness of the interpolated sequence with the given alpha and beta.
        """
        if alpha < beta and large_alpha_prior:
            return 0
        if alpha > beta and not large_alpha_prior:
            return 0
        if alpha == beta:
            return init_smoothness
        interpolation_sequence = interpolation_pipe.interpolate_save_gpu(
            latent1,
            latent2,
            prompt1,
            prompt2,
            guide_prompt=guide_prompt,
            negative_prompt=negative_prompt,
            size=size,
            num_inference_steps=num_inference_steps,
            warmup_ratio=warmup_ratio,
            early=early,
            late=late,
            alpha=alpha,
            beta=beta,
        )
        smoothness, _, _ = compute_smoothness_and_consistency(
            interpolation_sequence, lpips_model
        )
        return smoothness

    # Add prior into selection of alpha and beta:
    # we first compute the interpolated image at t=0.5
    images = interpolation_pipe.interpolate_single(
        0.5,
        latent1,
        latent2,
        prompt1,
        prompt2,
        guide_prompt=guide_prompt,
        negative_prompt=negative_prompt,
        num_inference_steps=num_inference_steps,
        warmup_ratio=warmup_ratio,
        early=early,
        late=late,
    )
    # Compute the perceptual distances of the interpolated image (t=0.5) to the source images
    distances = compute_lpips(images, lpips_model)
    # Compute init_smoothness as the smoothness when alpha=beta to avoid recomputation
    init_smoothness, _, _ = compute_smoothness_and_consistency(images, lpips_model)
    # If the perceptual distance to the first source image is smaller, alpha should be larger than beta
    large_alpha_prior = distances[0] < distances[1]

    # Bayesian optimization configuration
    num_warmup_steps = warmup_ratio * num_inference_steps
    if p_min is None:
        p_min = 1
    if p_max is None:
        p_max = num_warmup_steps
    pbounds = {"alpha": (p_min, p_max), "beta": (p_min, p_max)}
    bounds_transformer = SequentialDomainReductionTransformer(minimum_window=0.1)
    optimizer = BayesianOptimization(
        f=get_smoothness,
        pbounds=pbounds,
        random_state=1,
        bounds_transformer=bounds_transformer,
        allow_duplicate_points=True,
    )
    alpha_init = [p_min, (p_min + p_max) / 2, p_max]
    beta_init = [p_min, (p_min + p_max) / 2, p_max]

    # Initial probing
    for alpha in alpha_init:
        for beta in beta_init:
            optimizer.probe(params={"alpha": alpha, "beta": beta}, lazy=False)
            latest_result = optimizer.res[-1]  # Get the last result
            latest_score = latest_result["target"]
            if latest_score >= target_score:
                return alpha, beta

    # Start optimization
    for _ in range(n_iter):  # Max iterations
        optimizer.maximize(init_points=0, n_iter=1)  # One iteration at a time
        max_score = optimizer.max["target"]  # Get the highest score so far
        if max_score >= target_score:
            print(f"Stopping early, target of {target_score} reached.")
            break  # Exit the loop if the target is reached or exceeded

    results = optimizer.max
    alpha = results["params"]["alpha"]
    beta = results["params"]["beta"]
    return alpha, beta


def generate_beta_tensor(
    size: int, alpha: float = 3, beta: float = 3
) -> torch.FloatTensor:
    """
    Assume size is n.
    Generates a PyTorch tensor of values [x0, x1, ..., xn-1] for the Beta distribution
    where each xi satisfies F(xi) = i/(n-1) for the CDF F of the Beta distribution.

    Args:
        size (int): The number of values to generate.
        alpha (float): The alpha parameter of the Beta distribution.
        beta (float): The beta parameter of the Beta distribution.

    Returns:
        torch.Tensor: A tensor of the inverse CDF values of the Beta distribution.
    """
    # Generating the inverse CDF values
    prob_values = [i / (size - 1) for i in range(size)]
    inverse_cdf_values = beta_distribution.ppf(prob_values, alpha, beta)

    # Converting to a PyTorch tensor
    return torch.tensor(inverse_cdf_values, dtype=torch.float32)
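A quick, self-contained illustration of the Beta-prior spacing that `generate_beta_tensor` produces; the helper below mirrors its body so it can run without this repo's other dependencies, and the printed values are approximate.

```python
# Illustration of how alpha/beta shape the interpolation coefficients (illustrative only).
import torch
from scipy.stats import beta as beta_distribution


def beta_spacing(size, alpha, beta):
    # Inverse-CDF (quantile) spacing, same construction as generate_beta_tensor above.
    probs = [i / (size - 1) for i in range(size)]
    return torch.tensor(beta_distribution.ppf(probs, alpha, beta), dtype=torch.float32)


print(beta_spacing(5, 1, 1))  # ~[0.00, 0.25, 0.50, 0.75, 1.00]: uniform spacing
print(beta_spacing(5, 3, 3))  # symmetric, coefficients packed more tightly around 0.5
print(beta_spacing(5, 6, 3))  # asymmetric, coefficients pulled toward 1
```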
requirements.txt ADDED
@@ -0,0 +1,65 @@
absl-py==2.1.0
accelerate==0.27.2
addict==2.4.0
antlr4-python3-runtime==4.9.3
bayesian-optimization==1.4.3
clean-fid==0.1.35
clip @ git+https://github.com/openai/CLIP.git@a1d071733d7111c9c014f024669f959182114e33
colorama==0.4.6
contourpy==1.2.0
cycler==0.12.1
diffusers==0.27.1
einops==0.7.0
facexlib==0.3.0
filterpy==1.4.5
fonttools==4.49.0
fsspec==2024.2.0
ftfy==6.1.3
future==1.0.0
grpcio==1.62.0
huggingface-hub==0.20.3
imageio==2.34.0
imgaug==0.4.0
joblib==1.3.2
kiwisolver==1.4.5
lazy_loader==0.3
llvmlite==0.42.0
lmdb==1.4.1
lpips==0.1.4
Markdown==3.5.2
matplotlib==3.8.3
mkl-service==2.4.0
numba==0.59.0
numpy==1.24.4
omegaconf==2.3.0
openai-clip==1.0.1
opencv-python==4.9.0.80
pandas==2.2.0
protobuf==4.25.3
pyiqa==0.1.10
pyparsing==3.1.1
python-dateutil==2.8.2
pytorch-fid==0.3.0
pytz==2024.1
regex==2023.12.25
safetensors==0.4.2
scikit-image==0.22.0
scikit-learn==1.4.1.post1
scipy==1.9.1
shapely==2.0.3
tensorboard==2.16.2
tensorboard-data-server==0.7.2
threadpoolctl==3.3.0
tifffile==2024.2.12
timm==0.9.16
tokenizers==0.15.2
tomli==2.0.1
torch==2.1.0
torchaudio==2.1.0
torchvision==0.16.0
tqdm==4.66.2
transformers==4.38.2
triton==2.1.0
tzdata==2024.1
Werkzeug==3.0.1
yapf==0.40.2
style.css ADDED
@@ -0,0 +1,71 @@
h1 {
  text-align: center;
  justify-content: center;
}

[role="tabpanel"] {
  border: 0
}

#duplicate-button {
  margin: auto;
  color: #fff;
  background: #1565c0;
  border-radius: 100vh;
}

.gradio-container {
  max-width: 690px !important;
}

#share-btn-container {
  padding-left: 0.5rem !important;
  padding-right: 0.5rem !important;
  background-color: #000000;
  justify-content: center;
  align-items: center;
  border-radius: 9999px !important;
  max-width: 13rem;
  margin-left: auto;
  margin-top: 0.35em;
}

div#share-btn-container>div {
  flex-direction: row;
  background: black;
  align-items: center
}

#share-btn-container:hover {
  background-color: #060606
}

#share-btn {
  all: initial;
  color: #ffffff;
  font-weight: 600;
  cursor: pointer;
  font-family: 'IBM Plex Sans', sans-serif;
  margin-left: 0.5rem !important;
  padding-top: 0.5rem !important;
  padding-bottom: 0.5rem !important;
  right: 0;
  font-size: 15px;
}

#share-btn * {
  all: unset
}

#share-btn-container div:nth-child(-n+2) {
  width: auto !important;
  min-height: 0px !important;
}

#share-btn-container .wrap {
  display: none !important
}

#share-btn-container.hidden {
  display: none !important
}
utils.py ADDED
@@ -0,0 +1,189 @@
+ import os
+ from typing import Optional
+
+ import matplotlib.pyplot as plt
+ import numpy as np
+ import torch
+ from lpips import LPIPS
+ from PIL import Image
+ from torchvision.transforms import Normalize
+
+
+ def show_images_horizontally(
+     list_of_files: np.array, output_file: Optional[str] = None, interact: bool = False
+ ) -> None:
+     """
+     Visualize the list of images horizontally and save the figure as PNG.
+
+     Args:
+         list_of_files: The list of images as numpy array with shape (N, H, W, C).
+         output_file: The output file path to save the figure as PNG.
+         interact: If True, show the figure interactively (e.g. in a notebook)
+             instead of saving it to output_file.
+     """
+     number_of_files = len(list_of_files)
+
+     # Use each image's height and width to size the figure
+     heights = [image.shape[0] for image in list_of_files]
+     widths = [image.shape[1] for image in list_of_files]
+
+     fig_width = 8.0  # inches
+     fig_height = fig_width * sum(heights) / sum(widths)
+
+     # Create a figure with one subplot per image
+     _, axs = plt.subplots(
+         1, number_of_files, figsize=(fig_width * number_of_files, fig_height)
+     )
+     plt.tight_layout()
+     for i in range(number_of_files):
+         _image = list_of_files[i]
+         axs[i].imshow(_image)
+         axs[i].axis("off")
+
+     # Save the figure as PNG (or show it interactively)
+     if interact:
+         plt.show()
+     else:
+         plt.savefig(output_file, bbox_inches="tight", pad_inches=0.25)
+
+
+ def save_image(image: np.array, file_name: str) -> None:
+     """
+     Save the image as JPG.
+
+     Args:
+         image: The input image as numpy array with shape (H, W, C).
+         file_name: The file name to save the image.
+     """
+     image = Image.fromarray(image)
+     image.save(file_name)
+
+
+ def load_and_process_images(load_dir: str) -> np.array:
+     """
+     Load the images from a directory and scale them to [0, 1].
+
+     Args:
+         load_dir: The directory to load the images from.
+
+     Returns:
+         images: A list of images as numpy arrays, each with shape (H, W, C).
+     """
+     images = []
+     print(load_dir)
+     filenames = sorted(
+         os.listdir(load_dir), key=lambda x: int(x.split(".")[0])
+     )  # Ensure the files are sorted numerically (0.jpg, 1.jpg, ...)
+     for filename in filenames:
+         if filename.endswith(".jpg"):
+             img = Image.open(os.path.join(load_dir, filename))
+             img_array = (
+                 np.asarray(img) / 255.0
+             )  # Convert to numpy array and scale pixel values to [0, 1]
+             images.append(img_array)
+     return images
+
+
+ def compute_lpips(images: np.array, lpips_model: LPIPS) -> np.array:
+     """
+     Compute LPIPS distances between consecutive input images.
+
+     Args:
+         images: The input images as numpy array with shape (N, H, W, C).
+         lpips_model: The LPIPS model used to compute perceptual distances.
+
+     Returns:
+         distances: The LPIPS between each pair of adjacent images, shape (N-1,).
+     """
+     # Get the device of lpips_model
+     device = next(lpips_model.parameters()).device
+     device = str(device)
+
+     # Change the input images into a tensor of shape (N, C, H, W)
+     images = torch.tensor(images).to(device).float()
+     images = torch.permute(images, (0, 3, 1, 2))
+     normalize = Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     images = normalize(images)
+
+     # Compute the LPIPS between each adjacent pair of images
+     distances = []
+     for i in range(images.shape[0]):
+         if i == images.shape[0] - 1:
+             break
+         img1 = images[i].unsqueeze(0)
+         img2 = images[i + 1].unsqueeze(0)
+         loss = lpips_model(img1, img2)
+         distances.append(loss.item())
+     distances = np.array(distances)
+     return distances
+
+
+ def compute_gini(distances: np.array) -> float:
+     """
+     Compute the Gini index of the input distances.
+
+     Args:
+         distances: The input distances as numpy array.
+
+     Returns:
+         gini: The Gini index of the input distances.
+     """
+     if len(distances) < 2:
+         return 0.0  # Gini index is 0 for less than two elements
+
+     # Sort the list of distances
+     sorted_distances = sorted(distances)
+     n = len(sorted_distances)
+     mean_distance = sum(sorted_distances) / n
+
+     # Compute the sum of absolute differences
+     sum_of_differences = 0
+     for di in sorted_distances:
+         for dj in sorted_distances:
+             sum_of_differences += abs(di - dj)
+
+     # Normalize the sum of differences by the mean and the number of elements
+     gini = sum_of_differences / (2 * n * n * mean_distance)
+     return gini
+
+
+ def compute_smoothness_and_consistency(images: np.array, lpips_model: LPIPS) -> tuple:
+     """
+     Compute the smoothness and consistency of the input images.
+
+     Args:
+         images: The input images as numpy array with shape (N, H, W, C).
+         lpips_model: The LPIPS model used to compute perceptual distances.
+
+     Returns:
+         smoothness: One minus the Gini index of LPIPS of consecutive images.
+         consistency: The mean LPIPS of consecutive images.
+         max_inception_distance: The maximum LPIPS of consecutive images.
+     """
+     distances = compute_lpips(images, lpips_model)
+     smoothness = 1 - compute_gini(distances)
+     consistency = np.mean(distances)
+     max_inception_distance = np.max(distances)
+     return smoothness, consistency, max_inception_distance
+
+
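As a quick sanity check of the smoothness metric (a sketch, assuming this `utils` module is importable; the distance values are made up): perfectly even consecutive LPIPS steps give a Gini index of 0 and therefore a smoothness of 1, while a single large perceptual jump lowers it.

```python
import numpy as np

from utils import compute_gini  # the helper defined above

even = np.array([0.2, 0.2, 0.2, 0.2])       # identical consecutive LPIPS steps
uneven = np.array([0.05, 0.05, 0.05, 0.6])  # one large perceptual jump

print(1 - compute_gini(even))    # 1.0  -> maximally smooth
print(1 - compute_gini(uneven))  # < 1  -> penalized for the outlier step
```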
+ def separate_source_and_interpolated_images(images: np.array) -> tuple:
+     """
+     Separate the input images into source and interpolated images.
+     The sources are the first and last images; the interpolated images are the rest.
+
+     Args:
+         images: The input images as numpy array with shape (N, H, W, C).
+
+     Returns:
+         source: The source images as numpy array with shape (2, H, W, C).
+         interpolation: The interpolated images as numpy array with shape (N-2, H, W, C).
+     """
+     # Check if the array has at least two elements
+     if len(images) < 2:
+         raise ValueError("The input array should have at least two elements.")
+
+     # Separate the array into two parts
+     # First part takes the first and last element
+     source = np.array([images[0], images[-1]])
+     # Second part takes the rest of the elements
+     interpolation = images[1:-1]
+     return source, interpolation
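Taken together, these helpers score a saved interpolation sequence end to end. A usage sketch (the directory path is a placeholder and assumes the frames were written as `0.jpg`, `1.jpg`, ... by the generation script):

```python
import numpy as np
from lpips import LPIPS

from utils import (
    compute_smoothness_and_consistency,
    load_and_process_images,
    separate_source_and_interpolated_images,
)

lpips_model = LPIPS(net="alex")  # perceptual metric consumed by compute_lpips

# "results/example_run" is a placeholder directory containing 0.jpg, 1.jpg, ...
images = np.stack(load_and_process_images("results/example_run"))

# Source frames vs. interpolated frames, e.g. for separate reporting
source, interpolation = separate_source_and_interpolated_images(images)

smoothness, consistency, max_dist = compute_smoothness_and_consistency(
    images, lpips_model
)
print(f"smoothness={smoothness:.3f}, consistency={consistency:.3f}, max LPIPS={max_dist:.3f}")
```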