Spaces:

SakanaAI
/

Evo-Ukiyoe

Running on Zero

File size: 5,876 Bytes

dd4e3ab

import gc
import os
from typing import Dict, List, Union

from diffusers import (
    StableDiffusionXLPipeline,
    UNet2DConditionModel,
)
from huggingface_hub import hf_hub_download
import safetensors
import torch
from tqdm import tqdm
from transformers import AutoTokenizer, CLIPTextModelWithProjection


# Base models
SDXL_REPO = "stabilityai/stable-diffusion-xl-base-1.0"
DPO_REPO = "mhdang/dpo-sdxl-text2image-v1"
JN_REPO = "RunDiffusion/Juggernaut-XL-v9"
JSDXL_REPO = "stabilityai/japanese-stable-diffusion-xl"

# Evo-Ukiyoe
UKIYOE_REPO = "SakanaAI/Evo-Ukiyoe-v1"


def load_state_dict(checkpoint_file: Union[str, os.PathLike], device: str = "cpu"):
    file_extension = os.path.basename(checkpoint_file).split(".")[-1]
    if file_extension == "safetensors":
        return safetensors.torch.load_file(checkpoint_file, device=device)
    else:
        return torch.load(checkpoint_file, map_location=device)


def load_from_pretrained(
    repo_id,
    filename="diffusion_pytorch_model.fp16.safetensors",
    subfolder="unet",
    device="cuda",
) -> Dict[str, torch.Tensor]:
    return load_state_dict(
        hf_hub_download(
            repo_id=repo_id,
            filename=filename,
            subfolder=subfolder,
        ),
        device=device,
    )


def reshape_weight_task_tensors(task_tensors, weights):
    """
    Reshapes `weights` to match the shape of `task_tensors` by unsqeezing in the remaining dimenions.

    Args:
        task_tensors (`torch.Tensor`): The tensors that will be used to reshape `weights`.
        weights (`torch.Tensor`): The tensor to be reshaped.

    Returns:
        `torch.Tensor`: The reshaped tensor.
    """
    new_shape = weights.shape + (1,) * (task_tensors.dim() - weights.dim())
    weights = weights.view(new_shape)
    return weights


def linear(task_tensors: List[torch.Tensor], weights: torch.Tensor) -> torch.Tensor:
    """
    Merge the task tensors using `linear`.

    Args:
        task_tensors(`List[torch.Tensor]`):The task tensors to merge.
        weights (`torch.Tensor`):The weights of the task tensors.

    Returns:
        `torch.Tensor`: The merged tensor.
    """
    task_tensors = torch.stack(task_tensors, dim=0)
    # weighted task tensors
    weights = reshape_weight_task_tensors(task_tensors, weights)
    weighted_task_tensors = task_tensors * weights
    mixed_task_tensors = weighted_task_tensors.sum(dim=0)
    return mixed_task_tensors


def merge_models(task_tensors, weights):
    keys = list(task_tensors[0].keys())
    weights = torch.tensor(weights, device=task_tensors[0][keys[0]].device)
    state_dict = {}
    for key in tqdm(keys, desc="Merging"):
        w_list = []
        for i, sd in enumerate(task_tensors):
            w = sd.pop(key)
            w_list.append(w)
        new_w = linear(task_tensors=w_list, weights=weights)
        state_dict[key] = new_w
    return state_dict


def split_conv_attn(weights):
    attn_tensors = {}
    conv_tensors = {}
    for key in list(weights.keys()):
        if any(k in key for k in ["to_k", "to_q", "to_v", "to_out.0"]):
            attn_tensors[key] = weights.pop(key)
        else:
            conv_tensors[key] = weights.pop(key)
    return {"conv": conv_tensors, "attn": attn_tensors}


def load_evo_ukiyoe(device="cuda") -> StableDiffusionXLPipeline:
    # Load base models
    sdxl_weights = split_conv_attn(load_from_pretrained(SDXL_REPO, device=device))
    dpo_weights = split_conv_attn(
        load_from_pretrained(
            DPO_REPO, "diffusion_pytorch_model.safetensors", device=device
        )
    )
    jn_weights = split_conv_attn(load_from_pretrained(JN_REPO, device=device))
    jsdxl_weights = split_conv_attn(load_from_pretrained(JSDXL_REPO, device=device))

    # Merge base models
    tensors = [sdxl_weights, dpo_weights, jn_weights, jsdxl_weights]
    new_conv = merge_models(
        [sd["conv"] for sd in tensors],
        [
            0.15928833971605916,
            0.1032449268871776,
            0.6503217149752791,
            0.08714501842148402,
        ],
    )
    new_attn = merge_models(
        [sd["attn"] for sd in tensors],
        [
            0.1877279276437178,
            0.20014114603909822,
            0.3922685507065275,
            0.2198623756106564,
        ],
    )

    # Delete no longer needed variables to free
    del sdxl_weights, dpo_weights, jn_weights, jsdxl_weights
    gc.collect()
    if "cuda" in device:
        torch.cuda.empty_cache()

    # Instantiate UNet
    unet_config = UNet2DConditionModel.load_config(SDXL_REPO, subfolder="unet")
    unet = UNet2DConditionModel.from_config(unet_config).to(device=device)
    unet.load_state_dict({**new_conv, **new_attn})

    # Load other modules
    text_encoder = CLIPTextModelWithProjection.from_pretrained(
        JSDXL_REPO, subfolder="text_encoder", torch_dtype=torch.float16, variant="fp16",
    )
    tokenizer = AutoTokenizer.from_pretrained(
        JSDXL_REPO, subfolder="tokenizer", use_fast=False,
    )

    # Load pipeline
    pipe = StableDiffusionXLPipeline.from_pretrained(
        SDXL_REPO,
        unet=unet,
        text_encoder=text_encoder,
        tokenizer=tokenizer,
        torch_dtype=torch.float16,
        variant="fp16",
    )
    # Load Evo-Ukiyoe weights
    pipe.load_lora_weights(UKIYOE_REPO)
    pipe.fuse_lora(lora_scale=1.0)
    pipe = pipe.to(device=torch.device(device), dtype=torch.float16)

    return pipe


if __name__ == "__main__":
    pipe = load_evo_ukiyoe()
    images = pipe(
        prompt="着物を着ている猫が庭でお茶を飲んでいる。最高品質の輻の浮世絵。超詳細。",
        negative_prompt="",
        guidance_scale=8.0,
        num_inference_steps=50,
        generator=torch.Generator().manual_seed(0),
        num_images_per_prompt=1,
        output_type="pil",
    ).images
    images[0].save("out.png")