{ "cells": [ { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "# Experiments with Text-To-Video Zero Pipeline" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/awu/dev/lib/python3.8/site-packages/tqdm/auto.py:22: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", " from .autonotebook import tqdm as notebook_tqdm\n" ] } ], "source": [ "import torch\n", "import imageio\n", "from diffusers import TextToVideoZeroPipeline, ControlNetModel, StableDiffusionControlNetPipeline, TextToVideoZeroPipeline\n", "from diffusers.pipelines.text_to_video_synthesis.pipeline_text_to_video_zero import CrossFrameAttnProcessor\n", "from huggingface_hub import hf_hub_download\n", "from PIL import Image" ] }, { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import sys\n", "\n", "sys.path.insert(0, \"..\")" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "import jax\n", "jax.local_devices()" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Text-To-Video" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_id = \"tuwonga/zukki_style\"\n", "pipe = TextToVideoZeroPipeline.from_pretrained(model_id)\n", "\n", "prompt = \"A person taking a walk through the city at night\"\n", "result = pipe(prompt=prompt).images\n", "result = [(r * 255).astype(\"uint8\") for r in result]\n", "imageio.mimsave(\"video.mp4\", result, fps=4)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Text-To-Video with Pose Control" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [ "model_id = \"runwayml/stable-diffusion-v1-5\" # base model\n", "video_path = \"__assets__/dance1_corr.mp4\" # pose video\n", "\n", "reader = imageio.get_reader(video_path, \"ffmpeg\")\n", "frame_count = 8\n", "pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]\n", "\n", "controlnet = ControlNetModel.from_pretrained(\"lllyasviel/sd-controlnet-openpose\")\n", "pipe = StableDiffusionControlNetPipeline.from_pretrained(model_id, controlnet=controlnet)\n", "\n", "# Set the attention processor\n", "pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))\n", "pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))\n", "\n", "# fix latents for all frames\n", "latents = torch.randn((1, 4, 64, 64)).repeat(len(pose_images), 1, 1, 1)\n", "\n", "prompt = \"Darth Vader dancing in a desert\"\n", "result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images\n", "imageio.mimsave(\"video.mp4\", result, fps=4)" ] }, { "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Text-To-Video with Safetensors" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checkpoint path: /home/awu/.cache/huggingface/hub/models--breakcore2--ligne_claire_anime_diffusion/snapshots/0e89c2e14030f1afdc77b208e35aaf4a597238d9/ligne_claire_anime_diffusion_v1.safetensors\n", "global_step key not found in model\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing 
{ "attachments": {}, "cell_type": "markdown", "metadata": {}, "source": [ "### Text-To-Video with Safetensors" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Checkpoint path: /home/awu/.cache/huggingface/hub/models--breakcore2--ligne_claire_anime_diffusion/snapshots/0e89c2e14030f1afdc77b208e35aaf4a597238d9/ligne_claire_anime_diffusion_v1.safetensors\n", "global_step key not found in model\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Some weights of the model checkpoint at openai/clip-vit-large-patch14 were not used when initializing CLIPTextModel: ['vision_model.embeddings.class_embedding', 'vision_model.embeddings.patch_embedding.weight', 'vision_model.embeddings.position_embedding.weight', ..., 'visual_projection.weight', 'text_projection.weight', 'logit_scale']\n", "- This IS expected if you are initializing CLIPTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. 
initializing a BertForSequenceClassification model from a BertForPreTraining model).\n", "- This IS NOT expected if you are initializing CLIPTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).\n", "`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config[\"id2label\"]` will be overriden.\n", "/home/awu/dev/lib/python3.8/site-packages/transformers/models/clip/feature_extraction_clip.py:28: FutureWarning: The class CLIPFeatureExtractor is deprecated and will be removed in version 5 of Transformers. Please use CLIPImageProcessor instead.\n", " warnings.warn(\n" ] } ], "source": [ "from diffusers.pipelines.stable_diffusion.convert_from_ckpt import download_from_original_stable_diffusion_ckpt\n", "from huggingface_hub import hf_hub_download\n", "\n", "ckpt_path = hf_hub_download(repo_id=\"breakcore2/ligne_claire_anime_diffusion\", filename=\"ligne_claire_anime_diffusion_v1.safetensors\")\n", "\n", "print(f\"Checkpoint path: {ckpt_path}\")\n", "\n", "# !wget https://raw.githubusercontent.com/CompVis/stable-diffusion/main/configs/stable-diffusion/v1-inference.yaml\n", "pipe = download_from_original_stable_diffusion_ckpt(\n", " checkpoint_path=ckpt_path,\n", " original_config_file=\"configs/v1-inference.yaml\",\n", " from_safetensors=True\n", ")\n", "\n", "# pipe.save_pretrained(\"./models/ligne_claire\", safe_serialization=True)" ] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "StableDiffusionPipeline {\n", " \"_class_name\": \"StableDiffusionPipeline\",\n", " \"_diffusers_version\": \"0.16.0.dev0\",\n", " \"feature_extractor\": [\n", " \"transformers\",\n", " \"CLIPFeatureExtractor\"\n", " ],\n", " \"requires_safety_checker\": true,\n", " \"safety_checker\": [\n", " \"stable_diffusion\",\n", " \"StableDiffusionSafetyChecker\"\n", " ],\n", " \"scheduler\": [\n", " \"diffusers\",\n", " \"PNDMScheduler\"\n", " ],\n", " \"text_encoder\": [\n", " \"transformers\",\n", " \"CLIPTextModel\"\n", " ],\n", " \"tokenizer\": [\n", " \"transformers\",\n", " \"CLIPTokenizer\"\n", " ],\n", " \"unet\": [\n", " \"diffusers\",\n", " \"UNet2DConditionModel\"\n", " ],\n", " \"vae\": [\n", " \"diffusers\",\n", " \"AutoencoderKL\"\n", " ]\n", "}" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe" ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "UNet2DConditionModel(\n", " (conv_in): Conv2d(4, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_proj): Timesteps()\n", " (time_embedding): TimestepEmbedding(\n", " (linear_1): Linear(in_features=320, out_features=1280, bias=True)\n", " (act): SiLU()\n", " (linear_2): Linear(in_features=1280, out_features=1280, bias=True)\n", " )\n", " (down_blocks): ModuleList(\n", " (0): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 320, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=320, out_features=320, bias=False)\n", " (to_v): Linear(in_features=320, 
out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=768, out_features=320, bias=False)\n", " (to_v): Linear(in_features=768, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=320, out_features=2560, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=1280, out_features=320, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv1): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(320, 320, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (1): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 640, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=640, out_features=640, bias=False)\n", " (to_v): Linear(in_features=640, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=768, out_features=640, bias=False)\n", " (to_v): Linear(in_features=768, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=640, out_features=5120, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=2560, out_features=640, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv1): Conv2d(320, 640, kernel_size=(3, 3), stride=(1, 
1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(320, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (2): CrossAttnDownBlock2D(\n", " (attentions): ModuleList(\n", " (0-1): 2 x Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(640, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 
1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " (downsamplers): ModuleList(\n", " (0): Downsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (3): DownBlock2D(\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " )\n", " )\n", " (up_blocks): ModuleList(\n", " (0): UpBlock2D(\n", " (resnets): ModuleList(\n", " (0-2): 3 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)\n", " (conv1): Conv2d(2560, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (1): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 2560, eps=1e-05, affine=True)\n", " (conv1): Conv2d(2560, 
1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(2560, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1920, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1920, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (2): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 640, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=640, out_features=640, bias=False)\n", " (to_v): Linear(in_features=640, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=640, out_features=640, bias=False)\n", " (to_k): Linear(in_features=768, out_features=640, bias=False)\n", " (to_v): Linear(in_features=768, out_features=640, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=640, out_features=640, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((640,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=640, out_features=5120, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=2560, out_features=640, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(640, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1920, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1920, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1920, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " 
(time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(1280, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (2): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 960, eps=1e-05, affine=True)\n", " (conv1): Conv2d(960, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=640, bias=True)\n", " (norm2): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(960, 640, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (upsamplers): ModuleList(\n", " (0): Upsample2D(\n", " (conv): Conv2d(640, 640, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " )\n", " )\n", " )\n", " (3): CrossAttnUpBlock2D(\n", " (attentions): ModuleList(\n", " (0-2): 3 x Transformer2DModel(\n", " (norm): GroupNorm(32, 320, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=320, out_features=320, bias=False)\n", " (to_v): Linear(in_features=320, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=320, out_features=320, bias=False)\n", " (to_k): Linear(in_features=768, out_features=320, bias=False)\n", " (to_v): Linear(in_features=768, out_features=320, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=320, out_features=320, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((320,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=320, out_features=2560, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=1280, out_features=320, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(320, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0): ResnetBlock2D(\n", " (norm1): GroupNorm(32, 960, eps=1e-05, affine=True)\n", " (conv1): Conv2d(960, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " (norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(960, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " (1-2): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 640, eps=1e-05, affine=True)\n", " (conv1): Conv2d(640, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=320, bias=True)\n", " 
(norm2): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(320, 320, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " (conv_shortcut): Conv2d(640, 320, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " )\n", " )\n", " (mid_block): UNetMidBlock2DCrossAttn(\n", " (attentions): ModuleList(\n", " (0): Transformer2DModel(\n", " (norm): GroupNorm(32, 1280, eps=1e-06, affine=True)\n", " (proj_in): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " (transformer_blocks): ModuleList(\n", " (0): BasicTransformerBlock(\n", " (norm1): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn1): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm2): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (attn2): Attention(\n", " (to_q): Linear(in_features=1280, out_features=1280, bias=False)\n", " (to_k): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_v): Linear(in_features=768, out_features=1280, bias=False)\n", " (to_out): ModuleList(\n", " (0): Linear(in_features=1280, out_features=1280, bias=True)\n", " (1): Dropout(p=0.0, inplace=False)\n", " )\n", " )\n", " (norm3): LayerNorm((1280,), eps=1e-05, elementwise_affine=True)\n", " (ff): FeedForward(\n", " (net): ModuleList(\n", " (0): GEGLU(\n", " (proj): Linear(in_features=1280, out_features=10240, bias=True)\n", " )\n", " (1): Dropout(p=0.0, inplace=False)\n", " (2): Linear(in_features=5120, out_features=1280, bias=True)\n", " )\n", " )\n", " )\n", " )\n", " (proj_out): Conv2d(1280, 1280, kernel_size=(1, 1), stride=(1, 1))\n", " )\n", " )\n", " (resnets): ModuleList(\n", " (0-1): 2 x ResnetBlock2D(\n", " (norm1): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (conv1): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (time_emb_proj): Linear(in_features=1280, out_features=1280, bias=True)\n", " (norm2): GroupNorm(32, 1280, eps=1e-05, affine=True)\n", " (dropout): Dropout(p=0.0, inplace=False)\n", " (conv2): Conv2d(1280, 1280, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", " (nonlinearity): SiLU()\n", " )\n", " )\n", " )\n", " (conv_norm_out): GroupNorm(32, 320, eps=1e-05, affine=True)\n", " (conv_act): SiLU()\n", " (conv_out): Conv2d(320, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))\n", ")" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "pipe.unet" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loading Models...\n", "Generating Animation...\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "100%|██████████| 50/50 [51:51<00:00, 62.23s/it] \n" ] } ], "source": [ "video_path = \"../__assets__/dance2_corr.mp4\" # pose video\n", "\n", "reader = imageio.get_reader(video_path, \"ffmpeg\")\n", "frame_count = 16\n", "pose_images = [Image.fromarray(reader.get_data(i)) for i in range(frame_count)]\n", "\n", "print(\"Loading Models...\")\n", "controlnet = ControlNetModel.from_pretrained(\"lllyasviel/sd-controlnet-openpose\")\n", "pipe = 
StableDiffusionControlNetPipeline.from_pretrained(\"../models/ligne_claire\", controlnet=controlnet)\n", "\n", "# Set the attention processor\n", "pipe.unet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))\n", "pipe.controlnet.set_attn_processor(CrossFrameAttnProcessor(batch_size=2))\n", "\n", "# fix latents for all frames\n", "latents = torch.randn((1, 4, 64, 64)).repeat(len(pose_images), 1, 1, 1)\n", "\n", "\n", "print(\"Generating Animation...\")\n", "prompt = \"(ligne claire), girl walking through a city of sky scrapers at night\"\n", "result = pipe(prompt=[prompt] * len(pose_images), image=pose_images, latents=latents).images\n", "imageio.mimsave(\"video.mp4\", result, fps=4)" ] } ], "metadata": { "kernelspec": { "display_name": "dev", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.8.10" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }