Upload 23 files
- LICENSE +201 -0
- README.md +302 -9
- app.py +97 -0
- gradio_app/app_utils.py +30 -0
- gradio_app/image_synthesis_app.py +166 -0
- gradio_app/images/corgi.jpg +0 -0
- gradio_app/images/person.png +0 -0
- gradio_app/real_image_editing_app.py +162 -0
- masactrl/__init__.py +0 -0
- masactrl/diffuser_utils.py +275 -0
- masactrl/masactrl.py +334 -0
- masactrl/masactrl_processor.py +259 -0
- masactrl/masactrl_utils.py +212 -0
- masactrl_w_adapter/ddim.py +375 -0
- masactrl_w_adapter/masactrl_w_adapter.py +217 -0
- playground.ipynb +149 -0
- playground_real.ipynb +188 -0
- requirements.txt +13 -0
- run_synthesis_genshin_impact_xl.py +117 -0
- run_synthesis_genshin_impact_xl_app.py +97 -0
- run_synthesis_sdxl.py +83 -0
- run_synthesis_sdxl_processor.py +90 -0
- style.css +3 -0
LICENSE
ADDED
@@ -0,0 +1,201 @@
                                 Apache License
                           Version 2.0, January 2004
                        http://www.apache.org/licenses/

   TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION

   1. Definitions.

      "License" shall mean the terms and conditions for use, reproduction,
      and distribution as defined by Sections 1 through 9 of this document.

      "Licensor" shall mean the copyright owner or entity authorized by
      the copyright owner that is granting the License.

      "Legal Entity" shall mean the union of the acting entity and all
      other entities that control, are controlled by, or are under common
      control with that entity. For the purposes of this definition,
      "control" means (i) the power, direct or indirect, to cause the
      direction or management of such entity, whether by contract or
      otherwise, or (ii) ownership of fifty percent (50%) or more of the
      outstanding shares, or (iii) beneficial ownership of such entity.

      "You" (or "Your") shall mean an individual or Legal Entity
      exercising permissions granted by this License.

      "Source" form shall mean the preferred form for making modifications,
      including but not limited to software source code, documentation
      source, and configuration files.

      "Object" form shall mean any form resulting from mechanical
      transformation or translation of a Source form, including but
      not limited to compiled object code, generated documentation,
      and conversions to other media types.

      "Work" shall mean the work of authorship, whether in Source or
      Object form, made available under the License, as indicated by a
      copyright notice that is included in or attached to the work
      (an example is provided in the Appendix below).

      "Derivative Works" shall mean any work, whether in Source or Object
      form, that is based on (or derived from) the Work and for which the
      editorial revisions, annotations, elaborations, or other modifications
      represent, as a whole, an original work of authorship. For the purposes
      of this License, Derivative Works shall not include works that remain
      separable from, or merely link (or bind by name) to the interfaces of,
      the Work and Derivative Works thereof.

      "Contribution" shall mean any work of authorship, including
      the original version of the Work and any modifications or additions
      to that Work or Derivative Works thereof, that is intentionally
      submitted to Licensor for inclusion in the Work by the copyright owner
      or by an individual or Legal Entity authorized to submit on behalf of
      the copyright owner. For the purposes of this definition, "submitted"
      means any form of electronic, verbal, or written communication sent
      to the Licensor or its representatives, including but not limited to
      communication on electronic mailing lists, source code control systems,
      and issue tracking systems that are managed by, or on behalf of, the
      Licensor for the purpose of discussing and improving the Work, but
      excluding communication that is conspicuously marked or otherwise
      designated in writing by the copyright owner as "Not a Contribution."

      "Contributor" shall mean Licensor and any individual or Legal Entity
      on behalf of whom a Contribution has been received by Licensor and
      subsequently incorporated within the Work.

   2. Grant of Copyright License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      copyright license to reproduce, prepare Derivative Works of,
      publicly display, publicly perform, sublicense, and distribute the
      Work and such Derivative Works in Source or Object form.

   3. Grant of Patent License. Subject to the terms and conditions of
      this License, each Contributor hereby grants to You a perpetual,
      worldwide, non-exclusive, no-charge, royalty-free, irrevocable
      (except as stated in this section) patent license to make, have made,
      use, offer to sell, sell, import, and otherwise transfer the Work,
      where such license applies only to those patent claims licensable
      by such Contributor that are necessarily infringed by their
      Contribution(s) alone or by combination of their Contribution(s)
      with the Work to which such Contribution(s) was submitted. If You
      institute patent litigation against any entity (including a
      cross-claim or counterclaim in a lawsuit) alleging that the Work
      or a Contribution incorporated within the Work constitutes direct
      or contributory patent infringement, then any patent licenses
      granted to You under this License for that Work shall terminate
      as of the date such litigation is filed.

   4. Redistribution. You may reproduce and distribute copies of the
      Work or Derivative Works thereof in any medium, with or without
      modifications, and in Source or Object form, provided that You
      meet the following conditions:

      (a) You must give any other recipients of the Work or
          Derivative Works a copy of this License; and

      (b) You must cause any modified files to carry prominent notices
          stating that You changed the files; and

      (c) You must retain, in the Source form of any Derivative Works
          that You distribute, all copyright, patent, trademark, and
          attribution notices from the Source form of the Work,
          excluding those notices that do not pertain to any part of
          the Derivative Works; and

      (d) If the Work includes a "NOTICE" text file as part of its
          distribution, then any Derivative Works that You distribute must
          include a readable copy of the attribution notices contained
          within such NOTICE file, excluding those notices that do not
          pertain to any part of the Derivative Works, in at least one
          of the following places: within a NOTICE text file distributed
          as part of the Derivative Works; within the Source form or
          documentation, if provided along with the Derivative Works; or,
          within a display generated by the Derivative Works, if and
          wherever such third-party notices normally appear. The contents
          of the NOTICE file are for informational purposes only and
          do not modify the License. You may add Your own attribution
          notices within Derivative Works that You distribute, alongside
          or as an addendum to the NOTICE text from the Work, provided
          that such additional attribution notices cannot be construed
          as modifying the License.

      You may add Your own copyright statement to Your modifications and
      may provide additional or different license terms and conditions
      for use, reproduction, or distribution of Your modifications, or
      for any such Derivative Works as a whole, provided Your use,
      reproduction, and distribution of the Work otherwise complies with
      the conditions stated in this License.

   5. Submission of Contributions. Unless You explicitly state otherwise,
      any Contribution intentionally submitted for inclusion in the Work
      by You to the Licensor shall be under the terms and conditions of
      this License, without any additional terms or conditions.
      Notwithstanding the above, nothing herein shall supersede or modify
      the terms of any separate license agreement you may have executed
      with Licensor regarding such Contributions.

   6. Trademarks. This License does not grant permission to use the trade
      names, trademarks, service marks, or product names of the Licensor,
      except as required for reasonable and customary use in describing the
      origin of the Work and reproducing the content of the NOTICE file.

   7. Disclaimer of Warranty. Unless required by applicable law or
      agreed to in writing, Licensor provides the Work (and each
      Contributor provides its Contributions) on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
      implied, including, without limitation, any warranties or conditions
      of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
      PARTICULAR PURPOSE. You are solely responsible for determining the
      appropriateness of using or redistributing the Work and assume any
      risks associated with Your exercise of permissions under this License.

   8. Limitation of Liability. In no event and under no legal theory,
      whether in tort (including negligence), contract, or otherwise,
      unless required by applicable law (such as deliberate and grossly
      negligent acts) or agreed to in writing, shall any Contributor be
      liable to You for damages, including any direct, indirect, special,
      incidental, or consequential damages of any character arising as a
      result of this License or out of the use or inability to use the
      Work (including but not limited to damages for loss of goodwill,
      work stoppage, computer failure or malfunction, or any and all
      other commercial damages or losses), even if such Contributor
      has been advised of the possibility of such damages.

   9. Accepting Warranty or Additional Liability. While redistributing
      the Work or Derivative Works thereof, You may choose to offer,
      and charge a fee for, acceptance of support, warranty, indemnity,
      or other liability obligations and/or rights consistent with this
      License. However, in accepting such obligations, You may act only
      on Your own behalf and on Your sole responsibility, not on behalf
      of any other Contributor, and only if You agree to indemnify,
      defend, and hold each Contributor harmless for any liability
      incurred by, or claims asserted against, such Contributor by reason
      of your accepting any such warranty or additional liability.

   END OF TERMS AND CONDITIONS

   APPENDIX: How to apply the Apache License to your work.

      To apply the Apache License to your work, attach the following
      boilerplate notice, with the fields enclosed by brackets "[]"
      replaced with your own identifying information. (Don't include
      the brackets!) The text should be enclosed in the appropriate
      comment syntax for the file format. We also recommend that a
      file or class name and description of purpose be included on the
      same "printed page" as the copyright notice for easier
      identification within third-party archives.

   Copyright [yyyy] [name of copyright owner]

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
   You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
README.md
CHANGED
@@ -1,12 +1,305 @@
# MotionCtrl and MasaCtrl: Genshin Impact Character Synthesis

This repository provides a guide to setting up and running the MotionCtrl and MasaCtrl projects for synthesizing Genshin Impact characters with diffusion models. The process involves cloning the repositories, installing the dependencies, and running scripts to generate and compare character images.

## Table of Contents
- [Prerequisites](#prerequisites)
- [Installation](#installation)
- [Running the Synthesis](#running-the-synthesis)
- [Using Gradio Interface](#using-gradio-interface)
- [Example Prompts](#example-prompts)

## Prerequisites

Before you begin, ensure you have the following installed:
- Python 3.10
- Conda (for environment management)
- Git
- Git LFS (Large File Storage)
- FFmpeg

## Installation

### Step 1: Clone the MotionCtrl Repository
Clone the MotionCtrl repository and install its dependencies:

```bash
git clone https://huggingface.co/spaces/svjack/MotionCtrl
cd MotionCtrl
pip install -r requirements.txt
```

### Step 2: Install System Dependencies
Update your package list and install the necessary system packages:

```bash
sudo apt-get update
sudo apt-get install cbm git-lfs ffmpeg
```

### Step 3: Set Up the Python Environment
Create a Conda environment with Python 3.10, activate it, and install the IPython kernel:

```bash
conda create -n py310 python=3.10
conda activate py310
pip install ipykernel
python -m ipykernel install --user --name py310 --display-name "py310"
```

### Step 4: Clone the MasaCtrl Repository
Clone the MasaCtrl repository and install its dependencies:

```bash
git clone https://github.com/svjack/MasaCtrl
cd MasaCtrl
pip install -r requirements.txt
```

## Running the Synthesis

### Command Line Interface
Run the synthesis script to generate images of Genshin Impact characters:

```bash
python run_synthesis_genshin_impact_xl.py --model_path "svjack/GenshinImpact_XL_Base" \
    --prompt1 "solo,ZHONGLI\(genshin impact\),1boy,highres," \
    --prompt2 "solo,ZHONGLI drink tea use chinese cup \(genshin impact\),1boy,highres," --guidance_scale 5
```

### Gradio Interface
Alternatively, you can use the Gradio interface for a more interactive experience:

```bash
python run_synthesis_genshin_impact_xl_app.py
```

## Example Prompts

Here are some example prompts you can use to generate different character images (the image produced with MasaCtrl stays closer to the source image, e.g. in its background and other aspects):

- **Zhongli Drinking Tea:**
  ```
  "solo,ZHONGLI(genshin impact),1boy,highres," -> "solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,"
  ```
  ![Screenshot 2024-11-17 132742](https://github.com/user-attachments/assets/00451728-f2d5-4009-afa8-23baaabdc223)

- **Kamisato Ayato Smiling:**
  ```
  "solo,KAMISATO AYATO(genshin impact),1boy,highres," -> "solo,KAMISATO AYATO smiling (genshin impact),1boy,highres,"
  ```
  ![Screenshot 2024-11-17 133421](https://github.com/user-attachments/assets/7a920f4c-8a3a-4387-98d6-381a798566ef)

## MasaCtrl: Tuning-free <span style="text-decoration: underline"><font color="Tomato">M</font></span>utu<span style="text-decoration: underline"><font color="Tomato">a</font></span>l <span style="text-decoration: underline"><font color="Tomato">S</font></span>elf-<span style="text-decoration: underline"><font color="Tomato">A</font></span>ttention <span style="text-decoration: underline"><font color="Tomato">Control</font></span> for Consistent Image Synthesis and Editing

Pytorch implementation of [MasaCtrl: Tuning-free Mutual Self-Attention Control for **Consistent Image Synthesis and Editing**](https://arxiv.org/abs/2304.08465)

[Mingdeng Cao](https://github.com/ljzycmd),
[Xintao Wang](https://xinntao.github.io/),
[Zhongang Qi](https://scholar.google.com/citations?user=zJvrrusAAAAJ),
[Ying Shan](https://scholar.google.com/citations?user=4oXBp9UAAAAJ),
[Xiaohu Qie](https://scholar.google.com/citations?user=mk-F69UAAAAJ),
[Yinqiang Zheng](https://scholar.google.com/citations?user=JD-5DKcAAAAJ)

[![arXiv](https://img.shields.io/badge/ArXiv-2304.08465-brightgreen)](https://arxiv.org/abs/2304.08465)
[![Project page](https://img.shields.io/badge/Project-Page-brightgreen)](https://ljzycmd.github.io/projects/MasaCtrl/)
[![demo](https://img.shields.io/badge/Demo-Hugging%20Face-brightgreen)](https://huggingface.co/spaces/TencentARC/MasaCtrl)
[![demo](https://img.shields.io/badge/Demo-Colab-brightgreen)](https://colab.research.google.com/drive/1DZeQn2WvRBsNg4feS1bJrwWnIzw1zLJq?usp=sharing)
[![Open in OpenXLab](https://cdn-static.openxlab.org.cn/app-center/openxlab_app.svg)](https://openxlab.org.cn/apps/detail/MingDengCao/MasaCtrl)

---

<div align="center">
<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/overview.gif">
<i> MasaCtrl enables various consistent non-rigid image synthesis and editing tasks without fine-tuning or optimization. </i>
</div>


## Updates
- [2024/8/17] We add an AttnProcessor-based MasaCtrlProcessor; please check `masactrl/masactrl_processor.py` and `run_synthesis_sdxl_processor.py`. You can integrate MasaCtrl into the official Diffusers pipeline by registering the attention processor (a registration sketch follows this list).
- [2023/8/20] MasaCtrl now supports SDXL (and other variants). ![sdxl_example](https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/sdxl_example.jpg)
- [2023/5/13] The inference code of MasaCtrl with T2I-Adapter is available.
- [2023/4/28] [Hugging Face demo](https://huggingface.co/spaces/TencentARC/MasaCtrl) released.
- [2023/4/25] Code released.
- [2023/4/17] Paper is available [here](https://arxiv.org/abs/2304.08465).
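The MasaCtrlProcessor route mentioned above plugs into Diffusers through its attention-processor API. The snippet below is only an illustrative sketch of that registration pattern; it assumes `MasaCtrlProcessor` takes the same start step/layer arguments as `MutualSelfAttentionControl`, so check `masactrl/masactrl_processor.py` and `run_synthesis_sdxl_processor.py` for the actual interface.

```python
import torch
from diffusers import DDIMScheduler, DiffusionPipeline
from masactrl.masactrl_processor import MasaCtrlProcessor  # see masactrl/masactrl_processor.py

device = "cuda" if torch.cuda.is_available() else "cpu"
pipe = DiffusionPipeline.from_pretrained(
    "svjack/GenshinImpact_XL_Base",
    scheduler=DDIMScheduler(beta_start=0.00085, beta_end=0.012,
                            beta_schedule="scaled_linear",
                            clip_sample=False, set_alpha_to_one=False),
).to(device)

# Sketch only: the constructor arguments below are assumed, not guaranteed.
# set_attn_processor swaps the processor used by every attention layer; the
# repository's script may restrict this to self-attention after a start step/layer.
processor = MasaCtrlProcessor(start_step=4, start_layer=64)
pipe.unet.set_attn_processor(processor)

images = pipe(["solo,ZHONGLI(genshin impact),1boy,highres,",
               "solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,"]).images
```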

---

## Introduction

We propose MasaCtrl, a tuning-free method for non-rigid, consistent image synthesis and editing. The key idea is to combine the `contents` of the *source image* with the `layout` synthesized from the *text prompt and additional controls* into the desired synthesized or edited image, by querying semantically correlated features with **Mutual Self-Attention Control**.
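Conceptually, mutual self-attention control keeps the queries of the target branch but, after a chosen denoising step and U-Net layer, lets them attend to the keys and values computed for the source branch. The following is a minimal conceptual sketch of that substitution for a single self-attention layer, not the repository's implementation (see `masactrl/masactrl.py` for the real `MutualSelfAttentionControl`):

```python
import torch


def mutual_self_attention(q_tgt, k_src, v_src, scale):
    """Target queries attend to source keys/values in one self-attention layer.

    q_tgt:        (batch*heads, seq_len, dim_head) queries of the target branch
    k_src, v_src: keys/values taken from the same layer of the source branch
    scale:        usually dim_head ** -0.5
    """
    attn = torch.softmax(q_tgt @ k_src.transpose(-2, -1) * scale, dim=-1)
    # Result keeps the target's layout (queries) but pulls appearance from the source.
    return attn @ v_src
```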


## Main Features

### 1 Consistent Image Synthesis and Editing

MasaCtrl can perform prompt-based image synthesis and editing that changes the layout while maintaining the contents of the source image.

>*The target layout is synthesized directly from the target prompt.*

<details><summary>View visual results</summary>
<div align="center">
<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/results_synthetic.png">
<i>Consistent synthesis results</i>

<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/results_real.png">
<i>Real image editing results</i>
</div>
</details>


### 2 Integration into Controllable Diffusion Models

Directly modifying the text prompt often cannot produce the target layout of the desired image, so we further integrate our method into existing controllable diffusion pipelines (such as T2I-Adapter and ControlNet) to obtain stable synthesis and editing results.

>*The target layout is controlled by additional guidance.*

<details><summary>View visual results</summary>
<div align="center">
<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/results_w_adapter.png">
<i>Synthesis (left part) and editing (right part) results with T2I-Adapter</i>
</div>
</details>

### 3 Generalization to Other Models: Anything-V4

Our method also generalizes well to other Stable-Diffusion-based models.

<details><summary>View visual results</summary>
<div align="center">
<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/anythingv4_synthetic.png">
<i>Results on Anything-V4</i>
</div>
</details>


### 4 Extension to Video Synthesis

With dense consistent guidance, MasaCtrl enables video synthesis.

<details><summary>View visual results</summary>
<div align="center">
<img src="https://huggingface.co/TencentARC/MasaCtrl/resolve/main/assets/results_w_adapter_consistent.png">
<i>Video synthesis results (with keypose and canny guidance)</i>
</div>
</details>


## Usage

### Requirements
We implement our method on top of the [diffusers](https://github.com/huggingface/diffusers) code base, with a code structure similar to [Prompt-to-Prompt](https://github.com/google/prompt-to-prompt). The code runs on Python 3.8.5 with PyTorch 1.11. A Conda environment is highly recommended.

```bash
pip install -r requirements.txt
```

### Checkpoints

**Stable Diffusion:**
We mainly conduct experiments on Stable Diffusion v1-4, while our method generalizes to other versions (such as v1-5). You can download these checkpoints from their official repositories and [Hugging Face](https://huggingface.co/).

**Personalized Models:**
You can download personalized models from [CIVITAI](https://civitai.com/) or train your own customized models.
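As a quick sketch (not a script in this repository), a downloaded checkpoint can be loaded into the provided `MasaCtrlPipeline` the same way `gradio_app/app_utils.py` does; swap the model id for a local personalized model path if needed:

```python
import torch
from diffusers import DDIMScheduler
from masactrl.diffuser_utils import MasaCtrlPipeline

device = "cuda" if torch.cuda.is_available() else "cpu"

# DDIM scheduler settings mirror gradio_app/app_utils.py.
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
                          beta_schedule="scaled_linear",
                          clip_sample=False, set_alpha_to_one=False)

# Any SD v1.x checkpoint id on Hugging Face, or a local personalized model path.
pipe = MasaCtrlPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
                                        scheduler=scheduler).to(device)
```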


### Demos

**Notebook demos**

To run the synthesis with MasaCtrl, a single GPU with at least 16 GB of VRAM is required.

The notebooks `playground.ipynb` and `playground_real.ipynb` provide the synthesis and real-image editing samples, respectively.

**Online demos**

We provide an online Gradio app [![demo](https://img.shields.io/badge/Demo-Hugging%20Face-brightgreen)](https://huggingface.co/spaces/TencentARC/MasaCtrl). Note that you may need to copy the demo into your own space to use a GPU. An online Colab demo [![demo](https://img.shields.io/badge/Demo-Colab-brightgreen)](https://colab.research.google.com/drive/1DZeQn2WvRBsNg4feS1bJrwWnIzw1zLJq?usp=sharing) is also available.

**Local Gradio demo**

You can launch the provided Gradio demo locally with

```bash
CUDA_VISIBLE_DEVICES=0 python app.py
```


### MasaCtrl with T2I-Adapter

Install [T2I-Adapter](https://github.com/TencentARC/T2I-Adapter) and prepare the checkpoints following its tutorial. In the following we assume it has been installed successfully with root directory `T2I-Adapter`.

Then copy the core `masactrl` package and the inference script `masactrl_w_adapter.py` to the root directory of T2I-Adapter:

```bash
cp -r MasaCtrl/masactrl T2I-Adapter/
cp MasaCtrl/masactrl_w_adapter/masactrl_w_adapter.py T2I-Adapter/
```

**[Updates]** Alternatively, you can clone the repo [MasaCtrl-w-T2I-Adapter](https://github.com/ljzycmd/T2I-Adapter-w-MasaCtrl) directly to your local space.

Finally, you can synthesize images with the following command (with the sketch adapter):

```bash
python masactrl_w_adapter.py \
    --which_cond sketch \
    --cond_path_src SOURCE_CONDITION_PATH \
    --cond_path CONDITION_PATH \
    --cond_inp_type sketch \
    --prompt_src "A bear walking in the forest" \
    --prompt "A bear standing in the forest" \
    --sd_ckpt models/sd-v1-4.ckpt \
    --resize_short_edge 512 \
    --cond_tau 1.0 \
    --cond_weight 1.0 \
    --n_samples 1 \
    --adapter_ckpt models/t2iadapter_sketch_sd14v1.pth
```

NOTE: You can download the sketch examples [here](https://huggingface.co/TencentARC/MasaCtrl/tree/main/sketch_example).

For real images, DDIM inversion is performed to invert the image into a noise map, so we add the inversion process to the original DDIM sampler. **You should replace the original file `T2I-Adapter/ldm/models/diffusion/ddim.py` with the extended version `MasaCtrl/masactrl_w_adapter/ddim.py` to enable the inversion function.** Then you can edit a real image with the following command (with the sketch adapter):

```bash
python masactrl_w_adapter.py \
    --src_img_path SOURCE_IMAGE_PATH \
    --cond_path CONDITION_PATH \
    --cond_inp_type image \
    --prompt_src "" \
    --prompt "a photo of a man wearing black t-shirt, giving a thumbs up" \
    --sd_ckpt models/sd-v1-4.ckpt \
    --resize_short_edge 512 \
    --cond_tau 1.0 \
    --cond_weight 1.0 \
    --n_samples 1 \
    --which_cond sketch \
    --adapter_ckpt models/t2iadapter_sketch_sd14v1.pth \
    --outdir ./workdir/masactrl_w_adapter_inversion/black-shirt
```

NOTE: You can download the real image editing example [here](https://huggingface.co/TencentARC/MasaCtrl/tree/main/black_shirt_example).
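For reference, deterministic DDIM inversion follows the standard formulation: with predicted noise $\epsilon_\theta(x_t, t)$ and cumulative schedule $\bar\alpha_t$, each inversion step runs the DDIM update backwards,

$$
x_{t+1} = \sqrt{\bar\alpha_{t+1}}\,\frac{x_t - \sqrt{1-\bar\alpha_t}\,\epsilon_\theta(x_t, t)}{\sqrt{\bar\alpha_t}} + \sqrt{1-\bar\alpha_{t+1}}\,\epsilon_\theta(x_t, t),
$$

which is what the `next_step` routine in `masactrl/diffuser_utils.py` implements for the Diffusers pipeline; the extended `ddim.py` adds the equivalent step to the T2I-Adapter sampler.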

## Acknowledgements

We thank the authors of the awesome works [Prompt-to-Prompt](https://github.com/google/prompt-to-prompt) and [T2I-Adapter](https://github.com/TencentARC/T2I-Adapter).


## Citation

```bibtex
@InProceedings{cao_2023_masactrl,
    author    = {Cao, Mingdeng and Wang, Xintao and Qi, Zhongang and Shan, Ying and Qie, Xiaohu and Zheng, Yinqiang},
    title     = {MasaCtrl: Tuning-Free Mutual Self-Attention Control for Consistent Image Synthesis and Editing},
    booktitle = {Proceedings of the IEEE/CVF International Conference on Computer Vision (ICCV)},
    month     = {October},
    year      = {2023},
    pages     = {22560-22570}
}
```


## Contact

If you have any comments or questions, please [open a new issue](https://github.com/TencentARC/MasaCtrl/issues/new/choose) or feel free to contact [Mingdeng Cao](https://github.com/ljzycmd) and [Xintao Wang](https://xinntao.github.io/).
app.py
ADDED
@@ -0,0 +1,97 @@
```python
import gradio as gr
import torch
from diffusers import DDIMScheduler, DiffusionPipeline
from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import AttentionBase, regiter_attention_editor_diffusers
from masactrl.masactrl import MutualSelfAttentionControl
from pytorch_lightning import seed_everything
import os
import re

# Initialize the device and the model
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
model = DiffusionPipeline.from_pretrained("svjack/GenshinImpact_XL_Base", scheduler=scheduler).to(device)

def pathify(s):
    return re.sub(r'[^a-zA-Z0-9]', '_', s.lower())

def consistent_synthesis(prompt1, prompt2, guidance_scale, seed, starting_step, starting_layer):
    seed_everything(seed)

    # Create the output directory
    out_dir_ori = os.path.join("masactrl_exp", pathify(prompt2))
    os.makedirs(out_dir_ori, exist_ok=True)

    prompts = [prompt1, prompt2]

    # Initialize the noise map (shared by both prompts)
    start_code = torch.randn([1, 4, 128, 128], device=device)
    start_code = start_code.expand(len(prompts), -1, -1, -1)

    # Inference without MasaCtrl
    editor = AttentionBase()
    regiter_attention_editor_diffusers(model, editor)
    image_ori = model(prompts, latents=start_code, guidance_scale=guidance_scale).images

    # Hijack the attention modules
    editor = MutualSelfAttentionControl(starting_step, starting_layer, model_type="SDXL")
    regiter_attention_editor_diffusers(model, editor)

    # Inference with MasaCtrl
    image_masactrl = model(prompts, latents=start_code, guidance_scale=guidance_scale).images

    sample_count = len(os.listdir(out_dir_ori))
    out_dir = os.path.join(out_dir_ori, f"sample_{sample_count}")
    os.makedirs(out_dir, exist_ok=True)
    image_ori[0].save(os.path.join(out_dir, f"source_step{starting_step}_layer{starting_layer}.png"))
    image_ori[1].save(os.path.join(out_dir, f"without_step{starting_step}_layer{starting_layer}.png"))
    image_masactrl[-1].save(os.path.join(out_dir, f"masactrl_step{starting_step}_layer{starting_layer}.png"))
    with open(os.path.join(out_dir, "prompts.txt"), "w") as f:
        for p in prompts:
            f.write(p + "\n")
        f.write(f"seed: {seed}\n")
        f.write(f"starting_step: {starting_step}\n")
        f.write(f"starting_layer: {starting_layer}\n")
    print("Synthesized images are saved in", out_dir)

    return [image_ori[0], image_ori[1], image_masactrl[-1]]

def create_demo_synthesis():
    with gr.Blocks() as demo:
        gr.Markdown("# **Genshin Impact XL MasaCtrl Image Synthesis**")  # page title
        gr.Markdown("## **Input Settings**")
        with gr.Row():
            with gr.Column():
                prompt1 = gr.Textbox(label="Prompt 1", value="solo,ZHONGLI(genshin impact),1boy,highres,")
                prompt2 = gr.Textbox(label="Prompt 2", value="solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,")
                with gr.Row():
                    starting_step = gr.Slider(label="Starting Step", minimum=0, maximum=999, value=4, step=1)
                    starting_layer = gr.Slider(label="Starting Layer", minimum=0, maximum=999, value=64, step=1)
                run_btn = gr.Button("Run")
            with gr.Column():
                guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
                seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=42, step=1)

        gr.Markdown("## **Output**")
        with gr.Row():
            image_source = gr.Image(label="Source Image")
            image_without_masactrl = gr.Image(label="Image without MasaCtrl")
            image_with_masactrl = gr.Image(label="Image with MasaCtrl")

        inputs = [prompt1, prompt2, guidance_scale, seed, starting_step, starting_layer]
        run_btn.click(consistent_synthesis, inputs, [image_source, image_without_masactrl, image_with_masactrl])

        gr.Examples(
            [
                ["solo,ZHONGLI(genshin impact),1boy,highres,", "solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,", 42, 4, 64],
                ["solo,KAMISATO AYATO(genshin impact),1boy,highres,", "solo,KAMISATO AYATO smiling (genshin impact),1boy,highres,", 42, 4, 55]
            ],
            [prompt1, prompt2, seed, starting_step, starting_layer],
        )
    return demo

if __name__ == "__main__":
    demo_synthesis = create_demo_synthesis()
    demo_synthesis.launch(share=True)
```
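For quick experiments without the UI, the synthesis routine in `app.py` can also be called directly from Python. A minimal sketch, assuming it is run from the repository root (importing `app` loads the SDXL pipeline, so a GPU with sufficient VRAM is required):

```python
from app import consistent_synthesis

# Returns PIL images: [source, same-seed target without MasaCtrl, target with MasaCtrl]
source, plain, masactrl = consistent_synthesis(
    prompt1="solo,ZHONGLI(genshin impact),1boy,highres,",
    prompt2="solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,",
    guidance_scale=5.0,
    seed=42,
    starting_step=4,
    starting_layer=64,
)
masactrl.save("zhongli_masactrl.png")
```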
gradio_app/app_utils.py
ADDED
@@ -0,0 +1,30 @@
```python
import gradio as gr
import numpy as np
import torch
from diffusers import DDIMScheduler
from pytorch_lightning import seed_everything

from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import (AttentionBase,
                                     regiter_attention_editor_diffusers)


torch.set_grad_enabled(False)

device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
    "cpu")
model_path = "xyn-ai/anything-v4.0"
scheduler = DDIMScheduler(beta_start=0.00085,
                          beta_end=0.012,
                          beta_schedule="scaled_linear",
                          clip_sample=False,
                          set_alpha_to_one=False)
model = MasaCtrlPipeline.from_pretrained(model_path,
                                         scheduler=scheduler).to(device)

# Shared state consumed by the other gradio_app modules.
global_context = {
    "model_path": model_path,
    "scheduler": scheduler,
    "model": model,
    "device": device
}
```
gradio_app/image_synthesis_app.py
ADDED
@@ -0,0 +1,166 @@
```python
import gradio as gr
import numpy as np
import torch
from diffusers import DDIMScheduler
from pytorch_lightning import seed_everything

from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import (AttentionBase,
                                     regiter_attention_editor_diffusers)

from .app_utils import global_context

torch.set_grad_enabled(False)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
#     "cpu")
# model_path = "andite/anything-v4.0"
# scheduler = DDIMScheduler(beta_start=0.00085,
#                           beta_end=0.012,
#                           beta_schedule="scaled_linear",
#                           clip_sample=False,
#                           set_alpha_to_one=False)
# model = MasaCtrlPipeline.from_pretrained(model_path,
#                                          scheduler=scheduler).to(device)


def consistent_synthesis(source_prompt, target_prompt, starting_step,
                         starting_layer, image_resolution, ddim_steps, scale,
                         seed, appended_prompt, negative_prompt):
    from masactrl.masactrl import MutualSelfAttentionControl

    model = global_context["model"]
    device = global_context["device"]

    seed_everything(seed)

    with torch.no_grad():
        if appended_prompt is not None:
            source_prompt += appended_prompt
            target_prompt += appended_prompt
        prompts = [source_prompt, target_prompt]

        # initialize the noise map
        start_code = torch.randn([1, 4, 64, 64], device=device)
        start_code = start_code.expand(len(prompts), -1, -1, -1)

        # inference the synthesized image without MasaCtrl
        editor = AttentionBase()
        regiter_attention_editor_diffusers(model, editor)
        target_image_ori = model([target_prompt],
                                 latents=start_code[-1:],
                                 guidance_scale=7.5)
        target_image_ori = target_image_ori.cpu().permute(0, 2, 3, 1).numpy()

        # inference the synthesized image with MasaCtrl
        # hijack the attention module
        controller = MutualSelfAttentionControl(starting_step, starting_layer)
        regiter_attention_editor_diffusers(model, controller)

        # inference the synthesized image
        image_masactrl = model(prompts, latents=start_code, guidance_scale=7.5)
        image_masactrl = image_masactrl.cpu().permute(0, 2, 3, 1).numpy()

        return [image_masactrl[0], target_image_ori[0],
                image_masactrl[1]]  # source, fixed seed, masactrl


def create_demo_synthesis():
    with gr.Blocks() as demo:
        gr.Markdown("## **Input Settings**")
        with gr.Row():
            with gr.Column():
                source_prompt = gr.Textbox(
                    label="Source Prompt",
                    value='1boy, casual, outdoors, sitting',
                    interactive=True)
                target_prompt = gr.Textbox(
                    label="Target Prompt",
                    value='1boy, casual, outdoors, standing',
                    interactive=True)
                with gr.Row():
                    ddim_steps = gr.Slider(label="DDIM Steps",
                                           minimum=1,
                                           maximum=999,
                                           value=50,
                                           step=1)
                    starting_step = gr.Slider(label="Step of MasaCtrl",
                                              minimum=0,
                                              maximum=999,
                                              value=4,
                                              step=1)
                    starting_layer = gr.Slider(label="Layer of MasaCtrl",
                                               minimum=0,
                                               maximum=16,
                                               value=10,
                                               step=1)
                run_btn = gr.Button(label="Run")
            with gr.Column():
                appended_prompt = gr.Textbox(label="Appended Prompt", value='')
                negative_prompt = gr.Textbox(label="Negative Prompt", value='')
                with gr.Row():
                    image_resolution = gr.Slider(label="Image Resolution",
                                                 minimum=256,
                                                 maximum=768,
                                                 value=512,
                                                 step=64)
                    scale = gr.Slider(label="CFG Scale",
                                      minimum=0.1,
                                      maximum=30.0,
                                      value=7.5,
                                      step=0.1)
                    seed = gr.Slider(label="Seed",
                                     minimum=-1,
                                     maximum=2147483647,
                                     value=42,
                                     step=1)

        gr.Markdown("## **Output**")
        with gr.Row():
            image_source = gr.Image(label="Source Image")
            image_fixed = gr.Image(label="Image with Fixed Seed")
            image_masactrl = gr.Image(label="Image with MasaCtrl")

        inputs = [
            source_prompt, target_prompt, starting_step, starting_layer,
            image_resolution, ddim_steps, scale, seed, appended_prompt,
            negative_prompt
        ]
        run_btn.click(consistent_synthesis, inputs,
                      [image_source, image_fixed, image_masactrl])

        gr.Examples(
            [[
                "1boy, bishounen, casual, indoors, sitting, coffee shop, bokeh",
                "1boy, bishounen, casual, indoors, standing, coffee shop, bokeh",
                42
            ],
             [
                 "1boy, casual, outdoors, sitting",
                 "1boy, casual, outdoors, sitting, side view", 42
             ],
             [
                 "1boy, casual, outdoors, sitting",
                 "1boy, casual, outdoors, standing, clapping hands", 42
             ],
             [
                 "1boy, casual, outdoors, sitting",
                 "1boy, casual, outdoors, sitting, shows thumbs up", 42
             ],
             [
                 "1boy, casual, outdoors, sitting",
                 "1boy, casual, outdoors, sitting, with crossed arms", 42
             ],
             [
                 "1boy, casual, outdoors, sitting",
                 "1boy, casual, outdoors, sitting, raising hands", 42
             ]],
            [source_prompt, target_prompt, seed],
        )
    return demo


if __name__ == "__main__":
    demo_synthesis = create_demo_synthesis()
    demo_synthesis.launch()
```
gradio_app/images/corgi.jpg
ADDED
gradio_app/images/person.png
ADDED
gradio_app/real_image_editing_app.py
ADDED
@@ -0,0 +1,162 @@
|
```python
import os
import numpy as np
import gradio as gr
import torch
import torch.nn.functional as F
from diffusers import DDIMScheduler
from torchvision.io import read_image
from pytorch_lightning import seed_everything

from masactrl.diffuser_utils import MasaCtrlPipeline
from masactrl.masactrl_utils import (AttentionBase,
                                     regiter_attention_editor_diffusers)

from .app_utils import global_context

torch.set_grad_enabled(False)

# device = torch.device("cuda") if torch.cuda.is_available() else torch.device(
#     "cpu")

# model_path = "CompVis/stable-diffusion-v1-4"
# scheduler = DDIMScheduler(beta_start=0.00085,
#                           beta_end=0.012,
#                           beta_schedule="scaled_linear",
#                           clip_sample=False,
#                           set_alpha_to_one=False)
# model = MasaCtrlPipeline.from_pretrained(model_path,
#                                          scheduler=scheduler).to(device)


def load_image(image_path):
    device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
    image = read_image(image_path)
    image = image[:3].unsqueeze_(0).float() / 127.5 - 1.  # [-1, 1]
    image = F.interpolate(image, (512, 512))
    image = image.to(device)
    return image


def real_image_editing(source_image, target_prompt,
                       starting_step, starting_layer, ddim_steps, scale, seed,
                       appended_prompt, negative_prompt):
    from masactrl.masactrl import MutualSelfAttentionControl

    model = global_context["model"]
    device = global_context["device"]

    seed_everything(seed)

    with torch.no_grad():
        if appended_prompt is not None:
            target_prompt += appended_prompt
        ref_prompt = ""
        prompts = [ref_prompt, target_prompt]

        # invert the image into a noise map
        if isinstance(source_image, np.ndarray):
            source_image = torch.from_numpy(source_image).to(device) / 127.5 - 1.
            source_image = source_image.unsqueeze(0).permute(0, 3, 1, 2)
            source_image = F.interpolate(source_image, (512, 512))

        start_code, latents_list = model.invert(source_image,
                                                ref_prompt,
                                                guidance_scale=scale,
                                                num_inference_steps=ddim_steps,
                                                return_intermediates=True)
        start_code = start_code.expand(len(prompts), -1, -1, -1)

        # reconstruct the image from the inverted DDIM noise map
        editor = AttentionBase()
        regiter_attention_editor_diffusers(model, editor)
        image_fixed = model([target_prompt],
                            latents=start_code[-1:],
                            num_inference_steps=ddim_steps,
                            guidance_scale=scale)
        image_fixed = image_fixed.cpu().permute(0, 2, 3, 1).numpy()

        # inference the synthesized image with MasaCtrl
        # hijack the attention module
        controller = MutualSelfAttentionControl(starting_step, starting_layer)
        regiter_attention_editor_diffusers(model, controller)

        # inference the synthesized image
        image_masactrl = model(prompts,
                               latents=start_code,
                               guidance_scale=scale)
        image_masactrl = image_masactrl.cpu().permute(0, 2, 3, 1).numpy()

        return [
            image_masactrl[0],
            image_fixed[0],
            image_masactrl[1]
        ]  # source, fixed seed, masactrl


def create_demo_editing():
    with gr.Blocks() as demo:
        gr.Markdown("## **Input Settings**")
        with gr.Row():
            with gr.Column():
                source_image = gr.Image(label="Source Image", value=os.path.join(os.path.dirname(__file__), "images/corgi.jpg"), interactive=True)
                target_prompt = gr.Textbox(label="Target Prompt",
                                           value='A photo of a running corgi',
                                           interactive=True)
                with gr.Row():
                    ddim_steps = gr.Slider(label="DDIM Steps",
                                           minimum=1,
                                           maximum=999,
                                           value=50,
                                           step=1)
                    starting_step = gr.Slider(label="Step of MasaCtrl",
                                              minimum=0,
                                              maximum=999,
                                              value=4,
                                              step=1)
                    starting_layer = gr.Slider(label="Layer of MasaCtrl",
                                               minimum=0,
                                               maximum=16,
                                               value=10,
                                               step=1)
                run_btn = gr.Button(label="Run")
            with gr.Column():
                appended_prompt = gr.Textbox(label="Appended Prompt", value='')
                negative_prompt = gr.Textbox(label="Negative Prompt", value='')
                with gr.Row():
                    scale = gr.Slider(label="CFG Scale",
                                      minimum=0.1,
                                      maximum=30.0,
                                      value=7.5,
                                      step=0.1)
                    seed = gr.Slider(label="Seed",
                                     minimum=-1,
                                     maximum=2147483647,
                                     value=42,
                                     step=1)

        gr.Markdown("## **Output**")
        with gr.Row():
            image_recons = gr.Image(label="Source Image")
            image_fixed = gr.Image(label="Image with Fixed Seed")
            image_masactrl = gr.Image(label="Image with MasaCtrl")

        inputs = [
            source_image, target_prompt, starting_step, starting_layer, ddim_steps,
            scale, seed, appended_prompt, negative_prompt
        ]
        run_btn.click(real_image_editing, inputs,
                      [image_recons, image_fixed, image_masactrl])

        gr.Examples(
            [[os.path.join(os.path.dirname(__file__), "images/corgi.jpg"),
              "A photo of a running corgi"],
             [os.path.join(os.path.dirname(__file__), "images/person.png"),
              "A photo of a person, black t-shirt, raising hand"],
             ],
            [source_image, target_prompt]
        )
    return demo


if __name__ == "__main__":
    demo_editing = create_demo_editing()
    demo_editing.launch()
```
masactrl/__init__.py
ADDED
File without changes
masactrl/diffuser_utils.py
ADDED
@@ -0,0 +1,275 @@
1 |
+
"""
|
2 |
+
Util functions based on Diffuser framework.
|
3 |
+
"""
|
4 |
+
|
5 |
+
|
6 |
+
import os
|
7 |
+
import torch
|
8 |
+
import cv2
|
9 |
+
import numpy as np
|
10 |
+
|
11 |
+
import torch.nn.functional as F
|
12 |
+
from tqdm import tqdm
|
13 |
+
from PIL import Image
|
14 |
+
from torchvision.utils import save_image
|
15 |
+
from torchvision.io import read_image
|
16 |
+
|
17 |
+
from diffusers import StableDiffusionPipeline
|
18 |
+
|
19 |
+
from pytorch_lightning import seed_everything
|
20 |
+
|
21 |
+
|
22 |
+
class MasaCtrlPipeline(StableDiffusionPipeline):
|
23 |
+
|
24 |
+
def next_step(
|
25 |
+
self,
|
26 |
+
model_output: torch.FloatTensor,
|
27 |
+
timestep: int,
|
28 |
+
x: torch.FloatTensor,
|
29 |
+
eta=0.,
|
30 |
+
verbose=False
|
31 |
+
):
|
32 |
+
"""
|
33 |
+
Inverse sampling for DDIM Inversion
|
34 |
+
"""
|
35 |
+
if verbose:
|
36 |
+
print("timestep: ", timestep)
|
37 |
+
next_step = timestep
|
38 |
+
timestep = min(timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps, 999)
|
39 |
+
alpha_prod_t = self.scheduler.alphas_cumprod[timestep] if timestep >= 0 else self.scheduler.final_alpha_cumprod
|
40 |
+
alpha_prod_t_next = self.scheduler.alphas_cumprod[next_step]
|
41 |
+
beta_prod_t = 1 - alpha_prod_t
|
42 |
+
pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
|
43 |
+
pred_dir = (1 - alpha_prod_t_next)**0.5 * model_output
|
44 |
+
x_next = alpha_prod_t_next**0.5 * pred_x0 + pred_dir
|
45 |
+
return x_next, pred_x0
|
46 |
+
|
47 |
+
def step(
|
48 |
+
self,
|
49 |
+
model_output: torch.FloatTensor,
|
50 |
+
timestep: int,
|
51 |
+
x: torch.FloatTensor,
|
52 |
+
eta: float=0.0,
|
53 |
+
verbose=False,
|
54 |
+
):
|
55 |
+
"""
|
56 |
+
predict the sampe the next step in the denoise process.
|
57 |
+
"""
|
58 |
+
prev_timestep = timestep - self.scheduler.config.num_train_timesteps // self.scheduler.num_inference_steps
|
59 |
+
alpha_prod_t = self.scheduler.alphas_cumprod[timestep]
|
60 |
+
alpha_prod_t_prev = self.scheduler.alphas_cumprod[prev_timestep] if prev_timestep > 0 else self.scheduler.final_alpha_cumprod
|
61 |
+
beta_prod_t = 1 - alpha_prod_t
|
62 |
+
pred_x0 = (x - beta_prod_t**0.5 * model_output) / alpha_prod_t**0.5
|
63 |
+
pred_dir = (1 - alpha_prod_t_prev)**0.5 * model_output
|
64 |
+
x_prev = alpha_prod_t_prev**0.5 * pred_x0 + pred_dir
|
65 |
+
return x_prev, pred_x0
|
66 |
+
|
67 |
+
@torch.no_grad()
|
68 |
+
def image2latent(self, image):
|
69 |
+
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
70 |
+
if type(image) is Image:
|
71 |
+
image = np.array(image)
|
72 |
+
image = torch.from_numpy(image).float() / 127.5 - 1
|
73 |
+
image = image.permute(2, 0, 1).unsqueeze(0).to(DEVICE)
|
74 |
+
# input image density range [-1, 1]
|
75 |
+
latents = self.vae.encode(image)['latent_dist'].mean
|
76 |
+
latents = latents * 0.18215
|
77 |
+
return latents
|
78 |
+
|
79 |
+
@torch.no_grad()
|
80 |
+
def latent2image(self, latents, return_type='np'):
|
81 |
+
latents = 1 / 0.18215 * latents.detach()
|
82 |
+
image = self.vae.decode(latents)['sample']
|
83 |
+
if return_type == 'np':
|
84 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
85 |
+
image = image.cpu().permute(0, 2, 3, 1).numpy()[0]
|
86 |
+
image = (image * 255).astype(np.uint8)
|
87 |
+
elif return_type == "pt":
|
88 |
+
image = (image / 2 + 0.5).clamp(0, 1)
|
89 |
+
|
90 |
+
return image
|
91 |
+
|
92 |
+
def latent2image_grad(self, latents):
|
93 |
+
latents = 1 / 0.18215 * latents
|
94 |
+
image = self.vae.decode(latents)['sample']
|
95 |
+
|
96 |
+
return image # range [-1, 1]
|
97 |
+
|
98 |
+
@torch.no_grad()
|
99 |
+
def __call__(
|
100 |
+
self,
|
101 |
+
prompt,
|
102 |
+
batch_size=1,
|
103 |
+
height=512,
|
104 |
+
width=512,
|
105 |
+
num_inference_steps=50,
|
106 |
+
guidance_scale=7.5,
|
107 |
+
eta=0.0,
|
108 |
+
latents=None,
|
109 |
+
unconditioning=None,
|
110 |
+
neg_prompt=None,
|
111 |
+
ref_intermediate_latents=None,
|
112 |
+
return_intermediates=False,
|
113 |
+
**kwds):
|
114 |
+
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
115 |
+
if isinstance(prompt, list):
|
116 |
+
batch_size = len(prompt)
|
117 |
+
elif isinstance(prompt, str):
|
118 |
+
if batch_size > 1:
|
119 |
+
prompt = [prompt] * batch_size
|
120 |
+
|
121 |
+
# text embeddings
|
122 |
+
text_input = self.tokenizer(
|
123 |
+
prompt,
|
124 |
+
padding="max_length",
|
125 |
+
max_length=77,
|
126 |
+
return_tensors="pt"
|
127 |
+
)
|
128 |
+
|
129 |
+
text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
|
130 |
+
print("input text embeddings :", text_embeddings.shape)
|
131 |
+
if kwds.get("dir"):
|
132 |
+
dir = text_embeddings[-2] - text_embeddings[-1]
|
133 |
+
u, s, v = torch.pca_lowrank(dir.transpose(-1, -2), q=1, center=True)
|
134 |
+
text_embeddings[-1] = text_embeddings[-1] + kwds.get("dir") * v
|
135 |
+
print(u.shape)
|
136 |
+
print(v.shape)
|
137 |
+
|
138 |
+
# define initial latents
|
139 |
+
latents_shape = (batch_size, self.unet.in_channels, height//8, width//8)
|
140 |
+
if latents is None:
|
141 |
+
latents = torch.randn(latents_shape, device=DEVICE)
|
142 |
+
else:
|
143 |
+
assert latents.shape == latents_shape, f"The shape of input latent tensor {latents.shape} should equal the predefined one."
|
144 |
+
|
145 |
+
# unconditional embedding for classifier free guidance
|
146 |
+
if guidance_scale > 1.:
|
147 |
+
max_length = text_input.input_ids.shape[-1]
|
148 |
+
if neg_prompt:
|
149 |
+
uc_text = neg_prompt
|
150 |
+
else:
|
151 |
+
uc_text = ""
|
152 |
+
# uc_text = "ugly, tiling, poorly drawn hands, poorly drawn feet, body out of frame, cut off, low contrast, underexposed, distorted face"
|
153 |
+
unconditional_input = self.tokenizer(
|
154 |
+
[uc_text] * batch_size,
|
155 |
+
padding="max_length",
|
156 |
+
max_length=77,
|
157 |
+
return_tensors="pt"
|
158 |
+
)
|
159 |
+
# unconditional_input.input_ids = unconditional_input.input_ids[:, 1:]
|
160 |
+
unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
|
161 |
+
text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
|
162 |
+
|
163 |
+
print("latents shape: ", latents.shape)
|
164 |
+
# iterative sampling
|
165 |
+
self.scheduler.set_timesteps(num_inference_steps)
|
166 |
+
# print("Valid timesteps: ", reversed(self.scheduler.timesteps))
|
167 |
+
latents_list = [latents]
|
168 |
+
pred_x0_list = [latents]
|
169 |
+
for i, t in enumerate(tqdm(self.scheduler.timesteps, desc="DDIM Sampler")):
|
170 |
+
if ref_intermediate_latents is not None:
|
171 |
+
# note that the batch_size >= 2
|
172 |
+
latents_ref = ref_intermediate_latents[-1 - i]
|
173 |
+
_, latents_cur = latents.chunk(2)
|
174 |
+
latents = torch.cat([latents_ref, latents_cur])
|
175 |
+
|
176 |
+
if guidance_scale > 1.:
|
177 |
+
model_inputs = torch.cat([latents] * 2)
|
178 |
+
else:
|
179 |
+
model_inputs = latents
|
180 |
+
if unconditioning is not None and isinstance(unconditioning, list):
|
181 |
+
_, text_embeddings = text_embeddings.chunk(2)
|
182 |
+
text_embeddings = torch.cat([unconditioning[i].expand(*text_embeddings.shape), text_embeddings])
|
183 |
+
# predict the noise
|
184 |
+
noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
|
185 |
+
if guidance_scale > 1.:
|
186 |
+
noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
|
187 |
+
noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
|
188 |
+
# compute the previous noise sample x_t -> x_t-1
|
189 |
+
latents, pred_x0 = self.step(noise_pred, t, latents)
|
190 |
+
latents_list.append(latents)
|
191 |
+
pred_x0_list.append(pred_x0)
|
192 |
+
|
193 |
+
image = self.latent2image(latents, return_type="pt")
|
194 |
+
if return_intermediates:
|
195 |
+
pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
|
196 |
+
latents_list = [self.latent2image(img, return_type="pt") for img in latents_list]
|
197 |
+
return image, pred_x0_list, latents_list
|
198 |
+
return image
|
199 |
+
|
200 |
+
@torch.no_grad()
|
201 |
+
def invert(
|
202 |
+
self,
|
203 |
+
image: torch.Tensor,
|
204 |
+
prompt,
|
205 |
+
num_inference_steps=50,
|
206 |
+
guidance_scale=7.5,
|
207 |
+
eta=0.0,
|
208 |
+
return_intermediates=False,
|
209 |
+
**kwds):
|
210 |
+
"""
|
211 |
+
Invert a real image into a noise map with deterministic DDIM inversion.
|
212 |
+
"""
|
213 |
+
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
214 |
+
batch_size = image.shape[0]
|
215 |
+
if isinstance(prompt, list):
|
216 |
+
if batch_size == 1:
|
217 |
+
image = image.expand(len(prompt), -1, -1, -1)
|
218 |
+
elif isinstance(prompt, str):
|
219 |
+
if batch_size > 1:
|
220 |
+
prompt = [prompt] * batch_size
|
221 |
+
|
222 |
+
# text embeddings
|
223 |
+
text_input = self.tokenizer(
|
224 |
+
prompt,
|
225 |
+
padding="max_length",
|
226 |
+
max_length=77,
|
227 |
+
return_tensors="pt"
|
228 |
+
)
|
229 |
+
text_embeddings = self.text_encoder(text_input.input_ids.to(DEVICE))[0]
|
230 |
+
print("input text embeddings :", text_embeddings.shape)
|
231 |
+
# define initial latents
|
232 |
+
latents = self.image2latent(image)
|
233 |
+
start_latents = latents
|
234 |
+
# print(latents)
|
235 |
+
# exit()
|
236 |
+
# unconditional embedding for classifier free guidance
|
237 |
+
if guidance_scale > 1.:
|
238 |
+
max_length = text_input.input_ids.shape[-1]
|
239 |
+
unconditional_input = self.tokenizer(
|
240 |
+
[""] * batch_size,
|
241 |
+
padding="max_length",
|
242 |
+
max_length=77,
|
243 |
+
return_tensors="pt"
|
244 |
+
)
|
245 |
+
unconditional_embeddings = self.text_encoder(unconditional_input.input_ids.to(DEVICE))[0]
|
246 |
+
text_embeddings = torch.cat([unconditional_embeddings, text_embeddings], dim=0)
|
247 |
+
|
248 |
+
print("latents shape: ", latents.shape)
|
249 |
+
# iterative sampling
|
250 |
+
self.scheduler.set_timesteps(num_inference_steps)
|
251 |
+
print("Valid timesteps: ", reversed(self.scheduler.timesteps))
|
252 |
+
# print("attributes: ", self.scheduler.__dict__)
|
253 |
+
latents_list = [latents]
|
254 |
+
pred_x0_list = [latents]
|
255 |
+
for i, t in enumerate(tqdm(reversed(self.scheduler.timesteps), desc="DDIM Inversion")):
|
256 |
+
if guidance_scale > 1.:
|
257 |
+
model_inputs = torch.cat([latents] * 2)
|
258 |
+
else:
|
259 |
+
model_inputs = latents
|
260 |
+
|
261 |
+
# predict the noise
|
262 |
+
noise_pred = self.unet(model_inputs, t, encoder_hidden_states=text_embeddings).sample
|
263 |
+
if guidance_scale > 1.:
|
264 |
+
noise_pred_uncon, noise_pred_con = noise_pred.chunk(2, dim=0)
|
265 |
+
noise_pred = noise_pred_uncon + guidance_scale * (noise_pred_con - noise_pred_uncon)
|
266 |
+
# compute the previous noise sample x_t-1 -> x_t
|
267 |
+
latents, pred_x0 = self.next_step(noise_pred, t, latents)
|
268 |
+
latents_list.append(latents)
|
269 |
+
pred_x0_list.append(pred_x0)
|
270 |
+
|
271 |
+
if return_intermediates:
|
272 |
+
# return the intermediate latents during inversion
|
273 |
+
# pred_x0_list = [self.latent2image(img, return_type="pt") for img in pred_x0_list]
|
274 |
+
return latents, latents_list
|
275 |
+
return latents, start_latents
|
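The two entry points above (`invert` and `__call__`) are typically chained for real-image editing: invert the source image to a starting noise code, then resample a [source, target] prompt pair from that code. A minimal sketch, assuming the pipeline class defined in this file is exported as `MasaCtrlPipeline` and can be loaded with `from_pretrained`; the class name, checkpoint id, and prompts are illustrative assumptions, only `invert` and `__call__` shown in this file are relied on:

# Minimal sketch, assuming MasaCtrlPipeline is the (diffusers-based) pipeline class in this file.
import numpy as np
import torch
from PIL import Image
from diffusers import DDIMScheduler
from masactrl.diffuser_utils import MasaCtrlPipeline  # assumed class name

device = "cuda" if torch.cuda.is_available() else "cpu"
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012,
                          beta_schedule="scaled_linear",
                          clip_sample=False, set_alpha_to_one=False)
pipe = MasaCtrlPipeline.from_pretrained("CompVis/stable-diffusion-v1-4",
                                        scheduler=scheduler).to(device)

# Prepare the source image as a (1, 3, 512, 512) tensor in [-1, 1]
img = np.array(Image.open("gradio_app/images/corgi.jpg").convert("RGB").resize((512, 512)))
source_image = (torch.from_numpy(img).float() / 127.5 - 1).permute(2, 0, 1).unsqueeze(0).to(device)

source_prompt = "a photo of a sitting corgi"
target_prompt = "a photo of a running corgi"
start_code, inter_latents = pipe.invert(source_image, source_prompt,
                                        num_inference_steps=50,
                                        return_intermediates=True)

# Two branches: the first reconstructs the source, the second is resampled with the
# target prompt; ref_intermediate_latents keeps the source branch on its inversion path.
images = pipe([source_prompt, target_prompt],
              latents=start_code.expand(2, -1, -1, -1),
              num_inference_steps=50, guidance_scale=7.5,
              ref_intermediate_latents=inter_latents)

By itself this only reconstructs and regenerates the two branches independently; the attention controllers in masactrl/masactrl.py below supply the cross-branch interaction that turns the second branch into an edit of the first.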
masactrl/masactrl.py
ADDED
@@ -0,0 +1,334 @@
1 |
+
import os
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import torch.nn.functional as F
|
5 |
+
import numpy as np
|
6 |
+
|
7 |
+
from einops import rearrange
|
8 |
+
|
9 |
+
from .masactrl_utils import AttentionBase
|
10 |
+
|
11 |
+
from torchvision.utils import save_image
|
12 |
+
|
13 |
+
|
14 |
+
class MutualSelfAttentionControl(AttentionBase):
|
15 |
+
MODEL_TYPE = {
|
16 |
+
"SD": 16,
|
17 |
+
"SDXL": 70
|
18 |
+
}
|
19 |
+
|
20 |
+
def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, model_type="SD"):
|
21 |
+
"""
|
22 |
+
Mutual self-attention control for Stable-Diffusion model
|
23 |
+
Args:
|
24 |
+
start_step: the step to start mutual self-attention control
|
25 |
+
start_layer: the layer to start mutual self-attention control
|
26 |
+
layer_idx: list of the layers to apply mutual self-attention control
|
27 |
+
step_idx: list of the steps to apply mutual self-attention control
|
28 |
+
total_steps: the total number of steps
|
29 |
+
model_type: the model type, SD or SDXL
|
30 |
+
"""
|
31 |
+
super().__init__()
|
32 |
+
self.total_steps = total_steps
|
33 |
+
self.total_layers = self.MODEL_TYPE.get(model_type, 16)
|
34 |
+
self.start_step = start_step
|
35 |
+
self.start_layer = start_layer
|
36 |
+
self.layer_idx = layer_idx if layer_idx is not None else list(range(start_layer, self.total_layers))
|
37 |
+
self.step_idx = step_idx if step_idx is not None else list(range(start_step, total_steps))
|
38 |
+
print("MasaCtrl at denoising steps: ", self.step_idx)
|
39 |
+
print("MasaCtrl at U-Net layers: ", self.layer_idx)
|
40 |
+
|
41 |
+
def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
42 |
+
"""
|
43 |
+
Performing attention for a batch of queries, keys, and values
|
44 |
+
"""
|
45 |
+
b = q.shape[0] // num_heads
|
46 |
+
q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
|
47 |
+
k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
|
48 |
+
v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
|
49 |
+
|
50 |
+
sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
|
51 |
+
attn = sim.softmax(-1)
|
52 |
+
out = torch.einsum("h i j, h j d -> h i d", attn, v)
|
53 |
+
out = rearrange(out, "h (b n) d -> b n (h d)", b=b)
|
54 |
+
return out
|
55 |
+
|
56 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
57 |
+
"""
|
58 |
+
Attention forward function
|
59 |
+
"""
|
60 |
+
if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
|
61 |
+
return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
62 |
+
|
63 |
+
qu, qc = q.chunk(2)
|
64 |
+
ku, kc = k.chunk(2)
|
65 |
+
vu, vc = v.chunk(2)
|
66 |
+
attnu, attnc = attn.chunk(2)
|
67 |
+
|
68 |
+
out_u = self.attn_batch(qu, ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
|
69 |
+
out_c = self.attn_batch(qc, kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
|
70 |
+
out = torch.cat([out_u, out_c], dim=0)
|
71 |
+
|
72 |
+
return out
|
73 |
+
|
74 |
+
|
75 |
+
class MutualSelfAttentionControlUnion(MutualSelfAttentionControl):
|
76 |
+
def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, model_type="SD"):
|
77 |
+
"""
|
78 |
+
Mutual self-attention control for Stable-Diffusion model with unified source and target [K, V]
|
79 |
+
Args:
|
80 |
+
start_step: the step to start mutual self-attention control
|
81 |
+
start_layer: the layer to start mutual self-attention control
|
82 |
+
layer_idx: list of the layers to apply mutual self-attention control
|
83 |
+
step_idx: list of the steps to apply mutual self-attention control
|
84 |
+
total_steps: the total number of steps
|
85 |
+
model_type: the model type, SD or SDXL
|
86 |
+
"""
|
87 |
+
super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps, model_type)
|
88 |
+
|
89 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
90 |
+
"""
|
91 |
+
Attention forward function
|
92 |
+
"""
|
93 |
+
if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
|
94 |
+
return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
95 |
+
|
96 |
+
qu_s, qu_t, qc_s, qc_t = q.chunk(4)
|
97 |
+
ku_s, ku_t, kc_s, kc_t = k.chunk(4)
|
98 |
+
vu_s, vu_t, vc_s, vc_t = v.chunk(4)
|
99 |
+
attnu_s, attnu_t, attnc_s, attnc_t = attn.chunk(4)
|
100 |
+
|
101 |
+
# source image branch
|
102 |
+
out_u_s = super().forward(qu_s, ku_s, vu_s, sim, attnu_s, is_cross, place_in_unet, num_heads, **kwargs)
|
103 |
+
out_c_s = super().forward(qc_s, kc_s, vc_s, sim, attnc_s, is_cross, place_in_unet, num_heads, **kwargs)
|
104 |
+
|
105 |
+
# target image branch, concatenating source and target [K, V]
|
106 |
+
out_u_t = self.attn_batch(qu_t, torch.cat([ku_s, ku_t]), torch.cat([vu_s, vu_t]), sim[:num_heads], attnu_t, is_cross, place_in_unet, num_heads, **kwargs)
|
107 |
+
out_c_t = self.attn_batch(qc_t, torch.cat([kc_s, kc_t]), torch.cat([vc_s, vc_t]), sim[:num_heads], attnc_t, is_cross, place_in_unet, num_heads, **kwargs)
|
108 |
+
|
109 |
+
out = torch.cat([out_u_s, out_u_t, out_c_s, out_c_t], dim=0)
|
110 |
+
|
111 |
+
return out
|
112 |
+
|
113 |
+
|
114 |
+
class MutualSelfAttentionControlMask(MutualSelfAttentionControl):
|
115 |
+
def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, mask_s=None, mask_t=None, mask_save_dir=None, model_type="SD"):
|
116 |
+
"""
|
117 |
+
Mask-guided MasaCtrl to alleviate the problem of fore- and background confusion
|
118 |
+
Args:
|
119 |
+
start_step: the step to start mutual self-attention control
|
120 |
+
start_layer: the layer to start mutual self-attention control
|
121 |
+
layer_idx: list of the layers to apply mutual self-attention control
|
122 |
+
step_idx: list of the steps to apply mutual self-attention control
|
123 |
+
total_steps: the total number of steps
|
124 |
+
mask_s: source mask with shape (h, w)
|
125 |
+
mask_t: target mask with same shape as source mask
|
126 |
+
mask_save_dir: the path to save the mask image
|
127 |
+
model_type: the model type, SD or SDXL
|
128 |
+
"""
|
129 |
+
super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps, model_type)
|
130 |
+
self.mask_s = mask_s # source mask with shape (h, w)
|
131 |
+
self.mask_t = mask_t # target mask with same shape as source mask
|
132 |
+
print("Using mask-guided MasaCtrl")
|
133 |
+
if mask_save_dir is not None:
|
134 |
+
os.makedirs(mask_save_dir, exist_ok=True)
|
135 |
+
save_image(self.mask_s.unsqueeze(0).unsqueeze(0), os.path.join(mask_save_dir, "mask_s.png"))
|
136 |
+
save_image(self.mask_t.unsqueeze(0).unsqueeze(0), os.path.join(mask_save_dir, "mask_t.png"))
|
137 |
+
|
138 |
+
def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
139 |
+
B = q.shape[0] // num_heads
|
140 |
+
H = W = int(np.sqrt(q.shape[1]))
|
141 |
+
q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
|
142 |
+
k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
|
143 |
+
v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
|
144 |
+
|
145 |
+
sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
|
146 |
+
if kwargs.get("is_mask_attn") and self.mask_s is not None:
|
147 |
+
print("masked attention")
|
148 |
+
mask = self.mask_s.unsqueeze(0).unsqueeze(0)
|
149 |
+
mask = F.interpolate(mask, (H, W)).flatten(0).unsqueeze(0)
|
150 |
+
mask = mask.flatten()
|
151 |
+
# background
|
152 |
+
sim_bg = sim + mask.masked_fill(mask == 1, torch.finfo(sim.dtype).min)
|
153 |
+
# object
|
154 |
+
sim_fg = sim + mask.masked_fill(mask == 0, torch.finfo(sim.dtype).min)
|
155 |
+
sim = torch.cat([sim_fg, sim_bg], dim=0)
|
156 |
+
attn = sim.softmax(-1)
|
157 |
+
if len(attn) == 2 * len(v):
|
158 |
+
v = torch.cat([v] * 2)
|
159 |
+
out = torch.einsum("h i j, h j d -> h i d", attn, v)
|
160 |
+
out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
|
161 |
+
return out
|
162 |
+
|
163 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
164 |
+
"""
|
165 |
+
Attention forward function
|
166 |
+
"""
|
167 |
+
if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
|
168 |
+
return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
169 |
+
|
170 |
+
B = q.shape[0] // num_heads // 2
|
171 |
+
H = W = int(np.sqrt(q.shape[1]))
|
172 |
+
qu, qc = q.chunk(2)
|
173 |
+
ku, kc = k.chunk(2)
|
174 |
+
vu, vc = v.chunk(2)
|
175 |
+
attnu, attnc = attn.chunk(2)
|
176 |
+
|
177 |
+
out_u_source = self.attn_batch(qu[:num_heads], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
|
178 |
+
out_c_source = self.attn_batch(qc[:num_heads], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
|
179 |
+
|
180 |
+
out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, is_mask_attn=True, **kwargs)
|
181 |
+
out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, is_mask_attn=True, **kwargs)
|
182 |
+
|
183 |
+
if self.mask_s is not None and self.mask_t is not None:
|
184 |
+
out_u_target_fg, out_u_target_bg = out_u_target.chunk(2, 0)
|
185 |
+
out_c_target_fg, out_c_target_bg = out_c_target.chunk(2, 0)
|
186 |
+
|
187 |
+
mask = F.interpolate(self.mask_t.unsqueeze(0).unsqueeze(0), (H, W))
|
188 |
+
mask = mask.reshape(-1, 1) # (hw, 1)
|
189 |
+
out_u_target = out_u_target_fg * mask + out_u_target_bg * (1 - mask)
|
190 |
+
out_c_target = out_c_target_fg * mask + out_c_target_bg * (1 - mask)
|
191 |
+
|
192 |
+
out = torch.cat([out_u_source, out_u_target, out_c_source, out_c_target], dim=0)
|
193 |
+
return out
|
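For completeness, a small sketch of constructing the mask-guided controller above; `pipe` and the registration helper (defined in masactrl/masactrl_utils.py) are the same assumptions as in the other examples, and the mask contents are hypothetical placeholders (float tensors of shape (h, w) with values in {0, 1}):

# Minimal sketch; mask regions below are hypothetical placeholders.
import torch
from masactrl.masactrl import MutualSelfAttentionControlMask
from masactrl.masactrl_utils import regiter_attention_editor_diffusers

device = "cuda" if torch.cuda.is_available() else "cpu"
mask_s = torch.zeros(64, 64, device=device)
mask_s[16:48, 16:48] = 1.0   # hypothetical source foreground region
mask_t = torch.zeros(64, 64, device=device)
mask_t[16:48, 24:56] = 1.0   # hypothetical target foreground region

editor = MutualSelfAttentionControlMask(start_step=4, start_layer=10,
                                        mask_s=mask_s, mask_t=mask_t,
                                        mask_save_dir="./workdir/masks")
regiter_attention_editor_diffusers(pipe, editor)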
194 |
+
|
195 |
+
|
196 |
+
class MutualSelfAttentionControlMaskAuto(MutualSelfAttentionControl):
|
197 |
+
def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_steps=50, thres=0.1, ref_token_idx=[1], cur_token_idx=[1], mask_save_dir=None, model_type="SD"):
|
198 |
+
"""
|
199 |
+
MasaCtrl with mask auto generation from cross-attention map
|
200 |
+
Args:
|
201 |
+
start_step: the step to start mutual self-attention control
|
202 |
+
start_layer: the layer to start mutual self-attention control
|
203 |
+
layer_idx: list of the layers to apply mutual self-attention control
|
204 |
+
step_idx: list of the steps to apply mutual self-attention control
|
205 |
+
total_steps: the total number of steps
|
206 |
+
thres: the threshold for mask binarization
|
207 |
+
ref_token_idx: the token index list used to aggregate the source (reference) cross-attention maps
|
208 |
+
cur_token_idx: the token index list used to aggregate the target (current) cross-attention maps
|
209 |
+
mask_save_dir: the path to save the mask image
|
210 |
+
"""
|
211 |
+
super().__init__(start_step, start_layer, layer_idx, step_idx, total_steps, model_type)
|
212 |
+
print("Using MutualSelfAttentionControlMaskAuto")
|
213 |
+
self.thres = thres
|
214 |
+
self.ref_token_idx = ref_token_idx
|
215 |
+
self.cur_token_idx = cur_token_idx
|
216 |
+
|
217 |
+
self.self_attns = []
|
218 |
+
self.cross_attns = []
|
219 |
+
|
220 |
+
self.cross_attns_mask = None
|
221 |
+
self.self_attns_mask = None
|
222 |
+
|
223 |
+
self.mask_save_dir = mask_save_dir
|
224 |
+
if self.mask_save_dir is not None:
|
225 |
+
os.makedirs(self.mask_save_dir, exist_ok=True)
|
226 |
+
|
227 |
+
def after_step(self):
|
228 |
+
self.self_attns = []
|
229 |
+
self.cross_attns = []
|
230 |
+
|
231 |
+
def attn_batch(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
232 |
+
"""
|
233 |
+
Performing attention for a batch of queries, keys, and values
|
234 |
+
"""
|
235 |
+
B = q.shape[0] // num_heads
|
236 |
+
H = W = int(np.sqrt(q.shape[1]))
|
237 |
+
q = rearrange(q, "(b h) n d -> h (b n) d", h=num_heads)
|
238 |
+
k = rearrange(k, "(b h) n d -> h (b n) d", h=num_heads)
|
239 |
+
v = rearrange(v, "(b h) n d -> h (b n) d", h=num_heads)
|
240 |
+
|
241 |
+
sim = torch.einsum("h i d, h j d -> h i j", q, k) * kwargs.get("scale")
|
242 |
+
if self.self_attns_mask is not None:
|
243 |
+
# binarize the mask
|
244 |
+
mask = self.self_attns_mask
|
245 |
+
thres = self.thres
|
246 |
+
mask[mask >= thres] = 1
|
247 |
+
mask[mask < thres] = 0
|
248 |
+
sim_fg = sim + mask.masked_fill(mask == 0, torch.finfo(sim.dtype).min)
|
249 |
+
sim_bg = sim + mask.masked_fill(mask == 1, torch.finfo(sim.dtype).min)
|
250 |
+
sim = torch.cat([sim_fg, sim_bg])
|
251 |
+
|
252 |
+
attn = sim.softmax(-1)
|
253 |
+
|
254 |
+
if len(attn) == 2 * len(v):
|
255 |
+
v = torch.cat([v] * 2)
|
256 |
+
out = torch.einsum("h i j, h j d -> h i d", attn, v)
|
257 |
+
out = rearrange(out, "(h1 h) (b n) d -> (h1 b) n (h d)", b=B, h=num_heads)
|
258 |
+
return out
|
259 |
+
|
260 |
+
def aggregate_cross_attn_map(self, idx):
|
261 |
+
attn_map = torch.stack(self.cross_attns, dim=1).mean(1) # (B, N, dim)
|
262 |
+
B = attn_map.shape[0]
|
263 |
+
res = int(np.sqrt(attn_map.shape[-2]))
|
264 |
+
attn_map = attn_map.reshape(-1, res, res, attn_map.shape[-1])
|
265 |
+
image = attn_map[..., idx]
|
266 |
+
if isinstance(idx, list):
|
267 |
+
image = image.sum(-1)
|
268 |
+
image_min = image.min(dim=1, keepdim=True)[0].min(dim=2, keepdim=True)[0]
|
269 |
+
image_max = image.max(dim=1, keepdim=True)[0].max(dim=2, keepdim=True)[0]
|
270 |
+
image = (image - image_min) / (image_max - image_min)
|
271 |
+
return image
|
272 |
+
|
273 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
274 |
+
"""
|
275 |
+
Attention forward function
|
276 |
+
"""
|
277 |
+
if is_cross:
|
278 |
+
# save cross attention map with res 16 * 16
|
279 |
+
if attn.shape[1] == 16 * 16:
|
280 |
+
self.cross_attns.append(attn.reshape(-1, num_heads, *attn.shape[-2:]).mean(1))
|
281 |
+
|
282 |
+
if is_cross or self.cur_step not in self.step_idx or self.cur_att_layer // 2 not in self.layer_idx:
|
283 |
+
return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
284 |
+
|
285 |
+
B = q.shape[0] // num_heads // 2
|
286 |
+
H = W = int(np.sqrt(q.shape[1]))
|
287 |
+
qu, qc = q.chunk(2)
|
288 |
+
ku, kc = k.chunk(2)
|
289 |
+
vu, vc = v.chunk(2)
|
290 |
+
attnu, attnc = attn.chunk(2)
|
291 |
+
|
292 |
+
out_u_source = self.attn_batch(qu[:num_heads], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
|
293 |
+
out_c_source = self.attn_batch(qc[:num_heads], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
|
294 |
+
|
295 |
+
if len(self.cross_attns) == 0:
|
296 |
+
self.self_attns_mask = None
|
297 |
+
out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
|
298 |
+
out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
|
299 |
+
else:
|
300 |
+
mask = self.aggregate_cross_attn_map(idx=self.ref_token_idx) # (2, H, W)
|
301 |
+
mask_source = mask[-2] # (H, W)
|
302 |
+
res = int(np.sqrt(q.shape[1]))
|
303 |
+
self.self_attns_mask = F.interpolate(mask_source.unsqueeze(0).unsqueeze(0), (res, res)).flatten()
|
304 |
+
if self.mask_save_dir is not None:
|
305 |
+
H = W = int(np.sqrt(self.self_attns_mask.shape[0]))
|
306 |
+
mask_image = self.self_attns_mask.reshape(H, W).unsqueeze(0)
|
307 |
+
save_image(mask_image, os.path.join(self.mask_save_dir, f"mask_s_{self.cur_step}_{self.cur_att_layer}.png"))
|
308 |
+
out_u_target = self.attn_batch(qu[-num_heads:], ku[:num_heads], vu[:num_heads], sim[:num_heads], attnu, is_cross, place_in_unet, num_heads, **kwargs)
|
309 |
+
out_c_target = self.attn_batch(qc[-num_heads:], kc[:num_heads], vc[:num_heads], sim[:num_heads], attnc, is_cross, place_in_unet, num_heads, **kwargs)
|
310 |
+
|
311 |
+
if self.self_attns_mask is not None:
|
312 |
+
mask = self.aggregate_cross_attn_map(idx=self.cur_token_idx) # (2, H, W)
|
313 |
+
mask_target = mask[-1] # (H, W)
|
314 |
+
res = int(np.sqrt(q.shape[1]))
|
315 |
+
spatial_mask = F.interpolate(mask_target.unsqueeze(0).unsqueeze(0), (res, res)).reshape(-1, 1)
|
316 |
+
if self.mask_save_dir is not None:
|
317 |
+
H = W = int(np.sqrt(spatial_mask.shape[0]))
|
318 |
+
mask_image = spatial_mask.reshape(H, W).unsqueeze(0)
|
319 |
+
save_image(mask_image, os.path.join(self.mask_save_dir, f"mask_t_{self.cur_step}_{self.cur_att_layer}.png"))
|
320 |
+
# binarize the mask
|
321 |
+
thres = self.thres
|
322 |
+
spatial_mask[spatial_mask >= thres] = 1
|
323 |
+
spatial_mask[spatial_mask < thres] = 0
|
324 |
+
out_u_target_fg, out_u_target_bg = out_u_target.chunk(2)
|
325 |
+
out_c_target_fg, out_c_target_bg = out_c_target.chunk(2)
|
326 |
+
|
327 |
+
out_u_target = out_u_target_fg * spatial_mask + out_u_target_bg * (1 - spatial_mask)
|
328 |
+
out_c_target = out_c_target_fg * spatial_mask + out_c_target_bg * (1 - spatial_mask)
|
329 |
+
|
330 |
+
# reset the self-attention mask to None
|
331 |
+
self.self_attns_mask = None
|
332 |
+
|
333 |
+
out = torch.cat([out_u_source, out_u_target, out_c_source, out_c_target], dim=0)
|
334 |
+
return out
|
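Wiring the basic controller into a pipeline looks like the following sketch, continuing the example above and assuming `pipe` is the pipeline from masactrl/diffuser_utils.py (any diffusers pipeline exposing `.unet` works the same way) with prompts passed as a [source, target] pair:

# Minimal sketch, assuming `pipe` exposes a .unet attribute.
from masactrl.masactrl import MutualSelfAttentionControl
from masactrl.masactrl_utils import regiter_attention_editor_diffusers

# Apply mutual self-attention from denoising step 4 onward, on U-Net layers 10..15 (SD defaults)
editor = MutualSelfAttentionControl(start_step=4, start_layer=10, total_steps=50, model_type="SD")
regiter_attention_editor_diffusers(pipe, editor)

# The first prompt drives the source branch; the second (target) branch borrows its K/V
prompts = ["a photo of a sitting corgi", "a photo of a running corgi"]
images = pipe(prompts, num_inference_steps=50, guidance_scale=7.5)

The union, mask-guided, and auto-mask variants are constructed the same way; only the extra constructor arguments (masks, threshold, token indices) differ.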
masactrl/masactrl_processor.py
ADDED
@@ -0,0 +1,259 @@
1 |
+
from importlib import import_module
|
2 |
+
from typing import Callable, Optional, Union
|
3 |
+
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from torch import nn
|
7 |
+
|
8 |
+
from diffusers.utils import deprecate, logging
|
9 |
+
from diffusers.utils.import_utils import is_xformers_available
|
10 |
+
from diffusers.models.attention import Attention
|
11 |
+
|
12 |
+
|
13 |
+
def register_attention_processor(
|
14 |
+
model: Optional[nn.Module] = None,
|
15 |
+
processor_type: str = "MasaCtrlProcessor",
|
16 |
+
**attn_args,
|
17 |
+
):
|
18 |
+
"""
|
19 |
+
Args:
|
20 |
+
model: a unet model or a list of unet models
|
21 |
+
processor_type: the type of the processor
|
22 |
+
"""
|
23 |
+
if not isinstance(model, (list, tuple)):
|
24 |
+
model = [model]
|
25 |
+
|
26 |
+
if processor_type == "MasaCtrlProcessor":
|
27 |
+
processor = MasaCtrlProcessor(**attn_args)
|
28 |
+
else:
|
29 |
+
processor = AttnProcessor()
|
30 |
+
|
31 |
+
for m in model:
|
32 |
+
m.set_attn_processor(processor)
|
33 |
+
print(f"Model {m.__class__.__name__} is registered attention processor: {processor_type}")
|
34 |
+
|
35 |
+
|
36 |
+
class AttnProcessor:
|
37 |
+
r"""
|
38 |
+
Default processor for performing attention-related computations.
|
39 |
+
"""
|
40 |
+
|
41 |
+
def __call__(
|
42 |
+
self,
|
43 |
+
attn: Attention,
|
44 |
+
hidden_states: torch.Tensor,
|
45 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
46 |
+
attention_mask: Optional[torch.Tensor] = None,
|
47 |
+
temb: Optional[torch.Tensor] = None,
|
48 |
+
*args,
|
49 |
+
**kwargs,
|
50 |
+
) -> torch.Tensor:
|
51 |
+
if len(args) > 0 or kwargs.get("scale", None) is not None:
|
52 |
+
deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
|
53 |
+
deprecate("scale", "1.0.0", deprecation_message)
|
54 |
+
|
55 |
+
residual = hidden_states
|
56 |
+
|
57 |
+
if attn.spatial_norm is not None:
|
58 |
+
hidden_states = attn.spatial_norm(hidden_states, temb)
|
59 |
+
|
60 |
+
input_ndim = hidden_states.ndim
|
61 |
+
|
62 |
+
if input_ndim == 4:
|
63 |
+
batch_size, channel, height, width = hidden_states.shape
|
64 |
+
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
65 |
+
|
66 |
+
batch_size, sequence_length, _ = (
|
67 |
+
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
68 |
+
)
|
69 |
+
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
70 |
+
|
71 |
+
if attn.group_norm is not None:
|
72 |
+
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
73 |
+
|
74 |
+
query = attn.to_q(hidden_states)
|
75 |
+
|
76 |
+
if encoder_hidden_states is None:
|
77 |
+
encoder_hidden_states = hidden_states
|
78 |
+
elif attn.norm_cross:
|
79 |
+
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
80 |
+
|
81 |
+
key = attn.to_k(encoder_hidden_states)
|
82 |
+
value = attn.to_v(encoder_hidden_states)
|
83 |
+
|
84 |
+
query = attn.head_to_batch_dim(query)
|
85 |
+
key = attn.head_to_batch_dim(key)
|
86 |
+
value = attn.head_to_batch_dim(value)
|
87 |
+
|
88 |
+
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
89 |
+
hidden_states = torch.bmm(attention_probs, value)
|
90 |
+
hidden_states = attn.batch_to_head_dim(hidden_states)
|
91 |
+
|
92 |
+
# linear proj
|
93 |
+
hidden_states = attn.to_out[0](hidden_states)
|
94 |
+
# dropout
|
95 |
+
hidden_states = attn.to_out[1](hidden_states)
|
96 |
+
|
97 |
+
if input_ndim == 4:
|
98 |
+
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
99 |
+
|
100 |
+
if attn.residual_connection:
|
101 |
+
hidden_states = hidden_states + residual
|
102 |
+
|
103 |
+
hidden_states = hidden_states / attn.rescale_output_factor
|
104 |
+
|
105 |
+
return hidden_states
|
106 |
+
|
107 |
+
|
108 |
+
class MasaCtrlProcessor(nn.Module):
|
109 |
+
"""
|
110 |
+
Mutual Self-attention Processor for diffusers library.
|
111 |
+
Note that all attention layers should be registered with the same processor instance.
|
112 |
+
"""
|
113 |
+
MODEL_TYPE = {
|
114 |
+
"SD": 16,
|
115 |
+
"SDXL": 70
|
116 |
+
}
|
117 |
+
def __init__(self, start_step=4, start_layer=10, layer_idx=None, step_idx=None, total_layers=32, total_steps=50, model_type="SD"):
|
118 |
+
"""
|
119 |
+
Mutual self-attention control for Stable-Diffusion model
|
120 |
+
Args:
|
121 |
+
start_step: the step to start mutual self-attention control
|
122 |
+
start_layer: the layer to start mutual self-attention control
|
123 |
+
layer_idx: list of the layers to apply mutual self-attention control
|
124 |
+
step_idx: list of the steps to apply mutual self-attention control
|
125 |
+
total_steps: the total number of steps; must match the number of denoising steps used by the scheduler
|
126 |
+
model_type: the model type, SD or SDXL
|
127 |
+
"""
|
128 |
+
super().__init__()
|
129 |
+
self.total_steps = total_steps
|
130 |
+
self.total_layers = self.MODEL_TYPE.get(model_type, 16)
|
131 |
+
self.start_step = start_step
|
132 |
+
self.start_layer = start_layer
|
133 |
+
self.layer_idx = layer_idx if layer_idx is not None else list(range(start_layer, self.total_layers))
|
134 |
+
self.step_idx = step_idx if step_idx is not None else list(range(start_step, total_steps))
|
135 |
+
print("MasaCtrl at denoising steps: ", self.step_idx)
|
136 |
+
print("MasaCtrl at U-Net layers: ", self.layer_idx)
|
137 |
+
|
138 |
+
self.cur_step = 0
|
139 |
+
self.cur_att_layer = 0
|
140 |
+
self.num_attn_layers = total_layers
|
141 |
+
|
142 |
+
def after_step(self):
|
143 |
+
pass
|
144 |
+
|
145 |
+
def __call__(
|
146 |
+
self,
|
147 |
+
attn: Attention,
|
148 |
+
hidden_states: torch.FloatTensor,
|
149 |
+
encoder_hidden_states: Optional[torch.FloatTensor] = None,
|
150 |
+
attention_mask: Optional[torch.FloatTensor] = None,
|
151 |
+
temb: Optional[torch.FloatTensor] = None,
|
152 |
+
scale: float = 1.0,
|
153 |
+
):
|
154 |
+
out = self.attn_forward(
|
155 |
+
attn,
|
156 |
+
hidden_states,
|
157 |
+
encoder_hidden_states,
|
158 |
+
attention_mask,
|
159 |
+
temb,
|
160 |
+
scale,
|
161 |
+
)
|
162 |
+
self.cur_att_layer += 1
|
163 |
+
if self.cur_att_layer == self.num_attn_layers:
|
164 |
+
self.cur_att_layer = 0
|
165 |
+
self.cur_step += 1
|
166 |
+
self.cur_step %= self.total_steps
|
167 |
+
# after step
|
168 |
+
self.after_step()
|
169 |
+
return out
|
170 |
+
|
171 |
+
def masactrl_forward(
|
172 |
+
self,
|
173 |
+
query,
|
174 |
+
key,
|
175 |
+
value,
|
176 |
+
):
|
177 |
+
"""
|
178 |
+
Rearrange the key and value for mutual self-attention control
|
179 |
+
"""
|
180 |
+
ku_src, ku_tgt, kc_src, kc_tgt = key.chunk(4)
|
181 |
+
vu_src, vu_tgt, vc_src, vc_tgt = value.chunk(4)
|
182 |
+
|
183 |
+
k_rearranged = torch.cat([ku_src, ku_src, kc_src, kc_src])
|
184 |
+
v_rearranged = torch.cat([vu_src, vu_src, vc_src, vc_src])
|
185 |
+
|
186 |
+
return query, k_rearranged, v_rearranged
|
187 |
+
|
188 |
+
def attn_forward(
|
189 |
+
self,
|
190 |
+
attn: Attention,
|
191 |
+
hidden_states: torch.Tensor,
|
192 |
+
encoder_hidden_states: Optional[torch.Tensor] = None,
|
193 |
+
attention_mask: Optional[torch.Tensor] = None,
|
194 |
+
temb: Optional[torch.Tensor] = None,
|
195 |
+
*args,
|
196 |
+
**kwargs,
|
197 |
+
):
|
198 |
+
cur_transformer_layer = self.cur_att_layer // 2
|
199 |
+
residual = hidden_states
|
200 |
+
|
201 |
+
is_cross = True if encoder_hidden_states is not None else False
|
202 |
+
|
203 |
+
if len(args) > 0 or kwargs.get("scale", None) is not None:
|
204 |
+
deprecation_message = "The `scale` argument is deprecated and will be ignored. Please remove it, as passing it will raise an error in the future. `scale` should directly be passed while calling the underlying pipeline component i.e., via `cross_attention_kwargs`."
|
205 |
+
deprecate("scale", "1.0.0", deprecation_message)
|
206 |
+
|
207 |
+
if attn.spatial_norm is not None:
|
208 |
+
hidden_states = attn.spatial_norm(hidden_states, temb)
|
209 |
+
|
210 |
+
input_ndim = hidden_states.ndim
|
211 |
+
|
212 |
+
if input_ndim == 4:
|
213 |
+
batch_size, channel, height, width = hidden_states.shape
|
214 |
+
hidden_states = hidden_states.view(batch_size, channel, height * width).transpose(1, 2)
|
215 |
+
|
216 |
+
batch_size, sequence_length, _ = (
|
217 |
+
hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
|
218 |
+
)
|
219 |
+
attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
|
220 |
+
|
221 |
+
if attn.group_norm is not None:
|
222 |
+
hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
|
223 |
+
|
224 |
+
query = attn.to_q(hidden_states, *args)
|
225 |
+
|
226 |
+
if encoder_hidden_states is None:
|
227 |
+
encoder_hidden_states = hidden_states
|
228 |
+
elif attn.norm_cross:
|
229 |
+
encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
|
230 |
+
|
231 |
+
key = attn.to_k(encoder_hidden_states, *args)
|
232 |
+
value = attn.to_v(encoder_hidden_states, *args)
|
233 |
+
|
234 |
+
# mutual self-attention control
|
235 |
+
if not is_cross and self.cur_step in self.step_idx and cur_transformer_layer in self.layer_idx:
|
236 |
+
query, key, value = self.masactrl_forward(query, key, value)
|
237 |
+
|
238 |
+
query = attn.head_to_batch_dim(query)
|
239 |
+
key = attn.head_to_batch_dim(key)
|
240 |
+
value = attn.head_to_batch_dim(value)
|
241 |
+
|
242 |
+
attention_probs = attn.get_attention_scores(query, key, attention_mask)
|
243 |
+
hidden_states = torch.bmm(attention_probs, value)
|
244 |
+
hidden_states = attn.batch_to_head_dim(hidden_states)
|
245 |
+
|
246 |
+
# linear proj
|
247 |
+
hidden_states = attn.to_out[0](hidden_states, *args)
|
248 |
+
# dropout
|
249 |
+
hidden_states = attn.to_out[1](hidden_states)
|
250 |
+
|
251 |
+
if input_ndim == 4:
|
252 |
+
hidden_states = hidden_states.transpose(-1, -2).reshape(batch_size, channel, height, width)
|
253 |
+
|
254 |
+
if attn.residual_connection:
|
255 |
+
hidden_states = hidden_states + residual
|
256 |
+
|
257 |
+
hidden_states = hidden_states / attn.rescale_output_factor
|
258 |
+
|
259 |
+
return hidden_states
|
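The processor-based variant above plugs into the standard diffusers attention-processor mechanism instead of monkey-patching `forward`. A minimal usage sketch, assuming `pipe` is a Stable Diffusion pipeline whose U-Net exposes `set_attn_processor`; the prompts and settings are illustrative:

# Minimal sketch, assuming `pipe.unet` is a diffusers UNet2DConditionModel.
from masactrl.masactrl_processor import register_attention_processor

# Every attention layer of the U-Net gets the same MasaCtrlProcessor instance.
# masactrl_forward chunks K/V into 4, so it expects classifier-free guidance with a
# [source, target] prompt pair, i.e. an effective batch of 4 inside the U-Net.
register_attention_processor(pipe.unet, processor_type="MasaCtrlProcessor",
                             start_step=4, start_layer=10, total_steps=50, model_type="SD")

images = pipe(["a photo of a sitting corgi", "a photo of a running corgi"],
              num_inference_steps=50, guidance_scale=7.5)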
masactrl/masactrl_utils.py
ADDED
@@ -0,0 +1,212 @@
1 |
+
import os
|
2 |
+
import cv2
|
3 |
+
import numpy as np
|
4 |
+
import torch
|
5 |
+
import torch.nn as nn
|
6 |
+
import torch.nn.functional as F
|
7 |
+
|
8 |
+
from typing import Optional, Union, Tuple, List, Callable, Dict
|
9 |
+
|
10 |
+
from torchvision.utils import save_image
|
11 |
+
from einops import rearrange, repeat
|
12 |
+
|
13 |
+
|
14 |
+
class AttentionBase:
|
15 |
+
def __init__(self):
|
16 |
+
self.cur_step = 0
|
17 |
+
self.num_att_layers = -1
|
18 |
+
self.cur_att_layer = 0
|
19 |
+
|
20 |
+
def after_step(self):
|
21 |
+
pass
|
22 |
+
|
23 |
+
def __call__(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
24 |
+
out = self.forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
25 |
+
self.cur_att_layer += 1
|
26 |
+
if self.cur_att_layer == self.num_att_layers:
|
27 |
+
self.cur_att_layer = 0
|
28 |
+
self.cur_step += 1
|
29 |
+
# after step
|
30 |
+
self.after_step()
|
31 |
+
return out
|
32 |
+
|
33 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
34 |
+
out = torch.einsum('b i j, b j d -> b i d', attn, v)
|
35 |
+
out = rearrange(out, '(b h) n d -> b n (h d)', h=num_heads)
|
36 |
+
return out
|
37 |
+
|
38 |
+
def reset(self):
|
39 |
+
self.cur_step = 0
|
40 |
+
self.cur_att_layer = 0
|
41 |
+
|
42 |
+
|
43 |
+
class AttentionStore(AttentionBase):
|
44 |
+
def __init__(self, res=[32], min_step=0, max_step=1000):
|
45 |
+
super().__init__()
|
46 |
+
self.res = res
|
47 |
+
self.min_step = min_step
|
48 |
+
self.max_step = max_step
|
49 |
+
self.valid_steps = 0
|
50 |
+
|
51 |
+
self.self_attns = [] # store the all attns
|
52 |
+
self.cross_attns = []
|
53 |
+
|
54 |
+
self.self_attns_step = [] # store the attns in each step
|
55 |
+
self.cross_attns_step = []
|
56 |
+
|
57 |
+
def after_step(self):
|
58 |
+
if self.cur_step > self.min_step and self.cur_step < self.max_step:
|
59 |
+
self.valid_steps += 1
|
60 |
+
if len(self.self_attns) == 0:
|
61 |
+
self.self_attns = self.self_attns_step
|
62 |
+
self.cross_attns = self.cross_attns_step
|
63 |
+
else:
|
64 |
+
for i in range(len(self.self_attns)):
|
65 |
+
self.self_attns[i] += self.self_attns_step[i]
|
66 |
+
self.cross_attns[i] += self.cross_attns_step[i]
|
67 |
+
self.self_attns_step.clear()
|
68 |
+
self.cross_attns_step.clear()
|
69 |
+
|
70 |
+
def forward(self, q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs):
|
71 |
+
if attn.shape[1] <= 64 ** 2: # avoid OOM
|
72 |
+
if is_cross:
|
73 |
+
self.cross_attns_step.append(attn)
|
74 |
+
else:
|
75 |
+
self.self_attns_step.append(attn)
|
76 |
+
return super().forward(q, k, v, sim, attn, is_cross, place_in_unet, num_heads, **kwargs)
|
77 |
+
|
78 |
+
|
79 |
+
def regiter_attention_editor_diffusers(model, editor: AttentionBase):
|
80 |
+
"""
|
81 |
+
Register an attention editor to a Diffusers pipeline, adapted from [Prompt-to-Prompt]
|
82 |
+
"""
|
83 |
+
def ca_forward(self, place_in_unet):
|
84 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
|
85 |
+
"""
|
86 |
+
The attention is similar to the original implementation of LDM CrossAttention class
|
87 |
+
except for some modifications to the attention computation
|
88 |
+
"""
|
89 |
+
if encoder_hidden_states is not None:
|
90 |
+
context = encoder_hidden_states
|
91 |
+
if attention_mask is not None:
|
92 |
+
mask = attention_mask
|
93 |
+
|
94 |
+
to_out = self.to_out
|
95 |
+
if isinstance(to_out, nn.modules.container.ModuleList):
|
96 |
+
to_out = self.to_out[0]
|
97 |
+
else:
|
98 |
+
to_out = self.to_out
|
99 |
+
|
100 |
+
h = self.heads
|
101 |
+
q = self.to_q(x)
|
102 |
+
is_cross = context is not None
|
103 |
+
context = context if is_cross else x
|
104 |
+
k = self.to_k(context)
|
105 |
+
v = self.to_v(context)
|
106 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
107 |
+
|
108 |
+
sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
|
109 |
+
|
110 |
+
if mask is not None:
|
111 |
+
mask = rearrange(mask, 'b ... -> b (...)')
|
112 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
113 |
+
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
114 |
+
mask = mask[:, None, :].repeat(h, 1, 1)
|
115 |
+
sim.masked_fill_(~mask, max_neg_value)
|
116 |
+
|
117 |
+
attn = sim.softmax(dim=-1)
|
118 |
+
# the only difference
|
119 |
+
out = editor(
|
120 |
+
q, k, v, sim, attn, is_cross, place_in_unet,
|
121 |
+
self.heads, scale=self.scale)
|
122 |
+
|
123 |
+
return to_out(out)
|
124 |
+
|
125 |
+
return forward
|
126 |
+
|
127 |
+
def register_editor(net, count, place_in_unet):
|
128 |
+
for name, subnet in net.named_children():
|
129 |
+
if net.__class__.__name__ == 'Attention': # spatial Transformer layer
|
130 |
+
net.forward = ca_forward(net, place_in_unet)
|
131 |
+
return count + 1
|
132 |
+
elif hasattr(net, 'children'):
|
133 |
+
count = register_editor(subnet, count, place_in_unet)
|
134 |
+
return count
|
135 |
+
|
136 |
+
cross_att_count = 0
|
137 |
+
for net_name, net in model.unet.named_children():
|
138 |
+
if "down" in net_name:
|
139 |
+
cross_att_count += register_editor(net, 0, "down")
|
140 |
+
elif "mid" in net_name:
|
141 |
+
cross_att_count += register_editor(net, 0, "mid")
|
142 |
+
elif "up" in net_name:
|
143 |
+
cross_att_count += register_editor(net, 0, "up")
|
144 |
+
editor.num_att_layers = cross_att_count
|
145 |
+
|
146 |
+
|
147 |
+
def regiter_attention_editor_ldm(model, editor: AttentionBase):
|
148 |
+
"""
|
149 |
+
Register an attention editor to a Stable Diffusion (LDM) model, adapted from [Prompt-to-Prompt]
|
150 |
+
"""
|
151 |
+
def ca_forward(self, place_in_unet):
|
152 |
+
def forward(x, encoder_hidden_states=None, attention_mask=None, context=None, mask=None):
|
153 |
+
"""
|
154 |
+
The attention is similar to the original implementation of LDM CrossAttention class
|
155 |
+
except for some modifications to the attention computation
|
156 |
+
"""
|
157 |
+
if encoder_hidden_states is not None:
|
158 |
+
context = encoder_hidden_states
|
159 |
+
if attention_mask is not None:
|
160 |
+
mask = attention_mask
|
161 |
+
|
162 |
+
to_out = self.to_out
|
163 |
+
if isinstance(to_out, nn.modules.container.ModuleList):
|
164 |
+
to_out = self.to_out[0]
|
165 |
+
else:
|
166 |
+
to_out = self.to_out
|
167 |
+
|
168 |
+
h = self.heads
|
169 |
+
q = self.to_q(x)
|
170 |
+
is_cross = context is not None
|
171 |
+
context = context if is_cross else x
|
172 |
+
k = self.to_k(context)
|
173 |
+
v = self.to_v(context)
|
174 |
+
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> (b h) n d', h=h), (q, k, v))
|
175 |
+
|
176 |
+
sim = torch.einsum('b i d, b j d -> b i j', q, k) * self.scale
|
177 |
+
|
178 |
+
if mask is not None:
|
179 |
+
mask = rearrange(mask, 'b ... -> b (...)')
|
180 |
+
max_neg_value = -torch.finfo(sim.dtype).max
|
181 |
+
mask = repeat(mask, 'b j -> (b h) () j', h=h)
|
182 |
+
mask = mask[:, None, :].repeat(h, 1, 1)
|
183 |
+
sim.masked_fill_(~mask, max_neg_value)
|
184 |
+
|
185 |
+
attn = sim.softmax(dim=-1)
|
186 |
+
# the only difference
|
187 |
+
out = editor(
|
188 |
+
q, k, v, sim, attn, is_cross, place_in_unet,
|
189 |
+
self.heads, scale=self.scale)
|
190 |
+
|
191 |
+
return to_out(out)
|
192 |
+
|
193 |
+
return forward
|
194 |
+
|
195 |
+
def register_editor(net, count, place_in_unet):
|
196 |
+
for name, subnet in net.named_children():
|
197 |
+
if net.__class__.__name__ == 'CrossAttention': # spatial Transformer layer
|
198 |
+
net.forward = ca_forward(net, place_in_unet)
|
199 |
+
return count + 1
|
200 |
+
elif hasattr(net, 'children'):
|
201 |
+
count = register_editor(subnet, count, place_in_unet)
|
202 |
+
return count
|
203 |
+
|
204 |
+
cross_att_count = 0
|
205 |
+
for net_name, net in model.model.diffusion_model.named_children():
|
206 |
+
if "input" in net_name:
|
207 |
+
cross_att_count += register_editor(net, 0, "input")
|
208 |
+
elif "middle" in net_name:
|
209 |
+
cross_att_count += register_editor(net, 0, "middle")
|
210 |
+
elif "output" in net_name:
|
211 |
+
cross_att_count += register_editor(net, 0, "output")
|
212 |
+
editor.num_att_layers = cross_att_count
|
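Besides MasaCtrl itself, the `AttentionStore` above can be registered on its own to collect attention maps, which is handy for inspecting which tokens drive which regions. A minimal sketch under the same `pipe` assumption as the earlier examples:

# Minimal sketch, assuming `pipe` exposes a .unet attribute.
from masactrl.masactrl_utils import AttentionStore, regiter_attention_editor_diffusers

store = AttentionStore(res=[32], min_step=0, max_step=1000)
regiter_attention_editor_diffusers(pipe, store)

_ = pipe("a photo of a corgi", num_inference_steps=50)

# self_attns / cross_attns hold maps accumulated (summed) over the valid denoising steps
print(len(store.self_attns), len(store.cross_attns), store.valid_steps)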
masactrl_w_adapter/ddim.py
ADDED
@@ -0,0 +1,375 @@
1 |
+
"""SAMPLING ONLY."""
|
2 |
+
|
3 |
+
import torch
|
4 |
+
import numpy as np
|
5 |
+
from tqdm import tqdm
|
6 |
+
|
7 |
+
from ldm.modules.diffusionmodules.util import make_ddim_sampling_parameters, make_ddim_timesteps, noise_like, \
|
8 |
+
extract_into_tensor
|
9 |
+
|
10 |
+
|
11 |
+
class DDIMSampler(object):
|
12 |
+
def __init__(self, model, schedule="linear", **kwargs):
|
13 |
+
super().__init__()
|
14 |
+
self.model = model
|
15 |
+
self.ddpm_num_timesteps = model.num_timesteps
|
16 |
+
self.schedule = schedule
|
17 |
+
|
18 |
+
def register_buffer(self, name, attr):
|
19 |
+
if type(attr) == torch.Tensor:
|
20 |
+
if attr.device != torch.device("cuda"):
|
21 |
+
attr = attr.to(torch.device("cuda"))
|
22 |
+
setattr(self, name, attr)
|
23 |
+
|
24 |
+
def make_schedule(self, ddim_num_steps, ddim_discretize="uniform", ddim_eta=0., verbose=True):
|
25 |
+
self.ddim_timesteps = make_ddim_timesteps(ddim_discr_method=ddim_discretize, num_ddim_timesteps=ddim_num_steps,
|
26 |
+
num_ddpm_timesteps=self.ddpm_num_timesteps, verbose=verbose)
|
27 |
+
alphas_cumprod = self.model.alphas_cumprod
|
28 |
+
assert alphas_cumprod.shape[0] == self.ddpm_num_timesteps, 'alphas have to be defined for each timestep'
|
29 |
+
to_torch = lambda x: x.clone().detach().to(torch.float32).to(self.model.device)
|
30 |
+
|
31 |
+
self.register_buffer('betas', to_torch(self.model.betas))
|
32 |
+
self.register_buffer('alphas_cumprod', to_torch(alphas_cumprod))
|
33 |
+
self.register_buffer('alphas_cumprod_prev', to_torch(self.model.alphas_cumprod_prev))
|
34 |
+
|
35 |
+
# calculations for diffusion q(x_t | x_{t-1}) and others
|
36 |
+
self.register_buffer('sqrt_alphas_cumprod', to_torch(np.sqrt(alphas_cumprod.cpu())))
|
37 |
+
self.register_buffer('sqrt_one_minus_alphas_cumprod', to_torch(np.sqrt(1. - alphas_cumprod.cpu())))
|
38 |
+
self.register_buffer('log_one_minus_alphas_cumprod', to_torch(np.log(1. - alphas_cumprod.cpu())))
|
39 |
+
self.register_buffer('sqrt_recip_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu())))
|
40 |
+
self.register_buffer('sqrt_recipm1_alphas_cumprod', to_torch(np.sqrt(1. / alphas_cumprod.cpu() - 1)))
|
41 |
+
|
42 |
+
# ddim sampling parameters
|
43 |
+
ddim_sigmas, ddim_alphas, ddim_alphas_prev = make_ddim_sampling_parameters(alphacums=alphas_cumprod.cpu(),
|
44 |
+
ddim_timesteps=self.ddim_timesteps,
|
45 |
+
eta=ddim_eta, verbose=verbose)
|
46 |
+
self.register_buffer('ddim_sigmas', ddim_sigmas)
|
47 |
+
self.register_buffer('ddim_alphas', ddim_alphas)
|
48 |
+
self.register_buffer('ddim_alphas_prev', ddim_alphas_prev)
|
49 |
+
self.register_buffer('ddim_sqrt_one_minus_alphas', np.sqrt(1. - ddim_alphas))
|
50 |
+
sigmas_for_original_sampling_steps = ddim_eta * torch.sqrt(
|
51 |
+
(1 - self.alphas_cumprod_prev) / (1 - self.alphas_cumprod) * (
|
52 |
+
1 - self.alphas_cumprod / self.alphas_cumprod_prev))
|
53 |
+
self.register_buffer('ddim_sigmas_for_original_num_steps', sigmas_for_original_sampling_steps)
|
54 |
+
|
55 |
+
@torch.no_grad()
|
56 |
+
def sample(self,
|
57 |
+
S,
|
58 |
+
batch_size,
|
59 |
+
shape,
|
60 |
+
conditioning=None,
|
61 |
+
callback=None,
|
62 |
+
normals_sequence=None,
|
63 |
+
img_callback=None,
|
64 |
+
quantize_x0=False,
|
65 |
+
eta=0.,
|
66 |
+
mask=None,
|
67 |
+
x0=None,
|
68 |
+
temperature=1.,
|
69 |
+
noise_dropout=0.,
|
70 |
+
score_corrector=None,
|
71 |
+
corrector_kwargs=None,
|
72 |
+
verbose=True,
|
73 |
+
x_T=None,
|
74 |
+
log_every_t=100,
|
75 |
+
unconditional_guidance_scale=1.,
|
76 |
+
unconditional_conditioning=None,
|
77 |
+
features_adapter=None,
|
78 |
+
append_to_context=None,
|
79 |
+
cond_tau=0.4,
|
80 |
+
style_cond_tau=1.0,
|
81 |
+
# this has to come in the same format as the conditioning, # e.g. as encoded tokens, ...
|
82 |
+
**kwargs
|
83 |
+
):
|
84 |
+
if conditioning is not None:
|
85 |
+
if isinstance(conditioning, dict):
|
86 |
+
cbs = conditioning[list(conditioning.keys())[0]].shape[0]
|
87 |
+
if cbs != batch_size:
|
88 |
+
print(f"Warning: Got {cbs} conditionings but batch-size is {batch_size}")
|
89 |
+
else:
|
90 |
+
if conditioning.shape[0] != batch_size:
|
91 |
+
print(f"Warning: Got {conditioning.shape[0]} conditionings but batch-size is {batch_size}")
|
92 |
+
|
93 |
+
self.make_schedule(ddim_num_steps=S, ddim_eta=eta, verbose=verbose)
|
94 |
+
# sampling
|
95 |
+
C, H, W = shape
|
96 |
+
size = (batch_size, C, H, W)
|
97 |
+
print(f'Data shape for DDIM sampling is {size}, eta {eta}')
|
98 |
+
|
99 |
+
samples, intermediates = self.ddim_sampling(conditioning, size,
|
100 |
+
callback=callback,
|
101 |
+
img_callback=img_callback,
|
102 |
+
quantize_denoised=quantize_x0,
|
103 |
+
mask=mask, x0=x0,
|
104 |
+
ddim_use_original_steps=False,
|
105 |
+
noise_dropout=noise_dropout,
|
106 |
+
temperature=temperature,
|
107 |
+
score_corrector=score_corrector,
|
108 |
+
corrector_kwargs=corrector_kwargs,
|
109 |
+
x_T=x_T,
|
110 |
+
log_every_t=log_every_t,
|
111 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
112 |
+
unconditional_conditioning=unconditional_conditioning,
|
113 |
+
features_adapter=features_adapter,
|
114 |
+
append_to_context=append_to_context,
|
115 |
+
cond_tau=cond_tau,
|
116 |
+
style_cond_tau=style_cond_tau,
|
117 |
+
)
|
118 |
+
return samples, intermediates
|
119 |
+
|
120 |
+
@torch.no_grad()
|
121 |
+
def ddim_sampling(self, cond, shape,
|
122 |
+
x_T=None, ddim_use_original_steps=False,
|
123 |
+
callback=None, timesteps=None, quantize_denoised=False,
|
124 |
+
mask=None, x0=None, img_callback=None, log_every_t=100,
|
125 |
+
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
126 |
+
unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
|
127 |
+
append_to_context=None, cond_tau=0.4, style_cond_tau=1.0):
|
128 |
+
device = self.model.betas.device
|
129 |
+
b = shape[0]
|
130 |
+
if x_T is None:
|
131 |
+
img = torch.randn(shape, device=device)
|
132 |
+
else:
|
133 |
+
img = x_T
|
134 |
+
|
135 |
+
if timesteps is None:
|
136 |
+
timesteps = self.ddpm_num_timesteps if ddim_use_original_steps else self.ddim_timesteps
|
137 |
+
elif timesteps is not None and not ddim_use_original_steps:
|
138 |
+
subset_end = int(min(timesteps / self.ddim_timesteps.shape[0], 1) * self.ddim_timesteps.shape[0]) - 1
|
139 |
+
timesteps = self.ddim_timesteps[:subset_end]
|
140 |
+
|
141 |
+
intermediates = {'x_inter': [img], 'pred_x0': [img]}
|
142 |
+
time_range = reversed(range(0, timesteps)) if ddim_use_original_steps else np.flip(timesteps)
|
143 |
+
total_steps = timesteps if ddim_use_original_steps else timesteps.shape[0]
|
144 |
+
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
145 |
+
|
146 |
+
iterator = tqdm(time_range, desc='DDIM Sampler', total=total_steps)
|
147 |
+
|
148 |
+
for i, step in enumerate(iterator):
|
149 |
+
index = total_steps - i - 1
|
150 |
+
ts = torch.full((b,), step, device=device, dtype=torch.long)
|
151 |
+
|
152 |
+
if mask is not None:
|
153 |
+
assert x0 is not None
|
154 |
+
img_orig = self.model.q_sample(x0, ts) # TODO: deterministic forward pass?
|
155 |
+
img = img_orig * mask + (1. - mask) * img
|
156 |
+
|
157 |
+
outs = self.p_sample_ddim(img, cond, ts, index=index, use_original_steps=ddim_use_original_steps,
|
158 |
+
quantize_denoised=quantize_denoised, temperature=temperature,
|
159 |
+
noise_dropout=noise_dropout, score_corrector=score_corrector,
|
160 |
+
corrector_kwargs=corrector_kwargs,
|
161 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
162 |
+
unconditional_conditioning=unconditional_conditioning,
|
163 |
+
features_adapter=None if index < int(
|
164 |
+
(1 - cond_tau) * total_steps) else features_adapter,
|
165 |
+
append_to_context=None if index < int(
|
166 |
+
(1 - style_cond_tau) * total_steps) else append_to_context,
|
167 |
+
)
|
168 |
+
img, pred_x0 = outs
|
169 |
+
if callback: callback(i)
|
170 |
+
if img_callback: img_callback(pred_x0, i)
|
171 |
+
|
172 |
+
if index % log_every_t == 0 or index == total_steps - 1:
|
173 |
+
intermediates['x_inter'].append(img)
|
174 |
+
intermediates['pred_x0'].append(pred_x0)
|
175 |
+
|
176 |
+
return img, intermediates
|
177 |
+
|
178 |
+
@torch.no_grad()
|
179 |
+
def p_sample_ddim(self, x, c, t, index, repeat_noise=False, use_original_steps=False, quantize_denoised=False,
|
180 |
+
temperature=1., noise_dropout=0., score_corrector=None, corrector_kwargs=None,
|
181 |
+
unconditional_guidance_scale=1., unconditional_conditioning=None, features_adapter=None,
|
182 |
+
append_to_context=None):
|
183 |
+
b, *_, device = *x.shape, x.device
|
184 |
+
|
185 |
+
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
186 |
+
if append_to_context is not None:
|
187 |
+
model_output = self.model.apply_model(x, t, torch.cat([c, append_to_context], dim=1),
|
188 |
+
features_adapter=features_adapter)
|
189 |
+
else:
|
190 |
+
model_output = self.model.apply_model(x, t, c, features_adapter=features_adapter)
|
191 |
+
else:
|
192 |
+
x_in = torch.cat([x] * 2)
|
193 |
+
t_in = torch.cat([t] * 2)
|
194 |
+
if isinstance(c, dict):
|
195 |
+
assert isinstance(unconditional_conditioning, dict)
|
196 |
+
c_in = dict()
|
197 |
+
for k in c:
|
198 |
+
if isinstance(c[k], list):
|
199 |
+
c_in[k] = [torch.cat([
|
200 |
+
unconditional_conditioning[k][i],
|
201 |
+
c[k][i]]) for i in range(len(c[k]))]
|
202 |
+
else:
|
203 |
+
c_in[k] = torch.cat([
|
204 |
+
unconditional_conditioning[k],
|
205 |
+
c[k]])
|
206 |
+
elif isinstance(c, list):
|
207 |
+
c_in = list()
|
208 |
+
assert isinstance(unconditional_conditioning, list)
|
209 |
+
for i in range(len(c)):
|
210 |
+
c_in.append(torch.cat([unconditional_conditioning[i], c[i]]))
|
211 |
+
else:
|
212 |
+
if append_to_context is not None:
|
213 |
+
pad_len = append_to_context.size(1)
|
214 |
+
new_unconditional_conditioning = torch.cat(
|
215 |
+
[unconditional_conditioning, unconditional_conditioning[:, -pad_len:, :]], dim=1)
|
216 |
+
new_c = torch.cat([c, append_to_context], dim=1)
|
217 |
+
c_in = torch.cat([new_unconditional_conditioning, new_c])
|
218 |
+
else:
|
219 |
+
c_in = torch.cat([unconditional_conditioning, c])
|
220 |
+
model_uncond, model_t = self.model.apply_model(x_in, t_in, c_in, features_adapter=features_adapter).chunk(2)
|
221 |
+
model_output = model_uncond + unconditional_guidance_scale * (model_t - model_uncond)
|
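# classifier-free guidance: eps_hat = eps_uncond + scale * (eps_cond - eps_uncond)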
222 |
+
|
223 |
+
if self.model.parameterization == "v":
|
224 |
+
e_t = self.model.predict_eps_from_z_and_v(x, t, model_output)
|
225 |
+
else:
|
226 |
+
e_t = model_output
|
227 |
+
|
228 |
+
if score_corrector is not None:
|
229 |
+
assert self.model.parameterization == "eps", 'not implemented'
|
230 |
+
e_t = score_corrector.modify_score(self.model, e_t, x, t, c, **corrector_kwargs)
|
231 |
+
|
232 |
+
alphas = self.model.alphas_cumprod if use_original_steps else self.ddim_alphas
|
233 |
+
alphas_prev = self.model.alphas_cumprod_prev if use_original_steps else self.ddim_alphas_prev
|
234 |
+
sqrt_one_minus_alphas = self.model.sqrt_one_minus_alphas_cumprod if use_original_steps else self.ddim_sqrt_one_minus_alphas
|
235 |
+
sigmas = self.model.ddim_sigmas_for_original_num_steps if use_original_steps else self.ddim_sigmas
|
236 |
+
# select parameters corresponding to the currently considered timestep
|
237 |
+
a_t = torch.full((b, 1, 1, 1), alphas[index], device=device)
|
238 |
+
a_prev = torch.full((b, 1, 1, 1), alphas_prev[index], device=device)
|
239 |
+
sigma_t = torch.full((b, 1, 1, 1), sigmas[index], device=device)
|
240 |
+
sqrt_one_minus_at = torch.full((b, 1, 1, 1), sqrt_one_minus_alphas[index], device=device)
|
241 |
+
|
242 |
+
# current prediction for x_0
|
243 |
+
if self.model.parameterization != "v":
|
244 |
+
pred_x0 = (x - sqrt_one_minus_at * e_t) / a_t.sqrt()
|
245 |
+
else:
|
246 |
+
pred_x0 = self.model.predict_start_from_z_and_v(x, t, model_output)
|
247 |
+
|
248 |
+
if quantize_denoised:
|
249 |
+
pred_x0, _, *_ = self.model.first_stage_model.quantize(pred_x0)
|
250 |
+
# direction pointing to x_t
|
251 |
+
dir_xt = (1. - a_prev - sigma_t ** 2).sqrt() * e_t
|
252 |
+
noise = sigma_t * noise_like(x.shape, device, repeat_noise) * temperature
|
253 |
+
if noise_dropout > 0.:
|
254 |
+
noise = torch.nn.functional.dropout(noise, p=noise_dropout)
|
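# DDIM update: x_{t-1} = sqrt(a_prev) * pred_x0 + sqrt(1 - a_prev - sigma_t^2) * e_t + sigma_t * z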
255 |
+
x_prev = a_prev.sqrt() * pred_x0 + dir_xt + noise
|
256 |
+
return x_prev, pred_x0
|
257 |
+
|
258 |
+
@torch.no_grad()
|
259 |
+
def stochastic_encode(self, x0, t, use_original_steps=False, noise=None):
|
260 |
+
# fast, but does not allow for exact reconstruction
|
261 |
+
# t serves as an index to gather the correct alphas
|
262 |
+
if use_original_steps:
|
263 |
+
sqrt_alphas_cumprod = self.sqrt_alphas_cumprod
|
264 |
+
sqrt_one_minus_alphas_cumprod = self.sqrt_one_minus_alphas_cumprod
|
265 |
+
else:
|
266 |
+
sqrt_alphas_cumprod = torch.sqrt(self.ddim_alphas)
|
267 |
+
sqrt_one_minus_alphas_cumprod = self.ddim_sqrt_one_minus_alphas
|
268 |
+
|
269 |
+
if noise is None:
|
270 |
+
noise = torch.randn_like(x0)
|
271 |
+
return (extract_into_tensor(sqrt_alphas_cumprod, t, x0.shape) * x0 +
|
272 |
+
extract_into_tensor(sqrt_one_minus_alphas_cumprod, t, x0.shape) * noise)
|
273 |
+
|
274 |
+
@torch.no_grad()
|
275 |
+
def decode(self, x_latent, cond, t_start, unconditional_guidance_scale=1.0, unconditional_conditioning=None,
|
276 |
+
use_original_steps=False):
|
277 |
+
|
278 |
+
timesteps = np.arange(self.ddpm_num_timesteps) if use_original_steps else self.ddim_timesteps
|
279 |
+
timesteps = timesteps[:t_start]
|
280 |
+
|
281 |
+
time_range = np.flip(timesteps)
|
282 |
+
total_steps = timesteps.shape[0]
|
283 |
+
print(f"Running DDIM Sampling with {total_steps} timesteps")
|
284 |
+
|
285 |
+
iterator = tqdm(time_range, desc='Decoding image', total=total_steps)
|
286 |
+
x_dec = x_latent
|
287 |
+
for i, step in enumerate(iterator):
|
288 |
+
index = total_steps - i - 1
|
289 |
+
ts = torch.full((x_latent.shape[0],), step, device=x_latent.device, dtype=torch.long)
|
290 |
+
x_dec, _ = self.p_sample_ddim(x_dec, cond, ts, index=index, use_original_steps=use_original_steps,
|
291 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
292 |
+
unconditional_conditioning=unconditional_conditioning)
|
293 |
+
return x_dec
|
294 |
+
|
295 |
+
def ddim_sampling_reverse(self,
|
296 |
+
num_steps=50,
|
297 |
+
x_0=None,
|
298 |
+
conditioning=None,
|
299 |
+
eta=0.,
|
300 |
+
verbose=False,
|
301 |
+
unconditional_guidance_scale=7.5,
|
302 |
+
unconditional_conditioning=None
|
303 |
+
):
|
304 |
+
"""
|
305 |
+
obtain the inverted x_T noisy image
|
306 |
+
"""
|
307 |
+
assert eta == 0., "eta should be 0. for deterministic sampling"
|
308 |
+
B = x_0.shape[0]
|
309 |
+
# scheduler
|
310 |
+
self.make_schedule(ddim_num_steps=num_steps, ddim_eta=eta, verbose=verbose)
|
311 |
+
self.register_buffer("ddim_sqrt_one_minus_alphas_prev", torch.tensor(1. - self.ddim_alphas_prev).sqrt())
|
312 |
+
# sampling
|
313 |
+
device = self.model.betas.device
|
314 |
+
|
315 |
+
intermediates = {"x_inter": [x_0], "pred_x0": []}
|
316 |
+
|
317 |
+
time_range = self.ddim_timesteps
|
318 |
+
print("selected steps for ddim inversion: ", time_range)
|
319 |
+
assert len(time_range) == num_steps, "the time range should have the same length as num_steps"
|
320 |
+
|
321 |
+
iterator = tqdm(time_range, desc='DDIM Inversion', total=num_steps)
|
322 |
+
x_t = x_0
|
323 |
+
for i, step in enumerate(iterator):
|
324 |
+
if i == 0:
|
325 |
+
step = 1
|
326 |
+
else:
|
327 |
+
step = time_range[i - 1]
|
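# feed the previous DDIM timestep to the model so that iteration i maps x_{t_{i-1}} to x_{t_i}; step = 1 stands in for t = 0 at the first iteration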
328 |
+
ts = torch.full((B, ), step, device=device, dtype=torch.long)
|
329 |
+
outs = self.p_sample_ddim_reverse(x_t,
|
330 |
+
conditioning,
|
331 |
+
ts,
|
332 |
+
index=i,
|
333 |
+
unconditional_guidance_scale=unconditional_guidance_scale,
|
334 |
+
unconditional_conditioning=unconditional_conditioning
|
335 |
+
)
|
336 |
+
x_t, pred_x0 = outs
|
337 |
+
intermediates["x_inter"].append(x_t)
|
338 |
+
intermediates["pred_x0"].append(pred_x0)
|
339 |
+
return x_t, intermediates
|
340 |
+
|
341 |
+
def p_sample_ddim_reverse(self,
|
342 |
+
x, c, t, index,
|
343 |
+
unconditional_guidance_scale=1.,
|
344 |
+
unconditional_conditioning=None):
|
345 |
+
B = x.shape[0]
|
346 |
+
device = x.device
|
347 |
+
|
348 |
+
if unconditional_conditioning is None or unconditional_guidance_scale == 1.:
|
349 |
+
e_t = self.model.apply_model(x, t, c)
|
350 |
+
else:
|
351 |
+
x_in = torch.cat([x] * 2)
|
352 |
+
t_in = torch.cat([t] * 2)
|
353 |
+
c_in = torch.cat([unconditional_conditioning, c])
|
354 |
+
e_t_uncond, e_t = self.model.apply_model(x_in, t_in, c_in).chunk(2)
|
355 |
+
e_t = e_t_uncond + unconditional_guidance_scale * (e_t - e_t_uncond)
|
356 |
+
|
357 |
+
# scheduler parameters
|
358 |
+
alphas = self.ddim_alphas
|
359 |
+
alphas_prev = self.ddim_alphas_prev
|
360 |
+
sqrt_one_minus_alphas_prev = self.ddim_sqrt_one_minus_alphas_prev
|
361 |
+
|
362 |
+
# select parameters corresponding to the currently considered timestep
|
363 |
+
alpha_cumprod_next = torch.full((B, 1, 1, 1), alphas[index], device=device)
|
364 |
+
alpha_cumprod_t = torch.full((B, 1, 1, 1), alphas_prev[index], device=device)
|
365 |
+
sqrt_one_minus_alpha_t = torch.full((B, 1, 1, 1), sqrt_one_minus_alphas_prev[index], device=device)
|
366 |
+
|
367 |
+
# current prediction for x_0
|
368 |
+
pred_x0 = (x - sqrt_one_minus_alpha_t * e_t) / alpha_cumprod_t.sqrt()
|
369 |
+
|
370 |
+
# direction pointing to x_t
|
371 |
+
dir_xt = (1. - alpha_cumprod_next).sqrt() * e_t
|
372 |
+
|
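# deterministic DDIM inversion update (eta = 0): x_{t_i} = sqrt(a_{t_i}) * pred_x0 + sqrt(1 - a_{t_i}) * e_t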
373 |
+
x_next = alpha_cumprod_next.sqrt() * pred_x0 + dir_xt
|
374 |
+
|
375 |
+
return x_next, pred_x0
|
masactrl_w_adapter/masactrl_w_adapter.py
ADDED
@@ -0,0 +1,217 @@
1 |
+
import os
|
2 |
+
|
3 |
+
import cv2
|
4 |
+
import torch
|
5 |
+
import torch.nn.functional as F
|
6 |
+
from basicsr.utils import tensor2img
|
7 |
+
from pytorch_lightning import seed_everything
|
8 |
+
from torch import autocast
|
9 |
+
from torchvision.io import read_image
|
10 |
+
|
11 |
+
from ldm.inference_base import (diffusion_inference, get_adapters, get_base_argument_parser, get_sd_models)
|
12 |
+
from ldm.modules.extra_condition import api
|
13 |
+
from ldm.modules.extra_condition.api import (ExtraCondition, get_adapter_feature, get_cond_model)
|
14 |
+
from ldm.util import fix_cond_shapes
|
15 |
+
|
16 |
+
# for masactrl
|
17 |
+
from masactrl.masactrl_utils import regiter_attention_editor_ldm
|
18 |
+
from masactrl.masactrl import MutualSelfAttentionControl
|
19 |
+
from masactrl.masactrl import MutualSelfAttentionControlMask
|
20 |
+
from masactrl.masactrl import MutualSelfAttentionControlMaskAuto
|
21 |
+
|
22 |
+
torch.set_grad_enabled(False)
|
23 |
+
|
24 |
+
|
25 |
+
def main():
|
26 |
+
supported_cond = [e.name for e in ExtraCondition]
|
27 |
+
parser = get_base_argument_parser()
|
28 |
+
parser.add_argument(
|
29 |
+
'--which_cond',
|
30 |
+
type=str,
|
31 |
+
required=True,
|
32 |
+
choices=supported_cond,
|
33 |
+
help='which condition modality you want to test',
|
34 |
+
)
|
35 |
+
# [MasaCtrl added] reference cond path
|
36 |
+
parser.add_argument(
|
37 |
+
"--cond_path_src",
|
38 |
+
type=str,
|
39 |
+
default=None,
|
40 |
+
help="the condition image path to synthesize the source image",
|
41 |
+
)
|
42 |
+
parser.add_argument(
|
43 |
+
"--prompt_src",
|
44 |
+
type=str,
|
45 |
+
default=None,
|
46 |
+
help="the prompt to synthesize the source image",
|
47 |
+
)
|
48 |
+
parser.add_argument(
|
49 |
+
"--src_img_path",
|
50 |
+
type=str,
|
51 |
+
default=None,
|
52 |
+
help="the input real source image path"
|
53 |
+
)
|
54 |
+
parser.add_argument(
|
55 |
+
"--start_code_path",
|
56 |
+
type=str,
|
57 |
+
default=None,
|
58 |
+
help="the inverted start code path to synthesize the source image",
|
59 |
+
)
|
60 |
+
parser.add_argument(
|
61 |
+
"--masa_step",
|
62 |
+
type=int,
|
63 |
+
default=4,
|
64 |
+
help="the starting step for MasaCtrl",
|
65 |
+
)
|
66 |
+
parser.add_argument(
|
67 |
+
"--masa_layer",
|
68 |
+
type=int,
|
69 |
+
default=10,
|
70 |
+
help="the starting layer for MasaCtrl",
|
71 |
+
)
|
72 |
+
|
73 |
+
opt = parser.parse_args()
|
74 |
+
which_cond = opt.which_cond
|
75 |
+
if opt.outdir is None:
|
76 |
+
opt.outdir = f'outputs/test-{which_cond}'
|
77 |
+
os.makedirs(opt.outdir, exist_ok=True)
|
78 |
+
if opt.resize_short_edge is None:
|
79 |
+
print(f"you don't specify the resize_shot_edge, so the maximum resolution is set to {opt.max_resolution}")
|
80 |
+
opt.device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
81 |
+
|
82 |
+
if os.path.isdir(opt.cond_path): # for conditioning image folder
|
83 |
+
image_paths = [os.path.join(opt.cond_path, f) for f in os.listdir(opt.cond_path)]
|
84 |
+
else:
|
85 |
+
image_paths = [opt.cond_path]
|
86 |
+
print(image_paths)
|
87 |
+
|
88 |
+
# prepare models
|
89 |
+
sd_model, sampler = get_sd_models(opt)
|
90 |
+
adapter = get_adapters(opt, getattr(ExtraCondition, which_cond))
|
91 |
+
cond_model = None
|
92 |
+
if opt.cond_inp_type == 'image':
|
93 |
+
cond_model = get_cond_model(opt, getattr(ExtraCondition, which_cond))
|
94 |
+
|
95 |
+
process_cond_module = getattr(api, f'get_cond_{which_cond}')
|
96 |
+
|
97 |
+
# [MasaCtrl added] default STEP and LAYER params for MasaCtrl
|
98 |
+
STEP = opt.masa_step if opt.masa_step is not None else 4
|
99 |
+
LAYER = opt.masa_layer if opt.masa_layer is not None else 10
|
100 |
+
|
101 |
+
# inference
|
102 |
+
with torch.inference_mode(), \
|
103 |
+
sd_model.ema_scope(), \
|
104 |
+
autocast('cuda'):
|
105 |
+
for test_idx, cond_path in enumerate(image_paths):
|
106 |
+
seed_everything(opt.seed)
|
107 |
+
for v_idx in range(opt.n_samples):
|
108 |
+
# seed_everything(opt.seed+v_idx+test_idx)
|
109 |
+
if opt.cond_path_src:
|
110 |
+
cond_src = process_cond_module(opt, opt.cond_path_src, opt.cond_inp_type, cond_model)
|
111 |
+
cond = process_cond_module(opt, cond_path, opt.cond_inp_type, cond_model)
|
112 |
+
|
113 |
+
base_count = len(os.listdir(opt.outdir)) // 2
|
114 |
+
cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_{which_cond}.png'), tensor2img(cond))
|
115 |
+
if opt.cond_path_src:
|
116 |
+
cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_{which_cond}_src.png'), tensor2img(cond_src))
|
117 |
+
|
118 |
+
adapter_features, append_to_context = get_adapter_feature(cond, adapter)
|
119 |
+
if opt.cond_path_src:
|
120 |
+
adapter_features_src, append_to_context_src = get_adapter_feature(cond_src, adapter)
|
121 |
+
|
122 |
+
if opt.cond_path_src:
|
123 |
+
print("using reference guidance to synthesize image")
|
124 |
+
adapter_features = [torch.cat([adapter_features_src[i], adapter_features[i]]) for i in range(len(adapter_features))]
|
125 |
+
else:
|
126 |
+
adapter_features = [torch.cat([torch.zeros_like(feats), feats]) for feats in adapter_features]
|
127 |
+
|
128 |
+
if opt.scale > 1.:
|
129 |
+
adapter_features = [torch.cat([feats] * 2) for feats in adapter_features]
|
130 |
+
|
131 |
+
# prepare the batch prompts
|
132 |
+
if opt.prompt_src is not None:
|
133 |
+
prompts = [opt.prompt_src, opt.prompt]
|
134 |
+
else:
|
135 |
+
prompts = [opt.prompt] * 2
|
136 |
+
print("promts: ", prompts)
|
137 |
+
# get text embedding
|
138 |
+
c = sd_model.get_learned_conditioning(prompts)
|
139 |
+
if opt.scale != 1.0:
|
140 |
+
uc = sd_model.get_learned_conditioning([""] * len(prompts))
|
141 |
+
else:
|
142 |
+
uc = None
|
143 |
+
c, uc = fix_cond_shapes(sd_model, c, uc)
|
144 |
+
|
145 |
+
if not hasattr(opt, 'H'):
|
146 |
+
opt.H = 512
|
147 |
+
opt.W = 512
|
148 |
+
shape = [opt.C, opt.H // opt.f, opt.W // opt.f]
|
149 |
+
if opt.src_img_path: # perform ddim inversion
|
150 |
+
|
151 |
+
src_img = read_image(opt.src_img_path)
|
152 |
+
src_img = src_img.float() / 255. # input normalized image [0, 1]
|
153 |
+
src_img = src_img * 2 - 1
|
154 |
+
if src_img.dim() == 3:
|
155 |
+
src_img = src_img.unsqueeze(0)
|
156 |
+
src_img = F.interpolate(src_img, (opt.H, opt.W))
|
157 |
+
src_img = src_img.to(opt.device)
|
158 |
+
# obtain initial latent
|
159 |
+
encoder_posterior = sd_model.encode_first_stage(src_img)
|
160 |
+
src_x_0 = sd_model.get_first_stage_encoding(encoder_posterior)
|
161 |
+
start_code, latents_dict = sampler.ddim_sampling_reverse(
|
162 |
+
num_steps=opt.steps,
|
163 |
+
x_0=src_x_0,
|
164 |
+
conditioning=uc[:1], # you may change here during inversion
|
165 |
+
unconditional_guidance_scale=opt.scale,
|
166 |
+
unconditional_conditioning=uc[:1],
|
167 |
+
)
|
168 |
+
torch.save(
|
169 |
+
{
|
170 |
+
"start_code": start_code
|
171 |
+
},
|
172 |
+
os.path.join(opt.outdir, "start_code.pth"),
|
173 |
+
)
|
174 |
+
elif opt.start_code_path:
|
175 |
+
# load the inverted start code
|
176 |
+
start_code_dict = torch.load(opt.start_code_path)
|
177 |
+
start_code = start_code_dict.get("start_code").to(opt.device)
|
178 |
+
else:
|
179 |
+
start_code = torch.randn([1, *shape], device=opt.device)
|
180 |
+
start_code = start_code.expand(len(prompts), -1, -1, -1)
|
181 |
+
|
182 |
+
# hijack the attention module
|
183 |
+
editor = MutualSelfAttentionControl(STEP, LAYER)
|
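# from denoising step STEP and U-Net layer LAYER onward, the target branch queries the source branch's self-attention keys and values (mutual self-attention)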
184 |
+
regiter_attention_editor_ldm(sd_model, editor)
|
185 |
+
|
186 |
+
samples_latents, _ = sampler.sample(
|
187 |
+
S=opt.steps,
|
188 |
+
conditioning=c,
|
189 |
+
batch_size=len(prompts),
|
190 |
+
shape=shape,
|
191 |
+
verbose=False,
|
192 |
+
unconditional_guidance_scale=opt.scale,
|
193 |
+
unconditional_conditioning=uc,
|
194 |
+
x_T=start_code,
|
195 |
+
features_adapter=adapter_features,
|
196 |
+
append_to_context=append_to_context,
|
197 |
+
cond_tau=opt.cond_tau,
|
198 |
+
style_cond_tau=opt.style_cond_tau,
|
199 |
+
)
|
200 |
+
|
201 |
+
x_samples = sd_model.decode_first_stage(samples_latents)
|
202 |
+
x_samples = torch.clamp((x_samples + 1.0) / 2.0, min=0.0, max=1.0)
|
203 |
+
|
204 |
+
cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_all_result.png'), tensor2img(x_samples))
|
205 |
+
# save the prompts and seed
|
206 |
+
with open(os.path.join(opt.outdir, "log.txt"), "w") as f:
|
207 |
+
for prom in prompts:
|
208 |
+
f.write(prom)
|
209 |
+
f.write("\n")
|
210 |
+
f.write(f"seed: {opt.seed}")
|
211 |
+
for i in range(len(x_samples)):
|
212 |
+
base_count += 1
|
213 |
+
cv2.imwrite(os.path.join(opt.outdir, f'{base_count:05}_result.png'), tensor2img(x_samples[i]))
|
214 |
+
|
215 |
+
|
216 |
+
if __name__ == '__main__':
|
217 |
+
main()
|
playground.ipynb
ADDED
@@ -0,0 +1,149 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"### MasaCtrl: Tuning-free Mutual Self-Attention Control for Consistent Image Synthesis and Editing"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"import os\n",
|
17 |
+
"import torch\n",
|
18 |
+
"import torch.nn as nn\n",
|
19 |
+
"import torch.nn.functional as F\n",
|
20 |
+
"\n",
|
21 |
+
"import numpy as np\n",
|
22 |
+
"\n",
|
23 |
+
"from tqdm import tqdm\n",
|
24 |
+
"from einops import rearrange, repeat\n",
|
25 |
+
"from omegaconf import OmegaConf\n",
|
26 |
+
"\n",
|
27 |
+
"from diffusers import DDIMScheduler\n",
|
28 |
+
"\n",
|
29 |
+
"from masactrl.diffuser_utils import MasaCtrlPipeline\n",
|
30 |
+
"from masactrl.masactrl_utils import AttentionBase\n",
|
31 |
+
"from masactrl.masactrl_utils import regiter_attention_editor_diffusers\n",
|
32 |
+
"\n",
|
33 |
+
"from torchvision.utils import save_image\n",
|
34 |
+
"from torchvision.io import read_image\n",
|
35 |
+
"from pytorch_lightning import seed_everything\n",
|
36 |
+
"\n",
|
37 |
+
"torch.cuda.set_device(0) # set the GPU device"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "markdown",
|
42 |
+
"metadata": {},
|
43 |
+
"source": [
|
44 |
+
"#### Model Construction"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"# Note that you may add your Hugging Face token to get access to the models\n",
|
54 |
+
"device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
|
55 |
+
"model_path = \"xyn-ai/anything-v4.0\"\n",
|
56 |
+
"# model_path = \"runwayml/stable-diffusion-v1-5\"\n",
|
57 |
+
"scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", clip_sample=False, set_alpha_to_one=False)\n",
|
58 |
+
"model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler, cross_attention_kwargs={\"scale\": 0.5}).to(device)"
|
59 |
+
]
|
60 |
+
},
|
61 |
+
{
|
62 |
+
"cell_type": "markdown",
|
63 |
+
"metadata": {},
|
64 |
+
"source": [
|
65 |
+
"#### Consistent synthesis with MasaCtrl"
|
66 |
+
]
|
67 |
+
},
|
68 |
+
{
|
69 |
+
"cell_type": "code",
|
70 |
+
"execution_count": null,
|
71 |
+
"metadata": {},
|
72 |
+
"outputs": [],
|
73 |
+
"source": [
|
74 |
+
"from masactrl.masactrl import MutualSelfAttentionControl\n",
|
75 |
+
"\n",
|
76 |
+
"\n",
|
77 |
+
"seed = 42\n",
|
78 |
+
"seed_everything(seed)\n",
|
79 |
+
"\n",
|
80 |
+
"out_dir = \"./workdir/masactrl_exp/\"\n",
|
81 |
+
"os.makedirs(out_dir, exist_ok=True)\n",
|
82 |
+
"sample_count = len(os.listdir(out_dir))\n",
|
83 |
+
"out_dir = os.path.join(out_dir, f\"sample_{sample_count}\")\n",
|
84 |
+
"os.makedirs(out_dir, exist_ok=True)\n",
|
85 |
+
"\n",
|
86 |
+
"prompts = [\n",
|
87 |
+
" \"1boy, casual, outdoors, sitting\", # source prompt\n",
|
88 |
+
" \"1boy, casual, outdoors, standing\" # target prompt\n",
|
89 |
+
"]\n",
|
90 |
+
"\n",
|
91 |
+
"# initialize the noise map\n",
|
92 |
+
"start_code = torch.randn([1, 4, 64, 64], device=device)\n",
|
93 |
+
"start_code = start_code.expand(len(prompts), -1, -1, -1)\n",
|
94 |
+
"\n",
|
95 |
+
"# inference the synthesized image without MasaCtrl\n",
|
96 |
+
"editor = AttentionBase()\n",
|
97 |
+
"regiter_attention_editor_diffusers(model, editor)\n",
|
98 |
+
"image_ori = model(prompts, latents=start_code, guidance_scale=7.5)\n",
|
99 |
+
"\n",
|
100 |
+
"# inference the synthesized image with MasaCtrl\n",
|
101 |
+
"STEP = 4\n",
|
102 |
+
"LAYPER = 10\n",
|
103 |
+
"\n",
|
104 |
+
"# hijack the attention module\n",
|
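"# from step STEP and layer LAYPER onward, the target prompt reuses the source prompt's self-attention keys and values\n",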
105 |
+
"editor = MutualSelfAttentionControl(STEP, LAYPER)\n",
|
106 |
+
"regiter_attention_editor_diffusers(model, editor)\n",
|
107 |
+
"\n",
|
108 |
+
"# inference the synthesized image\n",
|
109 |
+
"image_masactrl = model(prompts, latents=start_code, guidance_scale=7.5)[-1:]\n",
|
110 |
+
"\n",
|
111 |
+
"# save the synthesized image\n",
|
112 |
+
"out_image = torch.cat([image_ori, image_masactrl], dim=0)\n",
|
113 |
+
"save_image(out_image, os.path.join(out_dir, f\"all_step{STEP}_layer{LAYPER}.png\"))\n",
|
114 |
+
"save_image(out_image[0], os.path.join(out_dir, f\"source_step{STEP}_layer{LAYPER}.png\"))\n",
|
115 |
+
"save_image(out_image[1], os.path.join(out_dir, f\"without_step{STEP}_layer{LAYPER}.png\"))\n",
|
116 |
+
"save_image(out_image[2], os.path.join(out_dir, f\"masactrl_step{STEP}_layer{LAYPER}.png\"))\n",
|
117 |
+
"\n",
|
118 |
+
"print(\"Syntheiszed images are saved in\", out_dir)"
|
119 |
+
]
|
120 |
+
}
|
121 |
+
],
|
122 |
+
"metadata": {
|
123 |
+
"kernelspec": {
|
124 |
+
"display_name": "Python 3.8.5 ('ldm')",
|
125 |
+
"language": "python",
|
126 |
+
"name": "python3"
|
127 |
+
},
|
128 |
+
"language_info": {
|
129 |
+
"codemirror_mode": {
|
130 |
+
"name": "ipython",
|
131 |
+
"version": 3
|
132 |
+
},
|
133 |
+
"file_extension": ".py",
|
134 |
+
"mimetype": "text/x-python",
|
135 |
+
"name": "python",
|
136 |
+
"nbconvert_exporter": "python",
|
137 |
+
"pygments_lexer": "ipython3",
|
138 |
+
"version": "3.8.5"
|
139 |
+
},
|
140 |
+
"orig_nbformat": 4,
|
141 |
+
"vscode": {
|
142 |
+
"interpreter": {
|
143 |
+
"hash": "587aa04bacead72c1ffd459abbe4c8140b72ba2b534b24165b36a2ede3d95042"
|
144 |
+
}
|
145 |
+
}
|
146 |
+
},
|
147 |
+
"nbformat": 4,
|
148 |
+
"nbformat_minor": 2
|
149 |
+
}
|
playground_real.ipynb
ADDED
@@ -0,0 +1,188 @@
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"metadata": {},
|
6 |
+
"source": [
|
7 |
+
"### MasaCtrl: Tuning-free Mutual Self-Attention Control for Consistent Image Synthesis and Editing"
|
8 |
+
]
|
9 |
+
},
|
10 |
+
{
|
11 |
+
"cell_type": "code",
|
12 |
+
"execution_count": null,
|
13 |
+
"metadata": {},
|
14 |
+
"outputs": [],
|
15 |
+
"source": [
|
16 |
+
"import os\n",
|
17 |
+
"import torch\n",
|
18 |
+
"import torch.nn as nn\n",
|
19 |
+
"import torch.nn.functional as F\n",
|
20 |
+
"\n",
|
21 |
+
"import numpy as np\n",
|
22 |
+
"\n",
|
23 |
+
"from tqdm import tqdm\n",
|
24 |
+
"from einops import rearrange, repeat\n",
|
25 |
+
"from omegaconf import OmegaConf\n",
|
26 |
+
"\n",
|
27 |
+
"from diffusers import DDIMScheduler\n",
|
28 |
+
"\n",
|
29 |
+
"from masactrl.diffuser_utils import MasaCtrlPipeline\n",
|
30 |
+
"from masactrl.masactrl_utils import AttentionBase\n",
|
31 |
+
"from masactrl.masactrl_utils import regiter_attention_editor_diffusers\n",
|
32 |
+
"\n",
|
33 |
+
"from torchvision.utils import save_image\n",
|
34 |
+
"from torchvision.io import read_image\n",
|
35 |
+
"from pytorch_lightning import seed_everything\n",
|
36 |
+
"\n",
|
37 |
+
"torch.cuda.set_device(0) # set the GPU device"
|
38 |
+
]
|
39 |
+
},
|
40 |
+
{
|
41 |
+
"cell_type": "markdown",
|
42 |
+
"metadata": {},
|
43 |
+
"source": [
|
44 |
+
"#### Model Construction"
|
45 |
+
]
|
46 |
+
},
|
47 |
+
{
|
48 |
+
"cell_type": "code",
|
49 |
+
"execution_count": null,
|
50 |
+
"metadata": {},
|
51 |
+
"outputs": [],
|
52 |
+
"source": [
|
53 |
+
"# Note that you may add your Hugging Face token to get access to the models\n",
|
54 |
+
"device = torch.device(\"cuda\") if torch.cuda.is_available() else torch.device(\"cpu\")\n",
|
55 |
+
"# model_path = \"xyn-ai/anything-v4.0\"\n",
|
56 |
+
"model_path = \"CompVis/stable-diffusion-v1-4\"\n",
|
57 |
+
"# model_path = \"runwayml/stable-diffusion-v1-5\"\n",
|
58 |
+
"scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule=\"scaled_linear\", clip_sample=False, set_alpha_to_one=False)\n",
|
59 |
+
"model = MasaCtrlPipeline.from_pretrained(model_path, scheduler=scheduler).to(device)"
|
60 |
+
]
|
61 |
+
},
|
62 |
+
{
|
63 |
+
"cell_type": "markdown",
|
64 |
+
"metadata": {},
|
65 |
+
"source": [
|
66 |
+
"#### Real editing with MasaCtrl"
|
67 |
+
]
|
68 |
+
},
|
69 |
+
{
|
70 |
+
"cell_type": "code",
|
71 |
+
"execution_count": null,
|
72 |
+
"metadata": {},
|
73 |
+
"outputs": [],
|
74 |
+
"source": [
|
75 |
+
"from masactrl.masactrl import MutualSelfAttentionControl\n",
|
76 |
+
"from torchvision.io import read_image\n",
|
77 |
+
"\n",
|
78 |
+
"\n",
|
79 |
+
"def load_image(image_path, device):\n",
|
80 |
+
" image = read_image(image_path)\n",
|
81 |
+
" image = image[:3].unsqueeze_(0).float() / 127.5 - 1. # [-1, 1]\n",
|
82 |
+
" image = F.interpolate(image, (512, 512))\n",
|
83 |
+
" image = image.to(device)\n",
|
84 |
+
" return image\n",
|
85 |
+
"\n",
|
86 |
+
"\n",
|
87 |
+
"seed = 42\n",
|
88 |
+
"seed_everything(seed)\n",
|
89 |
+
"\n",
|
90 |
+
"out_dir = \"./workdir/masactrl_real_exp/\"\n",
|
91 |
+
"os.makedirs(out_dir, exist_ok=True)\n",
|
92 |
+
"sample_count = len(os.listdir(out_dir))\n",
|
93 |
+
"out_dir = os.path.join(out_dir, f\"sample_{sample_count}\")\n",
|
94 |
+
"os.makedirs(out_dir, exist_ok=True)\n",
|
95 |
+
"\n",
|
96 |
+
"# source image\n",
|
97 |
+
"SOURCE_IMAGE_PATH = \"./gradio_app/images/corgi.jpg\"\n",
|
98 |
+
"source_image = load_image(SOURCE_IMAGE_PATH, device)\n",
|
99 |
+
"\n",
|
100 |
+
"source_prompt = \"\"\n",
|
101 |
+
"target_prompt = \"a photo of a running corgi\"\n",
|
102 |
+
"prompts = [source_prompt, target_prompt]\n",
|
103 |
+
"\n",
|
104 |
+
"# invert the source image\n",
|
105 |
+
"start_code, latents_list = model.invert(source_image,\n",
|
106 |
+
" source_prompt,\n",
|
107 |
+
" guidance_scale=7.5,\n",
|
108 |
+
" num_inference_steps=50,\n",
|
109 |
+
" return_intermediates=True)\n",
|
110 |
+
"start_code = start_code.expand(len(prompts), -1, -1, -1)\n",
|
111 |
+
"\n",
|
112 |
+
"# results of direct synthesis\n",
|
113 |
+
"editor = AttentionBase()\n",
|
114 |
+
"regiter_attention_editor_diffusers(model, editor)\n",
|
115 |
+
"image_fixed = model([target_prompt],\n",
|
116 |
+
" latents=start_code[-1:],\n",
|
117 |
+
" num_inference_steps=50,\n",
|
118 |
+
" guidance_scale=7.5)\n",
|
119 |
+
"\n",
|
120 |
+
"# inference the synthesized image with MasaCtrl\n",
|
121 |
+
"STEP = 4\n",
|
122 |
+
"LAYPER = 10\n",
|
123 |
+
"\n",
|
124 |
+
"# hijack the attention module\n",
|
125 |
+
"editor = MutualSelfAttentionControl(STEP, LAYPER)\n",
|
126 |
+
"regiter_attention_editor_diffusers(model, editor)\n",
|
127 |
+
"\n",
|
128 |
+
"# inference the synthesized image\n",
|
129 |
+
"image_masactrl = model(prompts,\n",
|
130 |
+
" latents=start_code,\n",
|
131 |
+
" guidance_scale=7.5)\n",
|
132 |
+
"# Note: querying the inversion intermediate features latents_list\n",
|
133 |
+
"# may obtain better reconstruction and editing results\n",
|
134 |
+
"# image_masactrl = model(prompts,\n",
|
135 |
+
"# latents=start_code,\n",
|
136 |
+
"# guidance_scale=7.5,\n",
|
137 |
+
"# ref_intermediate_latents=latents_list)\n",
|
138 |
+
"\n",
|
139 |
+
"# save the synthesized image\n",
|
140 |
+
"out_image = torch.cat([source_image * 0.5 + 0.5,\n",
|
141 |
+
" image_masactrl[0:1],\n",
|
142 |
+
" image_fixed,\n",
|
143 |
+
" image_masactrl[-1:]], dim=0)\n",
|
144 |
+
"save_image(out_image, os.path.join(out_dir, f\"all_step{STEP}_layer{LAYPER}.png\"))\n",
|
145 |
+
"save_image(out_image[0], os.path.join(out_dir, f\"source_step{STEP}_layer{LAYPER}.png\"))\n",
|
146 |
+
"save_image(out_image[1], os.path.join(out_dir, f\"reconstructed_source_step{STEP}_layer{LAYPER}.png\"))\n",
|
147 |
+
"save_image(out_image[2], os.path.join(out_dir, f\"without_step{STEP}_layer{LAYPER}.png\"))\n",
|
148 |
+
"save_image(out_image[3], os.path.join(out_dir, f\"masactrl_step{STEP}_layer{LAYPER}.png\"))\n",
|
149 |
+
"\n",
|
150 |
+
"print(\"Syntheiszed images are saved in\", out_dir)"
|
151 |
+
]
|
152 |
+
},
|
153 |
+
{
|
154 |
+
"cell_type": "code",
|
155 |
+
"execution_count": null,
|
156 |
+
"metadata": {},
|
157 |
+
"outputs": [],
|
158 |
+
"source": []
|
159 |
+
}
|
160 |
+
],
|
161 |
+
"metadata": {
|
162 |
+
"kernelspec": {
|
163 |
+
"display_name": "Python 3.8.5 ('ldm')",
|
164 |
+
"language": "python",
|
165 |
+
"name": "python3"
|
166 |
+
},
|
167 |
+
"language_info": {
|
168 |
+
"codemirror_mode": {
|
169 |
+
"name": "ipython",
|
170 |
+
"version": 3
|
171 |
+
},
|
172 |
+
"file_extension": ".py",
|
173 |
+
"mimetype": "text/x-python",
|
174 |
+
"name": "python",
|
175 |
+
"nbconvert_exporter": "python",
|
176 |
+
"pygments_lexer": "ipython3",
|
177 |
+
"version": "3.8.5"
|
178 |
+
},
|
179 |
+
"orig_nbformat": 4,
|
180 |
+
"vscode": {
|
181 |
+
"interpreter": {
|
182 |
+
"hash": "587aa04bacead72c1ffd459abbe4c8140b72ba2b534b24165b36a2ede3d95042"
|
183 |
+
}
|
184 |
+
}
|
185 |
+
},
|
186 |
+
"nbformat": 4,
|
187 |
+
"nbformat_minor": 2
|
188 |
+
}
|
requirements.txt
ADDED
@@ -0,0 +1,13 @@
1 |
+
#diffusers==0.15.0
|
2 |
+
diffusers
|
3 |
+
transformers
|
4 |
+
opencv-python
|
5 |
+
einops
|
6 |
+
omegaconf
|
7 |
+
pytorch_lightning
|
8 |
+
torch
|
9 |
+
torchvision
|
10 |
+
gradio
|
11 |
+
httpx[socks]
|
12 |
+
huggingface_hub==0.25.0
|
13 |
+
moviepy
|
run_synthesis_genshin_impact_xl.py
ADDED
@@ -0,0 +1,117 @@
1 |
+
'''
|
2 |
+
python run_synthesis_genshin_impact_xl.py --model_path "svjack/GenshinImpact_XL_Base" \
|
3 |
+
--prompt1 "A portrait of an old man, facing camera, best quality" \
|
4 |
+
--prompt2 "A portrait of an old man, facing camera, smiling, best quality" --guidance_scale 3.5
|
5 |
+
|
6 |
+
python run_synthesis_genshin_impact_xl.py --model_path "svjack/GenshinImpact_XL_Base" \
|
7 |
+
--prompt1 "solo,ZHONGLI\(genshin impact\),1boy,highres," \
|
8 |
+
--prompt2 "solo,ZHONGLI drink tea use chinese cup \(genshin impact\),1boy,highres," --guidance_scale 5
|
9 |
+
|
10 |
+
from IPython import display
|
11 |
+
|
12 |
+
display.Image("masactrl_exp/solo_zhongli_drink_tea_use_chinese_cup___genshin_impact___1boy_highres_/sample_0/masactrl_step4_layer64.png", width=512, height=512)
|
13 |
+
display.Image("masactrl_exp/solo_zhongli_drink_tea_use_chinese_cup___genshin_impact___1boy_highres_/sample_1/source_step4_layer74.png", width=512, height=512)
|
14 |
+
'''
|
15 |
+
|
16 |
+
import os
|
17 |
+
import torch
|
18 |
+
import torch.nn as nn
|
19 |
+
import torch.nn.functional as F
|
20 |
+
|
21 |
+
import numpy as np
|
22 |
+
|
23 |
+
from tqdm import tqdm
|
24 |
+
from einops import rearrange, repeat
|
25 |
+
from omegaconf import OmegaConf
|
26 |
+
|
27 |
+
from diffusers import DDIMScheduler, DiffusionPipeline
|
28 |
+
|
29 |
+
from masactrl.diffuser_utils import MasaCtrlPipeline
|
30 |
+
from masactrl.masactrl_utils import AttentionBase
|
31 |
+
from masactrl.masactrl_utils import regiter_attention_editor_diffusers
|
32 |
+
from masactrl.masactrl import MutualSelfAttentionControl
|
33 |
+
|
34 |
+
from torchvision.utils import save_image
|
35 |
+
from torchvision.io import read_image
|
36 |
+
from pytorch_lightning import seed_everything
|
37 |
+
|
38 |
+
import argparse
|
39 |
+
import re
|
40 |
+
|
41 |
+
torch.cuda.set_device(0) # set the GPU device
|
42 |
+
|
43 |
+
# Note that you may add your Hugging Face token to get access to the models
|
44 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
45 |
+
|
46 |
+
def pathify(s):
|
47 |
+
# Convert to lowercase and replace non-alphanumeric characters with underscores
|
48 |
+
return re.sub(r'[^a-zA-Z0-9]', '_', s.lower())
|
49 |
+
|
50 |
+
def consistent_synthesis(args):
|
51 |
+
seed = 42
|
52 |
+
seed_everything(seed)
|
53 |
+
|
54 |
+
# Create the output directory based on prompt2
|
55 |
+
out_dir_ori = os.path.join("masactrl_exp", pathify(args.prompt2))
|
56 |
+
os.makedirs(out_dir_ori, exist_ok=True)
|
57 |
+
|
58 |
+
prompts = [
|
59 |
+
args.prompt1,
|
60 |
+
args.prompt2,
|
61 |
+
]
|
62 |
+
|
63 |
+
# inference the synthesized image with MasaCtrl
|
64 |
+
# TODO: note that the hyperparameters of MasaCtrl for SDXL may not be optimal
|
65 |
+
STEP = 4
|
66 |
+
#LAYER_LIST = [44, 54, 64] # run the synthesis with MasaCtrl at three different layer configs
|
67 |
+
#LAYER_LIST = [64, 74, 84, 94] # run the synthesis with MasaCtrl at three different layer configs
|
68 |
+
LAYER_LIST = [64, 74] # run the synthesis with MasaCtrl at two different layer configs
|
69 |
+
|
70 |
+
# initialize the noise map
|
71 |
+
start_code = torch.randn([1, 4, 128, 128], device=device)
|
72 |
+
start_code = start_code.expand(len(prompts), -1, -1, -1)
|
73 |
+
|
74 |
+
# Load the model
|
75 |
+
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
|
76 |
+
model = DiffusionPipeline.from_pretrained(args.model_path, scheduler=scheduler).to(device)
|
77 |
+
|
78 |
+
# inference the synthesized image without MasaCtrl
|
79 |
+
editor = AttentionBase()
|
80 |
+
regiter_attention_editor_diffusers(model, editor)
|
81 |
+
image_ori = model(prompts, latents=start_code, guidance_scale=args.guidance_scale).images
|
82 |
+
|
83 |
+
for LAYER in LAYER_LIST:
|
84 |
+
# hijack the attention module
|
85 |
+
editor = MutualSelfAttentionControl(STEP, LAYER, model_type="SDXL")
|
86 |
+
regiter_attention_editor_diffusers(model, editor)
|
87 |
+
|
88 |
+
# inference the synthesized image
|
89 |
+
image_masactrl = model(prompts, latents=start_code, guidance_scale=args.guidance_scale).images
|
90 |
+
|
91 |
+
sample_count = len(os.listdir(out_dir_ori))
|
92 |
+
out_dir = os.path.join(out_dir_ori, f"sample_{sample_count}")
|
93 |
+
os.makedirs(out_dir, exist_ok=True)
|
94 |
+
image_ori[0].save(os.path.join(out_dir, f"source_step{STEP}_layer{LAYER}.png"))
|
95 |
+
image_ori[1].save(os.path.join(out_dir, f"without_step{STEP}_layer{LAYER}.png"))
|
96 |
+
image_masactrl[-1].save(os.path.join(out_dir, f"masactrl_step{STEP}_layer{LAYER}.png"))
|
97 |
+
with open(os.path.join(out_dir, f"prompts.txt"), "w") as f:
|
98 |
+
for p in prompts:
|
99 |
+
f.write(p + "\n")
|
100 |
+
f.write(f"seed: {seed}\n")
|
101 |
+
print("Syntheiszed images are saved in", out_dir)
|
102 |
+
|
103 |
+
if __name__ == "__main__":
|
104 |
+
parser = argparse.ArgumentParser(description="Consistent Synthesis with MasaCtrl")
|
105 |
+
parser.add_argument("--model_path", type=str, default="svjack/GenshinImpact_XL_Base", help="Path to the model")
|
106 |
+
parser.add_argument("--prompt1", type=str, default="A portrait of an old man, facing camera, best quality", help="First prompt")
|
107 |
+
parser.add_argument("--prompt2", type=str, default="A portrait of an old man, facing camera, smiling, best quality", help="Second prompt")
|
108 |
+
parser.add_argument("--guidance_scale", type=float, default=7.5, help="Guidance scale")
|
109 |
+
parser.add_argument("--out_dir", type=str, default=None, help="Output directory")
|
110 |
+
|
111 |
+
args = parser.parse_args()
|
112 |
+
|
113 |
+
# If out_dir is not provided, use the default path based on prompt2
|
114 |
+
if args.out_dir is None:
|
115 |
+
args.out_dir = os.path.join("masactrl_exp", pathify(args.prompt2))
|
116 |
+
|
117 |
+
consistent_synthesis(args)
|
run_synthesis_genshin_impact_xl_app.py
ADDED
@@ -0,0 +1,97 @@
1 |
+
import gradio as gr
|
2 |
+
import torch
|
3 |
+
from diffusers import DDIMScheduler, DiffusionPipeline
|
4 |
+
from masactrl.diffuser_utils import MasaCtrlPipeline
|
5 |
+
from masactrl.masactrl_utils import AttentionBase, regiter_attention_editor_diffusers
|
6 |
+
from masactrl.masactrl import MutualSelfAttentionControl
|
7 |
+
from pytorch_lightning import seed_everything
|
8 |
+
import os
|
9 |
+
import re
|
10 |
+
|
11 |
+
# Initialize the device and the model
|
12 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
13 |
+
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
|
14 |
+
model = DiffusionPipeline.from_pretrained("svjack/GenshinImpact_XL_Base", scheduler=scheduler).to(device)
|
15 |
+
|
16 |
+
def pathify(s):
|
17 |
+
return re.sub(r'[^a-zA-Z0-9]', '_', s.lower())
|
18 |
+
|
19 |
+
def consistent_synthesis(prompt1, prompt2, guidance_scale, seed, starting_step, starting_layer):
|
20 |
+
seed_everything(seed)
|
21 |
+
|
22 |
+
# Create the output directory
|
23 |
+
out_dir_ori = os.path.join("masactrl_exp", pathify(prompt2))
|
24 |
+
os.makedirs(out_dir_ori, exist_ok=True)
|
25 |
+
|
26 |
+
prompts = [prompt1, prompt2]
|
27 |
+
|
28 |
+
# Initialize the noise map
|
29 |
+
start_code = torch.randn([1, 4, 128, 128], device=device)
|
30 |
+
start_code = start_code.expand(len(prompts), -1, -1, -1)
|
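# both prompts share the same initial noise, so any difference comes from the prompt text (and from MasaCtrl below)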
31 |
+
|
32 |
+
# Generate the images without MasaCtrl
|
33 |
+
editor = AttentionBase()
|
34 |
+
regiter_attention_editor_diffusers(model, editor)
|
35 |
+
image_ori = model(prompts, latents=start_code, guidance_scale=guidance_scale).images
|
36 |
+
|
37 |
+
images = []
|
38 |
+
# Hijack the attention module
|
39 |
+
editor = MutualSelfAttentionControl(starting_step, starting_layer, model_type="SDXL")
|
40 |
+
regiter_attention_editor_diffusers(model, editor)
|
41 |
+
|
42 |
+
# Generate the images with MasaCtrl
|
43 |
+
image_masactrl = model(prompts, latents=start_code, guidance_scale=guidance_scale).images
|
44 |
+
|
45 |
+
sample_count = len(os.listdir(out_dir_ori))
|
46 |
+
out_dir = os.path.join(out_dir_ori, f"sample_{sample_count}")
|
47 |
+
os.makedirs(out_dir, exist_ok=True)
|
48 |
+
image_ori[0].save(os.path.join(out_dir, f"source_step{starting_step}_layer{starting_layer}.png"))
|
49 |
+
image_ori[1].save(os.path.join(out_dir, f"without_step{starting_step}_layer{starting_layer}.png"))
|
50 |
+
image_masactrl[-1].save(os.path.join(out_dir, f"masactrl_step{starting_step}_layer{starting_layer}.png"))
|
51 |
+
with open(os.path.join(out_dir, f"prompts.txt"), "w") as f:
|
52 |
+
for p in prompts:
|
53 |
+
f.write(p + "\n")
|
54 |
+
f.write(f"seed: {seed}\n")
|
55 |
+
f.write(f"starting_step: {starting_step}\n")
|
56 |
+
f.write(f"starting_layer: {starting_layer}\n")
|
57 |
+
print("Synthesized images are saved in", out_dir)
|
58 |
+
|
59 |
+
return [image_ori[0], image_ori[1], image_masactrl[-1]]
|
60 |
+
|
61 |
+
def create_demo_synthesis():
|
62 |
+
with gr.Blocks() as demo:
|
63 |
+
gr.Markdown("# **Genshin Impact XL MasaCtrl Image Synthesis**") # 添加标题
|
64 |
+
gr.Markdown("## **Input Settings**")
|
65 |
+
with gr.Row():
|
66 |
+
with gr.Column():
|
67 |
+
prompt1 = gr.Textbox(label="Prompt 1", value="solo,ZHONGLI(genshin impact),1boy,highres,")
|
68 |
+
prompt2 = gr.Textbox(label="Prompt 2", value="solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,")
|
69 |
+
with gr.Row():
|
70 |
+
starting_step = gr.Slider(label="Starting Step", minimum=0, maximum=999, value=4, step=1)
|
71 |
+
starting_layer = gr.Slider(label="Starting Layer", minimum=0, maximum=999, value=64, step=1)
|
72 |
+
run_btn = gr.Button("Run")
|
73 |
+
with gr.Column():
|
74 |
+
guidance_scale = gr.Slider(label="Guidance Scale", minimum=0.1, maximum=30.0, value=7.5, step=0.1)
|
75 |
+
seed = gr.Slider(label="Seed", minimum=-1, maximum=2147483647, value=42, step=1)
|
76 |
+
|
77 |
+
gr.Markdown("## **Output**")
|
78 |
+
with gr.Row():
|
79 |
+
image_source = gr.Image(label="Source Image")
|
80 |
+
image_without_masactrl = gr.Image(label="Image without MasaCtrl")
|
81 |
+
image_with_masactrl = gr.Image(label="Image with MasaCtrl")
|
82 |
+
|
83 |
+
inputs = [prompt1, prompt2, guidance_scale, seed, starting_step, starting_layer]
|
84 |
+
run_btn.click(consistent_synthesis, inputs, [image_source, image_without_masactrl, image_with_masactrl])
|
85 |
+
|
86 |
+
gr.Examples(
|
87 |
+
[
|
88 |
+
["solo,ZHONGLI(genshin impact),1boy,highres,", "solo,ZHONGLI drink tea use chinese cup (genshin impact),1boy,highres,", 42, 4, 64],
|
89 |
+
["solo,KAMISATO AYATO(genshin impact),1boy,highres,", "solo,KAMISATO AYATO smiling (genshin impact),1boy,highres,", 42, 4, 55]
|
90 |
+
],
|
91 |
+
[prompt1, prompt2, seed, starting_step, starting_layer],
|
92 |
+
)
|
93 |
+
return demo
|
94 |
+
|
95 |
+
if __name__ == "__main__":
|
96 |
+
demo_synthesis = create_demo_synthesis()
|
97 |
+
demo_synthesis.launch(share = True)
|
run_synthesis_sdxl.py
ADDED
@@ -0,0 +1,83 @@
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
import torch.nn.functional as F
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from tqdm import tqdm
|
9 |
+
from einops import rearrange, repeat
|
10 |
+
from omegaconf import OmegaConf
|
11 |
+
|
12 |
+
from diffusers import DDIMScheduler, DiffusionPipeline
|
13 |
+
|
14 |
+
from masactrl.diffuser_utils import MasaCtrlPipeline
|
15 |
+
from masactrl.masactrl_utils import AttentionBase
|
16 |
+
from masactrl.masactrl_utils import regiter_attention_editor_diffusers
|
17 |
+
from masactrl.masactrl import MutualSelfAttentionControl
|
18 |
+
|
19 |
+
from torchvision.utils import save_image
|
20 |
+
from torchvision.io import read_image
|
21 |
+
from pytorch_lightning import seed_everything
|
22 |
+
|
23 |
+
torch.cuda.set_device(0) # set the GPU device
|
24 |
+
|
25 |
+
# Note that you may add your Hugging Face token to get access to the models
|
26 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
27 |
+
|
28 |
+
model_path = "stabilityai/stable-diffusion-xl-base-1.0"
|
29 |
+
# model_path = "Linaqruf/animagine-xl"
|
30 |
+
scheduler = DDIMScheduler(beta_start=0.00085, beta_end=0.012, beta_schedule="scaled_linear", clip_sample=False, set_alpha_to_one=False)
|
31 |
+
model = DiffusionPipeline.from_pretrained(model_path, scheduler=scheduler).to(device)
|
32 |
+
|
33 |
+
|
34 |
+
def consistent_synthesis():
|
35 |
+
seed = 42
|
36 |
+
seed_everything(seed)
|
37 |
+
|
38 |
+
out_dir_ori = "./workdir/masactrl_exp/oldman_smiling"
|
39 |
+
os.makedirs(out_dir_ori, exist_ok=True)
|
40 |
+
|
41 |
+
prompts = [
|
42 |
+
"A portrait of an old man, facing camera, best quality",
|
43 |
+
"A portrait of an old man, facing camera, smiling, best quality",
|
44 |
+
]
|
45 |
+
|
46 |
+
# inference the synthesized image with MasaCtrl
|
47 |
+
# TODO: note that the hyperparameters of MasaCtrl for SDXL may not be optimal
|
48 |
+
STEP = 4
|
49 |
+
LAYER_LIST = [44, 54, 64] # run the synthesis with MasaCtrl at three different layer configs
|
50 |
+
|
51 |
+
# initialize the noise map
|
52 |
+
start_code = torch.randn([1, 4, 128, 128], device=device)
|
53 |
+
# start_code = None
|
54 |
+
start_code = start_code.expand(len(prompts), -1, -1, -1)
|
55 |
+
|
56 |
+
# inference the synthesized image without MasaCtrl
|
57 |
+
editor = AttentionBase()
|
58 |
+
regiter_attention_editor_diffusers(model, editor)
|
59 |
+
image_ori = model(prompts, latents=start_code, guidance_scale=7.5).images
|
60 |
+
|
61 |
+
for LAYER in LAYER_LIST:
|
62 |
+
# hijack the attention module
|
63 |
+
editor = MutualSelfAttentionControl(STEP, LAYER, model_type="SDXL")
|
64 |
+
regiter_attention_editor_diffusers(model, editor)
|
65 |
+
|
66 |
+
# inference the synthesized image
|
67 |
+
image_masactrl = model(prompts, latents=start_code, guidance_scale=7.5).images
|
68 |
+
|
69 |
+
sample_count = len(os.listdir(out_dir_ori))
|
70 |
+
out_dir = os.path.join(out_dir_ori, f"sample_{sample_count}")
|
71 |
+
os.makedirs(out_dir, exist_ok=True)
|
72 |
+
image_ori[0].save(os.path.join(out_dir, f"source_step{STEP}_layer{LAYER}.png"))
|
73 |
+
image_ori[1].save(os.path.join(out_dir, f"without_step{STEP}_layer{LAYER}.png"))
|
74 |
+
image_masactrl[-1].save(os.path.join(out_dir, f"masactrl_step{STEP}_layer{LAYER}.png"))
|
75 |
+
with open(os.path.join(out_dir, f"prompts.txt"), "w") as f:
|
76 |
+
for p in prompts:
|
77 |
+
f.write(p + "\n")
|
78 |
+
f.write(f"seed: {seed}\n")
|
79 |
+
print("Syntheiszed images are saved in", out_dir)
|
80 |
+
|
81 |
+
|
82 |
+
if __name__ == "__main__":
|
83 |
+
consistent_synthesis()
|
run_synthesis_sdxl_processor.py
ADDED
@@ -0,0 +1,90 @@
1 |
+
import os
|
2 |
+
import torch
|
3 |
+
import torch.nn as nn
|
4 |
+
import torch.nn.functional as F
|
5 |
+
|
6 |
+
import numpy as np
|
7 |
+
|
8 |
+
from tqdm import tqdm
|
9 |
+
from einops import rearrange, repeat
|
10 |
+
from omegaconf import OmegaConf
|
11 |
+
from diffusers import DDIMScheduler, StableDiffusionPipeline, DiffusionPipeline
|
12 |
+
from torchvision.utils import save_image
|
13 |
+
from torchvision.io import read_image
|
14 |
+
from pytorch_lightning import seed_everything
|
15 |
+
|
16 |
+
from masactrl.masactrl_processor import register_attention_processor
|
17 |
+
|
18 |
+
torch.cuda.set_device(0) # set the GPU device
|
19 |
+
|
20 |
+
# Note that you may add your Hugging Face token to get access to the models
|
21 |
+
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
|
22 |
+
weight_dtype = torch.float16
|
23 |
+
model_path = "stabilityai/stable-diffusion-xl-base-1.0"
|
24 |
+
scheduler = DDIMScheduler(
|
25 |
+
beta_start=0.00085,
|
26 |
+
beta_end=0.012,
|
27 |
+
beta_schedule="scaled_linear",
|
28 |
+
clip_sample=False,
|
29 |
+
set_alpha_to_one=False
|
30 |
+
)
|
31 |
+
pipe = DiffusionPipeline.from_pretrained(
|
32 |
+
model_path,
|
33 |
+
scheduler=scheduler,
|
34 |
+
torch_dtype=weight_dtype
|
35 |
+
).to(device)
|
36 |
+
|
37 |
+
|
38 |
+
def consistent_synthesis():
|
39 |
+
seed = 42
|
40 |
+
seed_everything(seed)
|
41 |
+
|
42 |
+
out_dir_ori = "./workdir/masactrl_exp/oldman_smiling"
|
43 |
+
os.makedirs(out_dir_ori, exist_ok=True)
|
44 |
+
|
45 |
+
prompts = [
|
46 |
+
"A portrait of an old man, facing camera, best quality",
|
47 |
+
"A portrait of an old man, facing camera, smiling, best quality",
|
48 |
+
]
|
49 |
+
|
50 |
+
# inference the synthesized image with MasaCtrl
|
51 |
+
# TODO: note that the hyperparameters of MasaCtrl for SDXL may not be optimal
|
52 |
+
STEP = 4
|
53 |
+
LAYER_LIST = [44, 54, 64] # run the synthesis with MasaCtrl at three different layer configs
|
54 |
+
MODEL_TYPE = "SDXL"
|
55 |
+
|
56 |
+
# initialize the noise map
|
57 |
+
start_code = torch.randn([1, 4, 128, 128], dtype=weight_dtype, device=device)
|
58 |
+
# start_code = None
|
59 |
+
start_code = start_code.expand(len(prompts), -1, -1, -1)
|
60 |
+
|
61 |
+
# inference the synthesized image without MasaCtrl
|
62 |
+
image_ori = pipe(prompts, latents=start_code, guidance_scale=7.5).images
|
63 |
+
|
64 |
+
for LAYER in LAYER_LIST:
|
65 |
+
# hijack the attention module with MasaCtrl processor
|
66 |
+
processor_args = {
|
67 |
+
"start_step": STEP,
|
68 |
+
"start_layer": LAYER,
|
69 |
+
"model_type": MODEL_TYPE
|
70 |
+
}
|
71 |
+
register_attention_processor(pipe.unet, processor_type="MasaCtrlProcessor", **processor_args)  # pass the MasaCtrl step/layer settings (assumes extra kwargs are forwarded to the processor)
|
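# processor-based variant of MasaCtrl: installs an attention processor on the UNet instead of patching the attention forward pass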
72 |
+
|
73 |
+
# inference the synthesized image
|
74 |
+
image_masactrl = pipe(prompts, latents=start_code, guidance_scale=7.5).images
|
75 |
+
|
76 |
+
sample_count = len(os.listdir(out_dir_ori))
|
77 |
+
out_dir = os.path.join(out_dir_ori, f"sample_{sample_count}")
|
78 |
+
os.makedirs(out_dir, exist_ok=True)
|
79 |
+
image_ori[0].save(os.path.join(out_dir, f"source_step{STEP}_layer{LAYER}.png"))
|
80 |
+
image_ori[1].save(os.path.join(out_dir, f"without_step{STEP}_layer{LAYER}.png"))
|
81 |
+
image_masactrl[-1].save(os.path.join(out_dir, f"masactrl_step{STEP}_layer{LAYER}.png"))
|
82 |
+
with open(os.path.join(out_dir, f"prompts.txt"), "w") as f:
|
83 |
+
for p in prompts:
|
84 |
+
f.write(p + "\n")
|
85 |
+
f.write(f"seed: {seed}\n")
|
86 |
+
print("Syntheiszed images are saved in", out_dir)
|
87 |
+
|
88 |
+
|
89 |
+
if __name__ == "__main__":
|
90 |
+
consistent_synthesis()
|
style.css
ADDED
@@ -0,0 +1,3 @@
1 |
+
h1 {
|
2 |
+
text-align: center;
|
3 |
+
}
|