ziheng1234 committed
Commit 3e8fe6c · verified · 1 Parent(s): e5cb7a0

Upload 39 files

.gitattributes CHANGED
@@ -33,3 +33,18 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ test_imgs/2.png filter=lfs diff=lfs merge=lfs -text
37
+ test_imgs/3.png filter=lfs diff=lfs merge=lfs -text
38
+ test_imgs/generated_1_bbox.png filter=lfs diff=lfs merge=lfs -text
39
+ test_imgs/generated_1.png filter=lfs diff=lfs merge=lfs -text
40
+ test_imgs/generated_2_bbox.png filter=lfs diff=lfs merge=lfs -text
41
+ test_imgs/generated_2.png filter=lfs diff=lfs merge=lfs -text
42
+ test_imgs/generated_3_bbox_1.png filter=lfs diff=lfs merge=lfs -text
43
+ test_imgs/generated_3_bbox.png filter=lfs diff=lfs merge=lfs -text
44
+ test_imgs/generated_3.png filter=lfs diff=lfs merge=lfs -text
45
+ test_imgs/product_1_bbox.png filter=lfs diff=lfs merge=lfs -text
46
+ test_imgs/product_2_bbox.png filter=lfs diff=lfs merge=lfs -text
47
+ test_imgs/product_2.png filter=lfs diff=lfs merge=lfs -text
48
+ test_imgs/product_3_bbox_1.png filter=lfs diff=lfs merge=lfs -text
49
+ test_imgs/product_3_bbox.png filter=lfs diff=lfs merge=lfs -text
50
+ test_imgs/product_3.png filter=lfs diff=lfs merge=lfs -text
app.py ADDED
@@ -0,0 +1,720 @@
1
+ # import os
2
+ # os.system("pip uninstall -y gradio")
3
+ # os.system("pip install gradio==5.49.1")
4
+ # os.system("pip uninstall -y gradio_image_annotation")
5
+ # os.system("pip install gradio_image_annotation==0.4.1")
6
+ # os.system("pip uninstall -y huggingface-hub")
7
+ # os.system("pip install huggingface-hub==0.35.3")
8
+
9
+
10
+ import torch
11
+ from PIL import Image
12
+ import gradio as gr
13
+ from gradio_image_annotation import image_annotator
14
+ import numpy as np
15
+ import random
16
+
17
+ from diffusers import FluxTransformer2DModel, FluxKontextPipeline
18
+ from safetensors.torch import load_file
19
+ from huggingface_hub import hf_hub_download
20
+ from src.lora_helper import set_single_lora
21
+ from src.detail_encoder import DetailEncoder
22
+ from src.kontext_custom_pipeline import FluxKontextPipelineWithPhotoEncoderAddTokens
23
+ import spaces
24
+ from uno.flux.pipeline import UNOPipeline
25
+
26
+ hf_hub_download(
27
+ repo_id="ziheng1234/ImageCritic",
28
+ filename="detail_encoder.safetensors",
29
+ local_dir="models" # 下载到本地 models/ 目录
30
+ )
31
+ hf_hub_download(
32
+ repo_id="ziheng1234/ImageCritic",
33
+ filename="lora.safetensors",
34
+ local_dir="models"
35
+ )
36
+
37
+ from huggingface_hub import snapshot_download
38
+ repo_id = "ziheng1234/kontext"
39
+ local_dir = "./kontext"
40
+ snapshot_download(
41
+ repo_id=repo_id,
42
+ local_dir=local_dir,
43
+ repo_type="model",
44
+ resume_download=True,
45
+ max_workers=8
46
+ )
47
+ base_path = "./models"
48
+ detail_encoder_path = f"{base_path}/detail_encoder.safetensors"
49
+ kontext_lora_path = f"{base_path}/lora.safetensors"
50
+
51
+
52
+ def pick_kontext_resolution(w: int, h: int) -> tuple[int, int]:
53
+ PREFERRED_KONTEXT_RESOLUTIONS = [
54
+ (672, 1568), (688, 1504), (720, 1456), (752, 1392),
55
+ (800, 1328), (832, 1248), (880, 1184), (944, 1104),
56
+ (1024, 1024), (1104, 944), (1184, 880), (1248, 832),
57
+ (1328, 800), (1392, 752), (1456, 720), (1504, 688), (1568, 672),
58
+ ]
59
+ target_ratio = w / h
60
+ return min(
61
+ PREFERRED_KONTEXT_RESOLUTIONS,
62
+ key=lambda wh: abs((wh[0] / wh[1]) - target_ratio),
63
+ )
64
+
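To make the selection rule concrete, here is a minimal, self-contained sketch of the same aspect-ratio matching (using only a subset of the preferred resolutions for illustration); the full candidate list lives in `pick_kontext_resolution` above.

```python
# Minimal sketch of the aspect-ratio matching performed by pick_kontext_resolution:
# the candidate whose width/height ratio is closest to the input's ratio wins.
PREFERRED_SUBSET = [(832, 1248), (880, 1184), (944, 1104), (1024, 1024)]  # illustration only

def closest_resolution(w: int, h: int) -> tuple[int, int]:
    target_ratio = w / h
    return min(PREFERRED_SUBSET, key=lambda wh: abs(wh[0] / wh[1] - target_ratio))

print(closest_resolution(640, 900))  # -> (880, 1184): ratio 0.743 is nearest to 0.711
```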
65
+
66
+ MAX_SEED = np.iinfo(np.int32).max
67
+
68
+ device = None
69
+ pipeline = None
70
+ transformer = None
71
+ detail_encoder = None
72
+ stage1_pipeline = None
73
+
74
+ @spaces.GPU(duration=200)
75
+ def load_stage1_model():
76
+ global stage1_pipeline, device
77
+
78
+ if stage1_pipeline is not None:
79
+ return
80
+
81
+ print("加载 Stage 1 UNO Pipeline...")
82
+ if device is None:
83
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
84
+
85
+ model_type = "flux-dev"
86
+ stage1_pipeline = UNOPipeline(model_type, device, offload=False, only_lora=True, lora_rank=512)
87
+ print("Stage 1 模型加载完成!")
88
+
89
+ @spaces.GPU(duration=200)
90
+ def load_models():
91
+ global device, pipeline, transformer, detail_encoder
92
+
93
+ if pipeline is not None and transformer is not None and detail_encoder is not None:
94
+ return
95
+
96
+ print("CUDA 可用:", torch.cuda.is_available())
97
+ device = "cuda:0" if torch.cuda.is_available() else "cpu"
98
+ print("使用设备:", device)
99
+
100
+ dtype = torch.bfloat16 if "cuda" in device else torch.float32
101
+
102
+ print("加载 FluxKontextPipelineWithPhotoEncoderAddTokens...")
103
+ pipeline_local = FluxKontextPipelineWithPhotoEncoderAddTokens.from_pretrained(
104
+ "./kontext",
105
+ torch_dtype=dtype,
106
+ )
107
+ pipeline_local.to(device)
108
+
109
+ print("加载 FluxTransformer2DModel...")
110
+ transformer_local = FluxTransformer2DModel.from_pretrained(
111
+ "./kontext",
112
+ subfolder="transformer",
113
+ torch_dtype=dtype,
114
+ )
115
+ transformer_local.to(device)
116
+
117
+ print("加载 detail_encoder 权重...")
118
+ state_dict = load_file(detail_encoder_path)
119
+ detail_encoder_local = DetailEncoder().to(dtype=transformer_local.dtype, device=device)
120
+ detail_encoder_local.to(device)
121
+
122
+ with torch.no_grad():
123
+ for name, param in detail_encoder_local.named_parameters():
124
+ if name in state_dict:
125
+ added = state_dict[name].to(param.device)
126
+ param.add_(added)
127
+
128
+ pipeline_local.transformer = transformer_local
129
+ pipeline_local.detail_encoder = detail_encoder_local
130
+
131
+ print("加载 LoRA...")
132
+ set_single_lora(pipeline_local.transformer, kontext_lora_path, lora_weights=[1.0])
133
+
134
+ print("模型加载完成!")
135
+
136
+ # write back to the globals
137
+ pipeline = pipeline_local
138
+ transformer = transformer_local
139
+ detail_encoder = detail_encoder_local
140
+
141
+ @spaces.GPU(duration=200)
142
+ def generate_image_method1(input_image, prompt, width, height, seed=42, randomize_seed=False, guidance_scale=2.5, steps=28):
143
+ """
144
+ Stage 1 - Method 1: UNO image generation
145
+ """
146
+ load_stage1_model()
147
+ global stage1_pipeline
148
+
149
+ if randomize_seed:
150
+ seed = -1
151
+
152
+ try:
153
+ # UNO pipeline uses gradio_generate interface
154
+ output_image, output_file = stage1_pipeline.gradio_generate(
155
+ prompt=prompt,
156
+ width=int(width),
157
+ height=int(height),
158
+ guidance=guidance_scale,
159
+ num_steps=steps,
160
+ seed=seed,
161
+ image_prompt1=input_image,
162
+ image_prompt2=None,
163
+ image_prompt3=None,
164
+ image_prompt4=None,
165
+ )
166
+ used_seed = seed if seed != -1 else random.randint(0, MAX_SEED)
167
+ return output_image, used_seed
168
+ except Exception as e:
169
+ print(f"Stage 1 生成图像时发生错误: {e}")
170
+ raise gr.Error(f"生成失败:{str(e)}")
171
+
172
+ def extract_first_box(annotations: dict):
173
+ """
174
+ Take the first bbox from the gradio_image_annotation payload, together with the full PIL image and the cropped patch.
175
+
176
+ If no bbox was drawn, the whole image is used as the bbox.
177
+ """
178
+ if not annotations:
179
+ raise gr.Error("Missing annotation data. Please check if an image is uploaded.")
180
+
181
+ img_array = annotations.get("image", None)
182
+ boxes = annotations.get("boxes", [])
183
+
184
+ if img_array is None:
185
+ raise gr.Error("No 'image' field found in annotation.")
186
+
187
+ img = Image.fromarray(img_array)
188
+
189
+ # fall back to the full image when no bbox was drawn
190
+ if not boxes:
191
+ w, h = img.size
192
+ xmin, ymin, xmax, ymax = 0, 0, w, h
193
+ else:
194
+ box = boxes[0]
195
+ xmin = int(box["xmin"])
196
+ ymin = int(box["ymin"])
197
+ xmax = int(box["xmax"])
198
+ ymax = int(box["ymax"])
199
+
200
+ if xmax <= xmin or ymax <= ymin:
201
+ raise gr.Error("Invalid bbox, please draw the box again.")
202
+
203
+ patch = img.crop((xmin, ymin, xmax, ymax))
204
+ return img, patch, (xmin, ymin, xmax, ymax)
205
+
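For reference, a rough sketch of the annotation payload this helper consumes (the key names follow the code above: an `image` array plus `boxes` with pixel coordinates); the exact structure is whatever `gradio_image_annotation` returns, so treat this as illustrative only.

```python
import numpy as np

# Illustrative annotation dict in the shape extract_first_box expects:
# an H x W x 3 uint8 image and a list of pixel-coordinate boxes.
fake_annotation = {
    "image": np.zeros((512, 512, 3), dtype=np.uint8),
    "boxes": [{"xmin": 100, "ymin": 120, "xmax": 300, "ymax": 360}],
}
img, patch, (xmin, ymin, xmax, ymax) = extract_first_box(fake_annotation)
print(patch.size)  # -> (200, 240): the cropped PIL patch (width, height)
```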
206
+ @spaces.GPU(duration=200)
207
+ def run_with_two_bboxes(
208
+ annotations_A: dict | None,  # reference image annotation
209
+ annotations_B: dict | None,  # annotation of the image to be corrected
210
+ object_name: str,
211
+ base_seed: int = 0,
212
+ ): # noqa: C901
213
+ """
214
+ """
215
+
216
+ load_models()
217
+ global pipeline, device
218
+ if annotations_A is None:
219
+ raise gr.Error("please upload reference image and draw a bbox")
220
+ if annotations_B is None:
221
+ raise gr.Error("please upload input image to be corrected and draw a bbox")
222
+
223
+ # 1. extract the bbox, full image and cropped patch from each annotation
224
+ img1_full, patch_A, bbox_A = extract_first_box(annotations_A)
225
+ img2_full, patch_B, bbox_B = extract_first_box(annotations_B)
226
+
227
+ xmin_B, ymin_B, xmax_B, ymax_B = bbox_B
228
+ patch_w = xmax_B - xmin_B
229
+ patch_h = ymax_B - ymin_B
230
+
231
+ if not object_name:
232
+ object_name = "object"
233
+
234
+ # 2. pick the preferred Kontext resolution closest to the patch aspect ratio
235
+ orig_w, orig_h = patch_B.size
236
+ target_w, target_h = pick_kontext_resolution(orig_w, orig_h)
237
+ width_for_model, height_for_model = target_w, target_h
238
+
239
+ # 3. resize both patches to the model resolution
240
+ cond_A_image = patch_A.resize((width_for_model, height_for_model), Image.Resampling.LANCZOS)
241
+ cond_B_image = patch_B.resize((width_for_model, height_for_model), Image.Resampling.LANCZOS)
242
+
243
+ prompt = f"use the {object_name} in IMG1 as a reference to refine, replace, enhance the {object_name} in IMG2"
244
+ print("prompt:", prompt)
245
+
246
+ seed = int(base_seed)
247
+ gen_device = device.split(":")[0] if "cuda" in device else device
248
+ generator = torch.Generator(gen_device).manual_seed(seed)
249
+
250
+ try:
251
+ out = pipeline(
252
+ image_A=cond_A_image,
253
+ image_B=cond_B_image,
254
+ prompt=prompt,
255
+ height=height_for_model,
256
+ width=width_for_model,
257
+ guidance_scale=3.5,
258
+ generator=generator,
259
+ )
260
+
261
+ gen_patch_model = out.images[0]
262
+
263
+ # resize the generated patch back to the original bbox size
264
+ gen_patch = gen_patch_model.resize((patch_w, patch_h), Image.Resampling.LANCZOS)
265
+
266
+ # paste the corrected patch back and build a side-by-side collage
267
+ composed = img2_full.copy()
268
+ composed.paste(gen_patch, (xmin_B, ymin_B))
269
+ patch_A_resized = patch_A.resize((patch_w, patch_h), Image.Resampling.LANCZOS)
270
+ patch_B_resized = patch_B.resize((patch_w, patch_h), Image.Resampling.LANCZOS)
271
+ SPACING = 10
272
+ collage_w = patch_w * 3 + SPACING * 2
273
+ collage_h = patch_h
274
+
275
+ collage = Image.new("RGB", (collage_w, collage_h), (255, 255, 255))
276
+
277
+ x0 = 0
278
+ x1 = patch_w + SPACING
279
+ x2 = patch_w * 2 + SPACING * 2
280
+
281
+ collage.paste(patch_A_resized, (x0, 0))
282
+ collage.paste(patch_B_resized, (x1, 0))
283
+ collage.paste(gen_patch, (x2, 0))
284
+
285
+ return collage, composed
286
+
287
+ except Exception as e:
288
+ print(f"生成图像时发生错误: {e}")
289
+ raise gr.Error(f"生成失败:{str(e)}")
290
+
291
+
292
+ import gradio as gr
293
+
294
+ with gr.Blocks(
295
+ theme=gr.themes.Soft(primary_hue="blue", neutral_hue="slate"),
296
+ css="""
297
+ /* Global Clean Font */
298
+
299
+
300
+ /* Center container */
301
+ .app-container {
302
+ width: 100% !important;
303
+ max-width: 100% !important;
304
+ margin: 0 auto;
305
+ }
306
+
307
+ /* Title block */
308
+ .title-block h1 {
309
+ text-align: center;
310
+ font-size: 3rem;
311
+ font-weight: 1100;
312
+
313
+ /* blue-purple gradient */
314
+ background: linear-gradient(90deg, #5b8dff, #b57aff);
315
+ -webkit-background-clip: text;
316
+ color: transparent;
317
+ }
318
+
319
+ .title-block h2 {
320
+ text-align: center;
321
+ font-size: 1.6rem;
322
+ font-weight: 700;
323
+ margin-top: 0.4rem;
324
+
325
+ /* slightly softer gradient */
326
+ background: linear-gradient(90deg, #6da0ff, #c28aff);
327
+ -webkit-background-clip: text;
328
+ color: transparent;
329
+ }
330
+
331
+ /* Title block
332
+
333
+ .title-block h1 {
334
+ text-align: center; font-size: 2.4rem; font-weight: 800; color: #1f2937;
335
+ }
336
+ .title-block h2 {
337
+ text-align: center; font-size: 1.2rem; font-weight: 500; color: #303030; margin-top: 0.4rem;
338
+ }
339
+ */
340
+
341
+ /* Simple card */
342
+ .clean-card {
343
+ background: #ffffff;
344
+ border: 1px solid #e5e7eb;
345
+ border-radius: 12px;
346
+ padding: 14px 16px;
347
+ margin-bottom: 10px;
348
+ }
349
+
350
+ /* Card title */
351
+ .clean-card-title {
352
+ font-size: 1.3rem;
353
+ font-weight: 600;
354
+ color: #404040;
355
+ margin-bottom: 6px;
356
+ }
357
+
358
+ /* Subtitle */
359
+ .clean-card-subtitle {
360
+ font-size: 1.1rem;
361
+ color: #404040;
362
+ margin-bottom: 8px;
363
+ }
364
+
365
+ /* Output card */
366
+ .output-card {
367
+ background: #ffffff;
368
+ border: 1px solid #d1d5db;
369
+ border-radius: 12px;
370
+ padding: 14px 16px;
371
+ }
372
+ .output-card1 {
373
+ background: #ffffff;
374
+ border: none !important;
375
+ box-shadow: none !important;
376
+ border-radius: 12px;
377
+ padding: 14px 16px;
378
+ }
379
+
380
+ /* Gradient primary button: works whether the button itself has .color-btn or its wrapper does */
381
+ button.color-btn,
382
+ .color-btn button {
383
+ width: 100%;
384
+ background: linear-gradient(90deg, #3b82f6 0%, #6366f1 100%) !important;
385
+ color: #ffffff !important;
386
+ font-size: 1.05rem !important;
387
+ font-weight: 700 !important;
388
+ padding: 14px !important;
389
+ border-radius: 12px !important;
390
+
391
+ border: none !important;
392
+ box-shadow: 0 4px 12px rgba(99, 102, 241, 0.25) !important;
393
+ transition: 0.2s ease !important;
394
+ cursor: pointer;
395
+ }
396
+
397
+ /* Hover effect */
398
+ button.color-btn:hover,
399
+ .color-btn button:hover {
400
+ opacity: 0.92 !important;
401
+ transform: translateY(-1px) !important;
402
+ }
403
+
404
+ /* Pressed feedback */
405
+ button.color-btn:active,
406
+ .color-btn button:active {
407
+ transform: scale(0.98) !important;
408
+ }
409
+
410
+ /* If there is an outer wrapper, make it transparent (avoids an extra white bar) */
411
+ .color-btn > div {
412
+ background: transparent !important;
413
+ box-shadow: none !important;
414
+ border: none !important;
415
+ }
416
+
417
+ .example-image img {
418
+ height: 400px !important;
419
+ object-fit: contain;
+ }
420
+
421
+ """
422
+ ) as demo:
423
+ gen_patch_out = None
424
+ composed_out = None
425
+ # -------------------------------------------------------
426
+ # Title
427
+ # -------------------------------------------------------
428
+ gr.Markdown(
429
+ """
430
+ <div class="title-block">
431
+ <h1>The Consistency Critic:</h1>
432
+ <h2>Correcting Inconsistencies in Generated Images via Reference-Guided Attentive Alignment</h2>
433
+ </div>
434
+ """
435
+ )
436
+
437
+ # ========================================================
438
+ # show the two stages side by side
439
+ # ========================================================
440
+ with gr.Row(elem_classes="app-container"):
441
+ # ========================================================
442
+ # STAGE 1: Image Generation (left column)
443
+ # ========================================================
444
+ with gr.Column(scale=1):
445
+ gr.Markdown(
446
+ """
447
+ <div class="clean-card">
448
+ <div class="clean-card-title">🎨 Stage 1: Customized Image Generation</div>
449
+ <div class="clean-card-subtitle">Generate images from prompts and reference image using UNO method. The output can be used as input for Stage 2.</div>
450
+ </div>
451
+ """
452
+ )
453
+
454
+ # Stage 1 Input
455
+ gr.Markdown("### Input")
456
+ stage1_input_image = gr.Image(label="Input Image (Optional)", type="pil")
457
+ stage1_prompt = gr.Textbox(
458
+ label="Prompt",
459
+ placeholder="Enter your prompt for image generation",
460
+ lines=3
461
+ )
462
+
463
+ with gr.Row():
464
+ with gr.Column():
465
+ stage1_width = gr.Slider(512, 2048, 1024, step=16, label="Generation Width")
466
+ stage1_height = gr.Slider(512, 2048, 1024, step=16, label="Generation Height")
467
+ with gr.Accordion("Advanced Settings", open=False):
468
+ stage1_seed = gr.Slider(
469
+ label="Seed",
470
+ minimum=0,
471
+ maximum=MAX_SEED,
472
+ step=1,
473
+ value=42,
474
+ )
475
+ stage1_randomize_seed = gr.Checkbox(label="Randomize seed", value=True)
476
+ stage1_guidance_scale = gr.Slider(
477
+ label="Guidance Scale",
478
+ minimum=1,
479
+ maximum=10,
480
+ step=0.1,
481
+ value=2.5,
482
+ )
483
+ stage1_steps = gr.Slider(
484
+ label="Steps",
485
+ minimum=1,
486
+ maximum=30,
487
+ value=28,
488
+ step=1
489
+ )
490
+
491
+ stage1_method1_btn = gr.Button("✨ Generate Image", elem_classes="color-btn")
492
+
493
+ # Stage 1 Output
494
+ gr.Markdown("### Output")
495
+ stage1_output_image = gr.Image(label="Generated Image", interactive=False)
496
+ stage1_used_seed = gr.Number(label="Used Seed", interactive=False)
497
+
498
+ # -------------------------------------------------------
499
+ # Stage 1 Examples
500
+ # -------------------------------------------------------
501
+ gr.Markdown(
502
+ """
503
+ <div style="
504
+ font-size: 1.3rem;
505
+ font-weight: 600;
506
+ color: #404040;
507
+ margin-top: 16px;
508
+ margin-bottom: 6px;
509
+ ">
510
+ 📚 Stage 1 Example Images & Prompts
511
+ </div>
512
+ """,
513
+ )
514
+ gr.Markdown(
515
+ """
516
+ <div style="
517
+ font-size: 1.1rem;
518
+ color: #404040;
519
+ margin-bottom: 8px;
520
+ ">
521
+ Click on any example below to load the image and prompt into Stage 1 inputs.
522
+ </div>
523
+ """,
524
+ )
525
+
526
+ gr.Examples(
527
+ examples=[
528
+ ["./test_imgs/product_3.png", "In a softly lit nursery, a baby sleeps peacefully as a parent gently applies the product to a washcloth. The scene is calm and warm, with natural light highlighting the product’s label. The camera captures a close-up, centered view, emphasizing the product’s presence and its gentle interaction with the environment."],
529
+ ["./test_imgs/3.png", "Create an engaging lifestyle e-commerce scene where a person delicately picks up the product from a slightly shifted angle to add depth and realism, placing it within a creative photography workspace filled with soft natural light, scattered camera gear, open photo books, and warm wooden textures."],
530
+ ["./test_imgs/2.png", "Create a stylish e-commerce scene featuring the product displayed on a modern clothing rack in a bright boutique environment, surrounded by soft natural lighting, minimalistic decor, and complementary fashion accessories"]
531
+ ],
532
+ inputs=[stage1_input_image, stage1_prompt],
533
+ label="Click to Load Examples"
534
+ )
535
+
536
+ # ========================================================
537
+ # STAGE 2: Image Correction (right column)
538
+ # ========================================================
539
+ with gr.Column(scale=1):
540
+ gr.Markdown(
541
+ """
542
+ <div class="clean-card">
543
+ <div class="clean-card-title">🔧 Stage 2: Image Consistency Correction</div>
544
+ <div class="clean-card-subtitle">Refine and correct generated images using ImageCritic.</div>
545
+ </div>
546
+ """
547
+ )
548
+
549
+ # -------------------------------------------------------
550
+ # Tips for Stage 2
551
+ # -------------------------------------------------------
552
+ gr.Markdown(
553
+ """
554
+ <div class="clean-card">
555
+ <div class="clean-card-title">💡 Stage 2 Tips</div>
556
+ <div class="clean-card-subtitle">
557
+ • Crop both the bbox that needs to be corrected and the reference bbox, preferably covering the smallest repeating unit, to achieve better results.<br>
558
+ • The bbox area should ideally cover the region to be corrected and the reference region as completely as possible.<br>
559
+ • The aspect ratio of the bboxes should also be kept consistent to avoid errors caused by incorrect scaling.<br>
560
+ • If the model fails to correct the image, it may be because the generated image is too similar to the reference image, causing the model to skip the repair. You can manually <b>paint that area black in a drawing tool before sending it to the model, or crop only the local region and perform multiple rounds of correction to progressively enhance the whole generated image.</b>
561
+ </div>
562
+ """
563
+ )
564
+
565
+ # -------------------------------------------------------
566
+ # Image annotation area
567
+ # -------------------------------------------------------
568
+ with gr.Row():
569
+ # Left: Reference Image
570
+ with gr.Column():
571
+ gr.Markdown(
572
+ """
573
+ <div class="clean-card">
574
+ <div class="clean-card-title">📌 Reference Image</div>
575
+ <div class="clean-card-subtitle">Draw a bounding box around the area for reference.</div>
576
+ </div>
577
+ """
578
+ )
579
+
580
+ annotator_A = image_annotator(
581
+ value=None,
582
+ label="reference image",
583
+ label_list=["bbox for reference"],
584
+ label_colors = [(168, 160, 194)],
585
+ single_box=True,
586
+ image_type="numpy",
587
+ sources=["upload", "clipboard"],
588
+ height=300,
589
+ )
590
+
591
+ # Right: Image to be corrected
592
+ with gr.Column():
593
+ gr.Markdown(
594
+ """
595
+ <div class="clean-card">
596
+ <div class="clean-card-title">🖼️ Input Image To Be Corrected</div>
597
+ <div class="clean-card-subtitle">Use the mouse wheel to zoom and draw a bounding box around the area to be corrected.</div>
598
+ </div>
599
+ """
600
+ )
601
+
602
+ annotator_B = image_annotator(
603
+ value=None,
604
+ label="input image to be corrected",
605
+ label_list=["bbox for correction"],
606
+ label_colors = [(168, 160, 194)],
607
+ single_box=True,
608
+ image_type="numpy",
609
+ sources=["upload", "clipboard"],
610
+ height=300,
611
+ )
612
+
613
+ # -------------------------------------------------------
614
+ # Controls
615
+ # -------------------------------------------------------
616
+ with gr.Row():
617
+ object_name = gr.Textbox(
618
+ label="Caption for object (optional; using 'product' also works)",
619
+ value="product",
620
+ placeholder="e.g. product, shoes, bag, face ..."
621
+ )
622
+
623
+ base_seed = gr.Number(
624
+ label="Seed",
625
+ value=0,
626
+ precision=0,
627
+ )
628
+
629
+ # -------------------------------------------------------
630
+ # Run Button
631
+ # -------------------------------------------------------
632
+ run_btn = gr.Button("✨ Generate ", elem_classes="color-btn")
633
+
634
+ # ===================== Output area =====================
635
+ gr.Markdown("### Output")
636
+ with gr.Column(elem_classes="output-card1"):
637
+ gen_patch_out = gr.Image(
638
+ label="concatenated input-output",
639
+ interactive=False
640
+ )
641
+
642
+ with gr.Column(elem_classes="output-card1"):
643
+ composed_out = gr.Image(
644
+ label="corrected image",
645
+ interactive=False
646
+ )
647
+
648
+ # -------------------------------------------------------
649
+ # Put the whole Stage 2 example area inside a white card
650
+ # -------------------------------------------------------
651
+ with gr.Column(elem_classes="clean-card"):
652
+
653
+ gr.Markdown(
654
+ """
655
+ <div style="
656
+ font-size: 1.3rem;
657
+ font-weight: 600;
658
+ color: #404040;
659
+ margin-bottom: 6px;
660
+ ">
661
+ 📚 Example Images
662
+ </div>
663
+ """,
664
+ )
665
+
666
+ gr.Markdown(
667
+ """
668
+ <div style="
669
+ font-size: 1.1rem;
670
+ color: #404040;
671
+ margin-bottom: 8px;
672
+ ">
673
+ Below are some example pairs showing how bounding boxes should be drawn.
674
+ You can click and drag the image below into the upper area for generation.<br>
675
+ <b> Full-image input is also supported, but it is recommended to use the smallest possible bounding box that covers the region to be corrected and reference bbox. For example, the bbox approach used in the first row generally produces better results than the one used in the second row.</b>
676
+ </div>
677
+ """,
678
+ )
679
+ with gr.Row():
680
+ gr.Image("./test_imgs/product_3.png",label="reference example", elem_classes="example-image")
681
+ gr.Image("./test_imgs/product_3_bbox_1.png",label="reference example with bbox",elem_classes="example-image")
682
+ gr.Image("./test_imgs/generated_3.png",label="input example", elem_classes="example-image")
683
+ gr.Image("./test_imgs/generated_3_bbox_1.png",label="input example with bbox", elem_classes="example-image")
684
+
685
+
686
+ with gr.Row():
687
+ gr.Image("./test_imgs/product_3.png",label="reference example", elem_classes="example-image")
688
+ gr.Image("./test_imgs/product_3_bbox.png",label="reference example with bbox",elem_classes="example-image")
689
+ gr.Image("./test_imgs/generated_3.png",label="input example", elem_classes="example-image")
690
+ gr.Image("./test_imgs/generated_3_bbox.png",label="input example with bbox", elem_classes="example-image")
691
+
692
+ with gr.Row():
693
+ gr.Image("./test_imgs/product_1.jpg", label="reference example", elem_classes="example-image")
694
+ gr.Image("./test_imgs/product_1_bbox.png", label="reference example with bbox", elem_classes="example-image")
695
+ gr.Image("./test_imgs/generated_1.png", label="input example", elem_classes="example-image")
696
+ gr.Image("./test_imgs/generated_1_bbox.png", label="input example with bbox", elem_classes="example-image")
697
+
698
+ with gr.Row():
699
+ gr.Image("./test_imgs/product_2.png",label="reference example", elem_classes="example-image")
700
+ gr.Image("./test_imgs/product_2_bbox.png",label="reference example with bbox",elem_classes="example-image")
701
+ gr.Image("./test_imgs/generated_2.png", label="input example", elem_classes="example-image")
702
+ gr.Image("./test_imgs/generated_2_bbox.png", label="input example with bbox", elem_classes="example-image")
703
+
704
+ # ========= Bind the button clicks after all components are defined =========
705
+ # Stage 1: Image Generation
706
+ stage1_method1_btn.click(
707
+ fn=generate_image_method1,
708
+ inputs=[stage1_input_image, stage1_prompt, stage1_width, stage1_height, stage1_seed, stage1_randomize_seed, stage1_guidance_scale, stage1_steps],
709
+ outputs=[stage1_output_image, stage1_used_seed],
710
+ )
711
+
712
+ # Stage 2: Image Correction
713
+ run_btn.click(
714
+ fn=run_with_two_bboxes,
715
+ inputs=[annotator_A, annotator_B, object_name, base_seed],
716
+ outputs=[gen_patch_out, composed_out],
717
+ )
718
+
719
+ if __name__ == "__main__":
720
+ demo.launch(server_name="0.0.0.0", server_port=7779)
requirements.txt ADDED
@@ -0,0 +1,34 @@
1
+ --extra-index-url https://download.pytorch.org/whl/cu124
2
+ torch
3
+ torchvision
4
+ accelerate==1.10.0
5
+ clip @ git+https://github.com/openai/CLIP.git@dcba3cb2e2827b402d2701e7e1c7d9fed8a20ef1
6
+ contourpy==1.3.2
7
+ cycler==0.12.1
8
+ datasets==4.0.0
9
+ decord==0.6.0
10
+ diffusers @ git+https://github.com/huggingface/diffusers.git@345864eb852b528fd1f4b6ad087fa06e0470006b
11
+ gradio==5.49.1
12
+ gradio_client==1.13.3
13
+ gradio_image_annotation==0.4.1
14
+ huggingface-hub==0.35.3
15
+ ipykernel==7.0.1
16
+ ipython==8.37.0
17
+ Jinja2==3.1.6
18
+ multiprocess==0.70.16
19
+ ninja==1.13.0
20
+ numpy==2.2.6
21
+ open_clip_torch==3.2.0
22
+ openai==1.107.2
23
+ opencv-python==4.12.0.88
24
+ opencv-python-headless==4.12.0.88
25
+ qwen-vl-utils==0.0.11
26
+ requests==2.32.5
27
+ safetensors==0.6.2
28
+ scikit-learn==1.7.2
29
+ tornado==6.5.2
30
+ tqdm==4.67.1
31
+ transformers==4.51.3
32
+ wandb==0.21.1
33
+ einops
34
+ sentencepiece
src/__init__.py ADDED
File without changes
src/attention_processor.py ADDED
@@ -0,0 +1,146 @@
1
+ import torch
2
+ import torch.nn.functional as F
3
+ from typing import Optional, Tuple, Dict, Any
4
+ import os
5
+ import numpy as np
6
+ from PIL import Image
7
+ import matplotlib.pyplot as plt
8
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
9
+
10
+ class VisualFluxAttnProcessor2_0(FluxAttnProcessor2_0):
11
+ """
12
+ Custom Flux attention processor that saves attention maps for visualization.
13
+ """
14
+
15
+ def __init__(self, save_attention=True, save_dir="attention_maps"):
16
+ super().__init__()
17
+ self.save_attention = save_attention
18
+ self.save_dir = save_dir
19
+ self.step_counter = 0
20
+
21
+ # create the save directory
22
+ if self.save_attention:
23
+ os.makedirs(self.save_dir, exist_ok=True)
24
+
25
+ def save_attention_map(self, attn_weights, layer_name="", step=None):
26
+ """保存注意力图"""
27
+ if not self.save_attention:
28
+ return
29
+
30
+ if step is None:
31
+ step = self.step_counter
32
+
33
+ # take the attention weights of the first batch element and the first head
34
+ attn_map = attn_weights[0, 0].detach().cpu().numpy() # [seq_len, seq_len]
35
+
36
+ # draw the heatmap
37
+ plt.figure(figsize=(12, 10))
38
+ plt.imshow(attn_map, cmap='hot', interpolation='nearest')
39
+ plt.colorbar()
40
+ plt.title(f'Attention Map - {layer_name} - Step {step}')
41
+ plt.xlabel('Key Position')
42
+ plt.ylabel('Query Position')
43
+
44
+ # save the figure
45
+ save_path = os.path.join(self.save_dir, f"attention_{layer_name}_step_{step}.png")
46
+ plt.savefig(save_path, dpi=150, bbox_inches='tight')
47
+ plt.close()
48
+
49
+ print(f"Attention map saved to: {save_path}")
50
+
51
+ def __call__(
52
+ self,
53
+ attn,
54
+ hidden_states: torch.Tensor,
55
+ encoder_hidden_states: Optional[torch.Tensor] = None,
56
+ attention_mask: Optional[torch.Tensor] = None,
57
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
58
+ use_cond: bool = False,
59
+ ) -> torch.Tensor:
60
+ batch_size, sequence_length, _ = (
61
+ hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
62
+ )
63
+
64
+ if attention_mask is not None:
65
+ attention_mask = attn.prepare_attention_mask(attention_mask, sequence_length, batch_size)
66
+ attention_mask = attention_mask.view(batch_size, attn.heads, -1, attention_mask.shape[-1])
67
+
68
+ if attn.group_norm is not None:
69
+ hidden_states = attn.group_norm(hidden_states.transpose(1, 2)).transpose(1, 2)
70
+
71
+ query = attn.to_q(hidden_states)
72
+
73
+ if encoder_hidden_states is None:
74
+ encoder_hidden_states = hidden_states
75
+ elif attn.norm_cross:
76
+ encoder_hidden_states = attn.norm_encoder_hidden_states(encoder_hidden_states)
77
+
78
+ key = attn.to_k(encoder_hidden_states)
79
+ value = attn.to_v(encoder_hidden_states)
80
+
81
+ inner_dim = key.shape[-1]
82
+ head_dim = inner_dim // attn.heads
83
+
84
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
85
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
86
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
87
+
88
+ if attn.norm_q is not None:
89
+ query = attn.norm_q(query)
90
+ if attn.norm_k is not None:
91
+ key = attn.norm_k(key)
92
+
93
+ # apply rotary position embeddings
94
+ if image_rotary_emb is not None:
95
+ query = attn.rotary_emb(query, image_rotary_emb)
96
+ if not attn.is_cross_attention:
97
+ key = attn.rotary_emb(key, image_rotary_emb)
98
+
99
+ # compute attention scores
100
+ attention_scores = torch.matmul(query, key.transpose(-2, -1)) / (head_dim ** 0.5)
101
+
102
+ if attention_mask is not None:
103
+ attention_scores = attention_scores + attention_mask
104
+
105
+ attention_probs = F.softmax(attention_scores, dim=-1)
106
+
107
+ # save the attention map
108
+ if self.save_attention and self.step_counter % 10 == 0:  # save every 10 steps
109
+ layer_name = f"layer_{self.step_counter // 10}"
110
+ self.save_attention_map(attention_probs, layer_name, self.step_counter)
111
+
112
+ # apply dropout
113
+ attention_probs = F.dropout(attention_probs, p=attn.dropout, training=attn.training)
114
+
115
+ # compute the attention output
116
+ hidden_states = torch.matmul(attention_probs, value)
117
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
118
+ hidden_states = hidden_states.to(query.dtype)
119
+
120
+ if use_cond:
121
+ # handle the conditional-branch case
122
+ seq_len = hidden_states.shape[1]
123
+ if seq_len % 2 == 0:
124
+ # assume the first half is the original hidden_states and the second half is the conditional hidden_states
125
+ mid_point = seq_len // 2
126
+ original_hidden_states = hidden_states[:, :mid_point, :]
127
+ cond_hidden_states = hidden_states[:, mid_point:, :]
128
+
129
+ # project each half separately
130
+ original_output = attn.to_out[0](original_hidden_states)
131
+ cond_output = attn.to_out[0](cond_hidden_states)
132
+
133
+ if len(attn.to_out) > 1:
134
+ original_output = attn.to_out[1](original_output)
135
+ cond_output = attn.to_out[1](cond_output)
136
+
137
+ self.step_counter += 1
138
+ return original_output, cond_output
139
+
140
+ # standard output projection
141
+ hidden_states = attn.to_out[0](hidden_states)
142
+ if len(attn.to_out) > 1:
143
+ hidden_states = attn.to_out[1](hidden_states)
144
+
145
+ self.step_counter += 1
146
+ return hidden_states
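The commit does not show how this processor is wired in; as a hedged sketch (assuming the transformer exposes the usual diffusers `attn_processors` / `set_attn_processor` API), it could be attached roughly like this:

```python
from diffusers import FluxTransformer2DModel
from src.attention_processor import VisualFluxAttnProcessor2_0

# Illustrative only: install the visualization processor on every attention layer.
transformer = FluxTransformer2DModel.from_pretrained("./kontext", subfolder="transformer")
transformer.set_attn_processor({
    name: VisualFluxAttnProcessor2_0(save_attention=True, save_dir="attention_maps")
    for name in transformer.attn_processors
})
```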
src/detail_encoder.py ADDED
@@ -0,0 +1,118 @@
1
+ # Merge image encoder and fuse module to create an ID Encoder
2
+ # send multiple ID images, we can directly obtain the updated text encoder containing a stacked ID embedding
3
+
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+ from transformers.models.clip.modeling_clip import CLIPVisionModelWithProjection
8
+ from transformers.models.clip.configuration_clip import CLIPVisionConfig
9
+ from transformers import PretrainedConfig
10
+
11
+ VISION_CONFIG_DICT = {
12
+ "hidden_size": 1024,
13
+ "intermediate_size": 4096,
14
+ "num_attention_heads": 16,
15
+ "num_hidden_layers": 24,
16
+ "patch_size": 14,
17
+ "projection_dim": 768
18
+ }
19
+
20
+ class MLP(nn.Module):
21
+ def __init__(self, in_dim, out_dim, hidden_dim, use_residual=True):
22
+ super().__init__()
23
+ if use_residual:
24
+ assert in_dim == out_dim
25
+ self.layernorm = nn.LayerNorm(in_dim)
26
+ self.fc1 = nn.Linear(in_dim, hidden_dim)
27
+ self.fc2 = nn.Linear(hidden_dim, out_dim)
28
+ self.use_residual = use_residual
29
+ self.act_fn = nn.GELU()
30
+
31
+ def forward(self, x):
32
+ residual = x
33
+ x = self.layernorm(x)
34
+ x = self.fc1(x)
35
+ x = self.act_fn(x)
36
+ x = self.fc2(x)
37
+ if self.use_residual:
38
+ x = x + residual
39
+ return x
40
+
41
+
42
+ class FuseModule(nn.Module):
43
+ def __init__(self, prompt_embed_dim, id_embed_dim):
44
+ super().__init__()
45
+ self.mlp1 = MLP(prompt_embed_dim + id_embed_dim, prompt_embed_dim, prompt_embed_dim, use_residual=False)
46
+ self.mlp2 = MLP(prompt_embed_dim, prompt_embed_dim, prompt_embed_dim, use_residual=True)
47
+ self.layer_norm = nn.LayerNorm(prompt_embed_dim)
48
+
49
+ def fuse_fn(self, prompt_embeds, id_embeds):
50
+ stacked_id_embeds = torch.cat([prompt_embeds, id_embeds], dim=-1)
51
+ stacked_id_embeds = self.mlp1(stacked_id_embeds) + prompt_embeds
52
+ stacked_id_embeds = self.mlp2(stacked_id_embeds)
53
+ stacked_id_embeds = self.layer_norm(stacked_id_embeds)
54
+ return stacked_id_embeds
55
+
56
+ def forward(
57
+ self,
58
+ prompt_embeds,
59
+ id_embeds,
60
+ class_tokens_mask,
61
+ ) -> torch.Tensor:
62
+ device = prompt_embeds.device
63
+ class_tokens_mask = class_tokens_mask.to(device)
64
+ id_embeds = id_embeds.to(prompt_embeds.dtype)
65
+ num_inputs = class_tokens_mask.sum().unsqueeze(0).to(id_embeds.device)
66
+ batch_size, max_num_inputs = id_embeds.shape[:2]
67
+ seq_length = prompt_embeds.shape[1]
68
+ flat_id_embeds = id_embeds.view(
69
+ -1, id_embeds.shape[-2], id_embeds.shape[-1]
70
+ )
71
+ valid_id_mask = (
72
+ torch.arange(max_num_inputs, device=flat_id_embeds.device)[None, :]
73
+ < num_inputs[:, None]
74
+ )
75
+ valid_id_embeds = flat_id_embeds[valid_id_mask.flatten()]
76
+
77
+ prompt_embeds = prompt_embeds.view(-1, prompt_embeds.shape[-1])
78
+ class_tokens_mask = class_tokens_mask.view(-1)
79
+ valid_id_embeds = valid_id_embeds.view(-1, valid_id_embeds.shape[-1])
80
+ image_token_embeds = prompt_embeds[class_tokens_mask]
81
+ stacked_id_embeds = self.fuse_fn(image_token_embeds, valid_id_embeds)
82
+ stacked_id_embeds = stacked_id_embeds.to(device=device, dtype=prompt_embeds.dtype)
83
+ assert class_tokens_mask.sum() == stacked_id_embeds.shape[0], f"{class_tokens_mask.sum()} != {stacked_id_embeds.shape[0]}"
84
+ prompt_embeds = prompt_embeds.masked_scatter(class_tokens_mask[:, None], stacked_id_embeds.to(prompt_embeds.dtype))
85
+ updated_prompt_embeds = prompt_embeds.view(batch_size, seq_length, -1)
86
+ return updated_prompt_embeds
87
+
88
+ class DetailEncoder(CLIPVisionModelWithProjection):
89
+ def __init__(self):
90
+
91
+ super().__init__(CLIPVisionConfig(**VISION_CONFIG_DICT))
92
+ self.visual_projection_2 = nn.Linear(1024, 1280, bias=False)
93
+ self.fuse_module = FuseModule(4096, 2048)
94
+
95
+ def forward(self, id_pixel_values, prompt_embeds, class_tokens_mask):
96
+ dtype = next(self.parameters()).dtype
97
+ device = next(self.parameters()).device
98
+ b, num_inputs, c, h, w = id_pixel_values.shape
99
+ # device setting
100
+ id_pixel_values = id_pixel_values.to(device=device, dtype=dtype)
101
+ prompt_embeds = prompt_embeds.to(device=device, dtype=dtype)
102
+ class_tokens_mask = class_tokens_mask.to(device=device)
103
+
104
+ id_pixel_values = id_pixel_values.view(b * num_inputs, c, h, w)
105
+
106
+ id_pixel_values = F.interpolate(id_pixel_values, size=(224, 224), mode="bilinear", align_corners=False)
107
+ # id embeds <--> input image
108
+ shared_id_embeds = self.vision_model(id_pixel_values)[1]
109
+ id_embeds = self.visual_projection(shared_id_embeds)
110
+ id_embeds_2 = self.visual_projection_2(shared_id_embeds)
111
+
112
+ id_embeds = id_embeds.view(b, num_inputs, 1, -1)
113
+ id_embeds_2 = id_embeds_2.view(b, num_inputs, 1, -1)
114
+
115
+ id_embeds = torch.cat((id_embeds, id_embeds_2), dim=-1)
116
+ updated_prompt_embeds = self.fuse_module(prompt_embeds, id_embeds, class_tokens_mask)
117
+ return updated_prompt_embeds
118
+
src/jsonl_datasets.py ADDED
@@ -0,0 +1,186 @@
1
+ from PIL import Image
2
+ from datasets import load_dataset
3
+ from torchvision import transforms
4
+ import random
5
+ import torch
6
+
7
+ Image.MAX_IMAGE_PIXELS = None
8
+
9
+ def multiple_16(num: float):
10
+ return int(round(num / 16) * 16)
11
+
12
+ def get_random_resolution(min_size=512, max_size=1280, multiple=16):
13
+ resolution = random.randint(min_size // multiple, max_size // multiple) * multiple
14
+ return resolution
15
+
16
+ def load_image_safely(image_path, size):
17
+ try:
18
+ image = Image.open(image_path).convert("RGB")
19
+ return image
20
+ except Exception as e:
21
+ print("file error: "+image_path)
22
+ with open("failed_images.txt", "a") as f:
23
+ f.write(f"{image_path}\n")
24
+ return Image.new("RGB", (size, size), (255, 255, 255))
25
+
26
+ def make_train_dataset(args, tokenizer, accelerator=None):
27
+ if args.train_data_dir is not None:
28
+ print("load_data")
29
+ dataset = load_dataset('json', data_files=args.train_data_dir)
30
+
31
+ column_names = dataset["train"].column_names
32
+
33
+ # Get the column names for input/target.
34
+ caption_column = args.caption_column
35
+ target_column = args.target_column
36
+ if args.subject_column is not None:
37
+ subject_columns = args.subject_column.split(",")
38
+ if args.spatial_column is not None:
39
+ spatial_columns= args.spatial_column.split(",")
40
+
41
+ size = args.cond_size
42
+ noise_size = get_random_resolution(max_size=args.noise_size) # maybe 768 or higher
43
+ subject_cond_train_transforms = transforms.Compose(
44
+ [
45
+ transforms.Lambda(lambda img: img.resize((
46
+ multiple_16(size * img.size[0] / max(img.size)),
47
+ multiple_16(size * img.size[1] / max(img.size))
48
+ ), resample=Image.BILINEAR)),
49
+ transforms.RandomHorizontalFlip(p=0.7),
50
+ transforms.RandomRotation(degrees=20),
51
+ transforms.Lambda(lambda img: transforms.Pad(
52
+ padding=(
53
+ int((size - img.size[0]) / 2),
54
+ int((size - img.size[1]) / 2),
55
+ int((size - img.size[0]) / 2),
56
+ int((size - img.size[1]) / 2)
57
+ ),
58
+ fill=0
59
+ )(img)),
60
+ transforms.ToTensor(),
61
+ transforms.Normalize([0.5], [0.5]),
62
+ ]
63
+ )
64
+ cond_train_transforms = transforms.Compose(
65
+ [
66
+ transforms.Resize((size, size), interpolation=transforms.InterpolationMode.BILINEAR),
67
+ transforms.CenterCrop((size, size)),
68
+ transforms.ToTensor(),
69
+ transforms.Normalize([0.5], [0.5]),
70
+ ]
71
+ )
72
+
73
+ def train_transforms(image, noise_size):
74
+ train_transforms_ = transforms.Compose(
75
+ [
76
+ transforms.Lambda(lambda img: img.resize((
77
+ multiple_16(noise_size * img.size[0] / max(img.size)),
78
+ multiple_16(noise_size * img.size[1] / max(img.size))
79
+ ), resample=Image.BILINEAR)),
80
+ transforms.ToTensor(),
81
+ transforms.Normalize([0.5], [0.5]),
82
+ ]
83
+ )
84
+ transformed_image = train_transforms_(image)
85
+ return transformed_image
86
+
87
+ def load_and_transform_cond_images(images):
88
+ transformed_images = [cond_train_transforms(image) for image in images]
89
+ concatenated_image = torch.cat(transformed_images, dim=1)
90
+ return concatenated_image
91
+
92
+ def load_and_transform_subject_images(images):
93
+ transformed_images = [subject_cond_train_transforms(image) for image in images]
94
+ concatenated_image = torch.cat(transformed_images, dim=1)
95
+ return concatenated_image
96
+
97
+ tokenizer_clip = tokenizer[0]
98
+ tokenizer_t5 = tokenizer[1]
99
+
100
+ def tokenize_prompt_clip_t5(examples):
101
+ captions = []
102
+ for caption in examples[caption_column]:
103
+ if isinstance(caption, str):
104
+ if random.random() < 0.1:
105
+ captions.append(" ") # 将文本设为空
106
+ else:
107
+ captions.append(caption)
108
+ elif isinstance(caption, list):
109
+ # take a random caption if there are multiple
110
+ if random.random() < 0.1:
111
+ captions.append(" ")
112
+ else:
113
+ captions.append(random.choice(caption))
114
+ else:
115
+ raise ValueError(
116
+ f"Caption column `{caption_column}` should contain either strings or lists of strings."
117
+ )
118
+ text_inputs = tokenizer_clip(
119
+ captions,
120
+ padding="max_length",
121
+ max_length=77,
122
+ truncation=True,
123
+ return_length=False,
124
+ return_overflowing_tokens=False,
125
+ return_tensors="pt",
126
+ )
127
+ text_input_ids_1 = text_inputs.input_ids
128
+
129
+ text_inputs = tokenizer_t5(
130
+ captions,
131
+ padding="max_length",
132
+ max_length=512,
133
+ truncation=True,
134
+ return_length=False,
135
+ return_overflowing_tokens=False,
136
+ return_tensors="pt",
137
+ )
138
+ text_input_ids_2 = text_inputs.input_ids
139
+ return text_input_ids_1, text_input_ids_2
140
+
141
+ def preprocess_train(examples):
142
+ _examples = {}
143
+ if args.subject_column is not None:
144
+ subject_images = [[load_image_safely(examples[column][i], args.cond_size) for column in subject_columns] for i in range(len(examples[target_column]))]
145
+ _examples["subject_pixel_values"] = [load_and_transform_subject_images(subject) for subject in subject_images]
146
+ if args.spatial_column is not None:
147
+ spatial_images = [[load_image_safely(examples[column][i], args.cond_size) for column in spatial_columns] for i in range(len(examples[target_column]))]
148
+ _examples["cond_pixel_values"] = [load_and_transform_cond_images(spatial) for spatial in spatial_images]
149
+ target_images = [load_image_safely(image_path, args.cond_size) for image_path in examples[target_column]]
150
+ _examples["pixel_values"] = [train_transforms(image, noise_size) for image in target_images]
151
+ _examples["token_ids_clip"], _examples["token_ids_t5"] = tokenize_prompt_clip_t5(examples)
152
+ return _examples
153
+
154
+ if accelerator is not None:
155
+ with accelerator.main_process_first():
156
+ train_dataset = dataset["train"].with_transform(preprocess_train)
157
+ else:
158
+ train_dataset = dataset["train"].with_transform(preprocess_train)
159
+
160
+ return train_dataset
161
+
162
+
163
+ def collate_fn(examples):
164
+ if examples[0].get("cond_pixel_values") is not None:
165
+ cond_pixel_values = torch.stack([example["cond_pixel_values"] for example in examples])
166
+ cond_pixel_values = cond_pixel_values.to(memory_format=torch.contiguous_format).float()
167
+ else:
168
+ cond_pixel_values = None
169
+ if examples[0].get("subject_pixel_values") is not None:
170
+ subject_pixel_values = torch.stack([example["subject_pixel_values"] for example in examples])
171
+ subject_pixel_values = subject_pixel_values.to(memory_format=torch.contiguous_format).float()
172
+ else:
173
+ subject_pixel_values = None
174
+
175
+ target_pixel_values = torch.stack([example["pixel_values"] for example in examples])
176
+ target_pixel_values = target_pixel_values.to(memory_format=torch.contiguous_format).float()
177
+ token_ids_clip = torch.stack([torch.tensor(example["token_ids_clip"]) for example in examples])
178
+ token_ids_t5 = torch.stack([torch.tensor(example["token_ids_t5"]) for example in examples])
179
+
180
+ return {
181
+ "cond_pixel_values": cond_pixel_values,
182
+ "subject_pixel_values": subject_pixel_values,
183
+ "pixel_values": target_pixel_values,
184
+ "text_ids_1": token_ids_clip,
185
+ "text_ids_2": token_ids_t5,
186
+ }
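The column names come from the training arguments (`caption_column`, `target_column`, `subject_column`, `spatial_column`), so the key names below are hypothetical; a record in the JSONL file passed as `train_data_dir` could look like:

```python
import json

# Hypothetical JSONL record for make_train_dataset; the real key names are whatever
# the corresponding args.*_column options are set to, values are captions or image paths.
record = {
    "caption": "a product photo on a wooden table",  # args.caption_column
    "target": "data/target/0001.png",                 # args.target_column
    "subject": "data/subject/0001.png",               # args.subject_column (comma-separable)
    "spatial": "data/spatial/0001.png",               # args.spatial_column (comma-separable)
}
with open("train.jsonl", "a") as f:
    f.write(json.dumps(record) + "\n")
```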
src/kontext_custom_pipeline.py ADDED
The diff for this file is too large to render. See raw diff
 
src/layers.py ADDED
@@ -0,0 +1,673 @@
1
+ import inspect
2
+ import math
3
+ from typing import Callable, List, Optional, Tuple, Union
4
+ from einops import rearrange
5
+ import torch
6
+ import torch.nn.functional as F
7
+ from torch import nn
8
+ from torch import Tensor
9
+ from diffusers.models.attention_processor import Attention
10
+
11
+ # Global variables for attention visualization
12
+ step = 0
13
+ global_timestep = 0
14
+ global_timestep2 = 0
15
+
16
+ def scaled_dot_product_average_attention_map(query, key, attn_mask=None, is_causal=False, scale=None) -> torch.Tensor:
17
+ # Efficient implementation equivalent to the following:
18
+ L, S = query.size(-2), key.size(-2)
19
+ scale_factor = 1 / math.sqrt(query.size(-1)) if scale is None else scale
20
+ attn_bias = torch.zeros(L, S, dtype=query.dtype)
21
+ if is_causal:
22
+ assert attn_mask is None
23
+ temp_mask = torch.ones(L, S, dtype=torch.bool).tril(diagonal=0)
24
+ attn_bias.masked_fill_(temp_mask.logical_not(), float("-inf"))
25
+ attn_bias.to(query.dtype)
26
+
27
+ if attn_mask is not None:
28
+ if attn_mask.dtype == torch.bool:
29
+ attn_mask.masked_fill_(attn_mask.logical_not(), float("-inf"))
30
+ else:
31
+ attn_bias += attn_mask
32
+ attn_weight = query @ key.transpose(-2, -1) * scale_factor
33
+ attn_weight += attn_bias.to(attn_weight.device)
34
+ attn_weight = attn_weight.mean(dim=(1, 2))
35
+ return attn_weight
36
+
37
+ class LoRALinearLayer(nn.Module):
38
+ def __init__(
39
+ self,
40
+ in_features: int,
41
+ out_features: int,
42
+ rank: int = 4,
43
+ network_alpha: Optional[float] = None,
44
+ device: Optional[Union[torch.device, str]] = None,
45
+ dtype: Optional[torch.dtype] = None,
46
+ ):
47
+ super().__init__()
48
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
49
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
50
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
51
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
52
+ self.network_alpha = network_alpha
53
+ self.rank = rank
54
+ self.out_features = out_features
55
+ self.in_features = in_features
56
+
57
+ nn.init.normal_(self.down.weight, std=1 / rank)
58
+ nn.init.zeros_(self.up.weight)
59
+
60
+ def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
61
+ orig_dtype = hidden_states.dtype
62
+ dtype = self.down.weight.dtype
63
+
64
+ down_hidden_states = self.down(hidden_states.to(dtype))
65
+ up_hidden_states = self.up(down_hidden_states)
66
+
67
+ if self.network_alpha is not None:
68
+ up_hidden_states *= self.network_alpha / self.rank
69
+
70
+ return up_hidden_states.to(orig_dtype)
71
+
72
+
73
+ class MultiSingleStreamBlockLoraProcessor(nn.Module):
74
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
75
+ super().__init__()
76
+ # Initialize a list to store the LoRA layers
77
+ self.n_loras = n_loras
78
+ self.q_loras = nn.ModuleList([
79
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
80
+ for i in range(n_loras)
81
+ ])
82
+ self.k_loras = nn.ModuleList([
83
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
84
+ for i in range(n_loras)
85
+ ])
86
+ self.v_loras = nn.ModuleList([
87
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
88
+ for i in range(n_loras)
89
+ ])
90
+ self.lora_weights = lora_weights
91
+
92
+
93
+ def __call__(self,
94
+ attn: Attention,
95
+ hidden_states: torch.FloatTensor,
96
+ encoder_hidden_states: torch.FloatTensor = None,
97
+ attention_mask: Optional[torch.FloatTensor] = None,
98
+ image_rotary_emb: Optional[torch.Tensor] = None,
99
+ use_cond = False,
100
+ ) -> torch.FloatTensor:
101
+
102
+ batch_size, seq_len, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
103
+ query = attn.to_q(hidden_states)
104
+ key = attn.to_k(hidden_states)
105
+ value = attn.to_v(hidden_states)
106
+
107
+ for i in range(self.n_loras):
108
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
109
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
110
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
111
+
112
+ inner_dim = key.shape[-1]
113
+ head_dim = inner_dim // attn.heads
114
+
115
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
116
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
117
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
118
+
119
+ if attn.norm_q is not None:
120
+ query = attn.norm_q(query)
121
+ if attn.norm_k is not None:
122
+ key = attn.norm_k(key)
123
+
124
+ if image_rotary_emb is not None:
125
+ from diffusers.models.embeddings import apply_rotary_emb
126
+ query = apply_rotary_emb(query, image_rotary_emb)
127
+ key = apply_rotary_emb(key, image_rotary_emb)
128
+
129
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
130
+
131
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
132
+ hidden_states = hidden_states.to(query.dtype)
133
+
134
+ return hidden_states
135
+
136
+
137
+ class MultiDoubleStreamBlockLoraProcessor(nn.Module):
138
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
139
+ super().__init__()
140
+
141
+ # Initialize a list to store the LoRA layers
142
+ self.n_loras = n_loras
143
+ self.q_loras = nn.ModuleList([
144
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
145
+ for i in range(n_loras)
146
+ ])
147
+ self.k_loras = nn.ModuleList([
148
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
149
+ for i in range(n_loras)
150
+ ])
151
+ self.v_loras = nn.ModuleList([
152
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
153
+ for i in range(n_loras)
154
+ ])
155
+ self.proj_loras = nn.ModuleList([
156
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
157
+ for i in range(n_loras)
158
+ ])
159
+ self.lora_weights = lora_weights
160
+
161
+
162
+ def __call__(self,
163
+ attn: Attention,
164
+ hidden_states: torch.FloatTensor,
165
+ encoder_hidden_states: torch.FloatTensor = None,
166
+ attention_mask: Optional[torch.FloatTensor] = None,
167
+ image_rotary_emb: Optional[torch.Tensor] = None,
168
+ use_cond=False,
169
+ ) -> torch.FloatTensor:
170
+
171
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
172
+
173
+ # `context` projections.
174
+ inner_dim = 3072
175
+ head_dim = inner_dim // attn.heads
176
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
177
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
178
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
179
+
180
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
181
+ batch_size, -1, attn.heads, head_dim
182
+ ).transpose(1, 2)
183
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
184
+ batch_size, -1, attn.heads, head_dim
185
+ ).transpose(1, 2)
186
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
187
+ batch_size, -1, attn.heads, head_dim
188
+ ).transpose(1, 2)
189
+
190
+ if attn.norm_added_q is not None:
191
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
192
+ if attn.norm_added_k is not None:
193
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
194
+
195
+ query = attn.to_q(hidden_states)
196
+ key = attn.to_k(hidden_states)
197
+ value = attn.to_v(hidden_states)
198
+ for i in range(self.n_loras):
199
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
200
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
201
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
202
+
203
+ inner_dim = key.shape[-1]
204
+ head_dim = inner_dim // attn.heads
205
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
206
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
207
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
208
+
209
+ if attn.norm_q is not None:
210
+ query = attn.norm_q(query)
211
+ if attn.norm_k is not None:
212
+ key = attn.norm_k(key)
213
+
214
+ # attention
215
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
216
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
217
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
218
+
219
+ if image_rotary_emb is not None:
220
+ from diffusers.models.embeddings import apply_rotary_emb
221
+ query = apply_rotary_emb(query, image_rotary_emb)
222
+ key = apply_rotary_emb(key, image_rotary_emb)
223
+
224
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
225
+
226
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
227
+ hidden_states = hidden_states.to(query.dtype)
228
+
229
+ encoder_hidden_states, hidden_states = (
230
+ hidden_states[:, : encoder_hidden_states.shape[1]],
231
+ hidden_states[:, encoder_hidden_states.shape[1] :],
232
+ )
233
+
234
+ # Linear projection (with LoRA weight applied to each proj layer)
235
+ hidden_states = attn.to_out[0](hidden_states)
236
+ for i in range(self.n_loras):
237
+ hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
238
+ # dropout
239
+ hidden_states = attn.to_out[1](hidden_states)
240
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
241
+
242
+ return (hidden_states, encoder_hidden_states)
243
+
244
+
245
+ class MultiSingleStreamBlockLoraProcessorWithLoss(nn.Module):
246
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
247
+ super().__init__()
248
+ # Initialize a list to store the LoRA layers
249
+ self.n_loras = n_loras
250
+ self.q_loras = nn.ModuleList([
251
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
252
+ for i in range(n_loras)
253
+ ])
254
+ self.k_loras = nn.ModuleList([
255
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
256
+ for i in range(n_loras)
257
+ ])
258
+ self.v_loras = nn.ModuleList([
259
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
260
+ for i in range(n_loras)
261
+ ])
262
+ self.lora_weights = lora_weights
263
+
264
+
265
+ def __call__(self,
266
+ attn: Attention,
267
+ hidden_states: torch.FloatTensor,
268
+ encoder_hidden_states: torch.FloatTensor = None,
269
+ attention_mask: Optional[torch.FloatTensor] = None,
270
+ image_rotary_emb: Optional[torch.Tensor] = None,
271
+ use_cond = False,
272
+ ) -> torch.FloatTensor:
273
+
274
+ batch_size, seq_len, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
275
+ query = attn.to_q(hidden_states)
276
+ key = attn.to_k(hidden_states)
277
+ value = attn.to_v(hidden_states)
278
+ encoder_hidden_length = 512
279
+
280
+ length = (hidden_states.shape[-2] - encoder_hidden_length) // 3
281
+
282
+
283
+ for i in range(self.n_loras):
284
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
285
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
286
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
287
+
288
+ inner_dim = key.shape[-1]
289
+ head_dim = inner_dim // attn.heads
290
+
291
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
292
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
293
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
294
+
295
+ if attn.norm_q is not None:
296
+ query = attn.norm_q(query)
297
+ if attn.norm_k is not None:
298
+ key = attn.norm_k(key)
299
+
300
+ if image_rotary_emb is not None:
301
+ from diffusers.models.embeddings import apply_rotary_emb
302
+ query = apply_rotary_emb(query, image_rotary_emb)
303
+ key = apply_rotary_emb(key, image_rotary_emb)
304
+
305
+ # query_cond_a = query[:, :, encoder_hidden_length+length : encoder_hidden_length+2*length, :]
306
+ # query_cond_b = query[:, :, encoder_hidden_length+2*length : encoder_hidden_length+3*length, :]
307
+
308
+ # key_noise = key[:, :, encoder_hidden_length:encoder_hidden_length+length, :]
309
+
310
+
311
+ # attention_probs_query_a_key_noise = scaled_dot_product_average_attention_map(query_cond_a, key_noise, attn_mask=attention_mask, is_causal=False)
312
+ # attention_probs_query_b_key_noise = scaled_dot_product_average_attention_map(query_cond_b, key_noise, attn_mask=attention_mask, is_causal=False)
313
+
314
+ # attn.attention_probs_query_a_key_noise = attention_probs_query_a_key_noise
315
+ # attn.attention_probs_query_b_key_noise = attention_probs_query_b_key_noise
316
+
317
+
318
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
319
+
320
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
321
+ hidden_states = hidden_states.to(query.dtype)
322
+
323
+ return hidden_states
324
+
325
+
326
+ class MultiDoubleStreamBlockLoraProcessorWithLoss(nn.Module):
327
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
328
+ super().__init__()
329
+
330
+ # Initialize a list to store the LoRA layers
331
+ self.n_loras = n_loras
332
+ self.q_loras = nn.ModuleList([
333
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
334
+ for i in range(n_loras)
335
+ ])
336
+ self.k_loras = nn.ModuleList([
337
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
338
+ for i in range(n_loras)
339
+ ])
340
+ self.v_loras = nn.ModuleList([
341
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
342
+ for i in range(n_loras)
343
+ ])
344
+ self.proj_loras = nn.ModuleList([
345
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
346
+ for i in range(n_loras)
347
+ ])
348
+ self.lora_weights = lora_weights
349
+
350
+
351
+ def __call__(self,
352
+ attn: Attention,
353
+ hidden_states: torch.FloatTensor,
354
+ encoder_hidden_states: torch.FloatTensor = None,
355
+ attention_mask: Optional[torch.FloatTensor] = None,
356
+ image_rotary_emb: Optional[torch.Tensor] = None,
357
+ use_cond=False,
358
+ ) -> torch.FloatTensor:
359
+
360
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
361
+
362
+ # `context` projections.
363
+ inner_dim = 3072
364
+ head_dim = inner_dim // attn.heads
365
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
366
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
367
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
368
+
369
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
370
+ batch_size, -1, attn.heads, head_dim
371
+ ).transpose(1, 2)
372
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
373
+ batch_size, -1, attn.heads, head_dim
374
+ ).transpose(1, 2)
375
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
376
+ batch_size, -1, attn.heads, head_dim
377
+ ).transpose(1, 2)
378
+
379
+ if attn.norm_added_q is not None:
380
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
381
+ if attn.norm_added_k is not None:
382
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
383
+
384
+ query = attn.to_q(hidden_states)
385
+ key = attn.to_k(hidden_states)
386
+ value = attn.to_v(hidden_states)
387
+ length = hidden_states.shape[-2] // 3
388
+
389
+ for i in range(self.n_loras):
390
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
391
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
392
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
393
+
394
+ inner_dim = key.shape[-1]
395
+ head_dim = inner_dim // attn.heads
396
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
397
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
398
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
399
+
400
+ if attn.norm_q is not None:
401
+ query = attn.norm_q(query)
402
+ if attn.norm_k is not None:
403
+ key = attn.norm_k(key)
404
+
405
+ # attention
406
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
407
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
408
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
409
+
410
+ if image_rotary_emb is not None:
411
+ from diffusers.models.embeddings import apply_rotary_emb
412
+ query = apply_rotary_emb(query, image_rotary_emb)
413
+ key = apply_rotary_emb(key, image_rotary_emb)
414
+ encoder_hidden_length = 512
415
+
416
+ query_cond_a = query[:, :, encoder_hidden_length+length : encoder_hidden_length+2*length, :]
417
+ query_cond_b = query[:, :, encoder_hidden_length+2*length : encoder_hidden_length+3*length, :]
418
+
419
+ key_noise = key[:, :, encoder_hidden_length:encoder_hidden_length+length, :]
420
+
421
+ attention_probs_query_a_key_noise = scaled_dot_product_average_attention_map(query_cond_a, key_noise, attn_mask=attention_mask, is_causal=False)
422
+ attention_probs_query_b_key_noise = scaled_dot_product_average_attention_map(query_cond_b, key_noise, attn_mask=attention_mask, is_causal=False)
423
+
424
+ attn.attention_probs_query_a_key_noise = attention_probs_query_a_key_noise
425
+ attn.attention_probs_query_b_key_noise = attention_probs_query_b_key_noise
426
+
427
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
428
+
429
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
430
+ hidden_states = hidden_states.to(query.dtype)
431
+
432
+ encoder_hidden_states, hidden_states = (
433
+ hidden_states[:, : encoder_hidden_states.shape[1]],
434
+ hidden_states[:, encoder_hidden_states.shape[1] :],
435
+ )
436
+
437
+ # Linear projection (with LoRA weight applied to each proj layer)
438
+ hidden_states = attn.to_out[0](hidden_states)
439
+ for i in range(self.n_loras):
440
+ hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
441
+ # dropout
442
+ hidden_states = attn.to_out[1](hidden_states)
443
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
444
+
445
+ return (hidden_states, encoder_hidden_states)
446
+
447
+
448
+
449
+ class MultiDoubleStreamBlockLoraProcessor_visual(nn.Module):
450
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
451
+ super().__init__()
452
+
453
+ # Initialize a list to store the LoRA layers
454
+ self.n_loras = n_loras
455
+ self.q_loras = nn.ModuleList([
456
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
457
+ for i in range(n_loras)
458
+ ])
459
+ self.k_loras = nn.ModuleList([
460
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
461
+ for i in range(n_loras)
462
+ ])
463
+ self.v_loras = nn.ModuleList([
464
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
465
+ for i in range(n_loras)
466
+ ])
467
+ self.proj_loras = nn.ModuleList([
468
+ LoRALinearLayer(in_features, out_features, ranks[i],network_alphas[i], device=device, dtype=dtype)
469
+ for i in range(n_loras)
470
+ ])
471
+ self.lora_weights = lora_weights
472
+
473
+
474
+ def __call__(self,
475
+ attn: Attention,
476
+ hidden_states: torch.FloatTensor,
477
+ encoder_hidden_states: torch.FloatTensor = None,
478
+ attention_mask: Optional[torch.FloatTensor] = None,
479
+ image_rotary_emb: Optional[torch.Tensor] = None,
480
+ use_cond=False,
481
+ ) -> torch.FloatTensor:
482
+
483
+ batch_size, _, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
484
+
485
+ # `context` projections.
486
+ inner_dim = 3072
487
+ head_dim = inner_dim // attn.heads
488
+ encoder_hidden_states_query_proj = attn.add_q_proj(encoder_hidden_states)
489
+ encoder_hidden_states_key_proj = attn.add_k_proj(encoder_hidden_states)
490
+ encoder_hidden_states_value_proj = attn.add_v_proj(encoder_hidden_states)
491
+
492
+ encoder_hidden_states_query_proj = encoder_hidden_states_query_proj.view(
493
+ batch_size, -1, attn.heads, head_dim
494
+ ).transpose(1, 2)
495
+ encoder_hidden_states_key_proj = encoder_hidden_states_key_proj.view(
496
+ batch_size, -1, attn.heads, head_dim
497
+ ).transpose(1, 2)
498
+ encoder_hidden_states_value_proj = encoder_hidden_states_value_proj.view(
499
+ batch_size, -1, attn.heads, head_dim
500
+ ).transpose(1, 2)
501
+
502
+ if attn.norm_added_q is not None:
503
+ encoder_hidden_states_query_proj = attn.norm_added_q(encoder_hidden_states_query_proj)
504
+ if attn.norm_added_k is not None:
505
+ encoder_hidden_states_key_proj = attn.norm_added_k(encoder_hidden_states_key_proj)
506
+
507
+ query = attn.to_q(hidden_states)
508
+ key = attn.to_k(hidden_states)
509
+ value = attn.to_v(hidden_states)
510
+ length = hidden_states.shape[-2] // 3
511
+
512
+ for i in range(self.n_loras):
513
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
514
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
515
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
516
+
517
+ inner_dim = key.shape[-1]
518
+ head_dim = inner_dim // attn.heads
519
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
520
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
521
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
522
+
523
+ if attn.norm_q is not None:
524
+ query = attn.norm_q(query)
525
+ if attn.norm_k is not None:
526
+ key = attn.norm_k(key)
527
+
528
+ # attention
529
+ query = torch.cat([encoder_hidden_states_query_proj, query], dim=2)
530
+ key = torch.cat([encoder_hidden_states_key_proj, key], dim=2)
531
+ value = torch.cat([encoder_hidden_states_value_proj, value], dim=2)
532
+
533
+ if image_rotary_emb is not None:
534
+ from diffusers.models.embeddings import apply_rotary_emb
535
+ query = apply_rotary_emb(query, image_rotary_emb)
536
+ key = apply_rotary_emb(key, image_rotary_emb)
537
+ encoder_hidden_length = 512
538
+
539
+ query_cond_a = query[:, :, encoder_hidden_length+length : encoder_hidden_length+2*length, :]
540
+ query_cond_b = query[:, :, encoder_hidden_length+2*length : encoder_hidden_length+3*length, :]
541
+
542
+ key_noise = key[:, :, encoder_hidden_length:encoder_hidden_length+length, :]
543
+
544
+ attention_probs_query_a_key_noise = scaled_dot_product_average_attention_map(query_cond_a, key_noise, attn_mask=attention_mask, is_causal=False)
545
+ attention_probs_query_b_key_noise = scaled_dot_product_average_attention_map(query_cond_b, key_noise, attn_mask=attention_mask, is_causal=False)
546
+
547
+ if not hasattr(attn, 'attention_probs_query_a_key_noise'):
548
+ attn.attention_probs_query_a_key_noise = []
549
+ if not hasattr(attn, 'attention_probs_query_b_key_noise'):
550
+ attn.attention_probs_query_b_key_noise = []
551
+
552
+ global global_timestep
553
+
554
+ attn.attention_probs_query_a_key_noise.append((global_timestep//19, attention_probs_query_a_key_noise))
555
+ attn.attention_probs_query_b_key_noise.append((global_timestep//19, attention_probs_query_b_key_noise))
556
+
557
+ print(f"Global Timestep: {global_timestep//19}")
558
+
559
+ global_timestep += 1
560
+
561
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
562
+
563
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
564
+ hidden_states = hidden_states.to(query.dtype)
565
+
566
+ encoder_hidden_states, hidden_states = (
567
+ hidden_states[:, : encoder_hidden_states.shape[1]],
568
+ hidden_states[:, encoder_hidden_states.shape[1] :],
569
+ )
570
+
571
+ # Linear projection (with LoRA weight applied to each proj layer)
572
+ hidden_states = attn.to_out[0](hidden_states)
573
+ for i in range(self.n_loras):
574
+ hidden_states = hidden_states + self.lora_weights[i] * self.proj_loras[i](hidden_states)
575
+ # dropout
576
+ hidden_states = attn.to_out[1](hidden_states)
577
+ encoder_hidden_states = attn.to_add_out(encoder_hidden_states)
578
+
579
+ return (hidden_states, encoder_hidden_states)
580
+
581
+
582
+
583
+ class MultiSingleStreamBlockLoraProcessor_visual(nn.Module):
584
+ def __init__(self, in_features: int, out_features: int, ranks=[], lora_weights=[], network_alphas=[], device=None, dtype=None, n_loras=1):
585
+ super().__init__()
586
+ # Initialize a list to store the LoRA layers
587
+ self.n_loras = n_loras
588
+ self.q_loras = nn.ModuleList([
589
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
590
+ for i in range(n_loras)
591
+ ])
592
+ self.k_loras = nn.ModuleList([
593
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
594
+ for i in range(n_loras)
595
+ ])
596
+ self.v_loras = nn.ModuleList([
597
+ LoRALinearLayer(in_features, out_features, ranks[i], network_alphas[i], device=device, dtype=dtype)
598
+ for i in range(n_loras)
599
+ ])
600
+ self.lora_weights = lora_weights
601
+
602
+
603
+ def __call__(self,
604
+ attn: Attention,
605
+ hidden_states: torch.FloatTensor,
606
+ encoder_hidden_states: torch.FloatTensor = None,
607
+ attention_mask: Optional[torch.FloatTensor] = None,
608
+ image_rotary_emb: Optional[torch.Tensor] = None,
609
+ use_cond = False,
610
+ ) -> torch.FloatTensor:
611
+
612
+ batch_size, seq_len, _ = hidden_states.shape if encoder_hidden_states is None else encoder_hidden_states.shape
613
+ query = attn.to_q(hidden_states)
614
+ key = attn.to_k(hidden_states)
615
+ value = attn.to_v(hidden_states)
616
+ encoder_hidden_length = 512
617
+
618
+ length = (hidden_states.shape[-2] - encoder_hidden_length) // 3
619
+
620
+
621
+ for i in range(self.n_loras):
622
+ query = query + self.lora_weights[i] * self.q_loras[i](hidden_states)
623
+ key = key + self.lora_weights[i] * self.k_loras[i](hidden_states)
624
+ value = value + self.lora_weights[i] * self.v_loras[i](hidden_states)
625
+
626
+ inner_dim = key.shape[-1]
627
+ head_dim = inner_dim // attn.heads
628
+
629
+ query = query.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
630
+ key = key.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
631
+ value = value.view(batch_size, -1, attn.heads, head_dim).transpose(1, 2)
632
+
633
+ if attn.norm_q is not None:
634
+ query = attn.norm_q(query)
635
+ if attn.norm_k is not None:
636
+ key = attn.norm_k(key)
637
+
638
+ if image_rotary_emb is not None:
639
+ from diffusers.models.embeddings import apply_rotary_emb
640
+ query = apply_rotary_emb(query, image_rotary_emb)
641
+ key = apply_rotary_emb(key, image_rotary_emb)
642
+
643
+ if not hasattr(attn, 'attention_probs_query_a_key_noise2'):
644
+ attn.attention_probs_query_a_key_noise2 = []
645
+ if not hasattr(attn, 'attention_probs_query_b_key_noise2'):
646
+ attn.attention_probs_query_b_key_noise2 = []
647
+
648
+ query_cond_a = query[:, :, encoder_hidden_length+length : encoder_hidden_length+2*length, :]
649
+ query_cond_b = query[:, :, encoder_hidden_length+2*length : encoder_hidden_length+3*length, :]
650
+
651
+ key_noise = key[:, :, encoder_hidden_length:encoder_hidden_length+length, :]
652
+
653
+ attention_probs_query_a_key_noise2 = scaled_dot_product_average_attention_map(query_cond_a, key_noise, attn_mask=attention_mask, is_causal=False)
654
+ attention_probs_query_b_key_noise2 = scaled_dot_product_average_attention_map(query_cond_b, key_noise, attn_mask=attention_mask, is_causal=False)
655
+
656
+
657
+ global global_timestep2
658
+
659
+ attn.attention_probs_query_a_key_noise2.append((global_timestep2//38, attention_probs_query_a_key_noise2))
660
+ attn.attention_probs_query_b_key_noise2.append((global_timestep2//38, attention_probs_query_b_key_noise2))
661
+
662
+ print(f"Global Timestep2: {global_timestep2//38}")
663
+
664
+ global_timestep2 += 1
665
+
666
+
667
+
668
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=0.0, is_causal=False)
669
+
670
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, attn.heads * head_dim)
671
+ hidden_states = hidden_states.to(query.dtype)
672
+
673
+ return hidden_states
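For reference, every processor above computes its q/k/v (and output) projections as the frozen base linear plus a weighted sum of LoRA residuals. Below is a minimal stand-alone sketch of that composition; `SketchLoRALinear`, the feature sizes, and the rank/alpha values are illustrative assumptions standing in for the `LoRALinearLayer` defined earlier in src/layers.py, not the actual implementation.

import torch
import torch.nn as nn

class SketchLoRALinear(nn.Module):
    # Illustrative stand-in for LoRALinearLayer: rank-r down projection followed by an up projection.
    def __init__(self, in_features, out_features, rank, alpha):
        super().__init__()
        self.down = nn.Linear(in_features, rank, bias=False)
        self.up = nn.Linear(rank, out_features, bias=False)
        self.scale = alpha / rank

    def forward(self, x):
        return self.up(self.down(x)) * self.scale

base_to_q = nn.Linear(3072, 3072)                                   # plays the role of attn.to_q
q_loras = nn.ModuleList([SketchLoRALinear(3072, 3072, 128, 128) for _ in range(2)])
lora_weights = [1.0, 1.0]

hidden_states = torch.randn(1, 16, 3072)
query = base_to_q(hidden_states)
for w, lora in zip(lora_weights, q_loras):
    query = query + w * lora(hidden_states)                         # same accumulation as in the processors above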
src/lora_helper.py ADDED
@@ -0,0 +1,267 @@
1
+ from diffusers.models.attention_processor import FluxAttnProcessor2_0
2
+ from safetensors import safe_open
3
+ import re
4
+ import torch
5
+ from .layers import MultiDoubleStreamBlockLoraProcessor, MultiSingleStreamBlockLoraProcessor, MultiDoubleStreamBlockLoraProcessor_visual, MultiDoubleStreamBlockLoraProcessorWithLoss, MultiSingleStreamBlockLoraProcessor_visual
6
+
7
+
8
+
9
+ device = "cuda:0"
10
+
11
+ def load_safetensors(path):
12
+ tensors = {}
13
+ with safe_open(path, framework="pt", device="cpu") as f:
14
+ for key in f.keys():
15
+ tensors[key] = f.get_tensor(key)
16
+ return tensors
17
+
18
+ def get_lora_rank(checkpoint):
19
+ for k in checkpoint.keys():
20
+ if k.endswith(".down.weight"):
21
+ return checkpoint[k].shape[0]
22
+
23
+ def load_checkpoint(local_path):
24
+ if local_path is not None:
25
+ if '.safetensors' in local_path:
26
+ print(f"Loading .safetensors checkpoint from {local_path}")
27
+ checkpoint = load_safetensors(local_path)
28
+ else:
29
+ print(f"Loading checkpoint from {local_path}")
30
+ checkpoint = torch.load(local_path, map_location='cpu')
31
+ return checkpoint
32
+
33
+ def update_model_with_lora(checkpoint, lora_weights, transformer):
34
+ number = len(lora_weights)
35
+ ranks = [get_lora_rank(checkpoint) for _ in range(number)]
36
+ lora_attn_procs = {}
37
+ double_blocks_idx = list(range(19))
38
+ single_blocks_idx = list(range(38))
39
+ for name, attn_processor in transformer.attn_processors.items():
40
+ match = re.search(r'\.(\d+)\.', name)
41
+ if match:
42
+ layer_index = int(match.group(1))
43
+
44
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
45
+
46
+ lora_state_dicts = {}
47
+ for key, value in checkpoint.items():
48
+ # Match based on the layer index in the key (assuming the key contains layer index)
49
+ if re.search(r'\.(\d+)\.', key):
50
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
51
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
52
+ lora_state_dicts[key] = value
53
+
54
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
55
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, n_loras=number
56
+ )
57
+
58
+ # Load the weights from the checkpoint dictionary into the corresponding layers
59
+ for n in range(number):
60
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
61
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
62
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
63
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
64
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
65
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
66
+ lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
67
+ lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
68
+ lora_attn_procs[name].to(device)
69
+
70
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
71
+
72
+ lora_state_dicts = {}
73
+ for key, value in checkpoint.items():
74
+ # Match based on the layer index in the key (assuming the key contains layer index)
75
+ if re.search(r'\.(\d+)\.', key):
76
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
77
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
78
+ lora_state_dicts[key] = value
79
+
80
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
81
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, n_loras=number
82
+ )
83
+ # Load the weights from the checkpoint dictionary into the corresponding layers
84
+ for n in range(number):
85
+ lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
86
+ lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
87
+ lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
88
+ lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
89
+ lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
90
+ lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
91
+ lora_attn_procs[name].to(device)
92
+ else:
93
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
94
+
95
+ transformer.set_attn_processor(lora_attn_procs)
96
+
97
+
98
+ def update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size):
99
+ ck_number = len(checkpoints)
100
+ cond_lora_number = [len(ls) for ls in lora_weights]
101
+ cond_number = sum(cond_lora_number)
102
+ ranks = [get_lora_rank(checkpoint) for checkpoint in checkpoints]
103
+ multi_lora_weight = []
104
+ for ls in lora_weights:
105
+ for n in ls:
106
+ multi_lora_weight.append(n)
107
+
108
+ lora_attn_procs = {}
109
+ double_blocks_idx = list(range(19))
110
+ single_blocks_idx = list(range(38))
111
+ for name, attn_processor in transformer.attn_processors.items():
112
+ match = re.search(r'\.(\d+)\.', name)
113
+ if match:
114
+ layer_index = int(match.group(1))
115
+
116
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
117
+ lora_state_dicts = [{} for _ in range(ck_number)]
118
+ for idx, checkpoint in enumerate(checkpoints):
119
+ for key, value in checkpoint.items():
120
+ # Match based on the layer index in the key (assuming the key contains layer index)
121
+ if re.search(r'\.(\d+)\.', key):
122
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
123
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
124
+ lora_state_dicts[idx][key] = value
125
+
126
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor(
127
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, n_loras=cond_number
128
+ )
129
+
130
+ # Load the weights from the checkpoint dictionary into the corresponding layers
131
+ num = 0
132
+ for idx in range(ck_number):
133
+ for n in range(cond_lora_number[idx]):
134
+ lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
135
+ lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
136
+ lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
137
+ lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
138
+ lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
139
+ lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
140
+ lora_attn_procs[name].proj_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.down.weight', None)
141
+ lora_attn_procs[name].proj_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.proj_loras.{n}.up.weight', None)
142
+ lora_attn_procs[name].to(device)
143
+ num += 1
144
+
145
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
146
+
147
+ lora_state_dicts = [{} for _ in range(ck_number)]
148
+ for idx, checkpoint in enumerate(checkpoints):
149
+ for key, value in checkpoint.items():
150
+ # Match based on the layer index in the key (assuming the key contains layer index)
151
+ if re.search(r'\.(\d+)\.', key):
152
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
153
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
154
+ lora_state_dicts[idx][key] = value
155
+
156
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor(
157
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=multi_lora_weight, device=device, dtype=torch.bfloat16, n_loras=cond_number
158
+ )
159
+ # Load the weights from the checkpoint dictionary into the corresponding layers
160
+ num = 0
161
+ for idx in range(ck_number):
162
+ for n in range(cond_lora_number[idx]):
163
+ lora_attn_procs[name].q_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.down.weight', None)
164
+ lora_attn_procs[name].q_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.q_loras.{n}.up.weight', None)
165
+ lora_attn_procs[name].k_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.down.weight', None)
166
+ lora_attn_procs[name].k_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.k_loras.{n}.up.weight', None)
167
+ lora_attn_procs[name].v_loras[num].down.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.down.weight', None)
168
+ lora_attn_procs[name].v_loras[num].up.weight.data = lora_state_dicts[idx].get(f'{name}.v_loras.{n}.up.weight', None)
169
+ lora_attn_procs[name].to(device)
170
+ num += 1
171
+
172
+ else:
173
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
174
+
175
+ transformer.set_attn_processor(lora_attn_procs)
176
+
177
+
178
+ def set_single_lora(transformer, local_path, lora_weights=[]):
179
+ checkpoint = load_checkpoint(local_path)
180
+ update_model_with_lora(checkpoint, lora_weights, transformer)
181
+
182
+ def set_single_lora_visual(transformer, local_path, lora_weights=[]):
183
+ checkpoint = load_checkpoint(local_path)
184
+ update_model_with_lora_with_visual(checkpoint, lora_weights, transformer)
185
+
186
+ def set_multi_lora(transformer, local_paths, lora_weights=[[]], cond_size=512):
187
+ checkpoints = [load_checkpoint(local_path) for local_path in local_paths]
188
+ update_model_with_multi_lora(checkpoints, lora_weights, transformer, cond_size)
189
+
190
+ def unset_lora(transformer):
191
+ lora_attn_procs = {}
192
+ for name, attn_processor in transformer.attn_processors.items():
193
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
194
+ transformer.set_attn_processor(lora_attn_procs)
195
+
196
+ def update_model_with_lora_with_visual(checkpoint, lora_weights, transformer):
197
+ number = len(lora_weights)
198
+ ranks = [get_lora_rank(checkpoint) for _ in range(number)]
199
+ lora_attn_procs = {}
200
+ double_blocks_idx = list(range(19))
201
+ single_blocks_idx = list(range(38))
202
+ for name, attn_processor in transformer.attn_processors.items():
203
+ match = re.search(r'\.(\d+)\.', name)
204
+ if match:
205
+ layer_index = int(match.group(1))
206
+
207
+ if name.startswith("transformer_blocks") and layer_index in double_blocks_idx:
208
+
209
+ lora_state_dicts = {}
210
+ for key, value in checkpoint.items():
211
+ # Match based on the layer index in the key (assuming the key contains layer index)
212
+ if re.search(r'\.(\d+)\.', key):
213
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
214
+ if checkpoint_layer_index == layer_index and key.startswith("transformer_blocks"):
215
+ lora_state_dicts[key] = value
216
+
217
+ lora_attn_procs[name] = MultiDoubleStreamBlockLoraProcessor_visual(
218
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, n_loras=number
219
+ )
220
+
221
+ # Load the weights from the checkpoint dictionary into the corresponding layers
222
+ # for n in range(number):
223
+ # lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
224
+ # lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
225
+ # lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
226
+ # lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
227
+ # lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
228
+ # lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
229
+ # lora_attn_procs[name].proj_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.down.weight', None)
230
+ # lora_attn_procs[name].proj_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.proj_loras.{n}.up.weight', None)
231
+ # lora_attn_procs[name].to(device)
232
+
233
+ elif name.startswith("single_transformer_blocks") and layer_index in single_blocks_idx:
234
+
235
+ lora_state_dicts = {}
236
+ for key, value in checkpoint.items():
237
+ # Match based on the layer index in the key (assuming the key contains layer index)
238
+ if re.search(r'\.(\d+)\.', key):
239
+ checkpoint_layer_index = int(re.search(r'\.(\d+)\.', key).group(1))
240
+ if checkpoint_layer_index == layer_index and key.startswith("single_transformer_blocks"):
241
+ lora_state_dicts[key] = value
242
+
243
+ lora_attn_procs[name] = MultiSingleStreamBlockLoraProcessor_visual(
244
+ in_features=3072, out_features=3072, ranks=ranks, network_alphas=ranks, lora_weights=lora_weights, device=device, dtype=torch.bfloat16, n_loras=number
245
+ )
246
+ # Load the weights from the checkpoint dictionary into the corresponding layers
247
+ # for n in range(number):
248
+ # lora_attn_procs[name].q_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.down.weight', None)
249
+ # lora_attn_procs[name].q_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.q_loras.{n}.up.weight', None)
250
+ # lora_attn_procs[name].k_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.down.weight', None)
251
+ # lora_attn_procs[name].k_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.k_loras.{n}.up.weight', None)
252
+ # lora_attn_procs[name].v_loras[n].down.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.down.weight', None)
253
+ # lora_attn_procs[name].v_loras[n].up.weight.data = lora_state_dicts.get(f'{name}.v_loras.{n}.up.weight', None)
254
+ # lora_attn_procs[name].to(device)
255
+ else:
256
+ lora_attn_procs[name] = FluxAttnProcessor2_0()
257
+
258
+ transformer.set_attn_processor(lora_attn_procs)
259
+
260
+
261
+
262
+ '''
263
+ unset_lora(pipe.transformer)
264
+ lora_path = "./lora.safetensors"
265
+ lora_weights = [1, 1]
266
+ set_single_lora(pipe.transformer, local_path=lora_path, lora_weights=lora_weights)
267
+ '''
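A slightly fuller usage sketch of the helpers defined above; the `pipe` object and the checkpoint paths are placeholders, and only the call signatures are taken from this file.

from src.lora_helper import set_single_lora, set_multi_lora, unset_lora

# one checkpoint driving a single condition branch
set_single_lora(pipe.transformer, "./lora.safetensors", lora_weights=[1.0])

# two checkpoints, one condition each, applied at half strength
set_multi_lora(pipe.transformer,
               ["./subject.safetensors", "./style.safetensors"],
               lora_weights=[[0.5], [0.5]], cond_size=512)

# restore the stock FluxAttnProcessor2_0 on every block
unset_lora(pipe.transformer)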
src/pipeline.py ADDED
@@ -0,0 +1,805 @@
1
+ import inspect
2
+ from typing import Any, Callable, Dict, List, Optional, Union
3
+
4
+ import numpy as np
5
+ import torch
6
+ from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast
7
+
8
+ from diffusers.image_processor import (VaeImageProcessor)
9
+ from diffusers.loaders import FluxLoraLoaderMixin, FromSingleFileMixin
10
+ from diffusers.models.autoencoders import AutoencoderKL
11
+ from diffusers.schedulers import FlowMatchEulerDiscreteScheduler
12
+ from diffusers.utils import (
13
+ USE_PEFT_BACKEND,
14
+ is_torch_xla_available,
15
+ logging,
16
+ scale_lora_layers,
17
+ unscale_lora_layers,
18
+ )
19
+ from diffusers.utils.torch_utils import randn_tensor
20
+ from diffusers.pipelines.pipeline_utils import DiffusionPipeline
21
+ from diffusers.pipelines.flux.pipeline_output import FluxPipelineOutput
22
+ from torchvision.transforms.functional import pad
23
+ from .transformer_flux import FluxTransformer2DModel
24
+
25
+ if is_torch_xla_available():
26
+ import torch_xla.core.xla_model as xm
27
+
28
+ XLA_AVAILABLE = True
29
+ else:
30
+ XLA_AVAILABLE = False
31
+
32
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
33
+
34
+ def calculate_shift(
35
+ image_seq_len,
36
+ base_seq_len: int = 256,
37
+ max_seq_len: int = 4096,
38
+ base_shift: float = 0.5,
39
+ max_shift: float = 1.16,
40
+ ):
41
+ m = (max_shift - base_shift) / (max_seq_len - base_seq_len)
42
+ b = base_shift - m * base_seq_len
43
+ mu = image_seq_len * m + b
44
+ return mu
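The shift is a straight line through (base_seq_len, base_shift) and (max_seq_len, max_shift); with the defaults above:

calculate_shift(256)   # -> 0.50  (base_shift)
calculate_shift(2176)  # -> 0.83  (midpoint)
calculate_shift(4096)  # -> 1.16  (max_shift)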
45
+
46
+ def prepare_latent_image_ids_2(height, width, device, dtype):
47
+ latent_image_ids = torch.zeros(height//2, width//2, 3, device=device, dtype=dtype)
48
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height//2, device=device)[:, None] # y coordinate
50
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width//2, device=device)[None, :] # x coordinate
50
+ return latent_image_ids
51
+
52
+ def prepare_latent_subject_ids(height, width, device, dtype):
53
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3, device=device, dtype=dtype)
54
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2, device=device)[:, None]
55
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2, device=device)[None, :]
56
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
57
+ latent_image_ids = latent_image_ids.reshape(
58
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
59
+ )
60
+ return latent_image_ids.to(device=device, dtype=dtype)
61
+
62
+ def resize_position_encoding(batch_size, original_height, original_width, target_height, target_width, device, dtype):
63
+ latent_image_ids = prepare_latent_image_ids_2(original_height, original_width, device, dtype)
64
+ scale_h = original_height / target_height
65
+ scale_w = original_width / target_width
66
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
67
+ latent_image_ids = latent_image_ids.reshape(
68
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
69
+ )
70
+ # interpolate the spatial position encoding (PE) to the condition resolution
71
+ latent_image_ids_resized = torch.zeros(target_height//2, target_width//2, 3, device=device, dtype=dtype)
72
+ for i in range(target_height//2):
73
+ for j in range(target_width//2):
74
+ latent_image_ids_resized[i, j, 1] = i*scale_h
75
+ latent_image_ids_resized[i, j, 2] = j*scale_w
76
+ cond_latent_image_id_height, cond_latent_image_id_width, cond_latent_image_id_channels = latent_image_ids_resized.shape
77
+ cond_latent_image_ids = latent_image_ids_resized.reshape(
78
+ cond_latent_image_id_height * cond_latent_image_id_width, cond_latent_image_id_channels
79
+ )
80
+ # latent_image_ids_ = torch.concat([latent_image_ids, cond_latent_image_ids], dim=0)
81
+ return latent_image_ids, cond_latent_image_ids #, latent_image_ids_
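The nested loop above only fills scaled row/column indices; an equivalent vectorized sketch (same variables, same result up to dtype rounding) would be:

rows = torch.arange(target_height // 2, device=device, dtype=dtype) * scale_h
cols = torch.arange(target_width // 2, device=device, dtype=dtype) * scale_w
latent_image_ids_resized = torch.zeros(target_height // 2, target_width // 2, 3, device=device, dtype=dtype)
latent_image_ids_resized[..., 1] = rows[:, None]
latent_image_ids_resized[..., 2] = cols[None, :]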
82
+
83
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion_img2img.retrieve_latents
84
+ def retrieve_latents(
85
+ encoder_output: torch.Tensor, generator: Optional[torch.Generator] = None, sample_mode: str = "sample"
86
+ ):
87
+ if hasattr(encoder_output, "latent_dist") and sample_mode == "sample":
88
+ return encoder_output.latent_dist.sample(generator)
89
+ elif hasattr(encoder_output, "latent_dist") and sample_mode == "argmax":
90
+ return encoder_output.latent_dist.mode()
91
+ elif hasattr(encoder_output, "latents"):
92
+ return encoder_output.latents
93
+ else:
94
+ raise AttributeError("Could not access latents of provided encoder_output")
95
+
96
+
97
+ # Copied from diffusers.pipelines.stable_diffusion.pipeline_stable_diffusion.retrieve_timesteps
98
+ def retrieve_timesteps(
99
+ scheduler,
100
+ num_inference_steps: Optional[int] = None,
101
+ device: Optional[Union[str, torch.device]] = None,
102
+ timesteps: Optional[List[int]] = None,
103
+ sigmas: Optional[List[float]] = None,
104
+ **kwargs,
105
+ ):
106
+ """
107
+ Calls the scheduler's `set_timesteps` method and retrieves timesteps from the scheduler after the call. Handles
108
+ custom timesteps. Any kwargs will be supplied to `scheduler.set_timesteps`.
109
+
110
+ Args:
111
+ scheduler (`SchedulerMixin`):
112
+ The scheduler to get timesteps from.
113
+ num_inference_steps (`int`):
114
+ The number of diffusion steps used when generating samples with a pre-trained model. If used, `timesteps`
115
+ must be `None`.
116
+ device (`str` or `torch.device`, *optional*):
117
+ The device to which the timesteps should be moved. If `None`, the timesteps are not moved.
118
+ timesteps (`List[int]`, *optional*):
119
+ Custom timesteps used to override the timestep spacing strategy of the scheduler. If `timesteps` is passed,
120
+ `num_inference_steps` and `sigmas` must be `None`.
121
+ sigmas (`List[float]`, *optional*):
122
+ Custom sigmas used to override the timestep spacing strategy of the scheduler. If `sigmas` is passed,
123
+ `num_inference_steps` and `timesteps` must be `None`.
124
+
125
+ Returns:
126
+ `Tuple[torch.Tensor, int]`: A tuple where the first element is the timestep schedule from the scheduler and the
127
+ second element is the number of inference steps.
128
+ """
129
+ if timesteps is not None and sigmas is not None:
130
+ raise ValueError("Only one of `timesteps` or `sigmas` can be passed. Please choose one to set custom values")
131
+ if timesteps is not None:
132
+ accepts_timesteps = "timesteps" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
133
+ if not accepts_timesteps:
134
+ raise ValueError(
135
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
136
+ f" timestep schedules. Please check whether you are using the correct scheduler."
137
+ )
138
+ scheduler.set_timesteps(timesteps=timesteps, device=device, **kwargs)
139
+ timesteps = scheduler.timesteps
140
+ num_inference_steps = len(timesteps)
141
+ elif sigmas is not None:
142
+ accept_sigmas = "sigmas" in set(inspect.signature(scheduler.set_timesteps).parameters.keys())
143
+ if not accept_sigmas:
144
+ raise ValueError(
145
+ f"The current scheduler class {scheduler.__class__}'s `set_timesteps` does not support custom"
146
+ f" sigmas schedules. Please check whether you are using the correct scheduler."
147
+ )
148
+ scheduler.set_timesteps(sigmas=sigmas, device=device, **kwargs)
149
+ timesteps = scheduler.timesteps
150
+ num_inference_steps = len(timesteps)
151
+ else:
152
+ scheduler.set_timesteps(num_inference_steps, device=device, **kwargs)
153
+ timesteps = scheduler.timesteps
154
+ return timesteps, num_inference_steps
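For reference, the typical call pattern from a Flux-style `__call__` (a sketch only; `image_seq_len`, `num_inference_steps`, `device`, and `self.scheduler` are assumed to be in scope, and `mu` comes from `calculate_shift` above):

sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
mu = calculate_shift(image_seq_len)
timesteps, num_inference_steps = retrieve_timesteps(
    self.scheduler, num_inference_steps, device, sigmas=sigmas, mu=mu
)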
155
+
156
+
157
+ class FluxPipeline(DiffusionPipeline, FluxLoraLoaderMixin, FromSingleFileMixin):
158
+ r"""
159
+ The Flux pipeline for text-to-image generation.
160
+
161
+ Reference: https://blackforestlabs.ai/announcing-black-forest-labs/
162
+
163
+ Args:
164
+ transformer ([`FluxTransformer2DModel`]):
165
+ Conditional Transformer (MMDiT) architecture to denoise the encoded image latents.
166
+ scheduler ([`FlowMatchEulerDiscreteScheduler`]):
167
+ A scheduler to be used in combination with `transformer` to denoise the encoded image latents.
168
+ vae ([`AutoencoderKL`]):
169
+ Variational Auto-Encoder (VAE) Model to encode and decode images to and from latent representations.
170
+ text_encoder ([`CLIPTextModel`]):
171
+ [CLIP](https://huggingface.co/docs/transformers/model_doc/clip#transformers.CLIPTextModel), specifically
172
+ the [clip-vit-large-patch14](https://huggingface.co/openai/clip-vit-large-patch14) variant.
173
+ text_encoder_2 ([`T5EncoderModel`]):
174
+ [T5](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5EncoderModel), specifically
175
+ the [google/t5-v1_1-xxl](https://huggingface.co/google/t5-v1_1-xxl) variant.
176
+ tokenizer (`CLIPTokenizer`):
177
+ Tokenizer of class
178
+ [CLIPTokenizer](https://huggingface.co/docs/transformers/en/model_doc/clip#transformers.CLIPTokenizer).
179
+ tokenizer_2 (`T5TokenizerFast`):
180
+ Second Tokenizer of class
181
+ [T5TokenizerFast](https://huggingface.co/docs/transformers/en/model_doc/t5#transformers.T5TokenizerFast).
182
+ """
183
+
184
+ model_cpu_offload_seq = "text_encoder->text_encoder_2->transformer->vae"
185
+ _optional_components = []
186
+ _callback_tensor_inputs = ["latents", "prompt_embeds"]
187
+
188
+ def __init__(
189
+ self,
190
+ scheduler: FlowMatchEulerDiscreteScheduler,
191
+ vae: AutoencoderKL,
192
+ text_encoder: CLIPTextModel,
193
+ tokenizer: CLIPTokenizer,
194
+ text_encoder_2: T5EncoderModel,
195
+ tokenizer_2: T5TokenizerFast,
196
+ transformer: FluxTransformer2DModel,
197
+ ):
198
+ super().__init__()
199
+
200
+ self.register_modules(
201
+ vae=vae,
202
+ text_encoder=text_encoder,
203
+ text_encoder_2=text_encoder_2,
204
+ tokenizer=tokenizer,
205
+ tokenizer_2=tokenizer_2,
206
+ transformer=transformer,
207
+ scheduler=scheduler,
208
+ )
209
+ self.vae_scale_factor = (
210
+ 2 ** (len(self.vae.config.block_out_channels)) if hasattr(self, "vae") and self.vae is not None else 16
211
+ )
212
+ self.image_processor = VaeImageProcessor(vae_scale_factor=self.vae_scale_factor)
213
+ self.tokenizer_max_length = (
214
+ self.tokenizer.model_max_length if hasattr(self, "tokenizer") and self.tokenizer is not None else 77
215
+ )
216
+ self.default_sample_size = 64
217
+
218
+ def _get_t5_prompt_embeds(
219
+ self,
220
+ prompt: Union[str, List[str]] = None,
221
+ num_images_per_prompt: int = 1,
222
+ max_sequence_length: int = 512,
223
+ device: Optional[torch.device] = None,
224
+ dtype: Optional[torch.dtype] = None,
225
+ ):
226
+ device = device or self._execution_device
227
+ dtype = dtype or self.text_encoder.dtype
228
+
229
+ prompt = [prompt] if isinstance(prompt, str) else prompt
230
+ batch_size = len(prompt)
231
+
232
+ text_inputs = self.tokenizer_2(
233
+ prompt,
234
+ padding="max_length",
235
+ max_length=max_sequence_length,
236
+ truncation=True,
237
+ return_length=False,
238
+ return_overflowing_tokens=False,
239
+ return_tensors="pt",
240
+ )
241
+ text_input_ids = text_inputs.input_ids
242
+ untruncated_ids = self.tokenizer_2(prompt, padding="longest", return_tensors="pt").input_ids
243
+
244
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
245
+ removed_text = self.tokenizer_2.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
246
+ logger.warning(
247
+ "The following part of your input was truncated because `max_sequence_length` is set to "
248
+ f" {max_sequence_length} tokens: {removed_text}"
249
+ )
250
+
251
+ prompt_embeds = self.text_encoder_2(text_input_ids.to(device), output_hidden_states=False)[0]
252
+
253
+ dtype = self.text_encoder_2.dtype
254
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
255
+
256
+ _, seq_len, _ = prompt_embeds.shape
257
+
258
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
259
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
260
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
261
+
262
+ return prompt_embeds
263
+
264
+ def _get_clip_prompt_embeds(
265
+ self,
266
+ prompt: Union[str, List[str]],
267
+ num_images_per_prompt: int = 1,
268
+ device: Optional[torch.device] = None,
269
+ ):
270
+ device = device or self._execution_device
271
+
272
+ prompt = [prompt] if isinstance(prompt, str) else prompt
273
+ batch_size = len(prompt)
274
+
275
+ text_inputs = self.tokenizer(
276
+ prompt,
277
+ padding="max_length",
278
+ max_length=self.tokenizer_max_length,
279
+ truncation=True,
280
+ return_overflowing_tokens=False,
281
+ return_length=False,
282
+ return_tensors="pt",
283
+ )
284
+
285
+ text_input_ids = text_inputs.input_ids
286
+ untruncated_ids = self.tokenizer(prompt, padding="longest", return_tensors="pt").input_ids
287
+ if untruncated_ids.shape[-1] >= text_input_ids.shape[-1] and not torch.equal(text_input_ids, untruncated_ids):
288
+ removed_text = self.tokenizer.batch_decode(untruncated_ids[:, self.tokenizer_max_length - 1: -1])
289
+ logger.warning(
290
+ "The following part of your input was truncated because CLIP can only handle sequences up to"
291
+ f" {self.tokenizer_max_length} tokens: {removed_text}"
292
+ )
293
+ prompt_embeds = self.text_encoder(text_input_ids.to(device), output_hidden_states=False)
294
+
295
+ # Use pooled output of CLIPTextModel
296
+ prompt_embeds = prompt_embeds.pooler_output
297
+ prompt_embeds = prompt_embeds.to(dtype=self.text_encoder.dtype, device=device)
298
+
299
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
300
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt)
301
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
302
+
303
+ return prompt_embeds
304
+
305
+ def encode_prompt(
306
+ self,
307
+ prompt: Union[str, List[str]],
308
+ prompt_2: Union[str, List[str]],
309
+ device: Optional[torch.device] = None,
310
+ num_images_per_prompt: int = 1,
311
+ prompt_embeds: Optional[torch.FloatTensor] = None,
312
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
313
+ max_sequence_length: int = 512,
314
+ lora_scale: Optional[float] = None,
315
+ ):
316
+ r"""
317
+
318
+ Args:
319
+ prompt (`str` or `List[str]`, *optional*):
320
+ prompt to be encoded
321
+ prompt_2 (`str` or `List[str]`, *optional*):
322
+ The prompt or prompts to be sent to the `tokenizer_2` and `text_encoder_2`. If not defined, `prompt` is
323
+ used in all text-encoders
324
+ device: (`torch.device`):
325
+ torch device
326
+ num_images_per_prompt (`int`):
327
+ number of images that should be generated per prompt
328
+ prompt_embeds (`torch.FloatTensor`, *optional*):
329
+ Pre-generated text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting. If not
330
+ provided, text embeddings will be generated from `prompt` input argument.
331
+ pooled_prompt_embeds (`torch.FloatTensor`, *optional*):
332
+ Pre-generated pooled text embeddings. Can be used to easily tweak text inputs, *e.g.* prompt weighting.
333
+ If not provided, pooled text embeddings will be generated from `prompt` input argument.
334
+ lora_scale (`float`, *optional*):
335
+ A lora scale that will be applied to all LoRA layers of the text encoder if LoRA layers are loaded.
336
+ """
337
+ device = device or self._execution_device
338
+
339
+ # set lora scale so that monkey patched LoRA
340
+ # function of text encoder can correctly access it
341
+ if lora_scale is not None and isinstance(self, FluxLoraLoaderMixin):
342
+ self._lora_scale = lora_scale
343
+
344
+ # dynamically adjust the LoRA scale
345
+ if self.text_encoder is not None and USE_PEFT_BACKEND:
346
+ scale_lora_layers(self.text_encoder, lora_scale)
347
+ if self.text_encoder_2 is not None and USE_PEFT_BACKEND:
348
+ scale_lora_layers(self.text_encoder_2, lora_scale)
349
+
350
+ prompt = [prompt] if isinstance(prompt, str) else prompt
351
+
352
+ if prompt_embeds is None:
353
+ prompt_2 = prompt_2 or prompt
354
+ prompt_2 = [prompt_2] if isinstance(prompt_2, str) else prompt_2
355
+
356
+ # We only use the pooled prompt output from the CLIPTextModel
357
+ pooled_prompt_embeds = self._get_clip_prompt_embeds(
358
+ prompt=prompt,
359
+ device=device,
360
+ num_images_per_prompt=num_images_per_prompt,
361
+ )
362
+ prompt_embeds = self._get_t5_prompt_embeds(
363
+ prompt=prompt_2,
364
+ num_images_per_prompt=num_images_per_prompt,
365
+ max_sequence_length=max_sequence_length,
366
+ device=device,
367
+ )
368
+
369
+ if self.text_encoder is not None:
370
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
371
+ # Retrieve the original scale by scaling back the LoRA layers
372
+ unscale_lora_layers(self.text_encoder, lora_scale)
373
+
374
+ if self.text_encoder_2 is not None:
375
+ if isinstance(self, FluxLoraLoaderMixin) and USE_PEFT_BACKEND:
376
+ # Retrieve the original scale by scaling back the LoRA layers
377
+ unscale_lora_layers(self.text_encoder_2, lora_scale)
378
+
379
+ dtype = self.text_encoder.dtype if self.text_encoder is not None else self.transformer.dtype
380
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
381
+
382
+ return prompt_embeds, pooled_prompt_embeds, text_ids
383
+
384
+ # Copied from diffusers.pipelines.stable_diffusion_3.pipeline_stable_diffusion_3_inpaint.StableDiffusion3InpaintPipeline._encode_vae_image
385
+ def _encode_vae_image(self, image: torch.Tensor, generator: torch.Generator):
386
+ if isinstance(generator, list):
387
+ image_latents = [
388
+ retrieve_latents(self.vae.encode(image[i: i + 1]), generator=generator[i])
389
+ for i in range(image.shape[0])
390
+ ]
391
+ image_latents = torch.cat(image_latents, dim=0)
392
+ else:
393
+ image_latents = retrieve_latents(self.vae.encode(image), generator=generator)
394
+
395
+ image_latents = (image_latents - self.vae.config.shift_factor) * self.vae.config.scaling_factor
396
+
397
+ return image_latents
398
+
399
+ def check_inputs(
400
+ self,
401
+ prompt,
402
+ prompt_2,
403
+ height,
404
+ width,
405
+ prompt_embeds=None,
406
+ pooled_prompt_embeds=None,
407
+ callback_on_step_end_tensor_inputs=None,
408
+ max_sequence_length=None,
409
+ ):
410
+ if height % 8 != 0 or width % 8 != 0:
411
+ raise ValueError(f"`height` and `width` have to be divisible by 8 but are {height} and {width}.")
412
+
413
+ if callback_on_step_end_tensor_inputs is not None and not all(
414
+ k in self._callback_tensor_inputs for k in callback_on_step_end_tensor_inputs
415
+ ):
416
+ raise ValueError(
417
+ f"`callback_on_step_end_tensor_inputs` has to be in {self._callback_tensor_inputs}, but found {[k for k in callback_on_step_end_tensor_inputs if k not in self._callback_tensor_inputs]}"
418
+ )
419
+
420
+ if prompt is not None and prompt_embeds is not None:
421
+ raise ValueError(
422
+ f"Cannot forward both `prompt`: {prompt} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
423
+ " only forward one of the two."
424
+ )
425
+ elif prompt_2 is not None and prompt_embeds is not None:
426
+ raise ValueError(
427
+ f"Cannot forward both `prompt_2`: {prompt_2} and `prompt_embeds`: {prompt_embeds}. Please make sure to"
428
+ " only forward one of the two."
429
+ )
430
+ elif prompt is None and prompt_embeds is None:
431
+ raise ValueError(
432
+ "Provide either `prompt` or `prompt_embeds`. Cannot leave both `prompt` and `prompt_embeds` undefined."
433
+ )
434
+ elif prompt is not None and (not isinstance(prompt, str) and not isinstance(prompt, list)):
435
+ raise ValueError(f"`prompt` has to be of type `str` or `list` but is {type(prompt)}")
436
+ elif prompt_2 is not None and (not isinstance(prompt_2, str) and not isinstance(prompt_2, list)):
437
+ raise ValueError(f"`prompt_2` has to be of type `str` or `list` but is {type(prompt_2)}")
438
+
439
+ if prompt_embeds is not None and pooled_prompt_embeds is None:
440
+ raise ValueError(
441
+ "If `prompt_embeds` are provided, `pooled_prompt_embeds` also have to be passed. Make sure to generate `pooled_prompt_embeds` from the same text encoder that was used to generate `prompt_embeds`."
442
+ )
443
+
444
+ if max_sequence_length is not None and max_sequence_length > 512:
445
+ raise ValueError(f"`max_sequence_length` cannot be greater than 512 but is {max_sequence_length}")
446
+
447
+ @staticmethod
448
+ def _prepare_latent_image_ids(batch_size, height, width, device, dtype):
449
+ latent_image_ids = torch.zeros(height // 2, width // 2, 3)
450
+ latent_image_ids[..., 1] = latent_image_ids[..., 1] + torch.arange(height // 2)[:, None]
451
+ latent_image_ids[..., 2] = latent_image_ids[..., 2] + torch.arange(width // 2)[None, :]
452
+ latent_image_id_height, latent_image_id_width, latent_image_id_channels = latent_image_ids.shape
453
+ latent_image_ids = latent_image_ids.reshape(
454
+ latent_image_id_height * latent_image_id_width, latent_image_id_channels
455
+ )
456
+ return latent_image_ids.to(device=device, dtype=dtype)
457
+
458
+ @staticmethod
459
+ def _pack_latents(latents, batch_size, num_channels_latents, height, width):
460
+ latents = latents.view(batch_size, num_channels_latents, height // 2, 2, width // 2, 2)
461
+ latents = latents.permute(0, 2, 4, 1, 3, 5)
462
+ latents = latents.reshape(batch_size, (height // 2) * (width // 2), num_channels_latents * 4)
463
+ return latents
464
+
465
+ @staticmethod
466
+ def _unpack_latents(latents, height, width, vae_scale_factor):
467
+ batch_size, num_patches, channels = latents.shape
468
+
469
+ height = height // vae_scale_factor
470
+ width = width // vae_scale_factor
471
+
472
+ latents = latents.view(batch_size, height, width, channels // 4, 2, 2)
473
+ latents = latents.permute(0, 3, 1, 4, 2, 5)
474
+
475
+ latents = latents.reshape(batch_size, channels // (2 * 2), height * 2, width * 2)
476
+
477
+ return latents
478
+
479
+ def enable_vae_slicing(self):
480
+ r"""
481
+ Enable sliced VAE decoding. When this option is enabled, the VAE will split the input tensor in slices to
482
+ compute decoding in several steps. This is useful to save some memory and allow larger batch sizes.
483
+ """
484
+ self.vae.enable_slicing()
485
+
486
+ def disable_vae_slicing(self):
487
+ r"""
488
+ Disable sliced VAE decoding. If `enable_vae_slicing` was previously enabled, this method will go back to
489
+ computing decoding in one step.
490
+ """
491
+ self.vae.disable_slicing()
492
+
493
+ def enable_vae_tiling(self):
494
+ r"""
495
+ Enable tiled VAE decoding. When this option is enabled, the VAE will split the input tensor into tiles to
496
+ compute decoding and encoding in several steps. This is useful for saving a large amount of memory and for
497
+ processing larger images.
498
+ """
499
+ self.vae.enable_tiling()
500
+
501
+ def disable_vae_tiling(self):
502
+ r"""
503
+ Disable tiled VAE decoding. If `enable_vae_tiling` was previously enabled, this method will go back to
504
+ computing decoding in one step.
505
+ """
506
+ self.vae.disable_tiling()
507
+
508
+ def prepare_latents(
509
+ self,
510
+ batch_size,
511
+ num_channels_latents,
512
+ height,
513
+ width,
514
+ dtype,
515
+ device,
516
+ generator,
517
+ subject_image,
518
+ condition_image,
519
+ latents=None,
520
+ cond_number=1,
521
+ sub_number=1
522
+ ):
523
+ height_cond = 2 * (self.cond_size // self.vae_scale_factor)
524
+ width_cond = 2 * (self.cond_size // self.vae_scale_factor)
525
+ height = 2 * (int(height) // self.vae_scale_factor)
526
+ width = 2 * (int(width) // self.vae_scale_factor)
527
+
528
+ shape = (batch_size, num_channels_latents, height, width) # 1 16 106 80
529
+ noise_latents = randn_tensor(shape, generator=generator, device=device, dtype=dtype)
530
+ noise_latents = self._pack_latents(noise_latents, batch_size, num_channels_latents, height, width)
531
+ noise_latent_image_ids, cond_latent_image_ids = resize_position_encoding(
532
+ batch_size,
533
+ height,
534
+ width,
535
+ height_cond,
536
+ width_cond,
537
+ device,
538
+ dtype,
539
+ )
540
+
541
+ latents_to_concat = []  # does not include the noise latents
542
+ latents_ids_to_concat = [noise_latent_image_ids]
543
+
544
+ # subject
545
+ if subject_image is not None:
546
+ shape_subject = (batch_size, num_channels_latents, height_cond*sub_number, width_cond)
547
+ subject_image = subject_image.to(device=device, dtype=dtype)
548
+ subject_image_latents = self._encode_vae_image(image=subject_image, generator=generator)
549
+ subject_latents = self._pack_latents(subject_image_latents, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
550
+ mask2 = torch.zeros(shape_subject, device=device, dtype=dtype)
551
+ mask2 = self._pack_latents(mask2, batch_size, num_channels_latents, height_cond*sub_number, width_cond)
552
+ latent_subject_ids = prepare_latent_subject_ids(height_cond, width_cond, device, dtype)
553
+ latent_subject_ids[:, 1] += 64 # fixed offset
554
+ subject_latent_image_ids = torch.concat([latent_subject_ids for _ in range(sub_number)], dim=-2)
555
+ latents_to_concat.append(subject_latents)
556
+ latents_ids_to_concat.append(subject_latent_image_ids)
557
+
558
+ # spatial
559
+ if condition_image is not None:
560
+ shape_cond = (batch_size, num_channels_latents, height_cond*cond_number, width_cond)
561
+ condition_image = condition_image.to(device=device, dtype=dtype)
562
+ image_latents = self._encode_vae_image(image=condition_image, generator=generator)
563
+ cond_latents = self._pack_latents(image_latents, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
564
+ mask3 = torch.zeros(shape_cond, device=device, dtype=dtype)
565
+ mask3 = self._pack_latents(mask3, batch_size, num_channels_latents, height_cond*cond_number, width_cond)
566
+ # repeat the condition position ids below, once per spatial condition image
567
+ cond_latent_image_ids = torch.concat([cond_latent_image_ids for _ in range(cond_number)], dim=-2)
568
+ latents_ids_to_concat.append(cond_latent_image_ids)
569
+ latents_to_concat.append(cond_latents)
570
+
571
+ cond_latents = torch.concat(latents_to_concat, dim=-2)
572
+ latent_image_ids = torch.concat(latents_ids_to_concat, dim=-2)
573
+ return cond_latents, latent_image_ids, noise_latents
574
+
575
+ @property
576
+ def guidance_scale(self):
577
+ return self._guidance_scale
578
+
579
+ @property
580
+ def joint_attention_kwargs(self):
581
+ return self._joint_attention_kwargs
582
+
583
+ @property
584
+ def num_timesteps(self):
585
+ return self._num_timesteps
586
+
587
+ @property
588
+ def interrupt(self):
589
+ return self._interrupt
590
+
591
+ @torch.no_grad()
592
+ def __call__(
593
+ self,
594
+ prompt: Union[str, List[str]] = None,
595
+ prompt_2: Optional[Union[str, List[str]]] = None,
596
+ height: Optional[int] = None,
597
+ width: Optional[int] = None,
598
+ num_inference_steps: int = 28,
599
+ timesteps: List[int] = None,
600
+ guidance_scale: float = 3.5,
601
+ num_images_per_prompt: Optional[int] = 1,
602
+ generator: Optional[Union[torch.Generator, List[torch.Generator]]] = None,
603
+ latents: Optional[torch.FloatTensor] = None,
604
+ prompt_embeds: Optional[torch.FloatTensor] = None,
605
+ pooled_prompt_embeds: Optional[torch.FloatTensor] = None,
606
+ output_type: Optional[str] = "pil",
607
+ return_dict: bool = True,
608
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
609
+ callback_on_step_end: Optional[Callable[[int, int, Dict], None]] = None,
610
+ callback_on_step_end_tensor_inputs: List[str] = ["latents"],
611
+ max_sequence_length: int = 512,
612
+ spatial_images=None,
613
+ subject_images=None,
614
+ cond_size=512,
615
+ ):
616
+
617
+ height = height or self.default_sample_size * self.vae_scale_factor
618
+ width = width or self.default_sample_size * self.vae_scale_factor
619
+ self.cond_size = cond_size
620
+
621
+ # 1. Check inputs. Raise error if not correct
622
+ self.check_inputs(
623
+ prompt,
624
+ prompt_2,
625
+ height,
626
+ width,
627
+ prompt_embeds=prompt_embeds,
628
+ pooled_prompt_embeds=pooled_prompt_embeds,
629
+ callback_on_step_end_tensor_inputs=callback_on_step_end_tensor_inputs,
630
+ max_sequence_length=max_sequence_length,
631
+ )
632
+
633
+ self._guidance_scale = guidance_scale
634
+ self._joint_attention_kwargs = joint_attention_kwargs
635
+ self._interrupt = False
636
+
637
+ cond_number = len(spatial_images) if spatial_images is not None else 0
638
+ sub_number = len(subject_images) if subject_images is not None else 0
639
+
640
+ if sub_number > 0:
641
+ subject_image_ls = []
642
+ for subject_image in subject_images:
643
+ w, h = subject_image.size[:2]
644
+ scale = self.cond_size / max(h, w)
645
+ new_h, new_w = int(h * scale), int(w * scale)
646
+ subject_image = self.image_processor.preprocess(subject_image, height=new_h, width=new_w)
647
+ subject_image = subject_image.to(dtype=torch.float32)
648
+ pad_h = cond_size - subject_image.shape[-2]
649
+ pad_w = cond_size - subject_image.shape[-1]
650
+ subject_image = pad(
651
+ subject_image,
652
+ padding=(int(pad_w / 2), int(pad_h / 2), int(pad_w / 2), int(pad_h / 2)),
653
+ fill=0
654
+ )
655
+ subject_image_ls.append(subject_image)
656
+ subject_image = torch.concat(subject_image_ls, dim=-2)
657
+ else:
658
+ subject_image = None
659
+
660
+ if cond_number > 0:
661
+ condition_image_ls = []
662
+ for img in spatial_images:
663
+ condition_image = self.image_processor.preprocess(img, height=self.cond_size, width=self.cond_size)
664
+ condition_image = condition_image.to(dtype=torch.float32)
665
+ condition_image_ls.append(condition_image)
666
+ condition_image = torch.concat(condition_image_ls, dim=-2)
667
+ else:
668
+ condition_image = None
669
+
670
+ # 2. Define call parameters
671
+ if prompt is not None and isinstance(prompt, str):
672
+ batch_size = 1
673
+ elif prompt is not None and isinstance(prompt, list):
674
+ batch_size = len(prompt)
675
+ else:
676
+ batch_size = prompt_embeds.shape[0]
677
+
678
+ device = self._execution_device
679
+
680
+ lora_scale = (
681
+ self.joint_attention_kwargs.get("scale", None) if self.joint_attention_kwargs is not None else None
682
+ )
683
+ (
684
+ prompt_embeds,
685
+ pooled_prompt_embeds,
686
+ text_ids,
687
+ ) = self.encode_prompt(
688
+ prompt=prompt,
689
+ prompt_2=prompt_2,
690
+ prompt_embeds=prompt_embeds,
691
+ pooled_prompt_embeds=pooled_prompt_embeds,
692
+ device=device,
693
+ num_images_per_prompt=num_images_per_prompt,
694
+ max_sequence_length=max_sequence_length,
695
+ lora_scale=lora_scale,
696
+ )
697
+
698
+ # 4. Prepare latent variables
699
+ num_channels_latents = self.transformer.config.in_channels // 4 # 16
700
+ cond_latents, latent_image_ids, noise_latents = self.prepare_latents(
701
+ batch_size * num_images_per_prompt,
702
+ num_channels_latents,
703
+ height,
704
+ width,
705
+ prompt_embeds.dtype,
706
+ device,
707
+ generator,
708
+ subject_image,
709
+ condition_image,
710
+ latents,
711
+ cond_number,
712
+ sub_number
713
+ )
714
+ latents = noise_latents
715
+ # 5. Prepare timesteps
716
+ sigmas = np.linspace(1.0, 1 / num_inference_steps, num_inference_steps)
717
+ image_seq_len = latents.shape[1]
718
+ mu = calculate_shift(
719
+ image_seq_len,
720
+ self.scheduler.config.base_image_seq_len,
721
+ self.scheduler.config.max_image_seq_len,
722
+ self.scheduler.config.base_shift,
723
+ self.scheduler.config.max_shift,
724
+ )
725
+ timesteps, num_inference_steps = retrieve_timesteps(
726
+ self.scheduler,
727
+ num_inference_steps,
728
+ device,
729
+ timesteps,
730
+ sigmas,
731
+ mu=mu,
732
+ )
733
+ num_warmup_steps = max(len(timesteps) - num_inference_steps * self.scheduler.order, 0)
734
+ self._num_timesteps = len(timesteps)
735
+
736
+ # handle guidance
737
+ if self.transformer.config.guidance_embeds:
738
+ guidance = torch.full([1], guidance_scale, device=device, dtype=torch.float32)
739
+ guidance = guidance.expand(latents.shape[0])
740
+ else:
741
+ guidance = None
742
+
743
+ # 6. Denoising loop
744
+ with self.progress_bar(total=num_inference_steps) as progress_bar:
745
+ for i, t in enumerate(timesteps):
746
+ if self.interrupt:
747
+ continue
748
+
749
+ # broadcast to batch dimension in a way that's compatible with ONNX/Core ML
750
+ timestep = t.expand(latents.shape[0]).to(latents.dtype)
751
+ noise_pred = self.transformer(
752
+ hidden_states=latents, # 1 4096 64
753
+ cond_hidden_states=cond_latents,
754
+ timestep=timestep / 1000,
755
+ guidance=guidance,
756
+ pooled_projections=pooled_prompt_embeds,
757
+ encoder_hidden_states=prompt_embeds,
758
+ txt_ids=text_ids,
759
+ img_ids=latent_image_ids,
760
+ joint_attention_kwargs=self.joint_attention_kwargs,
761
+ return_dict=False,
762
+ )[0]
763
+
764
+ # compute the previous noisy sample x_t -> x_t-1
765
+ latents_dtype = latents.dtype
766
+ latents = self.scheduler.step(noise_pred, t, latents, return_dict=False)[0]
767
+ latents = latents
768
+
769
+ if latents.dtype != latents_dtype:
770
+ if torch.backends.mps.is_available():
771
+ # some platforms (eg. apple mps) misbehave due to a pytorch bug: https://github.com/pytorch/pytorch/pull/99272
772
+ latents = latents.to(latents_dtype)
773
+
774
+ if callback_on_step_end is not None:
775
+ callback_kwargs = {}
776
+ for k in callback_on_step_end_tensor_inputs:
777
+ callback_kwargs[k] = locals()[k]
778
+ callback_outputs = callback_on_step_end(self, i, t, callback_kwargs)
779
+
780
+ latents = callback_outputs.pop("latents", latents)
781
+ prompt_embeds = callback_outputs.pop("prompt_embeds", prompt_embeds)
782
+
783
+ # call the callback, if provided
784
+ if i == len(timesteps) - 1 or ((i + 1) > num_warmup_steps and (i + 1) % self.scheduler.order == 0):
785
+ progress_bar.update()
786
+
787
+ if XLA_AVAILABLE:
788
+ xm.mark_step()
789
+
790
+ if output_type == "latent":
791
+ image = latents
792
+
793
+ else:
794
+ latents = self._unpack_latents(latents, height, width, self.vae_scale_factor)
795
+ latents = (latents / self.vae.config.scaling_factor) + self.vae.config.shift_factor
796
+ image = self.vae.decode(latents.to(dtype=self.vae.dtype), return_dict=False)[0]
797
+ image = self.image_processor.postprocess(image, output_type=output_type)
798
+
799
+ # Offload all models
800
+ self.maybe_free_model_hooks()
801
+
802
+ if not return_dict:
803
+ return (image,)
804
+
805
+ return FluxPipelineOutput(images=image)
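A minimal usage sketch of the `__call__` defined above, for orientation only: the pipeline class name, checkpoint path, and image paths below are placeholders and are not taken from this commit; only the keyword arguments mirror the signature above.

import torch
from PIL import Image

# "CustomFluxPipeline" and the paths are placeholders -- substitute the pipeline class
# defined in this file and the checkpoint/images actually used by this repository.
pipe = CustomFluxPipeline.from_pretrained("path/to/checkpoint", torch_dtype=torch.bfloat16).to("cuda")

spatial_images = [Image.open("path/to/spatial_condition.png").convert("RGB")]   # resized to cond_size x cond_size
subject_images = [Image.open("path/to/subject_reference.png").convert("RGB")]   # scaled and padded to cond_size x cond_size

image = pipe(
    prompt="a product photo on a marble table, studio lighting",
    height=1024,
    width=768,
    num_inference_steps=28,
    guidance_scale=3.5,
    spatial_images=spatial_images,
    subject_images=subject_images,
    cond_size=512,
    generator=torch.Generator("cuda").manual_seed(0),
).images[0]
image.save("generated.png")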
src/prompt_helper.py ADDED
@@ -0,0 +1,205 @@
1
+ import torch
2
+
3
+
4
+ def load_text_encoders(args, class_one, class_two):
5
+ text_encoder_one = class_one.from_pretrained(
6
+ args.pretrained_model_name_or_path, subfolder="text_encoder", revision=args.revision, variant=args.variant
7
+ )
8
+ text_encoder_two = class_two.from_pretrained(
9
+ args.pretrained_model_name_or_path, subfolder="text_encoder_2", revision=args.revision, variant=args.variant
10
+ )
11
+ return text_encoder_one, text_encoder_two
12
+
13
+
14
+ def tokenize_prompt(tokenizer, prompt, max_sequence_length):
15
+ text_inputs = tokenizer(
16
+ prompt,
17
+ padding="max_length",
18
+ max_length=max_sequence_length,
19
+ truncation=True,
20
+ return_length=False,
21
+ return_overflowing_tokens=False,
22
+ return_tensors="pt",
23
+ )
24
+ text_input_ids = text_inputs.input_ids
25
+ return text_input_ids
26
+
27
+
28
+ def tokenize_prompt_clip(tokenizer, prompt):
29
+ text_inputs = tokenizer(
30
+ prompt,
31
+ padding="max_length",
32
+ max_length=77,
33
+ truncation=True,
34
+ return_length=False,
35
+ return_overflowing_tokens=False,
36
+ return_tensors="pt",
37
+ )
38
+ text_input_ids = text_inputs.input_ids
39
+ return text_input_ids
40
+
41
+
42
+ def tokenize_prompt_t5(tokenizer, prompt):
43
+ text_inputs = tokenizer(
44
+ prompt,
45
+ padding="max_length",
46
+ max_length=512,
47
+ truncation=True,
48
+ return_length=False,
49
+ return_overflowing_tokens=False,
50
+ return_tensors="pt",
51
+ )
52
+ text_input_ids = text_inputs.input_ids
53
+ return text_input_ids
54
+
55
+
56
+ def _encode_prompt_with_t5(
57
+ text_encoder,
58
+ tokenizer,
59
+ max_sequence_length=512,
60
+ prompt=None,
61
+ num_images_per_prompt=1,
62
+ device=None,
63
+ text_input_ids=None,
64
+ ):
65
+ prompt = [prompt] if isinstance(prompt, str) else prompt
66
+ batch_size = len(prompt)
67
+
68
+ if tokenizer is not None:
69
+ text_inputs = tokenizer(
70
+ prompt,
71
+ padding="max_length",
72
+ max_length=max_sequence_length,
73
+ truncation=True,
74
+ return_length=False,
75
+ return_overflowing_tokens=False,
76
+ return_tensors="pt",
77
+ )
78
+ text_input_ids = text_inputs.input_ids
79
+ else:
80
+ if text_input_ids is None:
81
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
82
+
83
+ prompt_embeds = text_encoder(text_input_ids.to(device))[0]
84
+
85
+ dtype = text_encoder.dtype
86
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=device)
87
+
88
+ _, seq_len, _ = prompt_embeds.shape
89
+
90
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
91
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
92
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
93
+
94
+ return prompt_embeds
95
+
96
+
97
+ def _encode_prompt_with_clip(
98
+ text_encoder,
99
+ tokenizer,
100
+ prompt: str,
101
+ device=None,
102
+ text_input_ids=None,
103
+ num_images_per_prompt: int = 1,
104
+ ):
105
+ prompt = [prompt] if isinstance(prompt, str) else prompt
106
+ batch_size = len(prompt)
107
+
108
+ if tokenizer is not None:
109
+ text_inputs = tokenizer(
110
+ prompt,
111
+ padding="max_length",
112
+ max_length=77,
113
+ truncation=True,
114
+ return_overflowing_tokens=False,
115
+ return_length=False,
116
+ return_tensors="pt",
117
+ )
118
+
119
+ text_input_ids = text_inputs.input_ids
120
+ else:
121
+ if text_input_ids is None:
122
+ raise ValueError("text_input_ids must be provided when the tokenizer is not specified")
123
+
124
+ prompt_embeds = text_encoder(text_input_ids.to(device), output_hidden_states=False)
125
+
126
+ # Use pooled output of CLIPTextModel
127
+ prompt_embeds = prompt_embeds.pooler_output
128
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder.dtype, device=device)
129
+
130
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
131
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
132
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
133
+
134
+ return prompt_embeds
135
+
136
+
137
+ def encode_prompt(
138
+ text_encoders,
139
+ tokenizers,
140
+ prompt: str,
141
+ max_sequence_length,
142
+ device=None,
143
+ num_images_per_prompt: int = 1,
144
+ text_input_ids_list=None,
145
+ ):
146
+ prompt = [prompt] if isinstance(prompt, str) else prompt
147
+ dtype = text_encoders[0].dtype
148
+
149
+ pooled_prompt_embeds = _encode_prompt_with_clip(
150
+ text_encoder=text_encoders[0],
151
+ tokenizer=tokenizers[0],
152
+ prompt=prompt,
153
+ device=device if device is not None else text_encoders[0].device,
154
+ num_images_per_prompt=num_images_per_prompt,
155
+ text_input_ids=text_input_ids_list[0] if text_input_ids_list else None,
156
+ )
157
+
158
+ prompt_embeds = _encode_prompt_with_t5(
159
+ text_encoder=text_encoders[1],
160
+ tokenizer=tokenizers[1],
161
+ max_sequence_length=max_sequence_length,
162
+ prompt=prompt,
163
+ num_images_per_prompt=num_images_per_prompt,
164
+ device=device if device is not None else text_encoders[1].device,
165
+ text_input_ids=text_input_ids_list[1] if text_input_ids_list else None,
166
+ )
167
+
168
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=device, dtype=dtype)
169
+
170
+ return prompt_embeds, pooled_prompt_embeds, text_ids
171
+
172
+
173
+ def encode_token_ids(text_encoders, tokens, accelerator, num_images_per_prompt=1, device=None):
174
+ text_encoder_clip = text_encoders[0]
175
+ text_encoder_t5 = text_encoders[1]
176
+ tokens_clip, tokens_t5 = tokens[0], tokens[1]
177
+ batch_size = tokens_clip.shape[0]
178
+
179
+ if device == "cpu":
180
+ device = "cpu"
181
+ else:
182
+ device = accelerator.device
183
+
184
+ # clip
185
+ prompt_embeds = text_encoder_clip(tokens_clip.to(device), output_hidden_states=False)
186
+ # Use pooled output of CLIPTextModel
187
+ prompt_embeds = prompt_embeds.pooler_output
188
+ prompt_embeds = prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
189
+ # duplicate text embeddings for each generation per prompt, using mps friendly method
190
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
191
+ pooled_prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, -1)
192
+ pooled_prompt_embeds = pooled_prompt_embeds.to(dtype=text_encoder_clip.dtype, device=accelerator.device)
193
+
194
+ # t5
195
+ prompt_embeds = text_encoder_t5(tokens_t5.to(device))[0]
196
+ dtype = text_encoder_t5.dtype
197
+ prompt_embeds = prompt_embeds.to(dtype=dtype, device=accelerator.device)
198
+ _, seq_len, _ = prompt_embeds.shape
199
+ # duplicate text embeddings and attention mask for each generation per prompt, using mps friendly method
200
+ prompt_embeds = prompt_embeds.repeat(1, num_images_per_prompt, 1)
201
+ prompt_embeds = prompt_embeds.view(batch_size * num_images_per_prompt, seq_len, -1)
202
+
203
+ text_ids = torch.zeros(prompt_embeds.shape[1], 3).to(device=accelerator.device, dtype=dtype)
204
+
205
+ return prompt_embeds, pooled_prompt_embeds, text_ids
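A hedged sketch of how these helpers can be wired to the standard FLUX text encoders; the base checkpoint id below is an assumption, not something this commit specifies.

import torch
from transformers import CLIPTextModel, CLIPTokenizer, T5EncoderModel, T5TokenizerFast

from src.prompt_helper import encode_prompt

base = "black-forest-labs/FLUX.1-dev"  # assumed base checkpoint, not confirmed by this repo
tokenizer_clip = CLIPTokenizer.from_pretrained(base, subfolder="tokenizer")
tokenizer_t5 = T5TokenizerFast.from_pretrained(base, subfolder="tokenizer_2")
text_encoder_clip = CLIPTextModel.from_pretrained(base, subfolder="text_encoder")
text_encoder_t5 = T5EncoderModel.from_pretrained(base, subfolder="text_encoder_2")

# CLIP contributes the pooled embedding, T5 the per-token sequence embedding.
prompt_embeds, pooled_prompt_embeds, text_ids = encode_prompt(
    text_encoders=[text_encoder_clip, text_encoder_t5],
    tokenizers=[tokenizer_clip, tokenizer_t5],
    prompt="a red backpack on a wooden bench",
    max_sequence_length=512,
    device="cpu",
    num_images_per_prompt=1,
)
# Under that checkpoint: prompt_embeds (1, 512, 4096), pooled_prompt_embeds (1, 768), text_ids (512, 3).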
src/transformer_flux.py ADDED
@@ -0,0 +1,583 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
10
+ from diffusers.models.attention import FeedForward
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ FluxAttnProcessor2_0,
15
+ FluxAttnProcessor2_0_NPU,
16
+ FusedFluxAttnProcessor2_0,
17
+ )
18
+ from diffusers.models.modeling_utils import ModelMixin
19
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
20
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers
21
+ from diffusers.utils.import_utils import is_torch_npu_available
22
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
23
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
24
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
25
+
26
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
27
+
28
+ @maybe_allow_in_graph
29
+ class FluxSingleTransformerBlock(nn.Module):
30
+
31
+ def __init__(self, dim, num_attention_heads, attention_head_dim, mlp_ratio=4.0):
32
+ super().__init__()
33
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
34
+
35
+ self.norm = AdaLayerNormZeroSingle(dim)
36
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
37
+ self.act_mlp = nn.GELU(approximate="tanh")
38
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
39
+
40
+ if is_torch_npu_available():
41
+ processor = FluxAttnProcessor2_0_NPU()
42
+ else:
43
+ processor = FluxAttnProcessor2_0()
44
+ self.attn = Attention(
45
+ query_dim=dim,
46
+ cross_attention_dim=None,
47
+ dim_head=attention_head_dim,
48
+ heads=num_attention_heads,
49
+ out_dim=dim,
50
+ bias=True,
51
+ processor=processor,
52
+ qk_norm="rms_norm",
53
+ eps=1e-6,
54
+ pre_only=True,
55
+ )
56
+
57
+ def forward(
58
+ self,
59
+ hidden_states: torch.Tensor,
60
+ cond_hidden_states: torch.Tensor,
61
+ temb: torch.Tensor,
62
+ cond_temb: torch.Tensor,
63
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
64
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
65
+ ) -> torch.Tensor:
66
+ use_cond = cond_hidden_states is not None
67
+
68
+ residual = hidden_states
69
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
70
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
71
+
72
+ if use_cond:
73
+ residual_cond = cond_hidden_states
74
+ norm_cond_hidden_states, cond_gate = self.norm(cond_hidden_states, emb=cond_temb)
75
+ mlp_cond_hidden_states = self.act_mlp(self.proj_mlp(norm_cond_hidden_states))
76
+
77
+ norm_hidden_states_concat = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
78
+
79
+ joint_attention_kwargs = joint_attention_kwargs or {}
80
+ attn_output = self.attn(
81
+ hidden_states=norm_hidden_states_concat,
82
+ image_rotary_emb=image_rotary_emb,
83
+ use_cond=use_cond,
84
+ **joint_attention_kwargs,
85
+ )
86
+ if use_cond:
87
+ attn_output, cond_attn_output = attn_output
88
+
89
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
90
+ gate = gate.unsqueeze(1)
91
+ hidden_states = gate * self.proj_out(hidden_states)
92
+ hidden_states = residual + hidden_states
93
+
94
+ if use_cond:
95
+ condition_latents = torch.cat([cond_attn_output, mlp_cond_hidden_states], dim=2)
96
+ cond_gate = cond_gate.unsqueeze(1)
97
+ condition_latents = cond_gate * self.proj_out(condition_latents)
98
+ condition_latents = residual_cond + condition_latents
99
+
100
+ if hidden_states.dtype == torch.float16:
101
+ hidden_states = hidden_states.clip(-65504, 65504)
102
+
103
+ return hidden_states, condition_latents if use_cond else None
104
+
105
+
106
+ @maybe_allow_in_graph
107
+ class FluxTransformerBlock(nn.Module):
108
+ def __init__(
109
+ self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
110
+ ):
111
+ super().__init__()
112
+
113
+ self.norm1 = AdaLayerNormZero(dim)
114
+
115
+ self.norm1_context = AdaLayerNormZero(dim)
116
+
117
+ if hasattr(F, "scaled_dot_product_attention"):
118
+ processor = FluxAttnProcessor2_0()
119
+ else:
120
+ raise ValueError(
121
+ "The current PyTorch version does not support the `scaled_dot_product_attention` function."
122
+ )
123
+ self.attn = Attention(
124
+ query_dim=dim,
125
+ cross_attention_dim=None,
126
+ added_kv_proj_dim=dim,
127
+ dim_head=attention_head_dim,
128
+ heads=num_attention_heads,
129
+ out_dim=dim,
130
+ context_pre_only=False,
131
+ bias=True,
132
+ processor=processor,
133
+ qk_norm=qk_norm,
134
+ eps=eps,
135
+ )
136
+
137
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
138
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
139
+
140
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
141
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
142
+
143
+ # let chunk size default to None
144
+ self._chunk_size = None
145
+ self._chunk_dim = 0
146
+
147
+ def forward(
148
+ self,
149
+ hidden_states: torch.Tensor,
150
+ cond_hidden_states: torch.Tensor,
151
+ encoder_hidden_states: torch.Tensor,
152
+ temb: torch.Tensor,
153
+ cond_temb: torch.Tensor,
154
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
155
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
156
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
157
+ use_cond = cond_hidden_states is not None
158
+
159
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
160
+ if use_cond:
161
+ (
162
+ norm_cond_hidden_states,
163
+ cond_gate_msa,
164
+ cond_shift_mlp,
165
+ cond_scale_mlp,
166
+ cond_gate_mlp,
167
+ ) = self.norm1(cond_hidden_states, emb=cond_temb)
168
+
169
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
170
+ encoder_hidden_states, emb=temb
171
+ )
172
+
173
+ norm_hidden_states = torch.concat([norm_hidden_states, norm_cond_hidden_states], dim=-2)
174
+
175
+ joint_attention_kwargs = joint_attention_kwargs or {}
176
+ # Attention.
177
+ attention_outputs = self.attn(
178
+ hidden_states=norm_hidden_states,
179
+ encoder_hidden_states=norm_encoder_hidden_states,
180
+ image_rotary_emb=image_rotary_emb,
181
+ use_cond=use_cond,
182
+ **joint_attention_kwargs,
183
+ )
184
+
185
+ attn_output, context_attn_output = attention_outputs[:2]
186
+ cond_attn_output = attention_outputs[2] if use_cond else None
187
+
188
+ # Process attention outputs for the `hidden_states`.
189
+ attn_output = gate_msa.unsqueeze(1) * attn_output
190
+ hidden_states = hidden_states + attn_output
191
+
192
+ if use_cond:
193
+ cond_attn_output = cond_gate_msa.unsqueeze(1) * cond_attn_output
194
+ cond_hidden_states = cond_hidden_states + cond_attn_output
195
+
196
+ norm_hidden_states = self.norm2(hidden_states)
197
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
198
+
199
+ if use_cond:
200
+ norm_cond_hidden_states = self.norm2(cond_hidden_states)
201
+ norm_cond_hidden_states = (
202
+ norm_cond_hidden_states * (1 + cond_scale_mlp[:, None])
203
+ + cond_shift_mlp[:, None]
204
+ )
205
+
206
+ ff_output = self.ff(norm_hidden_states)
207
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
208
+ hidden_states = hidden_states + ff_output
209
+
210
+ if use_cond:
211
+ cond_ff_output = self.ff(norm_cond_hidden_states)
212
+ cond_ff_output = cond_gate_mlp.unsqueeze(1) * cond_ff_output
213
+ cond_hidden_states = cond_hidden_states + cond_ff_output
214
+
215
+ # Process attention outputs for the `encoder_hidden_states`.
216
+
217
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
218
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
219
+
220
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
221
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
222
+
223
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
224
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
225
+ if encoder_hidden_states.dtype == torch.float16:
226
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
227
+
228
+ return encoder_hidden_states, hidden_states, cond_hidden_states if use_cond else None
229
+
230
+
231
+ class FluxTransformer2DModel(
232
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin
233
+ ):
234
+ _supports_gradient_checkpointing = True
235
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
236
+
237
+ @register_to_config
238
+ def __init__(
239
+ self,
240
+ patch_size: int = 1,
241
+ in_channels: int = 64,
242
+ out_channels: Optional[int] = None,
243
+ num_layers: int = 19,
244
+ num_single_layers: int = 38,
245
+ attention_head_dim: int = 128,
246
+ num_attention_heads: int = 24,
247
+ joint_attention_dim: int = 4096,
248
+ pooled_projection_dim: int = 768,
249
+ guidance_embeds: bool = False,
250
+ axes_dims_rope: Tuple[int] = (16, 56, 56),
251
+ ):
252
+ super().__init__()
253
+ self.out_channels = out_channels or in_channels
254
+ self.inner_dim = num_attention_heads * attention_head_dim
255
+
256
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
257
+
258
+ text_time_guidance_cls = (
259
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
260
+ )
261
+ self.time_text_embed = text_time_guidance_cls(
262
+ embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
263
+ )
264
+
265
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
266
+ self.x_embedder = nn.Linear(in_channels, self.inner_dim)
267
+
268
+ self.transformer_blocks = nn.ModuleList(
269
+ [
270
+ FluxTransformerBlock(
271
+ dim=self.inner_dim,
272
+ num_attention_heads=num_attention_heads,
273
+ attention_head_dim=attention_head_dim,
274
+ )
275
+ for _ in range(num_layers)
276
+ ]
277
+ )
278
+
279
+ self.single_transformer_blocks = nn.ModuleList(
280
+ [
281
+ FluxSingleTransformerBlock(
282
+ dim=self.inner_dim,
283
+ num_attention_heads=num_attention_heads,
284
+ attention_head_dim=attention_head_dim,
285
+ )
286
+ for _ in range(num_single_layers)
287
+ ]
288
+ )
289
+
290
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
291
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
292
+
293
+ self.gradient_checkpointing = False
294
+
295
+ @property
296
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
297
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
298
+ r"""
299
+ Returns:
300
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
301
+ indexed by their weight names.
302
+ """
303
+ # set recursively
304
+ processors = {}
305
+
306
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
307
+ if hasattr(module, "get_processor"):
308
+ processors[f"{name}.processor"] = module.get_processor()
309
+
310
+ for sub_name, child in module.named_children():
311
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
312
+
313
+ return processors
314
+
315
+ for name, module in self.named_children():
316
+ fn_recursive_add_processors(name, module, processors)
317
+
318
+ return processors
319
+
320
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
321
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
322
+ r"""
323
+ Sets the attention processor to use to compute attention.
324
+
325
+ Parameters:
326
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
327
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
328
+ for **all** `Attention` layers.
329
+
330
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
331
+ processor. This is strongly recommended when setting trainable attention processors.
332
+
333
+ """
334
+ count = len(self.attn_processors.keys())
335
+
336
+ if isinstance(processor, dict) and len(processor) != count:
337
+ raise ValueError(
338
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
339
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
340
+ )
341
+
342
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
343
+ if hasattr(module, "set_processor"):
344
+ if not isinstance(processor, dict):
345
+ module.set_processor(processor)
346
+ else:
347
+ module.set_processor(processor.pop(f"{name}.processor"))
348
+
349
+ for sub_name, child in module.named_children():
350
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
351
+
352
+ for name, module in self.named_children():
353
+ fn_recursive_attn_processor(name, module, processor)
354
+
355
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
356
+ def fuse_qkv_projections(self):
357
+ """
358
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
359
+ are fused. For cross-attention modules, key and value projection matrices are fused.
360
+
361
+ <Tip warning={true}>
362
+
363
+ This API is 🧪 experimental.
364
+
365
+ </Tip>
366
+ """
367
+ self.original_attn_processors = None
368
+
369
+ for _, attn_processor in self.attn_processors.items():
370
+ if "Added" in str(attn_processor.__class__.__name__):
371
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
372
+
373
+ self.original_attn_processors = self.attn_processors
374
+
375
+ for module in self.modules():
376
+ if isinstance(module, Attention):
377
+ module.fuse_projections(fuse=True)
378
+
379
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
380
+
381
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
382
+ def unfuse_qkv_projections(self):
383
+ """Disables the fused QKV projection if enabled.
384
+
385
+ <Tip warning={true}>
386
+
387
+ This API is 🧪 experimental.
388
+
389
+ </Tip>
390
+
391
+ """
392
+ if self.original_attn_processors is not None:
393
+ self.set_attn_processor(self.original_attn_processors)
394
+
395
+ def _set_gradient_checkpointing(self, module, value=False):
396
+ if hasattr(module, "gradient_checkpointing"):
397
+ module.gradient_checkpointing = value
398
+
399
+ def forward(
400
+ self,
401
+ hidden_states: torch.Tensor,
402
+ cond_hidden_states: torch.Tensor = None,
403
+ encoder_hidden_states: torch.Tensor = None,
404
+ pooled_projections: torch.Tensor = None,
405
+ timestep: torch.LongTensor = None,
406
+ img_ids: torch.Tensor = None,
407
+ txt_ids: torch.Tensor = None,
408
+ guidance: torch.Tensor = None,
409
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
410
+ controlnet_block_samples=None,
411
+ controlnet_single_block_samples=None,
412
+ return_dict: bool = True,
413
+ controlnet_blocks_repeat: bool = False,
414
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
415
+ if cond_hidden_states is not None:
416
+ use_condition = True
417
+ else:
418
+ use_condition = False
419
+
420
+ if joint_attention_kwargs is not None:
421
+ joint_attention_kwargs = joint_attention_kwargs.copy()
422
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
423
+ else:
424
+ lora_scale = 1.0
425
+
426
+ if USE_PEFT_BACKEND:
427
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
428
+ scale_lora_layers(self, lora_scale)
429
+ else:
430
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
431
+ logger.warning(
432
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
433
+ )
434
+
435
+ hidden_states = self.x_embedder(hidden_states)
436
+ cond_hidden_states = self.x_embedder(cond_hidden_states) if use_condition else None
437
+
438
+ timestep = timestep.to(hidden_states.dtype) * 1000
439
+ if guidance is not None:
440
+ guidance = guidance.to(hidden_states.dtype) * 1000
441
+ else:
442
+ guidance = None
443
+
444
+ temb = (
445
+ self.time_text_embed(timestep, pooled_projections)
446
+ if guidance is None
447
+ else self.time_text_embed(timestep, guidance, pooled_projections)
448
+ )
449
+
450
+ cond_temb = (
451
+ self.time_text_embed(torch.ones_like(timestep) * 0, pooled_projections)
452
+ if guidance is None
453
+ else self.time_text_embed(
454
+ torch.ones_like(timestep) * 0, guidance, pooled_projections
455
+ )
456
+ )
457
+
458
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
459
+
460
+ if txt_ids.ndim == 3:
461
+ logger.warning(
462
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
463
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
464
+ )
465
+ txt_ids = txt_ids[0]
466
+ if img_ids.ndim == 3:
467
+ logger.warning(
468
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
469
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
470
+ )
471
+ img_ids = img_ids[0]
472
+
473
+ ids = torch.cat((txt_ids, img_ids), dim=0)
474
+ image_rotary_emb = self.pos_embed(ids)
475
+
476
+ if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
477
+ ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
478
+ ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
479
+ joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
480
+
481
+ for index_block, block in enumerate(self.transformer_blocks):
482
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
483
+
484
+ def create_custom_forward(module, return_dict=None):
485
+ def custom_forward(*inputs):
486
+ if return_dict is not None:
487
+ return module(*inputs, return_dict=return_dict)
488
+ else:
489
+ return module(*inputs)
490
+
491
+ return custom_forward
492
+
493
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
494
+ encoder_hidden_states, hidden_states = torch.utils.checkpoint.checkpoint(
495
+ create_custom_forward(block),
496
+ hidden_states,
497
+ encoder_hidden_states,
498
+ temb,
499
+ image_rotary_emb,
500
+ cond_temb=cond_temb if use_condition else None,
501
+ cond_hidden_states=cond_hidden_states if use_condition else None,
502
+ **ckpt_kwargs,
503
+ )
504
+
505
+ else:
506
+ encoder_hidden_states, hidden_states, cond_hidden_states = block(
507
+ hidden_states=hidden_states,
508
+ encoder_hidden_states=encoder_hidden_states,
509
+ cond_hidden_states=cond_hidden_states if use_condition else None,
510
+ temb=temb,
511
+ cond_temb=cond_temb if use_condition else None,
512
+ image_rotary_emb=image_rotary_emb,
513
+ joint_attention_kwargs=joint_attention_kwargs,
514
+ )
515
+
516
+ # controlnet residual
517
+ if controlnet_block_samples is not None:
518
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
519
+ interval_control = int(np.ceil(interval_control))
520
+ # For Xlabs ControlNet.
521
+ if controlnet_blocks_repeat:
522
+ hidden_states = (
523
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
524
+ )
525
+ else:
526
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
527
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
528
+
529
+ for index_block, block in enumerate(self.single_transformer_blocks):
530
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
531
+
532
+ def create_custom_forward(module, return_dict=None):
533
+ def custom_forward(*inputs):
534
+ if return_dict is not None:
535
+ return module(*inputs, return_dict=return_dict)
536
+ else:
537
+ return module(*inputs)
538
+
539
+ return custom_forward
540
+
541
+ ckpt_kwargs: Dict[str, Any] = {"use_reentrant": False} if is_torch_version(">=", "1.11.0") else {}
542
+ hidden_states, cond_hidden_states = torch.utils.checkpoint.checkpoint(
543
+ create_custom_forward(block),
544
+ hidden_states,
545
+ temb,
546
+ image_rotary_emb,
547
+ cond_temb=cond_temb if use_condition else None,
548
+ cond_hidden_states=cond_hidden_states if use_condition else None,
549
+ **ckpt_kwargs,
550
+ )
551
+
552
+ else:
553
+ hidden_states, cond_hidden_states = block(
554
+ hidden_states=hidden_states,
555
+ cond_hidden_states=cond_hidden_states if use_condition else None,
556
+ temb=temb,
557
+ cond_temb=cond_temb if use_condition else None,
558
+ image_rotary_emb=image_rotary_emb,
559
+ joint_attention_kwargs=joint_attention_kwargs,
560
+ )
561
+
562
+ # controlnet residual
563
+ if controlnet_single_block_samples is not None:
564
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
565
+ interval_control = int(np.ceil(interval_control))
566
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
567
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
568
+ + controlnet_single_block_samples[index_block // interval_control]
569
+ )
570
+
571
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
572
+
573
+ hidden_states = self.norm_out(hidden_states, temb)
574
+ output = self.proj_out(hidden_states)
575
+
576
+ if USE_PEFT_BACKEND:
577
+ # remove `lora_scale` from each PEFT layer
578
+ unscale_lora_layers(self, lora_scale)
579
+
580
+ if not return_dict:
581
+ return (output,)
582
+
583
+ return Transformer2DModelOutput(sample=output)
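For reference, this transformer consumes packed latents of shape (batch, (H/2)*(W/2), C*4), which is where `in_channels=64` comes from (16 VAE channels times a 2x2 patch, matching `num_channels_latents = in_channels // 4` in the pipeline). The following self-contained sketch mirrors the pipeline's `_pack_latents`/`_unpack_latents` patchification; it is a re-implementation for illustration, not an import from this repository.

import torch

def pack_latents(latents: torch.Tensor) -> torch.Tensor:
    # (B, C, H, W) -> (B, (H//2)*(W//2), C*4): each 2x2 spatial patch becomes one image token
    b, c, h, w = latents.shape
    latents = latents.view(b, c, h // 2, 2, w // 2, 2)
    latents = latents.permute(0, 2, 4, 1, 3, 5)
    return latents.reshape(b, (h // 2) * (w // 2), c * 4)

def unpack_latents(packed: torch.Tensor, h: int, w: int) -> torch.Tensor:
    # exact inverse of pack_latents for a latent grid of size h x w
    b, _, c4 = packed.shape
    c = c4 // 4
    latents = packed.view(b, h // 2, w // 2, c, 2, 2)
    latents = latents.permute(0, 3, 1, 4, 2, 5)
    return latents.reshape(b, c, h, w)

x = torch.randn(1, 16, 128, 128)   # a 128x128x16 latent, e.g. a 1024x1024 image under 8x VAE downsampling
seq = pack_latents(x)              # (1, 4096, 64): 4096 image tokens of width 64, matching in_channels above
assert torch.equal(unpack_latents(seq, 128, 128), x)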
src/transformer_with_loss.py ADDED
@@ -0,0 +1,504 @@
1
+ from typing import Any, Dict, Optional, Tuple, Union
2
+
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.nn.functional as F
7
+
8
+ from diffusers.configuration_utils import ConfigMixin, register_to_config
9
+ from diffusers.loaders import FluxTransformer2DLoadersMixin, FromOriginalModelMixin, PeftAdapterMixin
10
+ from diffusers.models.attention import FeedForward
11
+ from diffusers.models.attention_processor import (
12
+ Attention,
13
+ AttentionProcessor,
14
+ FluxAttnProcessor2_0,
15
+ FluxAttnProcessor2_0_NPU,
16
+ FusedFluxAttnProcessor2_0,
17
+ )
18
+ from diffusers.models.modeling_utils import ModelMixin
19
+ from diffusers.models.normalization import AdaLayerNormContinuous, AdaLayerNormZero, AdaLayerNormZeroSingle
20
+ from diffusers.utils import USE_PEFT_BACKEND, is_torch_version, logging, scale_lora_layers, unscale_lora_layers, deprecate
21
+ from diffusers.utils.import_utils import is_torch_npu_available
22
+ from diffusers.utils.torch_utils import maybe_allow_in_graph
23
+ from diffusers.models.embeddings import CombinedTimestepGuidanceTextProjEmbeddings, CombinedTimestepTextProjEmbeddings, FluxPosEmbed
24
+ from diffusers.models.modeling_outputs import Transformer2DModelOutput
25
+ from diffusers import CacheMixin
26
+
27
+ logger = logging.get_logger(__name__) # pylint: disable=invalid-name
28
+
29
+
30
+ @maybe_allow_in_graph
31
+ class FluxSingleTransformerBlock(nn.Module):
32
+ def __init__(self, dim: int, num_attention_heads: int, attention_head_dim: int, mlp_ratio: float = 4.0):
33
+ super().__init__()
34
+ self.mlp_hidden_dim = int(dim * mlp_ratio)
35
+
36
+ self.norm = AdaLayerNormZeroSingle(dim)
37
+ self.proj_mlp = nn.Linear(dim, self.mlp_hidden_dim)
38
+ self.act_mlp = nn.GELU(approximate="tanh")
39
+ self.proj_out = nn.Linear(dim + self.mlp_hidden_dim, dim)
40
+
41
+ if is_torch_npu_available():
42
+ deprecation_message = (
43
+ "Defaulting to FluxAttnProcessor2_0_NPU for NPU devices will be removed. Attention processors "
44
+ "should be set explicitly using the `set_attn_processor` method."
45
+ )
46
+ deprecate("npu_processor", "0.34.0", deprecation_message)
47
+ processor = FluxAttnProcessor2_0_NPU()
48
+ else:
49
+ processor = FluxAttnProcessor2_0()
50
+
51
+ self.attn = Attention(
52
+ query_dim=dim,
53
+ cross_attention_dim=None,
54
+ dim_head=attention_head_dim,
55
+ heads=num_attention_heads,
56
+ out_dim=dim,
57
+ bias=True,
58
+ processor=processor,
59
+ qk_norm="rms_norm",
60
+ eps=1e-6,
61
+ pre_only=True,
62
+ )
63
+
64
+ def forward(
65
+ self,
66
+ hidden_states: torch.Tensor,
67
+ temb: torch.Tensor,
68
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
69
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
70
+ ) -> torch.Tensor:
71
+ residual = hidden_states
72
+ norm_hidden_states, gate = self.norm(hidden_states, emb=temb)
73
+ mlp_hidden_states = self.act_mlp(self.proj_mlp(norm_hidden_states))
74
+ joint_attention_kwargs = joint_attention_kwargs or {}
75
+ attn_output = self.attn(
76
+ hidden_states=norm_hidden_states,
77
+ image_rotary_emb=image_rotary_emb,
78
+ **joint_attention_kwargs,
79
+ )
80
+
81
+ hidden_states = torch.cat([attn_output, mlp_hidden_states], dim=2)
82
+ gate = gate.unsqueeze(1)
83
+ hidden_states = gate * self.proj_out(hidden_states)
84
+ hidden_states = residual + hidden_states
85
+ if hidden_states.dtype == torch.float16:
86
+ hidden_states = hidden_states.clip(-65504, 65504)
87
+
88
+ return hidden_states
89
+
90
+
91
+ @maybe_allow_in_graph
92
+ class FluxTransformerBlock(nn.Module):
93
+ def __init__(
94
+ self, dim: int, num_attention_heads: int, attention_head_dim: int, qk_norm: str = "rms_norm", eps: float = 1e-6
95
+ ):
96
+ super().__init__()
97
+
98
+ self.norm1 = AdaLayerNormZero(dim)
99
+ self.norm1_context = AdaLayerNormZero(dim)
100
+
101
+ self.attn = Attention(
102
+ query_dim=dim,
103
+ cross_attention_dim=None,
104
+ added_kv_proj_dim=dim,
105
+ dim_head=attention_head_dim,
106
+ heads=num_attention_heads,
107
+ out_dim=dim,
108
+ context_pre_only=False,
109
+ bias=True,
110
+ processor=FluxAttnProcessor2_0(),
111
+ qk_norm=qk_norm,
112
+ eps=eps,
113
+ )
114
+
115
+ self.norm2 = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
116
+ self.ff = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
117
+
118
+ self.norm2_context = nn.LayerNorm(dim, elementwise_affine=False, eps=1e-6)
119
+ self.ff_context = FeedForward(dim=dim, dim_out=dim, activation_fn="gelu-approximate")
120
+
121
+ def forward(
122
+ self,
123
+ hidden_states: torch.Tensor,
124
+ encoder_hidden_states: torch.Tensor,
125
+ temb: torch.Tensor,
126
+ image_rotary_emb: Optional[Tuple[torch.Tensor, torch.Tensor]] = None,
127
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
128
+ ) -> Tuple[torch.Tensor, torch.Tensor]:
129
+ norm_hidden_states, gate_msa, shift_mlp, scale_mlp, gate_mlp = self.norm1(hidden_states, emb=temb)
130
+
131
+ norm_encoder_hidden_states, c_gate_msa, c_shift_mlp, c_scale_mlp, c_gate_mlp = self.norm1_context(
132
+ encoder_hidden_states, emb=temb
133
+ )
134
+ joint_attention_kwargs = joint_attention_kwargs or {}
135
+ # Attention.
136
+ attention_outputs = self.attn(
137
+ hidden_states=norm_hidden_states,
138
+ encoder_hidden_states=norm_encoder_hidden_states,
139
+ image_rotary_emb=image_rotary_emb,
140
+ **joint_attention_kwargs,
141
+ )
142
+
143
+ if len(attention_outputs) == 2:
144
+ attn_output, context_attn_output = attention_outputs
145
+ elif len(attention_outputs) == 3:
146
+ attn_output, context_attn_output, ip_attn_output = attention_outputs
147
+
148
+ # Process attention outputs for the `hidden_states`.
149
+ attn_output = gate_msa.unsqueeze(1) * attn_output
150
+ hidden_states = hidden_states + attn_output
151
+
152
+ norm_hidden_states = self.norm2(hidden_states)
153
+ norm_hidden_states = norm_hidden_states * (1 + scale_mlp[:, None]) + shift_mlp[:, None]
154
+
155
+ ff_output = self.ff(norm_hidden_states)
156
+ ff_output = gate_mlp.unsqueeze(1) * ff_output
157
+
158
+ hidden_states = hidden_states + ff_output
159
+ if len(attention_outputs) == 3:
160
+ hidden_states = hidden_states + ip_attn_output
161
+
162
+ # Process attention outputs for the `encoder_hidden_states`.
163
+
164
+ context_attn_output = c_gate_msa.unsqueeze(1) * context_attn_output
165
+ encoder_hidden_states = encoder_hidden_states + context_attn_output
166
+
167
+ norm_encoder_hidden_states = self.norm2_context(encoder_hidden_states)
168
+ norm_encoder_hidden_states = norm_encoder_hidden_states * (1 + c_scale_mlp[:, None]) + c_shift_mlp[:, None]
169
+
170
+ context_ff_output = self.ff_context(norm_encoder_hidden_states)
171
+ encoder_hidden_states = encoder_hidden_states + c_gate_mlp.unsqueeze(1) * context_ff_output
172
+ if encoder_hidden_states.dtype == torch.float16:
173
+ encoder_hidden_states = encoder_hidden_states.clip(-65504, 65504)
174
+
175
+ return encoder_hidden_states, hidden_states
176
+
177
+
178
+ class FluxTransformer2DModelWithLoss(
179
+ ModelMixin, ConfigMixin, PeftAdapterMixin, FromOriginalModelMixin, FluxTransformer2DLoadersMixin, CacheMixin
180
+ ):
181
+ _supports_gradient_checkpointing = True
182
+ _no_split_modules = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
183
+ _skip_layerwise_casting_patterns = ["pos_embed", "norm"]
184
+ _repeated_blocks = ["FluxTransformerBlock", "FluxSingleTransformerBlock"]
185
+
186
+ @register_to_config
187
+ def __init__(
188
+ self,
189
+ patch_size: int = 1,
190
+ in_channels: int = 64,
191
+ out_channels: Optional[int] = None,
192
+ num_layers: int = 19,
193
+ num_single_layers: int = 38,
194
+ attention_head_dim: int = 128,
195
+ num_attention_heads: int = 24,
196
+ joint_attention_dim: int = 4096,
197
+ pooled_projection_dim: int = 768,
198
+ guidance_embeds: bool = False,
199
+ axes_dims_rope: Tuple[int, int, int] = (16, 56, 56),
200
+ ):
201
+ super().__init__()
202
+ self.out_channels = out_channels or in_channels
203
+ self.inner_dim = num_attention_heads * attention_head_dim
204
+
205
+ self.pos_embed = FluxPosEmbed(theta=10000, axes_dim=axes_dims_rope)
206
+
207
+ text_time_guidance_cls = (
208
+ CombinedTimestepGuidanceTextProjEmbeddings if guidance_embeds else CombinedTimestepTextProjEmbeddings
209
+ )
210
+ self.time_text_embed = text_time_guidance_cls(
211
+ embedding_dim=self.inner_dim, pooled_projection_dim=pooled_projection_dim
212
+ )
213
+
214
+ self.context_embedder = nn.Linear(joint_attention_dim, self.inner_dim)
215
+ self.x_embedder = nn.Linear(in_channels, self.inner_dim)
216
+
217
+ self.transformer_blocks = nn.ModuleList(
218
+ [
219
+ FluxTransformerBlock(
220
+ dim=self.inner_dim,
221
+ num_attention_heads=num_attention_heads,
222
+ attention_head_dim=attention_head_dim,
223
+ )
224
+ for _ in range(num_layers)
225
+ ]
226
+ )
227
+
228
+ self.single_transformer_blocks = nn.ModuleList(
229
+ [
230
+ FluxSingleTransformerBlock(
231
+ dim=self.inner_dim,
232
+ num_attention_heads=num_attention_heads,
233
+ attention_head_dim=attention_head_dim,
234
+ )
235
+ for _ in range(num_single_layers)
236
+ ]
237
+ )
238
+
239
+ self.norm_out = AdaLayerNormContinuous(self.inner_dim, self.inner_dim, elementwise_affine=False, eps=1e-6)
240
+ self.proj_out = nn.Linear(self.inner_dim, patch_size * patch_size * self.out_channels, bias=True)
241
+
242
+ self.gradient_checkpointing = False
243
+
244
+ @property
245
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.attn_processors
246
+ def attn_processors(self) -> Dict[str, AttentionProcessor]:
247
+ r"""
248
+ Returns:
249
+ `dict` of attention processors: A dictionary containing all attention processors used in the model,
250
+ indexed by its weight name.
251
+ """
252
+ # set recursively
253
+ processors = {}
254
+
255
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors: Dict[str, AttentionProcessor]):
256
+ if hasattr(module, "get_processor"):
257
+ processors[f"{name}.processor"] = module.get_processor()
258
+
259
+ for sub_name, child in module.named_children():
260
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
261
+
262
+ return processors
263
+
264
+ for name, module in self.named_children():
265
+ fn_recursive_add_processors(name, module, processors)
266
+
267
+ return processors
268
+
269
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.set_attn_processor
270
+ def set_attn_processor(self, processor: Union[AttentionProcessor, Dict[str, AttentionProcessor]]):
271
+ r"""
272
+ Sets the attention processor to use to compute attention.
273
+
274
+ Parameters:
275
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
276
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
277
+ for **all** `Attention` layers.
278
+
279
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
280
+ processor. This is strongly recommended when setting trainable attention processors.
281
+
282
+ """
283
+ count = len(self.attn_processors.keys())
284
+
285
+ if isinstance(processor, dict) and len(processor) != count:
286
+ raise ValueError(
287
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
288
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
289
+ )
290
+
291
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
292
+ if hasattr(module, "set_processor"):
293
+ if not isinstance(processor, dict):
294
+ module.set_processor(processor)
295
+ else:
296
+ module.set_processor(processor.pop(f"{name}.processor"))
297
+
298
+ for sub_name, child in module.named_children():
299
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
300
+
301
+ for name, module in self.named_children():
302
+ fn_recursive_attn_processor(name, module, processor)
303
+
304
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.fuse_qkv_projections with FusedAttnProcessor2_0->FusedFluxAttnProcessor2_0
305
+ def fuse_qkv_projections(self):
306
+ """
307
+ Enables fused QKV projections. For self-attention modules, all projection matrices (i.e., query, key, value)
308
+ are fused. For cross-attention modules, key and value projection matrices are fused.
309
+
310
+ <Tip warning={true}>
311
+
312
+ This API is 🧪 experimental.
313
+
314
+ </Tip>
315
+ """
316
+ self.original_attn_processors = None
317
+
318
+ for _, attn_processor in self.attn_processors.items():
319
+ if "Added" in str(attn_processor.__class__.__name__):
320
+ raise ValueError("`fuse_qkv_projections()` is not supported for models having added KV projections.")
321
+
322
+ self.original_attn_processors = self.attn_processors
323
+
324
+ for module in self.modules():
325
+ if isinstance(module, Attention):
326
+ module.fuse_projections(fuse=True)
327
+
328
+ self.set_attn_processor(FusedFluxAttnProcessor2_0())
329
+
330
+ # Copied from diffusers.models.unets.unet_2d_condition.UNet2DConditionModel.unfuse_qkv_projections
331
+ def unfuse_qkv_projections(self):
332
+ """Disables the fused QKV projection if enabled.
333
+
334
+ <Tip warning={true}>
335
+
336
+ This API is 🧪 experimental.
337
+
338
+ </Tip>
339
+
340
+ """
341
+ if self.original_attn_processors is not None:
342
+ self.set_attn_processor(self.original_attn_processors)
343
+
344
+ def forward(
345
+ self,
346
+ hidden_states: torch.Tensor,
347
+ encoder_hidden_states: torch.Tensor = None,
348
+ pooled_projections: torch.Tensor = None,
349
+ timestep: torch.LongTensor = None,
350
+ img_ids: torch.Tensor = None,
351
+ txt_ids: torch.Tensor = None,
352
+ guidance: torch.Tensor = None,
353
+ joint_attention_kwargs: Optional[Dict[str, Any]] = None,
354
+ controlnet_block_samples=None,
355
+ controlnet_single_block_samples=None,
356
+ return_dict: bool = True,
357
+ controlnet_blocks_repeat: bool = False,
358
+ ) -> Union[torch.Tensor, Transformer2DModelOutput]:
359
+ """
360
+ The [`FluxTransformer2DModel`] forward method.
361
+
362
+ Args:
363
+ hidden_states (`torch.Tensor` of shape `(batch_size, image_sequence_length, in_channels)`):
364
+ Input `hidden_states`.
365
+ encoder_hidden_states (`torch.Tensor` of shape `(batch_size, text_sequence_length, joint_attention_dim)`):
366
+ Conditional embeddings (embeddings computed from the input conditions such as prompts) to use.
367
+ pooled_projections (`torch.Tensor` of shape `(batch_size, projection_dim)`): Embeddings projected
368
+ from the embeddings of input conditions.
369
+ timestep ( `torch.LongTensor`):
370
+ Used to indicate denoising step.
371
+ controlnet_block_samples (`list` of `torch.Tensor`):
372
+ A list of tensors that if specified are added to the residuals of transformer blocks.
373
+ joint_attention_kwargs (`dict`, *optional*):
374
+ A kwargs dictionary that if specified is passed along to the `AttentionProcessor` as defined under
375
+ `self.processor` in
376
+ [diffusers.models.attention_processor](https://github.com/huggingface/diffusers/blob/main/src/diffusers/models/attention_processor.py).
377
+ return_dict (`bool`, *optional*, defaults to `True`):
378
+ Whether or not to return a [`~models.transformer_2d.Transformer2DModelOutput`] instead of a plain
379
+ tuple.
380
+
381
+ Returns:
382
+ If `return_dict` is True, an [`~models.transformer_2d.Transformer2DModelOutput`] is returned, otherwise a
383
+ `tuple` where the first element is the sample tensor.
384
+ """
385
+ if joint_attention_kwargs is not None:
386
+ joint_attention_kwargs = joint_attention_kwargs.copy()
387
+ lora_scale = joint_attention_kwargs.pop("scale", 1.0)
388
+ else:
389
+ lora_scale = 1.0
390
+
391
+ if USE_PEFT_BACKEND:
392
+ # weight the lora layers by setting `lora_scale` for each PEFT layer
393
+ scale_lora_layers(self, lora_scale)
394
+ else:
395
+ if joint_attention_kwargs is not None and joint_attention_kwargs.get("scale", None) is not None:
396
+ logger.warning(
397
+ "Passing `scale` via `joint_attention_kwargs` when not using the PEFT backend is ineffective."
398
+ )
399
+
400
+ hidden_states = self.x_embedder(hidden_states)
401
+
402
+ timestep = timestep.to(hidden_states.dtype) * 1000
403
+ if guidance is not None:
404
+ guidance = guidance.to(hidden_states.dtype) * 1000
405
+
406
+ temb = (
407
+ self.time_text_embed(timestep, pooled_projections)
408
+ if guidance is None
409
+ else self.time_text_embed(timestep, guidance, pooled_projections)
410
+ )
411
+ encoder_hidden_states = self.context_embedder(encoder_hidden_states)
412
+
413
+ if txt_ids.ndim == 3:
414
+ logger.warning(
415
+ "Passing `txt_ids` 3d torch.Tensor is deprecated."
416
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
417
+ )
418
+ txt_ids = txt_ids[0]
419
+ if img_ids.ndim == 3:
420
+ logger.warning(
421
+ "Passing `img_ids` 3d torch.Tensor is deprecated."
422
+ "Please remove the batch dimension and pass it as a 2d torch Tensor"
423
+ )
424
+ img_ids = img_ids[0]
425
+
426
+ ids = torch.cat((txt_ids, img_ids), dim=0)
427
+ image_rotary_emb = self.pos_embed(ids)
428
+
429
+ if joint_attention_kwargs is not None and "ip_adapter_image_embeds" in joint_attention_kwargs:
430
+ ip_adapter_image_embeds = joint_attention_kwargs.pop("ip_adapter_image_embeds")
431
+ ip_hidden_states = self.encoder_hid_proj(ip_adapter_image_embeds)
432
+ joint_attention_kwargs.update({"ip_hidden_states": ip_hidden_states})
433
+
434
+ for index_block, block in enumerate(self.transformer_blocks):
435
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
436
+ encoder_hidden_states, hidden_states = self._gradient_checkpointing_func(
437
+ block,
438
+ hidden_states,
439
+ encoder_hidden_states,
440
+ temb,
441
+ image_rotary_emb,
442
+ )
443
+
444
+ else:
445
+ encoder_hidden_states, hidden_states = block(
446
+ hidden_states=hidden_states,
447
+ encoder_hidden_states=encoder_hidden_states,
448
+ temb=temb,
449
+ image_rotary_emb=image_rotary_emb,
450
+ joint_attention_kwargs=joint_attention_kwargs,
451
+ )
452
+
453
+ # controlnet residual
454
+ if controlnet_block_samples is not None:
455
+ interval_control = len(self.transformer_blocks) / len(controlnet_block_samples)
456
+ interval_control = int(np.ceil(interval_control))
457
+ # For Xlabs ControlNet.
458
+ if controlnet_blocks_repeat:
459
+ hidden_states = (
460
+ hidden_states + controlnet_block_samples[index_block % len(controlnet_block_samples)]
461
+ )
462
+ else:
463
+ hidden_states = hidden_states + controlnet_block_samples[index_block // interval_control]
464
+ hidden_states = torch.cat([encoder_hidden_states, hidden_states], dim=1)
465
+
466
+ for index_block, block in enumerate(self.single_transformer_blocks):
467
+ if torch.is_grad_enabled() and self.gradient_checkpointing:
468
+ hidden_states = self._gradient_checkpointing_func(
469
+ block,
470
+ hidden_states,
471
+ temb,
472
+ image_rotary_emb,
473
+ )
474
+
475
+ else:
476
+ hidden_states = block(
477
+ hidden_states=hidden_states,
478
+ temb=temb,
479
+ image_rotary_emb=image_rotary_emb,
480
+ joint_attention_kwargs=joint_attention_kwargs,
481
+ )
482
+
483
+ # controlnet residual
484
+ if controlnet_single_block_samples is not None:
485
+ interval_control = len(self.single_transformer_blocks) / len(controlnet_single_block_samples)
486
+ interval_control = int(np.ceil(interval_control))
487
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...] = (
488
+ hidden_states[:, encoder_hidden_states.shape[1] :, ...]
489
+ + controlnet_single_block_samples[index_block // interval_control]
490
+ )
491
+
492
+ hidden_states = hidden_states[:, encoder_hidden_states.shape[1] :, ...]
493
+
494
+ hidden_states = self.norm_out(hidden_states, temb)
495
+ output = self.proj_out(hidden_states)
496
+
497
+ if USE_PEFT_BACKEND:
498
+ # remove `lora_scale` from each PEFT layer
499
+ unscale_lora_layers(self, lora_scale)
500
+
501
+ if not return_dict:
502
+ return (output,)
503
+
504
+ return Transformer2DModelOutput(sample=output)
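Note on the ControlNet residual wiring in the forward pass above: when fewer ControlNet block samples are supplied than there are transformer blocks, each sample is reused for a contiguous run of blocks via `index_block // interval_control`, while `controlnet_blocks_repeat=True` (the XLabs-style ControlNets) cycles through the samples instead. A minimal standalone sketch of the two mappings, with block and sample counts made up for illustration:

import numpy as np

num_blocks = 19    # e.g. len(self.transformer_blocks)
num_samples = 6    # e.g. len(controlnet_block_samples)
interval = int(np.ceil(num_blocks / num_samples))  # 4

# default mapping: each sample covers `interval` consecutive blocks
print([i // interval for i in range(num_blocks)])
# [0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3, 4, 4, 4]

# controlnet_blocks_repeat=True: samples are cycled across blocks
print([i % num_samples for i in range(num_blocks)])
# [0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0]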
test_imgs/2.png ADDED

Git LFS Details

  • SHA256: 913839ca3ca83963f6b52309394a0ea3ca7b701a7290d7db41d7e4f1879e7467
  • Pointer size: 132 Bytes
  • Size of remote file: 1.04 MB
test_imgs/3.png ADDED

Git LFS Details

  • SHA256: bc366a2bbdeaa883141155214f2af4236c354457b8ca7b608ead64b21b053a2f
  • Pointer size: 131 Bytes
  • Size of remote file: 261 kB
test_imgs/generated_1.png ADDED

Git LFS Details

  • SHA256: 5e95bb2e854b04958fbcd1013a01ed929a36c5c1b43887c3fad0d646b0768703
  • Pointer size: 131 Bytes
  • Size of remote file: 323 kB
test_imgs/generated_1_bbox.png ADDED

Git LFS Details

  • SHA256: a3a5fdaf6a6998a5b8559aca5c9571cde99da1945cc8418ea8f679cd1ce6b4bf
  • Pointer size: 131 Bytes
  • Size of remote file: 383 kB
test_imgs/generated_2.png ADDED

Git LFS Details

  • SHA256: 193dca8f7fab34f802fc6ba7623346a5542871d6f190ba6a412280298fc2c6a4
  • Pointer size: 131 Bytes
  • Size of remote file: 522 kB
test_imgs/generated_2_bbox.png ADDED

Git LFS Details

  • SHA256: 6376fd6ebb4966d7a26b973663304516ee5878c0a81db93c71493c60a339050d
  • Pointer size: 131 Bytes
  • Size of remote file: 192 kB
test_imgs/generated_3.png ADDED

Git LFS Details

  • SHA256: 2087fe809477c88c3a46ade8d58d55277fa05ef922f68030e5aebc13991f9043
  • Pointer size: 132 Bytes
  • Size of remote file: 1.42 MB
test_imgs/generated_3_bbox.png ADDED

Git LFS Details

  • SHA256: fdaa47d38698863c7dc4ca34ed841b7b70370bebd772b1b2fa55eb9504bb2bd8
  • Pointer size: 131 Bytes
  • Size of remote file: 637 kB
test_imgs/generated_3_bbox_1.png ADDED

Git LFS Details

  • SHA256: 5b3ab07a05abc6c2a8808211ef642b8ec498d69ab4e5c86bc46b14c882153fb5
  • Pointer size: 131 Bytes
  • Size of remote file: 434 kB
test_imgs/product_1.jpg ADDED
test_imgs/product_1_bbox.png ADDED

Git LFS Details

  • SHA256: 17d8dd3de98689c8db96b80d6de5f3561fc4fc73f8a8bee5b892d3da235c65dc
  • Pointer size: 131 Bytes
  • Size of remote file: 352 kB
test_imgs/product_2.png ADDED

Git LFS Details

  • SHA256: 2570c1fc8e4da310f78981e9dc050bf9049f2038343fe03669ff35fb2c9c00f8
  • Pointer size: 131 Bytes
  • Size of remote file: 496 kB
test_imgs/product_2_bbox.png ADDED

Git LFS Details

  • SHA256: 3c042db4ecc731c06f4d8040d46597e8d1c9459aa06e4431f4372c9b67efd9d4
  • Pointer size: 131 Bytes
  • Size of remote file: 475 kB
test_imgs/product_3.png ADDED

Git LFS Details

  • SHA256: 3217819ed69d19c2e309897f673a47c6f7910e74e239e8cbcb182e6df72a567d
  • Pointer size: 131 Bytes
  • Size of remote file: 367 kB
test_imgs/product_3_bbox.png ADDED

Git LFS Details

  • SHA256: 61d09d4f7e03c4447cde438befe4be7bc697be5e28aaf447f23315f9dc38de41
  • Pointer size: 131 Bytes
  • Size of remote file: 138 kB
test_imgs/product_3_bbox_1.png ADDED

Git LFS Details

  • SHA256: 70dd0fff6ed5164d9ddff1c2184aa725321a70ea5598985393431d758291343d
  • Pointer size: 131 Bytes
  • Size of remote file: 147 kB
uno/dataset/uno.py ADDED
@@ -0,0 +1,132 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import json
17
+ import os
18
+
19
+ import numpy as np
20
+ import torch
21
+ import torchvision.transforms.functional as TVF
22
+ from torch.utils.data import DataLoader, Dataset
23
+ from torchvision.transforms import Compose, Normalize, ToTensor
+ from PIL import Image  # used by __getitem__ to load reference and target images
24
+
25
+ def bucket_images(images: list[torch.Tensor], resolution: int = 512):
26
+ bucket_override=[
27
+ # h w
28
+ (256, 768),
29
+ (320, 768),
30
+ (320, 704),
31
+ (384, 640),
32
+ (448, 576),
33
+ (512, 512),
34
+ (576, 448),
35
+ (640, 384),
36
+ (704, 320),
37
+ (768, 320),
38
+ (768, 256)
39
+ ]
40
+ bucket_override = [(int(h / 512 * resolution), int(w / 512 * resolution)) for h, w in bucket_override]
41
+ bucket_override = [(h // 16 * 16, w // 16 * 16) for h, w in bucket_override]
42
+
43
+ aspect_ratios = [image.shape[-2] / image.shape[-1] for image in images]
44
+ mean_aspect_ratio = np.mean(aspect_ratios)
45
+
46
+ new_h, new_w = bucket_override[0]
47
+ min_aspect_diff = np.abs(new_h / new_w - mean_aspect_ratio)
48
+ for h, w in bucket_override:
49
+ aspect_diff = np.abs(h / w - mean_aspect_ratio)
50
+ if aspect_diff < min_aspect_diff:
51
+ min_aspect_diff = aspect_diff
52
+ new_h, new_w = h, w
53
+
54
+ images = [TVF.resize(image, (new_h, new_w)) for image in images]
55
+ images = torch.stack(images, dim=0)
56
+ return images
57
+
58
+ class FluxPairedDatasetV2(Dataset):
59
+ def __init__(self, json_file: str, resolution: int, resolution_ref: int | None = None):
60
+ super().__init__()
61
+ self.json_file = json_file
62
+ self.resolution = resolution
63
+ self.resolution_ref = resolution_ref if resolution_ref is not None else resolution
64
+ self.image_root = os.path.dirname(json_file)
65
+
66
+ with open(self.json_file, "rt") as f:
67
+ self.data_dicts = json.load(f)
68
+
69
+ self.transform = Compose([
70
+ ToTensor(),
71
+ Normalize([0.5], [0.5]),
72
+ ])
73
+
74
+ def __getitem__(self, idx):
75
+ data_dict = self.data_dicts[idx]
76
+ image_paths = [data_dict["image_path"]] if "image_path" in data_dict else data_dict["image_paths"]
77
+ txt = data_dict["prompt"]
78
+ image_tgt_path = data_dict.get("image_tgt_path", None)
79
+ ref_imgs = [
80
+ Image.open(os.path.join(self.image_root, path)).convert("RGB")
81
+ for path in image_paths
82
+ ]
83
+ ref_imgs = [self.transform(img) for img in ref_imgs]
84
+ img = None
85
+ if image_tgt_path is not None:
86
+ img = Image.open(os.path.join(self.image_root, image_tgt_path)).convert("RGB")
87
+ img = self.transform(img)
88
+
89
+ return {
90
+ "img": img,
91
+ "txt": txt,
92
+ "ref_imgs": ref_imgs,
93
+ }
94
+
95
+ def __len__(self):
96
+ return len(self.data_dicts)
97
+
98
+ def collate_fn(self, batch):
99
+ img = [data["img"] for data in batch]
100
+ txt = [data["txt"] for data in batch]
101
+ ref_imgs = [data["ref_imgs"] for data in batch]
102
+ assert all([len(ref_imgs[0]) == len(ref_imgs[i]) for i in range(len(ref_imgs))])
103
+
104
+ n_ref = len(ref_imgs[0])
105
+
106
+ img = bucket_images(img, self.resolution)
107
+ ref_imgs_new = []
108
+ for i in range(n_ref):
109
+ ref_imgs_i = [refs[i] for refs in ref_imgs]
110
+ ref_imgs_i = bucket_images(ref_imgs_i, self.resolution_ref)
111
+ ref_imgs_new.append(ref_imgs_i)
112
+
113
+ return {
114
+ "txt": txt,
115
+ "img": img,
116
+ "ref_imgs": ref_imgs_new,
117
+ }
118
+
119
+ if __name__ == '__main__':
120
+ import argparse
121
+ from pprint import pprint
122
+ parser = argparse.ArgumentParser()
123
+ # parser.add_argument("--json_file", type=str, required=True)
124
+ parser.add_argument("--json_file", type=str, default="datasets/fake_train_data.json")
125
+ args = parser.parse_args()
126
+ dataset = FluxPairedDatasetV2(args.json_file, 512)
127
+ dataloader = DataLoader(dataset, batch_size=4, collate_fn=dataset.collate_fn)
128
+
129
+ for i, data_dict in enumerate(dataloader):
130
+ pprint(i)
131
+ pprint(data_dict)
132
+ breakpoint()
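For reference, a sketch of the JSON layout `FluxPairedDatasetV2` expects, inferred from `__getitem__` above: each record carries `image_paths` (or a single `image_path`), a `prompt`, and an optional `image_tgt_path`, with all paths resolved relative to the JSON file's directory. The file names below are made up, and the import assumes the repo root is on PYTHONPATH:

import json, os, tempfile
from PIL import Image
from torch.utils.data import DataLoader
from uno.dataset.uno import FluxPairedDatasetV2

root = tempfile.mkdtemp()
Image.new("RGB", (640, 480), "white").save(os.path.join(root, "ref_0.png"))
Image.new("RGB", (512, 512), "gray").save(os.path.join(root, "tgt_0.png"))
records = [{
    "image_paths": ["ref_0.png"],                   # one or more reference images
    "prompt": "a product photo on a wooden table",
    "image_tgt_path": "tgt_0.png",                  # optional target image
}]
json_path = os.path.join(root, "train.json")
with open(json_path, "wt") as f:
    json.dump(records, f)

ds = FluxPairedDatasetV2(json_path, resolution=512)
dl = DataLoader(ds, batch_size=1, collate_fn=ds.collate_fn)
batch = next(iter(dl))
print(batch["img"].shape)                    # torch.Size([1, 3, 512, 512])
print([r.shape for r in batch["ref_imgs"]])  # [torch.Size([1, 3, 448, 576])], nearest aspect bucket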
uno/flux/math.py ADDED
@@ -0,0 +1,45 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import torch
17
+ from einops import rearrange
18
+ from torch import Tensor
19
+
20
+
21
+ def attention(q: Tensor, k: Tensor, v: Tensor, pe: Tensor) -> Tensor:
22
+ q, k = apply_rope(q, k, pe)
23
+
24
+ x = torch.nn.functional.scaled_dot_product_attention(q, k, v)
25
+ x = rearrange(x, "B H L D -> B L (H D)")
26
+
27
+ return x
28
+
29
+
30
+ def rope(pos: Tensor, dim: int, theta: int) -> Tensor:
31
+ assert dim % 2 == 0
32
+ scale = torch.arange(0, dim, 2, dtype=torch.float64, device=pos.device) / dim
33
+ omega = 1.0 / (theta**scale)
34
+ out = torch.einsum("...n,d->...nd", pos, omega)
35
+ out = torch.stack([torch.cos(out), -torch.sin(out), torch.sin(out), torch.cos(out)], dim=-1)
36
+ out = rearrange(out, "b n d (i j) -> b n d i j", i=2, j=2)
37
+ return out.float()
38
+
39
+
40
+ def apply_rope(xq: Tensor, xk: Tensor, freqs_cis: Tensor) -> tuple[Tensor, Tensor]:
41
+ xq_ = xq.float().reshape(*xq.shape[:-1], -1, 1, 2)
42
+ xk_ = xk.float().reshape(*xk.shape[:-1], -1, 1, 2)
43
+ xq_out = freqs_cis[..., 0] * xq_[..., 0] + freqs_cis[..., 1] * xq_[..., 1]
44
+ xk_out = freqs_cis[..., 0] * xk_[..., 0] + freqs_cis[..., 1] * xk_[..., 1]
45
+ return xq_out.reshape(*xq.shape).type_as(xq), xk_out.reshape(*xk.shape).type_as(xk)
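A quick shape check for the RoPE helpers above (tensor sizes chosen arbitrarily): `rope` produces per-position rotation matrices of shape (..., dim/2, 2, 2), `apply_rope` rotates query/key channels pairwise, and `attention` folds the heads back into the channel dimension. The import assumes the repo root is on PYTHONPATH:

import torch
from uno.flux.math import attention, rope

B, H, L, D = 1, 2, 8, 16                           # D must be even for RoPE
pos = torch.arange(L, dtype=torch.float64)[None]   # (B, L) token positions
pe = rope(pos, D, theta=10_000)                    # (B, L, D/2, 2, 2)
pe = pe.unsqueeze(1)                               # (B, 1, L, D/2, 2, 2), broadcasts over heads

q = torch.randn(B, H, L, D)
k = torch.randn(B, H, L, D)
v = torch.randn(B, H, L, D)
out = attention(q, k, v, pe=pe)                    # heads merged: (B, L, H * D)
print(out.shape)                                   # torch.Size([1, 8, 32])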
uno/flux/model.py ADDED
@@ -0,0 +1,222 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+
18
+ import torch
19
+ from torch import Tensor, nn
20
+
21
+ from .modules.layers import DoubleStreamBlock, EmbedND, LastLayer, MLPEmbedder, SingleStreamBlock, timestep_embedding
22
+
23
+
24
+ @dataclass
25
+ class FluxParams:
26
+ in_channels: int
27
+ vec_in_dim: int
28
+ context_in_dim: int
29
+ hidden_size: int
30
+ mlp_ratio: float
31
+ num_heads: int
32
+ depth: int
33
+ depth_single_blocks: int
34
+ axes_dim: list[int]
35
+ theta: int
36
+ qkv_bias: bool
37
+ guidance_embed: bool
38
+
39
+
40
+ class Flux(nn.Module):
41
+ """
42
+ Transformer model for flow matching on sequences.
43
+ """
44
+ _supports_gradient_checkpointing = True
45
+
46
+ def __init__(self, params: FluxParams):
47
+ super().__init__()
48
+
49
+ self.params = params
50
+ self.in_channels = params.in_channels
51
+ self.out_channels = self.in_channels
52
+ if params.hidden_size % params.num_heads != 0:
53
+ raise ValueError(
54
+ f"Hidden size {params.hidden_size} must be divisible by num_heads {params.num_heads}"
55
+ )
56
+ pe_dim = params.hidden_size // params.num_heads
57
+ if sum(params.axes_dim) != pe_dim:
58
+ raise ValueError(f"Got {params.axes_dim} but expected positional dim {pe_dim}")
59
+ self.hidden_size = params.hidden_size
60
+ self.num_heads = params.num_heads
61
+ self.pe_embedder = EmbedND(dim=pe_dim, theta=params.theta, axes_dim=params.axes_dim)
62
+ self.img_in = nn.Linear(self.in_channels, self.hidden_size, bias=True)
63
+ self.time_in = MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size)
64
+ self.vector_in = MLPEmbedder(params.vec_in_dim, self.hidden_size)
65
+ self.guidance_in = (
66
+ MLPEmbedder(in_dim=256, hidden_dim=self.hidden_size) if params.guidance_embed else nn.Identity()
67
+ )
68
+ self.txt_in = nn.Linear(params.context_in_dim, self.hidden_size)
69
+
70
+ self.double_blocks = nn.ModuleList(
71
+ [
72
+ DoubleStreamBlock(
73
+ self.hidden_size,
74
+ self.num_heads,
75
+ mlp_ratio=params.mlp_ratio,
76
+ qkv_bias=params.qkv_bias,
77
+ )
78
+ for _ in range(params.depth)
79
+ ]
80
+ )
81
+
82
+ self.single_blocks = nn.ModuleList(
83
+ [
84
+ SingleStreamBlock(self.hidden_size, self.num_heads, mlp_ratio=params.mlp_ratio)
85
+ for _ in range(params.depth_single_blocks)
86
+ ]
87
+ )
88
+
89
+ self.final_layer = LastLayer(self.hidden_size, 1, self.out_channels)
90
+ self.gradient_checkpointing = False
91
+
92
+ def _set_gradient_checkpointing(self, module, value=False):
93
+ if hasattr(module, "gradient_checkpointing"):
94
+ module.gradient_checkpointing = value
95
+
96
+ @property
97
+ def attn_processors(self):
98
+ # set recursively
99
+ processors = {} # type: dict[str, nn.Module]
100
+
101
+ def fn_recursive_add_processors(name: str, module: torch.nn.Module, processors):
102
+ if hasattr(module, "set_processor"):
103
+ processors[f"{name}.processor"] = module.processor
104
+
105
+ for sub_name, child in module.named_children():
106
+ fn_recursive_add_processors(f"{name}.{sub_name}", child, processors)
107
+
108
+ return processors
109
+
110
+ for name, module in self.named_children():
111
+ fn_recursive_add_processors(name, module, processors)
112
+
113
+ return processors
114
+
115
+ def set_attn_processor(self, processor):
116
+ r"""
117
+ Sets the attention processor to use to compute attention.
118
+
119
+ Parameters:
120
+ processor (`dict` of `AttentionProcessor` or only `AttentionProcessor`):
121
+ The instantiated processor class or a dictionary of processor classes that will be set as the processor
122
+ for **all** `Attention` layers.
123
+
124
+ If `processor` is a dict, the key needs to define the path to the corresponding cross attention
125
+ processor. This is strongly recommended when setting trainable attention processors.
126
+
127
+ """
128
+ count = len(self.attn_processors.keys())
129
+
130
+ if isinstance(processor, dict) and len(processor) != count:
131
+ raise ValueError(
132
+ f"A dict of processors was passed, but the number of processors {len(processor)} does not match the"
133
+ f" number of attention layers: {count}. Please make sure to pass {count} processor classes."
134
+ )
135
+
136
+ def fn_recursive_attn_processor(name: str, module: torch.nn.Module, processor):
137
+ if hasattr(module, "set_processor"):
138
+ if not isinstance(processor, dict):
139
+ module.set_processor(processor)
140
+ else:
141
+ module.set_processor(processor.pop(f"{name}.processor"))
142
+
143
+ for sub_name, child in module.named_children():
144
+ fn_recursive_attn_processor(f"{name}.{sub_name}", child, processor)
145
+
146
+ for name, module in self.named_children():
147
+ fn_recursive_attn_processor(name, module, processor)
148
+
149
+ def forward(
150
+ self,
151
+ img: Tensor,
152
+ img_ids: Tensor,
153
+ txt: Tensor,
154
+ txt_ids: Tensor,
155
+ timesteps: Tensor,
156
+ y: Tensor,
157
+ guidance: Tensor | None = None,
158
+ ref_img: Tensor | None = None,
159
+ ref_img_ids: Tensor | None = None,
160
+ ) -> Tensor:
161
+ if img.ndim != 3 or txt.ndim != 3:
162
+ raise ValueError("Input img and txt tensors must have 3 dimensions.")
163
+
164
+ # running on sequences img
165
+ img = self.img_in(img)
166
+ vec = self.time_in(timestep_embedding(timesteps, 256))
167
+ if self.params.guidance_embed:
168
+ if guidance is None:
169
+ raise ValueError("Didn't get guidance strength for guidance distilled model.")
170
+ vec = vec + self.guidance_in(timestep_embedding(guidance, 256))
171
+ vec = vec + self.vector_in(y)
172
+ txt = self.txt_in(txt)
173
+
174
+ ids = torch.cat((txt_ids, img_ids), dim=1)
175
+
176
+ # concat ref_img/img
177
+ img_end = img.shape[1]
178
+ if ref_img is not None:
179
+ if isinstance(ref_img, tuple) or isinstance(ref_img, list):
180
+ img_in = [img] + [self.img_in(ref) for ref in ref_img]
181
+ img_ids = [ids] + [ref_ids for ref_ids in ref_img_ids]
182
+ img = torch.cat(img_in, dim=1)
183
+ ids = torch.cat(img_ids, dim=1)
184
+ else:
185
+ img = torch.cat((img, self.img_in(ref_img)), dim=1)
186
+ ids = torch.cat((ids, ref_img_ids), dim=1)
187
+ pe = self.pe_embedder(ids)
188
+
189
+ for index_block, block in enumerate(self.double_blocks):
190
+ if self.training and self.gradient_checkpointing:
191
+ img, txt = torch.utils.checkpoint.checkpoint(
192
+ block,
193
+ img=img,
194
+ txt=txt,
195
+ vec=vec,
196
+ pe=pe,
197
+ use_reentrant=False,
198
+ )
199
+ else:
200
+ img, txt = block(
201
+ img=img,
202
+ txt=txt,
203
+ vec=vec,
204
+ pe=pe
205
+ )
206
+
207
+ img = torch.cat((txt, img), 1)
208
+ for block in self.single_blocks:
209
+ if self.training and self.gradient_checkpointing:
210
+ img = torch.utils.checkpoint.checkpoint(
211
+ block,
212
+ img, vec=vec, pe=pe,
213
+ use_reentrant=False
214
+ )
215
+ else:
216
+ img = block(img, vec=vec, pe=pe)
217
+ img = img[:, txt.shape[1] :, ...]
218
+ # index img
219
+ img = img[:, :img_end, ...]
220
+
221
+ img = self.final_layer(img, vec) # (N, T, patch_size ** 2 * out_channels)
222
+ return img
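A smoke test for the `Flux` module above with deliberately tiny, made-up dimensions (the released FLUX checkpoints use hidden size 3072 with 19 double and 38 single blocks, as in the transformer config earlier in this commit); it only checks that hidden_size is divisible by num_heads, that axes_dim sums to the head dimension, and that shapes round-trip:

import torch
from uno.flux.model import Flux, FluxParams

params = FluxParams(
    in_channels=16, vec_in_dim=32, context_in_dim=32,
    hidden_size=64, mlp_ratio=4.0, num_heads=4,        # head_dim = 16
    depth=1, depth_single_blocks=1,
    axes_dim=[4, 6, 6], theta=10_000,                  # sums to head_dim
    qkv_bias=True, guidance_embed=False,
)
model = Flux(params)

B, L_img, L_txt = 1, 32, 8
img = torch.randn(B, L_img, 16)                            # packed latent tokens
txt = torch.randn(B, L_txt, 32)                            # text tokens (context_in_dim)
img_ids = torch.zeros(B, L_img, 3, dtype=torch.float64)    # positional ids consumed by RoPE
txt_ids = torch.zeros(B, L_txt, 3, dtype=torch.float64)
out = model(
    img=img, img_ids=img_ids, txt=txt, txt_ids=txt_ids,
    timesteps=torch.tensor([0.5]), y=torch.randn(B, 32),
)
print(out.shape)                                           # torch.Size([1, 32, 16])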
uno/flux/modules/autoencoder.py ADDED
@@ -0,0 +1,327 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from dataclasses import dataclass
17
+
18
+ import torch
19
+ from einops import rearrange
20
+ from torch import Tensor, nn
21
+
22
+
23
+ @dataclass
24
+ class AutoEncoderParams:
25
+ resolution: int
26
+ in_channels: int
27
+ ch: int
28
+ out_ch: int
29
+ ch_mult: list[int]
30
+ num_res_blocks: int
31
+ z_channels: int
32
+ scale_factor: float
33
+ shift_factor: float
34
+
35
+
36
+ def swish(x: Tensor) -> Tensor:
37
+ return x * torch.sigmoid(x)
38
+
39
+
40
+ class AttnBlock(nn.Module):
41
+ def __init__(self, in_channels: int):
42
+ super().__init__()
43
+ self.in_channels = in_channels
44
+
45
+ self.norm = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
46
+
47
+ self.q = nn.Conv2d(in_channels, in_channels, kernel_size=1)
48
+ self.k = nn.Conv2d(in_channels, in_channels, kernel_size=1)
49
+ self.v = nn.Conv2d(in_channels, in_channels, kernel_size=1)
50
+ self.proj_out = nn.Conv2d(in_channels, in_channels, kernel_size=1)
51
+
52
+ def attention(self, h_: Tensor) -> Tensor:
53
+ h_ = self.norm(h_)
54
+ q = self.q(h_)
55
+ k = self.k(h_)
56
+ v = self.v(h_)
57
+
58
+ b, c, h, w = q.shape
59
+ q = rearrange(q, "b c h w -> b 1 (h w) c").contiguous()
60
+ k = rearrange(k, "b c h w -> b 1 (h w) c").contiguous()
61
+ v = rearrange(v, "b c h w -> b 1 (h w) c").contiguous()
62
+ h_ = nn.functional.scaled_dot_product_attention(q, k, v)
63
+
64
+ return rearrange(h_, "b 1 (h w) c -> b c h w", h=h, w=w, c=c, b=b)
65
+
66
+ def forward(self, x: Tensor) -> Tensor:
67
+ return x + self.proj_out(self.attention(x))
68
+
69
+
70
+ class ResnetBlock(nn.Module):
71
+ def __init__(self, in_channels: int, out_channels: int):
72
+ super().__init__()
73
+ self.in_channels = in_channels
74
+ out_channels = in_channels if out_channels is None else out_channels
75
+ self.out_channels = out_channels
76
+
77
+ self.norm1 = nn.GroupNorm(num_groups=32, num_channels=in_channels, eps=1e-6, affine=True)
78
+ self.conv1 = nn.Conv2d(in_channels, out_channels, kernel_size=3, stride=1, padding=1)
79
+ self.norm2 = nn.GroupNorm(num_groups=32, num_channels=out_channels, eps=1e-6, affine=True)
80
+ self.conv2 = nn.Conv2d(out_channels, out_channels, kernel_size=3, stride=1, padding=1)
81
+ if self.in_channels != self.out_channels:
82
+ self.nin_shortcut = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, padding=0)
83
+
84
+ def forward(self, x):
85
+ h = x
86
+ h = self.norm1(h)
87
+ h = swish(h)
88
+ h = self.conv1(h)
89
+
90
+ h = self.norm2(h)
91
+ h = swish(h)
92
+ h = self.conv2(h)
93
+
94
+ if self.in_channels != self.out_channels:
95
+ x = self.nin_shortcut(x)
96
+
97
+ return x + h
98
+
99
+
100
+ class Downsample(nn.Module):
101
+ def __init__(self, in_channels: int):
102
+ super().__init__()
103
+ # no asymmetric padding in torch conv, must do it ourselves
104
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=2, padding=0)
105
+
106
+ def forward(self, x: Tensor):
107
+ pad = (0, 1, 0, 1)
108
+ x = nn.functional.pad(x, pad, mode="constant", value=0)
109
+ x = self.conv(x)
110
+ return x
111
+
112
+
113
+ class Upsample(nn.Module):
114
+ def __init__(self, in_channels: int):
115
+ super().__init__()
116
+ self.conv = nn.Conv2d(in_channels, in_channels, kernel_size=3, stride=1, padding=1)
117
+
118
+ def forward(self, x: Tensor):
119
+ x = nn.functional.interpolate(x, scale_factor=2.0, mode="nearest")
120
+ x = self.conv(x)
121
+ return x
122
+
123
+
124
+ class Encoder(nn.Module):
125
+ def __init__(
126
+ self,
127
+ resolution: int,
128
+ in_channels: int,
129
+ ch: int,
130
+ ch_mult: list[int],
131
+ num_res_blocks: int,
132
+ z_channels: int,
133
+ ):
134
+ super().__init__()
135
+ self.ch = ch
136
+ self.num_resolutions = len(ch_mult)
137
+ self.num_res_blocks = num_res_blocks
138
+ self.resolution = resolution
139
+ self.in_channels = in_channels
140
+ # downsampling
141
+ self.conv_in = nn.Conv2d(in_channels, self.ch, kernel_size=3, stride=1, padding=1)
142
+
143
+ curr_res = resolution
144
+ in_ch_mult = (1,) + tuple(ch_mult)
145
+ self.in_ch_mult = in_ch_mult
146
+ self.down = nn.ModuleList()
147
+ block_in = self.ch
148
+ for i_level in range(self.num_resolutions):
149
+ block = nn.ModuleList()
150
+ attn = nn.ModuleList()
151
+ block_in = ch * in_ch_mult[i_level]
152
+ block_out = ch * ch_mult[i_level]
153
+ for _ in range(self.num_res_blocks):
154
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
155
+ block_in = block_out
156
+ down = nn.Module()
157
+ down.block = block
158
+ down.attn = attn
159
+ if i_level != self.num_resolutions - 1:
160
+ down.downsample = Downsample(block_in)
161
+ curr_res = curr_res // 2
162
+ self.down.append(down)
163
+
164
+ # middle
165
+ self.mid = nn.Module()
166
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
167
+ self.mid.attn_1 = AttnBlock(block_in)
168
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
169
+
170
+ # end
171
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
172
+ self.conv_out = nn.Conv2d(block_in, 2 * z_channels, kernel_size=3, stride=1, padding=1)
173
+
174
+ def forward(self, x: Tensor) -> Tensor:
175
+ # downsampling
176
+ hs = [self.conv_in(x)]
177
+ for i_level in range(self.num_resolutions):
178
+ for i_block in range(self.num_res_blocks):
179
+ h = self.down[i_level].block[i_block](hs[-1])
180
+ if len(self.down[i_level].attn) > 0:
181
+ h = self.down[i_level].attn[i_block](h)
182
+ hs.append(h)
183
+ if i_level != self.num_resolutions - 1:
184
+ hs.append(self.down[i_level].downsample(hs[-1]))
185
+
186
+ # middle
187
+ h = hs[-1]
188
+ h = self.mid.block_1(h)
189
+ h = self.mid.attn_1(h)
190
+ h = self.mid.block_2(h)
191
+ # end
192
+ h = self.norm_out(h)
193
+ h = swish(h)
194
+ h = self.conv_out(h)
195
+ return h
196
+
197
+
198
+ class Decoder(nn.Module):
199
+ def __init__(
200
+ self,
201
+ ch: int,
202
+ out_ch: int,
203
+ ch_mult: list[int],
204
+ num_res_blocks: int,
205
+ in_channels: int,
206
+ resolution: int,
207
+ z_channels: int,
208
+ ):
209
+ super().__init__()
210
+ self.ch = ch
211
+ self.num_resolutions = len(ch_mult)
212
+ self.num_res_blocks = num_res_blocks
213
+ self.resolution = resolution
214
+ self.in_channels = in_channels
215
+ self.ffactor = 2 ** (self.num_resolutions - 1)
216
+
217
+ # compute in_ch_mult, block_in and curr_res at lowest res
218
+ block_in = ch * ch_mult[self.num_resolutions - 1]
219
+ curr_res = resolution // 2 ** (self.num_resolutions - 1)
220
+ self.z_shape = (1, z_channels, curr_res, curr_res)
221
+
222
+ # z to block_in
223
+ self.conv_in = nn.Conv2d(z_channels, block_in, kernel_size=3, stride=1, padding=1)
224
+
225
+ # middle
226
+ self.mid = nn.Module()
227
+ self.mid.block_1 = ResnetBlock(in_channels=block_in, out_channels=block_in)
228
+ self.mid.attn_1 = AttnBlock(block_in)
229
+ self.mid.block_2 = ResnetBlock(in_channels=block_in, out_channels=block_in)
230
+
231
+ # upsampling
232
+ self.up = nn.ModuleList()
233
+ for i_level in reversed(range(self.num_resolutions)):
234
+ block = nn.ModuleList()
235
+ attn = nn.ModuleList()
236
+ block_out = ch * ch_mult[i_level]
237
+ for _ in range(self.num_res_blocks + 1):
238
+ block.append(ResnetBlock(in_channels=block_in, out_channels=block_out))
239
+ block_in = block_out
240
+ up = nn.Module()
241
+ up.block = block
242
+ up.attn = attn
243
+ if i_level != 0:
244
+ up.upsample = Upsample(block_in)
245
+ curr_res = curr_res * 2
246
+ self.up.insert(0, up) # prepend to get consistent order
247
+
248
+ # end
249
+ self.norm_out = nn.GroupNorm(num_groups=32, num_channels=block_in, eps=1e-6, affine=True)
250
+ self.conv_out = nn.Conv2d(block_in, out_ch, kernel_size=3, stride=1, padding=1)
251
+
252
+ def forward(self, z: Tensor) -> Tensor:
253
+ # z to block_in
254
+ h = self.conv_in(z)
255
+
256
+ # middle
257
+ h = self.mid.block_1(h)
258
+ h = self.mid.attn_1(h)
259
+ h = self.mid.block_2(h)
260
+
261
+ # upsampling
262
+ for i_level in reversed(range(self.num_resolutions)):
263
+ for i_block in range(self.num_res_blocks + 1):
264
+ h = self.up[i_level].block[i_block](h)
265
+ if len(self.up[i_level].attn) > 0:
266
+ h = self.up[i_level].attn[i_block](h)
267
+ if i_level != 0:
268
+ h = self.up[i_level].upsample(h)
269
+
270
+ # end
271
+ h = self.norm_out(h)
272
+ h = swish(h)
273
+ h = self.conv_out(h)
274
+ return h
275
+
276
+
277
+ class DiagonalGaussian(nn.Module):
278
+ def __init__(self, sample: bool = True, chunk_dim: int = 1):
279
+ super().__init__()
280
+ self.sample = sample
281
+ self.chunk_dim = chunk_dim
282
+
283
+ def forward(self, z: Tensor) -> Tensor:
284
+ mean, logvar = torch.chunk(z, 2, dim=self.chunk_dim)
285
+ if self.sample:
286
+ std = torch.exp(0.5 * logvar)
287
+ return mean + std * torch.randn_like(mean)
288
+ else:
289
+ return mean
290
+
291
+
292
+ class AutoEncoder(nn.Module):
293
+ def __init__(self, params: AutoEncoderParams):
294
+ super().__init__()
295
+ self.encoder = Encoder(
296
+ resolution=params.resolution,
297
+ in_channels=params.in_channels,
298
+ ch=params.ch,
299
+ ch_mult=params.ch_mult,
300
+ num_res_blocks=params.num_res_blocks,
301
+ z_channels=params.z_channels,
302
+ )
303
+ self.decoder = Decoder(
304
+ resolution=params.resolution,
305
+ in_channels=params.in_channels,
306
+ ch=params.ch,
307
+ out_ch=params.out_ch,
308
+ ch_mult=params.ch_mult,
309
+ num_res_blocks=params.num_res_blocks,
310
+ z_channels=params.z_channels,
311
+ )
312
+ self.reg = DiagonalGaussian()
313
+
314
+ self.scale_factor = params.scale_factor
315
+ self.shift_factor = params.shift_factor
316
+
317
+ def encode(self, x: Tensor) -> Tensor:
318
+ z = self.reg(self.encoder(x))
319
+ z = self.scale_factor * (z - self.shift_factor)
320
+ return z
321
+
322
+ def decode(self, z: Tensor) -> Tensor:
323
+ z = z / self.scale_factor + self.shift_factor
324
+ return self.decoder(z)
325
+
326
+ def forward(self, x: Tensor) -> Tensor:
327
+ return self.decode(self.encode(x))
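A toy round-trip through the `AutoEncoder` above; the configuration values are made up so the example runs on CPU in a moment (the real FLUX VAE uses a much larger `ch`, four `ch_mult` levels, and non-trivial scale/shift factors). One spatial 2x downsample happens per `ch_mult` level beyond the first:

import torch
from uno.flux.modules.autoencoder import AutoEncoder, AutoEncoderParams

params = AutoEncoderParams(
    resolution=64, in_channels=3, ch=32, out_ch=3,
    ch_mult=[1, 2], num_res_blocks=1, z_channels=4,
    scale_factor=1.0, shift_factor=0.0,
)
ae = AutoEncoder(params)

x = torch.randn(1, 3, 64, 64)
z = ae.encode(x)        # sampled latent: (1, 4, 32, 32)
x_rec = ae.decode(z)    # reconstruction: (1, 3, 64, 64)
print(z.shape, x_rec.shape)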
uno/flux/modules/conditioner.py ADDED
@@ -0,0 +1,53 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ from torch import Tensor, nn
17
+ from transformers import (CLIPTextModel, CLIPTokenizer, T5EncoderModel,
18
+ T5Tokenizer)
19
+
20
+
21
+ class HFEmbedder(nn.Module):
22
+ def __init__(self, version: str, max_length: int, **hf_kwargs):
23
+ super().__init__()
24
+ self.is_clip = version.startswith("openai")
25
+ self.max_length = max_length
26
+ self.output_key = "pooler_output" if self.is_clip else "last_hidden_state"
27
+
28
+ if self.is_clip:
29
+ self.tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(version, max_length=max_length)
30
+ self.hf_module: CLIPTextModel = CLIPTextModel.from_pretrained(version, **hf_kwargs)
31
+ else:
32
+ self.tokenizer: T5Tokenizer = T5Tokenizer.from_pretrained(version, max_length=max_length)
33
+ self.hf_module: T5EncoderModel = T5EncoderModel.from_pretrained(version, **hf_kwargs)
34
+
35
+ self.hf_module = self.hf_module.eval().requires_grad_(False)
36
+
37
+ def forward(self, text: list[str]) -> Tensor:
38
+ batch_encoding = self.tokenizer(
39
+ text,
40
+ truncation=True,
41
+ max_length=self.max_length,
42
+ return_length=False,
43
+ return_overflowing_tokens=False,
44
+ padding="max_length",
45
+ return_tensors="pt",
46
+ )
47
+
48
+ outputs = self.hf_module(
49
+ input_ids=batch_encoding["input_ids"].to(self.hf_module.device),
50
+ attention_mask=None,
51
+ output_hidden_states=False,
52
+ )
53
+ return outputs[self.output_key]
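A usage sketch for `HFEmbedder`: a version string starting with "openai" selects the CLIP branch and returns pooled embeddings, anything else selects the T5 branch and returns per-token hidden states. The model names and max lengths below are common choices, not taken from this file, and both require downloading the corresponding weights:

from uno.flux.modules.conditioner import HFEmbedder

clip = HFEmbedder("openai/clip-vit-large-patch14", max_length=77)
t5 = HFEmbedder("google/t5-v1_1-small", max_length=256)

prompt = ["a product photo on a marble table"]
vec = clip(prompt)   # pooler_output, shape (1, 768) for this CLIP
txt = t5(prompt)     # last_hidden_state, shape (1, 256, d_model)
print(vec.shape, txt.shape)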
uno/flux/modules/layers.py ADDED
@@ -0,0 +1,435 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from dataclasses import dataclass
18
+
19
+ import torch
20
+ from einops import rearrange
21
+ from torch import Tensor, nn
22
+
23
+ from ..math import attention, rope
24
+ import torch.nn.functional as F
25
+
26
+ class EmbedND(nn.Module):
27
+ def __init__(self, dim: int, theta: int, axes_dim: list[int]):
28
+ super().__init__()
29
+ self.dim = dim
30
+ self.theta = theta
31
+ self.axes_dim = axes_dim
32
+
33
+ def forward(self, ids: Tensor) -> Tensor:
34
+ n_axes = ids.shape[-1]
35
+ emb = torch.cat(
36
+ [rope(ids[..., i], self.axes_dim[i], self.theta) for i in range(n_axes)],
37
+ dim=-3,
38
+ )
39
+
40
+ return emb.unsqueeze(1)
41
+
42
+
43
+ def timestep_embedding(t: Tensor, dim, max_period=10000, time_factor: float = 1000.0):
44
+ """
45
+ Create sinusoidal timestep embeddings.
46
+ :param t: a 1-D Tensor of N indices, one per batch element.
47
+ These may be fractional.
48
+ :param dim: the dimension of the output.
49
+ :param max_period: controls the minimum frequency of the embeddings.
50
+ :return: an (N, D) Tensor of positional embeddings.
51
+ """
52
+ t = time_factor * t
53
+ half = dim // 2
54
+ freqs = torch.exp(-math.log(max_period) * torch.arange(start=0, end=half, dtype=torch.float32) / half).to(
55
+ t.device
56
+ )
57
+
58
+ args = t[:, None].float() * freqs[None]
59
+ embedding = torch.cat([torch.cos(args), torch.sin(args)], dim=-1)
60
+ if dim % 2:
61
+ embedding = torch.cat([embedding, torch.zeros_like(embedding[:, :1])], dim=-1)
62
+ if torch.is_floating_point(t):
63
+ embedding = embedding.to(t)
64
+ return embedding
65
+
66
+
67
+ class MLPEmbedder(nn.Module):
68
+ def __init__(self, in_dim: int, hidden_dim: int):
69
+ super().__init__()
70
+ self.in_layer = nn.Linear(in_dim, hidden_dim, bias=True)
71
+ self.silu = nn.SiLU()
72
+ self.out_layer = nn.Linear(hidden_dim, hidden_dim, bias=True)
73
+
74
+ def forward(self, x: Tensor) -> Tensor:
75
+ return self.out_layer(self.silu(self.in_layer(x)))
76
+
77
+
78
+ class RMSNorm(torch.nn.Module):
79
+ def __init__(self, dim: int):
80
+ super().__init__()
81
+ self.scale = nn.Parameter(torch.ones(dim))
82
+
83
+ def forward(self, x: Tensor):
84
+ x_dtype = x.dtype
85
+ x = x.float()
86
+ rrms = torch.rsqrt(torch.mean(x**2, dim=-1, keepdim=True) + 1e-6)
87
+ return (x * rrms).to(dtype=x_dtype) * self.scale
88
+
89
+
90
+ class QKNorm(torch.nn.Module):
91
+ def __init__(self, dim: int):
92
+ super().__init__()
93
+ self.query_norm = RMSNorm(dim)
94
+ self.key_norm = RMSNorm(dim)
95
+
96
+ def forward(self, q: Tensor, k: Tensor, v: Tensor) -> tuple[Tensor, Tensor]:
97
+ q = self.query_norm(q)
98
+ k = self.key_norm(k)
99
+ return q.to(v), k.to(v)
100
+
101
+ class LoRALinearLayer(nn.Module):
102
+ def __init__(self, in_features, out_features, rank=4, network_alpha=None, device=None, dtype=None):
103
+ super().__init__()
104
+
105
+ self.down = nn.Linear(in_features, rank, bias=False, device=device, dtype=dtype)
106
+ self.up = nn.Linear(rank, out_features, bias=False, device=device, dtype=dtype)
107
+ # This value has the same meaning as the `--network_alpha` option in the kohya-ss trainer script.
108
+ # See https://github.com/darkstorm2150/sd-scripts/blob/main/docs/train_network_README-en.md#execute-learning
109
+ self.network_alpha = network_alpha
110
+ self.rank = rank
111
+
112
+ nn.init.normal_(self.down.weight, std=1 / rank)
113
+ nn.init.zeros_(self.up.weight)
114
+
115
+ def forward(self, hidden_states):
116
+ orig_dtype = hidden_states.dtype
117
+ dtype = self.down.weight.dtype
118
+
119
+ down_hidden_states = self.down(hidden_states.to(dtype))
120
+ up_hidden_states = self.up(down_hidden_states)
121
+
122
+ if self.network_alpha is not None:
123
+ up_hidden_states *= self.network_alpha / self.rank
124
+
125
+ return up_hidden_states.to(orig_dtype)
126
+
127
+ class FLuxSelfAttnProcessor:
128
+ def __call__(self, attn, x, pe, **attention_kwargs):
129
+ qkv = attn.qkv(x)
130
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)  # heads live on the attention module, not the processor
131
+ q, k = attn.norm(q, k, v)
132
+ x = attention(q, k, v, pe=pe)
133
+ x = attn.proj(x)
134
+ return x
135
+
136
+ class LoraFluxAttnProcessor(nn.Module):
137
+
138
+ def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
139
+ super().__init__()
140
+ self.qkv_lora = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
141
+ self.proj_lora = LoRALinearLayer(dim, dim, rank, network_alpha)
142
+ self.lora_weight = lora_weight
143
+
144
+
145
+ def __call__(self, attn, x, pe, **attention_kwargs):
146
+ qkv = attn.qkv(x) + self.qkv_lora(x) * self.lora_weight
147
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)  # heads live on the attention module, not the processor
148
+ q, k = attn.norm(q, k, v)
149
+ x = attention(q, k, v, pe=pe)
150
+ x = attn.proj(x) + self.proj_lora(x) * self.lora_weight
151
+ return x
152
+
153
+ class SelfAttention(nn.Module):
154
+ def __init__(self, dim: int, num_heads: int = 8, qkv_bias: bool = False):
155
+ super().__init__()
156
+ self.num_heads = num_heads
157
+ head_dim = dim // num_heads
158
+
159
+ self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
160
+ self.norm = QKNorm(head_dim)
161
+ self.proj = nn.Linear(dim, dim)
162
+ def forward(self):  # unused stub; the block's processor calls qkv/norm/proj directly
163
+ pass
164
+
165
+
166
+ @dataclass
167
+ class ModulationOut:
168
+ shift: Tensor
169
+ scale: Tensor
170
+ gate: Tensor
171
+
172
+
173
+ class Modulation(nn.Module):
174
+ def __init__(self, dim: int, double: bool):
175
+ super().__init__()
176
+ self.is_double = double
177
+ self.multiplier = 6 if double else 3
178
+ self.lin = nn.Linear(dim, self.multiplier * dim, bias=True)
179
+
180
+ def forward(self, vec: Tensor) -> tuple[ModulationOut, ModulationOut | None]:
181
+ out = self.lin(nn.functional.silu(vec))[:, None, :].chunk(self.multiplier, dim=-1)
182
+
183
+ return (
184
+ ModulationOut(*out[:3]),
185
+ ModulationOut(*out[3:]) if self.is_double else None,
186
+ )
187
+
188
+ class DoubleStreamBlockLoraProcessor(nn.Module):
189
+ def __init__(self, dim: int, rank=4, network_alpha=None, lora_weight=1):
190
+ super().__init__()
191
+ self.qkv_lora1 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
192
+ self.proj_lora1 = LoRALinearLayer(dim, dim, rank, network_alpha)
193
+ self.qkv_lora2 = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
194
+ self.proj_lora2 = LoRALinearLayer(dim, dim, rank, network_alpha)
195
+ self.lora_weight = lora_weight
196
+
197
+ def forward(self, attn, img, txt, vec, pe, **attention_kwargs):
198
+ img_mod1, img_mod2 = attn.img_mod(vec)
199
+ txt_mod1, txt_mod2 = attn.txt_mod(vec)
200
+
201
+ # prepare image for attention
202
+ img_modulated = attn.img_norm1(img)
203
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
204
+ img_qkv = attn.img_attn.qkv(img_modulated) + self.qkv_lora1(img_modulated) * self.lora_weight
205
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
206
+ img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
207
+
208
+ # prepare txt for attention
209
+ txt_modulated = attn.txt_norm1(txt)
210
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
211
+ txt_qkv = attn.txt_attn.qkv(txt_modulated) + self.qkv_lora2(txt_modulated) * self.lora_weight
212
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
213
+ txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
214
+
215
+ # run actual attention
216
+ q = torch.cat((txt_q, img_q), dim=2)
217
+ k = torch.cat((txt_k, img_k), dim=2)
218
+ v = torch.cat((txt_v, img_v), dim=2)
219
+
220
+ attn1 = attention(q, k, v, pe=pe)
221
+ txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
222
+
223
+ # calculate the img blocks
224
+ img = img + img_mod1.gate * (attn.img_attn.proj(img_attn) + self.proj_lora1(img_attn) * self.lora_weight)
225
+ img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
226
+
227
+ # calculate the txt blocks
228
+ txt = txt + txt_mod1.gate * (attn.txt_attn.proj(txt_attn) + self.proj_lora2(txt_attn) * self.lora_weight)
229
+ txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
230
+ return img, txt
231
+
232
+ class DoubleStreamBlockProcessor:
233
+ def __call__(self, attn, img, txt, vec, pe, **attention_kwargs):
234
+ img_mod1, img_mod2 = attn.img_mod(vec)
235
+ txt_mod1, txt_mod2 = attn.txt_mod(vec)
236
+
237
+ # prepare image for attention
238
+ img_modulated = attn.img_norm1(img)
239
+ img_modulated = (1 + img_mod1.scale) * img_modulated + img_mod1.shift
240
+ img_qkv = attn.img_attn.qkv(img_modulated)
241
+ img_q, img_k, img_v = rearrange(img_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
242
+ img_q, img_k = attn.img_attn.norm(img_q, img_k, img_v)
243
+
244
+ # prepare txt for attention
245
+ txt_modulated = attn.txt_norm1(txt)
246
+ txt_modulated = (1 + txt_mod1.scale) * txt_modulated + txt_mod1.shift
247
+ txt_qkv = attn.txt_attn.qkv(txt_modulated)
248
+ txt_q, txt_k, txt_v = rearrange(txt_qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads, D=attn.head_dim)
249
+ txt_q, txt_k = attn.txt_attn.norm(txt_q, txt_k, txt_v)
250
+
251
+ # run actual attention
252
+ q = torch.cat((txt_q, img_q), dim=2)
253
+ k = torch.cat((txt_k, img_k), dim=2)
254
+ v = torch.cat((txt_v, img_v), dim=2)
255
+
256
+ attn1 = attention(q, k, v, pe=pe)
257
+ txt_attn, img_attn = attn1[:, : txt.shape[1]], attn1[:, txt.shape[1] :]
258
+
259
+ # calculate the img blocks
260
+ img = img + img_mod1.gate * attn.img_attn.proj(img_attn)
261
+ img = img + img_mod2.gate * attn.img_mlp((1 + img_mod2.scale) * attn.img_norm2(img) + img_mod2.shift)
262
+
263
+ # calculate the txt blocks
264
+ txt = txt + txt_mod1.gate * attn.txt_attn.proj(txt_attn)
265
+ txt = txt + txt_mod2.gate * attn.txt_mlp((1 + txt_mod2.scale) * attn.txt_norm2(txt) + txt_mod2.shift)
266
+ return img, txt
267
+
268
+ class DoubleStreamBlock(nn.Module):
269
+ def __init__(self, hidden_size: int, num_heads: int, mlp_ratio: float, qkv_bias: bool = False):
270
+ super().__init__()
271
+ mlp_hidden_dim = int(hidden_size * mlp_ratio)
272
+ self.num_heads = num_heads
273
+ self.hidden_size = hidden_size
274
+ self.head_dim = hidden_size // num_heads
275
+
276
+ self.img_mod = Modulation(hidden_size, double=True)
277
+ self.img_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
278
+ self.img_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
279
+
280
+ self.img_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
281
+ self.img_mlp = nn.Sequential(
282
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
283
+ nn.GELU(approximate="tanh"),
284
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
285
+ )
286
+
287
+ self.txt_mod = Modulation(hidden_size, double=True)
288
+ self.txt_norm1 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
289
+ self.txt_attn = SelfAttention(dim=hidden_size, num_heads=num_heads, qkv_bias=qkv_bias)
290
+
291
+ self.txt_norm2 = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
292
+ self.txt_mlp = nn.Sequential(
293
+ nn.Linear(hidden_size, mlp_hidden_dim, bias=True),
294
+ nn.GELU(approximate="tanh"),
295
+ nn.Linear(mlp_hidden_dim, hidden_size, bias=True),
296
+ )
297
+ processor = DoubleStreamBlockProcessor()
298
+ self.set_processor(processor)
299
+
300
+ def set_processor(self, processor) -> None:
301
+ self.processor = processor
302
+
303
+ def get_processor(self):
304
+ return self.processor
305
+
306
+ def forward(
307
+ self,
308
+ img: Tensor,
309
+ txt: Tensor,
310
+ vec: Tensor,
311
+ pe: Tensor,
312
+ image_proj: Tensor = None,
313
+ ip_scale: float = 1.0,
314
+ ) -> tuple[Tensor, Tensor]:
315
+ if image_proj is None:
316
+ return self.processor(self, img, txt, vec, pe)
317
+ else:
318
+ return self.processor(self, img, txt, vec, pe, image_proj, ip_scale)
319
+
320
+
321
+ class SingleStreamBlockLoraProcessor(nn.Module):
322
+ def __init__(self, dim: int, rank: int = 4, network_alpha = None, lora_weight: float = 1):
323
+ super().__init__()
324
+ self.qkv_lora = LoRALinearLayer(dim, dim * 3, rank, network_alpha)
325
+ self.proj_lora = LoRALinearLayer(15360, dim, rank, network_alpha)
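+ # note: 15360 = hidden_size + mlp_hidden_dim (3072 + 4 * 3072) for the flux-dev config, i.e. the input width of linear2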
326
+ self.lora_weight = lora_weight
327
+
328
+ def forward(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor) -> Tensor:
329
+
330
+ mod, _ = attn.modulation(vec)
331
+ x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
332
+ qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
333
+ qkv = qkv + self.qkv_lora(x_mod) * self.lora_weight
334
+
335
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
336
+ q, k = attn.norm(q, k, v)
337
+
338
+ # compute attention
339
+ attn_1 = attention(q, k, v, pe=pe)
340
+
341
+ # compute activation in mlp stream, cat again and run second linear layer
342
+ output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
343
+ output = output + self.proj_lora(torch.cat((attn_1, attn.mlp_act(mlp)), 2)) * self.lora_weight
344
+ output = x + mod.gate * output
345
+ return output
346
+
347
+
348
+ class SingleStreamBlockProcessor:
349
+ def __call__(self, attn: nn.Module, x: Tensor, vec: Tensor, pe: Tensor, **attention_kwargs) -> Tensor:
350
+
351
+ mod, _ = attn.modulation(vec)
352
+ x_mod = (1 + mod.scale) * attn.pre_norm(x) + mod.shift
353
+ qkv, mlp = torch.split(attn.linear1(x_mod), [3 * attn.hidden_size, attn.mlp_hidden_dim], dim=-1)
354
+
355
+ q, k, v = rearrange(qkv, "B L (K H D) -> K B H L D", K=3, H=attn.num_heads)
356
+ q, k = attn.norm(q, k, v)
357
+
358
+ # compute attention
359
+ attn_1 = attention(q, k, v, pe=pe)
360
+
361
+ # compute activation in mlp stream, cat again and run second linear layer
362
+ output = attn.linear2(torch.cat((attn_1, attn.mlp_act(mlp)), 2))
363
+ output = x + mod.gate * output
364
+ return output
365
+
366
+ class SingleStreamBlock(nn.Module):
367
+ """
368
+ A DiT block with parallel linear layers as described in
369
+ https://arxiv.org/abs/2302.05442 and adapted modulation interface.
370
+ """
371
+
372
+ def __init__(
373
+ self,
374
+ hidden_size: int,
375
+ num_heads: int,
376
+ mlp_ratio: float = 4.0,
377
+ qk_scale: float | None = None,
378
+ ):
379
+ super().__init__()
380
+ self.hidden_dim = hidden_size
381
+ self.num_heads = num_heads
382
+ self.head_dim = hidden_size // num_heads
383
+ self.scale = qk_scale or self.head_dim**-0.5
384
+
385
+ self.mlp_hidden_dim = int(hidden_size * mlp_ratio)
386
+ # qkv and mlp_in
387
+ self.linear1 = nn.Linear(hidden_size, hidden_size * 3 + self.mlp_hidden_dim)
388
+ # proj and mlp_out
389
+ self.linear2 = nn.Linear(hidden_size + self.mlp_hidden_dim, hidden_size)
390
+
391
+ self.norm = QKNorm(self.head_dim)
392
+
393
+ self.hidden_size = hidden_size
394
+ self.pre_norm = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
395
+
396
+ self.mlp_act = nn.GELU(approximate="tanh")
397
+ self.modulation = Modulation(hidden_size, double=False)
398
+
399
+ processor = SingleStreamBlockProcessor()
400
+ self.set_processor(processor)
401
+
402
+
403
+ def set_processor(self, processor) -> None:
404
+ self.processor = processor
405
+
406
+ def get_processor(self):
407
+ return self.processor
408
+
409
+ def forward(
410
+ self,
411
+ x: Tensor,
412
+ vec: Tensor,
413
+ pe: Tensor,
414
+ image_proj: Tensor | None = None,
415
+ ip_scale: float = 1.0,
416
+ ) -> Tensor:
417
+ if image_proj is None:
418
+ return self.processor(self, x, vec, pe)
419
+ else:
420
+ return self.processor(self, x, vec, pe, image_proj, ip_scale)
421
+
422
+
423
+
424
+ class LastLayer(nn.Module):
425
+ def __init__(self, hidden_size: int, patch_size: int, out_channels: int):
426
+ super().__init__()
427
+ self.norm_final = nn.LayerNorm(hidden_size, elementwise_affine=False, eps=1e-6)
428
+ self.linear = nn.Linear(hidden_size, patch_size * patch_size * out_channels, bias=True)
429
+ self.adaLN_modulation = nn.Sequential(nn.SiLU(), nn.Linear(hidden_size, 2 * hidden_size, bias=True))
430
+
431
+ def forward(self, x: Tensor, vec: Tensor) -> Tensor:
432
+ shift, scale = self.adaLN_modulation(vec).chunk(2, dim=1)
433
+ x = (1 + scale[:, None, :]) * self.norm_final(x) + shift[:, None, :]
434
+ x = self.linear(x)
435
+ return x
uno/flux/pipeline.py ADDED
@@ -0,0 +1,304 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from typing import Literal
18
+
19
+ import torch
20
+ from einops import rearrange
21
+ from PIL import ExifTags, Image
22
+ import torchvision.transforms.functional as TVF
23
+
24
+ from uno.flux.modules.layers import (
25
+ DoubleStreamBlockLoraProcessor,
26
+ DoubleStreamBlockProcessor,
27
+ SingleStreamBlockLoraProcessor,
28
+ SingleStreamBlockProcessor,
29
+ )
30
+ from uno.flux.sampling import denoise, get_noise, get_schedule, prepare_multi_ip, unpack
31
+ from uno.flux.util import (
32
+ get_lora_rank,
33
+ load_ae,
34
+ load_checkpoint,
35
+ load_clip,
36
+ load_flow_model,
37
+ load_flow_model_only_lora,
38
+ load_flow_model_quintized,
39
+ load_t5,
40
+ )
41
+
42
+
43
+ def find_nearest_scale(image_h, image_w, predefined_scales):
44
+ """
45
+ Find the nearest predefined scale for a given image height and width.
46
+
47
+ :param image_h: height of the image
48
+ :param image_w: width of the image
49
+ :param predefined_scales: list of predefined scales [(h1, w1), (h2, w2), ...]
50
+ :return: the nearest predefined scale (h, w)
51
+ """
52
+ # compute the aspect ratio of the input image
53
+ image_ratio = image_h / image_w
54
+
55
+ # initialize trackers for the smallest ratio difference and the nearest scale
56
+ min_diff = float('inf')
57
+ nearest_scale = None
58
+
59
+ # iterate over all predefined scales and pick the one whose aspect ratio is closest to the input image
60
+ for scale_h, scale_w in predefined_scales:
61
+ predefined_ratio = scale_h / scale_w
62
+ diff = abs(predefined_ratio - image_ratio)
63
+
64
+ if diff < min_diff:
65
+ min_diff = diff
66
+ nearest_scale = (scale_h, scale_w)
67
+
68
+ return nearest_scale
69
+
70
+ def preprocess_ref(raw_image: Image.Image, long_size: int = 512):
71
+ # get the width and height of the original image
72
+ image_w, image_h = raw_image.size
73
+
74
+ # compute the new long and short sides (long side scaled to long_size)
75
+ if image_w >= image_h:
76
+ new_w = long_size
77
+ new_h = int((long_size / image_w) * image_h)
78
+ else:
79
+ new_h = long_size
80
+ new_w = int((long_size / image_h) * image_w)
81
+
82
+ # resize proportionally to the new width and height
83
+ raw_image = raw_image.resize((new_w, new_h), resample=Image.LANCZOS)
84
+ target_w = new_w // 16 * 16
85
+ target_h = new_h // 16 * 16
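+ # crop targets are snapped down to multiples of 16 so the 8x-downsampled VAE latent can be packed into 2x2 patches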
86
+
87
+ # compute the starting coordinates for a center crop
88
+ left = (new_w - target_w) // 2
89
+ top = (new_h - target_h) // 2
90
+ right = left + target_w
91
+ bottom = top + target_h
92
+
93
+ # apply the center crop
94
+ raw_image = raw_image.crop((left, top, right, bottom))
95
+
96
+ # convert to RGB mode
97
+ raw_image = raw_image.convert("RGB")
98
+ return raw_image
99
+
100
+ class UNOPipeline:
101
+ def __init__(
102
+ self,
103
+ model_type: str,
104
+ device: torch.device,
105
+ offload: bool = False,
106
+ only_lora: bool = False,
107
+ lora_rank: int = 16
108
+ ):
109
+ self.device = device
110
+ self.offload = offload
111
+ self.model_type = model_type
112
+
113
+ self.clip = load_clip(self.device)
114
+ self.t5 = load_t5(self.device, max_length=512)
115
+ self.ae = load_ae(model_type, device="cpu" if offload else self.device)
116
+ if "fp8" in model_type:
117
+ self.model = load_flow_model_quintized(model_type, device="cpu" if offload else self.device)
118
+ elif only_lora:
119
+ self.model = load_flow_model_only_lora(
120
+ model_type, device="cpu" if offload else self.device, lora_rank=lora_rank
121
+ )
122
+ else:
123
+ self.model = load_flow_model(model_type, device="cpu" if offload else self.device)
124
+
125
+
126
+ def load_ckpt(self, ckpt_path):
127
+ if ckpt_path is not None:
128
+ from safetensors.torch import load_file as load_sft
129
+ print("Loading checkpoint to replace old keys")
130
+ # load_sft doesn't support torch.device
131
+ if ckpt_path.endswith('safetensors'):
132
+ sd = load_sft(ckpt_path, device='cpu')
133
+ missing, unexpected = self.model.load_state_dict(sd, strict=False, assign=True)
134
+ else:
135
+ dit_state = torch.load(ckpt_path, map_location='cpu')
136
+ sd = {}
137
+ for k in dit_state.keys():
138
+ sd[k.replace('module.','')] = dit_state[k]
139
+ missing, unexpected = self.model.load_state_dict(sd, strict=False, assign=True)
140
+ self.model.to(str(self.device))
141
+ print(f"missing keys: {missing}\n\n\n\n\nunexpected keys: {unexpected}")
142
+
143
+ def set_lora(self, local_path: str | None = None, repo_id: str | None = None,
144
+ name: str | None = None, lora_weight: float = 0.7):
145
+ checkpoint = load_checkpoint(local_path, repo_id, name)
146
+ self.update_model_with_lora(checkpoint, lora_weight)
147
+
148
+ def set_lora_from_collection(self, lora_type: str = "realism", lora_weight: float = 0.7):
149
+ checkpoint = load_checkpoint(
150
+ None, self.hf_lora_collection, self.lora_types_to_names[lora_type]
151
+ )
152
+ self.update_model_with_lora(checkpoint, lora_weight)
153
+
154
+ def update_model_with_lora(self, checkpoint, lora_weight):
155
+ rank = get_lora_rank(checkpoint)
156
+ lora_attn_procs = {}
157
+
158
+ for name, _ in self.model.attn_processors.items():
159
+ lora_state_dict = {}
160
+ for k in checkpoint.keys():
161
+ if name in k:
162
+ lora_state_dict[k[len(name) + 1:]] = checkpoint[k] * lora_weight
163
+
164
+ if len(lora_state_dict):
165
+ if name.startswith("single_blocks"):
166
+ lora_attn_procs[name] = SingleStreamBlockLoraProcessor(dim=3072, rank=rank)
167
+ else:
168
+ lora_attn_procs[name] = DoubleStreamBlockLoraProcessor(dim=3072, rank=rank)
169
+ lora_attn_procs[name].load_state_dict(lora_state_dict)
170
+ lora_attn_procs[name].to(self.device)
171
+ else:
172
+ if name.startswith("single_blocks"):
173
+ lora_attn_procs[name] = SingleStreamBlockProcessor()
174
+ else:
175
+ lora_attn_procs[name] = DoubleStreamBlockProcessor()
176
+
177
+ self.model.set_attn_processor(lora_attn_procs)
178
+
179
+
180
+ def __call__(
181
+ self,
182
+ prompt: str,
183
+ width: int = 512,
184
+ height: int = 512,
185
+ guidance: float = 4,
186
+ num_steps: int = 50,
187
+ seed: int = 123456789,
188
+ **kwargs
189
+ ):
190
+ width = 16 * (width // 16)
191
+ height = 16 * (height // 16)
192
+
193
+ return self.forward(
194
+ prompt,
195
+ width,
196
+ height,
197
+ guidance,
198
+ num_steps,
199
+ seed,
200
+ **kwargs
201
+ )
202
+
203
+ @torch.inference_mode()
204
+ def gradio_generate(
205
+ self,
206
+ prompt: str,
207
+ width: int,
208
+ height: int,
209
+ guidance: float,
210
+ num_steps: int,
211
+ seed: int,
212
+ image_prompt1: Image.Image,
213
+ image_prompt2: Image.Image,
214
+ image_prompt3: Image.Image,
215
+ image_prompt4: Image.Image,
216
+ ):
217
+ ref_imgs = [image_prompt1, image_prompt2, image_prompt3, image_prompt4]
218
+ ref_imgs = [img for img in ref_imgs if isinstance(img, Image.Image)]
219
+ ref_long_side = 512 if len(ref_imgs) <= 1 else 320
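+ # a single reference keeps a 512 long side; multiple references are resized to 320 each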
220
+ ref_imgs = [preprocess_ref(img, ref_long_side) for img in ref_imgs]
221
+
222
+ seed = seed if seed != -1 else torch.randint(0, 10 ** 8, (1,)).item()
223
+
224
+ img = self(prompt=prompt, width=width, height=height, guidance=guidance,
225
+ num_steps=num_steps, seed=seed, ref_imgs=ref_imgs)
226
+
227
+ filename = f"output/gradio/{seed}_{prompt[:20]}.png"
228
+ os.makedirs(os.path.dirname(filename), exist_ok=True)
229
+ exif_data = Image.Exif()
230
+ exif_data[ExifTags.Base.Make] = "UNO"
231
+ exif_data[ExifTags.Base.Model] = self.model_type
232
+ info = f"{prompt=}, {seed=}, {width=}, {height=}, {guidance=}, {num_steps=}"
233
+ exif_data[ExifTags.Base.ImageDescription] = info
234
+ img.save(filename, format="png", exif=exif_data)
235
+ return img, filename
236
+
237
+ @torch.inference_mode()
238
+ def forward(
239
+ self,
240
+ prompt: str,
241
+ width: int,
242
+ height: int,
243
+ guidance: float,
244
+ num_steps: int,
245
+ seed: int,
246
+ ref_imgs: list[Image.Image] | None = None,
247
+ pe: Literal['d', 'h', 'w', 'o'] = 'd',
248
+ ):
249
+ x = get_noise(
250
+ 1, height, width, device=self.device,
251
+ dtype=torch.bfloat16, seed=seed
252
+ )
253
+ timesteps = get_schedule(
254
+ num_steps,
255
+ (width // 8) * (height // 8) // (16 * 16),
256
+ shift=True,
257
+ )
258
+ if self.offload:
259
+ self.ae.encoder = self.ae.encoder.to(self.device)
260
+ x_1_refs = [
261
+ self.ae.encode(
262
+ (TVF.to_tensor(ref_img) * 2.0 - 1.0)
263
+ .unsqueeze(0).to(self.device, torch.float32)
264
+ ).to(torch.bfloat16)
265
+ for ref_img in ref_imgs
266
+ ]
267
+
268
+ if self.offload:
269
+ self.ae.encoder = self.offload_model_to_cpu(self.ae.encoder)
270
+ self.t5, self.clip = self.t5.to(self.device), self.clip.to(self.device)
271
+ inp_cond = prepare_multi_ip(
272
+ t5=self.t5, clip=self.clip,
273
+ img=x,
274
+ prompt=prompt, ref_imgs=x_1_refs, pe=pe
275
+ )
276
+
277
+ if self.offload:
278
+ self.offload_model_to_cpu(self.t5, self.clip)
279
+ self.model = self.model.to(self.device)
280
+
281
+ x = denoise(
282
+ self.model,
283
+ **inp_cond,
284
+ timesteps=timesteps,
285
+ guidance=guidance,
286
+ )
287
+
288
+ if self.offload:
289
+ self.offload_model_to_cpu(self.model)
290
+ self.ae.decoder.to(x.device)
291
+ x = unpack(x.float(), height, width)
292
+ x = self.ae.decode(x)
293
+ self.offload_model_to_cpu(self.ae.decoder)
294
+
295
+ x1 = x.clamp(-1, 1)
296
+ x1 = rearrange(x1[-1], "c h w -> h w c")
297
+ output_img = Image.fromarray((127.5 * (x1 + 1.0)).cpu().byte().numpy())
298
+ return output_img
299
+
300
+ def offload_model_to_cpu(self, *models):
301
+ if not self.offload: return
302
+ for model in models:
303
+ model.cpu()
304
+ torch.cuda.empty_cache()
uno/flux/sampling.py ADDED
@@ -0,0 +1,252 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import math
17
+ from typing import Literal
18
+
19
+ import torch
20
+ from einops import rearrange, repeat
21
+ from torch import Tensor
22
+ from tqdm import tqdm
23
+
24
+ from .model import Flux
25
+ from .modules.conditioner import HFEmbedder
26
+
27
+
28
+ def get_noise(
29
+ num_samples: int,
30
+ height: int,
31
+ width: int,
32
+ device: torch.device,
33
+ dtype: torch.dtype,
34
+ seed: int,
35
+ ):
36
+ return torch.randn(
37
+ num_samples,
38
+ 16,
39
+ # allow for packing
40
+ 2 * math.ceil(height / 16),
41
+ 2 * math.ceil(width / 16),
42
+ device=device,
43
+ dtype=dtype,
44
+ generator=torch.Generator(device=device).manual_seed(seed),
45
+ )
46
+
47
+
48
+ def prepare(
49
+ t5: HFEmbedder,
50
+ clip: HFEmbedder,
51
+ img: Tensor,
52
+ prompt: str | list[str],
53
+ ref_img: None | Tensor = None,
54
+ pe: Literal['d', 'h', 'w', 'o'] = 'd'
55
+ ) -> dict[str, Tensor]:
56
+ assert pe in ['d', 'h', 'w', 'o']
57
+ bs, c, h, w = img.shape
58
+ if bs == 1 and not isinstance(prompt, str):
59
+ bs = len(prompt)
60
+
61
+ img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
62
+ if img.shape[0] == 1 and bs > 1:
63
+ img = repeat(img, "1 ... -> bs ...", bs=bs)
64
+
65
+ img_ids = torch.zeros(h // 2, w // 2, 3)
66
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
67
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
68
+ img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
69
+
70
+ if ref_img is not None:
71
+ _, _, ref_h, ref_w = ref_img.shape
72
+ ref_img = rearrange(ref_img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
73
+ if ref_img.shape[0] == 1 and bs > 1:
74
+ ref_img = repeat(ref_img, "1 ... -> bs ...", bs=bs)
75
+ ref_img_ids = torch.zeros(ref_h // 2, ref_w // 2, 3)
76
+ # offset the reference img ids in height/width by the main image's max ids
77
+ h_offset = h // 2 if pe in {'d', 'h'} else 0
78
+ w_offset = w // 2 if pe in {'d', 'w'} else 0
79
+ ref_img_ids[..., 1] = ref_img_ids[..., 1] + torch.arange(ref_h // 2)[:, None] + h_offset
80
+ ref_img_ids[..., 2] = ref_img_ids[..., 2] + torch.arange(ref_w // 2)[None, :] + w_offset
81
+ ref_img_ids = repeat(ref_img_ids, "h w c -> b (h w) c", b=bs)
82
+
83
+ if isinstance(prompt, str):
84
+ prompt = [prompt]
85
+ txt = t5(prompt)
86
+ if txt.shape[0] == 1 and bs > 1:
87
+ txt = repeat(txt, "1 ... -> bs ...", bs=bs)
88
+ txt_ids = torch.zeros(bs, txt.shape[1], 3)
89
+
90
+ vec = clip(prompt)
91
+ if vec.shape[0] == 1 and bs > 1:
92
+ vec = repeat(vec, "1 ... -> bs ...", bs=bs)
93
+
94
+ if ref_img is not None:
95
+ return {
96
+ "img": img,
97
+ "img_ids": img_ids.to(img.device),
98
+ "ref_img": ref_img,
99
+ "ref_img_ids": ref_img_ids.to(img.device),
100
+ "txt": txt.to(img.device),
101
+ "txt_ids": txt_ids.to(img.device),
102
+ "vec": vec.to(img.device),
103
+ }
104
+ else:
105
+ return {
106
+ "img": img,
107
+ "img_ids": img_ids.to(img.device),
108
+ "txt": txt.to(img.device),
109
+ "txt_ids": txt_ids.to(img.device),
110
+ "vec": vec.to(img.device),
111
+ }
112
+
113
+ def prepare_multi_ip(
114
+ t5: HFEmbedder,
115
+ clip: HFEmbedder,
116
+ img: Tensor,
117
+ prompt: str | list[str],
118
+ ref_imgs: list[Tensor] | None = None,
119
+ pe: Literal['d', 'h', 'w', 'o'] = 'd'
120
+ ) -> dict[str, Tensor]:
121
+ assert pe in ['d', 'h', 'w', 'o']
122
+ bs, c, h, w = img.shape
123
+ if bs == 1 and not isinstance(prompt, str):
124
+ bs = len(prompt)
125
+
126
+ img = rearrange(img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
127
+ if img.shape[0] == 1 and bs > 1:
128
+ img = repeat(img, "1 ... -> bs ...", bs=bs)
129
+
130
+ img_ids = torch.zeros(h // 2, w // 2, 3)
131
+ img_ids[..., 1] = img_ids[..., 1] + torch.arange(h // 2)[:, None]
132
+ img_ids[..., 2] = img_ids[..., 2] + torch.arange(w // 2)[None, :]
133
+ img_ids = repeat(img_ids, "h w c -> b (h w) c", b=bs)
134
+
135
+ ref_img_ids = []
136
+ ref_imgs_list = []
137
+ pe_shift_w, pe_shift_h = w // 2, h // 2
138
+ for ref_img in ref_imgs:
139
+ _, _, ref_h1, ref_w1 = ref_img.shape
140
+ ref_img = rearrange(ref_img, "b c (h ph) (w pw) -> b (h w) (c ph pw)", ph=2, pw=2)
141
+ if ref_img.shape[0] == 1 and bs > 1:
142
+ ref_img = repeat(ref_img, "1 ... -> bs ...", bs=bs)
143
+ ref_img_ids1 = torch.zeros(ref_h1 // 2, ref_w1 // 2, 3)
144
+ # offset each reference's img ids in height/width by the running maxima
145
+ h_offset = pe_shift_h if pe in {'d', 'h'} else 0
146
+ w_offset = pe_shift_w if pe in {'d', 'w'} else 0
147
+ ref_img_ids1[..., 1] = ref_img_ids1[..., 1] + torch.arange(ref_h1 // 2)[:, None] + h_offset
148
+ ref_img_ids1[..., 2] = ref_img_ids1[..., 2] + torch.arange(ref_w1 // 2)[None, :] + w_offset
149
+ ref_img_ids1 = repeat(ref_img_ids1, "h w c -> b (h w) c", b=bs)
150
+ ref_img_ids.append(ref_img_ids1)
151
+ ref_imgs_list.append(ref_img)
152
+
153
+ # update the pe shift so the next reference image starts after this one
154
+ pe_shift_h += ref_h1 // 2
155
+ pe_shift_w += ref_w1 // 2
156
+
157
+ if isinstance(prompt, str):
158
+ prompt = [prompt]
159
+ txt = t5(prompt)
160
+ if txt.shape[0] == 1 and bs > 1:
161
+ txt = repeat(txt, "1 ... -> bs ...", bs=bs)
162
+ txt_ids = torch.zeros(bs, txt.shape[1], 3)
163
+
164
+ vec = clip(prompt)
165
+ if vec.shape[0] == 1 and bs > 1:
166
+ vec = repeat(vec, "1 ... -> bs ...", bs=bs)
167
+
168
+ return {
169
+ "img": img,
170
+ "img_ids": img_ids.to(img.device),
171
+ "ref_img": tuple(ref_imgs_list),
172
+ "ref_img_ids": [ref_img_id.to(img.device) for ref_img_id in ref_img_ids],
173
+ "txt": txt.to(img.device),
174
+ "txt_ids": txt_ids.to(img.device),
175
+ "vec": vec.to(img.device),
176
+ }
177
+
178
+
179
+ def time_shift(mu: float, sigma: float, t: Tensor):
180
+ return math.exp(mu) / (math.exp(mu) + (1 / t - 1) ** sigma)
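+ # i.e. t' = e^mu / (e^mu + (1/t - 1)^sigma); larger mu pushes the schedule toward t = 1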
181
+
182
+
183
+ def get_lin_function(
184
+ x1: float = 256, y1: float = 0.5, x2: float = 4096, y2: float = 1.15
185
+ ):
186
+ m = (y2 - y1) / (x2 - x1)
187
+ b = y1 - m * x1
188
+ return lambda x: m * x + b
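+ # linear fit through (x1, y1) = (256, 0.5) and (x2, y2) = (4096, 1.15), mapping image sequence length to mu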
189
+
190
+
191
+ def get_schedule(
192
+ num_steps: int,
193
+ image_seq_len: int,
194
+ base_shift: float = 0.5,
195
+ max_shift: float = 1.15,
196
+ shift: bool = True,
197
+ ) -> list[float]:
198
+ # extra step for zero
199
+ timesteps = torch.linspace(1, 0, num_steps + 1)
200
+
201
+ # shifting the schedule to favor high timesteps for higher signal images
202
+ if shift:
203
+ # estimate mu based on linear interpolation between two points
204
+ mu = get_lin_function(y1=base_shift, y2=max_shift)(image_seq_len)
205
+ timesteps = time_shift(mu, 1.0, timesteps)
206
+
207
+ return timesteps.tolist()
208
+
209
+
210
+ def denoise(
211
+ model: Flux,
212
+ # model input
213
+ img: Tensor,
214
+ img_ids: Tensor,
215
+ txt: Tensor,
216
+ txt_ids: Tensor,
217
+ vec: Tensor,
218
+ # sampling parameters
219
+ timesteps: list[float],
220
+ guidance: float = 4.0,
221
+ ref_img: Tensor | None = None,
222
+ ref_img_ids: Tensor | None = None,
223
+ ):
224
+ i = 0
225
+ guidance_vec = torch.full((img.shape[0],), guidance, device=img.device, dtype=img.dtype)
226
+ for t_curr, t_prev in tqdm(zip(timesteps[:-1], timesteps[1:]), total=len(timesteps) - 1):
227
+ t_vec = torch.full((img.shape[0],), t_curr, dtype=img.dtype, device=img.device)
228
+ pred = model(
229
+ img=img,
230
+ img_ids=img_ids,
231
+ ref_img=ref_img,
232
+ ref_img_ids=ref_img_ids,
233
+ txt=txt,
234
+ txt_ids=txt_ids,
235
+ y=vec,
236
+ timesteps=t_vec,
237
+ guidance=guidance_vec
238
+ )
239
+ img = img + (t_prev - t_curr) * pred
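+ # first-order (Euler) update along the predicted velocity, stepping from t_curr to t_prev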
240
+ i += 1
241
+ return img
242
+
243
+
244
+ def unpack(x: Tensor, height: int, width: int) -> Tensor:
245
+ return rearrange(
246
+ x,
247
+ "b (h w) (c ph pw) -> b c (h ph) (w pw)",
248
+ h=math.ceil(height / 16),
249
+ w=math.ceil(width / 16),
250
+ ph=2,
251
+ pw=2,
252
+ )
uno/flux/util.py ADDED
@@ -0,0 +1,396 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+ # Copyright (c) 2024 Black Forest Labs and The XLabs-AI Team. All rights reserved.
3
+
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+
16
+ import os
17
+ from dataclasses import dataclass
18
+
19
+ import torch
20
+ import json
21
+ import numpy as np
22
+ from huggingface_hub import hf_hub_download
23
+ from safetensors import safe_open
24
+ from safetensors.torch import load_file as load_sft
25
+
26
+ from .model import Flux, FluxParams
27
+ from .modules.autoencoder import AutoEncoder, AutoEncoderParams
28
+ from .modules.conditioner import HFEmbedder
29
+
30
+ import re
31
+ from uno.flux.modules.layers import DoubleStreamBlockLoraProcessor, SingleStreamBlockLoraProcessor
32
+ def load_model(ckpt, device='cpu'):
33
+ if ckpt.endswith('safetensors'):
34
+ from safetensors import safe_open
35
+ pl_sd = {}
36
+ with safe_open(ckpt, framework="pt", device=device) as f:
37
+ for k in f.keys():
38
+ pl_sd[k] = f.get_tensor(k)
39
+ else:
40
+ pl_sd = torch.load(ckpt, map_location=device)
41
+ return pl_sd
42
+
43
+ def load_safetensors(path):
44
+ tensors = {}
45
+ with safe_open(path, framework="pt", device="cpu") as f:
46
+ for key in f.keys():
47
+ tensors[key] = f.get_tensor(key)
48
+ return tensors
49
+
50
+ def get_lora_rank(checkpoint):
51
+ for k in checkpoint.keys():
52
+ if k.endswith(".down.weight"):
53
+ return checkpoint[k].shape[0]
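+ # the LoRA down-projection weight has shape (rank, in_features), so dim 0 gives the rank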
54
+
55
+ def load_checkpoint(local_path, repo_id, name):
56
+ if local_path is not None:
57
+ if '.safetensors' in local_path:
58
+ print(f"Loading .safetensors checkpoint from {local_path}")
59
+ checkpoint = load_safetensors(local_path)
60
+ else:
61
+ print(f"Loading checkpoint from {local_path}")
62
+ checkpoint = torch.load(local_path, map_location='cpu')
63
+ elif repo_id is not None and name is not None:
64
+ print(f"Loading checkpoint {name} from repo id {repo_id}")
65
+ checkpoint = load_from_repo_id(repo_id, name)
66
+ else:
67
+ raise ValueError(
68
+ "LOADING ERROR: you must specify local_path or repo_id with name in HF to download"
69
+ )
70
+ return checkpoint
71
+
72
+
73
+ def c_crop(image):
74
+ width, height = image.size
75
+ new_size = min(width, height)
76
+ left = (width - new_size) / 2
77
+ top = (height - new_size) / 2
78
+ right = (width + new_size) / 2
79
+ bottom = (height + new_size) / 2
80
+ return image.crop((left, top, right, bottom))
81
+
82
+ def pad64(x):
83
+ return int(np.ceil(float(x) / 64.0) * 64 - x)
84
+
85
+ def HWC3(x):
86
+ assert x.dtype == np.uint8
87
+ if x.ndim == 2:
88
+ x = x[:, :, None]
89
+ assert x.ndim == 3
90
+ H, W, C = x.shape
91
+ assert C == 1 or C == 3 or C == 4
92
+ if C == 3:
93
+ return x
94
+ if C == 1:
95
+ return np.concatenate([x, x, x], axis=2)
96
+ if C == 4:
97
+ color = x[:, :, 0:3].astype(np.float32)
98
+ alpha = x[:, :, 3:4].astype(np.float32) / 255.0
99
+ y = color * alpha + 255.0 * (1.0 - alpha)
100
+ y = y.clip(0, 255).astype(np.uint8)
101
+ return y
102
+
103
+ @dataclass
104
+ class ModelSpec:
105
+ params: FluxParams
106
+ ae_params: AutoEncoderParams
107
+ ckpt_path: str | None
108
+ ae_path: str | None
109
+ repo_id: str | None
110
+ repo_flow: str | None
111
+ repo_ae: str | None
112
+ repo_id_ae: str | None
113
+
114
+
115
+ configs = {
116
+ "flux-dev": ModelSpec(
117
+ repo_id="black-forest-labs/FLUX.1-dev",
118
+ repo_id_ae="black-forest-labs/FLUX.1-dev",
119
+ repo_flow="flux1-dev.safetensors",
120
+ repo_ae="ae.safetensors",
121
+ ckpt_path=os.getenv("FLUX_DEV"),
122
+ params=FluxParams(
123
+ in_channels=64,
124
+ vec_in_dim=768,
125
+ context_in_dim=4096,
126
+ hidden_size=3072,
127
+ mlp_ratio=4.0,
128
+ num_heads=24,
129
+ depth=19,
130
+ depth_single_blocks=38,
131
+ axes_dim=[16, 56, 56],
132
+ theta=10_000,
133
+ qkv_bias=True,
134
+ guidance_embed=True,
135
+ ),
136
+ ae_path=os.getenv("AE"),
137
+ ae_params=AutoEncoderParams(
138
+ resolution=256,
139
+ in_channels=3,
140
+ ch=128,
141
+ out_ch=3,
142
+ ch_mult=[1, 2, 4, 4],
143
+ num_res_blocks=2,
144
+ z_channels=16,
145
+ scale_factor=0.3611,
146
+ shift_factor=0.1159,
147
+ ),
148
+ ),
149
+ "flux-dev-fp8": ModelSpec(
150
+ repo_id="XLabs-AI/flux-dev-fp8",
151
+ repo_id_ae="black-forest-labs/FLUX.1-dev",
152
+ repo_flow="flux-dev-fp8.safetensors",
153
+ repo_ae="ae.safetensors",
154
+ ckpt_path=os.getenv("FLUX_DEV_FP8"),
155
+ params=FluxParams(
156
+ in_channels=64,
157
+ vec_in_dim=768,
158
+ context_in_dim=4096,
159
+ hidden_size=3072,
160
+ mlp_ratio=4.0,
161
+ num_heads=24,
162
+ depth=19,
163
+ depth_single_blocks=38,
164
+ axes_dim=[16, 56, 56],
165
+ theta=10_000,
166
+ qkv_bias=True,
167
+ guidance_embed=True,
168
+ ),
169
+ ae_path=os.getenv("AE"),
170
+ ae_params=AutoEncoderParams(
171
+ resolution=256,
172
+ in_channels=3,
173
+ ch=128,
174
+ out_ch=3,
175
+ ch_mult=[1, 2, 4, 4],
176
+ num_res_blocks=2,
177
+ z_channels=16,
178
+ scale_factor=0.3611,
179
+ shift_factor=0.1159,
180
+ ),
181
+ ),
182
+ "flux-schnell": ModelSpec(
183
+ repo_id="black-forest-labs/FLUX.1-schnell",
184
+ repo_id_ae="black-forest-labs/FLUX.1-dev",
185
+ repo_flow="flux1-schnell.safetensors",
186
+ repo_ae="ae.safetensors",
187
+ ckpt_path=os.getenv("FLUX_SCHNELL"),
188
+ params=FluxParams(
189
+ in_channels=64,
190
+ vec_in_dim=768,
191
+ context_in_dim=4096,
192
+ hidden_size=3072,
193
+ mlp_ratio=4.0,
194
+ num_heads=24,
195
+ depth=19,
196
+ depth_single_blocks=38,
197
+ axes_dim=[16, 56, 56],
198
+ theta=10_000,
199
+ qkv_bias=True,
200
+ guidance_embed=False,
201
+ ),
202
+ ae_path=os.getenv("AE"),
203
+ ae_params=AutoEncoderParams(
204
+ resolution=256,
205
+ in_channels=3,
206
+ ch=128,
207
+ out_ch=3,
208
+ ch_mult=[1, 2, 4, 4],
209
+ num_res_blocks=2,
210
+ z_channels=16,
211
+ scale_factor=0.3611,
212
+ shift_factor=0.1159,
213
+ ),
214
+ ),
215
+ }
216
+
217
+
218
+ def print_load_warning(missing: list[str], unexpected: list[str]) -> None:
219
+ if len(missing) > 0 and len(unexpected) > 0:
220
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
221
+ print("\n" + "-" * 79 + "\n")
222
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
223
+ elif len(missing) > 0:
224
+ print(f"Got {len(missing)} missing keys:\n\t" + "\n\t".join(missing))
225
+ elif len(unexpected) > 0:
226
+ print(f"Got {len(unexpected)} unexpected keys:\n\t" + "\n\t".join(unexpected))
227
+
228
+ def load_from_repo_id(repo_id, checkpoint_name):
229
+ ckpt_path = hf_hub_download(repo_id, checkpoint_name)
230
+ sd = load_sft(ckpt_path, device='cpu')
231
+ return sd
232
+
233
+ def load_flow_model(name: str, device: str | torch.device = "cuda", hf_download: bool = True):
234
+ # Loading Flux
235
+ print("Init model")
236
+ ckpt_path = configs[name].ckpt_path
237
+ if (
238
+ ckpt_path is None
239
+ and configs[name].repo_id is not None
240
+ and configs[name].repo_flow is not None
241
+ and hf_download
242
+ ):
243
+ ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
244
+
245
+ with torch.device("meta" if ckpt_path is not None else device):
246
+ model = Flux(configs[name].params).to(torch.bfloat16)
247
+
248
+ if ckpt_path is not None:
249
+ print("Loading checkpoint")
250
+ # load_sft doesn't support torch.device
251
+ sd = load_model(ckpt_path, device=str(device))
252
+ missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
253
+ print_load_warning(missing, unexpected)
254
+ return model
255
+
256
+ def load_flow_model_only_lora(
257
+ name: str,
258
+ device: str | torch.device = "cuda",
259
+ hf_download: bool = True,
260
+ lora_rank: int = 16
261
+ ):
262
+ # Loading Flux
263
+ print("Init model")
264
+ ckpt_path = configs[name].ckpt_path
265
+ if (
266
+ ckpt_path is None
267
+ and configs[name].repo_id is not None
268
+ and configs[name].repo_flow is not None
269
+ and hf_download
270
+ ):
271
+ ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow.replace("sft", "safetensors"))
272
+
273
+ if hf_download:
274
+ # lora_ckpt_path = hf_hub_download("bytedance-research/UNO", "dit_lora.safetensors")
275
+ try:
276
+ lora_ckpt_path = hf_hub_download("bytedance-research/UNO", "dit_lora.safetensors")
277
+ except Exception:
278
+ lora_ckpt_path = os.environ.get("LORA", None)
279
+ else:
280
+ lora_ckpt_path = os.environ.get("LORA", None)
281
+
282
+ with torch.device("meta" if ckpt_path is not None else device):
283
+ model = Flux(configs[name].params)
284
+
285
+
286
+ model = set_lora(model, lora_rank, device="meta" if lora_ckpt_path is not None else device)
287
+
288
+ if ckpt_path is not None:
289
+ print("Loading lora")
290
+ lora_sd = load_sft(lora_ckpt_path, device=str(device)) if lora_ckpt_path.endswith("safetensors")\
291
+ else torch.load(lora_ckpt_path, map_location='cpu')
292
+
293
+ print("Loading main checkpoint")
294
+ # load_sft doesn't support torch.device
295
+
296
+ if ckpt_path.endswith('safetensors'):
297
+ sd = load_sft(ckpt_path, device=str(device))
298
+ sd.update(lora_sd)
299
+ missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
300
+ else:
301
+ dit_state = torch.load(ckpt_path, map_location='cpu')
302
+ sd = {}
303
+ for k in dit_state.keys():
304
+ sd[k.replace('module.','')] = dit_state[k]
305
+ sd.update(lora_sd)
306
+ missing, unexpected = model.load_state_dict(sd, strict=False, assign=True)
307
+ model.to(str(device))
308
+ print_load_warning(missing, unexpected)
309
+ return model
310
+
311
+
312
+ def set_lora(
313
+ model: Flux,
314
+ lora_rank: int,
315
+ double_blocks_indices: list[int] | None = None,
316
+ single_blocks_indices: list[int] | None = None,
317
+ device: str | torch.device = "cpu",
318
+ ) -> Flux:
319
+ double_blocks_indices = list(range(model.params.depth)) if double_blocks_indices is None else double_blocks_indices
320
+ single_blocks_indices = list(range(model.params.depth_single_blocks)) if single_blocks_indices is None \
321
+ else single_blocks_indices
322
+
323
+ lora_attn_procs = {}
324
+ with torch.device(device):
325
+ for name, attn_processor in model.attn_processors.items():
326
+ match = re.search(r'\.(\d+)\.', name)
327
+ if match:
328
+ layer_index = int(match.group(1))
329
+
330
+ if name.startswith("double_blocks") and layer_index in double_blocks_indices:
331
+ lora_attn_procs[name] = DoubleStreamBlockLoraProcessor(dim=model.params.hidden_size, rank=lora_rank)
332
+ elif name.startswith("single_blocks") and layer_index in single_blocks_indices:
333
+ lora_attn_procs[name] = SingleStreamBlockLoraProcessor(dim=model.params.hidden_size, rank=lora_rank)
334
+ else:
335
+ lora_attn_procs[name] = attn_processor
336
+ model.set_attn_processor(lora_attn_procs)
337
+ return model
338
+
339
+
340
+ def load_flow_model_quintized(name: str, device: str | torch.device = "cuda", hf_download: bool = True):
341
+ # Loading Flux
342
+ from optimum.quanto import requantize
343
+ print("Init model")
344
+ ckpt_path = configs[name].ckpt_path
345
+ if (
346
+ ckpt_path is None
347
+ and configs[name].repo_id is not None
348
+ and configs[name].repo_flow is not None
349
+ and hf_download
350
+ ):
351
+ ckpt_path = hf_hub_download(configs[name].repo_id, configs[name].repo_flow)
352
+ json_path = hf_hub_download(configs[name].repo_id, 'flux_dev_quantization_map.json')
353
+
354
+
355
+ model = Flux(configs[name].params).to(torch.bfloat16)
356
+
357
+ print("Loading checkpoint")
358
+ # load_sft doesn't support torch.device
359
+ sd = load_sft(ckpt_path, device='cpu')
360
+ with open(json_path, "r") as f:
361
+ quantization_map = json.load(f)
362
+ print("Start a quantization process...")
363
+ requantize(model, sd, quantization_map, device=device)
364
+ print("Model is quantized!")
365
+ return model
366
+
367
+ def load_t5(device: str | torch.device = "cuda", max_length: int = 512) -> HFEmbedder:
368
+ # max length 64, 128, 256 and 512 should work (if your sequence is short enough)
369
+ version = os.environ.get("T5", "xlabs-ai/xflux_text_encoders")
370
+ return HFEmbedder(version, max_length=max_length, torch_dtype=torch.bfloat16).to(device)
371
+
372
+ def load_clip(device: str | torch.device = "cuda") -> HFEmbedder:
373
+ version = os.environ.get("CLIP", "openai/clip-vit-large-patch14")
374
+ return HFEmbedder(version, max_length=77, torch_dtype=torch.bfloat16).to(device)
375
+
376
+
377
+ def load_ae(name: str, device: str | torch.device = "cuda", hf_download: bool = True) -> AutoEncoder:
378
+ ckpt_path = configs[name].ae_path
379
+ if (
380
+ ckpt_path is None
381
+ and configs[name].repo_id is not None
382
+ and configs[name].repo_ae is not None
383
+ and hf_download
384
+ ):
385
+ ckpt_path = hf_hub_download(configs[name].repo_id_ae, configs[name].repo_ae)
386
+
387
+ # Loading the autoencoder
388
+ print("Init AE")
389
+ with torch.device("meta" if ckpt_path is not None else device):
390
+ ae = AutoEncoder(configs[name].ae_params)
391
+
392
+ if ckpt_path is not None:
393
+ sd = load_sft(ckpt_path, device=str(device))
394
+ missing, unexpected = ae.load_state_dict(sd, strict=False, assign=True)
395
+ print_load_warning(missing, unexpected)
396
+ return ae
uno/utils/convert_yaml_to_args_file.py ADDED
@@ -0,0 +1,34 @@
1
+ # Copyright (c) 2025 Bytedance Ltd. and/or its affiliates. All rights reserved.
2
+
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import argparse
16
+ import yaml
17
+
18
+ parser = argparse.ArgumentParser()
19
+ parser.add_argument("--yaml", type=str, required=True)
20
+ parser.add_argument("--arg", type=str, required=True)
21
+ args = parser.parse_args()
22
+
23
+
24
+ with open(args.yaml, "r") as f:
25
+ data = yaml.safe_load(f)
26
+
27
+ with open(args.arg, "w") as f:
28
+ for k, v in data.items():
29
+ if isinstance(v, list):
30
+ v = list(map(str, v))
31
+ v = " ".join(v)
32
+ if v is None:
33
+ continue
34
+ print(f"--{k} {v}", end=" ", file=f)