kelseye committed · verified
Commit 6322336 · 1 Parent(s): 07de83f

Upload folder using huggingface_hub
.gitattributes CHANGED
@@ -33,3 +33,11 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+assets/cat_ContentRef_2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/cat_style_2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/girl_ContentRef_1.jpg filter=lfs diff=lfs merge=lfs -text
+assets/girl_ContentRef_2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/girl_style_1.jpg filter=lfs diff=lfs merge=lfs -text
+assets/girl_style_2.jpg filter=lfs diff=lfs merge=lfs -text
+assets/house_ContentRef_1.jpg filter=lfs diff=lfs merge=lfs -text
+assets/house_style_1.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,195 @@
---
license: apache-2.0
---
# Templates - Content Reference (FLUX.2-klein-base-4B)

This model is one of the Diffusion Templates series models open-sourced by [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). It encodes a reference image with a vision encoder, converts the resulting features into LoRA weights that are hot-loaded into the base model, and thereby fuses the reference content into generation guided by a natural-language prompt.

## Results

> **Prompt:** A cat is sitting on a stone.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/cat_style_1.jpg) | ![](./assets/cat_ContentRef_1.jpg) | ![](./assets/cat_style_2.jpg) | ![](./assets/cat_ContentRef_2.jpg) |

---

> **Prompt:** A cozy wooden cottage in a lush green valley, white fluffy clouds in the sky, peaceful atmosphere.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/house_style_1.jpg) | ![](./assets/house_ContentRef_1.jpg) | ![](./assets/house_style_2.jpg) | ![](./assets/house_ContentRef_2.jpg) |

---

> **Prompt:** A beautiful girl on an outdoor adventure.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/girl_style_1.jpg) | ![](./assets/girl_ContentRef_1.jpg) | ![](./assets/girl_style_2.jpg) | ![](./assets/girl_ContentRef_2.jpg) |

## Inference Code

* Install [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio):

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

* Direct inference (requires 40G GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
import numpy as np
```

```python
# Load the FLUX.2 base pipeline (transformer, text encoder, VAE, tokenizer).
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
)
pipe.dit = pipe.enable_lora_hot_loading(pipe.dit)  # Important! The template model injects image-derived LoRA weights at runtime.
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-ContentRef")],
)
# Download the example template images.
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_1.jpg"),
    }],
    negative_template_inputs=[{
        # A uniform gray image serves as the "empty" reference for classifier-free guidance.
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_1.jpg")
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_2.jpg")
```
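
The template model itself (see `model.py` in this commit) accepts either a single reference image or a list of images, plus a `scale` factor that is folded into the generated LoRA weights. Assuming `TemplatePipeline` passes the keys of each `template_inputs` entry through to the template model's `process_inputs` (the `scale` key and the list-valued `image` below are inferred from `model.py`, not taken from an official example), blending two references might look like:

```python
# Hedged sketch: "scale" and the list-valued "image" follow the signatures of
# FLUX2Image2LoRAModel.forward / process_inputs in this commit's model.py.
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": [  # multiple references are averaged in LoRA space
            Image.open("data/examples/templates/image_style_1.jpg"),
            Image.open("data/examples/templates/image_style_2.jpg"),
        ],
        "scale": 0.8,  # <1 weakens the references' influence on generation
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_blend.jpg")
```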

* Enable lazy loading and memory management (requires 24G GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
import numpy as np
```

```python
# Stage weights between disk, CPU, and GPU: parameters are offloaded to disk
# when idle, held in float8_e4m3fn while onloaded, and upcast to bfloat16
# only for computation.
vram_config = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": torch.float8_e4m3fn,
    "onload_device": "cpu",
    "preparing_dtype": torch.float8_e4m3fn,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-ContentRef")],
    lazy_loading=True,
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_1.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_1.jpg")
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_2.jpg")
```
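
The `vram_limit` argument (in GiB) bounds how much model data stays resident on the GPU. `torch.cuda.mem_get_info` returns free and total device memory in bytes, so the expression above amounts to "total VRAM in GiB minus ~0.5 GiB of headroom":

```python
import torch

free_bytes, total_bytes = torch.cuda.mem_get_info("cuda")
vram_limit_gib = total_bytes / (1024 ** 3) - 0.5  # leave ~0.5 GiB for activations and overhead
print(f"vram_limit = {vram_limit_gib:.1f} GiB")
```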

## Training Code

After installing DiffSynth-Studio, use the following script to start training. For more information, please refer to the [DiffSynth-Studio documentation](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/).

```shell
modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-ContentRef/*" --local_dir ./data/diffsynth_example_dataset

accelerate launch examples/flux2/model_training/train.py \
  --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-ContentRef \
  --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-ContentRef/metadata.jsonl \
  --extra_inputs "template_inputs" \
  --max_pixels 1048576 \
  --dataset_repeat 50 \
  --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
  --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-ContentRef:" \
  --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
  --learning_rate 1e-4 \
  --num_epochs 2 \
  --remove_prefix_in_ckpt "pipe.template_model." \
  --output_path "./models/train/Template-KleinBase4B-ContentRef_full" \
  --trainable_models "template_model" \
  --use_gradient_checkpointing \
  --find_unused_parameters \
  --enable_lora_hot_loading
```
README_from_modelscope.md ADDED
@@ -0,0 +1,197 @@
---
frameworks:
- Pytorch
license: Apache License 2.0
tags: []
tasks:
- text-to-image-synthesis
---

# Templates - Content Reference (FLUX.2-klein-base-4B)

This model is one of the Diffusion Templates series models open-sourced by [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). It encodes a reference image with a vision encoder, converts the resulting features into LoRA weights that are hot-loaded into the base model, and thereby fuses the reference content into generation guided by a natural-language prompt.

## Results

> **Prompt:** A cat is sitting on a stone.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/cat_style_1.jpg) | ![](./assets/cat_ContentRef_1.jpg) | ![](./assets/cat_style_2.jpg) | ![](./assets/cat_ContentRef_2.jpg) |

---

> **Prompt:** A cozy wooden cottage in a lush green valley, white fluffy clouds in the sky, peaceful atmosphere.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/house_style_1.jpg) | ![](./assets/house_ContentRef_1.jpg) | ![](./assets/house_style_2.jpg) | ![](./assets/house_ContentRef_2.jpg) |

---

> **Prompt:** A beautiful girl on an outdoor adventure.

| Template | Generated | Template | Generated |
|:---:|:---:|:---:|:---:|
| ![](./assets/girl_style_1.jpg) | ![](./assets/girl_ContentRef_1.jpg) | ![](./assets/girl_style_2.jpg) | ![](./assets/girl_ContentRef_2.jpg) |

## Inference Code

* Install [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio):

```shell
git clone https://github.com/modelscope/DiffSynth-Studio.git
cd DiffSynth-Studio
pip install -e .
```

* Direct inference (requires 40G GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
import numpy as np

pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
)
pipe.dit = pipe.enable_lora_hot_loading(pipe.dit)  # Important!
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-ContentRef")],
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_1.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_1.jpg")
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_2.jpg")
```

* Enable lazy loading and memory management (requires 24G GPU memory):

```python
from diffsynth.diffusion.template import TemplatePipeline
from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
import torch
from modelscope import dataset_snapshot_download
from PIL import Image
import numpy as np

vram_config = {
    "offload_dtype": "disk",
    "offload_device": "disk",
    "onload_dtype": torch.float8_e4m3fn,
    "onload_device": "cpu",
    "preparing_dtype": torch.float8_e4m3fn,
    "preparing_device": "cuda",
    "computation_dtype": torch.bfloat16,
    "computation_device": "cuda",
}
pipe = Flux2ImagePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
        ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
    ],
    tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
    vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
)
template = TemplatePipeline.from_pretrained(
    torch_dtype=torch.bfloat16,
    device="cuda",
    model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-ContentRef")],
    lazy_loading=True,
)
dataset_snapshot_download(
    "DiffSynth-Studio/examples_in_diffsynth",
    allow_file_pattern=["templates/*"],
    local_dir="data/examples",
)
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_1.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_1.jpg")
image = template(
    pipe,
    prompt="A cat is sitting on a stone.",
    seed=0, cfg_scale=4, num_inference_steps=50,
    template_inputs=[{
        "image": Image.open("data/examples/templates/image_style_2.jpg"),
    }],
    negative_template_inputs=[{
        "image": Image.fromarray(np.zeros((1024, 1024, 3), dtype=np.uint8) + 128),
    }],
)
image.save("image_ContentRef_2.jpg")
```

## Training Code

After installing DiffSynth-Studio, use the following script to start training. For more information, please refer to the [DiffSynth-Studio documentation](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/).

```shell
modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-ContentRef/*" --local_dir ./data/diffsynth_example_dataset

accelerate launch examples/flux2/model_training/train.py \
  --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-ContentRef \
  --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-ContentRef/metadata.jsonl \
  --extra_inputs "template_inputs" \
  --max_pixels 1048576 \
  --dataset_repeat 50 \
  --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
  --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-ContentRef:" \
  --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
  --learning_rate 1e-4 \
  --num_epochs 2 \
  --remove_prefix_in_ckpt "pipe.template_model." \
  --output_path "./models/train/Template-KleinBase4B-ContentRef_full" \
  --trainable_models "template_model" \
  --use_gradient_checkpointing \
  --find_unused_parameters \
  --enable_lora_hot_loading
```
assets/cat_ContentRef_1.jpg ADDED
assets/cat_ContentRef_2.jpg ADDED
Git LFS Details
  • SHA256: a5ffbfa561bdf44e344fc2d24d5219549b75c949448133f623ab6190e56a3615
  • Pointer size: 131 Bytes
  • Size of remote file: 133 kB
assets/cat_style_1.jpg ADDED
assets/cat_style_2.jpg ADDED
Git LFS Details
  • SHA256: ab1f138570b5df2ced584c373e570ca9b2b08a55ba6e5f42e77a0e6d09e6e6ac
  • Pointer size: 131 Bytes
  • Size of remote file: 123 kB
assets/girl_ContentRef_1.jpg ADDED
Git LFS Details
  • SHA256: 36ec31d3d3689290199fa201cb761994d1462b6a0a935d4472d27be873a45756
  • Pointer size: 131 Bytes
  • Size of remote file: 216 kB
assets/girl_ContentRef_2.jpg ADDED
Git LFS Details
  • SHA256: bff566792cde66cf43b5ad70cf7eb35ee7589b4fdb7978953ab7b1d218441f96
  • Pointer size: 131 Bytes
  • Size of remote file: 236 kB
assets/girl_style_1.jpg ADDED
Git LFS Details
  • SHA256: 57c60a71fbd4f771dfb03fa811089cfa4f96a3b9825ffd1ad4cba603d94e77b7
  • Pointer size: 131 Bytes
  • Size of remote file: 119 kB
assets/girl_style_2.jpg ADDED
Git LFS Details
  • SHA256: 4b5e068a8b3709ec1e804e23a15ab5a7937c9ecd4acc9c0e2ca1b716883e8c3c
  • Pointer size: 131 Bytes
  • Size of remote file: 228 kB
assets/house_ContentRef_1.jpg ADDED
Git LFS Details
  • SHA256: df6b7b01f0b3ab961d7124cb2be04399b65decf5afe83534f8b1fc52c77c6c6e
  • Pointer size: 131 Bytes
  • Size of remote file: 145 kB
assets/house_ContentRef_2.jpg ADDED
assets/house_style_1.jpg ADDED
Git LFS Details
  • SHA256: 615cdb1fa5f2b200e9b8e60af85f040a09c180932df13118b07236776f0cfc95
  • Pointer size: 131 Bytes
  • Size of remote file: 150 kB
assets/house_style_2.jpg ADDED
configuration.json ADDED
@@ -0,0 +1 @@
+{"framework":"Pytorch","task":"text-to-image-synthesis"}
model.py ADDED
@@ -0,0 +1,228 @@
from transformers.models.siglip.modeling_siglip import SiglipVisionTransformer, SiglipVisionConfig
from transformers import SiglipImageProcessor
from PIL import Image
import torch


def merge_lora_weight(tensors_A, tensors_B):
    # Stacking A along dim 0 and B along dim 1 turns k rank-r LoRAs
    # into a single rank-(k*r) LoRA whose delta weight is their sum.
    lora_A = torch.concat(tensors_A, dim=0)
    lora_B = torch.concat(tensors_B, dim=1)
    return lora_A, lora_B


def merge_lora(loras, alpha=1):
    lora_merged = {}
    keys = [i for i in loras[0].keys() if ".lora_A." in i]
    for key in keys:
        tensors_A = [lora[key] for lora in loras]
        tensors_B = [lora[key.replace(".lora_A.", ".lora_B.")] for lora in loras]
        lora_A, lora_B = merge_lora_weight(tensors_A, tensors_B)
        lora_merged[key] = lora_A * alpha
        lora_merged[key.replace(".lora_A.", ".lora_B.")] = lora_B
    return lora_merged


class Siglip2ImageEncoder(SiglipVisionTransformer):
    # SigLIP2-G/384 vision tower used to embed the reference image.
    def __init__(self):
        config = SiglipVisionConfig(
            attention_dropout=0.0,
            dtype="float32",
            hidden_act="gelu_pytorch_tanh",
            hidden_size=1536,
            image_size=384,
            intermediate_size=6144,
            layer_norm_eps=1e-06,
            model_type="siglip_vision_model",
            num_attention_heads=16,
            num_channels=3,
            num_hidden_layers=40,
            patch_size=16,
            transformers_version="4.56.1",
            _attn_implementation="sdpa",
        )
        # For compatibility with transformers
        import sys
        sys.modules["template_model"] = None

        super().__init__(config)
        self.processor = SiglipImageProcessor(
            do_convert_rgb=None,
            do_normalize=True,
            do_rescale=True,
            do_resize=True,
            image_mean=[0.5, 0.5, 0.5],
            image_processor_type="SiglipImageProcessor",
            image_std=[0.5, 0.5, 0.5],
            processor_class="SiglipProcessor",
            resample=2,
            rescale_factor=0.00392156862745098,
            size={"height": 384, "width": 384},
        )

    def forward(self, image, torch_dtype=torch.bfloat16, device="cuda", query_embs=None):
        pixel_values = self.processor(images=[image], return_tensors="pt")["pixel_values"]
        pixel_values = pixel_values.to(device=device, dtype=torch_dtype)
        output_attentions = False
        output_hidden_states = False
        interpolate_pos_encoding = False

        hidden_states = self.embeddings(pixel_values, interpolate_pos_encoding=interpolate_pos_encoding)

        encoder_outputs = self.encoder(
            inputs_embeds=hidden_states,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )

        last_hidden_state = encoder_outputs.last_hidden_state
        last_hidden_state = self.post_layernorm(last_hidden_state)

        if query_embs is None:
            pooler_output = self.head(last_hidden_state)
        else:
            # Attention pooling with learned queries: one query per target LoRA layer.
            hidden_state = self.head.attention(query_embs, last_hidden_state, last_hidden_state)[0]
            residual = hidden_state
            hidden_state = self.head.layernorm(hidden_state)
            pooler_output = residual + self.head.mlp(hidden_state)
        return pooler_output


class CompressedMLP(torch.nn.Module):
    # Two linear layers through a low-dimensional bottleneck (no activation in between).
    def __init__(self, in_dim, mid_dim, out_dim, bias=False):
        super().__init__()
        self.proj_in = torch.nn.Linear(in_dim, mid_dim, bias=bias)
        self.proj_out = torch.nn.Linear(mid_dim, out_dim, bias=bias)

    def forward(self, x):
        x = self.proj_in(x)
        x = self.proj_out(x)
        return x


class ImageEmbeddingToLoraMatrix(torch.nn.Module):
    # Projects one pooled image embedding to the (A, B) matrices of a single LoRA layer.
    def __init__(self, in_dim, compress_dim, lora_a_dim, lora_b_dim, rank):
        super().__init__()
        self.proj_a = CompressedMLP(in_dim, compress_dim, lora_a_dim * rank)
        self.proj_b = CompressedMLP(in_dim, compress_dim, lora_b_dim * rank)
        self.lora_a_dim = lora_a_dim
        self.lora_b_dim = lora_b_dim
        self.rank = rank

    def forward(self, x):
        lora_a = self.proj_a(x).view(self.rank, self.lora_a_dim)
        lora_b = self.proj_b(x).view(self.lora_b_dim, self.rank)
        return lora_a, lora_b


class FLUX2Image2LoRAQueries(torch.nn.Module):
    # Learned query embeddings for the attention-pooling head, one per LoRA layer.
    def __init__(self, length, dim):
        super().__init__()
        self.weights = torch.nn.Parameter(torch.randn((1, length, dim)))

    def forward(self):
        return self.weights


class FLUX2Image2LoRAModel(torch.nn.Module):
    def __init__(self):
        super().__init__()
        # LoRA is injected into the fused QKV/MLP input projection and the output
        # projection of each of the 20 single transformer blocks.
        self.lora_patterns = [
            {
                "name": "single_transformer_blocks.{block_id}.attn.to_qkv_mlp_proj",
                "num_blocks": 20,
                "dim_in": 3072,
                "dim_out": 27648,
            },
            {
                "name": "single_transformer_blocks.{block_id}.attn.to_out",
                "num_blocks": 20,
                "dim_in": 12288,
                "dim_out": 3072,
            },
        ]
        self.image_encoder = Siglip2ImageEncoder()
        self.parse_lora_layers(
            self.lora_patterns,
            dim_image=1536,
            compress_dim=256,
            rank=4,
        )
        self.query_embs = FLUX2Image2LoRAQueries(len(self.layers), 1536)

    def parse_lora_layers(self, lora_patterns, dim_image, compress_dim, rank):
        names = []
        layers = []
        for lora_pattern in lora_patterns:
            for block_id in range(lora_pattern["num_blocks"]):
                name = lora_pattern["name"].format(block_id=block_id)
                layer = ImageEmbeddingToLoraMatrix(dim_image, compress_dim, lora_pattern["dim_in"], lora_pattern["dim_out"], rank)
                names.append(name)
                layers.append(layer)
        self.names = names
        self.layers = torch.nn.ModuleList(layers)

    @torch.no_grad()
    def process_inputs(self, image, scale=1, **kwargs):
        return {"image": image, "scale": scale}

    def forward_single_image(self, image):
        # One pooled embedding per LoRA layer, then project each to (A, B).
        embs = self.image_encoder(image, query_embs=self.query_embs.weights, device=self.query_embs.weights.device)
        embs = embs.chunk(len(self.layers), dim=1)
        lora = {}
        for emb, name, layer in zip(embs, self.names, self.layers):
            lora_a, lora_b = layer(emb)
            lora[f"{name}.lora_A.default.weight"] = lora_a
            lora[f"{name}.lora_B.default.weight"] = lora_b
        return {"lora": lora}

    def forward(self, image, scale=1, **kwargs):
        if not isinstance(image, list):
            image = [image]
        # Multiple reference images are averaged in LoRA space and scaled.
        loras = [self.forward_single_image(i)["lora"] for i in image]
        lora = merge_lora(loras, alpha=1 / len(loras) * scale)
        return {"lora": lora}


class DataAnnotator:
    def __init__(self):
        from diffsynth.core import UnifiedDataset
        self.image_operator = UnifiedDataset.default_image_operator(
            base_path="",  # If your dataset contains relative paths, please specify the root path here.
            max_pixels=1024*1024,
            height_division_factor=16,
            width_division_factor=16,
        )

    def __call__(self, image, **kwargs):
        image = self.image_operator(image)
        return {"image": image}


def initialize_model_weights():
    from diffsynth import ModelConfig, load_state_dict
    from safetensors.torch import save_file
    import os
    # Initialize the image encoder from the pretrained SigLIP2-G384 checkpoint.
    config = ModelConfig(model_id="DiffSynth-Studio/General-Image-Encoders", origin_file_pattern="SigLIP2-G384/model.safetensors")
    config.download_if_necessary()
    state_dict = load_state_dict(config.path, torch_dtype=torch.bfloat16, device="cuda")
    model = FLUX2Image2LoRAModel().to(dtype=torch.bfloat16, device="cuda")
    model.image_encoder.load_state_dict(state_dict)
    # Replicate the pretrained pooling probe as the initial query for every LoRA layer.
    query_embs = {"weights": torch.concat([state_dict["head.probe"]] * len(model.layers), dim=1)}
    model.query_embs.load_state_dict(query_embs, strict=False)
    # Zero the B-matrix generators so the generated LoRAs start as a no-op,
    # and damp the remaining generator weights.
    lora_weights = {}
    for name, param in model.named_parameters():
        if ".proj_b.proj_out." in name:
            lora_weights[name] = param * 0
        elif ".proj_b." in name or ".proj_a." in name:
            lora_weights[name] = param * 0.3
    model.load_state_dict(lora_weights, strict=False)
    print(sum(p.numel() for p in model.parameters()))  # total parameter count
    save_file(model.state_dict(), os.path.join(os.path.dirname(__file__), "model.safetensors"))


TEMPLATE_MODEL = FLUX2Image2LoRAModel
TEMPLATE_MODEL_PATH = "model.safetensors"
TEMPLATE_DATA_PROCESSOR = DataAnnotator
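
For intuition about `merge_lora` above: concatenating k rank-r factor pairs produces a single rank-k·r LoRA whose delta weight is the scaled sum of the individual deltas. Below is a standalone sanity check with dummy tensors; it is an illustrative sketch, not part of the repository, and it assumes this `model.py` is importable (small dimensions stand in for the real 3072/27648 layer sizes):

```python
import torch
from model import merge_lora  # this commit's model.py

in_dim, out_dim, rank = 64, 128, 4  # real layers are e.g. 3072 -> 27648

def random_lora():
    return {
        "layer.lora_A.default.weight": torch.randn(rank, in_dim),
        "layer.lora_B.default.weight": torch.randn(out_dim, rank),
    }

lora1, lora2 = random_lora(), random_lora()
merged = merge_lora([lora1, lora2], alpha=0.5)  # alpha = 1/len(loras) * scale, as in forward()

# Two rank-4 LoRAs become one rank-8 LoRA...
assert merged["layer.lora_A.default.weight"].shape == (2 * rank, in_dim)
assert merged["layer.lora_B.default.weight"].shape == (out_dim, 2 * rank)

# ...whose delta weight is the scaled sum of the individual deltas.
def delta(lora):
    return lora["layer.lora_B.default.weight"] @ lora["layer.lora_A.default.weight"]

assert torch.allclose(delta(merged), 0.5 * (delta(lora1) + delta(lora2)), atol=1e-5)
```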
model.safetensors ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:e7f6cc09ae8693e2083ae7f7f328abf473801a1625d7cb3f10ec1a941d26deef
+size 4277893528