kelseye committed on
Commit
9e62268
·
verified ·
1 Parent(s): 3ff3e66

Upload folder using huggingface_hub

Browse files
.gitattributes CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
 
 
 
 
 
 
33
  *.zip filter=lfs diff=lfs merge=lfs -text
34
  *.zst filter=lfs diff=lfs merge=lfs -text
35
  *tfevents* filter=lfs diff=lfs merge=lfs -text
36
+ assets/cat_rgb_cold.jpg filter=lfs diff=lfs merge=lfs -text
37
+ assets/cat_rgb_normal.jpg filter=lfs diff=lfs merge=lfs -text
38
+ assets/cat_rgb_warm.jpg filter=lfs diff=lfs merge=lfs -text
39
+ assets/room_rgb_cold.jpg filter=lfs diff=lfs merge=lfs -text
40
+ assets/room_rgb_warm.jpg filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,193 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ license: apache-2.0
3
+ ---
4
+ # Templates - Color Tone Adjustment (FLUX.2-klein-base-4B)
5
+
6
+ This model is part of the open-source Diffusion Templates series by [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio). It is a color tone adjustment model that allows users to globally control the image's color tendency and color temperature atmosphere by directly inputting normalized numerical values for the `R`, `G`, and `B` channels.
7
+
8
+ ## Results
9
+
10
+ > **Prompt:** A cat is sitting on a stone.
11
+
12
+ | cold | normal | warm |
13
+ |:---:|:---:|:---:|
14
+ | ![](./assets/cat_rgb_cold.jpg) | ![](./assets/cat_rgb_normal.jpg) | ![](./assets/cat_rgb_warm.jpg) |
15
+
16
+ ---
17
+
18
+ > **Prompt:** A cinematic portrait of a beautiful woman looking out a rainy window.
19
+
20
+ | cold | normal | warm |
21
+ |:---:|:---:|:---:|
22
+ | ![](./assets/girl_rgb_cold.jpg) | ![](./assets/girl_rgb_normal.jpg) | ![](./assets/girl_rgb_warm.jpg) |
23
+
24
+ ---
25
+
26
+ > **Prompt:** A modern minimalist living room with furniture.
27
+
28
+ | cold | normal | warm |
29
+ |:---:|:---:|:---:|
30
+ | ![](./assets/room_rgb_cold.jpg) | ![](./assets/room_rgb_normal.jpg) | ![](./assets/room_rgb_warm.jpg) |
31
+
32
+ ## Inference Code
33
+
34
+ * Install [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
35
+
36
+ ```
37
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
38
+ cd DiffSynth-Studio
39
+ pip install -e .
40
+ ```
41
+
42
+ * Direct inference, requires 40G GPU memory
43
+
44
+ ```python
45
+ from diffsynth.diffusion.template import TemplatePipeline
46
+ from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
47
+ import torch
48
+
49
+
50
+ pipe = Flux2ImagePipeline.from_pretrained(
51
+ torch_dtype=torch.bfloat16,
52
+ device="cuda",
53
+ model_configs=[
54
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
55
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
56
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
57
+ ],
58
+ tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
59
+ )
60
+ template = TemplatePipeline.from_pretrained(
61
+ torch_dtype=torch.bfloat16,
62
+ device="cuda",
63
+ model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-SoftRGB")],
64
+ )
65
+ image = template(
66
+ pipe,
67
+ prompt="A cat is sitting on a stone.",
68
+ seed=0, cfg_scale=4, num_inference_steps=50,
69
+ template_inputs = [{
70
+ "R": 128/255,
71
+ "G": 128/255,
72
+ "B": 128/255
73
+ }],
74
+ )
75
+ image.save("image_rgb_normal.jpg")
76
+ image = template(
77
+ pipe,
78
+ prompt="A cat is sitting on a stone.",
79
+ seed=0, cfg_scale=4, num_inference_steps=50,
80
+ template_inputs = [{
81
+ "R": 208/255,
82
+ "G": 185/255,
83
+ "B": 138/255
84
+ }],
85
+ )
86
+ image.save("image_rgb_warm.jpg")
87
+ image = template(
88
+ pipe,
89
+ prompt="A cat is sitting on a stone.",
90
+ seed=0, cfg_scale=4, num_inference_steps=50,
91
+ template_inputs = [{
92
+ "R": 94/255,
93
+ "G": 163/255,
94
+ "B": 174/255
95
+ }],
96
+ )
97
+ image.save("image_rgb_cold.jpg")
98
+ ```
99
+
100
+ * Enable lazy loading and memory management, requires 24G GPU memory
101
+
102
+ ```python
103
+ from diffsynth.diffusion.template import TemplatePipeline
104
+ from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
105
+ import torch
106
+
107
+
108
+ vram_config = {
109
+ "offload_dtype": "disk",
110
+ "offload_device": "disk",
111
+ "onload_dtype": torch.float8_e4m3fn,
112
+ "onload_device": "cpu",
113
+ "preparing_dtype": torch.float8_e4m3fn,
114
+ "preparing_device": "cuda",
115
+ "computation_dtype": torch.bfloat16,
116
+ "computation_device": "cuda",
117
+ }
118
+ pipe = Flux2ImagePipeline.from_pretrained(
119
+ torch_dtype=torch.bfloat16,
120
+ device="cuda",
121
+ model_configs=[
122
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
123
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
124
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
125
+ ],
126
+ tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
127
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
128
+ )
129
+ template = TemplatePipeline.from_pretrained(
130
+ torch_dtype=torch.bfloat16,
131
+ device="cuda",
132
+ model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-SoftRGB")],
133
+ lazy_loading=True,
134
+ )
135
+ image = template(
136
+ pipe,
137
+ prompt="A cat is sitting on a stone.",
138
+ seed=0, cfg_scale=4, num_inference_steps=50,
139
+ template_inputs = [{
140
+ "R": 128/255,
141
+ "G": 128/255,
142
+ "B": 128/255
143
+ }],
144
+ )
145
+ image.save("image_rgb_normal.jpg")
146
+ image = template(
147
+ pipe,
148
+ prompt="A cat is sitting on a stone.",
149
+ seed=0, cfg_scale=4, num_inference_steps=50,
150
+ template_inputs = [{
151
+ "R": 208/255,
152
+ "G": 185/255,
153
+ "B": 138/255
154
+ }],
155
+ )
156
+ image.save("image_rgb_warm.jpg")
157
+ image = template(
158
+ pipe,
159
+ prompt="A cat is sitting on a stone.",
160
+ seed=0, cfg_scale=4, num_inference_steps=50,
161
+ template_inputs = [{
162
+ "R": 94/255,
163
+ "G": 163/255,
164
+ "B": 174/255
165
+ }],
166
+ )
167
+ image.save("image_rgb_cold.jpg")
168
+ ```
169
+
170
+ ## Training Code
171
+
172
+ After installing DiffSynth-Studio, use the following script to start training. For more information, please refer to the [DiffSynth-Studio Documentation](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/).
173
+
174
+ ```shell
175
+ modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-SoftRGB/*" --local_dir ./data/diffsynth_example_dataset
176
+
177
+ accelerate launch examples/flux2/model_training/train.py \
178
+ --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-SoftRGB \
179
+ --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-SoftRGB/metadata.jsonl \
180
+ --extra_inputs "template_inputs" \
181
+ --max_pixels 1048576 \
182
+ --dataset_repeat 50 \
183
+ --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
184
+ --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-SoftRGB:" \
185
+ --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
186
+ --learning_rate 1e-4 \
187
+ --num_epochs 2 \
188
+ --remove_prefix_in_ckpt "pipe.template_model." \
189
+ --output_path "./models/train/Template-KleinBase4B-SoftRGB_full" \
190
+ --trainable_models "template_model" \
191
+ --use_gradient_checkpointing \
192
+ --find_unused_parameters
193
+ ```
README_from_modelscope.md ADDED
@@ -0,0 +1,197 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ ---
2
+ frameworks:
3
+ - Pytorch
4
+ license: Apache License 2.0
5
+ tags: []
6
+ tasks:
7
+ - text-to-image-synthesis
8
+ ---
9
+
10
+ # Templates-色调调节(FLUX.2-klein-base-4B)
11
+
12
+ 本模型是 [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio) 开源的 Diffusion Templates 系列模型之一。该模型为色调调节模型,允许用户通过直接输入 `R`、`G`、`B` 三个通道的归一化数值,全局调控画面的色彩倾向与色温氛围。
13
+
14
+ ## 效果展示
15
+
16
+ > **Prompt:** A cat is sitting on a stone.
17
+
18
+ | cold | normal | warm |
19
+ |:---:|:---:|:---:|
20
+ | ![](./assets/cat_rgb_cold.jpg) | ![](./assets/cat_rgb_normal.jpg) | ![](./assets/cat_rgb_warm.jpg) |
21
+
22
+ ---
23
+
24
+ > **Prompt:** A cinematic portrait of a beautiful woman looking out a rainy window.
25
+
26
+ | cold | normal | warm |
27
+ |:---:|:---:|:---:|
28
+ | ![](./assets/girl_rgb_cold.jpg) | ![](./assets/girl_rgb_normal.jpg) | ![](./assets/girl_rgb_warm.jpg) |
29
+
30
+ ---
31
+
32
+ > **Prompt:** A modern minimalist living room with furniture.
33
+
34
+ | cold | normal | warm |
35
+ |:---:|:---:|:---:|
36
+ | ![](./assets/room_rgb_cold.jpg) | ![](./assets/room_rgb_normal.jpg) | ![](./assets/room_rgb_warm.jpg) |
37
+
38
+ ## 推理代码
39
+
40
+ * 安装 [DiffSynth-Studio](https://github.com/modelscope/DiffSynth-Studio)
41
+
42
+ ```
43
+ git clone https://github.com/modelscope/DiffSynth-Studio.git
44
+ cd DiffSynth-Studio
45
+ pip install -e .
46
+ ```
47
+
48
+ * 直接推理,需 40G 显存
49
+
50
+ ```python
51
+ from diffsynth.diffusion.template import TemplatePipeline
52
+ from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
53
+ import torch
54
+
55
+ pipe = Flux2ImagePipeline.from_pretrained(
56
+ torch_dtype=torch.bfloat16,
57
+ device="cuda",
58
+ model_configs=[
59
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors"),
60
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors"),
61
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
62
+ ],
63
+ tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
64
+ )
65
+ template = TemplatePipeline.from_pretrained(
66
+ torch_dtype=torch.bfloat16,
67
+ device="cuda",
68
+ model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-SoftRGB")],
69
+ )
70
+ image = template(
71
+ pipe,
72
+ prompt="A cat is sitting on a stone.",
73
+ seed=0, cfg_scale=4, num_inference_steps=50,
74
+ template_inputs = [{
75
+ "R": 128/255,
76
+ "G": 128/255,
77
+ "B": 128/255
78
+ }],
79
+ )
80
+ image.save("image_rgb_normal.jpg")
81
+ image = template(
82
+ pipe,
83
+ prompt="A cat is sitting on a stone.",
84
+ seed=0, cfg_scale=4, num_inference_steps=50,
85
+ template_inputs = [{
86
+ "R": 208/255,
87
+ "G": 185/255,
88
+ "B": 138/255
89
+ }],
90
+ )
91
+ image.save("image_rgb_warm.jpg")
92
+ image = template(
93
+ pipe,
94
+ prompt="A cat is sitting on a stone.",
95
+ seed=0, cfg_scale=4, num_inference_steps=50,
96
+ template_inputs = [{
97
+ "R": 94/255,
98
+ "G": 163/255,
99
+ "B": 174/255
100
+ }],
101
+ )
102
+ image.save("image_rgb_cold.jpg")
103
+ ```
104
+
105
+ * 开启惰性加载和显存管理,需 24G 显存
106
+
107
+ ```python
108
+ from diffsynth.diffusion.template import TemplatePipeline
109
+ from diffsynth.pipelines.flux2_image import Flux2ImagePipeline, ModelConfig
110
+ import torch
111
+
112
+ vram_config = {
113
+ "offload_dtype": "disk",
114
+ "offload_device": "disk",
115
+ "onload_dtype": torch.float8_e4m3fn,
116
+ "onload_device": "cpu",
117
+ "preparing_dtype": torch.float8_e4m3fn,
118
+ "preparing_device": "cuda",
119
+ "computation_dtype": torch.bfloat16,
120
+ "computation_device": "cuda",
121
+ }
122
+ pipe = Flux2ImagePipeline.from_pretrained(
123
+ torch_dtype=torch.bfloat16,
124
+ device="cuda",
125
+ model_configs=[
126
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-base-4B", origin_file_pattern="transformer/*.safetensors", **vram_config),
127
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="text_encoder/*.safetensors", **vram_config),
128
+ ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="vae/diffusion_pytorch_model.safetensors"),
129
+ ],
130
+ tokenizer_config=ModelConfig(model_id="black-forest-labs/FLUX.2-klein-4B", origin_file_pattern="tokenizer/"),
131
+ vram_limit=torch.cuda.mem_get_info("cuda")[1] / (1024 ** 3) - 0.5,
132
+ )
133
+ template = TemplatePipeline.from_pretrained(
134
+ torch_dtype=torch.bfloat16,
135
+ device="cuda",
136
+ model_configs=[ModelConfig(model_id="DiffSynth-Studio/Template-KleinBase4B-SoftRGB")],
137
+ lazy_loading=True,
138
+ )
139
+ image = template(
140
+ pipe,
141
+ prompt="A cat is sitting on a stone.",
142
+ seed=0, cfg_scale=4, num_inference_steps=50,
143
+ template_inputs = [{
144
+ "R": 128/255,
145
+ "G": 128/255,
146
+ "B": 128/255
147
+ }],
148
+ )
149
+ image.save("image_rgb_normal.jpg")
150
+ image = template(
151
+ pipe,
152
+ prompt="A cat is sitting on a stone.",
153
+ seed=0, cfg_scale=4, num_inference_steps=50,
154
+ template_inputs = [{
155
+ "R": 208/255,
156
+ "G": 185/255,
157
+ "B": 138/255
158
+ }],
159
+ )
160
+ image.save("image_rgb_warm.jpg")
161
+ image = template(
162
+ pipe,
163
+ prompt="A cat is sitting on a stone.",
164
+ seed=0, cfg_scale=4, num_inference_steps=50,
165
+ template_inputs = [{
166
+ "R": 94/255,
167
+ "G": 163/255,
168
+ "B": 174/255
169
+ }],
170
+ )
171
+ image.save("image_rgb_cold.jpg")
172
+ ```
173
+
174
+ ## 训练代码
175
+
176
+ 安装 DiffSynth-Studio 后,使用以下脚本可开启训练,更多信息请参考 [DiffSynth-Studio 文档](https://diffsynth-studio-doc.readthedocs.io/zh-cn/latest/)。
177
+
178
+ ```shell
179
+ modelscope download --dataset DiffSynth-Studio/diffsynth_example_dataset --include "flux2/Template-KleinBase4B-SoftRGB/*" --local_dir ./data/diffsynth_example_dataset
180
+
181
+ accelerate launch examples/flux2/model_training/train.py \
182
+ --dataset_base_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-SoftRGB \
183
+ --dataset_metadata_path data/diffsynth_example_dataset/flux2/Template-KleinBase4B-SoftRGB/metadata.jsonl \
184
+ --extra_inputs "template_inputs" \
185
+ --max_pixels 1048576 \
186
+ --dataset_repeat 50 \
187
+ --model_id_with_origin_paths "black-forest-labs/FLUX.2-klein-4B:text_encoder/*.safetensors,black-forest-labs/FLUX.2-klein-base-4B:transformer/*.safetensors,black-forest-labs/FLUX.2-klein-4B:vae/diffusion_pytorch_model.safetensors" \
188
+ --template_model_id_or_path "DiffSynth-Studio/Template-KleinBase4B-SoftRGB:" \
189
+ --tokenizer_path "black-forest-labs/FLUX.2-klein-4B:tokenizer/" \
190
+ --learning_rate 1e-4 \
191
+ --num_epochs 2 \
192
+ --remove_prefix_in_ckpt "pipe.template_model." \
193
+ --output_path "./models/train/Template-KleinBase4B-SoftRGB_full" \
194
+ --trainable_models "template_model" \
195
+ --use_gradient_checkpointing \
196
+ --find_unused_parameters
197
+ ```
assets/cat_rgb_cold.jpg ADDED

Git LFS Details

  • SHA256: 7ab91bc1892c1228e2c220128cb0245ef6e89bdb1edddfd1735b2722c86a6e74
  • Pointer size: 131 Bytes
  • Size of remote file: 130 kB
assets/cat_rgb_normal.jpg ADDED

Git LFS Details

  • SHA256: 629aceaf20b898f3e4379d9419912b7d1f551740733ab146656723502a2f21a1
  • Pointer size: 131 Bytes
  • Size of remote file: 114 kB
assets/cat_rgb_warm.jpg ADDED

Git LFS Details

  • SHA256: aa63c1645f0428b6df1ed4ed0a76c46a5ad19f5ebd014b5aa553424af92ac56c
  • Pointer size: 131 Bytes
  • Size of remote file: 110 kB
assets/girl_rgb_cold.jpg ADDED
assets/girl_rgb_normal.jpg ADDED
assets/girl_rgb_warm.jpg ADDED
assets/room_rgb_cold.jpg ADDED

Git LFS Details

  • SHA256: 7145c51ecb46a224e4e2fdaeede63f70ea3c76cfed7b9a0b41be78b44a091dba
  • Pointer size: 131 Bytes
  • Size of remote file: 105 kB
assets/room_rgb_normal.jpg ADDED
assets/room_rgb_warm.jpg ADDED

Git LFS Details

  • SHA256: c9a54d37bea82f4bf4b4ccf624201326e82be2d8b92b901bed44ce5d1083ecb3
  • Pointer size: 131 Bytes
  • Size of remote file: 101 kB
configuration.json ADDED
@@ -0,0 +1 @@
 
 
1
+ {"framework":"Pytorch","task":"text-to-image-synthesis"}
model.py ADDED
@@ -0,0 +1,64 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import torch, math
2
+ from PIL import Image
3
+ import numpy as np
4
+
5
+
6
class MultiValueEncoder(torch.nn.Module):
    """Encode a small vector of scalar control values into a sequence of embeddings.

    Each scalar is expanded into a sinusoidal (timestep-style) embedding of
    width ``dim_in``; the concatenated embeddings are passed through a
    two-layer MLP and broadcast over ``length`` positions, each offset by a
    learned positional embedding.
    """

    def __init__(self, dim_in=256, dim_out=4096, length=32, num_values=3):
        super().__init__()
        # Keep dim_in so forward() stays in sync with the MLP's input width.
        self.dim_in = dim_in
        self.length = length
        self.prefer_value_embedder = torch.nn.Sequential(
            torch.nn.Linear(dim_in * num_values, dim_out),
            torch.nn.SiLU(),
            torch.nn.Linear(dim_out, dim_out),
        )
        self.positional_embedding = torch.nn.Parameter(torch.randn(self.length, dim_out))

    def get_timestep_embedding(self, timesteps, embedding_dim, max_period=10000):
        """Return a sinusoidal embedding of shape (len(timesteps), embedding_dim).

        Cosine terms occupy the first half of the last dimension, sine terms
        the second half.
        """
        half_dim = embedding_dim // 2
        exponent = -math.log(max_period) * torch.arange(0, half_dim, dtype=torch.float32, device=timesteps.device) / half_dim
        emb = timesteps[:, None].float() * torch.exp(exponent)[None, :]
        emb = torch.cat([torch.cos(emb), torch.sin(emb)], dim=-1)
        return emb

    def forward(self, value, dtype):
        """Map ``value`` (1-D tensor of ``num_values`` scalars, nominally in [0, 1]) to (length, dim_out) embeddings."""
        # Scale values to ~[0, 1000], the range timestep embeddings are tuned for.
        # Fix: use self.dim_in (previously hard-coded 256) so that a
        # non-default dim_in no longer breaks the MLP's expected input size.
        emb = self.get_timestep_embedding(value * 1000, self.dim_in).to(dtype)
        emb = emb.view(1, -1)  # concatenate the per-value embeddings into one row
        emb = self.prefer_value_embedder(emb).squeeze(0)
        base_embeddings = emb.expand(self.length, -1)
        positional_embedding = self.positional_embedding.to(dtype=base_embeddings.dtype, device=base_embeddings.device)
        learned_embeddings = base_embeddings + positional_embedding
        return learned_embeddings
28
+
29
+
30
class ValueFormatModel(torch.nn.Module):
    """Template model that turns normalized R/G/B control values into per-block KV caches.

    For every transformer block name it owns a pair of ``MultiValueEncoder``
    heads; their outputs are reshaped into extra attention key/value tokens
    keyed by block name.
    """

    def __init__(self, num_double_blocks=5, num_single_blocks=20, dim=3072, num_heads=24, length=512):
        super().__init__()
        # One named entry per attention block the host pipeline exposes.
        self.block_names = [f"double_{i}" for i in range(num_double_blocks)] + [f"single_{i}" for i in range(num_single_blocks)]
        self.proj_k = torch.nn.ModuleDict({block_name: MultiValueEncoder(dim_out=dim, length=length) for block_name in self.block_names})
        self.proj_v = torch.nn.ModuleDict({block_name: MultiValueEncoder(dim_out=dim, length=length) for block_name in self.block_names})
        self.num_heads = num_heads
        self.length = length

    @torch.no_grad()
    def process_inputs(self, pipe, R, G, B, **kwargs):
        """Pack normalized R/G/B scalars into a conditioning tensor on the pipeline's dtype/device."""
        # torch.tensor (not the legacy torch.Tensor constructor) takes
        # dtype/device directly and avoids an intermediate float32 copy.
        return {"value": torch.tensor([R, G, B], dtype=pipe.torch_dtype, device=pipe.device)}

    def forward(self, value, **kwargs):
        """Return {"kv_cache": {block_name: (k, v)}}; k and v have shape (1, length, num_heads, dim // num_heads)."""
        kv_cache = {}
        for block_name in self.block_names:
            k = self.proj_k[block_name](value, value.dtype)
            k = k.view(1, self.length, self.num_heads, -1)
            v = self.proj_v[block_name](value, value.dtype)
            v = v.view(1, self.length, self.num_heads, -1)
            kv_cache[block_name] = (k, v)
        return {"kv_cache": kv_cache}
52
+
53
+
54
class DataAnnotator:
    """Annotate a training image with its normalized per-channel mean color."""

    def __call__(self, image, **kwargs):
        # Load as RGB and work in float32 so channel means are not truncated.
        rgb = np.asarray(Image.open(image).convert("RGB"), dtype=np.float32)
        # Per-channel means over all pixels, normalized to [0, 1].
        means = rgb.reshape(-1, 3).mean(axis=0) / 255
        return {"R": means[0], "G": means[1], "B": means[2]}
60
+
61
+
62
# Module-level entry points — presumably read by DiffSynth-Studio's
# TemplatePipeline loader (TODO confirm against DiffSynth-Studio source).
TEMPLATE_MODEL = ValueFormatModel          # template network class to instantiate
TEMPLATE_MODEL_PATH = "model.safetensors"  # weight file within this repository
TEMPLATE_DATA_PROCESSOR = DataAnnotator    # annotator producing R/G/B training targets
model.safetensors ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:49fe9aa7fc27f1ac3ebe6d99013d251ef8312cfa218e813bcf1cc0cbdaeffbca
3
+ size 1337578464