Commit ddc8a59 by sayakpaul (1 parent: 7079251)

add: files.
README.md CHANGED
@@ -1,7 +1,7 @@
  ---
- title: Convert Kerascv Sd Diffusers
- emoji: 🐢
- colorFrom: green
+ title: Convert Kerascv SD to Diffusers
+ emoji: 🧨
+ colorFrom: red
  colorTo: red
  sdk: gradio
  sdk_version: 3.16.2
app.py ADDED
@@ -0,0 +1,44 @@
+ import gradio as gr
+ from convert import run_conversion
+ from hub_utils import save_model_card, push_to_hub
+
+
+ PRETRAINED_CKPT = "CompVis/stable-diffusion-v1-4"
+ DESCRIPTION = """
+ This Space lets you convert KerasCV Stable Diffusion weights to a format compatible with [Diffusers](https://github.com/huggingface/diffusers) 🧨. This allows you to fine-tune with KerasCV and then use the fine-tuned weights in Diffusers, taking advantage of its nifty features (such as schedulers and fast attention). Specifically, the parameters are converted and wrapped into a [`StableDiffusionPipeline`](https://huggingface.co/docs/diffusers/api/pipelines/stable_diffusion/overview). The pipeline is then pushed to the Hugging Face Hub, provided you have supplied `your_hf_token`.
+
+ ## Notes (important)
+
+ * Only Stable Diffusion (v1) is supported as of now, specifically this checkpoint: [`"CompVis/stable-diffusion-v1-4"`](https://huggingface.co/CompVis/stable-diffusion-v1-4).
+ * Only the text encoder and the UNet parameters are converted, since these are the two components that are generally fine-tuned.
+ * [This Colab Notebook](https://colab.research.google.com/drive/1RYY077IQbAJldg8FkK8HSEpNILKHEwLb?usp=sharing) was used to develop the conversion utilities initially.
+ * You can choose not to provide `text_encoder_weights` and `unet_weights` if you don't have any fine-tuned weights. In that case, the original parameters of the respective models (text encoder and UNet) from KerasCV will be used.
+ * You can provide only `text_encoder_weights`, only `unet_weights`, or both.
+ * When providing the weights' links, ensure they're directly downloadable. Internally, the Space uses [`tf.keras.utils.get_file()`](https://www.tensorflow.org/api_docs/python/tf/keras/utils/get_file) to retrieve the weights locally.
+ * If you don't provide `your_hf_token`, the converted pipeline won't be pushed.
+
+ Check [here](https://github.com/huggingface/diffusers/blob/31be42209ddfdb69d9640a777b32e9b5c6259bf0/examples/dreambooth/train_dreambooth_lora.py#L975) for an example of how you can change the scheduler of an already initialized pipeline.
+ """
+
+ def run(hf_token, text_encoder_weights, unet_weights, repo_prefix):
+     if text_encoder_weights == "":
+         text_encoder_weights = None
+     if unet_weights == "":
+         unet_weights = None
+     pipeline = run_conversion(text_encoder_weights, unet_weights)
+     output_path = "kerascv_sd_diffusers_pipeline"
+     pipeline.save_pretrained(output_path)
+     save_model_card(base_model=PRETRAINED_CKPT, repo_folder=output_path, weight_paths=[text_encoder_weights, unet_weights], repo_prefix=repo_prefix)
+     push_str = push_to_hub(hf_token, output_path, repo_prefix)
+     return push_str
+
+ demo = gr.Interface(
+     title="KerasCV Stable Diffusion to Diffusers Stable Diffusion Pipelines 🧨🤗",
+     description=DESCRIPTION,
+     allow_flagging="never",
+     inputs=[gr.Text(max_lines=1, label="your_hf_token"), gr.Text(max_lines=1, label="text_encoder_weights"), gr.Text(max_lines=1, label="unet_weights"), gr.Text(max_lines=1, label="output_repo_prefix")],
+     outputs=[gr.Markdown(label="output")],
+     fn=run,
+ )
+
+ demo.launch()
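The DESCRIPTION above points the user to an example of changing the scheduler of an already initialized pipeline. As a minimal, illustrative sketch (not part of the Space; the repo id below is hypothetical), a pipeline pushed by this Space could be loaded back and its scheduler swapped like so:

import torch
from diffusers import DPMSolverMultistepScheduler, StableDiffusionPipeline

# Hypothetical repo id; use the one reported by the Space after pushing.
pipeline = StableDiffusionPipeline.from_pretrained("your-username/your-prefix-kerascv_sd_diffusers_pipeline")
# Swap the default scheduler for a faster multistep solver.
pipeline.scheduler = DPMSolverMultistepScheduler.from_config(pipeline.scheduler.config)
if torch.cuda.is_available():
    pipeline = pipeline.to("cuda")
image = pipeline("a photograph of an astronaut riding a horse", num_inference_steps=25).images[0]
image.save("astronaut.png")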
conversion_utils/__init__.py ADDED
@@ -0,0 +1,3 @@
+ from .text_encoder import populate_text_encoder
+ from .unet import populate_unet
+ from .utils import run_assertion
conversion_utils/text_encoder.py ADDED
@@ -0,0 +1,110 @@
+ from keras_cv.models import stable_diffusion
+ import tensorflow as tf
+ import torch
+ from typing import Dict
+
+ MAX_SEQ_LENGTH = 77
+
+ def populate_text_encoder(tf_text_encoder: tf.keras.Model) -> Dict[str, torch.Tensor]:
+     """Populates the state dict from the provided TensorFlow model
+     (applicable only for the text encoder)."""
+     text_state_dict = dict()
+     num_encoder_layers = 0
+
+     for layer in tf_text_encoder.layers:
+         # Embeddings.
+         if isinstance(layer, stable_diffusion.text_encoder.CLIPEmbedding):
+             text_state_dict[
+                 "text_model.embeddings.token_embedding.weight"
+             ] = torch.from_numpy(layer.token_embedding.get_weights()[0])
+             text_state_dict[
+                 "text_model.embeddings.position_embedding.weight"
+             ] = torch.from_numpy(layer.position_embedding.get_weights()[0])
+
+         # Encoder blocks.
+         elif isinstance(layer, stable_diffusion.text_encoder.CLIPEncoderLayer):
+             # LayerNorms.
+             for i in range(1, 3):
+                 if i == 1:
+                     text_state_dict[
+                         f"text_model.encoder.layers.{num_encoder_layers}.layer_norm1.weight"
+                     ] = torch.from_numpy(layer.layer_norm1.get_weights()[0])
+                     text_state_dict[
+                         f"text_model.encoder.layers.{num_encoder_layers}.layer_norm1.bias"
+                     ] = torch.from_numpy(layer.layer_norm1.get_weights()[1])
+                 else:
+                     text_state_dict[
+                         f"text_model.encoder.layers.{num_encoder_layers}.layer_norm2.weight"
+                     ] = torch.from_numpy(layer.layer_norm2.get_weights()[0])
+                     text_state_dict[
+                         f"text_model.encoder.layers.{num_encoder_layers}.layer_norm2.bias"
+                     ] = torch.from_numpy(layer.layer_norm2.get_weights()[1])
+
+             # Attention.
+             q_proj = layer.clip_attn.q_proj
+             k_proj = layer.clip_attn.k_proj
+             v_proj = layer.clip_attn.v_proj
+             out_proj = layer.clip_attn.out_proj
+
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.q_proj.weight"
+             ] = torch.from_numpy(q_proj.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.q_proj.bias"
+             ] = torch.from_numpy(q_proj.get_weights()[1])
+
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.k_proj.weight"
+             ] = torch.from_numpy(k_proj.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.k_proj.bias"
+             ] = torch.from_numpy(k_proj.get_weights()[1])
+
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.v_proj.weight"
+             ] = torch.from_numpy(v_proj.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.v_proj.bias"
+             ] = torch.from_numpy(v_proj.get_weights()[1])
+
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.out_proj.weight"
+             ] = torch.from_numpy(out_proj.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.self_attn.out_proj.bias"
+             ] = torch.from_numpy(out_proj.get_weights()[1])
+
+             # MLPs.
+             fc1 = layer.fc1
+             fc2 = layer.fc2
+
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.mlp.fc1.weight"
+             ] = torch.from_numpy(fc1.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.mlp.fc1.bias"
+             ] = torch.from_numpy(fc1.get_weights()[1])
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.mlp.fc2.weight"
+             ] = torch.from_numpy(fc2.get_weights()[0].transpose())
+             text_state_dict[
+                 f"text_model.encoder.layers.{num_encoder_layers}.mlp.fc2.bias"
+             ] = torch.from_numpy(fc2.get_weights()[1])
+
+             num_encoder_layers += 1
+
+         # Final LayerNorm.
+         elif isinstance(layer, tf.keras.layers.LayerNormalization):
+             text_state_dict["text_model.final_layer_norm.weight"] = torch.from_numpy(
+                 layer.get_weights()[0]
+             )
+             text_state_dict["text_model.final_layer_norm.bias"] = torch.from_numpy(
+                 layer.get_weights()[1]
+             )
+
+     # Position ids.
+     text_state_dict["text_model.embeddings.position_ids"] = torch.tensor(
+         list(range(MAX_SEQ_LENGTH))
+     ).unsqueeze(0)
+
+     return text_state_dict
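The `.transpose()` calls above exist because Keras stores a Dense kernel as (in_features, out_features) while `torch.nn.Linear.weight` is (out_features, in_features). A small self-contained check of that convention (illustrative only, not part of the conversion utilities):

import numpy as np
import tensorflow as tf
import torch

dense = tf.keras.layers.Dense(4)
dense.build((None, 3))  # Keras kernel shape: (3, 4) == (in_features, out_features)
linear = torch.nn.Linear(3, 4)  # PyTorch weight shape: (4, 3)

kernel, bias = dense.get_weights()
linear.weight.data = torch.from_numpy(kernel.transpose())
linear.bias.data = torch.from_numpy(bias)

# Both layers now compute the same function.
x = np.random.randn(1, 3).astype("float32")
np.testing.assert_allclose(
    dense(x).numpy(), linear(torch.from_numpy(x)).detach().numpy(), atol=1e-6
)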
conversion_utils/unet.py ADDED
@@ -0,0 +1,291 @@
+ import tensorflow as tf
+ import torch
+ from typing import Dict
+ from itertools import product
+ from keras_cv.models import stable_diffusion
+
+ def port_transformer_block(transformer_block: tf.keras.Model, up_down: str, block_id: int, attention_id: int) -> Dict[str, torch.Tensor]:
+     """Populates a Transformer block."""
+     transformer_dict = dict()
+     if block_id is not None:
+         prefix = f"{up_down}_blocks.{block_id}"
+     else:
+         prefix = "mid_block"
+
+     # Norms.
+     for i in range(1, 4):
+         if i == 1:
+             norm = transformer_block.norm1
+         elif i == 2:
+             norm = transformer_block.norm2
+         elif i == 3:
+             norm = transformer_block.norm3
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.weight"] = torch.from_numpy(norm.get_weights()[0])
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.norm{i}.bias"] = torch.from_numpy(norm.get_weights()[1])
+
+     # Attentions.
+     for i in range(1, 3):
+         if i == 1:
+             attn = transformer_block.attn1
+         else:
+             attn = transformer_block.attn2
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_q.weight"] = torch.from_numpy(attn.to_q.get_weights()[0].transpose())
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_k.weight"] = torch.from_numpy(attn.to_k.get_weights()[0].transpose())
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_v.weight"] = torch.from_numpy(attn.to_v.get_weights()[0].transpose())
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.weight"] = torch.from_numpy(attn.out_proj.get_weights()[0].transpose())
+         transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.attn{i}.to_out.0.bias"] = torch.from_numpy(attn.out_proj.get_weights()[1])
+
+     # Dense.
+     for i in range(0, 3, 2):
+         if i == 0:
+             layer = transformer_block.geglu.dense
+             transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
+             transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.proj.bias"] = torch.from_numpy(layer.get_weights()[1])
+         else:
+             layer = transformer_block.dense
+             transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
+             transformer_dict[f"{prefix}.attentions.{attention_id}.transformer_blocks.0.ff.net.{i}.bias"] = torch.from_numpy(layer.get_weights()[1])
+
+     return transformer_dict
+
+
+ def populate_unet(tf_unet: tf.keras.Model) -> Dict[str, torch.Tensor]:
+     """Populates the state dict from the provided TensorFlow model
+     (applicable only for the UNet)."""
+     unet_state_dict = dict()
+
+     timestep_emb = 1
+     padded_conv = 1
+     up_block = 0
+
+     up_res_blocks = list(product([0, 1, 2, 3], [0, 1, 2]))
+     up_res_block_flag = 0
+
+     up_spatial_transformer_blocks = list(product([1, 2, 3], [0, 1, 2]))
+     up_spatial_transformer_flag = 0
+
+     for layer in tf_unet.layers:
+         # Timestep embedding.
+         if isinstance(layer, tf.keras.layers.Dense):
+             unet_state_dict[f"time_embedding.linear_{timestep_emb}.weight"] = torch.from_numpy(layer.get_weights()[0].transpose())
+             unet_state_dict[f"time_embedding.linear_{timestep_emb}.bias"] = torch.from_numpy(layer.get_weights()[1])
+             timestep_emb += 1
+
+         # Padded convs (downsamplers).
+         elif isinstance(layer, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+             if padded_conv == 1:
+                 # Transposition axes taken from https://github.com/huggingface/transformers/blob/main/src/transformers/modeling_tf_pytorch_utils.py#L104
+                 unet_state_dict["conv_in.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict["conv_in.bias"] = torch.from_numpy(layer.get_weights()[1])
+             elif padded_conv in [2, 3, 4]:
+                 unet_state_dict[f"down_blocks.{padded_conv-2}.downsamplers.0.conv.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"down_blocks.{padded_conv-2}.downsamplers.0.conv.bias"] = torch.from_numpy(layer.get_weights()[1])
+             elif padded_conv == 5:
+                 unet_state_dict["conv_out.weight"] = torch.from_numpy(layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict["conv_out.bias"] = torch.from_numpy(layer.get_weights()[1])
+
+             padded_conv += 1
+
+         # Upsamplers.
+         elif isinstance(layer, stable_diffusion.diffusion_model.Upsample):
+             conv = layer.conv
+             unet_state_dict[f"up_blocks.{up_block}.upsamplers.0.conv.weight"] = torch.from_numpy(conv.get_weights()[0].transpose(3, 2, 0, 1))
+             unet_state_dict[f"up_blocks.{up_block}.upsamplers.0.conv.bias"] = torch.from_numpy(conv.get_weights()[1])
+             up_block += 1
+
+         # Output norms.
+         elif isinstance(layer, stable_diffusion.__internal__.layers.group_normalization.GroupNormalization):
+             unet_state_dict["conv_norm_out.weight"] = torch.from_numpy(layer.get_weights()[0])
+             unet_state_dict["conv_norm_out.bias"] = torch.from_numpy(layer.get_weights()[1])
+
+         # All ResBlocks.
+         elif isinstance(layer, stable_diffusion.diffusion_model.ResBlock):
+             layer_name = layer.name
+             parts = layer_name.split("_")
+
+             # Down.
+             if len(parts) == 2 or int(parts[-1]) < 8:
+                 entry_flow = layer.entry_flow
+                 embedding_flow = layer.embedding_flow
+                 exit_flow = layer.exit_flow
+
+                 down_block_id = 0 if len(parts) == 2 else int(parts[-1]) // 2
+                 down_resnet_id = 0 if len(parts) == 2 else int(parts[-1]) % 2
+
+                 # Conv blocks.
+                 first_conv_layer = entry_flow[-1]
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                 second_conv_layer = exit_flow[-1]
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                 # Residual blocks.
+                 if hasattr(layer, "residual_projection"):
+                     if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                         residual = layer.residual_projection
+                         unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
+                         unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+
+                 # Timestep embedding.
+                 embedding_proj = embedding_flow[-1]
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+
+                 # Norms.
+                 first_group_norm = entry_flow[0]
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                 second_group_norm = exit_flow[0]
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
+                 unet_state_dict[f"down_blocks.{down_block_id}.resnets.{down_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+
+             # Middle.
+             elif int(parts[-1]) == 8 or int(parts[-1]) == 9:
+                 entry_flow = layer.entry_flow
+                 embedding_flow = layer.embedding_flow
+                 exit_flow = layer.exit_flow
+
+                 mid_resnet_id = int(parts[-1]) % 2
+
+                 # Conv blocks.
+                 first_conv_layer = entry_flow[-1]
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                 second_conv_layer = exit_flow[-1]
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                 # Residual blocks.
+                 if hasattr(layer, "residual_projection"):
+                     if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                         residual = layer.residual_projection
+                         unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
+                         unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+
+                 # Timestep embedding.
+                 embedding_proj = embedding_flow[-1]
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+
+                 # Norms.
+                 first_group_norm = entry_flow[0]
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                 second_group_norm = exit_flow[0]
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
+                 unet_state_dict[f"mid_block.resnets.{mid_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+
+             # Up.
+             elif int(parts[-1]) > 9 and up_res_block_flag < len(up_res_blocks):
+                 entry_flow = layer.entry_flow
+                 embedding_flow = layer.embedding_flow
+                 exit_flow = layer.exit_flow
+
+                 up_res_block = up_res_blocks[up_res_block_flag]
+                 up_block_id = up_res_block[0]
+                 up_resnet_id = up_res_block[1]
+
+                 # Conv blocks.
+                 first_conv_layer = entry_flow[-1]
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.weight"] = torch.from_numpy(first_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv1.bias"] = torch.from_numpy(first_conv_layer.get_weights()[1])
+                 second_conv_layer = exit_flow[-1]
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.weight"] = torch.from_numpy(second_conv_layer.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv2.bias"] = torch.from_numpy(second_conv_layer.get_weights()[1])
+
+                 # Residual blocks.
+                 if hasattr(layer, "residual_projection"):
+                     if isinstance(layer.residual_projection, stable_diffusion.__internal__.layers.padded_conv2d.PaddedConv2D):
+                         residual = layer.residual_projection
+                         unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.weight"] = torch.from_numpy(residual.get_weights()[0].transpose(3, 2, 0, 1))
+                         unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.conv_shortcut.bias"] = torch.from_numpy(residual.get_weights()[1])
+
+                 # Timestep embedding.
+                 embedding_proj = embedding_flow[-1]
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.weight"] = torch.from_numpy(embedding_proj.get_weights()[0].transpose())
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.time_emb_proj.bias"] = torch.from_numpy(embedding_proj.get_weights()[1])
+
+                 # Norms.
+                 first_group_norm = entry_flow[0]
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.weight"] = torch.from_numpy(first_group_norm.get_weights()[0])
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm1.bias"] = torch.from_numpy(first_group_norm.get_weights()[1])
+                 second_group_norm = exit_flow[0]
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.weight"] = torch.from_numpy(second_group_norm.get_weights()[0])
+                 unet_state_dict[f"up_blocks.{up_block_id}.resnets.{up_resnet_id}.norm2.bias"] = torch.from_numpy(second_group_norm.get_weights()[1])
+
+                 up_res_block_flag += 1
+
+         # All SpatialTransformer blocks.
+         elif isinstance(layer, stable_diffusion.diffusion_model.SpatialTransformer):
+             layer_name = layer.name
+             parts = layer_name.split("_")
+
+             # Down.
+             if len(parts) == 2 or int(parts[-1]) < 6:
+                 down_block_id = 0 if len(parts) == 2 else int(parts[-1]) // 2
+                 down_attention_id = 0 if len(parts) == 2 else int(parts[-1]) % 2
+
+                 # Convs.
+                 proj1 = layer.proj1
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                 proj2 = layer.proj2
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+
+                 # Transformer blocks.
+                 transformer_block = layer.transformer_block
+                 unet_state_dict.update(port_transformer_block(transformer_block, "down", down_block_id, down_attention_id))
+
+                 # Norms.
+                 norm = layer.norm
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
+                 unet_state_dict[f"down_blocks.{down_block_id}.attentions.{down_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+
+             # Middle.
+             elif int(parts[-1]) == 6:
+                 mid_attention_id = int(parts[-1]) % 2
+                 # Convs.
+                 proj1 = layer.proj1
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                 proj2 = layer.proj2
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+
+                 # Transformer blocks.
+                 transformer_block = layer.transformer_block
+                 unet_state_dict.update(port_transformer_block(transformer_block, "mid", None, mid_attention_id))
+
+                 # Norms.
+                 norm = layer.norm
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
+                 unet_state_dict[f"mid_block.attentions.{mid_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+
+             # Up.
+             elif int(parts[-1]) > 6 and up_spatial_transformer_flag < len(up_spatial_transformer_blocks):
+                 up_spatial_transformer_block = up_spatial_transformer_blocks[up_spatial_transformer_flag]
+                 up_block_id = up_spatial_transformer_block[0]
+                 up_attention_id = up_spatial_transformer_block[1]
+
+                 # Convs.
+                 proj1 = layer.proj1
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.weight"] = torch.from_numpy(proj1.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_in.bias"] = torch.from_numpy(proj1.get_weights()[1])
+                 proj2 = layer.proj2
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.weight"] = torch.from_numpy(proj2.get_weights()[0].transpose(3, 2, 0, 1))
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.proj_out.bias"] = torch.from_numpy(proj2.get_weights()[1])
+
+                 # Transformer blocks.
+                 transformer_block = layer.transformer_block
+                 unet_state_dict.update(port_transformer_block(transformer_block, "up", up_block_id, up_attention_id))
+
+                 # Norms.
+                 norm = layer.norm
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.weight"] = torch.from_numpy(norm.get_weights()[0])
+                 unet_state_dict[f"up_blocks.{up_block_id}.attentions.{up_attention_id}.norm.bias"] = torch.from_numpy(norm.get_weights()[1])
+
+                 up_spatial_transformer_flag += 1
+
+     return unet_state_dict
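The `(3, 2, 0, 1)` transposes above follow from the convolution kernel layouts (see the transformers link referenced in the comment): Keras stores Conv2D kernels as (kernel_h, kernel_w, in_channels, out_channels), whereas PyTorch expects (out_channels, in_channels, kernel_h, kernel_w). A small self-contained check of that convention (illustrative only):

import numpy as np
import tensorflow as tf
import torch

conv_tf = tf.keras.layers.Conv2D(8, kernel_size=3, padding="same")
conv_tf.build((None, 16, 16, 4))  # kernel shape: (3, 3, 4, 8)
conv_pt = torch.nn.Conv2d(4, 8, kernel_size=3, padding=1)

kernel, bias = conv_tf.get_weights()
conv_pt.weight.data = torch.from_numpy(kernel.transpose(3, 2, 0, 1))
conv_pt.bias.data = torch.from_numpy(bias)

x = np.random.randn(1, 16, 16, 4).astype("float32")  # NHWC for TF
out_tf = conv_tf(x).numpy()
out_pt = conv_pt(torch.from_numpy(x.transpose(0, 3, 1, 2)))  # NCHW for PyTorch
np.testing.assert_allclose(out_tf, out_pt.detach().numpy().transpose(0, 2, 3, 1), atol=1e-5)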
conversion_utils/utils.py ADDED
@@ -0,0 +1,15 @@
+
+ import numpy as np
+ import torch
+ from typing import Dict
+
+
+ def run_assertion(orig_pt_state_dict: Dict[str, torch.Tensor], pt_state_dict_from_tf: Dict[str, torch.Tensor]):
+     for k in orig_pt_state_dict:
+         try:
+             np.testing.assert_allclose(
+                 orig_pt_state_dict[k].numpy(),
+                 pt_state_dict_from_tf[k].numpy()
+             )
+         except Exception as e:
+             raise ValueError("There are problems in the parameter population process. Cannot proceed :(") from e
convert.py ADDED
@@ -0,0 +1,90 @@
+ from conversion_utils import populate_text_encoder, populate_unet, run_assertion
+
+ from diffusers import (
+     AutoencoderKL,
+     StableDiffusionPipeline,
+     UNet2DConditionModel,
+ )
+ from diffusers.pipelines.stable_diffusion.safety_checker import StableDiffusionSafetyChecker
+ from transformers import CLIPTextModel
+ import keras_cv
+ import tensorflow as tf
+
+
+ PRETRAINED_CKPT = "CompVis/stable-diffusion-v1-4"
+ REVISION = None
+ NON_EMA_REVISION = None
+ IMG_HEIGHT = IMG_WIDTH = 512
+
+ def initialize_pt_models():
+     """Initializes the separate models of Stable Diffusion from diffusers and downloads
+     their pre-trained weights."""
+     pt_text_encoder = CLIPTextModel.from_pretrained(
+         PRETRAINED_CKPT, subfolder="text_encoder", revision=REVISION
+     )
+     pt_vae = AutoencoderKL.from_pretrained(
+         PRETRAINED_CKPT, subfolder="vae", revision=REVISION
+     )
+     pt_unet = UNet2DConditionModel.from_pretrained(
+         PRETRAINED_CKPT, subfolder="unet", revision=NON_EMA_REVISION
+     )
+     pt_safety_checker = StableDiffusionSafetyChecker.from_pretrained(
+         PRETRAINED_CKPT, subfolder="safety_checker", revision=NON_EMA_REVISION
+     )
+
+     return pt_text_encoder, pt_vae, pt_unet, pt_safety_checker
+
+ def initialize_tf_models():
+     """Initializes the separate models of Stable Diffusion from KerasCV and downloads
+     their pre-trained weights."""
+     tf_sd_model = keras_cv.models.StableDiffusion(img_height=IMG_HEIGHT, img_width=IMG_WIDTH)
+     _ = tf_sd_model.text_to_image("Cartoon")  # To download the weights.
+
+     tf_text_encoder = tf_sd_model.text_encoder
+     tf_vae = tf_sd_model.image_encoder
+     tf_unet = tf_sd_model.diffusion_model
+     return tf_sd_model, tf_text_encoder, tf_vae, tf_unet
+
+
+ def run_conversion(text_encoder_weights: str = None, unet_weights: str = None):
+     pt_text_encoder, pt_vae, pt_unet, pt_safety_checker = initialize_pt_models()
+     tf_sd_model, tf_text_encoder, tf_vae, tf_unet = initialize_tf_models()
+     print("Pre-trained model weights downloaded.")
+
+     if text_encoder_weights is not None:
+         print("Loading fine-tuned text encoder weights.")
+         text_encoder_weights_path = tf.keras.utils.get_file(origin=text_encoder_weights)
+         tf_text_encoder.load_weights(text_encoder_weights_path)
+     if unet_weights is not None:
+         print("Loading fine-tuned UNet weights.")
+         unet_weights_path = tf.keras.utils.get_file(origin=unet_weights)
+         tf_unet.load_weights(unet_weights_path)
+
+     text_encoder_state_dict_from_tf = populate_text_encoder(tf_text_encoder)
+     unet_state_dict_from_tf = populate_unet(tf_unet)
+     print("Conversion done, now running assertions...")
+
+     # Assertions are only run against the original weights, since fine-tuned weights won't match the pre-trained ones.
+     if text_encoder_weights is None:
+         text_encoder_state_dict_from_pt = pt_text_encoder.state_dict()
+         run_assertion(text_encoder_state_dict_from_pt, text_encoder_state_dict_from_tf)
+     if unet_weights is None:
+         unet_state_dict_from_pt = pt_unet.state_dict()
+         run_assertion(unet_state_dict_from_pt, unet_state_dict_from_tf)
+
+     print("Assertions successful, populating the converted parameters into the diffusers models...")
+     pt_text_encoder.load_state_dict(text_encoder_state_dict_from_tf)
+     pt_unet.load_state_dict(unet_state_dict_from_tf)
+
+     print("Parameters ported, preparing StableDiffusionPipeline...")
+     pipeline = StableDiffusionPipeline.from_pretrained(
+         PRETRAINED_CKPT,
+         unet=pt_unet,
+         text_encoder=pt_text_encoder,
+         vae=pt_vae,
+         safety_checker=pt_safety_checker,
+         revision=None,
+     )
+     return pipeline
+
+
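For reference, a minimal sketch of calling `run_conversion` directly, outside the Gradio app (the weights URL below is hypothetical; pass `None` to keep the original KerasCV parameters):

from convert import run_conversion

pipeline = run_conversion(
    text_encoder_weights=None,  # keep the original text encoder
    unet_weights="https://example.com/finetuned_unet.h5",  # hypothetical; must be directly downloadable
)
pipeline.save_pretrained("kerascv_sd_diffusers_pipeline")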
hub_utils/__init__.py ADDED
@@ -0,0 +1,2 @@
+ from .readme import save_model_card
+ from .repo import push_to_hub
hub_utils/readme.py ADDED
@@ -0,0 +1,29 @@
+ import os
+
+
+ # Copied from https://github.com/huggingface/diffusers/blob/31be42209ddfdb69d9640a777b32e9b5c6259bf0/examples/text_to_image/train_text_to_image_lora.py#L55
+ def save_model_card(base_model: str = None, repo_folder: str = None, weight_paths=None, repo_prefix: str = None):  # repo_prefix is passed by app.py; not used in the card itself.
+     yaml = f"""
+ ---
+ license: creativeml-openrail-m
+ base_model: {base_model}
+ tags:
+ - stable-diffusion
+ - stable-diffusion-diffusers
+ - text-to-image
+ - diffusers
+ inference: true
+ ---
+     """
+     model_card = f"""
+ # KerasCV Stable Diffusion in Diffusers 🧨🤗
+
+ The pipeline contained in this repository was created using [this Space](https://huggingface.co/spaces/sayakpaul/convert-kerascv-sd-diffusers). Its purpose is to convert the KerasCV Stable Diffusion weights into a format compatible with Diffusers. This allows users to fine-tune with KerasCV and then use the fine-tuned weights in Diffusers, taking advantage of its nifty features (such as schedulers and fast attention).\n
+
+     """
+
+     if weight_paths is not None:
+         model_card += f"The following weight paths (KerasCV) were used: {weight_paths}"
+
+     with open(os.path.join(repo_folder, "README.md"), "w") as f:
+         f.write(yaml + model_card)
hub_utils/repo.py ADDED
@@ -0,0 +1,15 @@
+ from huggingface_hub import HfApi, create_repo
+
+ def push_to_hub(hf_token: str, push_dir: str, repo_prefix: str) -> str:
+     try:
+         if hf_token == "":
+             return "No HF token provided. Model won't be pushed."
+         else:
+             hf_api = HfApi(token=hf_token)
+             user = hf_api.whoami()["name"]
+             repo_id = f"{user}/{push_dir}" if repo_prefix == "" else f"{user}/{repo_prefix}-{push_dir}"
+             _ = create_repo(repo_id=repo_id, token=hf_token, exist_ok=True)
+             url = hf_api.upload_folder(folder_path=push_dir, repo_id=repo_id)
+             return f"Model successfully pushed: [{url}]({url})"
+     except Exception as e:
+         return f"{e}"
requirements.txt ADDED
@@ -0,0 +1,7 @@
+ transformers==4.25.1
+ numpy==1.21.6
+ torch==1.12.1
+ tensorflow==2.10.0
+ git+https://github.com/keras-team/keras-cv.git@master
+ git+https://github.com/huggingface/diffusers.git@main
+ tensorflow-datasets==4.8.0