diffusers-sdxl-controlnet / scripts /convert_asymmetric_vqgan_to_diffusers.py
svjack's picture
Upload 1392 files
43b7e92 verified
raw
history blame
6.91 kB
import argparse
import time
from pathlib import Path
from typing import Any, Dict, Literal
import torch
from diffusers import AsymmetricAutoencoderKL
ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG = {
"in_channels": 3,
"out_channels": 3,
"down_block_types": [
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
],
"down_block_out_channels": [128, 256, 512, 512],
"layers_per_down_block": 2,
"up_block_types": [
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
],
"up_block_out_channels": [192, 384, 768, 768],
"layers_per_up_block": 3,
"act_fn": "silu",
"latent_channels": 4,
"norm_num_groups": 32,
"sample_size": 256,
"scaling_factor": 0.18215,
}
ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG = {
"in_channels": 3,
"out_channels": 3,
"down_block_types": [
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
"DownEncoderBlock2D",
],
"down_block_out_channels": [128, 256, 512, 512],
"layers_per_down_block": 2,
"up_block_types": [
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
"UpDecoderBlock2D",
],
"up_block_out_channels": [256, 512, 1024, 1024],
"layers_per_up_block": 5,
"act_fn": "silu",
"latent_channels": 4,
"norm_num_groups": 32,
"sample_size": 256,
"scaling_factor": 0.18215,
}
def convert_asymmetric_autoencoder_kl_state_dict(original_state_dict: Dict[str, Any]) -> Dict[str, Any]:
converted_state_dict = {}
for k, v in original_state_dict.items():
if k.startswith("encoder."):
converted_state_dict[
k.replace("encoder.down.", "encoder.down_blocks.")
.replace("encoder.mid.", "encoder.mid_block.")
.replace("encoder.norm_out.", "encoder.conv_norm_out.")
.replace(".downsample.", ".downsamplers.0.")
.replace(".nin_shortcut.", ".conv_shortcut.")
.replace(".block.", ".resnets.")
.replace(".block_1.", ".resnets.0.")
.replace(".block_2.", ".resnets.1.")
.replace(".attn_1.k.", ".attentions.0.to_k.")
.replace(".attn_1.q.", ".attentions.0.to_q.")
.replace(".attn_1.v.", ".attentions.0.to_v.")
.replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
.replace(".attn_1.norm.", ".attentions.0.group_norm.")
] = v
elif k.startswith("decoder.") and "up_layers" not in k:
converted_state_dict[
k.replace("decoder.encoder.", "decoder.condition_encoder.")
.replace(".norm_out.", ".conv_norm_out.")
.replace(".up.0.", ".up_blocks.3.")
.replace(".up.1.", ".up_blocks.2.")
.replace(".up.2.", ".up_blocks.1.")
.replace(".up.3.", ".up_blocks.0.")
.replace(".block.", ".resnets.")
.replace("mid", "mid_block")
.replace(".0.upsample.", ".0.upsamplers.0.")
.replace(".1.upsample.", ".1.upsamplers.0.")
.replace(".2.upsample.", ".2.upsamplers.0.")
.replace(".nin_shortcut.", ".conv_shortcut.")
.replace(".block_1.", ".resnets.0.")
.replace(".block_2.", ".resnets.1.")
.replace(".attn_1.k.", ".attentions.0.to_k.")
.replace(".attn_1.q.", ".attentions.0.to_q.")
.replace(".attn_1.v.", ".attentions.0.to_v.")
.replace(".attn_1.proj_out.", ".attentions.0.to_out.0.")
.replace(".attn_1.norm.", ".attentions.0.group_norm.")
] = v
elif k.startswith("quant_conv."):
converted_state_dict[k] = v
elif k.startswith("post_quant_conv."):
converted_state_dict[k] = v
else:
print(f" skipping key `{k}`")
# fix weights shape
for k, v in converted_state_dict.items():
if (
(k.startswith("encoder.mid_block.attentions.0") or k.startswith("decoder.mid_block.attentions.0"))
and k.endswith("weight")
and ("to_q" in k or "to_k" in k or "to_v" in k or "to_out" in k)
):
converted_state_dict[k] = converted_state_dict[k][:, :, 0, 0]
return converted_state_dict
def get_asymmetric_autoencoder_kl_from_original_checkpoint(
scale: Literal["1.5", "2"], original_checkpoint_path: str, map_location: torch.device
) -> AsymmetricAutoencoderKL:
print("Loading original state_dict")
original_state_dict = torch.load(original_checkpoint_path, map_location=map_location)
original_state_dict = original_state_dict["state_dict"]
print("Converting state_dict")
converted_state_dict = convert_asymmetric_autoencoder_kl_state_dict(original_state_dict)
kwargs = ASYMMETRIC_AUTOENCODER_KL_x_1_5_CONFIG if scale == "1.5" else ASYMMETRIC_AUTOENCODER_KL_x_2_CONFIG
print("Initializing AsymmetricAutoencoderKL model")
asymmetric_autoencoder_kl = AsymmetricAutoencoderKL(**kwargs)
print("Loading weight from converted state_dict")
asymmetric_autoencoder_kl.load_state_dict(converted_state_dict)
asymmetric_autoencoder_kl.eval()
print("AsymmetricAutoencoderKL successfully initialized")
return asymmetric_autoencoder_kl
if __name__ == "__main__":
start = time.time()
parser = argparse.ArgumentParser()
parser.add_argument(
"--scale",
default=None,
type=str,
required=True,
help="Asymmetric VQGAN scale: `1.5` or `2`",
)
parser.add_argument(
"--original_checkpoint_path",
default=None,
type=str,
required=True,
help="Path to the original Asymmetric VQGAN checkpoint",
)
parser.add_argument(
"--output_path",
default=None,
type=str,
required=True,
help="Path to save pretrained AsymmetricAutoencoderKL model",
)
parser.add_argument(
"--map_location",
default="cpu",
type=str,
required=False,
help="The device passed to `map_location` when loading the checkpoint",
)
args = parser.parse_args()
assert args.scale in ["1.5", "2"], f"{args.scale} should be `1.5` of `2`"
assert Path(args.original_checkpoint_path).is_file()
asymmetric_autoencoder_kl = get_asymmetric_autoencoder_kl_from_original_checkpoint(
scale=args.scale,
original_checkpoint_path=args.original_checkpoint_path,
map_location=torch.device(args.map_location),
)
print("Saving pretrained AsymmetricAutoencoderKL")
asymmetric_autoencoder_kl.save_pretrained(args.output_path)
print(f"Done in {time.time() - start:.2f} seconds")