bug fixes & using finetuned self-attention weights

Files changed:
- agnostic_mask.png +0 -0
- dog.jpg +0 -0
- garment.jpg +0 -0
- model.py +2 -2
- model_converter.py +83 -1
- person.jpg +0 -0
- test.ipynb +25 -22
agnostic_mask.png
DELETED
Binary file (6.26 kB)
dog.jpg
DELETED
Binary file (71.1 kB)
garment.jpg
DELETED
Binary file (56.5 kB)
model.py
CHANGED

@@ -5,12 +5,12 @@ from diffusion import Diffusion
 
 import model_converter
 
-def preload_models_from_standard_weights(ckpt_path, device):
+def preload_models_from_standard_weights(ckpt_path, device, finetune_weight_path=None):
     # CatVTON parameters
     in_channels = 9
     out_channels = 4
 
-    state_dict=model_converter.load_from_standard_weights(ckpt_path, device)
+    state_dict=model_converter.load_from_standard_weights(ckpt_path, device, finetune_weight_path)
 
     encoder=VAE_Encoder().to(device)
     encoder.load_state_dict(state_dict['encoder'], strict=True)
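
For reference, a minimal call sketch for the new signature, assuming the checkpoint and fine-tuned weight filenames that appear later in test.ipynb; the paths and device string are placeholders rather than values fixed by this commit.

import model

# Hypothetical paths; substitute your own checkpoint and fine-tuned weights.
models = model.preload_models_from_standard_weights(
    ckpt_path="sd-v1-5-inpainting.ckpt",       # base SD 1.5 inpainting weights
    device="cuda",                             # or "cpu"
    finetune_weight_path="model.safetensors",  # optional fine-tuned self-attention weights
)
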
model_converter.py
CHANGED

@@ -1,6 +1,7 @@
 import torch
+import safetensors.torch
 
-def load_from_standard_weights(input_file: str, device: str) -> dict[str, torch.Tensor]:
+def load_from_standard_weights(input_file: str, device: str, finetuned_weights_path: str=None) -> dict[str, torch.Tensor]:
     # Taken from: https://github.com/kjsman/stable-diffusion-pytorch/issues/7#issuecomment-1426839447
     # original_model = torch.load(input_file, map_location=device, weights_only = False)["state_dict"]
     original_model=torch.load(input_file, weights_only = False)["state_dict"]

@@ -1054,4 +1055,85 @@ def load_from_standard_weights(input_file: str, device: str) -> dict[str, torch.
     converted['clip']['layers.11.attention.in_proj.weight'] = torch.cat((original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.weight'], original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.weight'], original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.weight']), 0)
     converted['clip']['layers.11.attention.in_proj.bias'] = torch.cat((original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.q_proj.bias'], original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.k_proj.bias'], original_model['cond_stage_model.transformer.text_model.encoder.layers.11.self_attn.v_proj.bias']), 0)
 
+    if finetuned_weights_path is not None:
+        converted=convert_safetensors_to_combined_weights(finetuned_weights_path, converted)
+
     return converted
+
+
+def convert_safetensors_to_combined_weights(safetensors_path, converted):
+    """
+    Convert safetensors with separate q, k, v weights to combined in_proj weights.
+    """
+    # Load the original safetensors
+    state_dict = safetensors.torch.load_file(safetensors_path)
+
+    # Create mapping from the safetensors indices to UNet attention paths
+    # Based on dimension analysis:
+    # 320-dim layers: 0, 8, 96, 104, 112 -> encoders.1,2 and decoders.9,10,11
+    # 640-dim layers: 16, 24, 72, 80, 88 -> encoders.4,5 and decoders.6,7,8
+    # 1280-dim layers: 32, 40, 48, 56, 64, 120 -> encoders.7,8, bottleneck, decoders.3,4,5
+    layer_mappings = {
+        # 320-dim layers (encoders)
+        0: "encoders.1.1.attention_1",     # [320,320] -> [960,320]
+        8: "encoders.2.1.attention_1",     # [320,320] -> [960,320]
+
+        # 640-dim layers (encoders)
+        16: "encoders.4.1.attention_1",    # [640,640] -> [1920,640]
+        24: "encoders.5.1.attention_1",    # [640,640] -> [1920,640]
+
+        # 1280-dim layers (encoders)
+        32: "encoders.7.1.attention_1",    # [1280,1280] -> [3840,1280]
+        40: "encoders.8.1.attention_1",    # [1280,1280] -> [3840,1280]
+
+        # 1280-dim layer (bottleneck)
+        48: "bottleneck.1.attention_1",    # [1280,1280] -> [3840,1280]
+
+        # 1280-dim layers (decoders)
+        56: "decoders.3.1.attention_1",    # [1280,1280] -> [3840,1280]
+        64: "decoders.4.1.attention_1",    # [1280,1280] -> [3840,1280]
+        120: "decoders.5.1.attention_1",   # [1280,1280] -> [3840,1280]
+
+        # 640-dim layers (decoders)
+        72: "decoders.6.1.attention_1",    # [640,640] -> [1920,640]
+        80: "decoders.7.1.attention_1",    # [640,640] -> [1920,640]
+        88: "decoders.8.1.attention_1",    # [640,640] -> [1920,640]
+
+        # 320-dim layers (decoders)
+        96: "decoders.9.1.attention_1",    # [320,320] -> [960,320]
+        104: "decoders.10.1.attention_1",  # [320,320] -> [960,320]
+        112: "decoders.11.1.attention_1"   # [320,320] -> [960,320]
+    }
+
+    for layer_idx, unet_path in layer_mappings.items():
+        # Get the q, k, v weights for this layer
+        q_key = f"{layer_idx}.to_q.weight"
+        k_key = f"{layer_idx}.to_k.weight"
+        v_key = f"{layer_idx}.to_v.weight"
+        out_weight_key = f"{layer_idx}.to_out.0.weight"
+        out_bias_key = f"{layer_idx}.to_out.0.bias"
+
+        if all(key in state_dict for key in [q_key, k_key, v_key]):
+            # Concatenate q, k, v weights along dimension 0 to create the in_proj weight
+            q_weight = state_dict[q_key]
+            k_weight = state_dict[k_key]
+            v_weight = state_dict[v_key]
+
+            # Combine into a single in_proj matrix
+            in_proj_weight = torch.cat([q_weight, k_weight, v_weight], dim=0)
+
+            # Store in the converted format
+            converted['diffusion'][f"unet.{unet_path}.in_proj.weight"] = in_proj_weight
+
+            # Also handle output projection weights
+            if out_weight_key in state_dict:
+                converted['diffusion'][f"unet.{unet_path}.out_proj.weight"] = state_dict[out_weight_key]
+
+            if out_bias_key in state_dict:
+                converted['diffusion'][f"unet.{unet_path}.out_proj.bias"] = state_dict[out_bias_key]
+
+            print(f"Converted layer {layer_idx}: {q_weight.shape} + {k_weight.shape} + {v_weight.shape} -> {in_proj_weight.shape}")
+
+    return converted
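
The layer_mappings table above is justified only by a "dimension analysis" comment. A small, assumed inspection helper like the sketch below (summarize_finetuned_attention is not part of the commit) shows how that grouping can be checked by listing each to_q.weight shape in the fine-tuned safetensors file.

import safetensors.torch

def summarize_finetuned_attention(safetensors_path="model.safetensors"):
    # Load the fine-tuned weights and group the numeric layer indices by the
    # width of their query projection, mirroring the comments in layer_mappings.
    state_dict = safetensors.torch.load_file(safetensors_path)
    by_dim = {}
    for key, tensor in state_dict.items():
        if key.endswith(".to_q.weight"):
            layer_idx = int(key.split(".")[0])
            by_dim.setdefault(tensor.shape[0], []).append(layer_idx)
    for dim, indices in sorted(by_dim.items()):
        print(f"{dim}-dim self-attention layers: {sorted(indices)}")

# Per the mapping above, the expected groups are 320 -> 0, 8, 96, 104, 112;
# 640 -> 16, 24, 72, 80, 88; and 1280 -> 32, 40, 48, 56, 64, 120.
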
person.jpg
DELETED
Binary file (30.9 kB)
test.ipynb
CHANGED

@@ -169,28 +169,19 @@
     },
     {
      "cell_type": "code",
-     "execution_count":
+     "execution_count": null,
      "id": "13c59a6c",
      "metadata": {},
      "outputs": [
       {
-       "name": "stdout",
-       "output_type": "stream",
-       "text": [
-        "Latents shape: torch.Size([1, 4, 64, 128])\n",
-        "Masked latent concat for classifier-free guidance: torch.Size([2, 4, 64, 128]), mask latent concat: torch.Size([2, 1, 64, 128])\n"
-       ]
-      },
-      {
-       "name": "stderr",
-       "output_type": "stream",
-       "text": [
-        "100%|██████████| 50/50 [01:20<00:00, 1.62s/it]\n"
+       "ename": "ModuleNotFoundError",
+       "evalue": "No module named 'ddpm'",
+       "output_type": "error",
+       "traceback": [
+        "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
+        "\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
+        "\u001b[0;32m/tmp/ipykernel_391/3664407558.py\u001b[0m in \u001b[0;36m<cell line: 0>\u001b[0;34m()\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mnumpy\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mnp\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 6\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mtqdm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtqdm\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 7\u001b[0;31m \u001b[0;32mfrom\u001b[0m \u001b[0mddpm\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mDDPMSampler\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 8\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0mPIL\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mImage\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 9\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mmodel\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
+        "\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'ddpm'"
       ]
      }
     ],

@@ -586,12 +577,12 @@
     "\n",
     "if __name__ == \"__main__\":\n",
     "    # Example usage\n",
-    "    image = Image.open(\"
-    "    condition_image = Image.open(\"
-    "    mask = Image.open(\"agnostic_mask.png\").convert(\"L\")\n",
+    "    image = Image.open(\"sample_dataset/image.png\").convert(\"RGB\")\n",
+    "    condition_image = Image.open(\"sample_dataset/cloth.png\").convert(\"RGB\")\n",
+    "    mask = Image.open(\"sample_dataset/agnostic_mask.png\").convert(\"L\")\n",
     "\n",
     "    # Load models\n",
-    "    models=model.preload_models_from_standard_weights(
+    "    models=model.preload_models_from_standard_weights(ckpt_path=\"sd-v1-5-inpainting.ckpt\", device=\"cuda\", finetune_weight_path=\"model.safetensors\")\n",
     "\n",
     "    # Generate image\n",
     "    generated_image = generate(\n",

@@ -733,6 +724,18 @@
    "display_name": "Python 3 (ipykernel)",
    "language": "python",
    "name": "python3"
+  },
+  "language_info": {
+   "codemirror_mode": {
+    "name": "ipython",
+    "version": 3
+   },
+   "file_extension": ".py",
+   "mimetype": "text/x-python",
+   "name": "python",
+   "nbconvert_exporter": "python",
+   "pygments_lexer": "ipython3",
+   "version": "3.11.11"
   }
  },
 "nbformat": 4,
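
The new notebook output records a ModuleNotFoundError for ddpm, which suggests the kernel was started outside the repository root. A minimal, assumed workaround (not part of this commit) is to put the directory containing ddpm.py on sys.path before the imports:

import sys, os

# Hypothetical location; point this at the folder that holds ddpm.py, model.py, etc.
repo_root = os.path.abspath(".")
if repo_root not in sys.path:
    sys.path.insert(0, repo_root)

from ddpm import DDPMSampler  # should now resolve if ddpm.py is present
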