Niksa Praljak committed
Commit 66d2e5f
1 Parent(s): d5de529

Update PenCL argparse and Finish Facilitator script

README.md CHANGED
@@ -62,7 +62,8 @@ cd BioM3_PenCL
 ```bash
 python run_PenCL_inference.py \
     --json_path "stage1_config.json" \
-    --model_path "./weights/PenCL/BioM3_PenCL_epoch20.bin"
+    --model_path "./weights/PenCL/BioM3_PenCL_epoch20.bin" \
+    --output_path "test_PenCL_embeddings.pt"
 ```
 
 ### Example Input Data
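For context, a minimal sketch of the resulting two-stage flow. The Stage 1 command is the README example above; the Stage 2 command is assembled from the example values in the new argparse help strings of `run_Facilitator_sample.py` below, and the Facilitator weights directory is an assumption:

```bash
# Stage 1: PenCL inference writes the embedding bundle
python run_PenCL_inference.py \
    --json_path "stage1_config.json" \
    --model_path "./weights/PenCL/BioM3_PenCL_epoch20.bin" \
    --output_path "test_PenCL_embeddings.pt"

# Stage 2: the Facilitator consumes that bundle
# (the "./weights/Facilitator/" location is an assumed path)
python run_Facilitator_sample.py \
    --input_data_path "test_PenCL_embeddings.pt" \
    --output_data_path "Facilitator_test_outputs.pt" \
    --model_path "./weights/Facilitator/BioM3_Facilitator_epoch20.bin" \
    --json_path "stage2_config.json"
```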
run_Facilitator_sample.py CHANGED
@@ -1,117 +1,103 @@
+import argparse
 import yaml
 from argparse import Namespace
 import json
 import pandas as pd
-
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-
-import pytorch_lightning as pl
-import Stage1_source.preprocess as prep
 import Stage1_source.model as mod
-import Stage1_source.PL_wrapper as PL_wrap
-
 
-# Step 1: Load JSON configuration
+# Step 1: Load JSON Configuration
 def load_json_config(json_path):
-    """
-    Load JSON configuration file.
-    """
     with open(json_path, "r") as f:
         config = json.load(f)
-    # print("Loaded JSON config:", config)
     return config
 
 # Step 2: Convert JSON dictionary to Namespace
 def convert_to_namespace(config_dict):
-    """
-    Recursively convert a dictionary to an argparse Namespace.
-    """
     for key, value in config_dict.items():
-        if isinstance(value, dict): # Recursively handle nested dictionaries
+        if isinstance(value, dict):
             config_dict[key] = convert_to_namespace(value)
     return Namespace(**config_dict)
 
-def prepare_model(args) -> nn.Module:
-    """
-    Prepare the model and PyTorch Lightning Trainer using a flat args object.
-    """
+# Step 3: Load Pre-trained Model
+def prepare_model(config_args, model_path) -> nn.Module:
     model = mod.Facilitator(
-        in_dim=args.emb_dim,
-        hid_dim=args.hid_dim,
-        out_dim=args.emb_dim,
-        dropout=args.dropout
+        in_dim=config_args.emb_dim,
+        hid_dim=config_args.hid_dim,
+        out_dim=config_args.emb_dim,
+        dropout=config_args.dropout
     )
-    weights_path = f"{save_dir}/BioM3_Facilitator_epoch20.bin"  # BioM3_PenCL_epoch20.bin"
-    model.load_state_dict(torch.load(weights_path, map_location="cpu"))
+    model.load_state_dict(torch.load(model_path, map_location="cpu"))
     model.eval()
     print("Model loaded successfully with weights!")
    return model
 
+# Step 4: Compute MMD Loss
 def compute_mmd_loss(x, y, kernel="rbf", sigma=1.0):
-    """
-    Compute the MMD loss between two sets of embeddings.
-    Args:
-        x: Tensor of shape [N, D]
-        y: Tensor of shape [N, D]
-        kernel: Kernel function, default is 'rbf' (Gaussian kernel)
-        sigma: Bandwidth for the Gaussian kernel
-    """
     def rbf_kernel(a, b, sigma):
-        """
-        Compute the RBF kernel between two tensors.
-        """
         pairwise_distances = torch.cdist(a, b, p=2) ** 2
         return torch.exp(-pairwise_distances / (2 * sigma ** 2))
 
-    # Compute RBF kernel matrices
-    K_xx = rbf_kernel(x, x, sigma)  # Kernel within x
-    K_yy = rbf_kernel(y, y, sigma)  # Kernel within y
-    K_xy = rbf_kernel(x, y, sigma)  # Kernel between x and y
+    K_xx = rbf_kernel(x, x, sigma)
+    K_yy = rbf_kernel(y, y, sigma)
+    K_xy = rbf_kernel(x, y, sigma)
 
-    # Compute MMD loss
     mmd_loss = K_xx.mean() - 2 * K_xy.mean() + K_yy.mean()
     return mmd_loss
 
+# Step 5: Argument Parser Function
+def parse_arguments():
+    parser = argparse.ArgumentParser(description="BioM3 Facilitator Model (Stage 2)")
+    parser.add_argument('--input_data_path', type=str, required=True,
+                        help="Path to the input embeddings (e.g., PenCL_test_outputs.pt)")
+    parser.add_argument('--output_data_path', type=str, required=True,
+                        help="Path to save the output embeddings (e.g., Facilitator_test_outputs.pt)")
+    parser.add_argument('--model_path', type=str, required=True,
+                        help="Path to the Facilitator model weights (e.g., BioM3_Facilitator_epoch20.bin)")
+    parser.add_argument('--json_path', type=str, required=True,
+                        help="Path to the JSON configuration file (stage2_config.json)")
+    return parser.parse_args()
+
+# Main Execution
 if __name__ == '__main__':
-
-    json_path = f"{save_dir}/stage2_config.json"
-    # Load and convert JSON config
-    json_path = f"{save_dir}/stage2_config.json"
-    config_dict = load_json_config(json_path)
-    args = convert_to_namespace(config_dict)
+    # Parse arguments
+    args = parse_arguments()
 
-    # load model
-    model = prepare_model(args=args)
+    # Load configuration
+    config_dict = load_json_config(args.json_path)
+    config_args = convert_to_namespace(config_dict)
 
-    # load test dataset
-    embedding_dataset = torch.load('./PenCL_test_outputs.pt')
+    # Load model
+    model = prepare_model(config_args=config_args, model_path=args.model_path)
 
-    # Run inference and store z_t, z_p
+    # Load input embeddings
+    embedding_dataset = torch.load(args.input_data_path)
 
+    # Run inference to get facilitated embeddings
     with torch.no_grad():
         z_t = embedding_dataset['z_t']
         z_p = embedding_dataset['z_p']
         z_c = model(z_t)
         embedding_dataset['z_c'] = z_c
 
-    # Compute MSE between embeddings
-    mse_zc_zp = F.mse_loss(z_c, z_p)  # MSE between facilitated embeddings and protein embeddings
-    mse_zt_zp = F.mse_loss(z_t, z_p)  # MSE between text embeddings and protein embeddings
+    # Compute evaluation metrics
+    # 1. MSE between embeddings
+    mse_zc_zp = F.mse_loss(z_c, z_p)
+    mse_zt_zp = F.mse_loss(z_t, z_p)
 
-    # Compute Norms (L2 magnitudes) for a given batch (e.g., first 5 embeddings)
+    # 2. Compute L2 norms for first batch
     batch_idx = 0
     norm_z_t = torch.norm(z_t[batch_idx], p=2).item()
     norm_z_p = torch.norm(z_p[batch_idx], p=2).item()
     norm_z_c = torch.norm(z_c[batch_idx], p=2).item()
 
-    # Compute MMD between embeddings
-    MMD_zc_zp = model.compute_mmd(z_c, z_p)
-    MMD_zp_zt = model.compute_mmd(z_p, z_t)
+    # 3. Compute MMD between embeddings
+    mmd_zc_zp = model.compute_mmd(z_c, z_p)
+    mmd_zp_zt = model.compute_mmd(z_p, z_t)
 
-    # Print Results
+    # Print results
     print("\n=== Facilitator Model Output ===")
     print(f"Shape of z_t (Text Embeddings): {z_t.shape}")
     print(f"Shape of z_p (Protein Embeddings): {z_p.shape}")
@@ -127,13 +113,9 @@ if __name__ == '__main__':
     print(f"MSE between Text Embeddings (z_t) and Protein Embeddings (z_p): {mse_zt_zp:.6f}")
 
     print("\n=== Max Mean Discrepancy (MMD) Results ===")
-    print(f"MMD between Facilitated Embeddings (z_c) and Protein Embeddings (z_p): {MMD_zc_zp:.6f}")
-    print(f"MMD between Text Embeddings (z_t) and Protein Embeddings (z_p): {MMD_zp_zt:.6f}")
-
-    print("\nFacilitator Model successfully computed facilitated embeddings!")
-
-    # save output embeddings
-
-    torch.save(embedding_dataset, 'Facilitator_test_outputs.pt')
-
+    print(f"MMD between Facilitated Embeddings (z_c) and Protein Embeddings (z_p): {mmd_zc_zp:.6f}")
+    print(f"MMD between Text Embeddings (z_t) and Protein Embeddings (z_p): {mmd_zp_zt:.6f}")
 
+    # Save output embeddings
+    torch.save(embedding_dataset, args.output_data_path)
+    print(f"\nFacilitator embeddings saved to {args.output_data_path}")
run_PenCL_inference.py CHANGED
@@ -56,6 +56,9 @@ def parse_arguments():
                         help="Path to the JSON configuration file (stage1_config.json)")
     parser.add_argument('--model_path', type=str, required=True,
                         help="Path to the pre-trained model weights (pytorch_model.bin)")
+    parser.add_argument('--output_path', type=str, required=True,
+                        help="Path to save output embeddings")
+
     return parser.parse_args()
 
 # Step 6: Compute Homology Probabilities
@@ -90,7 +93,9 @@ if __name__ == '__main__':
     # Run inference and store z_t, z_p
     z_t_list = []
     z_p_list = []
-
+    text_list = []
+    protein_list = []
+
     with torch.no_grad():
         for idx in range(len(test_dataset)):
             batch = test_dataset[idx]
@@ -100,10 +105,24 @@ if __name__ == '__main__':
             z_p = outputs['seq_joint_latent']  # Protein latent
             z_t_list.append(z_t)
             z_p_list.append(z_p)
+
+            protein_sequence = test_dataset.protein_sequence_list[idx]
+            text_prompt = test_dataset.text_captions_list[idx]
+            text_list.append(text_prompt)
+            protein_list.append(protein_sequence)
+
 
     # Stack all latent vectors
     z_t_tensor = torch.vstack(z_t_list)  # Shape: (num_samples, latent_dim)
     z_p_tensor = torch.vstack(z_p_list)  # Shape: (num_samples, latent_dim)
+
+    # Prepare embedding dict.
+    embedding_dict = {
+        'sequence': protein_list,
+        'text_prompts': text_list,
+        'z_t': z_t_tensor,
+        'z_p': z_p_tensor
+    }
 
     # Compute Dot Product scores
     dot_product_scores = torch.matmul(z_p_tensor, z_t_tensor.T)  # Dot product
@@ -138,4 +157,5 @@ if __name__ == '__main__':
 
     print("\n=== Homology Matrix (Dot Product of Normalized z_p) ===")
     print(homology_matrix)
-
+
+    torch.save(embedding_dict, config_args_parser.output_path)
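A minimal sketch for consuming the saved bundle, assuming the `test_PenCL_embeddings.pt` file name from the README example (the dictionary keys come from `embedding_dict` above):

```python
import torch

# Load the embedding bundle written by run_PenCL_inference.py
data = torch.load("test_PenCL_embeddings.pt")

print(data["z_t"].shape)        # text embeddings, (num_samples, latent_dim)
print(data["z_p"].shape)        # protein embeddings, (num_samples, latent_dim)
print(data["text_prompts"][0])  # first text caption
print(data["sequence"][0])      # first protein sequence
```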
stage2_config.json ADDED
@@ -0,0 +1,16 @@
+{
+    "model_checkpoint_path": "/project/andrewferguson/niksapraljak/Project_ProtARDM/logs/Stage2_facilitator/Stage2_facilitator/checkpoints/Stage2_MMD_Pfam_Swiss_epoch20_ckpt/last.ckpt",
+    "model_type": "pfam",
+    "fast_dev_run": 0,
+    "loss_type": "MMD",
+    "dataset_type": "default",
+    "precision": "32",
+    "stage1_dataset_path": "None",
+    "stage2_output_path": "None",
+    "seed": 42,
+    "num_workers": 12,
+    "dropout": 0.0,
+    "batch_size": 64,
+    "emb_dim": 512,
+    "hid_dim": 1024
+}
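Of these fields, `run_Facilitator_sample.py` above appears to read only `emb_dim`, `hid_dim`, and `dropout` (inside `prepare_model`); the remaining training-oriented entries such as `model_checkpoint_path` are carried in the Namespace but unused at inference time. A quick sketch, reusing the script's own helpers:

```python
# Reuses load_json_config / convert_to_namespace from run_Facilitator_sample.py
config_args = convert_to_namespace(load_json_config("stage2_config.json"))
print(config_args.emb_dim, config_args.hid_dim, config_args.dropout)  # -> 512 1024 0.0
```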