ynuozhang committed
Commit · 1c0b97e
1 Parent(s): baf3373

Delete unused embeddings, metrics, and training scripts
Browse files

- .gitattributes +1 -0
- embeddings/binding/data-00000-of-00001.arrow +0 -3
- embeddings/binding/dataset_info.json +0 -3
- embeddings/binding/state.json +0 -3
- embeddings/fast_embedding_generation.py +0 -3
- embeddings/hemolysis/data-00000-of-00001.arrow +0 -3
- embeddings/hemolysis/dataset_info.json +0 -3
- embeddings/hemolysis/shuffled_hemo.csv +0 -3
- embeddings/hemolysis/state.json +0 -3
- embeddings/nonfouling/combined_nonfouling.csv +0 -3
- embeddings/nonfouling/data-00000-of-00001.arrow +0 -3
- embeddings/nonfouling/dataset_info.json +0 -3
- embeddings/nonfouling/state.json +0 -3
- embeddings/permeability/data-00000-of-00001.arrow +0 -3
- embeddings/permeability/dataset_info.json +0 -3
- embeddings/permeability/nc-CPP-processed.csv +0 -3
- embeddings/permeability/state.json +0 -3
- embeddings/solubility/data-00000-of-00001.arrow +0 -3
- embeddings/solubility/dataset_info.json +0 -3
- embeddings/solubility/shuffled_sol.csv +0 -3
- embeddings/solubility/state.json +0 -3
- metrics/binding/best_model_val_correlation.png +0 -3
- metrics/binding/binding_train_correlation.png +0 -3
- metrics/hemolysis/optimization_metrics.txt +0 -3
- metrics/hemolysis/train_classification_plot.png +0 -3
- metrics/hemolysis/train_predictions_binary.csv +0 -3
- metrics/hemolysis/val_classification_plot.png +0 -3
- metrics/hemolysis/val_predictions_binary.csv +0 -3
- metrics/nonfouling/optimization_metrics.txt +0 -22
- metrics/nonfouling/train_classification_plot.png +0 -0
- metrics/nonfouling/train_predictions_binary.csv +0 -3
- metrics/nonfouling/val_classification_plot.png +0 -0
- metrics/nonfouling/val_predictions_binary.csv +0 -3
- metrics/permeability/optimization_metrics.txt +0 -3
- metrics/permeability/train_correlation.png +0 -3
- metrics/permeability/train_predictions.csv +0 -3
- metrics/permeability/val_correlation.png +0 -3
- metrics/permeability/val_predictions.csv +0 -3
- metrics/solubility/optimization_metrics.txt +0 -3
- metrics/solubility/train_classification_plot.png +0 -3
- metrics/solubility/train_predictions_binary.csv +0 -3
- metrics/solubility/val_classification_plot.png +0 -3
- metrics/solubility/val_predictions_binary.csv +0 -3
- train/binary_xg.py +0 -223
- train/binding_affinity_model_clean.ipynb +0 -0
- train/binding_utils.py +0 -291
- train/permeability_xg.py +0 -186
.gitattributes CHANGED

```diff
@@ -88,3 +88,4 @@ training_data filter=lfs diff=lfs merge=lfs -text
 README.md filter=lfs diff=lfs merge=lfs -text
 embeddings filter=lfs diff=lfs merge=lfs -text
 models/binding_affinity_for_smiles.pt filter=lfs diff=lfs merge=lfs -text
+*.csv filter=lfs diff=lfs merge=lfs -text
```
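The new `*.csv` rule (the kind of line `git lfs track "*.csv"` appends) means CSVs are stored in the repository as three-line Git LFS pointers (`version`/`oid`/`size`) rather than raw bytes, which is why the deletions below each remove exactly three pointer lines instead of the data itself. As a minimal illustration (not part of this repo; `parse_lfs_pointer` is a hypothetical helper), a pointer such as the one deleted just below can be parsed like this:

```python
# Minimal sketch: parse a Git LFS pointer file of the kind deleted below.
# A pointer has exactly three fields: version, oid (sha256:<hex>), and size.
def parse_lfs_pointer(text: str) -> dict:
    fields = {}
    for line in text.strip().splitlines():
        key, _, value = line.partition(" ")  # each line is "<key> <value>"
        fields[key] = value
    assert fields["version"] == "https://git-lfs.github.com/spec/v1"
    algo, _, digest = fields["oid"].partition(":")
    return {"algo": algo, "digest": digest, "size": int(fields["size"])}

# The pointer content of embeddings/binding/data-00000-of-00001.arrow, from this commit:
example = """version https://git-lfs.github.com/spec/v1
oid sha256:d9b08ce28b452e9767dfc7c60bd6285421bdc6b791150a5f55158da89c7bda4f
size 15746448"""
print(parse_lfs_pointer(example))  # {'algo': 'sha256', 'digest': 'd9b0...', 'size': 15746448}
```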
embeddings/binding/data-00000-of-00001.arrow DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d9b08ce28b452e9767dfc7c60bd6285421bdc6b791150a5f55158da89c7bda4f
-size 15746448
```

embeddings/binding/dataset_info.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bbe033d1c9b2ec182afa2b1682a4c41c4fdc0cd548c08d7a21c4364dc68b3595
-size 784
```

embeddings/binding/state.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c2b6d15e6c6cf1f18c4f4cdb0ea339adaf7083a1c754266b8ad6a6468484f693
-size 247
```

embeddings/fast_embedding_generation.py DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:0396104ebf1dc28b0d297bdfebede1927aba9a23417c9f06cd8f39d999d099d3
-size 3900
```

embeddings/hemolysis/data-00000-of-00001.arrow DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bef85bc99bc3c81c99fe290c0b2ef6b0d43f50c0089c59be7bf24219dd428d05
-size 20965576
```

embeddings/hemolysis/dataset_info.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a0708b61d5f0aa0a11b62205cbd08b504ad6957c271ff3b984c3e3b9457ce9bf
-size 370
```

embeddings/hemolysis/shuffled_hemo.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:994d4faa1f29c59cb7fa9d18faa8605766b4033ae2bc432ea76e0bbdf4876b29
-size 2236370
```

embeddings/hemolysis/state.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:17a8a2f0f7a7c272396e10572248a45faeaae1c6390ef311b551ab97f5eb72b8
-size 247
```

embeddings/nonfouling/combined_nonfouling.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:90f770673946d37f5c8362b3956c789f62ec71a514ab8c46bb2523b3b3d5be2e
-size 28623153
```

embeddings/nonfouling/data-00000-of-00001.arrow DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:44c869d7a6a03ccdf9143c4007bc85c996146e12e40da3efbd455eaf49eba016
-size 81645736
```

embeddings/nonfouling/dataset_info.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2a2517b8ba7deb4b26855d073360ab0c6a38c2780bc02211c03d2f51f9ccbb00
-size 368
```

embeddings/nonfouling/state.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:c5bf4d95c1fd57f172217a453408d1339a87f7eca5d02f4cd718bfdde6b519fb
-size 247
```

embeddings/permeability/data-00000-of-00001.arrow DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:82e749eafb2e903ef2dc47255dbe4e489e6db8055b3ba6af4c876d9b1a0f1b38
-size 22250496
```

embeddings/permeability/dataset_info.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a0708b61d5f0aa0a11b62205cbd08b504ad6957c271ff3b984c3e3b9457ce9bf
-size 370
```

embeddings/permeability/nc-CPP-processed.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:f0bbf79d5a78023460de72087bbadd8d1f5b841b21b05b2d148d9ccd7fc3a254
-size 1083167
```

embeddings/permeability/state.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:e7a5a8e0db7a08e1c26e0115a0c3225d1850b620d00ab24b42769d78ee6208fd
-size 247
```

embeddings/solubility/data-00000-of-00001.arrow DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:36ac428037f8d09d1f45fcd6a61517428c4409638d63230b3ff1d375bdd0e5cb
-size 106655176
```

embeddings/solubility/dataset_info.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:a0708b61d5f0aa0a11b62205cbd08b504ad6957c271ff3b984c3e3b9457ce9bf
-size 370
```

embeddings/solubility/shuffled_sol.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:dc5a2139e52a6deb8a1eda45eb5f8b8abfd260724dbbd274a824a9aecb929890
-size 49775729
```

embeddings/solubility/state.json DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:403b965b3b855f52632a00e828046dd90749edc7808ef4725d54606673f1d5cb
-size 247
```
metrics/binding/best_model_val_correlation.png DELETED (Git LFS binary)

metrics/binding/binding_train_correlation.png DELETED (Git LFS binary)

metrics/hemolysis/optimization_metrics.txt DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:2dc387c4d486b3c94a4237ceb04d8f04e18d9198e0b3c069e04c940b459f5422
-size 612
```

metrics/hemolysis/train_classification_plot.png DELETED (Git LFS binary)

metrics/hemolysis/train_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:5604b65e9f922252ccc92f1bbe9be33270245ada96fef2f3e92dc50c30a5487a
-size 87035
```

metrics/hemolysis/val_classification_plot.png DELETED (Git LFS binary)

metrics/hemolysis/val_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:00aca5ad646502e56930f2473c596b1cf754dd96ec13c7d2b630eb2e8cec3d95
-size 21607
```

metrics/nonfouling/optimization_metrics.txt DELETED

```diff
@@ -1,22 +0,0 @@
-
-============================================================
-OPTIMIZATION COMPLETE
-============================================================
-Number of finished trials: 200
-
-Best Trial: #52
-Best F1 Score: 0.8774
-Best AUC Score: 0.9327
-Optuna Best Trial Value: 0.8774
-
-Best hyperparameters:
-  lambda: 3.1278404540677405e-06
-  alpha: 2.865349682111457
-  colsample_bytree: 0.6388434847100901
-  subsample: 0.975052331668336
-  learning_rate: 0.1046988967097677
-  max_depth: 5
-  min_child_weight: 283
-  gamma: 0.7863860752901305
-  num_boost_round: 876
-============================================================
```
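For reference, these logged hyperparameters map directly onto the `xgb.train` call in the training scripts deleted further below. A minimal sketch of rebuilding that booster from the recorded values; the arrays `X` and `y` are hypothetical stand-ins for the nonfouling embeddings and labels, which this commit also removes:

```python
import numpy as np
import xgboost as xgb

# Hypothetical stand-ins for the nonfouling embeddings and binary labels.
X = np.random.rand(100, 16).astype(np.float32)
y = np.random.randint(0, 2, size=100)

# Best-trial hyperparameters recorded in the deleted metrics file above;
# num_boost_round (876) is passed separately, as in the training scripts.
params = {
    "objective": "binary:logistic",
    "lambda": 3.1278404540677405e-06,
    "alpha": 2.865349682111457,
    "colsample_bytree": 0.6388434847100901,
    "subsample": 0.975052331668336,
    "learning_rate": 0.1046988967097677,
    "max_depth": 5,
    "min_child_weight": 283,
    "gamma": 0.7863860752901305,
    "tree_method": "hist",
}
booster = xgb.train(params, xgb.DMatrix(X, label=y), num_boost_round=876)
```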
metrics/nonfouling/train_classification_plot.png DELETED (binary file, 32.9 kB)

metrics/nonfouling/train_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:07931d9e8b63b4284bc7ce77cdcb719ea3b81d16125e14c277fce06c0e3d6b1d
-size 219852
```

metrics/nonfouling/val_classification_plot.png DELETED (binary file, 39.2 kB)

metrics/nonfouling/val_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:ef9845cf64c142ff16fc915402953a1383e36ecb1c76b6174fae75c0dec59cd4
-size 54904
```

metrics/permeability/optimization_metrics.txt DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:aa91d37a6e116d4ff44eb2606e439f3243ddbeed613abc567128dd8112901e0c
-size 676
```

metrics/permeability/train_correlation.png DELETED (Git LFS binary)

metrics/permeability/train_predictions.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:bd3817ce0aa54ce50cb3d56c83ae2a3157cd4d4deca7af743f9e24d2b0bfb675
-size 89716
```

metrics/permeability/val_correlation.png DELETED (Git LFS binary)

metrics/permeability/val_predictions.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:26e268ec0f9b5290ecc7b76c9b41e5819e4bc45bb1bb29843f2ae9b781ecab46
-size 22487
```

metrics/solubility/optimization_metrics.txt DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:d935afa867779b07a7c99e51c12cdcd49828dd61e4be1ef357926e280fadf648
-size 612
```

metrics/solubility/train_classification_plot.png DELETED (Git LFS binary)

metrics/solubility/train_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:852f71fe2fe196274e25866bfbea03874c7e75fec923556c96536ada85a71c7c
-size 244268
```

metrics/solubility/val_classification_plot.png DELETED (Git LFS binary)

metrics/solubility/val_predictions_binary.csv DELETED

```diff
@@ -1,3 +0,0 @@
-version https://git-lfs.github.com/spec/v1
-oid sha256:8489b5b047b630571f3959d96e5fbb003f923854e7db777dd266331f586d115d
-size 61087
```
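The deleted prediction CSVs share the column layout written by the training scripts below ('True Label', 'Predicted Probability', 'Predicted Label'). A minimal sketch of recomputing the headline metrics from such a file; the local path is hypothetical, since the real files are removed by this commit:

```python
import pandas as pd
from sklearn.metrics import f1_score, roc_auc_score

# Hypothetical local copy; the repo's copies were deleted in this commit.
df = pd.read_csv("val_predictions_binary.csv")

# Columns as written by save_and_plot_binary_predictions in train/binary_xg.py below.
f1 = f1_score(df["True Label"], df["Predicted Label"], average="weighted")
auc = roc_auc_score(df["True Label"], df["Predicted Probability"])
print(f"F1={f1:.4f}, AUC={auc:.4f}")
```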
train/binary_xg.py DELETED (223 lines)

```python
import pandas as pd
import numpy as np
import torch
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, f1_score
import optuna
from optuna.trial import TrialState
import xgboost as xgb
import os
from datasets import load_from_disk
from lightning.pytorch import seed_everything
from rdkit import Chem, rdBase, DataStructs
from typing import List
from rdkit.Chem import AllChem
import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score, roc_auc_score

base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"

def save_and_plot_binary_predictions(y_true_train, y_pred_train, y_true_val, y_pred_val, threshold, output_path):
    """
    Saves the true and predicted values for training and validation sets, and generates binary classification plots.

    Parameters:
    y_true_train (array): True labels for the training set.
    y_pred_train (array): Predicted probabilities for the training set.
    y_true_val (array): True labels for the validation set.
    y_pred_val (array): Predicted probabilities for the validation set.
    threshold (float): Classification threshold for predictions.
    output_path (str): Directory to save the CSV files and plots.
    """
    os.makedirs(output_path, exist_ok=True)

    # Convert probabilities to binary predictions
    y_pred_train_binary = (y_pred_train >= threshold).astype(int)
    y_pred_val_binary = (y_pred_val >= threshold).astype(int)

    # Save training predictions
    train_df = pd.DataFrame({
        'True Label': y_true_train,
        'Predicted Probability': y_pred_train,
        'Predicted Label': y_pred_train_binary
    })
    train_df.to_csv(os.path.join(output_path, 'train_predictions_binary.csv'), index=False)

    # Save validation predictions
    val_df = pd.DataFrame({
        'True Label': y_true_val,
        'Predicted Probability': y_pred_val,
        'Predicted Label': y_pred_val_binary
    })
    val_df.to_csv(os.path.join(output_path, 'val_predictions_binary.csv'), index=False)

    # Plot training predictions
    plot_binary_correlation(
        y_true_train,
        y_pred_train,
        threshold,
        title="Training Set Binary Classification Plot",
        output_file=os.path.join(output_path, 'train_classification_plot.png')
    )

    # Plot validation predictions
    plot_binary_correlation(
        y_true_val,
        y_pred_val,
        threshold,
        title="Validation Set Binary Classification Plot",
        output_file=os.path.join(output_path, 'val_classification_plot.png')
    )

def plot_binary_correlation(y_true, y_pred, threshold, title, output_file):
    """
    Generates a scatter plot for binary classification and saves it to a file.

    Parameters:
    y_true (array): True labels.
    y_pred (array): Predicted probabilities.
    threshold (float): Classification threshold for predictions.
    title (str): Title of the plot.
    output_file (str): Path to save the plot.
    """
    # Scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(y_true, y_pred, alpha=0.5, label='Data points', color='#BC80FF')

    # Add threshold line
    plt.axhline(y=threshold, color='red', linestyle='--', label=f'Threshold = {threshold}')

    # Add annotations
    plt.title(title)
    plt.xlabel("True Labels")
    plt.ylabel("Predicted Probability")
    plt.legend()

    # Save and show the plot
    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

seed_everything(42)

dataset = load_from_disk(f'{base_path}/data/solubility')

sequences = np.stack(dataset['sequence'])  # Ensure sequences are SMILES strings
labels = np.stack(dataset['labels'])
embeddings = np.stack(dataset['embedding'])

# Initialize best F1 score and model path
best_f1 = -np.inf
best_model_path = f"{base_path}/src/solubility"

# Trial callback
def trial_info_callback(study, trial):
    if study.best_trial == trial:
        print(f"Trial {trial.number}:")
        print(f"  Weighted F1 Score: {trial.value}")


def objective(trial):
    # Define hyperparameters
    params = {
        'objective': 'binary:logistic',
        'lambda': trial.suggest_float('lambda', 1e-8, 50.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 50.0, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.3, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.3),
        'max_depth': trial.suggest_int('max_depth', 2, 15),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 500),
        'gamma': trial.suggest_float('gamma', 0, 10.0),
        'tree_method': 'hist',
        'device': 'cuda:6',
    }

    # Suggest number of boosting rounds
    num_boost_round = trial.suggest_int('num_boost_round', 10, 1000)
    threshold = 0.5  # Initial classification threshold

    # Split the data
    train_idx, val_idx = train_test_split(
        np.arange(len(sequences)), test_size=0.2, stratify=labels, random_state=42
    )
    train_subset = dataset.select(train_idx).with_format("torch")
    val_subset = dataset.select(val_idx).with_format("torch")

    # Extract embeddings and labels for train/validation
    train_embeddings = np.array(train_subset['embedding'])
    valid_embeddings = np.array(val_subset['embedding'])
    train_labels = np.array(train_subset['labels'])
    valid_labels = np.array(val_subset['labels'])

    # Prepare training and validation sets
    dtrain = xgb.DMatrix(train_embeddings, label=train_labels)
    dvalid = xgb.DMatrix(valid_embeddings, label=valid_labels)

    # Train the model
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Predict probabilities
    preds_train = model.predict(dtrain)
    preds_val = model.predict(dvalid)

    # Calculate metrics
    f1_val = f1_score(valid_labels, (preds_val >= threshold).astype(int), average="weighted")
    auc_val = roc_auc_score(valid_labels, preds_val)
    print(f"Trial {trial.number}: AUC: {auc_val:.3f}, F1 Score: {f1_val:.3f}")

    # Save the model if it has the best F1 score
    current_best = trial.study.user_attrs.get("best_f1", -np.inf)
    if f1_val > current_best:
        trial.study.set_user_attr("best_f1", f1_val)
        trial.study.set_user_attr("best_auc", auc_val)
        trial.study.set_user_attr("best_trial", trial.number)
        os.makedirs(best_model_path, exist_ok=True)

        # Save the model
        model.save_model(os.path.join(best_model_path, "best_model_f1.json"))
        print(f"✓ NEW BEST! Trial {trial.number}: F1={f1_val:.4f}, AUC={auc_val:.4f} - Model saved!")

        # Save and plot binary predictions
        save_and_plot_binary_predictions(
            train_labels, preds_train, valid_labels, preds_val, threshold, best_model_path
        )

    return f1_val

if __name__ == "__main__":
    study = optuna.create_study(direction="maximize", pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=200)

    # Prepare summary text
    summary = []
    summary.append("\n" + "="*60)
    summary.append("OPTIMIZATION COMPLETE")
    summary.append("="*60)
    summary.append(f"Number of finished trials: {len(study.trials)}")
    summary.append(f"\nBest Trial: #{study.user_attrs.get('best_trial', 'N/A')}")
    summary.append(f"Best F1 Score: {study.user_attrs.get('best_f1', None):.4f}")
    summary.append(f"Best AUC Score: {study.user_attrs.get('best_auc', None):.4f}")
    summary.append(f"Optuna Best Trial Value: {study.best_trial.value:.4f}")
    summary.append(f"\nBest hyperparameters:")
    for key, value in study.best_trial.params.items():
        summary.append(f"  {key}: {value}")
    summary.append("="*60)

    # Print to console
    for line in summary:
        print(line)

    # Save to file
    metrics_file = os.path.join(best_model_path, "optimization_metrics.txt")
    with open(metrics_file, 'w') as f:
        f.write('\n'.join(summary))
    print(f"\n✓ Metrics saved to: {metrics_file}")
```
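Note that binary_xg.py imports precision_recall_curve but never calls it, fixing the decision threshold at 0.5 instead. A minimal sketch of what a tuned threshold could look like; `y_val` and `p_val` are hypothetical stand-ins for the script's `valid_labels` and `preds_val`:

```python
import numpy as np
from sklearn.metrics import precision_recall_curve

# Hypothetical validation labels and predicted probabilities.
y_val = np.random.randint(0, 2, size=200)
p_val = np.random.rand(200)

precision, recall, thresholds = precision_recall_curve(y_val, p_val)
# F1 at each candidate threshold; precision/recall carry one extra trailing entry.
f1 = 2 * precision[:-1] * recall[:-1] / np.clip(precision[:-1] + recall[:-1], 1e-12, None)
best = thresholds[np.argmax(f1)]
print(f"F1-optimal threshold: {best:.3f} (vs. the fixed 0.5 used in the script)")
```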
train/binding_affinity_model_clean.ipynb DELETED

The diff for this file is too large to render; see the raw diff.
train/binding_utils.py DELETED (291 lines)

```python
from torch import nn
import torch
import torch.nn.functional as F  # needed for the F.softmax calls below; missing in the original file
import numpy as np

def to_var(x):
    if torch.cuda.is_available():
        x = x.cuda()
    return x

class MultiHeadAttentionSequence(nn.Module):
    """Multi-head self-attention with a residual connection and layer norm."""

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v

        self.W_Q = nn.Linear(d_model, n_head*d_k)
        self.W_K = nn.Linear(d_model, n_head*d_k)
        self.W_V = nn.Linear(d_model, n_head*d_v)
        self.W_O = nn.Linear(n_head*d_v, d_model)

        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)

    def forward(self, q, k, v):
        batch, len_q, _ = q.size()
        batch, len_k, _ = k.size()
        batch, len_v, _ = v.size()

        Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
        K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
        V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])

        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2).transpose(2, 3)
        V = V.transpose(1, 2)

        attention = torch.matmul(Q, K)
        attention = attention / np.sqrt(self.d_k)  # scaled dot-product attention
        attention = F.softmax(attention, dim=-1)

        output = torch.matmul(attention, V)
        output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
        output = self.W_O(output)
        output = self.dropout(output)
        output = self.layer_norm(output + q)

        return output, attention

class MultiHeadAttentionReciprocal(nn.Module):
    """Reciprocal (bidirectional cross-) attention: one set of attention scores
    is reused, transposed, for the reverse direction, with separate value and
    output projections per direction."""

    def __init__(self, n_head, d_model, d_k, d_v, dropout=0.1):
        super().__init__()

        self.n_head = n_head
        self.d_model = d_model
        self.d_k = d_k
        self.d_v = d_v

        self.W_Q = nn.Linear(d_model, n_head*d_k)
        self.W_K = nn.Linear(d_model, n_head*d_k)
        self.W_V = nn.Linear(d_model, n_head*d_v)
        self.W_O = nn.Linear(n_head*d_v, d_model)
        self.W_V_2 = nn.Linear(d_model, n_head*d_v)
        self.W_O_2 = nn.Linear(n_head*d_v, d_model)

        self.layer_norm = nn.LayerNorm(d_model)
        self.dropout = nn.Dropout(dropout)
        self.layer_norm_2 = nn.LayerNorm(d_model)  # defined but unused in forward below
        self.dropout_2 = nn.Dropout(dropout)       # defined but unused in forward below

    def forward(self, q, k, v, v_2):
        batch, len_q, _ = q.size()
        batch, len_k, _ = k.size()
        batch, len_v, _ = v.size()
        batch, len_v_2, _ = v_2.size()

        Q = self.W_Q(q).view([batch, len_q, self.n_head, self.d_k])
        K = self.W_K(k).view([batch, len_k, self.n_head, self.d_k])
        V = self.W_V(v).view([batch, len_v, self.n_head, self.d_v])
        V_2 = self.W_V_2(v_2).view([batch, len_v_2, self.n_head, self.d_v])

        Q = Q.transpose(1, 2)
        K = K.transpose(1, 2).transpose(2, 3)
        V = V.transpose(1, 2)
        V_2 = V_2.transpose(1, 2)

        attention = torch.matmul(Q, K)
        attention = attention / np.sqrt(self.d_k)
        attention_2 = attention.transpose(-2, -1)  # transposed scores for the reverse direction

        attention = F.softmax(attention, dim=-1)
        attention_2 = F.softmax(attention_2, dim=-1)

        output = torch.matmul(attention, V)
        output_2 = torch.matmul(attention_2, V_2)

        output = output.transpose(1, 2).reshape([batch, len_q, self.d_v*self.n_head])
        output_2 = output_2.transpose(1, 2).reshape([batch, len_k, self.d_v*self.n_head])

        output = self.W_O(output)
        output_2 = self.W_O_2(output_2)

        output = self.dropout(output)
        output = self.layer_norm(output + q)

        output_2 = self.dropout(output_2)
        output_2 = self.layer_norm(output_2 + k)

        return output, output_2, attention, attention_2


class FFN(nn.Module):
    """Position-wise feed-forward network built from 1x1 convolutions,
    with a residual connection and layer norm."""

    def __init__(self, d_in, d_hid, dropout=0.1):
        super().__init__()

        self.layer_1 = nn.Conv1d(d_in, d_hid, 1)
        self.layer_2 = nn.Conv1d(d_hid, d_in, 1)
        self.relu = nn.ReLU()
        self.layer_norm = nn.LayerNorm(d_in)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        residual = x
        output = self.layer_1(x.transpose(1, 2))
        output = self.relu(output)
        output = self.layer_2(output)
        output = self.dropout(output)
        output = self.layer_norm(output.transpose(1, 2) + residual)
        return output

class ConvLayer(nn.Module):
    """A single Conv1d followed by ReLU."""

    def __init__(self, in_channels, out_channels, kernel_size, padding, dilation):
        super(ConvLayer, self).__init__()
        self.conv = nn.Conv1d(in_channels, out_channels, kernel_size, padding=padding, dilation=dilation)
        self.relu = nn.ReLU()

    def forward(self, x):
        out = self.conv(x)
        out = self.relu(out)
        return out


class DilatedCNN(nn.Module):
    """Three parallel stacks of dilated convolutions (kernel sizes 3, 5, 7),
    summed to produce the protein sequence encoding."""

    def __init__(self, d_model, d_hidden):
        super(DilatedCNN, self).__init__()
        self.first_ = nn.ModuleList()
        self.second_ = nn.ModuleList()
        self.third_ = nn.ModuleList()

        dilation_tuple = (1, 2, 3)
        dim_in_tuple = (d_model, d_hidden, d_hidden)
        dim_out_tuple = (d_hidden, d_hidden, d_hidden)

        for i, dilation_rate in enumerate(dilation_tuple):
            self.first_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=3, padding=dilation_rate,
                                         dilation=dilation_rate))

        for i, dilation_rate in enumerate(dilation_tuple):
            self.second_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=5, padding=2*dilation_rate,
                                          dilation=dilation_rate))

        for i, dilation_rate in enumerate(dilation_tuple):
            self.third_.append(ConvLayer(dim_in_tuple[i], dim_out_tuple[i], kernel_size=7, padding=3*dilation_rate,
                                         dilation=dilation_rate))

    def forward(self, protein_seq_enc):
        protein_seq_enc = protein_seq_enc.transpose(1, 2)  # B*L*d_model -> B*d_model*L

        first_embedding = protein_seq_enc
        second_embedding = protein_seq_enc
        third_embedding = protein_seq_enc

        for i in range(len(self.first_)):
            first_embedding = self.first_[i](first_embedding)

        for i in range(len(self.second_)):
            second_embedding = self.second_[i](second_embedding)

        for i in range(len(self.third_)):
            third_embedding = self.third_[i](third_embedding)

        protein_seq_enc = first_embedding + second_embedding + third_embedding

        return protein_seq_enc.transpose(1, 2)


class ReciprocalLayerwithCNN(nn.Module):
    """Reciprocal attention block whose protein branch is first encoded by a dilated CNN."""

    def __init__(self, d_model, d_inner, d_hidden, n_head, d_k, d_v):
        super().__init__()

        self.cnn = DilatedCNN(d_model, d_hidden)
        self.sequence_attention_layer = MultiHeadAttentionSequence(n_head, d_hidden, d_k, d_v)
        self.protein_attention_layer = MultiHeadAttentionSequence(n_head, d_hidden, d_k, d_v)
        self.reciprocal_attention_layer = MultiHeadAttentionReciprocal(n_head, d_hidden, d_k, d_v)
        self.ffn_seq = FFN(d_hidden, d_inner)
        self.ffn_protein = FFN(d_hidden, d_inner)

    def forward(self, sequence_enc, protein_seq_enc):
        protein_seq_enc = self.cnn(protein_seq_enc)
        prot_enc, prot_attention = self.protein_attention_layer(protein_seq_enc, protein_seq_enc, protein_seq_enc)
        seq_enc, sequence_attention = self.sequence_attention_layer(sequence_enc, sequence_enc, sequence_enc)
        prot_enc, seq_enc, prot_seq_attention, seq_prot_attention = self.reciprocal_attention_layer(prot_enc, seq_enc, seq_enc, prot_enc)
        prot_enc = self.ffn_protein(prot_enc)
        seq_enc = self.ffn_seq(seq_enc)
        return prot_enc, seq_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention


class ReciprocalLayer(nn.Module):
    """Reciprocal attention block operating directly on d_model-sized encodings."""

    def __init__(self, d_model, d_inner, n_head, d_k, d_v):
        super().__init__()

        self.sequence_attention_layer = MultiHeadAttentionSequence(n_head, d_model, d_k, d_v)
        self.protein_attention_layer = MultiHeadAttentionSequence(n_head, d_model, d_k, d_v)
        self.reciprocal_attention_layer = MultiHeadAttentionReciprocal(n_head, d_model, d_k, d_v)
        self.ffn_seq = FFN(d_model, d_inner)
        self.ffn_protein = FFN(d_model, d_inner)

    def forward(self, sequence_enc, protein_seq_enc):
        prot_enc, prot_attention = self.protein_attention_layer(protein_seq_enc, protein_seq_enc, protein_seq_enc)
        seq_enc, sequence_attention = self.sequence_attention_layer(sequence_enc, sequence_enc, sequence_enc)
        prot_enc, seq_enc, prot_seq_attention, seq_prot_attention = self.reciprocal_attention_layer(prot_enc, seq_enc, seq_enc, prot_enc)
        prot_enc = self.ffn_protein(prot_enc)
        seq_enc = self.ffn_seq(seq_enc)
        return prot_enc, seq_enc, prot_attention, sequence_attention, prot_seq_attention, seq_prot_attention
```
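As a quick shape check (not from the repo, with illustrative dimensions), the reciprocal layer takes a peptide encoding and a protein encoding and returns updated encodings plus four attention maps. This assumes the classes above are in scope, e.g. via `from binding_utils import ReciprocalLayer` before the deletion:

```python
import torch
from binding_utils import ReciprocalLayer  # assumes the deleted module is on the path

d_model, d_inner, n_head, d_k, d_v = 64, 128, 4, 16, 16  # illustrative sizes
layer = ReciprocalLayer(d_model, d_inner, n_head, d_k, d_v)

seq_enc = torch.randn(2, 10, d_model)   # e.g. peptide tokens: B x L_seq x d_model
prot_enc = torch.randn(2, 50, d_model)  # e.g. protein residues: B x L_prot x d_model

prot_out, seq_out, *attn = layer(seq_enc, prot_enc)
print(prot_out.shape, seq_out.shape)  # torch.Size([2, 50, 64]) torch.Size([2, 10, 64])
print(attn[2].shape)                  # prot->seq attention: B x n_head x L_prot x L_seq
```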
train/permeability_xg.py DELETED (186 lines)

```python
import pandas as pd
import numpy as np
import optuna
from optuna.trial import TrialState
from rdkit import Chem
from rdkit.Chem import AllChem
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import xgboost as xgb
import os
from datasets import load_from_disk
from scipy.stats import spearmanr
import matplotlib.pyplot as plt

base_path = "/scratch/pranamlab/sophtang/home/scoring/PeptiVerse"

def save_and_plot_predictions(y_true_train, y_pred_train, y_true_val, y_pred_val, output_path):
    os.makedirs(output_path, exist_ok=True)

    # Save training predictions
    train_df = pd.DataFrame({'True Permeability': y_true_train, 'Predicted Permeability': y_pred_train})
    train_df.to_csv(os.path.join(output_path, 'train_predictions.csv'), index=False)

    # Save validation predictions
    val_df = pd.DataFrame({'True Permeability': y_true_val, 'Predicted Permeability': y_pred_val})
    val_df.to_csv(os.path.join(output_path, 'val_predictions.csv'), index=False)

    # Plot training predictions
    plot_correlation(
        y_true_train,
        y_pred_train,
        title="Training Set Correlation Plot",
        output_file=os.path.join(output_path, 'train_correlation.png'),
    )

    # Plot validation predictions
    plot_correlation(
        y_true_val,
        y_pred_val,
        title="Validation Set Correlation Plot",
        output_file=os.path.join(output_path, 'val_correlation.png'),
    )

def plot_correlation(y_true, y_pred, title, output_file):
    spearman_corr, _ = spearmanr(y_true, y_pred)

    # Scatter plot
    plt.figure(figsize=(10, 8))
    plt.scatter(y_true, y_pred, alpha=0.5, label='Data points', color='#BC80FF')
    plt.plot([min(y_true), max(y_true)], [min(y_true), max(y_true)], color='teal', linestyle='--', label='Ideal fit')

    # Add annotations
    plt.title(f"{title}\nSpearman Correlation: {spearman_corr:.3f}")
    plt.xlabel("True Permeability (logP)")
    plt.ylabel("Predicted Permeability (logP)")
    plt.legend()

    # Save and show the plot
    plt.tight_layout()
    plt.savefig(output_file)
    plt.show()

# Load dataset
dataset = load_from_disk(f'{base_path}/data/permeability')

# Extract sequences, labels, and embeddings
sequences = np.stack(dataset['sequence'])
labels = np.stack(dataset['labels'])         # Regression labels
embeddings = np.stack(dataset['embedding'])  # Pre-trained embeddings

# Function to compute Morgan fingerprints
def compute_morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
    fps = []
    for smiles in smiles_list:
        mol = Chem.MolFromSmiles(smiles)
        if mol is not None:
            fp = AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)
            fps.append(np.array(fp))
        else:
            # If the SMILES string is invalid, use a zero vector
            fps.append(np.zeros(n_bits))
            print(f"Invalid SMILES: {smiles}")
    return np.array(fps)

# Compute Morgan fingerprints for the sequences
#morgan_fingerprints = compute_morgan_fingerprints(sequences)

# Concatenate embeddings with Morgan fingerprints
#input_features = np.concatenate([embeddings, morgan_fingerprints], axis=1)
input_features = embeddings

# Initialize global variables
best_model_path = f"{base_path}/src/permeability"
os.makedirs(best_model_path, exist_ok=True)

def trial_info_callback(study, trial):
    if study.best_trial == trial:
        print(f"Trial {trial.number}:")
        print(f"  MSE: {trial.value}")

def objective(trial):
    # Define hyperparameters
    params = {
        'objective': 'reg:squarederror',
        'lambda': trial.suggest_float('lambda', 0.1, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 0.1, 10.0, log=True),
        'gamma': trial.suggest_float('gamma', 0, 5),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.6, 0.9),
        'learning_rate': trial.suggest_float('learning_rate', 1e-5, 0.1),
        'max_depth': trial.suggest_int('max_depth', 2, 30),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 20),
        'tree_method': 'hist',
        'scale_pos_weight': trial.suggest_float('scale_pos_weight', 0.5, 10.0, log=True),
        'device': 'cuda:6',
    }
    num_boost_round = trial.suggest_int('num_boost_round', 10, 1000)

    # Train-validation split
    X_train, X_val, y_train, y_val = train_test_split(input_features, labels, test_size=0.2, random_state=42)

    # Convert data to DMatrix
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_val, label=y_val)

    # Train XGBoost
    model = xgb.train(
        params=params,
        dtrain=dtrain,
        num_boost_round=num_boost_round,
        evals=[(dvalid, "validation")],
        early_stopping_rounds=50,
        verbose_eval=False,
    )

    # Predict and evaluate
    preds_train = model.predict(dtrain)
    preds_val = model.predict(dvalid)

    mse = mean_squared_error(y_val, preds_val)

    # Calculate Spearman Rank Correlation for both train and validation
    spearman_train, _ = spearmanr(y_train, preds_train)
    spearman_val, _ = spearmanr(y_val, preds_val)
    print(f"Train Spearman: {spearman_train:.4f}, Val Spearman: {spearman_val:.4f}")

    # Save the best model
    if trial.study.user_attrs.get("best_mse", np.inf) > mse:
        trial.study.set_user_attr("best_mse", mse)
        trial.study.set_user_attr("best_spearman_train", spearman_train)
        trial.study.set_user_attr("best_spearman_val", spearman_val)
        trial.study.set_user_attr("best_trial", trial.number)
        model.save_model(os.path.join(best_model_path, "best_model.json"))
        save_and_plot_predictions(y_train, preds_train, y_val, preds_val, best_model_path)
        print(f"✓ NEW BEST! Trial {trial.number}: MSE={mse:.4f}, Train Spearman={spearman_train:.4f}, Val Spearman={spearman_val:.4f}")

    return mse

if __name__ == "__main__":
    study = optuna.create_study(direction="minimize", pruner=optuna.pruners.MedianPruner())
    study.optimize(objective, n_trials=200, callbacks=[trial_info_callback])

    # Prepare summary text
    summary = []
    summary.append("\n" + "="*60)
    summary.append("OPTIMIZATION COMPLETE")
    summary.append("="*60)
    summary.append(f"Number of finished trials: {len(study.trials)}")
    summary.append(f"\nBest Trial: #{study.user_attrs.get('best_trial', 'N/A')}")
    summary.append(f"Best MSE: {study.best_trial.value:.4f}")
    summary.append(f"Best Training Spearman Correlation: {study.user_attrs.get('best_spearman_train', None):.4f}")
    summary.append(f"Best Validation Spearman Correlation: {study.user_attrs.get('best_spearman_val', None):.4f}")
    summary.append(f"\nBest hyperparameters:")
    for key, value in study.best_trial.params.items():
        summary.append(f"  {key}: {value}")
    summary.append("="*60)

    # Print to console
    for line in summary:
        print(line)

    # Save to file
    metrics_file = os.path.join(best_model_path, "optimization_metrics.txt")
    with open(metrics_file, 'w') as f:
        f.write('\n'.join(summary))
    print(f"\n✓ Metrics saved to: {metrics_file}")
```
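The script defines compute_morgan_fingerprints but leaves the feature concatenation commented out, training on embeddings alone. A minimal sketch of the combined-features variant those commented lines describe; the SMILES strings and embedding array below are hypothetical stand-ins for the deleted dataset:

```python
import numpy as np
from rdkit import Chem
from rdkit.Chem import AllChem

# Hypothetical stand-ins; the real sequences/embeddings came from the deleted dataset.
smiles = ["CCO", "CC(=O)O", "not_a_smiles"]
embeddings = np.random.rand(3, 8).astype(np.float32)

def compute_morgan_fingerprints(smiles_list, radius=2, n_bits=2048):
    fps = []
    for s in smiles_list:
        mol = Chem.MolFromSmiles(s)
        if mol is not None:
            fps.append(np.array(AllChem.GetMorganFingerprintAsBitVect(mol, radius, nBits=n_bits)))
        else:
            fps.append(np.zeros(n_bits))  # zero vector for invalid SMILES, as in the script
    return np.array(fps)

# The commented-out combination from the deleted script:
input_features = np.concatenate([embeddings, compute_morgan_fingerprints(smiles)], axis=1)
print(input_features.shape)  # (3, 8 + 2048)
```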