Joblib
ynuozhang commited on
Commit
04c2975
·
1 Parent(s): d2421cb

major update

Browse files
basic_models.txt CHANGED
@@ -1,10 +1,10 @@
1
  Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
2
- Hemolysis, XGB, Transformer, Classifier, 0.2521, 0.4343 ,
3
- Non-Fouling, MLP, XGB, Classifier, 0.57, 0.6969,
4
- Solubility, CNN, -, Classifier, 0.377, -,
5
- Permeability (Penetrance), XGB, -, Classifier, 0.5493, -,
6
- Toxicity, -, Transformer, Classifier, -, 0.3401,
7
- Binding_affinity, unpooled, unpooled, Regression, -, -,
8
- Permeability_PAMPA, -, CNN, Regression, -, -,
9
- Permeability_CACO2, -, SVR, Regression, -, -,
10
- Halflife, Transformer, XGB, Regression, -, -,
 
1
  Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
2
+ Hemolysis, XGB, CNN (chemberta), Classifier, 0.2801, 0.564,
3
+ Non-Fouling, Transformer, XGB (peptideclm), Classifier, 0.57, 0.3892,
4
+ Solubility, CNN, Transformer (peptideclm), Classifier, 0.377, 0.329,
5
+ Permeability (Penetrance), XGB, XGB (chemberta), Classifier, 0.4301, 0.5028,
6
+ Toxicity, -, CNN (chemberta), Classifier, -, 0.49,
7
+ Binding_affinity, wt_wt_pooled, chemberta_smiles_pooled, Regression, -, -,
8
+ Permeability_PAMPA, -, CNN (chemberta), Regression, -, -,
9
+ Permeability_CACO2, -, SVR (chemberta), Regression, -, -,
10
+ Halflife, Transformer, XGB (peptideclm), Regression, -, -,
best_models.txt CHANGED
@@ -1,10 +1,10 @@
1
  Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
2
- Hemolysis, SVM, Transformer, Classifier, 0.2521, 0.4343 ,
3
- Non-Fouling, MLP, ENET, Classifier, 0.57, 0.6969,
4
- Solubility, CNN, -, Classifier, 0.377, -,
5
- Permeability (Penetrance), SVM, -, Classifier, 0.5493, -,
6
- Toxicity, -, Transformer, Classifier, -, 0.3401,
7
- Binding_affinity, unpooled, unpooled, Regression, -, -,
8
- Permeability_PAMPA, -, CNN, Regression, -, -,
9
- Permeability_CACO2, -, SVR, Regression, -, -,
10
- Halflife, Transformer, XGB, Regression, -, -,
 
1
  Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
2
+ Hemolysis, SVM, CNN (chemberta), Classifier, 0.2521, 0.564,
3
+ Non-Fouling, Transformer, ENET (peptideclm), Classifier, 0.57, 0.6969,
4
+ Solubility, CNN, Transformer (peptideclm), Classifier, 0.377, 0.329,
5
+ Permeability (Penetrance), SVM, SVM (chemberta), Classifier, 0.5493, 0.573,
6
+ Toxicity, -, CNN (chemberta), Classifier, -, 0.49,
7
+ Binding_affinity, wt_wt_pooled, chemberta_smiles_pooled, Regression, -, -,
8
+ Permeability_PAMPA, -, CNN (chemberta), Regression, -, -,
9
+ Permeability_CACO2, -, SVR (chemberta), Regression, -, -,
10
+ Halflife, Transformer, XGB (peptideclm), Regression, -, -,
inference.py CHANGED
@@ -1,16 +1,13 @@
1
  from __future__ import annotations
2
-
3
  import csv, re, json
4
  from dataclasses import dataclass
5
  from pathlib import Path
6
  from typing import Dict, Optional, Tuple, Any, List
7
-
8
  import numpy as np
9
  import torch
10
  import torch.nn as nn
11
  import joblib
12
  import xgboost as xgb
13
-
14
  from transformers import EsmModel, EsmTokenizer, AutoModelForMaskedLM
15
  from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
16
  from lightning.pytorch import seed_everything
@@ -19,13 +16,31 @@ seed_everything(1986)
19
  # -----------------------------
20
  # Manifest
21
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
22
  @dataclass(frozen=True)
23
  class BestRow:
24
  property_key: str
25
- best_wt: Optional[str]
26
- best_smiles: Optional[str]
27
- task_type: str # "Classifier" or "Regression"
28
- thr_wt: Optional[float]
29
  thr_smiles: Optional[float]
30
 
31
 
@@ -34,21 +49,16 @@ def _clean(s: str) -> str:
34
 
35
  def _none_if_dash(s: str) -> Optional[str]:
36
  s = _clean(s)
37
- if s in {"", "-", "", "NA", "N/A"}:
38
- return None
39
- return s
40
 
41
  def _float_or_none(s: str) -> Optional[float]:
42
  s = _clean(s)
43
- if s in {"", "-", "", "NA", "N/A"}:
44
- return None
45
- return float(s)
46
 
47
  def normalize_property_key(name: str) -> str:
48
  n = name.strip().lower()
49
  n = re.sub(r"\s*\(.*?\)\s*", "", n)
50
  n = n.replace("-", "_").replace(" ", "_")
51
-
52
  if "permeability" in n and "pampa" not in n and "caco" not in n:
53
  return "permeability_penetrance"
54
  if n == "binding_affinity":
@@ -60,11 +70,40 @@ def normalize_property_key(name: str) -> str:
60
  return n
61
 
62
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
63
  def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
64
- """
65
- Properties, Best_Model_WT, Best_Model_SMILES, Type, Threshold_WT, Threshold_SMILES,
66
- Hemolysis, SVM, SGB, Classifier, 0.2801, 0.2223,
67
- """
68
  p = Path(path)
69
  out: Dict[str, BestRow] = {}
70
 
@@ -90,10 +129,13 @@ def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
90
  continue
91
  prop_key = normalize_property_key(prop_raw)
92
 
 
 
 
93
  row = BestRow(
94
  property_key=prop_key,
95
- best_wt=_none_if_dash(rec.get("Best_Model_WT", "")),
96
- best_smiles=_none_if_dash(rec.get("Best_Model_SMILES", "")),
97
  task_type=_clean(rec.get("Type", "Classifier")),
98
  thr_wt=_float_or_none(rec.get("Threshold_WT", "")),
99
  thr_smiles=_float_or_none(rec.get("Threshold_SMILES", "")),
@@ -103,53 +145,32 @@ def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
103
  return out
104
 
105
 
106
- MODEL_ALIAS = {
107
- "SVM": "svm_gpu",
108
- "SVR": "svr",
109
- "ENET": "enet_gpu",
110
- "CNN": "cnn",
111
- "MLP": "mlp",
112
- "TRANSFORMER": "transformer",
113
- "XGB": "xgb",
114
- "XGB_REG": "xgb_reg",
115
- "POOLED": "pooled",
116
- "UNPOOLED": "unpooled",
117
- "TRANSFORMER_WT_LOG": "transformer_wt_log",
118
- }
119
- def canon_model(label: Optional[str]) -> Optional[str]:
120
- if label is None:
121
- return None
122
- k = label.strip().upper()
123
- return MODEL_ALIAS.get(k, label.strip().lower())
124
-
125
-
126
  # -----------------------------
127
  # Generic artifact loading
128
  # -----------------------------
129
  def find_best_artifact(model_dir: Path) -> Path:
130
- for pat in ["best_model.json", "best_model.pt", "best_model*.joblib"]:
 
131
  hits = sorted(model_dir.glob(pat))
132
  if hits:
133
  return hits[0]
 
 
 
134
  raise FileNotFoundError(f"No best_model artifact found in {model_dir}")
135
 
136
  def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path]:
137
  art = find_best_artifact(model_dir)
138
-
139
  if art.suffix == ".json":
140
  booster = xgb.Booster()
141
- #print(str(art))
142
  booster.load_model(str(art))
143
  return "xgb", booster, art
144
-
145
  if art.suffix == ".joblib":
146
  obj = joblib.load(art)
147
  return "joblib", obj, art
148
-
149
  if art.suffix == ".pt":
150
  ckpt = torch.load(art, map_location=device, weights_only=False)
151
  return "torch_ckpt", ckpt, art
152
-
153
  raise ValueError(f"Unknown artifact type: {art}")
154
 
155
 
@@ -157,7 +178,7 @@ def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path
157
  # NN architectures
158
  # -----------------------------
159
  class MaskedMeanPool(nn.Module):
160
- def forward(self, X, M): # X:(B,L,H), M:(B,L)
161
  Mf = M.unsqueeze(-1).float()
162
  denom = Mf.sum(dim=1).clamp(min=1.0)
163
  return (X * Mf).sum(dim=1) / denom
@@ -167,34 +188,25 @@ class MLPHead(nn.Module):
167
  super().__init__()
168
  self.pool = MaskedMeanPool()
169
  self.net = nn.Sequential(
170
- nn.Linear(in_dim, hidden),
171
- nn.GELU(),
172
- nn.Dropout(dropout),
173
  nn.Linear(hidden, 1),
174
  )
175
  def forward(self, X, M):
176
- z = self.pool(X, M)
177
- return self.net(z).squeeze(-1)
178
 
179
  class CNNHead(nn.Module):
180
  def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
181
  super().__init__()
182
- blocks = []
183
- ch = in_ch
184
  for _ in range(layers):
185
- blocks += [nn.Conv1d(ch, c, kernel_size=k, padding=k//2),
186
- nn.GELU(),
187
- nn.Dropout(dropout)]
188
  ch = c
189
  self.conv = nn.Sequential(*blocks)
190
  self.head = nn.Linear(c, 1)
191
-
192
  def forward(self, X, M):
193
- Xc = X.transpose(1, 2) # (B,H,L)
194
- Y = self.conv(Xc).transpose(1, 2) # (B,L,C)
195
  Mf = M.unsqueeze(-1).float()
196
- denom = Mf.sum(dim=1).clamp(min=1.0)
197
- pooled = (Y * Mf).sum(dim=1) / denom
198
  return self.head(pooled).squeeze(-1)
199
 
200
  class TransformerHead(nn.Module):
@@ -207,55 +219,36 @@ class TransformerHead(nn.Module):
207
  )
208
  self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
209
  self.head = nn.Linear(d_model, 1)
210
-
211
  def forward(self, X, M):
212
- pad_mask = ~M
213
- Z = self.proj(X)
214
- Z = self.enc(Z, src_key_padding_mask=pad_mask)
215
  Mf = M.unsqueeze(-1).float()
216
- denom = Mf.sum(dim=1).clamp(min=1.0)
217
- pooled = (Z * Mf).sum(dim=1) / denom
218
  return self.head(pooled).squeeze(-1)
219
 
220
  def _infer_in_dim_from_sd(sd: dict, model_name: str) -> int:
221
- if model_name == "mlp":
222
- return int(sd["net.0.weight"].shape[1])
223
- if model_name == "cnn":
224
- return int(sd["conv.0.weight"].shape[1])
225
- if model_name == "transformer":
226
- return int(sd["proj.weight"].shape[1])
227
  raise ValueError(model_name)
228
 
229
  def _infer_num_layers_from_sd(sd: dict, prefix: str = "enc.layers.") -> int:
230
- # enc.layers.0.*, enc.layers.1.*, ...
231
  idxs = set()
232
  for k in sd.keys():
233
  if k.startswith(prefix):
234
- rest = k[len(prefix):]
235
- m = re.match(r"(\d+)\.", rest)
236
  if m:
237
  idxs.add(int(m.group(1)))
238
  return (max(idxs) + 1) if idxs else 1
239
 
240
  def _infer_transformer_arch_from_sd(sd: dict) -> Tuple[int, int, int]:
241
- """
242
- Returns (d_model, layers, ff) inferred from weights.
243
- - d_model from proj.weight (shape: [d_model, in_dim])
244
- - layers from count of enc.layers.*
245
- - ff from enc.layers.0.linear1.weight (shape: [ff, d_model])
246
- """
247
  if "proj.weight" not in sd:
248
- raise KeyError("Missing proj.weight in state_dict; cannot infer transformer d_model.")
249
  d_model = int(sd["proj.weight"].shape[0])
250
- layers = _infer_num_layers_from_sd(sd, prefix="enc.layers.")
251
- if "enc.layers.0.linear1.weight" in sd:
252
- ff = int(sd["enc.layers.0.linear1.weight"].shape[0])
253
- else:
254
- ff = 4 * d_model
255
  return d_model, layers, ff
256
 
257
  def _pick_nhead(d_model: int) -> int:
258
- # prefer common head counts; must divide d_model
259
  for h in (8, 6, 4, 3, 2, 1):
260
  if d_model % h == 0:
261
  return h
@@ -263,7 +256,7 @@ def _pick_nhead(d_model: int) -> int:
263
 
264
  def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.device) -> nn.Module:
265
  params = ckpt["best_params"]
266
- sd = ckpt["state_dict"]
267
  in_dim = int(ckpt.get("in_dim", _infer_in_dim_from_sd(sd, model_name)))
268
  dropout = float(params.get("dropout", 0.1))
269
 
@@ -273,44 +266,127 @@ def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.devic
273
  model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
274
  layers=int(params["layers"]), dropout=dropout)
275
  elif model_name == "transformer":
276
- # if transfer-learning ckpt omits arch params, infer from state_dict. special case for transformer_wt_log
277
  d_model = params.get("d_model") or params.get("hidden") or params.get("hidden_dim")
278
-
279
  if d_model is None:
280
  d_model_i, layers_i, ff_i = _infer_transformer_arch_from_sd(sd)
281
  nhead_i = _pick_nhead(d_model_i)
282
  model = TransformerHead(
283
- in_dim=in_dim,
284
- d_model=int(d_model_i),
285
- nhead=int(params.get("nhead", nhead_i)),
286
- layers=int(params.get("layers", layers_i)),
287
- ff=int(params.get("ff", ff_i)),
288
  dropout=float(params.get("dropout", dropout)),
289
  )
290
  else:
291
  d_model = int(d_model)
292
  model = TransformerHead(
293
- in_dim=in_dim,
294
- d_model=d_model,
295
  nhead=int(params.get("nhead", _pick_nhead(d_model))),
296
  layers=int(params.get("layers", 2)),
297
  ff=int(params.get("ff", 4 * d_model)),
298
- dropout=dropout
299
  )
300
  else:
301
  raise ValueError(f"Unknown NN model_name={model_name}")
302
 
303
  model.load_state_dict(sd)
304
- model.to(device)
305
- model.eval()
306
  return model
307
 
308
 
309
  # -----------------------------
310
- # Binding affinity models
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
311
  # -----------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
312
  def affinity_to_class(y: float) -> int:
313
- # 0=High(>=9), 1=Moderate(7-9), 2=Low(<7)
314
  if y >= 9.0: return 0
315
  if y < 7.0: return 2
316
  return 1
@@ -320,38 +396,31 @@ class CrossAttnPooled(nn.Module):
320
  super().__init__()
321
  self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
322
  self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
323
-
324
  self.layers = nn.ModuleList([])
325
  for _ in range(n_layers):
326
  self.layers.append(nn.ModuleDict({
327
  "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
328
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
329
- "n1t": nn.LayerNorm(hidden),
330
- "n2t": nn.LayerNorm(hidden),
331
- "n1b": nn.LayerNorm(hidden),
332
- "n2b": nn.LayerNorm(hidden),
333
  "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
334
  "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
335
  }))
336
-
337
  self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
338
  self.reg = nn.Linear(hidden, 1)
339
  self.cls = nn.Linear(hidden, 3)
340
 
341
  def forward(self, t_vec, b_vec):
342
- t = self.t_proj(t_vec).unsqueeze(0) # (1,B,H)
343
- b = self.b_proj(b_vec).unsqueeze(0) # (1,B,H)
344
  for L in self.layers:
345
  t_attn, _ = L["attn_tb"](t, b, b)
346
  t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
347
  t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
348
-
349
  b_attn, _ = L["attn_bt"](b, t, t)
350
  b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
351
  b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
352
-
353
- z = torch.cat([t[0], b[0]], dim=-1)
354
- h = self.shared(z)
355
  return self.reg(h).squeeze(-1), self.cls(h)
356
 
357
  class CrossAttnUnpooled(nn.Module):
@@ -359,334 +428,247 @@ class CrossAttnUnpooled(nn.Module):
359
  super().__init__()
360
  self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
361
  self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
362
-
363
  self.layers = nn.ModuleList([])
364
  for _ in range(n_layers):
365
  self.layers.append(nn.ModuleDict({
366
  "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
367
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
368
- "n1t": nn.LayerNorm(hidden),
369
- "n2t": nn.LayerNorm(hidden),
370
- "n1b": nn.LayerNorm(hidden),
371
- "n2b": nn.LayerNorm(hidden),
372
  "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
373
  "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
374
  }))
375
-
376
  self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
377
  self.reg = nn.Linear(hidden, 1)
378
  self.cls = nn.Linear(hidden, 3)
379
 
380
  def _masked_mean(self, X, M):
381
  Mf = M.unsqueeze(-1).float()
382
- denom = Mf.sum(dim=1).clamp(min=1.0)
383
- return (X * Mf).sum(dim=1) / denom
384
 
385
  def forward(self, T, Mt, B, Mb):
386
- T = self.t_proj(T)
387
- Bx = self.b_proj(B)
388
- kp_t = ~Mt
389
- kp_b = ~Mb
390
-
391
  for L in self.layers:
392
  T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
393
- T = L["n1t"](T + T_attn)
394
- T = L["n2t"](T + L["fft"](T))
395
-
396
  B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
397
- Bx = L["n1b"](Bx + B_attn)
398
- Bx = L["n2b"](Bx + L["ffb"](Bx))
399
-
400
- t_pool = self._masked_mean(T, Mt)
401
- b_pool = self._masked_mean(Bx, Mb)
402
- z = torch.cat([t_pool, b_pool], dim=-1)
403
- h = self.shared(z)
404
  return self.reg(h).squeeze(-1), self.cls(h)
405
 
406
  def load_binding_model(best_model_pt: Path, pooled_or_unpooled: str, device: torch.device) -> nn.Module:
407
  ckpt = torch.load(best_model_pt, map_location=device, weights_only=False)
408
  params = ckpt["best_params"]
409
- sd = ckpt["state_dict"]
410
-
411
- # infer Ht/Hb from projection weights
412
  Ht = int(sd["t_proj.0.weight"].shape[1])
413
  Hb = int(sd["b_proj.0.weight"].shape[1])
414
-
415
- common = dict(
416
- Ht=Ht, Hb=Hb,
417
- hidden=int(params["hidden_dim"]),
418
- n_heads=int(params["n_heads"]),
419
- n_layers=int(params["n_layers"]),
420
- dropout=float(params["dropout"]),
421
- )
422
-
423
- if pooled_or_unpooled == "pooled":
424
- model = CrossAttnPooled(**common)
425
- elif pooled_or_unpooled == "unpooled":
426
- model = CrossAttnUnpooled(**common)
427
- else:
428
- raise ValueError(pooled_or_unpooled)
429
-
430
  model.load_state_dict(sd)
431
- model.to(device).eval()
432
- return model
433
 
434
 
435
  # -----------------------------
436
  # Embedding generation
437
  # -----------------------------
438
  def _safe_isin(ids: torch.Tensor, test_ids: torch.Tensor) -> torch.Tensor:
439
- """
440
- Pytorch patch
441
- """
442
  if hasattr(torch, "isin"):
443
  return torch.isin(ids, test_ids)
444
- # Fallback: compare against each special id
445
- # (B,L,1) == (1,1,K) -> (B,L,K)
446
  return (ids.unsqueeze(-1) == test_ids.view(1, 1, -1)).any(dim=-1)
447
-
448
  class SMILESEmbedder:
449
- """
450
- PeptideCLM RoFormer embeddings for SMILES.
451
- - pooled(): mean over tokens where attention_mask==1 AND token_id not in SPECIAL_IDS
452
- - unpooled(): returns token embeddings filtered to valid tokens (specials removed),
453
- plus a 1-mask of length Li (since already filtered).
454
- """
455
- def __init__(
456
- self,
457
- device: torch.device,
458
- vocab_path: str,
459
- splits_path: str,
460
- clm_name: str = "aaronfeller/PeptideCLM-23M-all",
461
- max_len: int = 512,
462
- use_cache: bool = True,
463
- ):
464
  self.device = device
465
  self.max_len = max_len
466
  self.use_cache = use_cache
467
-
468
  self.tokenizer = SMILES_SPE_Tokenizer(vocab_path, splits_path)
469
  self.model = AutoModelForMaskedLM.from_pretrained(clm_name).roformer.to(device).eval()
470
-
471
  self.special_ids = self._get_special_ids(self.tokenizer)
472
  self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
473
- if len(self.special_ids) else None)
474
-
475
  self._cache_pooled: Dict[str, torch.Tensor] = {}
476
  self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
477
 
478
  @staticmethod
479
  def _get_special_ids(tokenizer) -> List[int]:
480
- cand = [
481
- getattr(tokenizer, "pad_token_id", None),
482
- getattr(tokenizer, "cls_token_id", None),
483
- getattr(tokenizer, "sep_token_id", None),
484
- getattr(tokenizer, "bos_token_id", None),
485
- getattr(tokenizer, "eos_token_id", None),
486
- getattr(tokenizer, "mask_token_id", None),
487
- ]
488
  return sorted({int(x) for x in cand if x is not None})
489
 
490
- def _tokenize(self, smiles_list: List[str]) -> Dict[str, torch.Tensor]:
491
- tok = self.tokenizer(
492
- smiles_list,
493
- return_tensors="pt",
494
- padding=True,
495
- truncation=True,
496
- max_length=self.max_len,
497
- )
498
- for k in tok:
499
- tok[k] = tok[k].to(self.device)
500
  if "attention_mask" not in tok:
501
  tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
502
  return tok
503
 
 
 
 
 
 
 
504
  @torch.no_grad()
505
  def pooled(self, smiles: str) -> torch.Tensor:
506
  s = smiles.strip()
507
- if self.use_cache and s in self._cache_pooled:
508
- return self._cache_pooled[s]
 
 
 
 
 
 
509
 
 
 
 
 
510
  tok = self._tokenize([s])
511
- ids = tok["input_ids"] # (1,L)
512
- attn = tok["attention_mask"].bool() # (1,L)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
513
 
514
- out = self.model(input_ids=ids, attention_mask=tok["attention_mask"])
515
- h = out.last_hidden_state # (1,L,H)
 
 
 
 
 
516
 
517
- valid = attn
 
518
  if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
519
  valid = valid & (~_safe_isin(ids, self.special_ids_t))
 
520
 
 
 
 
 
 
 
 
521
  vf = valid.unsqueeze(-1).float()
522
- summed = (h * vf).sum(dim=1) # (1,H)
523
- denom = vf.sum(dim=1).clamp(min=1e-9) # (1,1)
524
- pooled = summed / denom # (1,H)
525
-
526
- if self.use_cache:
527
- self._cache_pooled[s] = pooled
528
  return pooled
529
 
530
  @torch.no_grad()
531
  def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
532
- """
533
- Returns:
534
- X: (1, Li, H) float32 on device
535
- M: (1, Li) bool on device
536
- where Li excludes padding + special tokens.
537
- """
538
  s = smiles.strip()
539
- if self.use_cache and s in self._cache_unpooled:
540
- return self._cache_unpooled[s]
541
-
542
  tok = self._tokenize([s])
543
- ids = tok["input_ids"] # (1,L)
544
- attn = tok["attention_mask"].bool() # (1,L)
545
-
546
- out = self.model(input_ids=ids, attention_mask=tok["attention_mask"])
547
- h = out.last_hidden_state # (1,L,H)
548
-
549
- valid = attn
550
- if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
551
- valid = valid & (~_safe_isin(ids, self.special_ids_t))
552
-
553
- # filter valid tokens
554
- keep = valid[0] # (L,)
555
- X = h[:, keep, :] # (1,Li,H)
556
  M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
557
-
558
- if self.use_cache:
559
- self._cache_unpooled[s] = (X, M)
560
  return X, M
561
 
562
 
563
  class WTEmbedder:
564
- """
565
- ESM2 embeddings for AA sequences.
566
- - pooled(): mean over tokens where attention_mask==1 AND token_id not in {CLS, EOS, PAD,...}
567
- - unpooled(): returns token embeddings filtered to valid tokens (specials removed),
568
- plus a 1-mask of length Li (since already filtered).
569
- """
570
- def __init__(
571
- self,
572
- device: torch.device,
573
- esm_name: str = "facebook/esm2_t33_650M_UR50D",
574
- max_len: int = 1022,
575
- use_cache: bool = True,
576
- ):
577
  self.device = device
578
  self.max_len = max_len
579
  self.use_cache = use_cache
580
-
581
  self.tokenizer = EsmTokenizer.from_pretrained(esm_name)
582
  self.model = EsmModel.from_pretrained(esm_name, add_pooling_layer=False).to(device).eval()
583
-
584
  self.special_ids = self._get_special_ids(self.tokenizer)
585
  self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
586
- if len(self.special_ids) else None)
587
-
588
  self._cache_pooled: Dict[str, torch.Tensor] = {}
589
  self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
590
 
591
  @staticmethod
592
  def _get_special_ids(tokenizer) -> List[int]:
593
- cand = [
594
- getattr(tokenizer, "pad_token_id", None),
595
- getattr(tokenizer, "cls_token_id", None),
596
- getattr(tokenizer, "sep_token_id", None),
597
- getattr(tokenizer, "bos_token_id", None),
598
- getattr(tokenizer, "eos_token_id", None),
599
- getattr(tokenizer, "mask_token_id", None),
600
- ]
601
  return sorted({int(x) for x in cand if x is not None})
602
 
603
- def _tokenize(self, seq_list: List[str]) -> Dict[str, torch.Tensor]:
604
- tok = self.tokenizer(
605
- seq_list,
606
- return_tensors="pt",
607
- padding=True,
608
- truncation=True,
609
- max_length=self.max_len,
610
- )
611
  tok = {k: v.to(self.device) for k, v in tok.items()}
612
  if "attention_mask" not in tok:
613
  tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
614
  return tok
615
 
 
 
 
 
 
 
616
  @torch.no_grad()
617
  def pooled(self, seq: str) -> torch.Tensor:
618
  s = seq.strip()
619
- if self.use_cache and s in self._cache_pooled:
620
- return self._cache_pooled[s]
621
-
622
  tok = self._tokenize([s])
623
- ids = tok["input_ids"] # (1,L)
624
- attn = tok["attention_mask"].bool() # (1,L)
625
-
626
- out = self.model(**tok)
627
- h = out.last_hidden_state # (1,L,H)
628
-
629
- valid = attn
630
- if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
631
- valid = valid & (~_safe_isin(ids, self.special_ids_t))
632
-
633
  vf = valid.unsqueeze(-1).float()
634
- summed = (h * vf).sum(dim=1) # (1,H)
635
- denom = vf.sum(dim=1).clamp(min=1e-9) # (1,1)
636
- pooled = summed / denom # (1,H)
637
-
638
- if self.use_cache:
639
- self._cache_pooled[s] = pooled
640
  return pooled
641
 
642
  @torch.no_grad()
643
  def unpooled(self, seq: str) -> Tuple[torch.Tensor, torch.Tensor]:
644
- """
645
- Returns:
646
- X: (1, Li, H) float32 on device
647
- M: (1, Li) bool on device
648
- where Li excludes padding + special tokens.
649
- """
650
  s = seq.strip()
651
- if self.use_cache and s in self._cache_unpooled:
652
- return self._cache_unpooled[s]
653
-
654
  tok = self._tokenize([s])
655
- ids = tok["input_ids"] # (1,L)
656
- attn = tok["attention_mask"].bool() # (1,L)
657
-
658
- out = self.model(**tok)
659
- h = out.last_hidden_state # (1,L,H)
660
-
661
- valid = attn
662
- if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
663
- valid = valid & (~_safe_isin(ids, self.special_ids_t))
664
-
665
- keep = valid[0] # (L,)
666
- X = h[:, keep, :] # (1,Li,H)
667
  M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
668
-
669
- if self.use_cache:
670
- self._cache_unpooled[s] = (X, M)
671
  return X, M
672
 
673
 
674
-
675
  # -----------------------------
676
  # Predictor
677
  # -----------------------------
 
678
  class PeptiVersePredictor:
679
- """
680
- - loads best models from training_classifiers/
681
- - computes embeddings as needed (pooled/unpooled)
682
- - supports: xgb, joblib(ENET/SVM/SVR), NN(mlp/cnn/transformer), binding pooled/unpooled.
683
- """
684
  def __init__(
685
  self,
686
  manifest_path: str | Path,
687
  classifier_weight_root: str | Path,
688
  esm_name="facebook/esm2_t33_650M_UR50D",
689
  clm_name="aaronfeller/PeptideCLM-23M-all",
 
690
  smiles_vocab="tokenizer/new_vocab.txt",
691
  smiles_splits="tokenizer/new_splits.txt",
692
  device: Optional[str] = None,
@@ -697,293 +679,398 @@ class PeptiVersePredictor:
697
 
698
  self.manifest = read_best_manifest_csv(manifest_path)
699
 
700
- self.wt_embedder = WTEmbedder(self.device)
701
- self.smiles_embedder = SMILESEmbedder(self.device, clm_name=clm_name,
702
- vocab_path=str(self.root / smiles_vocab),
703
- splits_path=str(self.root / smiles_splits))
 
704
 
705
- self.models: Dict[Tuple[str, str], Any] = {}
706
- self.meta: Dict[Tuple[str, str], Dict[str, Any]] = {}
 
 
707
 
708
  self._load_all_best_models()
709
 
710
- def _resolve_dir(self, prop_key: str, model_name: str, mode: str) -> Path:
711
- # map halflife -> half_life folder on disk (common layout)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
712
  disk_prop = "half_life" if prop_key == "halflife" else prop_key
713
  base = self.training_root / disk_prop
714
 
715
- # special handling for halflife xgb_wt_log / xgb_smiles
716
- if prop_key == "halflife" and model_name in {"xgb_wt_log", "xgb_smiles"}:
717
- d = base / model_name
718
- if d.exists():
719
- return d
720
-
721
- # special handling for halflife transformer wt log folder
722
- if prop_key == "halflife" and mode == "wt" and model_name == "transformer":
723
- d = base / "transformer_wt_log"
724
- if d.exists():
725
- return d
726
-
727
- if prop_key == "halflife" and model_name == "xgb":
728
- d = base / ("xgb_wt_log" if mode == "wt" else "xgb_smiles")
729
- if d.exists():
730
- return d
731
 
732
  candidates = [
733
- base / f"{model_name}_{mode}",
734
  base / model_name,
735
  ]
736
- if mode == "wt":
737
- candidates += [base / f"{model_name}_wt"]
738
- if mode == "smiles":
739
- candidates += [base / f"{model_name}_smiles"]
740
-
741
  for d in candidates:
742
- if d.exists():
743
- return d
744
 
745
  raise FileNotFoundError(
746
- f"Cannot find model directory for {prop_key} {model_name} {mode}. Tried: {candidates}"
747
  )
748
 
749
-
750
  def _load_all_best_models(self):
751
  for prop_key, row in self.manifest.items():
752
- for mode, label, thr in [
753
- ("wt", row.best_wt, row.thr_wt),
754
- ("smiles", row.best_smiles, row.thr_smiles),
755
  ]:
756
- m = canon_model(label)
757
- if m is None:
758
  continue
 
759
 
760
- # ---- binding affinity special ----
761
  if prop_key == "binding_affinity":
762
- # label is pooled/unpooled; mode chooses folder wt_wt_* vs wt_smiles_*
763
- pooled_or_unpooled = m # "pooled" or "unpooled"
764
- folder = f"wt_{mode}_{pooled_or_unpooled}" # wt_wt_pooled / wt_smiles_unpooled etc.
765
  model_dir = self.training_root / "binding_affinity" / folder
766
  art = find_best_artifact(model_dir)
767
- if art.suffix != ".pt":
768
- raise RuntimeError(f"Binding model expected best_model.pt, got {art}")
769
- model = load_binding_model(art, pooled_or_unpooled=pooled_or_unpooled, device=self.device)
770
- self.models[(prop_key, mode)] = model
771
- self.meta[(prop_key, mode)] = {
772
- "task_type": "Regression",
773
- "threshold": None,
774
- "artifact": str(art),
775
- "model_name": pooled_or_unpooled,
 
776
  }
 
 
 
 
 
 
 
 
 
 
777
  continue
778
 
779
- model_dir = self._resolve_dir(prop_key, m, mode)
 
 
 
 
780
  kind, obj, art = load_artifact(model_dir, self.device)
781
 
782
- if kind in {"xgb", "joblib"}:
783
- self.models[(prop_key, mode)] = obj
 
784
  else:
785
- # rebuild NN architecture
786
- arch = m
787
- if arch.startswith("transformer"):
788
- arch = "transformer"
789
- elif arch.startswith("mlp"):
790
- arch = "mlp"
791
- elif arch.startswith("cnn"):
792
- arch = "cnn"
793
-
794
- self.models[(prop_key, mode)] = build_torch_model_from_ckpt(arch, obj, self.device)
795
-
796
- self.meta[(prop_key, mode)] = {
797
- "task_type": row.task_type,
798
- "threshold": thr,
799
- "artifact": str(art),
800
- "model_name": m,
801
- "kind": kind,
802
- }
803
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
804
 
805
- def _get_features_for_model(self, prop_key: str, mode: str, input_str: str):
806
- """
807
- Returns either:
808
- - pooled np array shape (1,H) for xgb/joblib
809
- - unpooled torch tensors (X,M) for NN
810
- """
811
- model = self.models[(prop_key, mode)]
812
- meta = self.meta[(prop_key, mode)]
813
- kind = meta.get("kind", None)
814
- model_name = meta.get("model_name", "")
815
 
816
- if prop_key == "binding_affinity":
817
- raise RuntimeError("Use predict_binding_affinity().")
818
-
819
- # If torch NN: needs unpooled
 
 
 
 
 
 
 
 
820
  if kind == "torch_ckpt":
821
- if mode == "wt":
822
- X, M = self.wt_embedder.unpooled(input_str)
823
- else:
824
- X, M = self.smiles_embedder.unpooled(input_str)
825
- return X, M
826
-
827
- # Otherwise pooled vectors for xgb/joblib
828
- if mode == "wt":
829
- v = self.wt_embedder.pooled(input_str) # (1,H)
830
- else:
831
- v = self.smiles_embedder.pooled(input_str) # (1,H)
832
- feats = v.detach().cpu().numpy().astype(np.float32)
833
- feats = np.nan_to_num(feats, nan=0.0)
834
- feats = np.clip(feats, np.finfo(np.float32).min, np.finfo(np.float32).max)
835
- return feats
836
-
837
- def predict_property(self, prop_key: str, mode: str, input_str: str) -> Dict[str, Any]:
838
- """
839
- mode: "wt" for AA sequence input, "smiles" for SMILES input
840
- Returns dict with score + label if classifier threshold exists.
841
- """
842
- if (prop_key, mode) not in self.models:
843
- raise KeyError(f"No model loaded for ({prop_key}, {mode}). Check manifest and folders.")
844
-
845
- meta = self.meta[(prop_key, mode)]
846
- model = self.models[(prop_key, mode)]
847
- task_type = meta["task_type"].lower()
848
- thr = meta.get("threshold", None)
849
- kind = meta.get("kind", None)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
850
 
851
  if prop_key == "binding_affinity":
852
  raise RuntimeError("Use predict_binding_affinity().")
853
 
854
- # NN path (logits / regression)
855
  if kind == "torch_ckpt":
856
- X, M = self._get_features_for_model(prop_key, mode, input_str)
857
  with torch.no_grad():
858
- y = model(X, M).squeeze().float().cpu().item()
859
- # invert log1p(hours) ONLY for WT half-life log models
860
- model_name = meta.get("model_name", "")
861
- if (
862
- prop_key == "halflife"
863
- and mode == "wt"
864
- and model_name in {"xgb_wt_log", "transformer_wt_log"}
865
- ):
866
- y = float(np.expm1(y))
867
  if task_type == "classifier":
868
- prob = float(1.0 / (1.0 + np.exp(-y))) # sigmoid(logit)
869
- out = {"property": prop_key, "mode": mode, "score": prob}
 
870
  if thr is not None:
871
- out["label"] = int(prob >= float(thr))
872
- out["threshold"] = float(thr)
873
- return out
874
  else:
875
- return {"property": prop_key, "mode": mode, "score": float(y)}
876
-
877
- if kind == "xgb":
878
- feats = self._get_features_for_model(prop_key, mode, input_str)
879
- dmat = xgb.DMatrix(feats)
880
- pred = float(model.predict(dmat)[0])
881
-
882
- # invert log1p(hours) ONLY for WT half-life log models
883
- model_name = meta.get("model_name", "")
884
- if (
885
- prop_key == "halflife"
886
- and mode == "wt"
887
- and model_name in {"xgb_wt_log", "transformer_wt_log"}
888
- ):
889
  pred = float(np.expm1(pred))
890
-
891
- out = {"property": prop_key, "mode": mode, "score": pred}
892
-
893
- return out
894
-
895
- # joblib path (svm/enet/svr)
896
- if kind == "joblib":
897
- feats = self._get_features_for_model(prop_key, mode, input_str) # (1,H)
898
- # classifier vs regressor behavior differs by estimator
899
  if task_type == "classifier":
900
  if hasattr(model, "predict_proba"):
901
  pred = float(model.predict_proba(feats)[:, 1][0])
 
 
902
  else:
903
- if hasattr(model, "decision_function"):
904
- logit = float(model.decision_function(feats)[0])
905
- pred = float(1.0 / (1.0 + np.exp(-logit)))
906
- else:
907
- pred = float(model.predict(feats)[0])
908
- out = {"property": prop_key, "mode": mode, "score": pred}
909
  if thr is not None:
910
- out["label"] = int(pred >= float(thr))
911
- out["threshold"] = float(thr)
912
- return out
913
  else:
914
  pred = float(model.predict(feats)[0])
915
- return {"property": prop_key, "mode": mode, "score": pred}
916
-
917
- raise RuntimeError(f"Unknown model kind={kind}")
 
918
 
919
- def predict_binding_affinity(self, mode: str, target_seq: str, binder_str: str) -> Dict[str, Any]:
920
- """
921
- mode: "wt" (binder is AA sequence) -> wt_wt_(pooled|unpooled)
922
- "smiles" (binder is SMILES) -> wt_smiles_(pooled|unpooled)
923
- """
924
- prop_key = "binding_affinity"
925
- if (prop_key, mode) not in self.models:
926
- raise KeyError(f"No binding model loaded for ({prop_key}, {mode}).")
927
 
928
- model = self.models[(prop_key, mode)]
929
- pooled_or_unpooled = self.meta[(prop_key, mode)]["model_name"] # pooled/unpooled
930
 
931
- # target is always WT sequence (ESM)
932
- if pooled_or_unpooled == "pooled":
933
- t_vec = self.wt_embedder.pooled(target_seq) # (1,Ht)
934
- if mode == "wt":
935
- b_vec = self.wt_embedder.pooled(binder_str) # (1,Hb)
936
- else:
937
- b_vec = self.smiles_embedder.pooled(binder_str) # (1,Hb)
 
 
 
 
 
 
 
 
938
  with torch.no_grad():
939
  reg, logits = model(t_vec, b_vec)
940
- affinity = float(reg.squeeze().cpu().item())
941
- cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
942
- cls_thr = affinity_to_class(affinity)
943
  else:
944
  T, Mt = self.wt_embedder.unpooled(target_seq)
945
- if mode == "wt":
946
- B, Mb = self.wt_embedder.unpooled(binder_str)
947
- else:
948
- B, Mb = self.smiles_embedder.unpooled(binder_str)
949
  with torch.no_grad():
950
  reg, logits = model(T, Mt, B, Mb)
951
- affinity = float(reg.squeeze().cpu().item())
952
- cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
953
- cls_thr = affinity_to_class(affinity)
954
-
955
- names = {0: "High (≥9)", 1: "Moderate (7-9)", 2: "Low (<7)"}
956
- return {
957
- "property": "binding_affinity",
958
- "mode": mode,
959
- "affinity": affinity,
 
960
  "class_by_threshold": names[cls_thr],
961
- "class_by_logits": names[cls_logit],
962
- "binding_model": pooled_or_unpooled,
963
  }
964
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
965
 
966
  if __name__ == "__main__":
967
- predictor = PeptiVersePredictor(
968
- manifest_path="basic_models.txt",
969
- classifier_weight_root="./"
970
- )
971
- print(predictor.predict_property("hemolysis", "wt", "GIGAVLKVLTTGLPALISWIKRKRQQ"))
972
- print(predictor.predict_binding_affinity("wt", target_seq="...", binder_str="..."))
973
 
974
- # Test Embedding #
975
- """
976
- device = torch.device("cuda:0")
977
-
978
- wt = WTEmbedder(device)
979
- sm = SMILESEmbedder(device,
980
- vocab_path="./tokeizner/new_vocab.txt",
981
- splits_path="./tokenizer/new_splits.txt"
982
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
983
 
984
- p = wt.pooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,1280)
985
- X, M = wt.unpooled("GIGAVLKVLTTGLPALISWIKRKRQQ") # (1,Li,1280), (1,Li)
986
-
987
- p2 = sm.pooled("NCC(=O)N[C@H](CS)C(=O)O") # (1,H_smiles)
988
- X2, M2 = sm.unpooled("NCC(=O)N[C@H](CS)C(=O)O") # (1,Li,H_smiles), (1,Li)
989
- """
 
1
  from __future__ import annotations
 
2
  import csv, re, json
3
  from dataclasses import dataclass
4
  from pathlib import Path
5
  from typing import Dict, Optional, Tuple, Any, List
 
6
  import numpy as np
7
  import torch
8
  import torch.nn as nn
9
  import joblib
10
  import xgboost as xgb
 
11
  from transformers import EsmModel, EsmTokenizer, AutoModelForMaskedLM
12
  from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
13
  from lightning.pytorch import seed_everything
 
16
  # -----------------------------
17
  # Manifest
18
  # -----------------------------
19
+
20
# Maps the parenthesised embedding tag from the manifest (e.g. "XGB (chemberta)")
# to the suffix used in on-disk model folder names.
EMB_TAG_TO_FOLDER_SUFFIX = {
    "wt": "wt",
    "peptideclm": "smiles",
    "chemberta": "chemberta",
}

# Maps the embedding tag to the runtime mode key used when selecting an embedder.
EMB_TAG_TO_RUNTIME_MODE = {
    "wt": "wt",
    "peptideclm": "smiles",
    "chemberta": "chemberta",
}

# Regression model names whose uncertainty comes from a MAPIE-style conformal bundle.
MAPIE_REGRESSION_MODELS = {"svr", "enet_gpu"}
# Neural-net architectures that are rebuilt from torch checkpoints.
DNN_ARCHS = {"mlp", "cnn", "transformer"}
# Model names served via XGBoost boosters.
XGB_MODELS = {"xgb", "xgb_reg", "xgb_wt_log", "xgb_smiles"}
35
+
36
+
37
@dataclass(frozen=True)
class BestRow:
    """One manifest row: the best model (and embedding tag) per input mode."""
    property_key: str                                 # normalised property name (see normalize_property_key)
    best_wt: Optional[Tuple[str, Optional[str]]]      # (canonical model, emb_tag) or None if no WT model
    best_smiles: Optional[Tuple[str, Optional[str]]]  # (canonical model, emb_tag) or None if no SMILES model
    task_type: str                                    # "Classifier" or "Regression" (from the Type column)
    thr_wt: Optional[float]                           # decision threshold for WT classifiers, else None
    thr_smiles: Optional[float]                       # decision threshold for SMILES classifiers, else None
45
 
46
 
 
49
 
50
def _none_if_dash(s: str) -> Optional[str]:
    """Return None for empty/dash/NA placeholder cells, else the cleaned string."""
    s = _clean(s)
    # NOTE(review): the set literal appears to contain "-" twice; one entry was
    # likely meant to be an en-dash ("–") — confirm against the raw file.
    return None if s in {"", "-", "-", "NA", "N/A"} else s
 
 
53
 
54
  def _float_or_none(s: str) -> Optional[float]:
55
  s = _clean(s)
56
+ return None if s in {"", "-", "-", "NA", "N/A"} else float(s)
 
 
57
 
58
  def normalize_property_key(name: str) -> str:
59
  n = name.strip().lower()
60
  n = re.sub(r"\s*\(.*?\)\s*", "", n)
61
  n = n.replace("-", "_").replace(" ", "_")
 
62
  if "permeability" in n and "pampa" not in n and "caco" not in n:
63
  return "permeability_penetrance"
64
  if n == "binding_affinity":
 
70
  return n
71
 
72
 
73
# Canonical on-disk model names keyed by the upper-cased manifest label.
# Labels not listed here fall through to plain lower-casing (see _parse_model_and_emb).
MODEL_ALIAS = {
    "SVM": "svm_gpu",
    "SVR": "svr",
    "ENET": "enet_gpu",
    "CNN": "cnn",
    "MLP": "mlp",
    "TRANSFORMER": "transformer",
    "XGB": "xgb",
    "XGB_REG": "xgb_reg",
    "POOLED": "pooled",
    "UNPOOLED": "unpooled",
    "TRANSFORMER_WT_LOG": "transformer_wt_log",
}
86
+
87
def _parse_model_and_emb(raw: Optional[str]) -> Optional[Tuple[str, Optional[str]]]:
    """Parse a manifest model cell like "XGB (chemberta)" into (canonical_model, emb_tag).

    Returns None for missing/placeholder cells. `emb_tag` is the lower-cased
    parenthesised embedding tag, or None when the cell has no parentheses.
    """
    if raw is None:
        return None
    raw = _clean(raw)
    if not raw or raw in {"-", "-", "NA", "N/A"}:
        return None

    # Split "MODEL (emb_tag)"; non-greedy groups, whole-cell anchored.
    m = re.match(r"^(.+?)\s*\((.+?)\)\s*$", raw)
    if m:
        model_raw = m.group(1).strip()
        emb_tag = m.group(2).strip().lower()
    else:
        model_raw = raw
        emb_tag = None

    # Canonicalise via the alias table; unknown labels are just lower-cased.
    canon = MODEL_ALIAS.get(model_raw.upper(), model_raw.lower())
    return canon, emb_tag
104
+
105
+
106
  def read_best_manifest_csv(path: str | Path) -> Dict[str, BestRow]:
 
 
 
 
107
  p = Path(path)
108
  out: Dict[str, BestRow] = {}
109
 
 
129
  continue
130
  prop_key = normalize_property_key(prop_raw)
131
 
132
+ best_wt = _parse_model_and_emb(_none_if_dash(rec.get("Best_Model_WT", "")))
133
+ best_smiles = _parse_model_and_emb(_none_if_dash(rec.get("Best_Model_SMILES", "")))
134
+
135
  row = BestRow(
136
  property_key=prop_key,
137
+ best_wt=best_wt,
138
+ best_smiles=best_smiles,
139
  task_type=_clean(rec.get("Type", "Classifier")),
140
  thr_wt=_float_or_none(rec.get("Threshold_WT", "")),
141
  thr_smiles=_float_or_none(rec.get("Threshold_SMILES", "")),
 
145
  return out
146
 
147
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
148
  # -----------------------------
149
  # Generic artifact loading
150
  # -----------------------------
151
def find_best_artifact(model_dir: Path) -> Path:
    """Locate the preferred model artifact inside *model_dir*.

    Glob patterns are tried in priority order; within a pattern the
    lexicographically smallest match wins. As a last resort a
    ``seed_1986/model.pt`` checkpoint is used.

    Raises:
        FileNotFoundError: when no known artifact exists in the directory.
    """
    patterns = (
        "best_model.json",
        "best_model.pt",
        "best_model*.joblib",
        "model.json",
        "model.ubj",
        "final_model.json",
    )
    for pattern in patterns:
        matches = sorted(model_dir.glob(pattern))
        if matches:
            return matches[0]
    seed_checkpoint = model_dir / "seed_1986" / "model.pt"
    if seed_checkpoint.exists():
        return seed_checkpoint
    raise FileNotFoundError(f"No best_model artifact found in {model_dir}")
161
 
162
def load_artifact(model_dir: Path, device: torch.device) -> Tuple[str, Any, Path]:
    """Load the best artifact found in *model_dir*.

    Returns (kind, object, path) where kind is one of:
      - "xgb":        an xgboost Booster (from .json or .ubj)
      - "joblib":     a pickled sklearn-style estimator (from .joblib)
      - "torch_ckpt": a raw checkpoint dict (from .pt) — NOT an nn.Module yet

    Raises:
        ValueError: for unrecognised artifact extensions.
    """
    art = find_best_artifact(model_dir)

    # Bug fix: find_best_artifact can return "model.ubj", but there was no
    # .ubj branch here, so UBJSON boosters hit the ValueError below.
    # Booster.load_model handles both JSON and UBJSON by extension.
    if art.suffix in {".json", ".ubj"}:
        booster = xgb.Booster()
        booster.load_model(str(art))
        return "xgb", booster, art

    if art.suffix == ".joblib":
        obj = joblib.load(art)
        return "joblib", obj, art

    if art.suffix == ".pt":
        # weights_only=False: checkpoints carry params dicts, not just tensors.
        # NOTE(review): this unpickles arbitrary objects — trusted files only.
        ckpt = torch.load(art, map_location=device, weights_only=False)
        return "torch_ckpt", ckpt, art

    raise ValueError(f"Unknown artifact type: {art}")
175
 
176
 
 
178
  # NN architectures
179
  # -----------------------------
180
class MaskedMeanPool(nn.Module):
    """Mean-pool token features over mask-valid positions only."""

    def forward(self, X, M):
        # weights: 1.0 where the mask is set, 0.0 elsewhere
        weights = M.unsqueeze(-1).float()
        total = (X * weights).sum(dim=1)
        count = weights.sum(dim=1).clamp(min=1.0)  # guard against all-False masks
        return total / count
 
188
  super().__init__()
189
  self.pool = MaskedMeanPool()
190
  self.net = nn.Sequential(
191
+ nn.Linear(in_dim, hidden), nn.GELU(), nn.Dropout(dropout),
 
 
192
  nn.Linear(hidden, 1),
193
  )
194
  def forward(self, X, M):
195
+ return self.net(self.pool(X, M)).squeeze(-1)
 
196
 
197
  class CNNHead(nn.Module):
198
  def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
199
  super().__init__()
200
+ blocks, ch = [], in_ch
 
201
  for _ in range(layers):
202
+ blocks += [nn.Conv1d(ch, c, kernel_size=k, padding=k//2), nn.GELU(), nn.Dropout(dropout)]
 
 
203
  ch = c
204
  self.conv = nn.Sequential(*blocks)
205
  self.head = nn.Linear(c, 1)
 
206
  def forward(self, X, M):
207
+ Y = self.conv(X.transpose(1, 2)).transpose(1, 2)
 
208
  Mf = M.unsqueeze(-1).float()
209
+ pooled = (Y * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)
 
210
  return self.head(pooled).squeeze(-1)
211
 
212
  class TransformerHead(nn.Module):
 
219
  )
220
  self.enc = nn.TransformerEncoder(enc_layer, num_layers=layers)
221
  self.head = nn.Linear(d_model, 1)
 
222
  def forward(self, X, M):
223
+ Z = self.enc(self.proj(X), src_key_padding_mask=~M)
 
 
224
  Mf = M.unsqueeze(-1).float()
225
+ pooled = (Z * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)
 
226
  return self.head(pooled).squeeze(-1)
227
 
228
  def _infer_in_dim_from_sd(sd: dict, model_name: str) -> int:
229
+ if model_name == "mlp": return int(sd["net.0.weight"].shape[1])
230
+ if model_name == "cnn": return int(sd["conv.0.weight"].shape[1])
231
+ if model_name == "transformer": return int(sd["proj.weight"].shape[1])
 
 
 
232
  raise ValueError(model_name)
233
 
234
  def _infer_num_layers_from_sd(sd: dict, prefix: str = "enc.layers.") -> int:
 
235
  idxs = set()
236
  for k in sd.keys():
237
  if k.startswith(prefix):
238
+ m = re.match(r"(\d+)\.", k[len(prefix):])
 
239
  if m:
240
  idxs.add(int(m.group(1)))
241
  return (max(idxs) + 1) if idxs else 1
242
 
243
def _infer_transformer_arch_from_sd(sd: dict) -> Tuple[int, int, int]:
    """Infer (d_model, num_layers, ff_dim) for TransformerHead from its state_dict.

    Raises:
        KeyError: when the checkpoint lacks the projection weight.
    """
    if "proj.weight" not in sd:
        raise KeyError("Missing proj.weight in state_dict")
    d_model = int(sd["proj.weight"].shape[0])
    num_layers = _infer_num_layers_from_sd(sd, prefix="enc.layers.")
    ff_key = "enc.layers.0.linear1.weight"
    # Fall back to the conventional 4x expansion when the key is absent.
    ff_dim = int(sd[ff_key].shape[0]) if ff_key in sd else 4 * d_model
    return d_model, num_layers, ff_dim
250
 
251
  def _pick_nhead(d_model: int) -> int:
 
252
  for h in (8, 6, 4, 3, 2, 1):
253
  if d_model % h == 0:
254
  return h
 
256
 
257
  def build_torch_model_from_ckpt(model_name: str, ckpt: dict, device: torch.device) -> nn.Module:
258
  params = ckpt["best_params"]
259
+ sd = ckpt["state_dict"]
260
  in_dim = int(ckpt.get("in_dim", _infer_in_dim_from_sd(sd, model_name)))
261
  dropout = float(params.get("dropout", 0.1))
262
 
 
266
  model = CNNHead(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
267
  layers=int(params["layers"]), dropout=dropout)
268
  elif model_name == "transformer":
 
269
  d_model = params.get("d_model") or params.get("hidden") or params.get("hidden_dim")
 
270
  if d_model is None:
271
  d_model_i, layers_i, ff_i = _infer_transformer_arch_from_sd(sd)
272
  nhead_i = _pick_nhead(d_model_i)
273
  model = TransformerHead(
274
+ in_dim=in_dim, d_model=int(d_model_i), nhead=int(params.get("nhead", nhead_i)),
275
+ layers=int(params.get("layers", layers_i)), ff=int(params.get("ff", ff_i)),
 
 
 
276
  dropout=float(params.get("dropout", dropout)),
277
  )
278
  else:
279
  d_model = int(d_model)
280
  model = TransformerHead(
281
+ in_dim=in_dim, d_model=d_model,
 
282
  nhead=int(params.get("nhead", _pick_nhead(d_model))),
283
  layers=int(params.get("layers", 2)),
284
  ff=int(params.get("ff", 4 * d_model)),
285
+ dropout=dropout,
286
  )
287
  else:
288
  raise ValueError(f"Unknown NN model_name={model_name}")
289
 
290
  model.load_state_dict(sd)
291
+ model.to(device).eval()
 
292
  return model
293
 
294
 
295
  # -----------------------------
296
+ # Wrappers
297
+ # -----------------------------
298
+ from sklearn.base import BaseEstimator, ClassifierMixin, RegressorMixin
299
+
300
class PassthroughRegressor(BaseEstimator, RegressorMixin):
    """sklearn-compatible shim that replays precomputed predictions."""
    def __init__(self, preds: np.ndarray):
        self.preds = preds  # precomputed predictions, consumed positionally
    def fit(self, X, y): return self  # no-op: predictions are fixed
    def predict(self, X): return self.preds[:len(X)]
305
+
306
class PassthroughClassifier(BaseEstimator, ClassifierMixin):
    """sklearn-compatible shim replaying precomputed positive-class probabilities."""
    def __init__(self, preds: np.ndarray):
        self.preds = preds  # precomputed P(class=1), consumed positionally
        self.classes_ = np.array([0, 1])
    def fit(self, X, y): return self  # no-op: predictions are fixed
    def predict(self, X): return (self.preds[:len(X)] >= 0.5).astype(int)
    def predict_proba(self, X):
        p = self.preds[:len(X)]
        return np.stack([1 - p, p], axis=1)  # columns: [P(0), P(1)]
315
+
316
+
317
  # -----------------------------
318
+ # Uncertainty helpers
319
+ # -----------------------------
320
# Seed sub-directories probed (in this order) when assembling an ensemble.
SEED_DIRS = ["seed_1986", "seed_42", "seed_0", "seed_123", "seed_12345"]

def load_seed_ensemble(model_dir: Path, arch: str, device: torch.device) -> List[nn.Module]:
    """Load every available per-seed checkpoint under *model_dir* as an eval-mode model.

    Missing seed directories are skipped silently; the result may be empty.
    """
    ensemble = []
    for sd_name in SEED_DIRS:
        pt = model_dir / sd_name / "model.pt"
        if not pt.exists():
            continue
        # weights_only=False: checkpoint carries the hyper-param dict as well.
        ckpt = torch.load(pt, map_location=device, weights_only=False)
        ensemble.append(build_torch_model_from_ckpt(arch, ckpt, device))
    return ensemble
331
+
332
+ def _binary_entropy(p: float) -> float:
333
+ p = float(np.clip(p, 1e-9, 1 - 1e-9))
334
+ return float(-p * np.log(p) - (1 - p) * np.log(1 - p))
335
+
336
def _ensemble_clf_uncertainty(ensemble: List[nn.Module], X: torch.Tensor, M: torch.Tensor) -> float:
    """Predictive entropy of the seed-ensemble mean probability (binary classifier).

    Each member outputs a logit; probabilities are averaged across members and
    the Shannon entropy of the mean probability is returned as the uncertainty.
    """
    probs = []
    with torch.no_grad():
        for m in ensemble:
            logit = m(X, M).squeeze().float().cpu().item()
            probs.append(1.0 / (1.0 + np.exp(-logit)))  # sigmoid
    return _binary_entropy(float(np.mean(probs)))
343
+
344
+ def _ensemble_reg_uncertainty(ensemble: List[nn.Module], X: torch.Tensor, M: torch.Tensor) -> float:
345
+ preds = []
346
+ with torch.no_grad():
347
+ for m in ensemble:
348
+ preds.append(m(X, M).squeeze().float().cpu().item())
349
+ return float(np.std(preds))
350
+
351
+ def _mapie_uncertainty(mapie_bundle: dict, score: float,
352
+ embedding: Optional[np.ndarray] = None) -> Tuple[float, float]:
353
+ """
354
+ Returns (ci_low, ci_high) from a conformal bundle.
355
+ - adaptive: {"quantile": q, "sigma_model": xgb, "emb_tag": ..., "adaptive": True}
356
+ Input-dependent: interval = score +/- q * sigma(embedding)
357
+ - plain_quantile: {"quantile": q, "alpha": ...}
358
+ Fixed-width: interval = score +/- q
359
+ """
360
+ # Adaptive format is input-dependent interval
361
+ if mapie_bundle.get("adaptive") and "sigma_model" in mapie_bundle:
362
+ q = float(mapie_bundle["quantile"])
363
+ if embedding is not None:
364
+ sigma_model = mapie_bundle["sigma_model"]
365
+ sigma = float(sigma_model.predict(xgb.DMatrix(embedding.reshape(1, -1)))[0])
366
+ sigma = max(sigma, 1e-6)
367
+ else:
368
+ # No embedding available - fall back to fixed interval with sigma=1
369
+ sigma = 1.0
370
+ return float(score - q * sigma), float(score + q * sigma)
371
+
372
+ # Plain quantile format
373
+ if "quantile" in mapie_bundle:
374
+ q = float(mapie_bundle["quantile"])
375
+ return float(score - q), float(score + q)
376
+
377
+ X_dummy = np.zeros((1, 1))
378
+ result = mapie.predict(X_dummy)
379
+ if isinstance(result, tuple):
380
+ intervals = np.asarray(result[1])
381
+ if intervals.ndim == 3:
382
+ return float(intervals[0, 0, 0]), float(intervals[0, 1, 0])
383
+ return float(intervals[0, 0]), float(intervals[0, 1])
384
+ raise RuntimeError(
385
+ f"Cannot extract intervals: unknown MAPIE bundle format. "
386
+ f"Bundle keys: {list(mapie_bundle.keys())}."
387
+ )
388
+
389
def affinity_to_class(y: float) -> int:
    """Bucket a predicted affinity: 0 = high (>= 9), 1 = moderate [7, 9), 2 = low (< 7)."""
    if y < 7.0:
        return 2
    return 0 if y >= 9.0 else 1
 
396
  super().__init__()
397
  self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
398
  self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
 
399
  self.layers = nn.ModuleList([])
400
  for _ in range(n_layers):
401
  self.layers.append(nn.ModuleDict({
402
  "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
403
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=False),
404
+ "n1t": nn.LayerNorm(hidden), "n2t": nn.LayerNorm(hidden),
405
+ "n1b": nn.LayerNorm(hidden), "n2b": nn.LayerNorm(hidden),
 
 
406
  "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
407
  "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
408
  }))
 
409
  self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
410
  self.reg = nn.Linear(hidden, 1)
411
  self.cls = nn.Linear(hidden, 3)
412
 
413
    def forward(self, t_vec, b_vec):
        # Project pooled target/binder vectors and prepend a length-1 sequence
        # axis — the attention modules here are batch_first=False, i.e. (seq, batch, dim).
        t = self.t_proj(t_vec).unsqueeze(0)
        b = self.b_proj(b_vec).unsqueeze(0)
        for L in self.layers:
            # target attends to binder; post-norm residuals (transposes move the
            # batch axis where LayerNorm expects it, then back).
            t_attn, _ = L["attn_tb"](t, b, b)
            t = L["n1t"]((t + t_attn).transpose(0,1)).transpose(0,1)
            t = L["n2t"]((t + L["fft"](t)).transpose(0,1)).transpose(0,1)
            # binder attends to the already-updated target.
            b_attn, _ = L["attn_bt"](b, t, t)
            b = L["n1b"]((b + b_attn).transpose(0,1)).transpose(0,1)
            b = L["n2b"]((b + L["ffb"](b)).transpose(0,1)).transpose(0,1)
        # Drop the sequence axis, fuse both sides, and emit regression value
        # plus 3-class logits from the shared representation.
        h = self.shared(torch.cat([t[0], b[0]], dim=-1))
        return self.reg(h).squeeze(-1), self.cls(h)
425
 
426
  class CrossAttnUnpooled(nn.Module):
 
428
  super().__init__()
429
  self.t_proj = nn.Sequential(nn.Linear(Ht, hidden), nn.LayerNorm(hidden))
430
  self.b_proj = nn.Sequential(nn.Linear(Hb, hidden), nn.LayerNorm(hidden))
 
431
  self.layers = nn.ModuleList([])
432
  for _ in range(n_layers):
433
  self.layers.append(nn.ModuleDict({
434
  "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
435
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
436
+ "n1t": nn.LayerNorm(hidden), "n2t": nn.LayerNorm(hidden),
437
+ "n1b": nn.LayerNorm(hidden), "n2b": nn.LayerNorm(hidden),
 
 
438
  "fft": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
439
  "ffb": nn.Sequential(nn.Linear(hidden, 4*hidden), nn.GELU(), nn.Dropout(dropout), nn.Linear(4*hidden, hidden)),
440
  }))
 
441
  self.shared = nn.Sequential(nn.Linear(2*hidden, hidden), nn.GELU(), nn.Dropout(dropout))
442
  self.reg = nn.Linear(hidden, 1)
443
  self.cls = nn.Linear(hidden, 3)
444
 
445
    def _masked_mean(self, X, M):
        # Mean over valid (True) positions only; clamp avoids 0/0 on empty masks.
        Mf = M.unsqueeze(-1).float()
        return (X * Mf).sum(dim=1) / Mf.sum(dim=1).clamp(min=1.0)
 
448
 
449
    def forward(self, T, Mt, B, Mb):
        # Per-token projections. Mt/Mb are True on VALID tokens, so they are
        # inverted for key_padding_mask (which expects True = padding).
        T = self.t_proj(T); Bx = self.b_proj(B)
        kp_t, kp_b = ~Mt, ~Mb
        for L in self.layers:
            # target tokens attend over binder tokens; post-norm residual + FFN.
            T_attn, _ = L["attn_tb"](T, Bx, Bx, key_padding_mask=kp_b)
            T = L["n1t"](T + T_attn); T = L["n2t"](T + L["fft"](T))
            # binder tokens attend over the updated target tokens.
            B_attn, _ = L["attn_bt"](Bx, T, T, key_padding_mask=kp_t)
            Bx = L["n1b"](Bx + B_attn); Bx = L["n2b"](Bx + L["ffb"](Bx))
        # Masked mean-pool each side, fuse, and emit regression + 3-class logits.
        h = self.shared(torch.cat([self._masked_mean(T, Mt), self._masked_mean(Bx, Mb)], dim=-1))
        return self.reg(h).squeeze(-1), self.cls(h)
459
 
460
def load_binding_model(best_model_pt: Path, pooled_or_unpooled: str, device: torch.device) -> nn.Module:
    """Rebuild a cross-attention binding model from a checkpoint and load its weights.

    Embedding widths Ht/Hb are inferred from the saved projection weights so the
    rebuilt architecture matches the state_dict exactly. Returns the model on
    *device* in eval mode.
    """
    # weights_only=False: checkpoint also stores the hyper-parameter dict.
    ckpt = torch.load(best_model_pt, map_location=device, weights_only=False)
    params = ckpt["best_params"]
    sd = ckpt["state_dict"]
    Ht = int(sd["t_proj.0.weight"].shape[1])  # target embedding width
    Hb = int(sd["b_proj.0.weight"].shape[1])  # binder embedding width
    common = dict(Ht=Ht, Hb=Hb, hidden=int(params["hidden_dim"]),
                  n_heads=int(params["n_heads"]), n_layers=int(params["n_layers"]),
                  dropout=float(params["dropout"]))
    # Any value other than "pooled" selects the token-level (unpooled) variant.
    cls = CrossAttnPooled if pooled_or_unpooled == "pooled" else CrossAttnUnpooled
    model = cls(**common)
    model.load_state_dict(sd)
    return model.to(device).eval()
 
473
 
474
 
475
  # -----------------------------
476
  # Embedding generation
477
  # -----------------------------
478
  def _safe_isin(ids: torch.Tensor, test_ids: torch.Tensor) -> torch.Tensor:
 
 
 
479
  if hasattr(torch, "isin"):
480
  return torch.isin(ids, test_ids)
 
 
481
  return (ids.unsqueeze(-1) == test_ids.view(1, 1, -1)).any(dim=-1)
482
+
483
  class SMILESEmbedder:
484
+ def __init__(self, device, vocab_path, splits_path,
485
+ clm_name="aaronfeller/PeptideCLM-23M-all", max_len=512, use_cache=True):
 
 
 
 
 
 
 
 
 
 
 
 
 
486
  self.device = device
487
  self.max_len = max_len
488
  self.use_cache = use_cache
 
489
  self.tokenizer = SMILES_SPE_Tokenizer(vocab_path, splits_path)
490
  self.model = AutoModelForMaskedLM.from_pretrained(clm_name).roformer.to(device).eval()
 
491
  self.special_ids = self._get_special_ids(self.tokenizer)
492
  self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
493
+ if self.special_ids else None)
 
494
  self._cache_pooled: Dict[str, torch.Tensor] = {}
495
  self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
496
 
497
  @staticmethod
498
  def _get_special_ids(tokenizer) -> List[int]:
499
+ cand = [getattr(tokenizer, f"{x}_token_id", None)
500
+ for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
 
 
 
 
 
 
501
  return sorted({int(x) for x in cand if x is not None})
502
 
503
+ def _tokenize(self, smiles_list):
504
+ tok = self.tokenizer(smiles_list, return_tensors="pt", padding=True,
505
+ truncation=True, max_length=self.max_len)
506
+ for k in tok: tok[k] = tok[k].to(self.device)
 
 
 
 
 
 
507
  if "attention_mask" not in tok:
508
  tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
509
  return tok
510
 
511
+ def _valid_mask(self, ids, attn):
512
+ valid = attn.bool()
513
+ if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
514
+ valid = valid & (~_safe_isin(ids, self.special_ids_t))
515
+ return valid
516
+
517
  @torch.no_grad()
518
  def pooled(self, smiles: str) -> torch.Tensor:
519
  s = smiles.strip()
520
+ if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
521
+ tok = self._tokenize([s])
522
+ h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
523
+ valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
524
+ vf = valid.unsqueeze(-1).float()
525
+ pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
526
+ if self.use_cache: self._cache_pooled[s] = pooled
527
+ return pooled
528
 
529
+ @torch.no_grad()
530
+ def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
531
+ s = smiles.strip()
532
+ if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
533
  tok = self._tokenize([s])
534
+ h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
535
+ valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
536
+ X = h[:, valid[0], :]
537
+ M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
538
+ if self.use_cache: self._cache_unpooled[s] = (X, M)
539
+ return X, M
540
+
541
+
542
+ class ChemBERTaEmbedder:
543
+ def __init__(self, device, model_name="DeepChem/ChemBERTa-77M-MLM",
544
+ max_len=512, use_cache=True):
545
+ from transformers import AutoTokenizer, AutoModel
546
+ self.device = device
547
+ self.max_len = max_len
548
+ self.use_cache = use_cache
549
+ self.tokenizer = AutoTokenizer.from_pretrained(model_name)
550
+ self.model = AutoModel.from_pretrained(model_name).to(device).eval()
551
+ self.special_ids = self._get_special_ids(self.tokenizer)
552
+ self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
553
+ if self.special_ids else None)
554
+ self._cache_pooled: Dict[str, torch.Tensor] = {}
555
+ self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}
556
+
557
+ @staticmethod
558
+ def _get_special_ids(tokenizer) -> List[int]:
559
+ cand = [getattr(tokenizer, f"{x}_token_id", None)
560
+ for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
561
+ return sorted({int(x) for x in cand if x is not None})
562
 
563
+ def _tokenize(self, smiles_list):
564
+ tok = self.tokenizer(smiles_list, return_tensors="pt", padding=True,
565
+ truncation=True, max_length=self.max_len)
566
+ for k in tok: tok[k] = tok[k].to(self.device)
567
+ if "attention_mask" not in tok:
568
+ tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
569
+ return tok
570
 
571
+ def _valid_mask(self, ids, attn):
572
+ valid = attn.bool()
573
  if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
574
  valid = valid & (~_safe_isin(ids, self.special_ids_t))
575
+ return valid
576
 
577
+ @torch.no_grad()
578
+ def pooled(self, smiles: str) -> torch.Tensor:
579
+ s = smiles.strip()
580
+ if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
581
+ tok = self._tokenize([s])
582
+ h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
583
+ valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
584
  vf = valid.unsqueeze(-1).float()
585
+ pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
586
+ if self.use_cache: self._cache_pooled[s] = pooled
 
 
 
 
587
  return pooled
588
 
589
  @torch.no_grad()
590
  def unpooled(self, smiles: str) -> Tuple[torch.Tensor, torch.Tensor]:
 
 
 
 
 
 
591
  s = smiles.strip()
592
+ if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
 
 
593
  tok = self._tokenize([s])
594
+ h = self.model(input_ids=tok["input_ids"], attention_mask=tok["attention_mask"]).last_hidden_state
595
+ valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
596
+ X = h[:, valid[0], :]
 
 
 
 
 
 
 
 
 
 
597
  M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
598
+ if self.use_cache: self._cache_unpooled[s] = (X, M)
 
 
599
  return X, M
600
 
601
 
602
class WTEmbedder:
    """Wild-type (amino-acid sequence) embedder backed by an ESM-2 encoder.

    Mirrors the SMILES embedders: ``pooled`` returns the mean over valid
    tokens, ``unpooled`` returns per-token states plus a mask; both cache
    by stripped sequence when ``use_cache`` is True.
    """

    def __init__(self, device, esm_name="facebook/esm2_t33_650M_UR50D", max_len=1022, use_cache=True):
        self.device = device
        self.max_len = max_len  # 1022 leaves room for ESM's CLS/EOS within its 1024 window
        self.use_cache = use_cache
        self.tokenizer = EsmTokenizer.from_pretrained(esm_name)
        # add_pooling_layer=False: pooling is done manually over valid tokens below.
        self.model = EsmModel.from_pretrained(esm_name, add_pooling_layer=False).to(device).eval()
        self.special_ids = self._get_special_ids(self.tokenizer)
        self.special_ids_t = (torch.tensor(self.special_ids, device=device, dtype=torch.long)
                              if self.special_ids else None)
        # Per-sequence memoisation; keys are stripped input strings.
        self._cache_pooled: Dict[str, torch.Tensor] = {}
        self._cache_unpooled: Dict[str, Tuple[torch.Tensor, torch.Tensor]] = {}

    @staticmethod
    def _get_special_ids(tokenizer) -> List[int]:
        """Collect the tokenizer's defined special-token ids, deduplicated and sorted."""
        cand = [getattr(tokenizer, f"{x}_token_id", None)
                for x in ("pad", "cls", "sep", "bos", "eos", "mask")]
        return sorted({int(x) for x in cand if x is not None})

    def _tokenize(self, seq_list):
        """Tokenize sequences and move every tensor onto ``self.device``."""
        tok = self.tokenizer(seq_list, return_tensors="pt", padding=True,
                             truncation=True, max_length=self.max_len)
        tok = {k: v.to(self.device) for k, v in tok.items()}
        if "attention_mask" not in tok:
            # Defensive: treat every position as attended if no mask is returned.
            tok["attention_mask"] = torch.ones_like(tok["input_ids"], dtype=torch.long, device=self.device)
        return tok

    def _valid_mask(self, ids, attn):
        """Boolean mask of positions that are attended AND not special tokens."""
        valid = attn.bool()
        if self.special_ids_t is not None and self.special_ids_t.numel() > 0:
            valid = valid & (~_safe_isin(ids, self.special_ids_t))
        return valid

    @torch.no_grad()
    def pooled(self, seq: str) -> torch.Tensor:
        """Mean of the valid-token hidden states, shape (1, hidden)."""
        s = seq.strip()
        if self.use_cache and s in self._cache_pooled: return self._cache_pooled[s]
        tok = self._tokenize([s])
        h = self.model(**tok).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        vf = valid.unsqueeze(-1).float()
        # clamp guards against a zero denominator when no token survives masking
        pooled = (h * vf).sum(dim=1) / vf.sum(dim=1).clamp(min=1e-9)
        if self.use_cache: self._cache_pooled[s] = pooled
        return pooled

    @torch.no_grad()
    def unpooled(self, seq: str) -> Tuple[torch.Tensor, torch.Tensor]:
        """Valid-token hidden states (1, L, hidden) and an all-True mask (1, L)."""
        s = seq.strip()
        if self.use_cache and s in self._cache_unpooled: return self._cache_unpooled[s]
        tok = self._tokenize([s])
        h = self.model(**tok).last_hidden_state
        valid = self._valid_mask(tok["input_ids"], tok["attention_mask"])
        # Batch size is always 1 here, hence indexing the first mask row.
        X = h[:, valid[0], :]
        M = torch.ones((1, X.shape[1]), dtype=torch.bool, device=self.device)
        if self.use_cache: self._cache_unpooled[s] = (X, M)
        return X, M
658
 
659
 
 
660
  # -----------------------------
661
  # Predictor
662
  # -----------------------------
663
+
664
  class PeptiVersePredictor:
 
 
 
 
 
665
  def __init__(
666
  self,
667
  manifest_path: str | Path,
668
  classifier_weight_root: str | Path,
669
  esm_name="facebook/esm2_t33_650M_UR50D",
670
  clm_name="aaronfeller/PeptideCLM-23M-all",
671
+ chemberta_name="DeepChem/ChemBERTa-77M-MLM",
672
  smiles_vocab="tokenizer/new_vocab.txt",
673
  smiles_splits="tokenizer/new_splits.txt",
674
  device: Optional[str] = None,
 
679
 
680
  self.manifest = read_best_manifest_csv(manifest_path)
681
 
682
+ self.wt_embedder = WTEmbedder(self.device, esm_name=esm_name)
683
+ self.smiles_embedder = SMILESEmbedder(self.device, clm_name=clm_name,
684
+ vocab_path=str(self.root / smiles_vocab),
685
+ splits_path=str(self.root / smiles_splits))
686
+ self.chemberta_embedder = ChemBERTaEmbedder(self.device, model_name=chemberta_name)
687
 
688
+ self.models: Dict[Tuple[str, str], Any] = {}
689
+ self.meta: Dict[Tuple[str, str], Dict[str, Any]] = {}
690
+ self.mapie: Dict[Tuple[str, str], dict] = {}
691
+ self.ensembles: Dict[Tuple[str, str], List] = {}
692
 
693
  self._load_all_best_models()
694
 
695
+ def _get_embedder(self, emb_tag: str):
696
+ if emb_tag == "wt": return self.wt_embedder
697
+ if emb_tag == "peptideclm": return self.smiles_embedder
698
+ if emb_tag == "chemberta": return self.chemberta_embedder
699
+ raise ValueError(f"Unknown emb_tag={emb_tag!r}")
700
+
701
+ def _embed_pooled(self, emb_tag: str, input_str: str) -> np.ndarray:
702
+ v = self._get_embedder(emb_tag).pooled(input_str)
703
+ feats = v.detach().cpu().numpy().astype(np.float32)
704
+ feats = np.nan_to_num(feats, nan=0.0)
705
+ return np.clip(feats, np.finfo(np.float32).min, np.finfo(np.float32).max)
706
+
707
+ def _embed_unpooled(self, emb_tag: str, input_str: str) -> Tuple[torch.Tensor, torch.Tensor]:
708
+ return self._get_embedder(emb_tag).unpooled(input_str)
709
+
710
    def _resolve_dir(self, prop_key: str, model_name: str, emb_tag: str) -> Path:
        """Locate the on-disk training folder for (property, model, embedding).

        Tries ``<model>_<folder_suffix>`` first, then the bare model name,
        and raises FileNotFoundError when neither exists.
        """
        # The halflife property is stored under "half_life" on disk.
        disk_prop = "half_life" if prop_key == "halflife" else prop_key
        base = self.training_root / disk_prop

        # Map the embedding tag to the folder-naming suffix; unknown tags
        # fall back to the tag itself.
        folder_suffix = EMB_TAG_TO_FOLDER_SUFFIX.get(emb_tag, emb_tag)

        # Special case: WT halflife models prefer the *_log folders
        # (presumably trained on log-transformed targets — see the expm1
        # inverse in predict_property; confirm).
        if prop_key == "halflife" and emb_tag == "wt":
            if model_name == "transformer":
                for d in [base / "transformer_wt_log", base / "transformer_wt"]:
                    if d.exists(): return d
            if model_name in {"xgb", "xgb_reg"}:
                d = base / "xgb_wt_log"
                if d.exists(): return d

        candidates = [
            base / f"{model_name}_{folder_suffix}",
            base / model_name,
        ]
        for d in candidates:
            if d.exists(): return d

        raise FileNotFoundError(
            f"Cannot find model dir for {prop_key}/{model_name}/{emb_tag}. Tried: {candidates}"
        )
734
 
 
735
    def _load_all_best_models(self):
        """Load every best model named in the manifest, plus its uncertainty assets.

        For each property and each input column ("wt" sequence / "smiles"),
        populates ``self.models``, ``self.meta``, and — where available —
        ``self.mapie`` (conformal calibration bundles) and ``self.ensembles``
        (seed ensembles for DNN checkpoints). Binding affinity is handled
        separately because its artifacts use a paired target/binder model.
        """
        for prop_key, row in self.manifest.items():
            # parsed is (model_name, emb_tag) or None when the manifest has
            # no model for that column ("-").
            for col, parsed, thr in [
                ("wt", row.best_wt, row.thr_wt),
                ("smiles", row.best_smiles, row.thr_smiles),
            ]:
                if parsed is None:
                    continue
                model_name, emb_tag = parsed

                # binding affinity
                if prop_key == "binding_affinity":
                    folder = model_name
                    # Folder name encodes the architecture variant.
                    pooled_or_unpooled = "unpooled" if "unpooled" in folder else "pooled"
                    model_dir = self.training_root / "binding_affinity" / folder
                    art = find_best_artifact(model_dir)
                    model = load_binding_model(art, pooled_or_unpooled, self.device)
                    self.models[(prop_key, col)] = model
                    self.meta[(prop_key, col)] = {
                        "task_type": "Regression",
                        "threshold": None,
                        "artifact": str(art),
                        "model_name": pooled_or_unpooled,
                        "emb_tag": emb_tag,
                        "folder": folder,
                        "kind": "binding",
                    }
                    print(f"  [LOAD] binding_affinity ({col}): folder={folder}, arch={pooled_or_unpooled}, emb_tag={emb_tag}, art={art.name}")
                    mapie_path = model_dir / "mapie_calibration.joblib"
                    if mapie_path.exists():
                        try:
                            self.mapie[(prop_key, col)] = joblib.load(mapie_path)
                            print(f"    MAPIE loaded from {mapie_path.name}")
                        except Exception as e:
                            # Best-effort: a broken bundle disables uncertainty
                            # but must not abort loading the predictor.
                            print(f"    MAPIE load FAILED for ({prop_key}, {col}): {e}")
                    else:
                        print(f"    No MAPIE bundle found (uncertainty will be unavailable)")
                    continue

                # infer emb_tag from the input column when the manifest
                # does not specify one explicitly
                if emb_tag is None:
                    emb_tag = col

                model_dir = self._resolve_dir(prop_key, model_name, emb_tag)
                kind, obj, art = load_artifact(model_dir, self.device)

                if kind == "torch_ckpt":
                    # Checkpoints store weights only; rebuild the architecture.
                    arch = self._base_arch(model_name)
                    model = build_torch_model_from_ckpt(arch, obj, self.device)
                else:
                    model = obj

                self.models[(prop_key, col)] = model
                self.meta[(prop_key, col)] = {
                    "task_type": row.task_type,
                    "threshold": thr,
                    "artifact": str(art),
                    "model_name": model_name,
                    "emb_tag": emb_tag,
                    "kind": kind,
                }

                print(f"  [LOAD] ({prop_key}, {col}): kind={kind}, model={model_name}, emb={emb_tag}, task={row.task_type}, art={art.name}")

                # MAPIE: SVR/ElasticNet, XGBoost regression, AND all regression torch_ckpt
                is_regression = row.task_type.lower() == "regression"
                wants_mapie = (
                    (model_name in MAPIE_REGRESSION_MODELS and is_regression)
                    or (kind == "xgb" and is_regression)
                    or (kind == "torch_ckpt" and is_regression)
                )
                if wants_mapie:
                    mapie_path = model_dir / "mapie_calibration.joblib"
                    if mapie_path.exists():
                        try:
                            self.mapie[(prop_key, col)] = joblib.load(mapie_path)
                            print(f"    MAPIE loaded from {mapie_path.name}")
                        except Exception as e:
                            print(f"    MAPIE load FAILED for ({prop_key}, {col}): {e}")
                    else:
                        print(f"    No MAPIE bundle found at {mapie_path} (will fall back to ensemble if available)")

                # Seed ensembles: DNN only, used when MAPIE not available
                if kind == "torch_ckpt":
                    arch = self._base_arch(model_name)
                    ens = load_seed_ensemble(model_dir, arch, self.device)
                    if ens:
                        self.ensembles[(prop_key, col)] = ens
                        if (prop_key, col) in self.mapie:
                            print(f"    Seed ensemble: {len(ens)} seeds loaded (MAPIE takes priority for regression)")
                        else:
                            unc_type = "ensemble_predictive_entropy" if row.task_type.lower() == "classifier" else "ensemble_std"
                            print(f"    Seed ensemble: {len(ens)} seeds loaded uncertainty method: {unc_type}")
                    else:
                        if (prop_key, col) in self.mapie:
                            print(f"    No seed ensemble (MAPIE covers uncertainty)")
                        else:
                            print(f"    No seed ensemble found (checked: {SEED_DIRS}) - uncertainty unavailable")

                # XGBoost/SVM classifiers: binary entropy
                if kind in ("xgb", "joblib") and row.task_type.lower() == "classifier":
                    print(f"    Uncertainty method: binary_predictive_entropy (computed at inference)")
 
 
 
 
 
 
 
837
 
838
+ @staticmethod
839
+ def _base_arch(model_name: str) -> str:
840
+ if model_name.startswith("transformer"): return "transformer"
841
+ if model_name.startswith("mlp"): return "mlp"
842
+ if model_name.startswith("cnn"): return "cnn"
843
+ return model_name
844
+
845
+ # Feature extraction
846
+ def _get_features(self, prop_key: str, col: str, input_str: str):
847
+ meta = self.meta[(prop_key, col)]
848
+ emb_tag = meta["emb_tag"]
849
+ kind = meta["kind"]
850
  if kind == "torch_ckpt":
851
+ return self._embed_unpooled(emb_tag, input_str)
852
+ return self._embed_pooled(emb_tag, input_str)
853
+
854
+ # Uncertainty
855
    def _compute_uncertainty(self, prop_key: str, col: str, input_str: str,
                             score: float) -> Tuple[Any, str]:
        """Pick and compute the uncertainty estimate for a prediction.

        Priority: conformal MAPIE interval (regression) > seed-ensemble
        statistics (DNNs) > binary predictive entropy (classifiers).
        Returns ``(value, method_name)``; value is None when no method
        applies, with the reason encoded in the method string.
        """
        meta = self.meta[(prop_key, col)]
        kind = meta["kind"]
        model_name = meta["model_name"]
        task_type = meta["task_type"].lower()
        emb_tag = meta["emb_tag"]

        # Pooled embedding for adaptive MAPIE sigma model
        def get_pooled_emb():
            return self._embed_pooled(emb_tag, input_str) if emb_tag else None

        # DNN
        if kind == "torch_ckpt":
            # Regression: prefer MAPIE if available
            if task_type == "regression":
                mapie_bundle = self.mapie.get((prop_key, col))
                if mapie_bundle:
                    # Adaptive bundles scale the interval by a learned
                    # sigma model that takes the pooled embedding.
                    emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                    lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                    return (lo, hi), "conformal_prediction_interval"
                # Fall back to seed ensemble std
                ens = self.ensembles.get((prop_key, col))
                if ens:
                    X, M = self._embed_unpooled(emb_tag, input_str)
                    return _ensemble_reg_uncertainty(ens, X, M), "ensemble_std"
                return None, "unavailable (no MAPIE bundle and no seed ensemble)"
            # Classifier: ensemble predictive entropy
            ens = self.ensembles.get((prop_key, col))
            if not ens:
                return None, "unavailable (no seed ensemble found)"
            X, M = self._embed_unpooled(emb_tag, input_str)
            return _ensemble_clf_uncertainty(ens, X, M), "ensemble_predictive_entropy"

        # XGBoost
        if kind == "xgb":
            if task_type == "classifier":
                return _binary_entropy(score), "binary_predictive_entropy"
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                return (lo, hi), "conformal_prediction_interval"
            return None, "unavailable (no MAPIE bundle for XGBoost regression)"

        # SVR / ElasticNet regression: MAPIE
        if kind == "joblib" and model_name in MAPIE_REGRESSION_MODELS and task_type == "regression":
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                emb = get_pooled_emb() if mapie_bundle.get("adaptive") else None
                lo, hi = _mapie_uncertainty(mapie_bundle, score, emb)
                return (lo, hi), "conformal_prediction_interval"
            return None, "unavailable (MAPIE bundle not found)"

        # joblib classifiers (SVM, ElasticNet used as classifier)
        if kind == "joblib" and task_type == "classifier":
            return _binary_entropy(score), "binary_predictive_entropy_single_model"

        return None, "unavailable"
914
+
915
    def predict_property(self, prop_key: str, col: str, input_str: str,
                         uncertainty: bool = False) -> Dict[str, Any]:
        """Run one property prediction for a sequence ('wt') or SMILES ('smiles') input.

        Returns a dict with at least ``property``, ``col``, ``score`` and
        ``emb_tag``; classifiers with a manifest threshold also get
        ``label``/``threshold``, and ``uncertainty=True`` adds
        ``uncertainty``/``uncertainty_type``. Raises KeyError if no model
        was loaded for the pair, RuntimeError for binding affinity (which
        has its own entry point).
        """
        if (prop_key, col) not in self.models:
            raise KeyError(f"No model loaded for ({prop_key}, {col}).")

        meta = self.meta[(prop_key, col)]
        model = self.models[(prop_key, col)]
        task_type = meta["task_type"].lower()
        thr = meta.get("threshold")
        kind = meta["kind"]
        model_name = meta["model_name"]

        if prop_key == "binding_affinity":
            raise RuntimeError("Use predict_binding_affinity().")

        # DNN
        if kind == "torch_ckpt":
            X, M = self._get_features(prop_key, col, input_str)
            with torch.no_grad():
                raw = model(X, M).squeeze().float().cpu().item()

            # WT halflife checkpoints trained on log1p targets are inverted here.
            if prop_key == "halflife" and col == "wt" and "log" in model_name:
                raw = float(np.expm1(raw))

            if task_type == "classifier":
                # Model emits a logit; convert to probability.
                score = float(1.0 / (1.0 + np.exp(-raw)))
                out = {"property": prop_key, "col": col, "score": score,
                       "emb_tag": meta["emb_tag"]}
                if thr is not None:
                    out["label"] = int(score >= float(thr)); out["threshold"] = float(thr)
            else:
                out = {"property": prop_key, "col": col, "score": float(raw),
                       "emb_tag": meta["emb_tag"]}

        # XGBoost
        elif kind == "xgb":
            feats = self._get_features(prop_key, col, input_str)
            pred = float(model.predict(xgb.DMatrix(feats))[0])
            if prop_key == "halflife" and col == "wt" and "log" in model_name:
                pred = float(np.expm1(pred))
            out = {"property": prop_key, "col": col, "score": pred,
                   "emb_tag": meta["emb_tag"]}
            if task_type == "classifier" and thr is not None:
                out["label"] = int(pred >= float(thr)); out["threshold"] = float(thr)

        # joblib (SVM / ElasticNet / SVR)
        elif kind == "joblib":
            feats = self._get_features(prop_key, col, input_str)
            if task_type == "classifier":
                if hasattr(model, "predict_proba"):
                    pred = float(model.predict_proba(feats)[:, 1][0])
                elif hasattr(model, "decision_function"):
                    # Margin-only classifiers: squash the margin to (0, 1).
                    pred = float(1.0 / (1.0 + np.exp(-model.decision_function(feats)[0])))
                else:
                    pred = float(model.predict(feats)[0])
                out = {"property": prop_key, "col": col, "score": pred,
                       "emb_tag": meta["emb_tag"]}
                if thr is not None:
                    out["label"] = int(pred >= float(thr)); out["threshold"] = float(thr)
            else:
                pred = float(model.predict(feats)[0])
                out = {"property": prop_key, "col": col, "score": pred,
                       "emb_tag": meta["emb_tag"]}
        else:
            raise RuntimeError(f"Unknown kind={kind}")

        if uncertainty:
            u_val, u_type = self._compute_uncertainty(prop_key, col, input_str, out["score"])
            out["uncertainty"] = u_val
            out["uncertainty_type"] = u_type

        return out
 
987
 
988
    def predict_binding_affinity(self, col: str, target_seq: str, binder_str: str,
                                 uncertainty: bool = False) -> Dict[str, Any]:
        """Predict target/binder affinity with the paired binding model.

        The target is always embedded with the WT (sequence) embedder;
        the binder embedder is chosen by the stored emb_tag, falling back
        to the column ('wt' -> sequence, otherwise SMILES). Returns the
        regression value plus two class labels: one thresholded from the
        affinity and one from the model's classification head.
        """
        prop_key = "binding_affinity"
        if (prop_key, col) not in self.models:
            raise KeyError(f"No binding model loaded for ({prop_key}, {col}).")

        model = self.models[(prop_key, col)]
        meta = self.meta[(prop_key, col)]
        arch = meta["model_name"]
        emb_tag = meta.get("emb_tag")

        if arch == "pooled":
            t_vec = self.wt_embedder.pooled(target_seq)
            b_vec = self._get_embedder(emb_tag or col).pooled(binder_str) if emb_tag else \
                (self.wt_embedder.pooled(binder_str) if col == "wt" else self.smiles_embedder.pooled(binder_str))
            with torch.no_grad():
                reg, logits = model(t_vec, b_vec)
        else:
            T, Mt = self.wt_embedder.unpooled(target_seq)
            binder_emb = self._get_embedder(emb_tag or col) if emb_tag else \
                (self.wt_embedder if col == "wt" else self.smiles_embedder)
            B, Mb = binder_emb.unpooled(binder_str)
            with torch.no_grad():
                reg, logits = model(T, Mt, B, Mb)

        affinity = float(reg.squeeze().cpu().item())
        cls_logit = int(torch.argmax(logits, dim=-1).cpu().item())
        cls_thr = affinity_to_class(affinity)
        names = {0: "High (≥9)", 1: "Moderate (7-9)", 2: "Low (<7)"}

        out = {
            "property": "binding_affinity",
            "col": col,
            "affinity": affinity,
            "class_by_threshold": names[cls_thr],
            "class_by_logits": names[cls_logit],
            "binding_model": arch,
        }

        if uncertainty:
            mapie_bundle = self.mapie.get((prop_key, col))
            if mapie_bundle:
                if mapie_bundle.get("adaptive") and "sigma_model" in mapie_bundle:
                    # Concatenate target + binder pooled embeddings for sigma model
                    binder_emb_tag = mapie_bundle.get("emb_tag") or col
                    # NOTE(review): target_emb_tag is read but never used —
                    # the target is always embedded with wt_embedder below.
                    target_emb_tag = mapie_bundle.get("target_emb_tag", "wt")
                    t_vec = self.wt_embedder.pooled(target_seq).cpu().float().numpy()
                    b_vec = self._get_embedder(binder_emb_tag).pooled(binder_str).cpu().float().numpy()
                    emb = np.concatenate([t_vec, b_vec], axis=1)
                else:
                    emb = None
                lo, hi = _mapie_uncertainty(mapie_bundle, affinity, emb)
                out["uncertainty"] = (lo, hi)
                out["uncertainty_type"] = "conformal_prediction_interval"
            else:
                out["uncertainty"] = None
                out["uncertainty_type"] = "unavailable (no MAPIE bundle found)"

        return out
1047
 
1048
if __name__ == "__main__":
    # Smoke test: load every best model from the manifest next to this
    # script, then run a few example predictions.
    root = Path(__file__).resolve().parent  # current script folder

    predictor = PeptiVersePredictor(
        manifest_path=root / "best_models.txt",
        classifier_weight_root=root
    )
    print(predictor.training_root)
    print("MAPIE keys:", list(predictor.mapie.keys()))
    print("Ensemble keys:", list(predictor.ensembles.keys()))

    # Example amino-acid sequence and cyclic-peptide SMILES inputs.
    seq = "GIGAVLKVLTTGLPALISWIKRKRQQ"
    smiles = "C(C)C[C@@H]1NC(=O)[C@@H]2CCCN2C(=O)[C@@H](CC(C)C)NC(=O)[C@@H](CC(C)C)N(C)C(=O)[C@H](C)NC(=O)[C@H](Cc2ccccc2)NC1=O"

    print(predictor.predict_property("hemolysis", "wt", seq))
    print(predictor.predict_property("hemolysis", "smiles", smiles, uncertainty=True))
    print(predictor.predict_property("nf", "wt", seq, uncertainty=True))
    print(predictor.predict_property("nf", "smiles", smiles, uncertainty=True))
    print(predictor.predict_binding_affinity("wt", target_seq=seq, binder_str="GIGAVLKVLT"))
    print(predictor.predict_binding_affinity("wt", target_seq=seq, binder_str="GIGAVLKVLT", uncertainty=True))
    seq1 = "GIGAVLKVLTTGLPALISWIKRKRQQ"
    seq2 = "ACDEFGHIKLMNPQRSTVWY"

    r1 = predictor.predict_binding_affinity("wt", target_seq=seq2, binder_str="GIGAVLKVLT", uncertainty=True)
    r2 = predictor.predict_property("nf", "wt", seq1, uncertainty=True)
    r3 = predictor.predict_property("nf", "wt", seq2, uncertainty=True)
    print(r1)
    print(r2)
    print(r3)
training_classifiers/binding_training.py CHANGED
@@ -51,8 +51,9 @@ def load_split_paired(path: str):
51
  # Collate: pooled paired
52
  # -----------------------------
53
  def collate_pair_pooled(batch):
54
- Pt = torch.tensor([x["target_embedding"] for x in batch], dtype=torch.float32) # (B,Ht)
55
- Pb = torch.tensor([x["binder_embedding"] for x in batch], dtype=torch.float32) # (B,Hb)
 
56
  y = torch.tensor([float(x["label"]) for x in batch], dtype=torch.float32)
57
  return Pt, Pb, y
58
 
@@ -147,7 +148,7 @@ class CrossAttnUnpooled(nn.Module):
147
  self.layers = nn.ModuleList([])
148
  for _ in range(n_layers):
149
  self.layers.append(nn.ModuleDict({
150
- "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
151
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
152
  "n1t": nn.LayerNorm(hidden),
153
  "n2t": nn.LayerNorm(hidden),
@@ -272,7 +273,8 @@ def objective_crossattn(trial: optuna.Trial, mode: str, train_ds, val_ds) -> flo
272
  # infer dims from first row
273
  if mode == "pooled":
274
  Ht = len(train_ds[0]["target_embedding"])
275
- Hb = len(train_ds[0]["binder_embedding"])
 
276
  collate = collate_pair_pooled
277
  model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
278
  train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
@@ -349,7 +351,8 @@ def run(dataset_path: str, out_dir: str, mode: str, n_trials: int = 50):
349
 
350
  if mode == "pooled":
351
  Ht = len(train_ds[0]["target_embedding"])
352
- Hb = len(train_ds[0]["binder_embedding"])
 
353
  model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
354
  collate = collate_pair_pooled
355
  train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
 
51
  # Collate: pooled paired
52
  # -----------------------------
53
def collate_pair_pooled(batch):
    """Collate pooled target/binder embedding rows into batch tensors.

    Accepts rows with either a 'binder_embedding' or a legacy 'embedding'
    key for the binder. Returns (Pt, Pb, y) float32 tensors.
    """
    binder_key = "binder_embedding" if "binder_embedding" in batch[0] else "embedding"
    targets = [row["target_embedding"] for row in batch]
    binders = [row[binder_key] for row in batch]
    labels = [float(row["label"]) for row in batch]
    return (torch.tensor(targets, dtype=torch.float32),
            torch.tensor(binders, dtype=torch.float32),
            torch.tensor(labels, dtype=torch.float32))
59
 
 
148
  self.layers = nn.ModuleList([])
149
  for _ in range(n_layers):
150
  self.layers.append(nn.ModuleDict({
151
+ "attn_tb": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True), # (B, L, H) for embeddings now
152
  "attn_bt": nn.MultiheadAttention(hidden, n_heads, dropout=dropout, batch_first=True),
153
  "n1t": nn.LayerNorm(hidden),
154
  "n2t": nn.LayerNorm(hidden),
 
273
  # infer dims from first row
274
  if mode == "pooled":
275
  Ht = len(train_ds[0]["target_embedding"])
276
+ binder_key = "binder_embedding" if "binder_embedding" in train_ds.column_names else "embedding"
277
+ Hb = len(train_ds[0][binder_key])
278
  collate = collate_pair_pooled
279
  model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
280
  train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
 
351
 
352
  if mode == "pooled":
353
  Ht = len(train_ds[0]["target_embedding"])
354
+ binder_key = "binder_embedding" if "binder_embedding" in train_ds.column_names else "embedding"
355
+ Hb = len(train_ds[0][binder_key])
356
  model = CrossAttnPooled(Ht, Hb, hidden=hidden, n_heads=n_heads, n_layers=n_layers, dropout=dropout).to(DEVICE)
357
  collate = collate_pair_pooled
358
  train_loader = DataLoader(train_ds, batch_size=batch, shuffle=True, num_workers=4, pin_memory=True, collate_fn=collate)
training_classifiers/long_aggregated.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:2830a2099262d9e7ffdff70bc789b74a69bb44bb3dd380d8d05b91c9d01d065a
3
- size 45506
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:513cd88f97ef4b04ef92baaec85f2a5fe255a7dd50664025b2628a4ab6d94a99
3
+ size 45539
training_classifiers/ml_uncertainty.py ADDED
@@ -0,0 +1,135 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ import pandas as pd
6
+ import xgboost as xgb
7
+ from scipy import stats
8
+ from sklearn.metrics import f1_score, roc_auc_score, precision_recall_curve
9
+ from datasets import load_from_disk, DatasetDict
10
+
11
def best_f1_threshold(y_true, y_prob):
    """Return (threshold, f1) maximizing F1 along the precision-recall curve."""
    precision, recall, thresholds = precision_recall_curve(y_true, y_prob)
    # The final PR point has no associated threshold, so drop it; the
    # epsilon keeps the division defined when precision + recall == 0.
    f1 = (2 * precision[:-1] * recall[:-1]) / (precision[:-1] + recall[:-1] + 1e-12)
    best = int(np.nanargmax(f1))
    return float(thresholds[best]), float(f1[best])
16
+
17
+
18
def bootstrap_ci(
    y_true: np.ndarray,
    y_prob: np.ndarray,
    n_bootstrap: int = 2000,
    ci: float = 0.95,
    seed: int = 1986,
) -> dict:
    """
    Non-parametric bootstrap CI for F1 (at val-optimal threshold) and AUC.
    Resamples (y_true, y_prob) pairs

    Returns a dict with per-metric mean/std/ci_low/ci_high/report entries
    plus the threshold used and the sample count.
    """
    rng = np.random.default_rng(seed=seed)
    n = len(y_true)

    # Threshold picked on the full val set
    thr, _ = best_f1_threshold(y_true, y_prob)

    f1_scores, auc_scores = [], []

    for _ in range(n_bootstrap):
        # Resample indices with replacement (standard bootstrap).
        idx = rng.integers(0, n, size=n)
        yt, yp = y_true[idx], y_prob[idx]

        # Skip degenerate bootstraps (only one class)
        if len(np.unique(yt)) < 2:
            continue

        f1_scores.append(f1_score(yt, (yp >= thr).astype(int), zero_division=0))
        auc_scores.append(roc_auc_score(yt, yp))

    # Percentile bounds for the requested two-sided CI.
    alpha = 1 - ci
    lo, hi = alpha / 2, 1 - alpha / 2

    results = {}
    for name, arr in [("f1", f1_scores), ("auc", auc_scores)]:
        arr = np.array(arr)
        results[name] = {
            "mean": float(arr.mean()),
            "std": float(arr.std()),
            "ci_low": float(np.quantile(arr, lo)),
            "ci_high": float(np.quantile(arr, hi)),
            "report": f"{arr.mean():.4f} [{np.quantile(arr, lo):.4f}, {np.quantile(arr, hi):.4f}]",
            # May be < n_bootstrap because single-class resamples are skipped.
            "n_bootstrap": len(arr),
        }

    results["threshold_used"] = float(thr)
    results["n_samples"] = int(n)
    return results
66
+
67
+ def prob_margin_uncertainty(val_preds_df: pd.DataFrame) -> pd.DataFrame:
68
+ """
69
+ Uncertainty = distance from the decision boundary in probability space.
70
+
71
+ |prob - 0.5| if = 0.0 means maximally uncertain, 0.5 means maximally confident.
72
+ Normalized to [0, 1]: confidence = 2 * |prob - 0.5|
73
+ This reflecting how far the model is from a coin-flip on given sequence.
74
+ """
75
+ df = val_preds_df.copy()
76
+ df["uncertainty"] = 1 - 2 * (df["y_prob"] - 0.5).abs() # 0=confident, 1=uncertain
77
+ df["confidence"] = 1 - df["uncertainty"] # 0=uncertain, 1=confident
78
+ return df
79
+
80
def save_ci_report(ci_results: dict, out_dir: str, model_name: str = ""):
    """Write bootstrap-CI results to <out_dir>/bootstrap_ci.json and print a summary."""
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "bootstrap_ci.json")
    with open(path, "w") as fh:
        json.dump(ci_results, fh, indent=2)

    print(f"\n=== Bootstrap 95% CI ({model_name}) ===")
    print(f"  F1  : {ci_results['f1']['report']}")
    print(f"  AUC : {ci_results['auc']['report']}")
    print(f"  (threshold={ci_results['threshold_used']:.4f}, "
          f"n_bootstrap={ci_results['f1']['n_bootstrap']}, "
          f"n_val={ci_results['n_samples']})")
    print(f"Saved to {path}")
93
+
94
+
95
def save_uncertainty_csv(df: pd.DataFrame, out_dir: str, fname: str = "val_uncertainty.csv"):
    """Write the per-molecule uncertainty frame to CSV and print summary stats."""
    os.makedirs(out_dir, exist_ok=True)
    target = os.path.join(out_dir, fname)
    df.to_csv(target, index=False)
    print("\n=== Per-molecule uncertainty ===")
    print(f"  Mean uncertainty : {df['uncertainty'].mean():.4f}")
    print(f"  Mean confidence : {df['confidence'].mean():.4f}")
    print(f"  Saved to {target}")
103
+
104
if __name__ == "__main__":
    # CLI: compute bootstrap CIs and/or probability-margin uncertainty
    # from a saved val_predictions.csv (expects y_true / y_prob columns).
    parser = argparse.ArgumentParser()
    # NOTE(review): the "uncertainty_xgb" choice is accepted but has no
    # handler below — selecting it is a silent no-op; confirm intent.
    parser.add_argument("--mode", choices=["ci", "uncertainty_xgb", "uncertainty_prob"],
                        required=True,
                        help=(
                            "ci : bootstrap CI from val_predictions.csv (all models)\n"
                            "uncertainty_prob : margin uncertainty for SVM/ElasticNet/XGB"
                        ))
    parser.add_argument("--val_preds", type=str, help="Path to val_predictions.csv")
    parser.add_argument("--model_path", type=str, help="Path to best_model.json (XGB only)")
    parser.add_argument("--dataset_path", type=str, help="HuggingFace dataset path (XGB uncertainty only)")
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--model_name", type=str, default="", help="Label for report (xgb_smiles)")
    parser.add_argument("--n_bootstrap", type=int, default=2000)
    args = parser.parse_args()

    if args.mode == "ci":
        assert args.val_preds, "--val_preds required for ci mode"
        df = pd.read_csv(args.val_preds)
        ci = bootstrap_ci(df["y_true"].values, df["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
    elif args.mode == "uncertainty_prob":
        assert args.val_preds, "--val_preds required for uncertainty_prob"
        df_preds = pd.read_csv(args.val_preds)
        # CI
        ci = bootstrap_ci(df_preds["y_true"].values, df_preds["y_prob"].values,
                          n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
        # Uncertainty from margin
        df_unc = prob_margin_uncertainty(df_preds)
        save_uncertainty_csv(df_unc, args.out_dir, "val_uncertainty_prob.csv")
training_classifiers/ml_uncertainty_reg.py ADDED
@@ -0,0 +1,182 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ import pandas as pd
6
+ import xgboost as xgb
7
+ from scipy.stats import spearmanr
8
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
9
+ from datasets import load_from_disk, DatasetDict
10
+
11
def safe_spearmanr(y_true, y_pred):
    """Spearman correlation that maps undefined results (None/NaN) to 0.0."""
    corr = spearmanr(y_true, y_pred).correlation
    if corr is None or np.isnan(corr):
        return 0.0
    return float(corr)
14
+
15
def eval_regression(y_true, y_pred):
    """Return spearman_rho, rmse, mae and r2 for one (y_true, y_pred) pair."""
    try:
        from sklearn.metrics import root_mean_squared_error  # sklearn >= 1.4
        root_mse = float(root_mean_squared_error(y_true, y_pred))
    except Exception:
        # older sklearn versions: derive RMSE from the squared error
        root_mse = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    rho = safe_spearmanr(y_true, y_pred)
    mae = float(mean_absolute_error(y_true, y_pred))
    rsq = float(r2_score(y_true, y_pred))
    return {"spearman_rho": rho, "rmse": root_mse, "mae": mae, "r2": rsq}
27
+
28
+ # ======================== Bootstrap CI =========================================
29
+
30
def bootstrap_ci_reg(
    y_true: np.ndarray,
    y_pred: np.ndarray,
    n_bootstrap: int = 2000,
    ci: float = 0.95,
    seed: int = 1986,
) -> dict:
    """
    Percentile bootstrap CI for regression metrics.

    Uses percentile method (not t-CI) because:
    - Spearman rho is bounded [-1, 1] - t-CI can produce impossible values near extremes
    - RMSE is strictly positive - symmetric t-CI is inappropriate near 0
    - Percentile bootstrap makes no distributional assumptions

    A Fisher z-transform CI for rho is also computed as a cross-check.

    Args:
        y_true, y_pred: validation targets/predictions (same length).
        n_bootstrap: number of bootstrap resamples.
        ci: confidence level (e.g. 0.95).
        seed: RNG seed for reproducible resampling.

    Returns:
        dict keyed by metric name with mean/std/ci_low/ci_high/report entries,
        plus "n_samples".

    Raises:
        ValueError: if every resample is degenerate (constant y_true).
    """
    rng = np.random.default_rng(seed=seed)
    n = len(y_true)
    alpha = 1 - ci
    lo, hi = alpha / 2, 1 - alpha / 2

    boot_metrics = {k: [] for k in ["spearman_rho", "rmse", "mae", "r2"]}

    for _ in range(n_bootstrap):
        idx = rng.integers(0, n, size=n)
        yt, yp = y_true[idx], y_pred[idx]
        if len(np.unique(yt)) < 2:
            # rho and R^2 are undefined on a constant resample; skip it
            continue
        m = eval_regression(yt, yp)
        for k in boot_metrics:
            boot_metrics[k].append(m[k])

    # Robustness fix: an all-degenerate input previously crashed inside
    # np.quantile with an opaque error; fail fast with a clear message.
    if not boot_metrics["spearman_rho"]:
        raise ValueError(
            "All bootstrap resamples were degenerate (constant y_true); "
            "cannot compute confidence intervals."
        )

    results = {}
    for name, vals in boot_metrics.items():
        arr = np.array(vals)
        results[name] = {
            "mean": float(arr.mean()),
            "std": float(arr.std()),
            "ci_low": float(np.quantile(arr, lo)),
            "ci_high": float(np.quantile(arr, hi)),
            "report": f"{arr.mean():.4f} [{np.quantile(arr, lo):.4f}, {np.quantile(arr, hi):.4f}]",
            "n_bootstrap": len(arr),
        }

    # Fisher z-transform CI for Spearman rho (cross-check, more accurate near ±1).
    # BUG FIX: the critical value was hardcoded to 1.96 (95%) even when the
    # caller requested a different `ci` level; derive it from the normal quantile.
    from scipy.stats import norm
    z_crit = float(norm.ppf(1 - alpha / 2))
    rho_obs = safe_spearmanr(y_true, y_pred)
    # z-transform: arctanh(rho), SE = 1/sqrt(n-3)
    z = np.arctanh(np.clip(rho_obs, -0.9999, 0.9999))
    se_z = 1.0 / np.sqrt(max(n - 3, 1))
    z_lo = z - z_crit * se_z
    z_hi = z + z_crit * se_z
    results["spearman_rho"]["fisher_z_ci"] = {
        "ci_low": float(np.tanh(z_lo)),
        "ci_high": float(np.tanh(z_hi)),
        "report": f"[{np.tanh(z_lo):.4f}, {np.tanh(z_hi):.4f}]",
        "note": "Fisher z-transform CI - more accurate when rho > 0.9",
    }

    results["n_samples"] = int(n)
    return results
91
+
92
+
93
def residual_uncertainty(val_preds_df: pd.DataFrame, coverage: float = 0.95) -> tuple:
    """
    Gaussian residual-based prediction intervals for a fitted regressor.

    - Assume residuals ~ N(0, sigma) where sigma = std(residuals)
    - Prediction interval for molecule i: y_pred_i ± z * sigma
    - Uncertainty score = sigma (constant across all molecules for linear models)
    - Also reports the empirical (dataset-level) coverage of the interval

    Args:
        val_preds_df: DataFrame with "y_true" and "y_pred" columns.
        coverage: nominal coverage; z is looked up for 0.90/0.95/0.99 and
            anything else falls back to the 95% value.

    Returns:
        (df, meta): df is a copy of val_preds_df with interval and error columns
        added; meta is a JSON-serializable summary dict.
        (Fix: the return annotation previously claimed a bare DataFrame.)
    """
    df = val_preds_df.copy()

    residuals = df["y_true"] - df["y_pred"]
    sigma = float(residuals.std(ddof=1))
    z = {0.90: 1.645, 0.95: 1.960, 0.99: 2.576}.get(coverage, 1.960)
    half_width = z * sigma

    df["pred_interval_low"] = df["y_pred"] - half_width
    df["pred_interval_high"] = df["y_pred"] + half_width
    df["pred_interval_width"] = 2 * half_width  # constant for linear models
    df["abs_error"] = residuals.abs()

    # what fraction of y_true actually falls inside the interval
    empirical_coverage = float(
        ((df["y_true"] >= df["pred_interval_low"]) &
         (df["y_true"] <= df["pred_interval_high"])).mean()
    )

    meta = {
        "residual_std": round(sigma, 6),
        "interval_halfwidth": round(half_width, 6),
        # fix: key previously carried a stray (no-op) f-string prefix
        "nominal_coverage": coverage,
        "empirical_coverage": round(empirical_coverage, 4),
        "note": (
            # fix: missing space between the two concatenated sentences
            "Prediction interval assumes N(0, sigma) residuals. "
            "Interval width is constant across molecules for linear models. "
        ),
    }
    return df, meta
129
+
130
def save_ci_report(ci_results: dict, out_dir: str, model_name: str = ""):
    """Persist regression bootstrap-CI results to JSON and echo a summary."""
    os.makedirs(out_dir, exist_ok=True)
    path = os.path.join(out_dir, "bootstrap_ci_reg.json")
    with open(path, "w") as fh:
        json.dump(ci_results, fh, indent=2)

    print(f"\n=== Bootstrap 95% CI - Regression ({model_name}) ===")
    for metric in ("spearman_rho", "rmse", "mae", "r2"):
        entry = ci_results[metric]
        print(f" {metric:15s}: {entry['report']}")
        # the Fisher cross-check only exists for the correlation entry
        if metric == "spearman_rho" and "fisher_z_ci" in entry:
            fisher = entry["fisher_z_ci"]
            print(f" Fisher z CI : {fisher['report']} ← use this if rho > 0.9")
    print(f" n_val={ci_results['n_samples']}, n_bootstrap={ci_results['spearman_rho']['n_bootstrap']}")
    print(f"Saved to {path}")
146
if __name__ == "__main__":
    # CLI entry point: bootstrap CIs and residual-based uncertainty for regressors.
    parser = argparse.ArgumentParser()
    parser.add_argument("--mode", required=True,
                        choices=["ci", "uncertainty_residual"],
                        help=(
                            "ci : bootstrap CI from val_predictions.csv\n"
                            "uncertainty_residual: residual interval for ElasticNet/SVR"
                        ))
    parser.add_argument("--val_preds", type=str, help="Path to val_predictions.csv")
    parser.add_argument("--out_dir", type=str, required=True)
    parser.add_argument("--model_name", type=str, default="")
    parser.add_argument("--n_bootstrap", type=int, default=2000)
    args = parser.parse_args()

    if args.mode == "ci":
        assert args.val_preds, "--val_preds required"
        preds = pd.read_csv(args.val_preds)
        ci = bootstrap_ci_reg(preds["y_true"].values, preds["y_pred"].values,
                              n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
    elif args.mode == "uncertainty_residual":
        assert args.val_preds
        preds = pd.read_csv(args.val_preds)
        # CI report first, then per-molecule residual intervals
        ci = bootstrap_ci_reg(preds["y_true"].values, preds["y_pred"].values,
                              n_bootstrap=args.n_bootstrap)
        save_ci_report(ci, args.out_dir, args.model_name)
        df_unc, meta = residual_uncertainty(preds)
        unc_path = os.path.join(args.out_dir, "val_uncertainty_residual.csv")
        df_unc.to_csv(unc_path, index=False)
        meta_path = os.path.join(args.out_dir, "residual_interval_meta.json")
        with open(meta_path, "w") as fh:
            json.dump(meta, fh, indent=2)
        print(f"\nResidual interval summary:")
        print(f" Residual std : {meta['residual_std']:.4f}")
        print(f" 95% interval ± {meta['interval_halfwidth']:.4f}")
        print(f" Empirical coverage : {meta['empirical_coverage']:.4f} (nominal={meta['nominal_coverage']})")
        print(f" Saved to {unc_path}")
training_classifiers/refit_binding_affinity_seed.py ADDED
@@ -0,0 +1,270 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ import pandas as pd
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.data import DataLoader
9
+ from datasets import load_from_disk, DatasetDict
10
+ from scipy.stats import spearmanr
11
+ from scipy import stats as scipy_stats
12
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
13
+ from lightning.pytorch import seed_everything
14
+ import sys
15
+ sys.path.append(os.path.dirname(os.path.abspath(__file__)))
16
+ from binding_training import (
17
+ CrossAttnPooled,
18
+ CrossAttnUnpooled,
19
+ collate_pair_pooled,
20
+ collate_pair_unpooled,
21
+ eval_spearman_pooled,
22
+ eval_spearman_unpooled,
23
+ train_one_epoch_pooled,
24
+ train_one_epoch_unpooled,
25
+ affinity_to_class_tensor,
26
+ safe_spearmanr,
27
+ )
28
+
29
+ DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
30
+
31
def load_split_paired(path: str):
    """Load a paired DatasetDict from disk and return its (train, val) splits."""
    data = load_from_disk(path)
    if isinstance(data, DatasetDict):
        return data["train"], data["val"]
    raise ValueError(f"Expected DatasetDict at {path}")
36
+
37
+
38
def eval_regression(y_true: np.ndarray, y_pred: np.ndarray) -> dict:
    """Spearman rho, RMSE, MAE and R^2 for validation predictions."""
    metrics = {"spearman_rho": safe_spearmanr(y_true, y_pred)}
    try:
        from sklearn.metrics import root_mean_squared_error  # sklearn >= 1.4
        metrics["rmse"] = float(root_mean_squared_error(y_true, y_pred))
    except Exception:
        # older sklearn: fall back to sqrt(MSE)
        metrics["rmse"] = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    metrics["mae"] = float(mean_absolute_error(y_true, y_pred))
    metrics["r2"] = float(r2_score(y_true, y_pred))
    return metrics
50
+
51
@torch.no_grad()
def predict_all_pooled(model, loader):
    """Run the pooled model over a loader; return (y_true, y_pred) numpy arrays."""
    model.eval()
    targets, preds = [], []
    for tgt, bnd, y in loader:
        tgt = tgt.to(DEVICE, non_blocking=True)
        bnd = bnd.to(DEVICE, non_blocking=True)
        out, _ = model(tgt, bnd)  # model also returns a class logit we ignore here
        targets.append(y.numpy())
        preds.append(out.detach().cpu().numpy())
    return np.concatenate(targets), np.concatenate(preds)
62
+
63
+
64
@torch.no_grad()
def predict_all_unpooled(model, loader):
    """Run the unpooled (token-level) model over a loader; return (y_true, y_pred)."""
    model.eval()
    targets, preds = [], []
    for T, Mt, B, Mb, y in loader:
        T = T.to(DEVICE, non_blocking=True)
        Mt = Mt.to(DEVICE, non_blocking=True)
        B = B.to(DEVICE, non_blocking=True)
        Mb = Mb.to(DEVICE, non_blocking=True)
        out, _ = model(T, Mt, B, Mb)
        targets.append(y.numpy())
        preds.append(out.detach().cpu().numpy())
    return np.concatenate(targets), np.concatenate(preds)
77
+
78
+
79
def build_model(mode: str, params: dict, train_ds) -> nn.Module:
    """Instantiate the pooled/unpooled cross-attention model from Optuna params."""
    hidden = int(params["hidden_dim"])
    heads = int(params["n_heads"])
    depth = int(params["n_layers"])
    drop = float(params["dropout"])

    # some pooled datasets store the binder under "embedding" instead of "binder_embedding"
    binder_key = "binder_embedding" if "binder_embedding" in train_ds.column_names else "embedding"

    if mode == "pooled":
        dim_t = len(train_ds[0]["target_embedding"])
        dim_b = len(train_ds[0][binder_key])
        net = CrossAttnPooled(dim_t, dim_b, hidden=hidden, n_heads=heads,
                              n_layers=depth, dropout=drop)
    else:
        dim_t = len(train_ds[0]["target_embedding"][0])
        dim_b = len(train_ds[0]["binder_embedding"][0])
        net = CrossAttnUnpooled(dim_t, dim_b, hidden=hidden, n_heads=heads,
                                n_layers=depth, dropout=drop)
    return net.to(DEVICE)
97
+
98
+
99
+ # Refit
100
def refit_with_seed(dataset_path: str, base_out_dir: str, mode: str,
                    seed: int, patience: int = 20) -> dict:
    """Retrain the Optuna-selected binding-affinity model under a new seed.

    Loads best_params from <base_out_dir>/best_model.pt, retrains from scratch
    with early stopping on validation Spearman rho, and writes predictions,
    weights and metrics to <base_out_dir>/seed_<seed>/.
    """
    model_path = os.path.join(base_out_dir, "best_model.pt")
    if not os.path.exists(model_path):
        raise FileNotFoundError(
            f"No best_model.pt found at {model_path}. Run Optuna (binding_training.py) first."
        )

    checkpoint = torch.load(model_path, map_location="cpu")
    best_params = checkpoint["best_params"]
    print(f"Loaded best_params from {model_path}")
    print(json.dumps(best_params, indent=2))

    seed_everything(seed)
    out_dir = os.path.join(base_out_dir, f"seed_{seed}")
    os.makedirs(out_dir, exist_ok=True)

    train_ds, val_ds = load_split_paired(dataset_path)
    print(f"[Data] Train={len(train_ds)} Val={len(val_ds)} mode={mode}")

    batch_size = int(best_params["batch_size"])
    cls_weight = float(best_params["cls_weight"])

    # pooled/unpooled share the loop; only the collate/eval/train/predict fns differ
    if mode == "pooled":
        collate, eval_fn = collate_pair_pooled, eval_spearman_pooled
        train_fn, predict = train_one_epoch_pooled, predict_all_pooled
    else:
        collate, eval_fn = collate_pair_unpooled, eval_spearman_unpooled
        train_fn, predict = train_one_epoch_unpooled, predict_all_unpooled

    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                              num_workers=4, pin_memory=True, collate_fn=collate)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False,
                            num_workers=4, pin_memory=True, collate_fn=collate)

    model = build_model(mode, best_params, train_ds)
    optimizer = torch.optim.AdamW(model.parameters(),
                                  lr=float(best_params["lr"]),
                                  weight_decay=float(best_params["weight_decay"]))
    loss_reg = nn.MSELoss()
    loss_cls = nn.CrossEntropyLoss()

    # early stopping on validation rho
    best_rho, stale, best_state = -1e9, 0, None

    for epoch in range(1, 201):
        train_fn(model, train_loader, optimizer, loss_reg, loss_cls, cls_w=cls_weight)
        rho = eval_fn(model, val_loader)

        if rho > best_rho + 1e-6:
            best_rho, stale = rho, 0
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            stale += 1
            if stale >= patience:
                print(f" Early stopping at epoch {epoch} (best rho={best_rho:.4f})")
                break

    if best_state:
        model.load_state_dict(best_state)

    y_true, y_pred = predict(model, val_loader)
    metrics = eval_regression(y_true, y_pred)

    # Per-molecule validation predictions (with identifier columns when available)
    df_val = pd.DataFrame({
        "y_true": y_true.astype(float),
        "y_pred": y_pred.astype(float),
        "residual": (y_true - y_pred).astype(float),
        "abs_error": np.abs(y_true - y_pred).astype(float),
    })
    for col in ("target_sequence", "sequence", "affinity_class"):
        if col in val_ds.column_names:
            df_val.insert(0, col, np.asarray(val_ds[col]))
    df_val.to_csv(os.path.join(out_dir, "val_predictions.csv"), index=False)

    torch.save({"state_dict": model.state_dict(),
                "best_params": best_params,
                "mode": mode,
                "seed": seed},
               os.path.join(out_dir, "model.pt"))

    summary = {"mode": mode, "seed": seed,
               **{k: round(v, 6) for k, v in metrics.items()}}
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(summary, f, indent=2)

    print(f"\n[Seed {seed}] rho={metrics['spearman_rho']:.4f} "
          f"RMSE={metrics['rmse']:.4f} R2={metrics['r2']:.4f}")
    return summary
194
+
195
+
196
+ # CI aggregation
197
+
198
def aggregate_seed_results(base_out_dir: str, seeds: list) -> pd.DataFrame:
    """
    Collect per-seed metrics.json files and report mean ± 95% t-CI per metric.

    Args:
        base_out_dir: directory containing seed_<s>/metrics.json sub-directories.
        seeds: seed values to look for; missing ones are warned about and skipped.

    Returns:
        DataFrame with one row per metric (mean/std/ci_95/report/n_seeds),
        also written to <base_out_dir>/seed_aggregated_metrics.csv.

    Raises:
        ValueError: if no seed results are found at all.
    """
    records = []
    for seed in seeds:
        p = os.path.join(base_out_dir, f"seed_{seed}", "metrics.json")
        if os.path.exists(p):
            # fix: json.load(open(p)) leaked the file handle
            with open(p) as fh:
                records.append(json.load(fh))
        else:
            print(f"[WARN] Missing seed {seed} at {p}")

    if not records:
        raise ValueError("No seed results found — did the refit jobs complete?")

    df = pd.DataFrame(records)
    print("\nPer-seed results:")
    print(df.to_string(index=False))

    summary_rows = []
    for metric in ["spearman_rho", "rmse", "mae", "r2"]:
        vals = df[metric].values
        n = len(vals)
        mean = vals.mean()
        std = vals.std(ddof=1)  # sample std across seeds
        se = std / np.sqrt(n)
        t_crit = scipy_stats.t.ppf(0.975, df=n - 1)
        ci = t_crit * se
        row = {
            "metric": metric,
            "mean": round(mean, 4),
            "std": round(std, 4),
            "ci_95": round(ci, 4),
            "report": f"{mean:.4f} ± {ci:.4f}",
            "n_seeds": n,
        }
        # t-CIs can exceed the [-1, 1] bound for correlations near ±1
        if metric == "spearman_rho" and (mean + ci > 0.95 or mean - ci < -0.95):
            row["note"] = "rho near boundary — consider Fisher z-transform CI"
        summary_rows.append(row)

    summary_df = pd.DataFrame(summary_rows)
    out_path = os.path.join(base_out_dir, "seed_aggregated_metrics.csv")
    summary_df.to_csv(out_path, index=False)

    print("\n=== Aggregated Metrics (95% CI, t-distribution) ===")
    for _, row in summary_df.iterrows():
        note = f" ← {row['note']}" if "note" in row and pd.notna(row.get("note")) else ""
        print(f" {row['metric']:15s}: {row['report']}{note}")
    print(f"\nSaved → {out_path}")
    return summary_df
245
+
246
+
247
if __name__ == "__main__":
    # Entry point: either refit one seed or aggregate all finished seed runs.
    parser = argparse.ArgumentParser()
    parser.add_argument("--dataset_path", type=str, required=True,
                        help="Paired DatasetDict path")
    parser.add_argument("--base_out_dir", type=str, required=True,
                        help="Directory containing best_model.pt from the Optuna run")
    parser.add_argument("--mode", type=str, required=True)
    parser.add_argument("--seed", type=int, required=True)
    parser.add_argument("--patience", type=int, default=20)
    parser.add_argument("--aggregate", action="store_true",
                        help="Aggregate across seed runs instead of training")
    parser.add_argument("--all_seeds", type=int, nargs="+", default=[1986, 42, 0, 123, 12345])
    args = parser.parse_args()

    if args.aggregate:
        aggregate_seed_results(args.base_out_dir, args.all_seeds)
    else:
        refit_with_seed(dataset_path=args.dataset_path,
                        base_out_dir=args.base_out_dir,
                        mode=args.mode,
                        seed=args.seed,
                        patience=args.patience)
training_classifiers/refit_ml_walltime.py ADDED
@@ -0,0 +1,209 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Loads best params from optimization_summary.txt, refits the model once on the
3
+ train split, and appends a wall-time record to wall_clock_ml.jsonl.
4
+ """
5
+ import json
6
+ import time
7
+ import joblib
8
+ import argparse
9
+ import re
10
+ import numpy as np
11
+ from pathlib import Path
12
+ from datetime import datetime
13
+ # Classification trainers
14
+ from train_ml import (
15
+ load_split_data as load_split_cls,
16
+ train_cuml_svc,
17
+ train_cuml_elastic_net,
18
+ train_xgb,
19
+ train_svm,
20
+ )
21
+ # Regression trainers
22
+ from train_ml_regression import (
23
+ load_split_data as load_split_reg,
24
+ train_cuml_elasticnet_reg,
25
+ train_svr_reg,
26
+ train_xgb_reg,
27
+ )
28
+
29
# Ordered probe list: (artifact filename, model type, task).
# Order matters: the specific filenames must be checked before the generic
# "best_model.joblib" / "best_model.json" fallbacks.
MODEL_FILE_MAP = [
    ("best_model_cuml_svc.joblib", "svm_gpu", "classification"),
    ("best_model_cuml_enet.joblib", "enet_gpu", "auto"),
    ("best_model_svr.joblib", "svr", "regression"),
    ("best_model.joblib", "svm", "classification"),
    ("best_model.json", "xgb", "auto"),
]

def detect_model_type(model_dir: Path) -> tuple:
    """Identify (model_type, task) from the artifact files present in model_dir."""
    for fname, model_type, task in MODEL_FILE_MAP:
        if not (model_dir / fname).exists():
            continue
        if task == "auto":
            # a saved scaler is only produced by the regression pipelines
            if (model_dir / "scaler.joblib").exists():
                task = "regression"
                if model_type == "xgb":
                    model_type = "xgb_reg"
            else:
                task = "classification"
        return model_type, task
    expected = [f for f, _, _ in MODEL_FILE_MAP]
    raise FileNotFoundError(
        f"No recognised model file in {model_dir}. "
        f"Expected one of: {expected}"
    )
53
+
54
+
55
def parse_best_params(model_dir: Path) -> dict:
    """Extract the JSON block following 'Best params:' from optimization_summary.txt."""
    summary_path = model_dir / "optimization_summary.txt"
    if not summary_path.exists():
        raise FileNotFoundError(f"optimization_summary.txt not found in {model_dir}")

    text = summary_path.read_text()
    # non-greedy JSON body terminated by a ===== divider (10+ equals signs)
    match = re.search(r"Best params:\s*(\{.*?\})\s*={10,}", text, re.DOTALL)
    if match is None:
        raise ValueError(
            f"Could not find 'Best params:' JSON block in {summary_path}.\n"
            f"File contents:\n{text}"
        )
    return json.loads(match.group(1))
71
+
72
def parse_objective_and_wt(model_dir: Path) -> tuple:
    """
    Expects layout: .../training_classifiers/<objective>/<model>_<wt>/
    Example: hemolysis/svm_gpu_smiles -> objective=hemolysis, wt=smiles
    """
    parts = model_dir.parts
    folder = parts[-1].lower()
    objective = parts[-2]

    for suffix, wt in (("_chemberta", "chemberta"), ("_smiles", "smiles"), ("_wt", "wt")):
        if folder.endswith(suffix):
            return objective, wt
    # no recognised suffix: default to the wild-type embedding
    return objective, "wt"
85
+
86
def refit_and_time(model_dir: Path, dataset_path: str) -> tuple:
    """Refit the tuned model once on the train split and time the fit.

    Returns (wall_seconds, model_type)."""
    model_type, task = detect_model_type(model_dir)
    best_params = parse_best_params(model_dir)

    print(f" Model type : {model_type} ({task})")
    print(f" Best params: {best_params}")

    # Regression pipelines persist a feature scaler alongside the model
    scaler_path = model_dir / "scaler.joblib"
    scaler = joblib.load(scaler_path) if scaler_path.exists() else None

    load_fn = load_split_reg if task == "regression" else load_split_cls
    data = load_fn(dataset_path)
    print(f" Train: {data.X_train.shape} Val: {data.X_val.shape}")

    def _xgb_params(objective: str, eval_metric: str) -> dict:
        """XGB booster config: tuned hyper-params plus fixed GPU settings."""
        hp = {k: best_params[k] for k in (
            "lambda", "alpha", "gamma", "max_depth", "min_child_weight",
            "subsample", "colsample_bytree", "learning_rate",
            "num_boost_round", "early_stopping_rounds")}
        hp.update(objective=objective, eval_metric=eval_metric,
                  tree_method="hist", device="cuda")
        return hp

    # Dispatch to the matching trainer; cuML models consume best_params as-is.
    if model_type == "xgb":
        params, train_fn = _xgb_params("binary:logistic", "logloss"), train_xgb
    elif model_type == "xgb_reg":
        params, train_fn = _xgb_params("reg:squarederror", "rmse"), train_xgb_reg
    elif model_type == "svm_gpu":
        params, train_fn = best_params, train_cuml_svc
    elif model_type == "enet_gpu" and task == "classification":
        params, train_fn = best_params, train_cuml_elastic_net
    elif model_type == "enet_gpu" and task == "regression":
        params, train_fn = best_params, train_cuml_elasticnet_reg
    elif model_type == "svm":
        params, train_fn = best_params, train_svm
    elif model_type == "svr":
        params, train_fn = best_params, train_svr_reg
    else:
        raise ValueError(f"Unhandled model_type={model_type}, task={task}")

    # ---- timed block: scaling + fit, matching the original training path ----
    t0 = time.perf_counter()

    X_train, X_val = data.X_train, data.X_val
    if scaler is not None:
        X_train = scaler.transform(X_train).astype(np.float32)
        X_val = scaler.transform(X_val).astype(np.float32)

    train_fn(X_train, data.y_train, X_val, data.y_val, params)

    wall_s = time.perf_counter() - t0
    print(f" Wall time: {wall_s:.1f}s")
    return wall_s, model_type
177
+
178
def write_wall_time(logs_dir: Path, objective: str, wt: str,
                    model_type: str, wall_s: float):
    """Append one wall-clock record to <logs_dir>/<MM_DD>_wall_clock_ml.jsonl."""
    logs_dir.mkdir(parents=True, exist_ok=True)
    stamp = datetime.now().strftime("%m_%d")
    jsonl_path = logs_dir / f"{stamp}_wall_clock_ml.jsonl"

    # NOTE: wall_s is rounded to whole seconds in the log record
    record = {"model": model_type, "objective": objective,
              "wt": wt, "wall_s": round(wall_s)}
    with open(jsonl_path, "a") as fh:
        fh.write(json.dumps(record) + "\n")
    print(f" Appended to {jsonl_path}: {record}")
193
+
194
if __name__ == "__main__":
    # Entry point: refit one tuned model and log its wall-clock training time.
    parser = argparse.ArgumentParser()
    parser.add_argument("--model_dir", type=str, required=True,
                        help="e.g. .../hemolysis/svm_gpu_smiles")
    parser.add_argument("--dataset_path", type=str, required=True,
                        help="HuggingFace dataset path for this objective/embedding")
    parser.add_argument("--logs_dir", type=str, required=True,
                        help="Directory to write *_wall_clock_ml.jsonl")
    args = parser.parse_args()

    target_dir = Path(args.model_dir)
    objective, wt = parse_objective_and_wt(target_dir)
    print(f"\nObjective: {objective} Embedding: {wt}")

    elapsed, model_type = refit_and_time(target_dir, args.dataset_path)
    write_wall_time(Path(args.logs_dir), objective, wt, model_type, elapsed)
training_classifiers/refit_nn_seed.py ADDED
@@ -0,0 +1,315 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import torch
3
+ from torch.utils.data import DataLoader
4
+ from datasets import load_from_disk, DatasetDict
5
+ from sklearn.metrics import roc_auc_score, precision_recall_curve, f1_score
6
+ import torch.nn as nn
7
+ import os
8
+ import json
9
+ import pandas as pd
10
+ import argparse
11
+ from typing import Optional
12
+ from lightning.pytorch import seed_everything
13
+
14
def infer_in_dim_from_unpooled_ds(ds) -> int:
    """Embedding width (H) read from the first example's per-token embedding matrix."""
    first = ds[0]
    return int(len(first["embedding"][0]))
17
+
18
def load_split(dataset_path):
    """Return the (train, val) splits of a DatasetDict saved at dataset_path."""
    ds = load_from_disk(dataset_path)
    if not isinstance(ds, DatasetDict):
        raise ValueError("Expected DatasetDict with 'train' and 'val' splits")
    return ds["train"], ds["val"]
23
+
24
def collate_unpooled(batch):
    """Pad variable-length token embeddings to (B, Lmax, H); also return bool mask and labels."""
    lengths = [int(item["length"]) for item in batch]
    max_len = max(lengths)
    emb_dim = len(batch[0]["embedding"][0])
    n = len(batch)

    X = torch.zeros(n, max_len, emb_dim, dtype=torch.float32)
    M = torch.zeros(n, max_len, dtype=torch.bool)
    y = torch.tensor([item["label"] for item in batch], dtype=torch.float32)

    for i, item in enumerate(batch):
        emb = torch.tensor(item["embedding"], dtype=torch.float32)
        seq_len = emb.shape[0]
        X[i, :seq_len] = emb
        if "attention_mask" in item:
            mask = torch.tensor(item["attention_mask"], dtype=torch.bool)
            M[i, :seq_len] = mask[:seq_len]
        else:
            # no explicit mask: all real positions are valid
            M[i, :seq_len] = True

    return X, M, y
44
+
45
+ # ======================== Models =========================================
46
+
47
class MaskedMeanPool(nn.Module):
    """Mean over the sequence axis, counting only positions where the mask is True."""
    def forward(self, X, M):
        weights = M.unsqueeze(-1).float()
        count = weights.sum(dim=1).clamp(min=1.0)  # avoid 0-division on empty masks
        return (X * weights).sum(dim=1) / count


class MLPClassifier(nn.Module):
    """Masked mean pool followed by a one-hidden-layer MLP head (binary logit)."""
    def __init__(self, in_dim, hidden=512, dropout=0.1):
        super().__init__()
        self.pool = MaskedMeanPool()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        )

    def forward(self, X, M):
        pooled = self.pool(X, M)
        return self.net(pooled).squeeze(-1)
65
+
66
class CNNClassifier(nn.Module):
    """1-D conv stack over the token axis, masked mean pool, then a linear head."""
    def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
        super().__init__()
        stages = []
        prev = in_ch
        for _ in range(layers):
            # 'same' padding so the sequence length is preserved
            stages.append(nn.Conv1d(prev, c, kernel_size=k, padding=k // 2))
            stages.append(nn.GELU())
            stages.append(nn.Dropout(dropout))
            prev = c
        self.conv = nn.Sequential(*stages)
        self.head = nn.Linear(c, 1)

    def forward(self, X, M):
        # Conv1d wants (B, C, L); transpose in and back out
        feats = self.conv(X.transpose(1, 2)).transpose(1, 2)
        weights = M.unsqueeze(-1).float()
        pooled = (feats * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.head(pooled).squeeze(-1)
81
+
82
class TransformerClassifier(nn.Module):
    """Linear projection -> TransformerEncoder -> masked mean pool -> linear head."""
    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        layer = nn.TransformerEncoderLayer(
            d_model=d_model, nhead=nhead, dim_feedforward=ff,
            dropout=dropout, batch_first=True, activation="gelu"
        )
        self.enc = nn.TransformerEncoder(layer, num_layers=layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        # src_key_padding_mask expects True at PAD positions, hence the inversion
        hidden = self.enc(self.proj(X), src_key_padding_mask=~M)
        weights = M.unsqueeze(-1).float()
        pooled = (hidden * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.head(pooled).squeeze(-1)
98
+
99
+ # ======================== Training utils =========================================
100
+
101
def best_f1_threshold(y_true, y_prob):
    """Return (threshold, f1) maximizing F1 along the precision-recall curve."""
    prec, rec, thresholds = precision_recall_curve(y_true, y_prob)
    # the last PR point has no associated threshold; epsilon guards 0/0
    f1_scores = (2 * prec[:-1] * rec[:-1]) / (prec[:-1] + rec[:-1] + 1e-12)
    best = int(np.nanargmax(f1_scores))
    return float(thresholds[best]), float(f1_scores[best])
106
+
107
@torch.no_grad()
def eval_probs(model, loader, device):
    """Collect sigmoid probabilities and labels over a loader (no gradients)."""
    model.eval()
    labels, probs = [], []
    for X, M, y in loader:
        X = X.to(device)
        M = M.to(device)
        logits = model(X, M)
        probs.append(torch.sigmoid(logits).cpu().numpy())
        labels.append(y.numpy())
    return np.concatenate(labels), np.concatenate(probs)
116
+
117
def train_one_epoch(model, loader, optim, criterion, device):
    """One optimization pass over `loader` (gradients reset per batch)."""
    model.train()
    for X, M, y in loader:
        X = X.to(device)
        M = M.to(device)
        y = y.to(device)
        optim.zero_grad(set_to_none=True)
        loss = criterion(model(X, M), y)
        loss.backward()
        optim.step()
124
+
125
def build_model(model_name, in_dim, params):
    """Construct the classifier named by `model_name` from its Optuna params."""
    dropout = float(params.get("dropout", 0.1))
    if model_name == "mlp":
        return MLPClassifier(in_dim=in_dim, hidden=int(params["hidden"]), dropout=dropout)
    if model_name == "cnn":
        return CNNClassifier(in_ch=in_dim, c=int(params["channels"]), k=int(params["kernel"]),
                             layers=int(params["layers"]), dropout=dropout)
    if model_name == "transformer":
        return TransformerClassifier(in_dim=in_dim, d_model=int(params["d_model"]),
                                     nhead=int(params["nhead"]), layers=int(params["layers"]),
                                     ff=int(params["ff"]), dropout=dropout)
    raise ValueError(model_name)
137
+
138
+ # ======================== Main refit =========================================
139
+
140
def refit_with_seed(dataset_path, base_out_dir, model_name, seed, device="cuda:0"):
    """
    Loads best_params from base_out_dir/best_model.pt (saved by original Optuna run),
    retrains with the given seed, saves results to base_out_dir/seed_{seed}/.
    """
    # Load best params from completed Optuna run
    model_path = os.path.join(base_out_dir, "best_model.pt")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No best_model.pt found at {model_path}. Run Optuna first.")

    # Only the hyper-parameters are reused here; the checkpoint weights are not loaded.
    checkpoint = torch.load(model_path, map_location="cpu")
    best_params = checkpoint["best_params"]
    print(f"Loaded best_params from {model_path}")
    print(json.dumps(best_params, indent=2))

    # Seed
    seed_everything(seed)

    out_dir = os.path.join(base_out_dir, f"seed_{seed}")
    os.makedirs(out_dir, exist_ok=True)

    # Data import
    train_ds, val_ds = load_split(dataset_path)
    print(f"[Data] Train: {len(train_ds)}, Val: {len(val_ds)}")

    batch_size = int(best_params.get("batch_size", 32))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_unpooled, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
                            collate_fn=collate_unpooled, num_workers=4, pin_memory=True)

    in_dim = infer_in_dim_from_unpooled_ds(train_ds)
    model = build_model(model_name, in_dim, best_params).to(device)

    # Loss: weight the positive class by the neg/pos ratio to counter imbalance.
    ytr = np.asarray(train_ds["label"], dtype=np.int64)
    pos, neg = ytr.sum(), len(ytr) - ytr.sum()
    pos_weight = torch.tensor([neg / max(pos, 1)], device=device, dtype=torch.float32)
    criterion = nn.BCEWithLogitsLoss(pos_weight=pos_weight)

    optim = torch.optim.AdamW(model.parameters(),
                              lr=float(best_params["lr"]),
                              weight_decay=float(best_params["weight_decay"]))

    # Training loop with early stopping on validation F1 (best threshold per epoch).
    best_f1, best_thr, bad, patience = -1.0, 0.5, 0, 12
    best_state = None

    for epoch in range(1, 151):
        train_one_epoch(model, train_loader, optim, criterion, device)
        y_true, y_prob = eval_probs(model, val_loader, device)
        thr, f1 = best_f1_threshold(y_true, y_prob)

        # Require a minimal improvement (1e-4) before resetting patience.
        if f1 > best_f1 + 1e-4:
            best_f1 = f1
            best_thr = thr
            bad = 0
            # Keep a CPU copy of the best weights so the final eval uses them.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    if best_state is not None:
        model.load_state_dict(best_state)

    # Final eval: re-tune the decision threshold on the restored best weights.
    y_true_val, y_prob_val = eval_probs(model, val_loader, device)
    best_thr_final, best_f1_final = best_f1_threshold(y_true_val, y_prob_val)
    auc_final = roc_auc_score(y_true_val, y_prob_val)

    # Save per-sample predictions (with sequences when available) for later CI/uncertainty runs.
    df_val = pd.DataFrame({
        "y_true": y_true_val.astype(int),
        "y_prob": y_prob_val.astype(float),
        "y_pred": (y_prob_val >= best_thr_final).astype(int),
    })
    if "sequence" in val_ds.column_names:
        df_val.insert(0, "sequence", np.asarray(val_ds["sequence"]))
    df_val.to_csv(os.path.join(out_dir, "val_predictions.csv"), index=False)

    torch.save({"state_dict": model.state_dict(), "best_params": best_params, "seed": seed},
               os.path.join(out_dir, "model.pt"))

    # metrics.json is the unit read back by aggregate_seed_results().
    summary = {
        "model": model_name,
        "seed": seed,
        "val_f1": round(best_f1_final, 6),
        "val_auc": round(auc_final, 6),
        "val_thr": round(best_thr_final, 6),
    }
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(summary, f, indent=2)

    print(f"\n[Seed {seed}] F1={best_f1_final:.4f} AUC={auc_final:.4f} thr={best_thr_final:.4f}")
    print(f"Saved to {out_dir}")
    return summary
238
+
239
+
240
+ # ======================== CI aggregation =========================================
241
+
242
def aggregate_seed_results(base_out_dir, seeds):
    """
    Aggregate per-seed validation metrics into mean ± 95% CI across seeds.

    Reads ``{base_out_dir}/seed_{seed}/metrics.json`` for each seed (missing
    seeds are warned about and skipped) and writes a summary CSV to
    ``{base_out_dir}/seed_aggregated_metrics.csv``.

    Returns:
        pandas.DataFrame with one row per metric (val_f1, val_auc).

    Raises:
        ValueError: if no seed produced a metrics.json file.
    """
    from scipy import stats

    records = []
    for seed in seeds:
        p = os.path.join(base_out_dir, f"seed_{seed}", "metrics.json")
        if os.path.exists(p):
            # Fix: the original `json.load(open(p))` never closed the file handle.
            with open(p) as fh:
                records.append(json.load(fh))
        else:
            print(f"Warning: missing seed {seed} at {p}")

    if not records:
        raise ValueError("No seed results found.")

    df = pd.DataFrame(records)
    print("\nPer-seed results:")
    print(df.to_string(index=False))

    summary_rows = []
    for metric in ["val_f1", "val_auc"]:
        vals = df[metric].values
        n = len(vals)
        mean = vals.mean()
        if n > 1:
            std = vals.std(ddof=1)
            se = std / np.sqrt(n)
            # Two-sided 95% CI half-width from the t-distribution (n-1 dof).
            ci = stats.t.ppf(0.975, df=n - 1) * se
        else:
            # Fix: with one seed, t.ppf(df=0) is NaN; report zero spread instead.
            std, ci = 0.0, 0.0
        summary_rows.append({
            "metric": metric,
            "mean": round(mean, 4),
            "std": round(std, 4),
            "ci_95": round(ci, 4),
            "report": f"{mean:.4f} ± {ci:.4f}",
            "n_seeds": n,
        })

    summary_df = pd.DataFrame(summary_rows)
    out_path = os.path.join(base_out_dir, "seed_aggregated_metrics.csv")
    summary_df.to_csv(out_path, index=False)

    print("\n=== Aggregated Metrics (95% CI) ===")
    for _, row in summary_df.iterrows():
        print(f"  {row['metric']:12s}: {row['report']} (n={row['n_seeds']})")
    print(f"\nSaved to {out_path}")
    return summary_df
291
+
292
+
293
+ if __name__ == "__main__":
294
+ parser = argparse.ArgumentParser()
295
+ parser.add_argument("--dataset_path", type=str, required=True)
296
+ parser.add_argument("--base_out_dir", type=str, required=True,
297
+ help="Directory containing best_model.pt from Optuna run")
298
+ parser.add_argument("--model", type=str, choices=["mlp", "cnn", "transformer"], required=True)
299
+ parser.add_argument("--seed", type=int, required=True,
300
+ help="Training seed for this run (1986, 42, 0, 123, 12345)")
301
+ parser.add_argument("--aggregate", action="store_true",
302
+ help="After all seeds done: aggregate results into CI summary")
303
+ parser.add_argument("--all_seeds", type=int, nargs="+", default=[1986, 42, 0, 123, 12345],
304
+ help="All seeds to aggregate (used with --aggregate)")
305
+ args = parser.parse_args()
306
+
307
+ if args.aggregate:
308
+ aggregate_seed_results(args.base_out_dir, args.all_seeds)
309
+ else:
310
+ refit_with_seed(
311
+ dataset_path=args.dataset_path,
312
+ base_out_dir=args.base_out_dir,
313
+ model_name=args.model,
314
+ seed=args.seed,
315
+ )
training_classifiers/refit_regression_seed.py ADDED
@@ -0,0 +1,318 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import json
3
+ import argparse
4
+ import numpy as np
5
+ import pandas as pd
6
+ import torch
7
+ import torch.nn as nn
8
+ from torch.utils.data import DataLoader
9
+ from torch.cuda.amp import autocast, GradScaler
10
+ from datasets import load_from_disk, DatasetDict
11
+ from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
12
+ from scipy.stats import spearmanr
13
+ from lightning.pytorch import seed_everything
14
+ from typing import Dict, Optional
15
+
16
# Module-level AMP gradient scaler shared by train_one_epoch; scaling is a
# no-op on CPU-only machines (enabled=False).
scaler_amp = GradScaler(enabled=torch.cuda.is_available())
17
+
18
def load_split(dataset_path):
    """Load a saved HF dataset and return its (train, val) splits."""
    dataset = load_from_disk(dataset_path)
    if not isinstance(dataset, DatasetDict):
        raise ValueError("Expected DatasetDict with 'train' and 'val' splits")
    return dataset["train"], dataset["val"]
23
+
24
def infer_in_dim(ds) -> int:
    """Embedding width: length of the first token vector of the first row."""
    first_token_vec = ds[0]["embedding"][0]
    return int(len(first_token_vec))
26
+
27
def collate_unpooled_reg(batch):
    """Pad variable-length per-residue embeddings into dense tensors.

    Returns (X, M, y): float features (B, Lmax, H), boolean validity mask
    (B, Lmax), and float regression targets (B,).
    """
    seq_lens = [int(item["length"]) for item in batch]
    max_len = max(seq_lens)
    hidden = len(batch[0]["embedding"][0])

    X = torch.zeros(len(batch), max_len, hidden, dtype=torch.float32)
    M = torch.zeros(len(batch), max_len, dtype=torch.bool)
    y = torch.tensor([float(item["label"]) for item in batch], dtype=torch.float32)

    for row, item in enumerate(batch):
        emb = torch.tensor(item["embedding"], dtype=torch.float32)
        n_tok = emb.shape[0]
        X[row, :n_tok] = emb
        if "attention_mask" in item:
            # Respect the dataset's own mask, truncated to the embedding length.
            provided = torch.tensor(item["attention_mask"], dtype=torch.bool)
            M[row, :n_tok] = provided[:n_tok]
        else:
            M[row, :n_tok] = True
    return X, M, y
46
+
47
+ # ======================== Models =========================================
48
+
49
class MaskedMeanPool(nn.Module):
    """Mean over the sequence axis, counting only positions where mask is True."""

    def forward(self, X, M):
        weights = M.unsqueeze(-1).float()
        total = (X * weights).sum(dim=1)
        # clamp avoids division by zero for fully-masked rows.
        count = weights.sum(dim=1).clamp(min=1.0)
        return total / count
53
+
54
class MLPRegressor(nn.Module):
    """Masked-mean pooling followed by a single-hidden-layer MLP head."""

    def __init__(self, in_dim, hidden=512, dropout=0.1):
        super().__init__()
        self.pool = MaskedMeanPool()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden),
            nn.GELU(),
            nn.Dropout(dropout),
            nn.Linear(hidden, 1),
        )

    def forward(self, X, M):
        pooled = self.pool(X, M)
        return self.net(pooled).squeeze(-1)
63
+
64
class CNNRegressor(nn.Module):
    """1-D conv stack over residue embeddings, masked-mean pooled into a linear head."""

    def __init__(self, in_ch, c=256, k=5, layers=2, dropout=0.1):
        super().__init__()
        stages = []
        prev = in_ch
        for _ in range(layers):
            stages.append(nn.Conv1d(prev, c, kernel_size=k, padding=k // 2))
            stages.append(nn.GELU())
            stages.append(nn.Dropout(dropout))
            prev = c
        self.conv = nn.Sequential(*stages)
        self.head = nn.Linear(c, 1)

    def forward(self, X, M):
        # Conv1d wants (B, C, L); transpose in and back out.
        feats = self.conv(X.transpose(1, 2)).transpose(1, 2)
        weights = M.unsqueeze(-1).float()
        pooled = (feats * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.head(pooled).squeeze(-1)
77
+
78
class TransformerRegressor(nn.Module):
    """Projected transformer encoder with masked-mean pooling and a linear head."""

    def __init__(self, in_dim, d_model=256, nhead=8, layers=2, ff=512, dropout=0.1):
        super().__init__()
        self.proj = nn.Linear(in_dim, d_model)
        encoder_layer = nn.TransformerEncoderLayer(
            d_model=d_model,
            nhead=nhead,
            dim_feedforward=ff,
            dropout=dropout,
            batch_first=True,
            activation="gelu",
        )
        self.enc = nn.TransformerEncoder(encoder_layer, num_layers=layers)
        self.head = nn.Linear(d_model, 1)

    def forward(self, X, M):
        # src_key_padding_mask marks PAD positions, hence the inversion of M.
        Z = self.enc(self.proj(X), src_key_padding_mask=~M)
        weights = M.unsqueeze(-1).float()
        pooled = (Z * weights).sum(dim=1) / weights.sum(dim=1).clamp(min=1.0)
        return self.head(pooled).squeeze(-1)
92
+
93
+ # ======================== utils =========================================
94
+
95
def safe_spearmanr(y_true, y_pred):
    """Spearman rho, mapped to 0.0 when undefined (e.g. constant input)."""
    rho = spearmanr(y_true, y_pred).correlation
    if rho is None or np.isnan(rho):
        return 0.0
    return float(rho)
98
+
99
def eval_regression(y_true, y_pred) -> Dict[str, float]:
    """Compute Spearman rho, RMSE, MAE and R^2 as a dict of plain floats."""
    # sklearn >= 1.4 ships a dedicated RMSE helper; older versions fall back
    # to sqrt(MSE).
    try:
        from sklearn.metrics import root_mean_squared_error
        rmse_value = float(root_mean_squared_error(y_true, y_pred))
    except Exception:
        rmse_value = float(np.sqrt(mean_squared_error(y_true, y_pred)))
    return {
        "spearman_rho": safe_spearmanr(y_true, y_pred),
        "rmse": rmse_value,
        "mae": float(mean_absolute_error(y_true, y_pred)),
        "r2": float(r2_score(y_true, y_pred)),
    }
111
+
112
def score_from_metrics(metrics, objective):
    """Map an objective name to a maximisable scalar (RMSE is negated)."""
    candidates = {
        "spearman": metrics["spearman_rho"],
        "neg_rmse": -metrics["rmse"],
        "r2": metrics["r2"],
    }
    return candidates[objective]
116
+
117
@torch.no_grad()
def eval_preds(model, loader, device):
    """Collect (labels, raw predictions) over an entire loader as numpy arrays."""
    model.eval()
    labels, preds = [], []
    for feats, mask, target in loader:
        feats = feats.to(device)
        mask = mask.to(device)
        preds.append(model(feats, mask).cpu().numpy())
        labels.append(target.numpy())
    return np.concatenate(labels), np.concatenate(preds)
126
+
127
def train_one_epoch(model, loader, optim, criterion, device):
    # One full optimisation pass over `loader` using mixed precision.
    # Relies on the module-level GradScaler `scaler_amp`; both autocast and the
    # scaler are effectively disabled on CPU-only machines (enabled=False).
    # NOTE(review): torch.cuda.amp.autocast/GradScaler are the legacy AMP API;
    # newer torch prefers torch.amp.autocast("cuda") — confirm torch version.
    model.train()
    for X, M, y in loader:
        X, M, y = X.to(device), M.to(device), y.to(device)
        optim.zero_grad(set_to_none=True)
        with autocast(enabled=torch.cuda.is_available()):
            loss = criterion(model(X, M), y)
        # Scaled backward + step + update is the standard GradScaler sequence.
        scaler_amp.scale(loss).backward()
        scaler_amp.step(optim)
        scaler_amp.update()
137
+
138
def build_model(model_name, in_dim, params):
    """Instantiate a regressor head from an Optuna hyper-parameter dict.

    ``model_name`` is one of "mlp" / "cnn" / "transformer"; anything else
    raises ValueError.
    """
    p_drop = float(params.get("dropout", 0.1))
    if model_name == "mlp":
        return MLPRegressor(in_dim=in_dim, hidden=int(params["hidden"]), dropout=p_drop)
    if model_name == "cnn":
        return CNNRegressor(
            in_ch=in_dim,
            c=int(params["channels"]),
            k=int(params["kernel"]),
            layers=int(params["layers"]),
            dropout=p_drop,
        )
    if model_name == "transformer":
        return TransformerRegressor(
            in_dim=in_dim,
            d_model=int(params["d_model"]),
            nhead=int(params["nhead"]),
            layers=int(params["layers"]),
            ff=int(params["ff"]),
            dropout=p_drop,
        )
    raise ValueError(model_name)
150
+
151
+ # ======================== Refit Loop =========================================
152
+
153
def refit_with_seed(dataset_path, base_out_dir, model_name, seed,
                    objective="spearman", device="cuda:0"):
    """Retrain the best Optuna regression config with a fresh seed.

    Loads best_params from base_out_dir/best_model.pt, retrains from scratch
    with `seed`, and writes predictions/weights/metrics to
    base_out_dir/seed_{seed}/.
    """
    model_path = os.path.join(base_out_dir, "best_model.pt")
    if not os.path.exists(model_path):
        raise FileNotFoundError(f"No best_model.pt at {model_path}. Run Optuna first.")

    # Only the hyper-parameters are reused; the checkpoint weights are discarded.
    checkpoint = torch.load(model_path, map_location="cpu")
    best_params = checkpoint["best_params"]
    print(f"Loaded best_params from {model_path}")
    print(json.dumps(best_params, indent=2))

    seed_everything(seed)
    out_dir = os.path.join(base_out_dir, f"seed_{seed}")
    os.makedirs(out_dir, exist_ok=True)

    train_ds, val_ds = load_split(dataset_path)
    print(f"[Data] Train: {len(train_ds)}, Val: {len(val_ds)}")

    batch_size = int(best_params.get("batch_size", 32))
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True,
                              collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)
    val_loader = DataLoader(val_ds, batch_size=64, shuffle=False,
                            collate_fn=collate_unpooled_reg, num_workers=4, pin_memory=True)

    in_dim = infer_in_dim(train_ds)
    model = build_model(model_name, in_dim, best_params).to(device)

    # Loss: MSE by default, Huber when the tuned params asked for it.
    loss_name = best_params.get("loss", "mse")
    if loss_name == "mse":
        criterion = nn.MSELoss()
    else:
        criterion = nn.HuberLoss(delta=float(best_params.get("huber_delta", 1.0)))

    optim = torch.optim.AdamW(model.parameters(),
                              lr=float(best_params["lr"]),
                              weight_decay=float(best_params["weight_decay"]))

    # Early stopping on the chosen validation objective (spearman/neg_rmse/r2).
    best_score, bad, patience = -1e18, 0, 15
    best_state, best_metrics = None, {}

    for epoch in range(1, 201):
        train_one_epoch(model, train_loader, optim, criterion, device)
        y_true, y_pred = eval_preds(model, val_loader, device)
        metrics = eval_regression(y_true, y_pred)
        score = score_from_metrics(metrics, objective)

        # Require a minimal improvement (1e-6) before resetting patience.
        if score > best_score + 1e-6:
            best_score = score
            best_metrics = metrics
            bad = 0
            # Keep a CPU copy of the best weights so final eval uses them.
            best_state = {k: v.detach().cpu().clone() for k, v in model.state_dict().items()}
        else:
            bad += 1
            if bad >= patience:
                print(f"Early stopping at epoch {epoch}")
                break

    if best_state:
        model.load_state_dict(best_state)

    # Final metrics from the restored best weights.
    y_true_val, y_pred_val = eval_preds(model, val_loader, device)
    final_metrics = eval_regression(y_true_val, y_pred_val)

    # Per-sample predictions (with residuals) for downstream CI/uncertainty work.
    df_val = pd.DataFrame({
        "y_true": y_true_val.astype(float),
        "y_pred": y_pred_val.astype(float),
        "residual": (y_true_val - y_pred_val).astype(float),
        "abs_error": np.abs(y_true_val - y_pred_val).astype(float),
    })
    if "sequence" in val_ds.column_names:
        df_val.insert(0, "sequence", np.asarray(val_ds["sequence"]))
    df_val.to_csv(os.path.join(out_dir, "val_predictions.csv"), index=False)

    torch.save({"state_dict": model.state_dict(), "best_params": best_params, "seed": seed},
               os.path.join(out_dir, "model.pt"))

    # metrics.json is the unit read back by aggregate_seed_results().
    summary = {"model": model_name, "seed": seed, **{k: round(v, 6) for k, v in final_metrics.items()}}
    with open(os.path.join(out_dir, "metrics.json"), "w") as f:
        json.dump(summary, f, indent=2)

    print(f"\n[Seed {seed}] rho={final_metrics['spearman_rho']:.4f} "
          f"RMSE={final_metrics['rmse']:.4f} R2={final_metrics['r2']:.4f}")
    return summary
237
+
238
+ # ======================== CI aggregation =========================================
239
+
240
def aggregate_seed_results(base_out_dir, seeds):
    """
    Aggregate per-seed regression metrics into mean ± 95% CI.

    Uses a t-distribution CI for Spearman rho, RMSE, MAE and R2; when the rho
    CI approaches the ±1 boundary, a note suggests a Fisher z-transform CI.

    Reads ``{base_out_dir}/seed_{seed}/metrics.json`` per seed and writes
    ``{base_out_dir}/seed_aggregated_metrics.csv``.

    Returns:
        pandas.DataFrame with one row per metric.

    Raises:
        ValueError: if no seed produced a metrics.json file.
    """
    from scipy import stats

    records = []
    for seed in seeds:
        p = os.path.join(base_out_dir, f"seed_{seed}", "metrics.json")
        if os.path.exists(p):
            # Fix: the original `json.load(open(p))` never closed the file handle.
            with open(p) as fh:
                records.append(json.load(fh))
        else:
            print(f"Warning: missing seed {seed}")

    if not records:
        raise ValueError("No seed results found.")

    df = pd.DataFrame(records)
    print("\nPer-seed results:")
    print(df.to_string(index=False))

    summary_rows = []
    for metric in ["spearman_rho", "rmse", "mae", "r2"]:
        vals = df[metric].values
        n = len(vals)
        mean = vals.mean()
        if n > 1:
            std = vals.std(ddof=1)
            se = std / np.sqrt(n)
            # Two-sided 95% CI half-width from the t-distribution (n-1 dof).
            ci = stats.t.ppf(0.975, df=n - 1) * se
        else:
            # Fix: with one seed, t.ppf(df=0) is NaN; report zero spread instead.
            std, ci = 0.0, 0.0
        row = {
            "metric": metric,
            "mean": round(mean, 4),
            "std": round(std, 4),
            "ci_95": round(ci, 4),
            "report": f"{mean:.4f} ± {ci:.4f}",
            "n_seeds": n,
        }
        # Flag if rho is high enough that the t-CI boundary might exceed 1.0
        if metric == "spearman_rho" and (mean + ci > 0.95 or mean - ci < -0.95):
            row["note"] = "rho near boundary — consider Fisher z-transform CI"
        summary_rows.append(row)

    summary_df = pd.DataFrame(summary_rows)
    out_path = os.path.join(base_out_dir, "seed_aggregated_metrics.csv")
    summary_df.to_csv(out_path, index=False)

    print("\n=== Aggregated Metrics (95% CI, t-distribution) ===")
    for _, row in summary_df.iterrows():
        note = f"  ← {row['note']}" if "note" in row and pd.notna(row.get("note")) else ""
        print(f"  {row['metric']:15s}: {row['report']}{note}")
    print(f"\nSaved to {out_path}")
    return summary_df
295
+
296
+
297
+ if __name__ == "__main__":
298
+ parser = argparse.ArgumentParser()
299
+ parser.add_argument("--dataset_path", type=str, required=True)
300
+ parser.add_argument("--base_out_dir", type=str, required=True)
301
+ parser.add_argument("--model", type=str, choices=["mlp", "cnn", "transformer"], required=True)
302
+ parser.add_argument("--seed", type=int, required=True)
303
+ parser.add_argument("--objective", type=str, default="spearman",
304
+ choices=["spearman", "neg_rmse", "r2"])
305
+ parser.add_argument("--aggregate", action="store_true")
306
+ parser.add_argument("--all_seeds", type=int, nargs="+", default=[1986, 42, 0, 123, 12345])
307
+ args = parser.parse_args()
308
+
309
+ if args.aggregate:
310
+ aggregate_seed_results(args.base_out_dir, args.all_seeds)
311
+ else:
312
+ refit_with_seed(
313
+ dataset_path=args.dataset_path,
314
+ base_out_dir=args.base_out_dir,
315
+ model_name=args.model,
316
+ seed=args.seed,
317
+ objective=args.objective,
318
+ )
training_classifiers/src_bash/binding_refit.bash ADDED
@@ -0,0 +1,56 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# SLURM array job: refit the binding-affinity model once per seed (indices 0..4).
#SBATCH --job-name=ba-refit-seed
#SBATCH --partition=dgx-b200
#SBATCH --gpus=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=200G
#SBATCH --time=24:00:00
#SBATCH --output=%x_%A_%a.out
#SBATCH --array=0-4  # 5 seeds → indices 0..4

HOME_LOC=~/
SCRIPT_LOC=$HOME_LOC/PeptiVerse/training_classifiers
# NOTE(review): other scripts in this commit use "training_data_cleaned"
# (with trailing "ed"); confirm "training_data_clean" is the intended path.
ALT_EMB_LOC=$HOME_LOC/PeptiVerse/training_data_clean

# ── Configure per submission ──────────────────────────────────────────
BINDER_MODEL='wt'            # chemberta / peptideclm / wt
MODE='pooled'                # pooled / unpooled

# wt-wt
DATA_PATH="${SCRIPT_LOC}/binding_affinity/pair_wt_wt_${MODE}"
BASE_OUT_DIR="${SCRIPT_LOC}/binding_affinity/wt_wt_${MODE}"

# wt-smiles (chemberta or peptideclm)
#DATA_PATH="${ALT_EMB_LOC}/binding_affinity/${BINDER_MODEL}/pair_wt_smiles_${MODE}"
#BASE_OUT_DIR="${SCRIPT_LOC}/binding_affinity/${BINDER_MODEL}_smiles_${MODE}"
# ────────────────────────────────────────────────────────────────────────────

# Seed is selected by the SLURM array index.
SEEDS=(1986 42 0 123 12345)
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID]}

LOG_LOC=$SCRIPT_LOC/src_bash/logs
mkdir -p $LOG_LOC
DATE=$(date +%m_%d)

cd $SCRIPT_LOC

echo "Running: binder=${BINDER_MODEL} mode=${MODE} seed=${SEED}"
echo "  data : ${DATA_PATH}"
echo "  out  : ${BASE_OUT_DIR}"

# Nanosecond wall-clock timing around the training run.
START_TIME=$(date +%s%N)

python -u refit_binding_affinity_seed.py \
    --dataset_path "${DATA_PATH}" \
    --base_out_dir "${BASE_OUT_DIR}" \
    --mode "${MODE}" \
    --seed "${SEED}" \
    > "${LOG_LOC}/${DATE}_ba_refit_${BINDER_MODEL}_${MODE}_seed${SEED}.log" 2>&1

END_TIME=$(date +%s%N)
ELAPSED_S=$(( (END_TIME - START_TIME) / 1000000000 ))
echo "Seed ${SEED} done at $(date) — wall clock: ${ELAPSED_S}s"
# Append one JSON record per run for later wall-clock aggregation.
echo "{\"binder\": \"${BINDER_MODEL}\", \"mode\": \"${MODE}\", \"seed\": ${SEED}, \"wall_s\": ${ELAPSED_S}}" \
    >> "${LOG_LOC}/${DATE}_wall_clock_ba_refit.jsonl"

# NOTE(review): no matching `conda activate` appears in this script — confirm
# the environment is activated by the shell profile or a submission wrapper.
conda deactivate
training_classifiers/src_bash/ml_uncertainty.bash ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
#SBATCH --job-name=ml-walltime
#SBATCH --partition=b200-mig45
#SBATCH --gpus=1
#SBATCH --cpus-per-task=5
#SBATCH --mem=50G
#SBATCH --time=6:00:00
#SBATCH --output=%x_%j.out

# =============================================================================
# Unified Bootstrap CI + Uncertainty + Wall-time Refit
# wt, smiles, chemberta embeddings
# Runs sequentially: bootstrap/uncertainty first, then wall-time refit
# =============================================================================

HOME_LOC=~/
SCRIPT_LOC=$HOME_LOC/PeptiVerse/training_classifiers
ALT_EMB_LOC=$HOME_LOC/PeptiVerse/training_data_cleaned
LOG_LOC=$SCRIPT_LOC/src_bash/logs
mkdir -p $LOG_LOC
DATE=$(date +%m_%d)

cd $SCRIPT_LOC
# =============================================================================
# Helper functions
# =============================================================================

# Bootstrap CI + uncertainty
# $1=OBJECTIVE $2=WT $3=UNCERTAINTY_SCRIPT $4=MODEL_TYPE $5=UNC_MODE
run_bootstrap() {
    local OBJECTIVE=$1
    local WT=$2
    local SCRIPT=$3
    local MODEL_TYPE=$4
    local UNC_MODE=$5

    local VAL_PREDS="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}/val_predictions.csv"
    local OUT_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
    local LOG_FILE="${LOG_LOC}/${DATE}_ci_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"

    # Skip (not fail) when the corresponding training run never produced predictions.
    if [ ! -f "$VAL_PREDS" ]; then
        echo "  [SKIP bootstrap] val_predictions.csv not found: $VAL_PREDS"
        return
    fi

    echo "  [bootstrap ci] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
    python -u "$SCRIPT" \
        --mode ci \
        --val_preds "$VAL_PREDS" \
        --out_dir "$OUT_DIR" \
        --model_name "${MODEL_TYPE}_${WT}" \
        >> "$LOG_FILE" 2>&1

    # Second pass: uncertainty mode (probability- or residual-based per task type).
    echo "  [bootstrap unc] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT} (${UNC_MODE})"
    python -u "$SCRIPT" \
        --mode "$UNC_MODE" \
        --val_preds "$VAL_PREDS" \
        --out_dir "$OUT_DIR" \
        --model_name "${MODEL_TYPE}_${WT}" \
        >> "$LOG_FILE" 2>&1

    echo "     ${OUT_DIR}/"
}

# Wall-time refit
# $1=OBJECTIVE $2=WT $3=MODEL_TYPE $4=DATASET_PATH
run_walltime() {
    local OBJECTIVE=$1
    local WT=$2
    local MODEL_TYPE=$3
    local DATASET_PATH=$4

    local MODEL_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
    local LOG_FILE="${LOG_LOC}/${DATE}_walltime_${MODEL_TYPE}_${OBJECTIVE}_${WT}.log"

    if [ ! -d "$MODEL_DIR" ]; then
        echo "  [SKIP walltime] model_dir not found: $MODEL_DIR"
        return
    fi
    if [ ! -d "$DATASET_PATH" ]; then
        echo "  [SKIP walltime] dataset not found: $DATASET_PATH"
        return
    fi

    echo "  [walltime] ${MODEL_TYPE} / ${OBJECTIVE} / ${WT}"
    python -u refit_ml_walltime.py \
        --model_dir "$MODEL_DIR" \
        --dataset_path "$DATASET_PATH" \
        --logs_dir "$LOG_LOC" \
        >> "$LOG_FILE" 2>&1

    echo "     logged to ${LOG_LOC}/${DATE}_wall_clock_ml.jsonl"
}

# =============================================================================
# Dataset path lookup
# $1=OBJECTIVE $2=WT
# =============================================================================
get_dataset_path() {
    local OBJECTIVE=$1
    local WT=$2

    # NOTE(review): this root differs from ALT_EMB_LOC above
    # (~/projects/Classifier_Weight vs ~/PeptiVerse) — confirm both exist
    # on the target machine.
    local DATA_LOC=$HOME_LOC/projects/Classifier_Weight/training_data_cleaned

    # Unknown (OBJECTIVE, WT) pairs fall through to an empty string, which
    # run_walltime then skips via its directory check.
    case "${OBJECTIVE}|${WT}" in
        # -- wt embeddings (ESM2 / original) ------------------------------
        "hemolysis|wt")                       echo "${DATA_LOC}/hemolysis/hemo_wt_with_embeddings" ;;
        "nf|wt")                              echo "${DATA_LOC}/nf/nf_wt_with_embeddings" ;;
        "solubility|wt")                      echo "${DATA_LOC}/solubility/sol_wt_with_embeddings" ;;
        "permeability_penetrance|wt")         echo "${DATA_LOC}/permeability_penetrance/perm_wt_with_embeddings_pooled" ;;
        # -- smiles embeddings (PeptideCLM) -------------------------------
        "hemolysis|smiles")                   echo "${ALT_EMB_LOC}/hemolysis_peptideclm/hemo_smiles_with_embeddings" ;;
        "nf|smiles")                          echo "${ALT_EMB_LOC}/nf_peptideclm/nf_smiles_with_embeddings" ;;
        "permeability_pampa|smiles")          echo "${ALT_EMB_LOC}/permeability_pampa_peptideclm/pampa_smiles_with_embeddings" ;;
        "permeability_caco2|smiles")          echo "${ALT_EMB_LOC}/permeability_caco2_peptideclm/caco2_smiles_with_embeddings" ;;
        # -- chemberta embeddings -----------------------------------------
        "hemolysis|chemberta")                echo "${ALT_EMB_LOC}/hemolysis_chemberta/hemo_smiles_with_embeddings" ;;
        "nf|chemberta")                       echo "${ALT_EMB_LOC}/nf_chemberta/nf_smiles_with_embeddings" ;;
        "permeability_penetrance|chemberta")  echo "${ALT_EMB_LOC}/permeability_chemberta/perm_smiles_with_embeddings" ;;
        "permeability_penetrance|peptideclm") echo "${ALT_EMB_LOC}/permeability_peptideclm/perm_smiles_with_embeddings" ;;
        "permeability_pampa|chemberta")       echo "${ALT_EMB_LOC}/permeability_pampa_chemberta/pampa_smiles_with_embeddings" ;;
        "permeability_caco2|chemberta")       echo "${ALT_EMB_LOC}/permeability_caco2_chemberta/caco2_smiles_with_embeddings" ;;
        *)
            echo ""
            ;;
    esac
}

# =============================================================================
# SECTION 1 - Classification tasks
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 1: Classification bootstrap + walltime"
echo "============================================================"

CLS_MODEL_TYPES=("svm_gpu" "enet_gpu" "xgb")

# hemolysis, nf - wt + smiles + chemberta
for OBJECTIVE in "hemolysis" "nf"; do
    for WT in "wt" "smiles" "chemberta"; do
        for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
            echo ""
            echo "-- ${OBJECTIVE} / ${WT} / ${MODEL_TYPE} --"
            run_bootstrap "$OBJECTIVE" "$WT" "ml_uncertainty.py" "$MODEL_TYPE" "uncertainty_prob"
            DPATH=$(get_dataset_path "$OBJECTIVE" "$WT")
            run_walltime "$OBJECTIVE" "$WT" "$MODEL_TYPE" "$DPATH"
        done
    done
done

# solubility, permeability_penetrance - wt + chemberta (no smiles embeddings)
for OBJECTIVE in "solubility" "permeability_penetrance"; do
    for WT in "wt" "chemberta"; do
        for MODEL_TYPE in "${CLS_MODEL_TYPES[@]}"; do
            echo ""
            echo "-- ${OBJECTIVE} / ${WT} / ${MODEL_TYPE} --"
            run_bootstrap "$OBJECTIVE" "$WT" "ml_uncertainty.py" "$MODEL_TYPE" "uncertainty_prob"
            DPATH=$(get_dataset_path "$OBJECTIVE" "$WT")
            run_walltime "$OBJECTIVE" "$WT" "$MODEL_TYPE" "$DPATH"
        done
    done
done

# =============================================================================
# SECTION 2 - Regression tasks (PAMPA, Caco-2)
# =============================================================================
echo ""
echo "============================================================"
echo " SECTION 2: Regression bootstrap + walltime"
echo "============================================================"

REG_MODEL_TYPES=("svr" "enet_gpu" "xgb")

for OBJECTIVE in "permeability_pampa" "permeability_caco2"; do
    for WT in "smiles" "chemberta"; do
        for MODEL_TYPE in "${REG_MODEL_TYPES[@]}"; do
            echo ""
            echo "-- ${OBJECTIVE} / ${WT} / ${MODEL_TYPE} --"
            run_bootstrap "$OBJECTIVE" "$WT" "ml_uncertainty_reg.py" "$MODEL_TYPE" "uncertainty_residual"
            DPATH=$(get_dataset_path "$OBJECTIVE" "$WT")
            run_walltime "$OBJECTIVE" "$WT" "$MODEL_TYPE" "$DPATH"
        done
    done
done

echo ""
echo "============================================================"
echo "All runs completed at $(date)"
echo "============================================================"

# NOTE(review): no matching `conda activate` appears in this script — confirm
# the environment is activated elsewhere before submission.
conda deactivate
training_classifiers/src_bash/nn_uncertainty.bash ADDED
@@ -0,0 +1,51 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
#!/bin/bash
# SLURM array job: refit the best NN config once per seed (indices 0..4).
#SBATCH --job-name=refit-seed-array
#SBATCH --partition=dgx-b200
#SBATCH --gpus=1
#SBATCH --cpus-per-task=10
#SBATCH --mem=100G
#SBATCH --time=12:00:00
#SBATCH --output=%x_%A_%a.out
#SBATCH --array=0-4  # 5 seeds → indices 0..4

HOME_LOC=~/
SCRIPT_LOC=$HOME_LOC/PeptiVerse/training_classifiers
DATA_LOC=$HOME_LOC/PeptiVerse/training_data_cleaned
# ── Configure per submission ──────────────────────────────────────────
OBJECTIVE='permeability_pampa'  # nf / solubility / hemolysis / permeability_penetrance/ permeability_pampa / permeability_caco2
WT='chemberta'                  # wt / smiles / chemberta / peptideclm
MODEL_TYPE='mlp'                # mlp / cnn / transformer
# NOTE(review): DATA_FILE hard-codes the "hemo_" prefix while OBJECTIVE is
# permeability_pampa and DATASET_PATH below uses "permeability_${WT}" —
# confirm these three settings are mutually consistent before submitting.
DATA_FILE="hemo_${WT}_with_embeddings_unpooled"  # nf / sol/ hemo / perm / pampa/ caco2
# Points to the directory where Optuna already saved best_model.pt
BASE_OUT_DIR="${SCRIPT_LOC}/${OBJECTIVE}/${MODEL_TYPE}_${WT}"
DATASET_PATH="${DATA_LOC}/permeability_${WT}/${DATA_FILE}"
# ────────────────────────────────────────────────────────────────────────────

# Seed is selected by the SLURM array index.
SEEDS=(1986 42 0 123 12345)
SEED=${SEEDS[$SLURM_ARRAY_TASK_ID]}

LOG_LOC=$SCRIPT_LOC/src_bash/logs
mkdir -p $LOG_LOC
DATE=$(date +%m_%d)

cd $SCRIPT_LOC

echo "Running seed=$SEED model=$MODEL_TYPE objective=$OBJECTIVE wt=$WT"

# Nanosecond wall-clock timing around the training run.
START_TIME=$(date +%s%N)

python -u refit_nn_seed.py \
    --dataset_path "${DATASET_PATH}" \
    --base_out_dir "${BASE_OUT_DIR}" \
    --model "${MODEL_TYPE}" \
    --seed "${SEED}" \
    > "${LOG_LOC}/${DATE}_refit_${MODEL_TYPE}_${OBJECTIVE}_${WT}_seed${SEED}.log" 2>&1

END_TIME=$(date +%s%N)
ELAPSED_S=$(( (END_TIME - START_TIME) / 1000000000 ))

echo "Seed $SEED done at $(date) — wall clock: ${ELAPSED_S}s"
# Append one JSON record per run for later wall-clock aggregation.
echo "{\"model\": \"${MODEL_TYPE}\", \"objective\": \"${OBJECTIVE}\", \"wt\": \"${WT}\", \"seed\": ${SEED}, \"wall_s\": ${ELAPSED_S}}" \
    >> "${LOG_LOC}/${DATE}_wall_clock_refit.jsonl"
training_classifiers/train_ml.py CHANGED
@@ -55,11 +55,9 @@ def _stack_embeddings(col) -> np.ndarray:
55
  def load_split_data(dataset_path: str) -> SplitData:
56
  ds = load_from_disk(dataset_path)
57
 
58
- # Case A: DatasetDict with train/val
59
  if isinstance(ds, DatasetDict) and "train" in ds and "val" in ds:
60
  train_ds, val_ds = ds["train"], ds["val"]
61
  else:
62
- # Case B: Single dataset with "split" column
63
  if "split" not in ds.column_names:
64
  raise ValueError(
65
  "Dataset must be a DatasetDict(train/val) or have a 'split' column."
@@ -201,7 +199,6 @@ def train_svm(X_train, y_train, X_val, y_val, params):
201
  def train_linearsvm_calibrated(X_train, y_train, X_val, y_val, params):
202
  """
203
  Fast linear SVM (LinearSVC) + probability calibration.
204
- Usually much faster than SVC on large datasets.
205
  """
206
  base = LinearSVC(
207
  C=float(params["C"]),
 
55
  def load_split_data(dataset_path: str) -> SplitData:
56
  ds = load_from_disk(dataset_path)
57
 
 
58
  if isinstance(ds, DatasetDict) and "train" in ds and "val" in ds:
59
  train_ds, val_ds = ds["train"], ds["val"]
60
  else:
 
61
  if "split" not in ds.column_names:
62
  raise ValueError(
63
  "Dataset must be a DatasetDict(train/val) or have a 'split' column."
 
199
  def train_linearsvm_calibrated(X_train, y_train, X_val, y_val, params):
200
  """
201
  Fast linear SVM (LinearSVC) + probability calibration.
 
202
  """
203
  base = LinearSVC(
204
  C=float(params["C"]),
training_data_cleaned/binding_affinity/binding_affinity_smiles_meta_with_split.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:351f3a76e9dcd50191d8408d6b15a8133eb519d2f463c83c1e7934c0514c6d78
3
- size 4454310
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3aee738ef2b17343ae69723a75473821b4188a196a55dacd0286ec47d065d531
3
+ size 4436974
training_data_cleaned/binding_affinity/binding_affinity_wt_meta_with_split.csv CHANGED
@@ -1,3 +1,3 @@
1
  version https://git-lfs.github.com/spec/v1
2
- oid sha256:d8bd2ec03e42b503e502bcfb88b567c64da77daaf6f2b79ce1142d187cc79bd0
3
- size 3714505
 
1
  version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b7abc47729fa52a9f0aa68bffc6dd8c6562d0e4621d437a3a939c4ab27f46d80
3
+ size 3704486
training_data_cleaned/binding_affinity_split.py CHANGED
@@ -1,62 +1,77 @@
1
- import os
2
  import math
3
- from pathlib import Path
4
  import sys
5
  from contextlib import contextmanager
 
 
6
  import numpy as np
7
  import pandas as pd
8
  import torch
 
9
  from tqdm import tqdm
10
- from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
11
- from datasets import Dataset, DatasetDict, Features, Value, Sequence as HFSequence
12
- from transformers import AutoTokenizer, EsmModel, AutoModelForMaskedLM
13
- from lightning.pytorch import seed_everything
14
- seed_everything(1986)
15
 
16
- CSV_PATH = Path("./Classifier_Weight/training_data_cleaned/binding_affinity/c-binding_with_openfold_scores.csv")
 
 
17
 
18
- OUT_ROOT = Path(
19
- "./Classifier_Weight/training_data_cleaned/binding_affinity"
20
- )
21
 
22
- # WT embedding model
23
- WT_MODEL_NAME = "facebook/esm2_t33_650M_UR50D"
24
- WT_MAX_LEN = 1022
25
- WT_BATCH = 32
26
 
27
- # SMILES embedding model + tokenizer
28
- SMI_MODEL_NAME = "aaronfeller/PeptideCLM-23M-all"
29
- TOKENIZER_VOCAB = "./Classifier_Weight/tokenizer/new_vocab.txt"
30
- TOKENIZER_SPLITS = "./Classifier_Weight/tokenizer/new_splits.txt"
31
- SMI_MAX_LEN = 768
32
- SMI_BATCH = 128
33
 
34
- # Split config
35
- TRAIN_FRAC = 0.80
36
- RANDOM_SEED = 1986
37
- AFFINITY_Q_BINS = 30
38
-
39
- COL_SEQ1 = "seq1"
40
- COL_SEQ2 = "seq2"
41
- COL_AFF = "affinity"
42
- COL_F2S = "Fasta2SMILES"
43
- COL_REACT = "REACT_SMILES"
44
- COL_WT_IPTM = "wt_iptm_score"
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
45
  COL_SMI_IPTM = "smiles_iptm_score"
46
 
47
- # Device
 
 
 
 
 
 
 
 
48
  DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
49
 
50
 
51
- QUIET = True
52
- USE_TQDM = False
53
- LOG_FILE = None
54
 
55
  def log(msg: str):
56
- if LOG_FILE is not None:
57
- Path(LOG_FILE).parent.mkdir(parents=True, exist_ok=True)
58
- with open(LOG_FILE, "a") as f:
59
- f.write(msg.rstrip() + "\n")
60
  if not QUIET:
61
  print(msg)
62
 
@@ -70,14 +85,22 @@ def section(title: str):
70
  log(f"=== done: {title} ===")
71
 
72
 
73
- # -------------------------
74
- # Helpers
75
- # -------------------------
 
76
  def has_uaa(seq: str) -> bool:
77
  return "X" in str(seq).upper()
78
 
 
 
 
 
 
 
 
 
79
  def affinity_to_class(a: float) -> str:
80
- # High: >= 9 ; Moderate: [7, 9) ; Low: < 7
81
  if a >= 9.0:
82
  return "High"
83
  elif a >= 7.0:
@@ -87,10 +110,8 @@ def affinity_to_class(a: float) -> str:
87
 
88
  def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
89
  df = df.copy()
90
-
91
  df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
92
  df = df.dropna(subset=[COL_AFF]).reset_index(drop=True)
93
-
94
  df["affinity_class"] = df[COL_AFF].apply(affinity_to_class)
95
 
96
  try:
@@ -101,717 +122,446 @@ def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
101
  strat_col = "aff_bin"
102
 
103
  rng = np.random.RandomState(RANDOM_SEED)
104
-
105
  df["split"] = None
106
  for _, g in df.groupby(strat_col, observed=True):
107
  idx = g.index.to_numpy()
108
  rng.shuffle(idx)
109
  n_train = int(math.floor(len(idx) * TRAIN_FRAC))
110
  df.loc[idx[:n_train], "split"] = "train"
111
- df.loc[idx[n_train:], "split"] = "val"
112
-
113
  df["split"] = df["split"].fillna("train")
114
  return df
115
 
116
- def _summ(x):
117
- x = np.asarray(x, dtype=float)
118
- x = x[~np.isnan(x)]
119
- if len(x) == 0:
120
- return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
121
- return {
122
- "n": int(len(x)),
123
- "mean": float(np.mean(x)),
124
- "std": float(np.std(x)),
125
- "p50": float(np.quantile(x, 0.50)),
126
- "p95": float(np.quantile(x, 0.95)),
127
- }
128
-
129
- def _len_stats(seqs):
130
- lens = np.asarray([len(str(s)) for s in seqs], dtype=float)
131
- if len(lens) == 0:
132
- return {"n": 0, "mean": np.nan, "std": np.nan, "p50": np.nan, "p95": np.nan}
133
- return {
134
- "n": int(len(lens)),
135
- "mean": float(lens.mean()),
136
- "std": float(lens.std()),
137
- "p50": float(np.quantile(lens, 0.50)),
138
- "p95": float(np.quantile(lens, 0.95)),
139
- }
140
-
141
- def verify_split_before_embedding(
142
- df2: pd.DataFrame,
143
- affinity_col: str,
144
- split_col: str,
145
- seq_col: str,
146
- iptm_col: str,
147
- aff_class_col: str = "affinity_class",
148
- aff_bins: int = 30,
149
- save_report_prefix: str | None = None,
150
- verbose: bool = False,
151
- ):
152
- df2 = df2.copy()
153
- df2[affinity_col] = pd.to_numeric(df2[affinity_col], errors="coerce")
154
- df2[iptm_col] = pd.to_numeric(df2[iptm_col], errors="coerce")
155
-
156
- assert split_col in df2.columns, f"Missing split col: {split_col}"
157
- assert set(df2[split_col].dropna().unique()).issubset({"train", "val"}), f"Unexpected split values: {df2[split_col].unique()}"
158
- assert df2[affinity_col].notna().any(), "No valid affinity values after coercion."
159
 
160
- try:
161
- df2["_aff_bin_dbg"] = pd.qcut(df2[affinity_col], q=aff_bins, duplicates="drop")
162
- except Exception:
163
- df2["_aff_bin_dbg"] = df2[aff_class_col].astype(str)
164
-
165
- tr = df2[df2[split_col] == "train"].reset_index(drop=True)
166
- va = df2[df2[split_col] == "val"].reset_index(drop=True)
167
-
168
- tr_aff = _summ(tr[affinity_col].to_numpy())
169
- va_aff = _summ(va[affinity_col].to_numpy())
170
- tr_len = _len_stats(tr[seq_col].tolist())
171
- va_len = _len_stats(va[seq_col].tolist())
172
-
173
- # bin drift
174
- bin_ct = (
175
- df2.groupby([split_col, "_aff_bin_dbg"])
176
- .size()
177
- .groupby(level=0)
178
- .apply(lambda s: s / s.sum())
179
- )
180
- tr_bins = bin_ct.loc["train"]
181
- va_bins = bin_ct.loc["val"]
182
- all_bins = tr_bins.index.union(va_bins.index)
183
- tr_bins = tr_bins.reindex(all_bins, fill_value=0.0)
184
- va_bins = va_bins.reindex(all_bins, fill_value=0.0)
185
- max_bin_diff = float(np.max(np.abs(tr_bins.values - va_bins.values)))
186
-
187
- msg = (
188
- f"[split-check] rows={len(df2)} train={len(tr)} val={len(va)} | "
189
- f"aff(mean±std) train={tr_aff['mean']:.3f}±{tr_aff['std']:.3f} val={va_aff['mean']:.3f}±{va_aff['std']:.3f} | "
190
- f"len(p50/p95) train={tr_len['p50']:.1f}/{tr_len['p95']:.1f} val={va_len['p50']:.1f}/{va_len['p95']:.1f} | "
191
- f"max_bin_diff={max_bin_diff:.4f}"
192
- )
193
- log(msg)
194
-
195
- if verbose and (not QUIET):
196
- class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
197
- class_prop = class_ct.div(class_ct.sum(axis=1), axis=0)
198
- print("\n[verbose] affinity_class counts:\n", class_ct)
199
- print("\n[verbose] affinity_class proportions:\n", class_prop.round(4))
200
-
201
- if save_report_prefix is not None:
202
- out = Path(save_report_prefix)
203
- out.parent.mkdir(parents=True, exist_ok=True)
204
-
205
- stats_df = pd.DataFrame([
206
- {"split": "train", **{f"aff_{k}": v for k, v in tr_aff.items()}, **{f"len_{k}": v for k, v in tr_len.items()}},
207
- {"split": "val", **{f"aff_{k}": v for k, v in va_aff.items()}, **{f"len_{k}": v for k, v in va_len.items()}},
208
- ])
209
- class_ct = df2.groupby([split_col, aff_class_col]).size().unstack(fill_value=0)
210
- class_prop = class_ct.div(class_ct.sum(axis=1), axis=0).reset_index()
211
-
212
- stats_df.to_csv(out.with_suffix(".stats.csv"), index=False)
213
- class_prop.to_csv(out.with_suffix(".class_prop.csv"), index=False)
214
-
215
-
216
- # -------------------------
217
- # WT pooled (ESM2)
218
- # -------------------------
219
- @torch.no_grad()
220
- def wt_pooled_embeddings(seqs, tokenizer, model, batch_size=32, max_length=1022):
221
- embs = []
222
- for i in pbar(range(0, len(seqs), batch_size)):
223
- batch = seqs[i:i + batch_size]
224
- inputs = tokenizer(
225
- batch,
226
- padding=True,
227
- truncation=True,
228
- max_length=max_length,
229
- return_tensors="pt",
230
- )
231
- inputs = {k: v.to(DEVICE) for k, v in inputs.items()}
232
- out = model(**inputs)
233
- h = out.last_hidden_state # (B, L, H)
234
 
235
- attn = inputs["attention_mask"].unsqueeze(-1) # (B, L, 1)
236
- summed = (h * attn).sum(dim=1) # (B, H)
237
- denom = attn.sum(dim=1).clamp(min=1e-9) # (B, 1)
238
- pooled = (summed / denom).detach().cpu().numpy()
239
- embs.append(pooled)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
240
 
241
- return np.vstack(embs)
 
 
 
 
242
 
243
 
244
- # -------------------------
245
- # WT unpooled (ESM2)
246
- # -------------------------
247
  @torch.no_grad()
248
- def wt_unpooled_one(seq, tokenizer, model, cls_id, eos_id, max_length=1022):
249
- tok = tokenizer(seq, padding=False, truncation=True, max_length=max_length, return_tensors="pt")
250
- tok = {k: v.to(DEVICE) for k, v in tok.items()}
251
- out = model(**tok)
252
- h = out.last_hidden_state[0] # (L, H)
253
- attn = tok["attention_mask"][0].bool() # (L,)
254
- ids = tok["input_ids"][0]
255
-
256
- keep = attn.clone()
257
- if cls_id is not None:
258
- keep &= (ids != cls_id)
259
- if eos_id is not None:
260
- keep &= (ids != eos_id)
261
-
262
- return h[keep].detach().cpu().to(torch.float16).numpy()
263
-
264
- def build_wt_unpooled_dataset(df_split: pd.DataFrame, out_dir: Path, tokenizer, model):
265
- """
266
- Expects df_split to have:
267
- - target_sequence (seq1)
268
- - sequence (binder seq2; WT binder)
269
- - label, affinity_class, COL_AFF, COL_WT_IPTM
270
- Saves a dataset where each row contains BOTH:
271
- - target_embedding (Lt,H), target_attention_mask, target_length
272
- - binder_embedding (Lb,H), binder_attention_mask, binder_length
273
- """
274
- cls_id = tokenizer.cls_token_id
275
- eos_id = tokenizer.eos_token_id
276
- H = model.config.hidden_size
277
-
278
- features = Features({
279
- "target_sequence": Value("string"),
280
- "sequence": Value("string"),
281
- "label": Value("float32"),
282
- "affinity": Value("float32"),
283
- "affinity_class": Value("string"),
284
-
285
- "target_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
286
- "target_attention_mask": HFSequence(Value("int8")),
287
- "target_length": Value("int64"),
288
-
289
- "binder_embedding": HFSequence(HFSequence(Value("float16"), length=H)),
290
- "binder_attention_mask": HFSequence(Value("int8")),
291
- "binder_length": Value("int64"),
292
-
293
- COL_WT_IPTM: Value("float32"),
294
- COL_AFF: Value("float32"),
295
- })
296
 
297
- def gen_rows(df: pd.DataFrame):
298
- for r in pbar(df.itertuples(index=False), total=len(df)):
299
- tgt = str(getattr(r, "target_sequence")).strip()
300
- bnd = str(getattr(r, "sequence")).strip()
301
-
302
- y = float(getattr(r, "label"))
303
- aff = float(getattr(r, COL_AFF))
304
- acls = str(getattr(r, "affinity_class"))
305
-
306
- iptm = getattr(r, COL_WT_IPTM)
307
- iptm = float(iptm) if pd.notna(iptm) else np.nan
308
-
309
- # token embeddings for target + binder (both ESM)
310
- t_emb = wt_unpooled_one(tgt, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lt,H)
311
- b_emb = wt_unpooled_one(bnd, tokenizer, model, cls_id, eos_id, max_length=WT_MAX_LEN) # (Lb,H)
312
-
313
- t_list = t_emb.tolist()
314
- b_list = b_emb.tolist()
315
- Lt = len(t_list)
316
- Lb = len(b_list)
317
-
318
- yield {
319
- "target_sequence": tgt,
320
- "sequence": bnd,
321
- "label": np.float32(y),
322
- "affinity": np.float32(aff),
323
- "affinity_class": acls,
324
-
325
- "target_embedding": t_list,
326
- "target_attention_mask": [1] * Lt,
327
- "target_length": int(Lt),
328
-
329
- "binder_embedding": b_list,
330
- "binder_attention_mask": [1] * Lb,
331
- "binder_length": int(Lb),
332
-
333
- COL_WT_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
334
- COL_AFF: np.float32(aff),
335
- }
336
-
337
- out_dir.mkdir(parents=True, exist_ok=True)
338
- ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
339
- ds.save_to_disk(str(out_dir), max_shard_size="1GB")
340
- return ds
341
-
342
- def build_smiles_unpooled_paired_dataset(df_split: pd.DataFrame, out_dir: Path, wt_tokenizer, wt_model_unpooled,
343
- smi_tok, smi_roformer):
344
  """
345
- df_split must have:
346
- - target_sequence (seq1)
347
- - sequence (binder smiles string)
348
- - label, affinity_class, COL_AFF, COL_SMI_IPTM
349
- Saves rows with:
350
- target_embedding (Lt,Ht) from ESM
351
- binder_embedding (Lb,Hb) from PeptideCLM
352
  """
353
- cls_id = wt_tokenizer.cls_token_id
354
- eos_id = wt_tokenizer.eos_token_id
355
- Ht = wt_model_unpooled.config.hidden_size
356
-
357
- Hb = getattr(smi_roformer.config, "hidden_size", None)
358
- if Hb is None:
359
- Hb = getattr(smi_roformer.config, "dim", None)
360
- if Hb is None:
361
- raise ValueError("Cannot infer Hb from smi_roformer config; print(smi_roformer.config) and set Hb manually.")
362
-
363
- features = Features({
364
- "target_sequence": Value("string"),
365
- "sequence": Value("string"),
366
- "label": Value("float32"),
367
- "affinity": Value("float32"),
368
- "affinity_class": Value("string"),
369
-
370
- "target_embedding": HFSequence(HFSequence(Value("float16"), length=Ht)),
371
- "target_attention_mask": HFSequence(Value("int8")),
372
- "target_length": Value("int64"),
373
-
374
- "binder_embedding": HFSequence(HFSequence(Value("float16"), length=Hb)),
375
- "binder_attention_mask": HFSequence(Value("int8")),
376
- "binder_length": Value("int64"),
377
-
378
- COL_SMI_IPTM: Value("float32"),
379
- COL_AFF: Value("float32"),
 
 
 
 
 
 
 
 
 
380
  })
 
 
381
 
382
- def gen_rows(df: pd.DataFrame):
383
- for r in pbar(df.itertuples(index=False), total=len(df)):
384
- tgt = str(getattr(r, "target_sequence")).strip()
385
- bnd = str(getattr(r, "sequence")).strip()
386
-
387
- y = float(getattr(r, "label"))
388
- aff = float(getattr(r, COL_AFF))
389
- acls = str(getattr(r, "affinity_class"))
390
-
391
- iptm = getattr(r, COL_SMI_IPTM)
392
- iptm = float(iptm) if pd.notna(iptm) else np.nan
393
-
394
- # target token embeddings (ESM)
395
- t_emb = wt_unpooled_one(tgt, wt_tokenizer, wt_model_unpooled, cls_id, eos_id, max_length=WT_MAX_LEN)
396
- t_list = t_emb.tolist()
397
- Lt = len(t_list)
398
-
399
- # binder token embeddings (PeptideCLM)
400
- _, tok_list, mask_list, lengths = smiles_embed_batch_return_both(
401
- [bnd], smi_tok, smi_roformer, max_length=SMI_MAX_LEN
402
- )
403
- b_emb = tok_list[0]
404
- b_list = b_emb.tolist()
405
- Lb = int(lengths[0])
406
- b_mask = mask_list[0].astype(np.int8).tolist()
407
-
408
- yield {
409
- "target_sequence": tgt,
410
- "sequence": bnd,
411
- "label": np.float32(y),
412
- "affinity": np.float32(aff),
413
- "affinity_class": acls,
414
-
415
- "target_embedding": t_list,
416
- "target_attention_mask": [1] * Lt,
417
- "target_length": int(Lt),
418
-
419
- "binder_embedding": b_list,
420
- "binder_attention_mask": [int(x) for x in b_mask],
421
- "binder_length": int(Lb),
422
-
423
- COL_SMI_IPTM: np.float32(iptm) if not np.isnan(iptm) else np.float32(np.nan),
424
- COL_AFF: np.float32(aff),
425
- }
426
-
427
- out_dir.mkdir(parents=True, exist_ok=True)
428
- ds = Dataset.from_generator(lambda: gen_rows(df_split), features=features)
429
- ds.save_to_disk(str(out_dir), max_shard_size="1GB")
430
- return ds
431
-
432
-
433
- # -------------------------
434
- # SMILES pooled + unpooled (PeptideCLM)
435
- # -------------------------
436
- def get_special_ids(tokenizer_obj):
437
- cand = [
438
- getattr(tokenizer_obj, "pad_token_id", None),
439
- getattr(tokenizer_obj, "cls_token_id", None),
440
- getattr(tokenizer_obj, "sep_token_id", None),
441
- getattr(tokenizer_obj, "bos_token_id", None),
442
- getattr(tokenizer_obj, "eos_token_id", None),
443
- getattr(tokenizer_obj, "mask_token_id", None),
444
- ]
445
- return sorted({x for x in cand if x is not None})
446
 
447
- @torch.no_grad()
448
- def smiles_embed_batch_return_both(batch_sequences, tokenizer_obj, model_roformer, max_length):
449
- tok = tokenizer_obj(
450
- batch_sequences,
451
- return_tensors="pt",
452
- padding=True,
453
- truncation=True,
454
- max_length=max_length,
455
- )
456
- input_ids = tok["input_ids"].to(DEVICE)
457
- attention_mask = tok["attention_mask"].to(DEVICE)
458
-
459
- outputs = model_roformer(input_ids=input_ids, attention_mask=attention_mask)
460
- last_hidden = outputs.last_hidden_state # (B, L, H)
461
-
462
- special_ids = get_special_ids(tokenizer_obj)
463
  valid = attention_mask.bool()
464
- if len(special_ids) > 0:
465
- sid = torch.tensor(special_ids, device=DEVICE, dtype=torch.long)
466
- if hasattr(torch, "isin"):
467
- valid = valid & (~torch.isin(input_ids, sid))
468
- else:
469
- m = torch.zeros_like(valid)
470
- for s in special_ids:
471
- m |= (input_ids == s)
472
- valid = valid & (~m)
473
 
474
  valid_f = valid.unsqueeze(-1).float()
475
- summed = torch.sum(last_hidden * valid_f, dim=1)
476
- denom = torch.clamp(valid_f.sum(dim=1), min=1e-9)
477
- pooled = (summed / denom).detach().cpu().numpy()
 
478
 
479
- token_emb_list, mask_list, lengths = [], [], []
480
  for b in range(last_hidden.shape[0]):
481
- emb = last_hidden[b, valid[b]] # (Li, H)
482
- token_emb_list.append(emb.detach().cpu().to(torch.float16).numpy())
483
- li = emb.shape[0]
484
- lengths.append(int(li))
485
- mask_list.append(np.ones((li,), dtype=np.int8))
486
-
487
- return pooled, token_emb_list, mask_list, lengths
488
-
489
- def smiles_generate_embeddings_batched_both(seqs, tokenizer_obj, model_roformer, batch_size, max_length):
490
- pooled_all = []
491
- token_emb_all = []
492
- mask_all = []
493
- lengths_all = []
494
-
495
- for i in pbar(range(0, len(seqs), batch_size)):
496
- batch = seqs[i:i + batch_size]
497
- pooled, tok_list, m_list, lens = smiles_embed_batch_return_both(
498
- batch, tokenizer_obj, model_roformer, max_length
499
- )
500
- pooled_all.append(pooled)
501
- token_emb_all.extend(tok_list)
502
- mask_all.extend(m_list)
503
- lengths_all.extend(lens)
504
-
505
- return np.vstack(pooled_all), token_emb_all, mask_all, lengths_all
506
-
507
- def build_target_cache_from_wt_view(wt_view_train: pd.DataFrame, wt_view_val: pd.DataFrame):
508
- wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
509
- wt_model = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
510
-
511
- # compute target pooled embeddings once
512
- tgt_wt_train = wt_view_train["target_sequence"].astype(str).tolist()
513
- tgt_wt_val = wt_view_val["target_sequence"].astype(str).tolist()
514
-
515
- wt_train_tgt_emb = wt_pooled_embeddings(
516
- tgt_wt_train, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
517
- )
518
- wt_val_tgt_emb = wt_pooled_embeddings(
519
- tgt_wt_val, wt_tok, wt_model, batch_size=WT_BATCH, max_length=WT_MAX_LEN
520
- )
521
-
522
- # build dict: target_sequence -> embedding
523
- train_map = {s: e for s, e in zip(tgt_wt_train, wt_train_tgt_emb)}
524
- val_map = {s: e for s, e in zip(tgt_wt_val, wt_val_tgt_emb)}
525
- return wt_tok, wt_model, wt_train_tgt_emb, wt_val_tgt_emb, train_map, val_map
526
- # -------------------------
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
527
  # Main
528
- # -------------------------
 
529
  def main():
530
- log(f"[INFO] DEVICE: {DEVICE}")
 
 
531
  OUT_ROOT.mkdir(parents=True, exist_ok=True)
532
 
 
 
 
533
  with section("load csv + dedup"):
534
  df = pd.read_csv(CSV_PATH)
535
- for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]:
 
 
 
536
  if c in df.columns:
537
  df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
538
-
539
- # Dedup
540
- DEDUP_COLS = [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT]
541
- df = df.drop_duplicates(subset=DEDUP_COLS).reset_index(drop=True)
542
-
543
- print("Rows after dedup on", DEDUP_COLS, ":", len(df))
544
-
545
- need = [COL_SEQ1, COL_SEQ2, COL_AFF, COL_F2S, COL_REACT, COL_WT_IPTM, COL_SMI_IPTM]
546
- missing = [c for c in need if c not in df.columns]
547
- if missing:
548
- raise ValueError(f"Missing required columns: {missing}")
549
-
550
- # numeric affinity for both branches
551
  df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
552
 
553
- # WT subset + SMILES subset separately
554
- with section("prepare wt/smiles subsets"):
555
- # WT: requires a canonical peptide sequence (no X) + affinity
 
 
 
556
  df_wt = df.copy()
557
  df_wt["wt_sequence"] = df_wt[COL_SEQ2].astype(str).str.strip()
558
- df_wt = df_wt.dropna(subset=[COL_AFF]).reset_index(drop=True)
559
- df_wt = df_wt[df_wt["wt_sequence"].notna() & (df_wt["wt_sequence"] != "")]
560
- df_wt = df_wt[~df_wt["wt_sequence"].str.contains("X", case=False, na=False)].reset_index(drop=True)
561
-
562
- # SMILES: requires affinity + a usable picked SMILES (UAA->REACT, else->Fasta2SMILES)
 
 
 
 
 
563
  df_smi = df.copy()
564
- df_smi = df_smi.dropna(subset=[COL_AFF]).reset_index(drop=True)
565
  df_smi = df_smi[
566
  pd.to_numeric(df_smi[COL_SMI_IPTM], errors="coerce").notna()
567
- ].reset_index(drop=True) # empty iptm means sth wrong with their smiles sequence
568
-
569
- is_uaa = df_smi[COL_SEQ2].astype(str).str.contains("X", case=False, na=False)
570
- df_smi["smiles_sequence"] = np.where(is_uaa, df_smi[COL_REACT], df_smi[COL_F2S])
571
- df_smi["smiles_sequence"] = df_smi["smiles_sequence"].astype(str).str.strip()
572
- df_smi = df_smi[df_smi["smiles_sequence"].notna() & (df_smi["smiles_sequence"] != "")]
573
- df_smi = df_smi[~df_smi["smiles_sequence"].isin(["nan", "None"])].reset_index(drop=True)
574
-
575
- log(f"[counts] WT rows={len(df_wt)} | SMILES rows={len(df_smi)} (after per-branch filtering)")
576
-
577
- # Split separately
578
- with section("split wt and smiles separately"):
579
- df_wt2 = make_distribution_matched_split(df_wt)
 
 
 
 
 
 
 
 
 
 
 
 
580
  df_smi2 = make_distribution_matched_split(df_smi)
581
 
582
- # save split tables
583
- wt_split_csv = OUT_ROOT / "binding_affinity_wt_meta_with_split.csv"
584
- smi_split_csv = OUT_ROOT / "binding_affinity_smiles_meta_with_split.csv"
585
- df_wt2.to_csv(wt_split_csv, index=False)
586
- df_smi2.to_csv(smi_split_csv, index=False)
587
- log(f"Saved WT split meta: {wt_split_csv}")
588
- log(f"Saved SMILES split meta: {smi_split_csv}")
589
-
590
- verify_split_before_embedding(
591
- df2=df_wt2,
592
- affinity_col=COL_AFF,
593
- split_col="split",
594
- seq_col="wt_sequence",
595
- iptm_col=COL_WT_IPTM,
596
- aff_class_col="affinity_class",
597
- aff_bins=AFFINITY_Q_BINS,
598
- save_report_prefix=str(OUT_ROOT / "wt_split_doublecheck_report"),
599
- verbose=False,
600
- )
601
- verify_split_before_embedding(
602
- df2=df_smi2,
603
- affinity_col=COL_AFF,
604
- split_col="split",
605
- seq_col="smiles_sequence",
606
- iptm_col=COL_SMI_IPTM,
607
- aff_class_col="affinity_class",
608
- aff_bins=AFFINITY_Q_BINS,
609
- save_report_prefix=str(OUT_ROOT / "smiles_split_doublecheck_report"),
610
- verbose=False,
611
- )
612
 
613
- # Prepare split views
614
- def prep_view(df_in: pd.DataFrame, binder_seq_col: str, iptm_col: str) -> pd.DataFrame:
615
- out = df_in.copy()
616
- out["target_sequence"] = out[COL_SEQ1].astype(str).str.strip() # <-- NEW
617
- out["sequence"] = out[binder_seq_col].astype(str).str.strip() # binder
618
- out["label"] = pd.to_numeric(out[COL_AFF], errors="coerce")
619
- out[iptm_col] = pd.to_numeric(out[iptm_col], errors="coerce")
620
- out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
621
- out = out.dropna(subset=["target_sequence", "sequence", "label"]).reset_index(drop=True)
622
- return out[["target_sequence", "sequence", "label", "split", iptm_col, COL_AFF, "affinity_class"]]
623
-
624
- wt_view = prep_view(df_wt2, "wt_sequence", COL_WT_IPTM)
625
- smi_view = prep_view(df_smi2, "smiles_sequence", COL_SMI_IPTM)
626
-
627
- # -------------------------
628
- # Split views
629
- # -------------------------
630
- wt_train = wt_view[wt_view["split"] == "train"].reset_index(drop=True)
631
- wt_val = wt_view[wt_view["split"] == "val"].reset_index(drop=True)
632
  smi_train = smi_view[smi_view["split"] == "train"].reset_index(drop=True)
633
  smi_val = smi_view[smi_view["split"] == "val"].reset_index(drop=True)
634
-
635
-
636
- # =========================
637
- # TARGET pooled embeddings (ESM) — SEPARATE per branch
638
- # =========================
639
- with section("TARGET pooled embeddings (ESM) WT + SMILES separately"):
640
- wt_tok = AutoTokenizer.from_pretrained(WT_MODEL_NAME)
641
- wt_esm = EsmModel.from_pretrained(WT_MODEL_NAME).to(DEVICE).eval()
642
-
643
- # ---- WT targets ----
644
- wt_train_tgt_emb = wt_pooled_embeddings(
645
- wt_train["target_sequence"].astype(str).str.strip().tolist(),
646
- wt_tok, wt_esm,
647
- batch_size=WT_BATCH,
648
- max_length=WT_MAX_LEN,
649
- ).astype(np.float32)
650
-
651
- wt_val_tgt_emb = wt_pooled_embeddings(
652
- wt_val["target_sequence"].astype(str).str.strip().tolist(),
653
- wt_tok, wt_esm,
654
- batch_size=WT_BATCH,
655
- max_length=WT_MAX_LEN,
656
- ).astype(np.float32)
657
-
658
- # ---- SMILES targets ----
659
- smi_train_tgt_emb = wt_pooled_embeddings(
660
- smi_train["target_sequence"].astype(str).str.strip().tolist(),
661
- wt_tok, wt_esm,
662
- batch_size=WT_BATCH,
663
- max_length=WT_MAX_LEN,
664
- ).astype(np.float32)
665
-
666
- smi_val_tgt_emb = wt_pooled_embeddings(
667
- smi_val["target_sequence"].astype(str).str.strip().tolist(),
668
- wt_tok, wt_esm,
669
- batch_size=WT_BATCH,
670
- max_length=WT_MAX_LEN,
671
- ).astype(np.float32)
672
-
673
-
674
- # =========================
675
- # WT pooled binder embeddings (binder = WT peptide)
676
- # =========================
677
- with section("WT pooled binder embeddings + save"):
678
- wt_train_emb = wt_pooled_embeddings(
679
- wt_train["sequence"].astype(str).str.strip().tolist(),
680
- wt_tok, wt_esm,
681
- batch_size=WT_BATCH,
682
- max_length=WT_MAX_LEN,
683
- ).astype(np.float32)
684
-
685
- wt_val_emb = wt_pooled_embeddings(
686
- wt_val["sequence"].astype(str).str.strip().tolist(),
687
- wt_tok, wt_esm,
688
- batch_size=WT_BATCH,
689
- max_length=WT_MAX_LEN,
690
- ).astype(np.float32)
691
-
692
- wt_train_ds = Dataset.from_dict({
693
- "target_sequence": wt_train["target_sequence"].tolist(),
694
- "sequence": wt_train["sequence"].tolist(),
695
- "label": wt_train["label"].astype(float).tolist(),
696
- "target_embedding": wt_train_tgt_emb,
697
- "embedding": wt_train_emb,
698
- COL_WT_IPTM: wt_train[COL_WT_IPTM].astype(float).tolist(),
699
- COL_AFF: wt_train[COL_AFF].astype(float).tolist(),
700
- "affinity_class": wt_train["affinity_class"].tolist(),
701
- })
702
-
703
- wt_val_ds = Dataset.from_dict({
704
- "target_sequence": wt_val["target_sequence"].tolist(),
705
- "sequence": wt_val["sequence"].tolist(),
706
- "label": wt_val["label"].astype(float).tolist(),
707
- "target_embedding": wt_val_tgt_emb,
708
- "embedding": wt_val_emb,
709
- COL_WT_IPTM: wt_val[COL_WT_IPTM].astype(float).tolist(),
710
- COL_AFF: wt_val[COL_AFF].astype(float).tolist(),
711
- "affinity_class": wt_val["affinity_class"].tolist(),
712
- })
713
-
714
- wt_pooled_dd = DatasetDict({"train": wt_train_ds, "val": wt_val_ds})
715
- wt_pooled_out = OUT_ROOT / "pair_wt_wt_pooled"
716
- wt_pooled_dd.save_to_disk(str(wt_pooled_out))
717
- log(f"Saved WT pooled -> {wt_pooled_out}")
718
-
719
-
720
- # =========================
721
- # SMILES pooled binder embeddings (binder = SMILES via PeptideCLM)
722
- # =========================
723
- with section("SMILES pooled binder embeddings + save"):
724
- smi_tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
725
- smi_roformer = (
726
- AutoModelForMaskedLM
727
- .from_pretrained(SMI_MODEL_NAME)
728
- .roformer
729
- .to(DEVICE)
730
- .eval()
731
- )
732
-
733
- smi_train_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
734
- smi_train["sequence"].astype(str).str.strip().tolist(),
735
- smi_tok, smi_roformer,
736
- batch_size=SMI_BATCH,
737
- max_length=SMI_MAX_LEN,
738
  )
739
-
740
- smi_val_pooled, _, _, _ = smiles_generate_embeddings_batched_both(
741
- smi_val["sequence"].astype(str).str.strip().tolist(),
742
- smi_tok, smi_roformer,
743
- batch_size=SMI_BATCH,
744
- max_length=SMI_MAX_LEN,
 
745
  )
746
-
747
- smi_train_ds = Dataset.from_dict({
748
- "target_sequence": smi_train["target_sequence"].tolist(),
749
- "sequence": smi_train["sequence"].tolist(),
750
- "label": smi_train["label"].astype(float).tolist(),
751
- "target_embedding": smi_train_tgt_emb,
752
- "embedding": smi_train_pooled.astype(np.float32),
753
- COL_SMI_IPTM: smi_train[COL_SMI_IPTM].astype(float).tolist(),
754
- COL_AFF: smi_train[COL_AFF].astype(float).tolist(),
755
- "affinity_class": smi_train["affinity_class"].tolist(),
756
- })
757
-
758
- smi_val_ds = Dataset.from_dict({
759
- "target_sequence": smi_val["target_sequence"].tolist(),
760
- "sequence": smi_val["sequence"].tolist(),
761
- "label": smi_val["label"].astype(float).tolist(),
762
- "target_embedding": smi_val_tgt_emb,
763
- "embedding": smi_val_pooled.astype(np.float32),
764
- COL_SMI_IPTM: smi_val[COL_SMI_IPTM].astype(float).tolist(),
765
- COL_AFF: smi_val[COL_AFF].astype(float).tolist(),
766
- "affinity_class": smi_val["affinity_class"].tolist(),
767
- })
768
-
769
- smi_pooled_dd = DatasetDict({"train": smi_train_ds, "val": smi_val_ds})
770
- smi_pooled_out = OUT_ROOT / "pair_wt_smiles_pooled"
771
- smi_pooled_dd.save_to_disk(str(smi_pooled_out))
772
- log(f"Saved SMILES pooled -> {smi_pooled_out}")
773
-
774
-
775
- # =========================
776
- # WT unpooled paired (ESM target + ESM binder) + save
777
- # =========================
778
- with section("WT unpooled paired embeddings + save"):
779
- wt_tok_unpooled = wt_tok # reuse tokenizer
780
- wt_esm_unpooled = wt_esm # reuse model
781
-
782
- wt_unpooled_out = OUT_ROOT / "pair_wt_wt_unpooled"
783
- wt_unpooled_dd = DatasetDict({
784
- "train": build_wt_unpooled_dataset(wt_train, wt_unpooled_out / "train",
785
- wt_tok_unpooled, wt_esm_unpooled),
786
- "val": build_wt_unpooled_dataset(wt_val, wt_unpooled_out / "val",
787
- wt_tok_unpooled, wt_esm_unpooled),
788
- })
789
- wt_unpooled_dd.save_to_disk(str(wt_unpooled_out))
790
- log(f"Saved WT unpooled -> {wt_unpooled_out}")
791
-
792
-
793
- # =========================
794
- # SMILES unpooled paired (ESM target + PeptideCLM binder) + save
795
- # =========================
796
- with section("SMILES unpooled paired embeddings + save"):
797
- smi_unpooled_out = OUT_ROOT / "pair_wt_smiles_unpooled"
798
- smi_unpooled_dd = DatasetDict({
799
- "train": build_smiles_unpooled_paired_dataset(
800
- smi_train, smi_unpooled_out / "train",
801
- wt_tok, wt_esm,
802
- smi_tok, smi_roformer
803
- ),
804
- "val": build_smiles_unpooled_paired_dataset(
805
- smi_val, smi_unpooled_out / "val",
806
- wt_tok, wt_esm,
807
- smi_tok, smi_roformer
808
- ),
809
- })
810
- smi_unpooled_dd.save_to_disk(str(smi_unpooled_out))
811
- log(f"Saved SMILES unpooled -> {smi_unpooled_out}")
812
-
813
- log(f"\n[DONE] All datasets saved under: {OUT_ROOT}")
814
 
815
 
816
  if __name__ == "__main__":
817
- main()
 
 
1
  import math
 
2
  import sys
3
  from contextlib import contextmanager
4
+ from pathlib import Path
5
+
6
  import numpy as np
7
  import pandas as pd
8
  import torch
9
+ from datasets import Dataset, DatasetDict
10
  from tqdm import tqdm
11
+ from transformers import AutoModel, AutoModelForMaskedLM, AutoTokenizer, EsmModel
 
 
 
 
12
 
13
+ # ======================
14
+ # CONFIG
15
+ # ======================
16
 
17
+ ROOT = Path("<>") # CHANGE HERE
18
+ PROJ_ROOT = ROOT / "PeptiVerse"
 
19
 
20
+ CSV_PATH = PROJ_ROOT / "training_data" / "c-binding.csv"
 
 
 
21
 
22
+ OUT_ROOT = PROJ_ROOT / "training_data_cleaned" / "binding_affinity"
 
 
 
 
 
23
 
24
+ # ESM2 - target encoder (shared across all branches)
25
+ ESM_MODEL = "facebook/esm2_t33_650M_UR50D"
26
+ ESM_MAX_LEN = 1022
27
+ ESM_BATCH = 32
28
+
29
+ # PeptideCLM - SMILES binder encoder
30
+ sys.path.append(str(PROJ_ROOT))
31
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
32
+
33
+ PEPTIDECLM_MODEL = "aaronfeller/PeptideCLM-23M-all"
34
+ TOKENIZER_VOCAB = str(PROJ_ROOT / "tokenizer" / "new_vocab.txt")
35
+ TOKENIZER_SPLITS = str(PROJ_ROOT / "tokenizer" / "new_splits.txt")
36
+ PEPTIDECLM_MAX_LEN = 768
37
+ PEPTIDECLM_BATCH = 128
38
+
39
+ # ChemBERTa - SMILES binder encoder
40
+ CHEMBERTA_MODEL = "DeepChem/ChemBERTa-77M-MLM"
41
+ CHEMBERTA_MAX_LEN = 512
42
+ CHEMBERTA_BATCH = 128
43
+
44
+ # Which SMILES binder models to run
45
+ RUN_PEPTIDECLM = True
46
+ RUN_CHEMBERTA = True
47
+
48
+ # CSV column names
49
+ COL_SEQ1 = "seq1"
50
+ COL_SEQ2 = "seq2"
51
+ COL_AFF = "affinity"
52
+ COL_F2S = "Fasta2SMILES"
53
+ COL_REACT = "REACT_SMILES"
54
+ COL_MERGE = "Merge_SMILES"
55
+ COL_WT_IPTM = "wt_iptm_score"
56
  COL_SMI_IPTM = "smiles_iptm_score"
57
 
58
+ # Split config
59
+ TRAIN_FRAC = 0.80
60
+ RANDOM_SEED = 1986
61
+ AFFINITY_Q_BINS = 30
62
+
63
+ # Logging
64
+ QUIET = True
65
+ USE_TQDM = False
66
+
67
  DEVICE = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
68
 
69
 
70
+ # ======================
71
+ # Logging / progress
72
+ # ======================
73
 
74
  def log(msg: str):
 
 
 
 
75
  if not QUIET:
76
  print(msg)
77
 
 
85
  log(f"=== done: {title} ===")
86
 
87
 
88
+ # ======================
89
+ # Data Handling
90
+ # ======================
91
+
92
  def has_uaa(seq: str) -> bool:
93
  return "X" in str(seq).upper()
94
 
95
+ def pick_smiles(row) -> str | None:
96
+ """Column Priority: Fasta2SMILES > REACT_SMILES > Merge_SMILES."""
97
+ for col in [COL_F2S, COL_REACT, COL_MERGE]:
98
+ val = row.get(col, None)
99
+ if val is not None and str(val).strip() not in ("", "nan", "None"):
100
+ return str(val).strip()
101
+ return None
102
+
103
  def affinity_to_class(a: float) -> str:
 
104
  if a >= 9.0:
105
  return "High"
106
  elif a >= 7.0:
 
110
 
111
  def make_distribution_matched_split(df: pd.DataFrame) -> pd.DataFrame:
112
  df = df.copy()
 
113
  df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
114
  df = df.dropna(subset=[COL_AFF]).reset_index(drop=True)
 
115
  df["affinity_class"] = df[COL_AFF].apply(affinity_to_class)
116
 
117
  try:
 
122
  strat_col = "aff_bin"
123
 
124
  rng = np.random.RandomState(RANDOM_SEED)
 
125
  df["split"] = None
126
  for _, g in df.groupby(strat_col, observed=True):
127
  idx = g.index.to_numpy()
128
  rng.shuffle(idx)
129
  n_train = int(math.floor(len(idx) * TRAIN_FRAC))
130
  df.loc[idx[:n_train], "split"] = "train"
131
+ df.loc[idx[n_train:], "split"] = "val"
 
132
  df["split"] = df["split"].fillna("train")
133
  return df
134
 
135
+ def prep_view(df_in: pd.DataFrame, binder_seq_col: str, iptm_col: str) -> pd.DataFrame:
136
+ out = df_in.copy()
137
+ out["target_sequence"] = out[COL_SEQ1].astype(str).str.strip()
138
+ out["sequence"] = out[binder_seq_col].astype(str).str.strip()
139
+ out["label"] = pd.to_numeric(out[COL_AFF], errors="coerce")
140
+ out[iptm_col] = pd.to_numeric(out[iptm_col], errors="coerce")
141
+ out[COL_AFF] = pd.to_numeric(out[COL_AFF], errors="coerce")
142
+ out = out.dropna(subset=["target_sequence", "sequence", "label"]).reset_index(drop=True)
143
+ return out[["target_sequence", "sequence", "label", "split",
144
+ iptm_col, COL_AFF, "affinity_class"]]
145
+
146
+
147
+ # ======================
148
+ # Dataset builders
149
+ # ======================
150
+
151
+ def build_pooled_ds(view: pd.DataFrame, iptm_col: str,
152
+ tgt_embs: np.ndarray, bnd_embs: np.ndarray) -> Dataset:
153
+ """Both target and binder are (N, H) pooled float32 arrays."""
154
+ return Dataset.from_dict({
155
+ "target_sequence": view["target_sequence"].tolist(),
156
+ "sequence": view["sequence"].tolist(),
157
+ "label": view["label"].astype(float).tolist(),
158
+ "target_embedding": tgt_embs, # (N, H_esm) float32
159
+ "binder_embedding": bnd_embs, # (N, H_binder) float32
160
+ "affinity": view[COL_AFF].astype(float).tolist(),
161
+ "affinity_class": view["affinity_class"].tolist(),
162
+ iptm_col: view[iptm_col].astype(float).tolist(),
163
+ })
 
 
 
 
 
 
 
 
 
 
 
 
 
 
164
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
165
 
166
+ def build_unpooled_ds(view: pd.DataFrame, iptm_col: str,
167
+ tgt_tok_embs, tgt_masks, tgt_lengths,
168
+ bnd_tok_embs, bnd_masks, bnd_lengths) -> Dataset:
169
+ """
170
+ Per-token lists for both sides.
171
+ target_embedding[i] : (Lt_i, H_esm) float16 ndarray
172
+ binder_embedding[i] : (Lb_i, H_binder) float16 ndarray
173
+ """
174
+ return Dataset.from_dict({
175
+ "target_sequence": view["target_sequence"].tolist(),
176
+ "sequence": view["sequence"].tolist(),
177
+ "label": view["label"].astype(float).tolist(),
178
+
179
+ "target_embedding": tgt_tok_embs,
180
+ "target_attention_mask": tgt_masks,
181
+ "target_length": tgt_lengths,
182
+
183
+ "binder_embedding": bnd_tok_embs,
184
+ "binder_attention_mask": bnd_masks,
185
+ "binder_length": bnd_lengths,
186
+
187
+ "affinity": view[COL_AFF].astype(float).tolist(),
188
+ "affinity_class": view["affinity_class"].tolist(),
189
+ iptm_col: view[iptm_col].astype(float).tolist(),
190
+ })
191
+
192
+
193
+ # ======================
194
+ # ESM2 - shared target encoder
195
+ # ======================
196
 
197
+ def load_esm():
198
+ print(f" Loading ESM2: {ESM_MODEL}")
199
+ tok = AutoTokenizer.from_pretrained(ESM_MODEL)
200
+ model = EsmModel.from_pretrained(ESM_MODEL).to(DEVICE).eval()
201
+ return tok, model
202
 
203
 
 
 
 
204
  @torch.no_grad()
205
+ def embed_esm_pooled(seqs, tok, model) -> np.ndarray:
206
+ """Returns (N, H) float32 - mean-pooled over non-pad tokens."""
207
+ all_embs = []
208
+ for i in pbar(range(0, len(seqs), ESM_BATCH), desc=" ESM2 pooled"):
209
+ batch = seqs[i:i + ESM_BATCH]
210
+ enc = tok(batch, return_tensors="pt", padding=True,
211
+ truncation=True, max_length=ESM_MAX_LEN)
212
+ ids = enc["input_ids"].to(DEVICE)
213
+ mask = enc["attention_mask"].to(DEVICE)
214
+ h = model(input_ids=ids, attention_mask=mask).last_hidden_state
215
+ attn_f = mask.unsqueeze(-1).float()
216
+ pooled = ((h * attn_f).sum(dim=1) /
217
+ attn_f.sum(dim=1).clamp(min=1e-9)).cpu().numpy().astype(np.float32)
218
+ all_embs.append(pooled)
219
+ return np.vstack(all_embs)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
220
 
221
+
222
+ @torch.no_grad()
223
+ def embed_esm_unpooled(seqs, tok, model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
224
  """
225
+ Returns per-token lists (CLS/EOS/pad excluded).
226
+ tok_embs : list of (Lt_i, H) float16 arrays
227
+ masks : list of (Lt_i,) int8 arrays (all-ones)
228
+ lengths : list of int
 
 
 
229
  """
230
+ cls_id = tok.cls_token_id
231
+ eos_id = tok.eos_token_id
232
+
233
+ tok_embs, masks, lengths = [], [], []
234
+ for i in pbar(range(0, len(seqs), ESM_BATCH), desc=" ESM2 unpooled"):
235
+ batch = seqs[i:i + ESM_BATCH]
236
+ enc = tok(batch, return_tensors="pt", padding=True,
237
+ truncation=True, max_length=ESM_MAX_LEN)
238
+ ids = enc["input_ids"].to(DEVICE)
239
+ mask = enc["attention_mask"].to(DEVICE)
240
+ h = model(input_ids=ids, attention_mask=mask).last_hidden_state
241
+
242
+ for b in range(h.shape[0]):
243
+ keep = mask[b].bool()
244
+ if cls_id is not None:
245
+ keep = keep & (ids[b] != cls_id)
246
+ if eos_id is not None:
247
+ keep = keep & (ids[b] != eos_id)
248
+ emb = h[b, keep].cpu().to(torch.float16).numpy()
249
+ tok_embs.append(emb)
250
+ masks.append(np.ones(emb.shape[0], dtype=np.int8))
251
+ lengths.append(emb.shape[0])
252
+ return tok_embs, masks, lengths
253
+
254
+
255
+ # ======================
256
+ # Generic binder embedding helpers
257
+ # ======================
258
+
259
+ def _get_special_ids_t(tokenizer):
260
+ special_ids = sorted({
261
+ x for x in [
262
+ getattr(tokenizer, attr, None)
263
+ for attr in ("pad_token_id", "cls_token_id", "sep_token_id",
264
+ "bos_token_id", "eos_token_id", "mask_token_id")
265
+ ] if x is not None
266
  })
267
+ return (torch.tensor(special_ids, device=DEVICE, dtype=torch.long)
268
+ if special_ids else None)
269
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
270
 
271
+ def _pool_and_unpool(last_hidden, input_ids, attention_mask, special_ids_t):
272
+ """Mean-pool over non-special valid tokens; also return per-token arrays."""
 
 
 
 
 
 
 
 
 
 
 
 
 
 
273
  valid = attention_mask.bool()
274
+ if special_ids_t is not None:
275
+ valid = valid & (~torch.isin(input_ids, special_ids_t))
 
 
 
 
 
 
 
276
 
277
  valid_f = valid.unsqueeze(-1).float()
278
+ pooled = (
279
+ torch.sum(last_hidden * valid_f, dim=1) /
280
+ torch.clamp(valid_f.sum(dim=1), min=1e-9)
281
+ ).cpu().numpy().astype(np.float32)
282
 
283
+ tok_embs, masks, lengths = [], [], []
284
  for b in range(last_hidden.shape[0]):
285
+ emb = last_hidden[b, valid[b]].cpu().to(torch.float16).numpy()
286
+ tok_embs.append(emb)
287
+ masks.append(np.ones(emb.shape[0], dtype=np.int8))
288
+ lengths.append(emb.shape[0])
289
+ return pooled, tok_embs, masks, lengths
290
+
291
+
292
+ # ======================
293
+ # PeptideCLM - SMILES binder encoder
294
+ # ======================
295
+
296
+ def load_peptideclm():
297
+ print(f" Loading PeptideCLM: {PEPTIDECLM_MODEL}")
298
+ tok = SMILES_SPE_Tokenizer(TOKENIZER_VOCAB, TOKENIZER_SPLITS)
299
+ model = (AutoModelForMaskedLM.from_pretrained(PEPTIDECLM_MODEL)
300
+ .roformer.to(DEVICE).eval())
301
+ return tok, model, _get_special_ids_t(tok)
302
+
303
+
304
+ @torch.no_grad()
305
+ def embed_peptideclm(seqs, tok, model, sid_t):
306
+ pooled_all, tok_all, mask_all, len_all = [], [], [], []
307
+ for i in pbar(range(0, len(seqs), PEPTIDECLM_BATCH), desc=" PeptideCLM binder"):
308
+ batch = seqs[i:i + PEPTIDECLM_BATCH]
309
+ enc = tok(batch, return_tensors="pt", padding=True,
310
+ truncation=True, max_length=PEPTIDECLM_MAX_LEN)
311
+ ids = enc["input_ids"].to(DEVICE)
312
+ mask = enc["attention_mask"].to(DEVICE)
313
+ h = model(input_ids=ids, attention_mask=mask).last_hidden_state
314
+ p, t, m, l = _pool_and_unpool(h, ids, mask, sid_t)
315
+ pooled_all.append(p); tok_all.extend(t); mask_all.extend(m); len_all.extend(l)
316
+ return np.vstack(pooled_all), tok_all, mask_all, len_all
317
+
318
+
319
+ # ======================
320
+ # ChemBERTa - SMILES binder encoder
321
+ # ======================
322
+
323
+ def load_chemberta():
324
+ print(f" Loading ChemBERTa: {CHEMBERTA_MODEL}")
325
+ tok = AutoTokenizer.from_pretrained(CHEMBERTA_MODEL)
326
+ model = AutoModel.from_pretrained(CHEMBERTA_MODEL).to(DEVICE).eval()
327
+ return tok, model, _get_special_ids_t(tok)
328
+
329
+
330
+ @torch.no_grad()
331
+ def embed_chemberta(seqs, tok, model, sid_t):
332
+ pooled_all, tok_all, mask_all, len_all = [], [], [], []
333
+ for i in pbar(range(0, len(seqs), CHEMBERTA_BATCH), desc=" ChemBERTa binder"):
334
+ batch = seqs[i:i + CHEMBERTA_BATCH]
335
+ enc = tok(batch, return_tensors="pt", padding=True,
336
+ truncation=True, max_length=CHEMBERTA_MAX_LEN)
337
+ ids = enc["input_ids"].to(DEVICE)
338
+ mask = enc["attention_mask"].to(DEVICE)
339
+ h = model(input_ids=ids, attention_mask=mask).last_hidden_state
340
+ p, t, m, l = _pool_and_unpool(h, ids, mask, sid_t)
341
+ pooled_all.append(p); tok_all.extend(t); mask_all.extend(m); len_all.extend(l)
342
+ return np.vstack(pooled_all), tok_all, mask_all, len_all
343
+
344
+
345
+ # ======================
346
+ # WT branch (ESM2 × ESM2)
347
+ # ======================
348
+
349
+ def run_wt_branch(wt_train: pd.DataFrame, wt_val: pd.DataFrame,
350
+ esm_tok, esm_model):
351
+ print("\n" + "="*55)
352
+ print(" Branch : WT (ESM2 target × ESM2 binder)")
353
+ print("="*55)
354
+
355
+ pooled_splits, unpooled_splits = {}, {}
356
+
357
+ for split_name, view in [("train", wt_train), ("val", wt_val)]:
358
+ print(f"\n [{split_name}] {len(view)} rows")
359
+ targets = view["target_sequence"].tolist()
360
+ binders = view["sequence"].tolist()
361
+
362
+ tgt_pooled = embed_esm_pooled(targets, esm_tok, esm_model)
363
+ bnd_pooled = embed_esm_pooled(binders, esm_tok, esm_model)
364
+
365
+ tgt_tok_embs, tgt_masks, tgt_lengths = embed_esm_unpooled(targets, esm_tok, esm_model)
366
+ bnd_tok_embs, bnd_masks, bnd_lengths = embed_esm_unpooled(binders, esm_tok, esm_model)
367
+
368
+ pooled_splits[split_name] = build_pooled_ds(
369
+ view, COL_WT_IPTM, tgt_pooled, bnd_pooled)
370
+ unpooled_splits[split_name] = build_unpooled_ds(
371
+ view, COL_WT_IPTM,
372
+ tgt_tok_embs, tgt_masks, tgt_lengths,
373
+ bnd_tok_embs, bnd_masks, bnd_lengths)
374
+
375
+ pooled_out = OUT_ROOT / "pair_wt_wt_pooled"
376
+ unpooled_out = OUT_ROOT / "pair_wt_wt_unpooled"
377
+ DatasetDict(pooled_splits).save_to_disk(str(pooled_out))
378
+ DatasetDict(unpooled_splits).save_to_disk(str(unpooled_out))
379
+ print(f"\n WT pooled to {pooled_out}")
380
+ print(f" WT unpooled to {unpooled_out}")
381
+
382
+
383
+ # ======================
384
+ # SMILES branch (ESM2 × {PeptideCLM | ChemBERTa})
385
+ # ======================
386
+
387
+ def run_smiles_binder_model(name: str,
388
+ smi_train: pd.DataFrame, smi_val: pd.DataFrame,
389
+ esm_tok, esm_model,
390
+ load_fn, embed_fn):
391
+ print("\n" + "="*55)
392
+ print(f" Branch : SMILES (ESM2 target × {name} binder)")
393
+ print("="*55)
394
+
395
+ binder_tok, binder_model, sid_t = load_fn()
396
+ pooled_splits, unpooled_splits = {}, {}
397
+
398
+ for split_name, view in [("train", smi_train), ("val", smi_val)]:
399
+ print(f"\n [{split_name}] {len(view)} rows")
400
+ targets = view["target_sequence"].tolist()
401
+ binders = view["sequence"].tolist()
402
+
403
+ print(" ESM2 target - pooled ...")
404
+ tgt_pooled = embed_esm_pooled(targets, esm_tok, esm_model)
405
+
406
+ print(" ESM2 target - unpooled ...")
407
+ tgt_tok_embs, tgt_masks, tgt_lengths = embed_esm_unpooled(
408
+ targets, esm_tok, esm_model)
409
+
410
+ print(f" {name} binder - pooled + unpooled ...")
411
+ bnd_pooled, bnd_tok_embs, bnd_masks, bnd_lengths = embed_fn(
412
+ binders, binder_tok, binder_model, sid_t)
413
+
414
+ pooled_splits[split_name] = build_pooled_ds(
415
+ view, COL_SMI_IPTM, tgt_pooled, bnd_pooled)
416
+ unpooled_splits[split_name] = build_unpooled_ds(
417
+ view, COL_SMI_IPTM,
418
+ tgt_tok_embs, tgt_masks, tgt_lengths,
419
+ bnd_tok_embs, bnd_masks, bnd_lengths)
420
+
421
+ suffix = "" if name.lower() == "peptideclm" else f"_{name.lower()}"
422
+ pooled_out = OUT_ROOT / f"pair_wt_smiles_pooled{suffix}"
423
+ unpooled_out = OUT_ROOT / f"pair_wt_smiles_unpooled{suffix}"
424
+ DatasetDict(pooled_splits).save_to_disk(str(pooled_out))
425
+ DatasetDict(unpooled_splits).save_to_disk(str(unpooled_out))
426
+ print(f"\n {name} pooled to {pooled_out}")
427
+ print(f" {name} unpooled to {unpooled_out}")
428
+
429
+ del binder_model
430
+ torch.cuda.empty_cache()
431
+
432
+
433
+ # ======================
434
  # Main
435
+ # ======================
436
+
437
  def main():
438
+ print(f"Device : {DEVICE}")
439
+ print(f"CSV : {CSV_PATH}")
440
+ print(f"Out : {OUT_ROOT}\n")
441
  OUT_ROOT.mkdir(parents=True, exist_ok=True)
442
 
443
+ # ------------------------------------------------------------------
444
+ # 1. Load + dedup
445
+ # ------------------------------------------------------------------
446
  with section("load csv + dedup"):
447
  df = pd.read_csv(CSV_PATH)
448
+ print(f"Raw rows: {len(df)}")
449
+ df["orig_idx"] = df.index # traceability only
450
+
451
+ for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT, COL_MERGE]:
452
  if c in df.columns:
453
  df[c] = df[c].apply(lambda x: x.strip() if isinstance(x, str) else x)
454
+
455
+ for col in [COL_SEQ1, COL_SEQ2, COL_AFF, COL_F2S, COL_REACT, COL_WT_IPTM, COL_SMI_IPTM]:
456
+ if col not in df.columns:
457
+ raise ValueError(f"Missing required column: '{col}'")
458
+
459
+ dedup_cols = [c for c in [COL_SEQ1, COL_SEQ2, COL_F2S, COL_REACT, COL_MERGE]
460
+ if c in df.columns]
461
+ before = len(df)
462
+ df = df.drop_duplicates(subset=dedup_cols, keep="first").reset_index(drop=True)
463
+ print(f"After dedup pass 1 (raw columns) : {len(df)} (-{before - len(df)})")
464
+
 
 
465
  df[COL_AFF] = pd.to_numeric(df[COL_AFF], errors="coerce")
466
 
467
+ # ------------------------------------------------------------------
468
+ # 2. Prepare per-branch subsets
469
+ # ------------------------------------------------------------------
470
+ with section("prepare WT / SMILES subsets"):
471
+ # ── WT branch ──────────────────────────────────────────────────
472
+ # Both seq1 and seq2 must be canonical (no X) for ESM2
473
  df_wt = df.copy()
474
  df_wt["wt_sequence"] = df_wt[COL_SEQ2].astype(str).str.strip()
475
+ df_wt = df_wt.dropna(subset=[COL_AFF])
476
+ df_wt = df_wt[~df_wt[COL_SEQ1].astype(str).str.contains("X", case=False, na=False)]
477
+ df_wt = df_wt[df_wt["wt_sequence"] != ""]
478
+ df_wt = df_wt[~df_wt["wt_sequence"].str.contains("X", case=False, na=False)]
479
+ df_wt = df_wt.reset_index(drop=True)
480
+
481
+ # ── SMILES branch ──────────────────────────────────────────────
482
+ # seq1 must be canonical (no X) for ESM2; binder SMILES picked
483
+ # by priority (Fasta2SMILES > REACT_SMILES > Merge_SMILES), then
484
+ # dedup pass 2 on (seq1, picked smiles_sequence)
485
  df_smi = df.copy()
486
+ df_smi = df_smi.dropna(subset=[COL_AFF])
487
  df_smi = df_smi[
488
  pd.to_numeric(df_smi[COL_SMI_IPTM], errors="coerce").notna()
489
+ ]
490
+ df_smi = df_smi[~df_smi[COL_SEQ1].astype(str).str.contains("X", case=False, na=False)]
491
+ df_smi = df_smi.reset_index(drop=True)
492
+
493
+ df_smi["smiles_sequence"] = df_smi.apply(pick_smiles, axis=1)
494
+ df_smi = df_smi[df_smi["smiles_sequence"].notna()].reset_index(drop=True)
495
+ print(f"After requiring ≥1 valid SMILES : {len(df_smi)}")
496
+
497
+ # Dedup pass 2: (seq1, picked smiles_sequence)
498
+ before = len(df_smi)
499
+ df_smi = df_smi.drop_duplicates(
500
+ subset=[COL_SEQ1, "smiles_sequence"], keep="first"
501
+ ).reset_index(drop=True)
502
+ print(f"After dedup pass 2 (seq1, smiles_sequence): {len(df_smi)} (-{before - len(df_smi)})")
503
+
504
+ assert not df_smi.duplicated(subset=[COL_SEQ1, "smiles_sequence"]).any(), \
505
+ "BUG: duplicate (seq1, smiles_sequence) pairs remain!"
506
+
507
+ print(f"\n[counts] WT rows={len(df_wt)} | SMILES rows={len(df_smi)}")
508
+
509
+ # ------------------------------------------------------------------
510
+ # 3. Split
511
+ # ------------------------------------------------------------------
512
+ with section("split WT and SMILES separately"):
513
+ df_wt2 = make_distribution_matched_split(df_wt)
514
  df_smi2 = make_distribution_matched_split(df_smi)
515
 
516
+ df_wt2.to_csv(OUT_ROOT / "binding_affinity_wt_meta_with_split.csv", index=False)
517
+ df_smi2.to_csv(OUT_ROOT / "binding_affinity_smiles_meta_with_split.csv", index=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
518
 
519
+ # ------------------------------------------------------------------
520
+ # 4. Build split views
521
+ # ------------------------------------------------------------------
522
+ wt_view = prep_view(df_wt2, "wt_sequence", COL_WT_IPTM)
523
+ smi_view = prep_view(df_smi2, "smiles_sequence", COL_SMI_IPTM)
524
+
525
+ wt_train = wt_view[wt_view["split"] == "train"].reset_index(drop=True)
526
+ wt_val = wt_view[wt_view["split"] == "val"].reset_index(drop=True)
 
 
 
 
 
 
 
 
 
 
 
527
  smi_train = smi_view[smi_view["split"] == "train"].reset_index(drop=True)
528
  smi_val = smi_view[smi_view["split"] == "val"].reset_index(drop=True)
529
+
530
+ print(f"\nSplit sizes - WT: train={len(wt_train)} val={len(wt_val)}")
531
+ print(f"Split sizes - SMILES: train={len(smi_train)} val={len(smi_val)}")
532
+
533
+ # ------------------------------------------------------------------
534
+ # 5. Load ESM2 once - shared across all branches
535
+ # ------------------------------------------------------------------
536
+ print("\nLoading ESM2 (shared target encoder) ...")
537
+ esm_tok, esm_model = load_esm()
538
+
539
+ # ------------------------------------------------------------------
540
+ # 6. WT branch
541
+ # ------------------------------------------------------------------
542
+ run_wt_branch(wt_train, wt_val, esm_tok, esm_model)
543
+
544
+ # ------------------------------------------------------------------
545
+ # 7. SMILES branches
546
+ # ------------------------------------------------------------------
547
+ if RUN_PEPTIDECLM:
548
+ run_smiles_binder_model(
549
+ "peptideclm", smi_train, smi_val,
550
+ esm_tok, esm_model,
551
+ load_fn=load_peptideclm,
552
+ embed_fn=embed_peptideclm,
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
553
  )
554
+
555
+ if RUN_CHEMBERTA:
556
+ run_smiles_binder_model(
557
+ "chemberta", smi_train, smi_val,
558
+ esm_tok, esm_model,
559
+ load_fn=load_chemberta,
560
+ embed_fn=embed_chemberta,
561
  )
562
+
563
+ print(f"\n All done. Datasets saved under: {OUT_ROOT}")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
564
 
565
 
566
  if __name__ == "__main__":
567
+ main()
training_data_cleaned/embed_smiles.py ADDED
@@ -0,0 +1,319 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Pipeline:
3
+ 1. Read *_meta_with_split.csv (sequence, label, id, split)
4
+ 2. Convert wt sequences to SMILES via: fasta2smi -i peptides.fasta -o peptides.p2smi
5
+ 3. Parse .p2smi format: "{seq}-linear: {SMILES}"
6
+ 4. Embed SMILES with ChemBERTa to save pooled + unpooled DatasetDicts
7
+ 5. Embed SMILES with PeptideCLM to save pooled + unpooled DatasetDicts
8
+ """
9
+
10
+ import os
11
+ import subprocess
12
+ import tempfile
13
+ import sys
14
+ import numpy as np
15
+ import torch
16
+ import pandas as pd
17
+ from tqdm import tqdm
18
+ from datasets import Dataset, DatasetDict
19
+ from transformers import AutoTokenizer, AutoModel, AutoModelForMaskedLM
20
+
21
+ PROJECT_ROOT = "<>" # change here
22
+
23
+ # using permeability as example
24
+ META_CSV = (
25
+ f"{PROJECT_ROOT}/training_data_cleaned/"
26
+ "permeability_penetrance/permeability_meta_with_split.csv"
27
+ )
28
+ BASE_OUT = f"{PROJECT_ROOT}/alternative_embeddings"
29
+
30
+ # ChemBERTa
31
+ CHEMBERTA_MODEL = "DeepChem/ChemBERTa-77M-MLM"
32
+ CHEMBERTA_OUT = f"{BASE_OUT}/permeability_chemberta/perm_smiles_with_embeddings"
33
+
34
+ # PeptideCLM
35
+ sys.path.append(PROJECT_ROOT)
36
+ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
37
+
38
+ PEPTIDECLM_MODEL = "aaronfeller/PeptideCLM-23M-all"
39
+ PEPTIDECLM_TOKENIZER = f"{PROJECT_ROOT}/tokenizer/new_vocab.txt"
40
+ PEPTIDECLM_SPLITS = f"{PROJECT_ROOT}/tokenizer/new_splits.txt"
41
+ PEPTIDECLM_OUT = f"{BASE_OUT}/permeability_peptideclm/perm_smiles_with_embeddings"
42
+
43
+ # Column names in the CSV
44
+ SEQ_COL = "sequence"
45
+ LABEL_COL = "label"
46
+ SPLIT_COL = "split"
47
+ ID_COL = "id" # used as FASTA header; must be unique
48
+
49
+ # fasta2smi settings
50
+ FASTA2SMI_BIN = "fasta2smi" # install via github
51
+
52
+ # Embedding settings
53
+ MAX_LENGTH_CHEMBERTA = 512
54
+ MAX_LENGTH_PEPTIDECLM = 768
55
+ BATCH_SIZE = 128
56
+
57
+ device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
58
+
59
+
60
+ # ===========================================================================================
61
+ # Step 1 — fasta2smi conversion, do not apply to properties that only have SMILES sequences
62
+ # ===========================================================================================
63
+ def sequences_to_smiles(sequences: list[str], ids: list[str]) -> dict[str, str]:
64
+ """
65
+ .p2smi format produced by fasta2smi:
66
+ MIIFAIAASHKK-linear: N[C@@H](CCSC)C(=O)...
67
+ KIAKLKAKIQ...-linear: N[C@@H](CCCCN)C(=O)...
68
+ """
69
+ with tempfile.TemporaryDirectory() as tmpdir:
70
+ fasta_path = os.path.join(tmpdir, "peptides.fasta")
71
+ p2smi_path = os.path.join(tmpdir, "peptides.p2smi")
72
+
73
+ with open(fasta_path, "w") as fh:
74
+ for sid, seq in zip(ids, sequences):
75
+ fh.write(f">{sid}\n{seq}\n")
76
+
77
+ cmd = [FASTA2SMI_BIN, "-i", fasta_path, "-o", p2smi_path]
78
+ print(f" Running: {' '.join(cmd)}")
79
+ result = subprocess.run(cmd, capture_output=True, text=True)
80
+ if result.returncode != 0:
81
+ raise RuntimeError(
82
+ f"fasta2smi failed (exit {result.returncode}):\n"
83
+ f" stdout: {result.stdout}\n stderr: {result.stderr}"
84
+ )
85
+
86
+ seq2smi = _parse_p2smi(p2smi_path)
87
+
88
+ n_ok = len(seq2smi)
89
+ n_fail = len(sequences) - n_ok
90
+ print(f" fasta2smi: {n_ok}/{len(sequences)} converted ({n_fail} failed/skipped)")
91
+ return seq2smi
92
+
93
+
94
+ def _parse_p2smi(path: str) -> dict[str, str]:
95
+ seq2smi: dict[str, str] = {}
96
+ with open(path) as fh:
97
+ for line in fh:
98
+ line = line.strip()
99
+ if not line or line.startswith("#"):
100
+ continue
101
+ # Split on "-linear: " — the separator fasta2smi uses
102
+ if "-linear: " not in line:
103
+ print(f" [WARN] Unexpected p2smi line, skipping: {line[:80]}")
104
+ continue
105
+ aa_seq, smi = line.split("-linear: ", maxsplit=1)
106
+ smi = smi.strip()
107
+ if smi and smi.lower() not in ("none", "null", "n/a"):
108
+ seq2smi[aa_seq] = smi
109
+ return seq2smi
110
+
111
+
112
+ # ============================================================
113
+ # Setups
114
+ # ============================================================
115
+ def _get_special_ids_tensor(tokenizer):
116
+ attrs = [
117
+ "pad_token_id", "cls_token_id", "sep_token_id",
118
+ "bos_token_id", "eos_token_id", "mask_token_id",
119
+ ]
120
+ ids = sorted({getattr(tokenizer, a, None) for a in attrs} - {None})
121
+ return torch.tensor(ids, device=device, dtype=torch.long) if ids else None
122
+
123
+
124
+ @torch.no_grad()
125
+ def _embed_batch(tokenizer, model, special_ids_t, sequences, max_length):
126
+ tok = tokenizer(
127
+ sequences, return_tensors="pt",
128
+ padding=True, max_length=max_length, truncation=True,
129
+ )
130
+ input_ids = tok["input_ids"].to(device)
131
+ attention_mask = tok["attention_mask"].to(device)
132
+
133
+ out = model(input_ids=input_ids, attention_mask=attention_mask)
134
+ last_hidden = out.last_hidden_state # (B, L, H)
135
+
136
+ valid = attention_mask.bool()
137
+ if special_ids_t is not None:
138
+ valid = valid & (~torch.isin(input_ids, special_ids_t))
139
+
140
+ valid_f = valid.unsqueeze(-1).float()
141
+ pooled = (
142
+ torch.sum(last_hidden * valid_f, dim=1)
143
+ / torch.clamp(valid_f.sum(dim=1), min=1e-9)
144
+ ).cpu().numpy() # (B, H) float32
145
+
146
+ token_embs, masks, lengths = [], [], []
147
+ for b in range(last_hidden.shape[0]):
148
+ emb = last_hidden[b, valid[b]].cpu().to(torch.float16).numpy()
149
+ token_embs.append(emb)
150
+ masks.append(np.ones(emb.shape[0], dtype=np.int8))
151
+ lengths.append(emb.shape[0])
152
+
153
+ return pooled, token_embs, masks, lengths
154
+
155
+
156
+ def _embed_all(tokenizer, model, special_ids_t, sequences, max_length):
157
+ pooled_all, token_all, mask_all, len_all = [], [], [], []
158
+ for i in tqdm(range(0, len(sequences), BATCH_SIZE), desc=" batches"):
159
+ p, t, m, l = _embed_batch(
160
+ tokenizer, model, special_ids_t,
161
+ sequences[i:i+BATCH_SIZE], max_length,
162
+ )
163
+ pooled_all.append(p)
164
+ token_all.extend(t)
165
+ mask_all.extend(m)
166
+ len_all.extend(l)
167
+ return np.vstack(pooled_all), token_all, mask_all, len_all
168
+
169
+
170
+ def _build_datasets(wt_seqs, smiles, labels, tokenizer, model, special_ids_t, max_length):
171
+ pooled, tok_embs, masks, lengths = _embed_all(
172
+ tokenizer, model, special_ids_t, smiles, max_length
173
+ )
174
+ pooled_ds = Dataset.from_dict({
175
+ "sequence": wt_seqs,
176
+ "smiles": smiles,
177
+ "label": labels,
178
+ "embedding": pooled,
179
+ })
180
+ full_ds = Dataset.from_dict({
181
+ "sequence": wt_seqs,
182
+ "smiles": smiles,
183
+ "label": labels,
184
+ "embedding": tok_embs,
185
+ "attention_mask": masks,
186
+ "length": lengths,
187
+ })
188
+ return pooled_ds, full_ds
189
+
190
+
191
+ def _save(splits: dict, out_path: str):
192
+ os.makedirs(os.path.dirname(out_path), exist_ok=True)
193
+ DatasetDict({k: v[0] for k, v in splits.items()}).save_to_disk(out_path)
194
+ DatasetDict({k: v[1] for k, v in splits.items()}).save_to_disk(out_path + "_unpooled")
195
+ print(f" Saved pooled to {out_path}")
196
+ print(f" Saved unpooled to {out_path}_unpooled")
197
+
198
+
199
+ # ============================================================
200
+ # ChemBERTa
201
+ # ============================================================
202
+ def run_chemberta(meta: pd.DataFrame):
203
+ print(f"\n{'='*60}")
204
+ print(" Encoder: ChemBERTa")
205
+ print(f"{'='*60}")
206
+
207
+ print(f" Loading {CHEMBERTA_MODEL} ...")
208
+ tokenizer = AutoTokenizer.from_pretrained(CHEMBERTA_MODEL)
209
+ model = AutoModel.from_pretrained(CHEMBERTA_MODEL).to(device).eval()
210
+ special_ids_t = _get_special_ids_tensor(tokenizer)
211
+
212
+ splits: dict[str, tuple] = {}
213
+ for split_name in ["train", "val"]:
214
+ df = meta[meta[SPLIT_COL] == split_name].reset_index(drop=True)
215
+ print(f"\n [{split_name}] {len(df)} rows")
216
+ if df.empty:
217
+ print(" [WARN] Empty split, skipping.")
218
+ continue
219
+ pooled_ds, full_ds = _build_datasets(
220
+ df[SEQ_COL].tolist(), df["smiles"].tolist(),
221
+ df[LABEL_COL].tolist(),
222
+ tokenizer, model, special_ids_t, MAX_LENGTH_CHEMBERTA,
223
+ )
224
+ splits[split_name] = (pooled_ds, full_ds)
225
+
226
+ _save(splits, CHEMBERTA_OUT)
227
+
228
+ # free GPU memory before loading next model
229
+ del model
230
+ torch.cuda.empty_cache()
231
+
232
+
233
+ # ============================================================
234
+ # PeptideCLM
235
+ # ============================================================
236
+ def run_peptideclm(meta: pd.DataFrame):
237
+ print(f"\n{'='*60}")
238
+ print(" Encoder: PeptideCLM")
239
+ print(f"{'='*60}")
240
+
241
+ print(f" Loading tokenizer from {PEPTIDECLM_TOKENIZER} ...")
242
+ tokenizer = SMILES_SPE_Tokenizer(PEPTIDECLM_TOKENIZER, PEPTIDECLM_SPLITS)
243
+
244
+ print(f" Loading {PEPTIDECLM_MODEL} ...")
245
+ full_model = AutoModelForMaskedLM.from_pretrained(PEPTIDECLM_MODEL)
246
+ model = full_model.roformer.to(device).eval()
247
+ special_ids_t = _get_special_ids_tensor(tokenizer)
248
+
249
+ splits: dict[str, tuple] = {}
250
+ for split_name in ["train", "val"]:
251
+ df = meta[meta[SPLIT_COL] == split_name].reset_index(drop=True)
252
+ print(f"\n [{split_name}] {len(df)} rows")
253
+ if df.empty:
254
+ print(" [WARN] Empty split, skipping.")
255
+ continue
256
+ pooled_ds, full_ds = _build_datasets(
257
+ df[SEQ_COL].tolist(), df["smiles"].tolist(),
258
+ df[LABEL_COL].tolist(),
259
+ tokenizer, model, special_ids_t, MAX_LENGTH_PEPTIDECLM,
260
+ )
261
+ splits[split_name] = (pooled_ds, full_ds)
262
+
263
+ _save(splits, PEPTIDECLM_OUT)
264
+
265
+ del model
266
+ torch.cuda.empty_cache()
267
+
268
+
269
+ # ============================================================
270
+ # Main
271
+ # ============================================================
272
+ def main():
273
+ print(f"\nDevice : {device}")
274
+ print(f"Meta : {META_CSV}")
275
+
276
+ # Load metadata
277
+ meta = pd.read_csv(META_CSV, sep=None, engine="python")
278
+ print(f"Loaded {len(meta)} rows. Columns: {meta.columns.tolist()}")
279
+ for col in [SEQ_COL, LABEL_COL, SPLIT_COL]:
280
+ if col not in meta.columns:
281
+ raise ValueError(f"Expected column '{col}' not found. Available: {meta.columns.tolist()}")
282
+
283
+ # Ensure numeric labels
284
+ meta[LABEL_COL] = pd.to_numeric(meta[LABEL_COL], errors="coerce")
285
+ meta = meta.dropna(subset=[SEQ_COL, LABEL_COL]).reset_index(drop=True)
286
+
287
+ # Build id list for FASTA headers
288
+ if ID_COL in meta.columns:
289
+ ids = meta[ID_COL].astype(str).tolist()
290
+ else:
291
+ ids = [f"seq_{i}" for i in range(len(meta))]
292
+
293
+ # Note that for properties start with SMILES sequences, fasta2smi is not needed
294
+ # Convert wt to SMILES (single fasta2smi call for the whole dataset)
295
+ print("\nConverting peptide sequences to SMILES ...")
296
+ seqs = meta[SEQ_COL].astype(str).tolist()
297
+ seq2smi = sequences_to_smiles(seqs, ids)
298
+
299
+ meta["smiles"] = meta[SEQ_COL].astype(str).map(seq2smi)
300
+ n_missing = meta["smiles"].isna().sum()
301
+ if n_missing:
302
+ print(f" [WARN] {n_missing} sequences had no SMILES — dropping.")
303
+ meta = meta.dropna(subset=["smiles"]).reset_index(drop=True)
304
+ print(f" Retained {len(meta)} rows with valid SMILES.")
305
+ # Save SMILES-enriched meta CSV
306
+ smiles_meta_path = os.path.join(BASE_OUT, "permeability_smiles_meta_with_split.csv")
307
+ os.makedirs(BASE_OUT, exist_ok=True)
308
+ meta.to_csv(smiles_meta_path, index=False)
309
+ print(f" Saved SMILES meta to {smiles_meta_path}")
310
+
311
+ # Run both encoders sequentially (share the same converted SMILES)
312
+ #run_chemberta(meta)
313
+ #run_peptideclm(meta)
314
+
315
+ print("\nAll done.")
316
+
317
+
318
+ if __name__ == "__main__":
319
+ main()
training_data_cleaned/permeability_penetrance/permeability_smiles_meta_with_split.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cbece0b3b8345cae1ce6fe2e9a1a10ddd5320bae18c3a7a3f958b97b98979796
3
+ size 947525
training_data_cleaned/smiles_data_split.py CHANGED
@@ -15,6 +15,7 @@ from tokenizer.my_tokenizers import SMILES_SPE_Tokenizer
15
 
16
  seed_everything(1986)
17
 
 
18
  df = pd.read_csv("caco2.csv")
19
 
20
  mols = []
@@ -87,151 +88,4 @@ df[df["split"] == "train"].to_csv("caco2_train.csv", index=False)
87
  df[df["split"] == "val"].to_csv("caco2_val.csv", index=False)
88
  df.to_csv("caco2_meta_with_split.csv", index=False)
89
 
90
- print(df["split"].value_counts())
91
-
92
- # ======================
93
- # Config
94
- # ======================
95
- MAX_LENGTH = 768
96
- BATCH_SIZE = 128
97
-
98
- TRAIN_CSV = "caco2_train.csv"
99
- VAL_CSV = "caco2_val.csv"
100
-
101
- SMILES_COL = "SMILES"
102
- LABEL_COL = "Caco2"
103
-
104
- OUT_PATH = "./Classifier_Weight/training_data_cleaned/permeability_caco2/caco2_smiles_with_embeddings"
105
-
106
- # GPU device
107
- device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
108
- print(f"Using device: {device}")
109
-
110
- # ======================
111
- # Load tokenizer + model
112
- # ======================
113
- print("Loading tokenizer and model...")
114
- tokenizer = SMILES_SPE_Tokenizer(
115
- "./Classifier_Weight/tokenizer/new_vocab.txt",
116
- "./Classifier_Weight/tokenizer/new_splits.txt",
117
- )
118
-
119
- embedding_model = AutoModelForMaskedLM.from_pretrained("aaronfeller/PeptideCLM-23M-all").roformer
120
- embedding_model.to(device)
121
- embedding_model.eval()
122
-
123
- HIDDEN_KEY = "last_hidden_state"
124
-
125
- def get_special_ids(tokenizer):
126
- cand = [
127
- getattr(tokenizer, "pad_token_id", None),
128
- getattr(tokenizer, "cls_token_id", None),
129
- getattr(tokenizer, "sep_token_id", None),
130
- getattr(tokenizer, "bos_token_id", None),
131
- getattr(tokenizer, "eos_token_id", None),
132
- getattr(tokenizer, "mask_token_id", None),
133
- ]
134
- special_ids = sorted({x for x in cand if x is not None})
135
- if len(special_ids) == 0:
136
- print("[WARN] No special token ids found on tokenizer; pooling will only exclude padding via attention_mask.")
137
- return special_ids
138
-
139
- SPECIAL_IDS = get_special_ids(tokenizer)
140
- SPECIAL_IDS_T = torch.tensor(SPECIAL_IDS, device=device, dtype=torch.long) if len(SPECIAL_IDS) else None
141
-
142
- @torch.no_grad()
143
- def embed_batch_return_both(batch_sequences, max_length, device):
144
- tok = tokenizer(
145
- batch_sequences,
146
- return_tensors="pt",
147
- padding=True,
148
- max_length=max_length,
149
- truncation=True,
150
- )
151
- input_ids = tok["input_ids"].to(device) # (B, L)
152
- attention_mask = tok["attention_mask"].to(device) # (B, L)
153
-
154
- outputs = embedding_model(input_ids=input_ids, attention_mask=attention_mask)
155
- last_hidden = outputs.last_hidden_state # (B, L, H)
156
-
157
- valid = attention_mask.bool()
158
- if SPECIAL_IDS_T is not None and SPECIAL_IDS_T.numel() > 0:
159
- valid = valid & (~torch.isin(input_ids, SPECIAL_IDS_T))
160
-
161
- # --- pooled embeddings (exclude specials) ---
162
- valid_f = valid.unsqueeze(-1).float() # (B, L, 1)
163
- summed = torch.sum(last_hidden * valid_f, dim=1) # (B, H)
164
- denom = torch.clamp(valid_f.sum(dim=1), min=1e-9) # (B, 1)
165
- pooled = (summed / denom).detach().cpu().numpy() # (B, H), float32
166
-
167
- # --- unpooled per-example token embeddings (exclude specials) ---
168
- token_emb_list = []
169
- mask_list = []
170
- lengths = []
171
- for b in range(last_hidden.shape[0]):
172
- emb = last_hidden[b, valid[b]] # (L_i, H)
173
- token_emb_list.append(emb.detach().cpu().to(torch.float16).numpy()) # float16
174
- L_i = emb.shape[0]
175
- lengths.append(int(L_i))
176
- mask_list.append(np.ones((L_i,), dtype=np.int8))
177
-
178
- return pooled, token_emb_list, mask_list, lengths
179
-
180
- def generate_embeddings_batched_both(sequences, batch_size, max_length):
181
- pooled_all = []
182
- token_emb_all = []
183
- mask_all = []
184
- lengths_all = []
185
-
186
- for i in tqdm(range(0, len(sequences), batch_size), desc="Embedding batches"):
187
- batch = sequences[i:i + batch_size]
188
- pooled, token_list, m_list, lens = embed_batch_return_both(batch, max_length, device)
189
- pooled_all.append(pooled)
190
- token_emb_all.extend(token_list)
191
- mask_all.extend(m_list)
192
- lengths_all.extend(lens)
193
-
194
- pooled_all = np.vstack(pooled_all) # (N, H)
195
- return pooled_all, token_emb_all, mask_all, lengths_all
196
-
197
- from datasets import Dataset, DatasetDict
198
-
199
- def make_split_datasets(csv_path, split_name):
200
- df = pd.read_csv(csv_path)
201
- df = df.dropna(subset=[SMILES_COL, LABEL_COL]).reset_index(drop=True)
202
- df["sequence"] = df[SMILES_COL].astype(str)
203
-
204
- labels = pd.to_numeric(df[LABEL_COL], errors="coerce")
205
- df = df.loc[~labels.isna()].reset_index(drop=True)
206
- sequences = df["sequence"].tolist()
207
- labels = pd.to_numeric(df[LABEL_COL], errors="coerce").tolist()
208
-
209
- # (pooled_embs: (N,H), token_emb_list: list of (L_i,H), mask_list: list of (L_i,), lengths: list[int])
210
- pooled_embs, token_emb_list, mask_list, lengths = generate_embeddings_batched_both(
211
- sequences, batch_size=BATCH_SIZE, max_length=MAX_LENGTH
212
- )
213
-
214
- pooled_ds = Dataset.from_dict({
215
- "sequence": sequences,
216
- "label": labels,
217
- "embedding": pooled_embs, # (N,H)
218
- })
219
-
220
- full_ds = Dataset.from_dict({
221
- "sequence": sequences,
222
- "label": labels,
223
- "embedding": token_emb_list, # each (L_i,H) float16
224
- "attention_mask": mask_list, # each (L_i,) int8 ones
225
- "length": lengths,
226
- })
227
-
228
- return pooled_ds, full_ds
229
-
230
- train_pooled, train_full = make_split_datasets(TRAIN_CSV, "train")
231
- val_pooled, val_full = make_split_datasets(VAL_CSV, "val")
232
-
233
- ds_pooled = DatasetDict({"train": train_pooled, "val": val_pooled})
234
- ds_full = DatasetDict({"train": train_full, "val": val_full})
235
-
236
- ds_pooled.save_to_disk(OUT_PATH)
237
- ds_full.save_to_disk(OUT_PATH + "_unpooled")
 
15
 
16
  seed_everything(1986)
17
 
18
+ # Starting with a raw dataframe, using caco2 as example.
19
  df = pd.read_csv("caco2.csv")
20
 
21
  mols = []
 
88
  df[df["split"] == "val"].to_csv("caco2_val.csv", index=False)
89
  df.to_csv("caco2_meta_with_split.csv", index=False)
90
 
91
+ print(df["split"].value_counts())