Irwiny123 committed
Commit 52007f8 · 1 Parent(s): d0c4cc5

Add initial PepGLAD code

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .github/workflows/stale.yml +27 -0
  2. .gitignore +35 -0
  3. .idea/.gitignore +3 -0
  4. .idea/PepGLAD.iml +8 -0
  5. .idea/inspectionProfiles/profiles_settings.xml +6 -0
  6. .idea/misc.xml +4 -0
  7. .idea/modules.xml +8 -0
  8. .idea/vcs.xml +6 -0
  9. LICENSE +21 -0
  10. README.md +214 -3
  11. api/detect_pocket.py +72 -0
  12. api/run.py +274 -0
  13. assets/1ssc_A_pocket.json +1 -0
  14. cal_metrics.py +228 -0
  15. configs/pepbdb/autoencoder/train_codesign.yaml +66 -0
  16. configs/pepbdb/autoencoder/train_fixseq.yaml +63 -0
  17. configs/pepbdb/ldm/setup_latent_guidance.yaml +12 -0
  18. configs/pepbdb/ldm/train_codesign.yaml +61 -0
  19. configs/pepbdb/ldm/train_fixseq.yaml +63 -0
  20. configs/pepbdb/test_codesign.yaml +18 -0
  21. configs/pepbdb/test_fixseq.yaml +19 -0
  22. configs/pepbench/autoencoder/train_codesign.yaml +66 -0
  23. configs/pepbench/autoencoder/train_fixseq.yaml +62 -0
  24. configs/pepbench/ldm/setup_latent_guidance.yaml +12 -0
  25. configs/pepbench/ldm/train_codesign.yaml +60 -0
  26. configs/pepbench/ldm/train_fixseq.yaml +61 -0
  27. configs/pepbench/test_codesign.yaml +17 -0
  28. configs/pepbench/test_fixseq.yaml +18 -0
  29. data/__init__.py +53 -0
  30. data/codesign.py +208 -0
  31. data/converter/blocks_interface.py +89 -0
  32. data/converter/blocks_to_data.py +110 -0
  33. data/converter/list_blocks_to_pdb.py +61 -0
  34. data/converter/pdb_to_list_blocks.py +99 -0
  35. data/dataset_wrapper.py +115 -0
  36. data/format.py +220 -0
  37. data/mmap_dataset.py +112 -0
  38. data/resample.py +19 -0
  39. env.yaml +32 -0
  40. evaluation/__init__.py +3 -0
  41. evaluation/dG/RosettaFastRelaxUtil.xml +190 -0
  42. evaluation/dG/base.py +148 -0
  43. evaluation/dG/energy.py +236 -0
  44. evaluation/dG/openmm_relaxer.py +107 -0
  45. evaluation/dG/run.py +92 -0
  46. evaluation/diversity.py +68 -0
  47. evaluation/dockq.py +15 -0
  48. evaluation/rmsd.py +11 -0
  49. evaluation/seq_metric.py +71 -0
  50. generate.py +235 -0
.github/workflows/stale.yml ADDED
@@ -0,0 +1,27 @@
+ # This workflow warns and then closes issues and PRs that have had no activity for a specified amount of time.
+ #
+ # You can adjust the behavior by modifying this file.
+ # For more information, see:
+ # https://github.com/actions/stale
+ name: Close inactive issues
+ on:
+   schedule:
+     - cron: "30 1 * * *"
+
+ jobs:
+   close-issues:
+     runs-on: ubuntu-latest
+     permissions:
+       issues: write
+       pull-requests: write
+     steps:
+       - uses: actions/stale@v9
+         with:
+           days-before-issue-stale: 30
+           days-before-issue-close: 14
+           stale-issue-label: "stale"
+           stale-issue-message: "This issue is stale because it has been open for 30 days with no activity."
+           close-issue-message: "This issue was closed because it has been inactive for 14 days since being marked as stale."
+           days-before-pr-stale: -1
+           days-before-pr-close: -1
+           repo-token: ${{ secrets.GITHUB_TOKEN }}
.gitignore ADDED
@@ -0,0 +1,35 @@
+ __pycache__
+
+ __cache__
+
+ __tmcache__
+
+ ckpts
+
+ checkpoints
+
+ *_results*
+
+ datasets
+
+ exps
+
+ DockQ
+
+ TMscore
+
+ *.txt
+
+ *.pt
+
+ *.png
+
+ *.pkl
+
+ *.svg
+
+ *.log
+
+ *.pdb
+
+ *.jsonl
.idea/.gitignore ADDED
@@ -0,0 +1,3 @@
+ # Files ignored by default
+ /shelf/
+ /workspace.xml
.idea/PepGLAD.iml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <module type="PYTHON_MODULE" version="4">
+   <component name="NewModuleRootManager">
+     <content url="file://$MODULE_DIR$" />
+     <orderEntry type="jdk" jdkName="D:\Miniconda3" jdkType="Python SDK" />
+     <orderEntry type="sourceFolder" forTests="false" />
+   </component>
+ </module>
.idea/inspectionProfiles/profiles_settings.xml ADDED
@@ -0,0 +1,6 @@
+ <component name="InspectionProjectProfileManager">
+   <settings>
+     <option name="USE_PROJECT_PROFILE" value="false" />
+     <version value="1.0" />
+   </settings>
+ </component>
.idea/misc.xml ADDED
@@ -0,0 +1,4 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectRootManager" version="2" project-jdk-name="D:\Miniconda3" project-jdk-type="Python SDK" />
+ </project>
.idea/modules.xml ADDED
@@ -0,0 +1,8 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="ProjectModuleManager">
+     <modules>
+       <module fileurl="file://$PROJECT_DIR$/.idea/PepGLAD.iml" filepath="$PROJECT_DIR$/.idea/PepGLAD.iml" />
+     </modules>
+   </component>
+ </project>
.idea/vcs.xml ADDED
@@ -0,0 +1,6 @@
+ <?xml version="1.0" encoding="UTF-8"?>
+ <project version="4">
+   <component name="VcsDirectoryMappings">
+     <mapping directory="" vcs="Git" />
+   </component>
+ </project>
LICENSE ADDED
@@ -0,0 +1,21 @@
+ MIT License
+
+ Copyright (c) 2024 THUNLP
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in all
+ copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ SOFTWARE.
README.md CHANGED
@@ -1,3 +1,214 @@
- ---
- license: mit
- ---
+ # PepGLAD: Full-Atom Peptide Design with Geometric Latent Diffusion
+
+ ![cover](./assets/cover.png)
+
+ ## Quick Links
+
+ - [Setup](#setup)
+   - [Environment](#environment)
+   - [Datasets](#optional-datasets)
+   - [Trained Weights](#trained-weights)
+ - [Usage](#usage)
+   - [Peptide Sequence-Structure Co-Design](#peptide-sequence-structure-co-design)
+   - [Peptide Binding Conformation Generation](#peptide-binding-conformation-generation)
+ - [Reproduction of Paper Experiments](#reproduction-of-paper-experiments)
+   - [Codesign](#codesign)
+   - [Binding Conformation Generation](#binding-conformation-generation)
+ - [Contact](#contact)
+ - [Reference](#reference)
+
+ ## Updates
+
+ Changes for compatibility and extended functionality are kept in the [beta](https://github.com/THUNLP-MT/PepGLAD/tree/beta) branch. Thanks to [@Barry0121](https://github.com/Barry0121) for the help.
+
+ - PyTorch 2.6.0 and OpenMM 8.2.0 are supported, with a new environment configuration at [2025_env.yaml](https://github.com/THUNLP-MT/PepGLAD/blob/beta/2025_env.yml).
+ - Support for non-canonical amino acids in `detect_pocket.py`.
+
+
+ ## Setup
+
+ ### Environment
+
+ The conda environment can be constructed from the configuration `env.yaml`:
+
+ ```bash
+ conda env create -f env.yaml
+ ```
+
+ The code is tested with CUDA `11.7` and PyTorch `1.13.1`.
+
+ Don't forget to activate the environment before running the code:
+
+ ```bash
+ conda activate PepGLAD
+ ```
+
+ #### (Optional) PyRosetta
+
+ PyRosetta is used to calculate the interface energy of generated peptides. If you need this metric, please follow the instructions [here](https://www.pyrosetta.org/downloads) to install it.
+
+ ### (Optional) Datasets
+
+ These datasets are only used for benchmarking models. If you just want to run inference on your own cases with the trained weights, there is no need to download them.
+
+ #### PepBench
+
+ 1. Download
+
+ The datasets, originally introduced in this paper, are uploaded to Zenodo at [this URL](https://zenodo.org/records/13373108). You can download them as follows:
+
+ ```bash
+ mkdir datasets # all datasets will be put into this directory
+ wget https://zenodo.org/records/13373108/files/train_valid.tar.gz?download=1 -O ./datasets/train_valid.tar.gz # training/validation
+ wget https://zenodo.org/records/13373108/files/LNR.tar.gz?download=1 -O ./datasets/LNR.tar.gz # test set
+ wget https://zenodo.org/records/13373108/files/ProtFrag.tar.gz?download=1 -O ./datasets/ProtFrag.tar.gz # augmentation dataset
+ ```
+
+ 2. Decompress
+
+ ```bash
+ tar zxvf ./datasets/train_valid.tar.gz -C ./datasets
+ tar zxvf ./datasets/LNR.tar.gz -C ./datasets
+ tar zxvf ./datasets/ProtFrag.tar.gz -C ./datasets
+ ```
+
+ 3. Process
+
+ ```bash
+ python -m scripts.data_process.process --index ./datasets/train_valid/all.txt --out_dir ./datasets/train_valid/processed # train/validation set
+ python -m scripts.data_process.process --index ./datasets/LNR/test.txt --out_dir ./datasets/LNR/processed # test set
+ python -m scripts.data_process.process --index ./datasets/ProtFrag/all.txt --out_dir ./datasets/ProtFrag/processed # augmentation dataset
+ ```
+
+ The index files of the processed data for the train/validation splits need to be generated as follows, which will produce `datasets/train_valid/processed/train_index.txt` and `datasets/train_valid/processed/valid_index.txt`:
+
+ ```bash
+ python -m scripts.data_process.split --train_index datasets/train_valid/train.txt --valid_index datasets/train_valid/valid.txt --processed_dir datasets/train_valid/processed/
+ ```
+
+ #### PepBDB
+
+ 1. Download
+
+ ```bash
+ wget http://huanglab.phys.hust.edu.cn/pepbdb/db/download/pepbdb-20200318.tgz -O ./datasets/pepbdb.tgz
+ ```
+
+ 2. Decompress
+
+ ```bash
+ mkdir -p ./datasets/pepbdb
+ tar zxvf ./datasets/pepbdb.tgz -C ./datasets/pepbdb
+ ```
+
+ 3. Process
+
+ ```bash
+ python -m scripts.data_process.pepbdb --index ./datasets/pepbdb/peptidelist.txt --out_dir ./datasets/pepbdb/processed
+ python -m scripts.data_process.split --train_index ./datasets/pepbdb/train.txt --valid_index ./datasets/pepbdb/valid.txt --test_index ./datasets/pepbdb/test.txt --processed_dir datasets/pepbdb/processed/
+ mv ./datasets/pepbdb/processed/pdbs ./datasets/pepbdb # re-locate
+ ```
+
+ ### Trained Weights
+
+ - codesign: `./checkpoints/codesign.ckpt`
+ - conformation generation: `./checkpoints/fixseq.ckpt`
+
+ Both can be downloaded from the [release page](https://github.com/THUNLP-MT/PepGLAD/releases/tag/v1.0). These checkpoints were trained on PepBench.
+
+ ## Usage
+
+ :warning: Before running the following commands, please first download the trained weights mentioned above.
+
+ ### Peptide Sequence-Structure Co-Design
+
+ Take `./assets/1ssc_A_B.pdb` as an example, where chain A is the target protein:
+
+ ```bash
+ # obtain the binding site, which might also be manually crafted or derived from other ligands (e.g. small molecules, antibodies)
+ python -m api.detect_pocket --pdb assets/1ssc_A_B.pdb --target_chains A --ligand_chains B --out assets/1ssc_A_pocket.json
+ # sequence-structure codesign with length in [8, 15)
+ CUDA_VISIBLE_DEVICES=0 python -m api.run \
+     --mode codesign \
+     --pdb assets/1ssc_A_B.pdb \
+     --pocket assets/1ssc_A_pocket.json \
+     --out_dir ./output/codesign \
+     --length_min 8 \
+     --length_max 15 \
+     --n_samples 10
+ ```
+
+ Then 10 generated candidates will be output under the folder `./output/codesign`.
+
+ ### Peptide Binding Conformation Generation
+
+ Take `./assets/1ssc_A_B.pdb` as an example, where chain A is the target protein:
+
+ ```bash
+ # obtain the binding site, which might also be manually crafted or derived from other ligands (e.g. small molecules, antibodies)
+ python -m api.detect_pocket --pdb assets/1ssc_A_B.pdb --target_chains A --ligand_chains B --out assets/1ssc_A_pocket.json
+ # generate binding conformation
+ CUDA_VISIBLE_DEVICES=0 python -m api.run \
+     --mode struct_pred \
+     --pdb assets/1ssc_A_B.pdb \
+     --pocket assets/1ssc_A_pocket.json \
+     --out_dir ./output/struct_pred \
+     --peptide_seq PYVPVHFDASV \
+     --n_samples 10
+ ```
+
+ Then 10 conformations will be output under the folder `./output/struct_pred`.
+
+
+ ## Reproduction of Paper Experiments
+
+ Each task requires the following steps, which we have integrated into the script `./scripts/run_exp_pipe.sh`:
+
+ 1. Train the autoencoder
+ 2. Train the latent diffusion model
+ 3. Calculate the distribution of latent distances between consecutive residues
+ 4. Generation & evaluation
+
+ Alternatively, if you want to evaluate existing checkpoints, please follow the instructions below (e.g. for conformation generation):
+
+ ```bash
+ # generate results on the test set and save to ./results/fixseq
+ python generate.py --config configs/pepbench/test_fixseq.yaml --ckpt checkpoints/fixseq.ckpt --gpu 0 --save_dir ./results/fixseq
+ # calculate metrics
+ python cal_metrics.py --results ./results/fixseq/results.jsonl
+ ```
+
+ ### Codesign
+
+ Codesign experiments on PepBench:
+
+ ```bash
+ GPU=0 bash scripts/run_exp_pipe.sh pepbench_codesign configs/pepbench/autoencoder/train_codesign.yaml configs/pepbench/ldm/train_codesign.yaml configs/pepbench/ldm/setup_latent_guidance.yaml configs/pepbench/test_codesign.yaml
+ ```
+
+ ### Binding Conformation Generation
+
+ Conformation generation experiments on PepBench:
+
+ ```bash
+ GPU=0 bash scripts/run_exp_pipe.sh pepbench_fixseq configs/pepbench/autoencoder/train_fixseq.yaml configs/pepbench/ldm/train_fixseq.yaml configs/pepbench/ldm/setup_latent_guidance.yaml configs/pepbench/test_fixseq.yaml
+ ```
+
+ ## Contact
+
+ Thank you for your interest in our work!
+
+ Please feel free to ask questions about the algorithms and code, or report problems encountered in running them, so that we can make the project clearer and better. You can either create an issue in the GitHub repo or contact us at jackie_kxz@outlook.com.
+
+ ## Reference
+
+ ```bibtex
+ @article{kong2025full,
+   title={Full-atom peptide design with geometric latent diffusion},
+   author={Kong, Xiangzhe and Jia, Yinjun and Huang, Wenbing and Liu, Yang},
+   journal={Advances in Neural Information Processing Systems},
+   volume={37},
+   pages={74808--74839},
+   year={2025}
+ }
+ ```
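
Note that the released checkpoints are full pickled models rather than bare state dicts: `api/run.py` below restores them with a single `torch.load` call. A minimal sketch of loading one for inspection (paths as listed under Trained Weights; on PyTorch >= 2.6 you may additionally need `weights_only=False`):

```python
import torch

# The checkpoints ship as pickled nn.Module objects (see api/run.py below),
# so torch.load returns the model directly rather than a state dict.
model = torch.load('./checkpoints/codesign.ckpt', map_location='cpu')
model.eval()
print(type(model))  # expected to be the repo's LDMPepDesign class
```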
api/detect_pocket.py ADDED
@@ -0,0 +1,72 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import argparse
+ import numpy as np
+
+ from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
+ from data.converter.blocks_interface import blocks_cb_interface, dist_matrix_from_blocks
+
+
+ def get_interface(pdb, receptor_chains, ligand_chains, pocket_th=10.0):  # CB distance
+     list_blocks, chain_ids = pdb_to_list_blocks(pdb, receptor_chains + ligand_chains, return_chain_ids=True)
+     chain2blocks = {chain: block for chain, block in zip(chain_ids, list_blocks)}
+     for c in receptor_chains:
+         assert c in chain2blocks, f'Chain {c} not found for receptor'
+     for c in ligand_chains:
+         assert c in chain2blocks, f'Chain {c} not found for ligand'
+
+     rec_blocks, rec_block_chains, lig_blocks, lig_block_chains = [], [], [], []
+     for c in receptor_chains:
+         for block in chain2blocks[c]:
+             rec_blocks.append(block)
+             rec_block_chains.append(c)
+     for c in ligand_chains:
+         for block in chain2blocks[c]:
+             lig_blocks.append(block)
+             lig_block_chains.append(c)
+
+     _, (pocket_idx, lig_if_idx) = blocks_cb_interface(rec_blocks, lig_blocks, pocket_th)  # 10A for pocket size based on CB
+     epitope = []
+     for i in pocket_idx:
+         epitope.append((rec_blocks[i], rec_block_chains[i], i))
+
+     dist_mat = dist_matrix_from_blocks([rec_blocks[i] for i in pocket_idx], [lig_blocks[i] for i in lig_if_idx])
+     min_dists = np.min(dist_mat, axis=-1)  # [Nrec]
+     lig_idxs = np.argmin(dist_mat, axis=-1)  # [Nrec]
+     dists = []
+     for i, d in zip(lig_idxs, min_dists):
+         i = lig_if_idx[i]
+         dists.append((lig_blocks[i], lig_block_chains[i], i, d))
+
+     return epitope, dists
+
+
+ if __name__ == '__main__':
+     import json
+     parser = argparse.ArgumentParser(description='get interface')
+     parser.add_argument('--pdb', type=str, required=True, help='Path to the complex pdb')
+     parser.add_argument('--target_chains', type=str, nargs='+', required=True, help='Specify target chain ids')
+     parser.add_argument('--ligand_chains', type=str, nargs='+', required=True, help='Specify ligand chain ids')
+     parser.add_argument('--pocket_th', type=float, default=10.0, help='CB distance threshold for defining the binding site')
+     parser.add_argument('--out', type=str, default=None, help='Save epitope information to json file if specified')
+     args = parser.parse_args()
+     epitope, dists = get_interface(args.pdb, args.target_chains, args.ligand_chains, args.pocket_th)
+     para_res = {}
+     for _, chain_name, i, d in dists:
+         key = f'{chain_name}-{i}'
+         para_res[key] = 1
+     print(f'REMARK: {len(epitope)} residues in the binding site on the target protein, with {len(para_res)} residues in ligand:')
+     print(f' \tchain\tresidue id\ttype\tchain\tresidue id\ttype\tdistance')
+     for i, (e, p) in enumerate(zip(epitope, dists)):
+         e_res, e_chain_name, _ = e
+         p_res, p_chain_name, _, d = p
+         print(f'{i+1}\t{e_chain_name}\t{e_res.id}\t{e_res.abrv}\t' + \
+               f'{p_chain_name}\t{p_res.id}\t{p_res.abrv}\t{round(d, 3)}')
+
+     if args.out:
+         data = []
+         for e in epitope:
+             res, chain_name, _ = e
+             data.append((chain_name, res.id))
+         with open(args.out, 'w') as fout:
+             json.dump(data, fout)
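
`blocks_cb_interface` (from `data/converter/blocks_interface.py`, listed above but not shown in this view) is what actually selects the pocket; conceptually it thresholds pairwise Cβ distances at `pocket_th` (10 Å by default). A rough NumPy sketch of that idea, with random coordinates standing in for the parsed blocks:

```python
import numpy as np

# Hypothetical CB coordinates standing in for the parsed residue blocks:
# shapes [Nrec, 3] for the receptor and [Nlig, 3] for the ligand.
rec_cb = np.random.rand(100, 3) * 30.0
lig_cb = np.random.rand(12, 3) * 30.0

# Pairwise CB distance matrix of shape [Nrec, Nlig].
dist = np.linalg.norm(rec_cb[:, None, :] - lig_cb[None, :, :], axis=-1)

pocket_th = 10.0  # Angstroms, matching the default --pocket_th above
contact = dist < pocket_th
pocket_idx = np.nonzero(contact.any(axis=1))[0]   # receptor residues in the pocket
lig_if_idx = np.nonzero(contact.any(axis=0))[0]   # ligand residues at the interface
```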
api/run.py ADDED
@@ -0,0 +1,274 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import os
+ import sys
+ import json
+ import argparse
+ from tqdm import tqdm
+ from os.path import splitext, basename
+
+ import ray
+ import numpy as np
+ import torch
+ from torch.utils.data import DataLoader
+
+ from data.format import Atom, Block, VOCAB
+ from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
+ from data.converter.list_blocks_to_pdb import list_blocks_to_pdb
+ from data.codesign import calculate_covariance_matrix
+ from utils.const import sidechain_atoms
+ from utils.logger import print_log
+ from evaluation.dG.openmm_relaxer import ForceFieldMinimizer
+
+
+ class DesignDataset(torch.utils.data.Dataset):
+
+     MAX_N_ATOM = 14
+
+     def __init__(self, pdbs, epitopes, lengths_range=None, seqs=None) -> None:
+         super().__init__()
+         self.pdbs = pdbs
+         self.epitopes = epitopes
+         self.lengths_range = lengths_range
+         self.seqs = seqs
+         # structure prediction or codesign: exactly one of seqs / lengths_range must be given
+         assert (self.seqs is not None and self.lengths_range is None) or \
+                (self.seqs is None and self.lengths_range is not None)
+
+     def get_epitope(self, idx):
+         pdb, epitope_def = self.pdbs[idx], self.epitopes[idx]
+
+         with open(epitope_def, 'r') as fin:
+             epitope = json.load(fin)
+         to_str = lambda pos: f'{pos[0]}-{pos[1]}'
+         epi_map = {}
+         for chain_name, pos in epitope:
+             if chain_name not in epi_map:
+                 epi_map[chain_name] = {}
+             epi_map[chain_name][to_str(pos)] = True
+         residues, position_ids = [], []
+         chain2blocks = pdb_to_list_blocks(pdb, list(epi_map.keys()), dict_form=True)
+         if len(chain2blocks) != len(epi_map):
+             print_log(f'Some chains in the epitope are missing. Parsed {list(chain2blocks.keys())}, given {list(epi_map.keys())}.', level='WARN')
+         for chain_name in chain2blocks:
+             chain = chain2blocks[chain_name]
+             for i, block in enumerate(chain):  # residue
+                 if to_str(block.id) in epi_map[chain_name]:
+                     residues.append(block)
+                     position_ids.append(i + 1)  # position ids start from 1
+         return residues, position_ids, chain2blocks
+
+     def generate_pep_chain(self, idx):
+         if self.lengths_range is not None:  # codesign
+             lmin, lmax = self.lengths_range[idx]
+             length = np.random.randint(lmin, lmax)
+             unk_block = Block(VOCAB.symbol_to_abrv(VOCAB.UNK), [Atom('CA', [0, 0, 0], 'C')])
+             return [unk_block] * length
+         else:
+             seq = self.seqs[idx]
+             blocks = []
+             for s in seq:
+                 atoms = []
+                 for atom_name in VOCAB.backbone_atoms + sidechain_atoms.get(s, []):
+                     atoms.append(Atom(atom_name, [0, 0, 0], atom_name[0]))
+                 blocks.append(Block(VOCAB.symbol_to_abrv(s), atoms))
+             return blocks
+
+     def __len__(self):
+         return len(self.pdbs)
+
+     def __getitem__(self, idx: int):
+         rec_blocks, rec_position_ids, rec_chain2blocks = self.get_epitope(idx)
+         lig_blocks = self.generate_pep_chain(idx)
+
+         mask = [0 for _ in rec_blocks] + [1 for _ in lig_blocks]
+         position_ids = rec_position_ids + [i + 1 for i, _ in enumerate(lig_blocks)]
+         X, S, atom_mask = [], [], []
+         for block in rec_blocks + lig_blocks:
+             symbol = VOCAB.abrv_to_symbol(block.abrv)
+             atom2coord = { unit.name: unit.get_coord() for unit in block.units }
+             bb_pos = np.mean(list(atom2coord.values()), axis=0).tolist()
+             coords, coord_mask = [], []
+             for atom_name in VOCAB.backbone_atoms + sidechain_atoms.get(symbol, []):
+                 if atom_name in atom2coord:
+                     coords.append(atom2coord[atom_name])
+                     coord_mask.append(1)
+                 else:
+                     coords.append(bb_pos)
+                     coord_mask.append(0)
+             n_pad = self.MAX_N_ATOM - len(coords)
+             for _ in range(n_pad):
+                 coords.append(bb_pos)
+                 coord_mask.append(0)
+
+             X.append(coords)
+             S.append(VOCAB.symbol_to_idx(symbol))
+             atom_mask.append(coord_mask)
+
+         X, atom_mask = torch.tensor(X, dtype=torch.float), torch.tensor(atom_mask, dtype=torch.bool)
+         mask = torch.tensor(mask, dtype=torch.bool)
+         cov = calculate_covariance_matrix(X[~mask][:, 1][atom_mask[~mask][:, 1]].numpy())  # only use the receptor to derive the affine transformation
+         eps = 1e-4
+         cov = cov + eps * np.identity(cov.shape[0])
+         L = torch.from_numpy(np.linalg.cholesky(cov)).float().unsqueeze(0)
+
+         return {
+             'X': X,  # [N, 14] or [N, 4] if backbone_only == True
+             'S': torch.tensor(S, dtype=torch.long),  # [N]
+             'position_ids': torch.tensor(position_ids, dtype=torch.long),  # [N]
+             'mask': mask,  # [N], 1 for generation
+             'atom_mask': atom_mask,  # [N, 14] or [N, 4], 1 for having records in the PDB
+             'lengths': len(S),
+             'rec_chain2blocks': rec_chain2blocks,
+             'L': L
+         }
+
+     def collate_fn(self, batch):
+         results = {}
+         for key in batch[0]:
+             values = [item[key] for item in batch]
+             if key == 'lengths':
+                 results[key] = torch.tensor(values, dtype=torch.long)
+             elif key == 'rec_chain2blocks':
+                 results[key] = values
+             else:
+                 results[key] = torch.cat(values, dim=0)
+         return results
+
+
+ @ray.remote(num_cpus=1, num_gpus=1/16)
+ def openmm_relax(pdb_path):
+     force_field = ForceFieldMinimizer()
+     force_field(pdb_path, pdb_path)
+     return pdb_path
+
+
+ def design(mode, ckpt, gpu, pdbs, epitope_defs, n_samples, out_dir,
+            lengths_range=None, seqs=None, identifiers=None, batch_size=8, num_workers=4):
+
+     # create out dir
+     if not os.path.exists(out_dir):
+         os.makedirs(out_dir)
+     result_summary = open(os.path.join(out_dir, 'summary.jsonl'), 'w')
+     if identifiers is None:
+         identifiers = [splitext(basename(pdb))[0] for pdb in pdbs]
+     # load model
+     device = torch.device('cpu' if gpu == -1 else f'cuda:{gpu}')
+     model = torch.load(ckpt, map_location='cpu')
+     model.to(device)
+     model.eval()
+
+     # generate dataset
+     # expand data
+     if lengths_range is None: lengths_range = [None for _ in pdbs]
+     if seqs is None: seqs = [None for _ in pdbs]
+     expand_pdbs, expand_epitopes, expand_lens, expand_ids, expand_seqs = [], [], [], [], []
+     for _id, pdb, epitope, l, s, n in zip(identifiers, pdbs, epitope_defs, lengths_range, seqs, n_samples):
+         expand_ids.extend([f'{_id}_{i}' for i in range(n)])
+         expand_pdbs.extend([pdb for _ in range(n)])
+         expand_epitopes.extend([epitope for _ in range(n)])
+         expand_lens.extend([l for _ in range(n)])
+         expand_seqs.extend([s for _ in range(n)])
+     # create dataset
+     if expand_lens[0] is None: expand_lens = None
+     if expand_seqs[0] is None: expand_seqs = None
+     dataset = DesignDataset(expand_pdbs, expand_epitopes, expand_lens, expand_seqs)
+     dataloader = DataLoader(dataset, batch_size=batch_size,
+                             num_workers=num_workers,
+                             collate_fn=dataset.collate_fn,
+                             shuffle=False)
+
+     # generate peptides
+     cnt = 0
+     all_pdbs = []
+     for batch in tqdm(dataloader):
+         with torch.no_grad():
+             # move data
+             for k in batch:
+                 if hasattr(batch[k], 'to'):
+                     batch[k] = batch[k].to(device)
+             # generate
+             batch_X, batch_S, batch_pmetric = model.sample(
+                 batch['X'], batch['S'],
+                 batch['mask'], batch['position_ids'],
+                 batch['lengths'], batch['atom_mask'],
+                 L=batch['L'], sample_opt={
+                     'energy_func': 'default',
+                     'energy_lambda': 0.5 if mode == 'struct_pred' else 0.8
+                 }
+             )
+             # save data
+             for X, S, pmetric, rec_chain2blocks in zip(batch_X, batch_S, batch_pmetric, batch['rec_chain2blocks']):
+                 if S is None: S = expand_seqs[cnt]  # structure prediction
+                 lig_blocks = []
+                 for x, s in zip(X, S):
+                     abrv = VOCAB.symbol_to_abrv(s)
+                     atoms = VOCAB.backbone_atoms + sidechain_atoms[VOCAB.abrv_to_symbol(abrv)]
+                     units = [
+                         Atom(atom_name, coord, atom_name[0]) for atom_name, coord in zip(atoms, x)
+                     ]
+                     lig_blocks.append(Block(abrv, units))
+                 list_blocks, chain_names = [], []
+                 for chain in rec_chain2blocks:
+                     list_blocks.append(rec_chain2blocks[chain])
+                     chain_names.append(chain)
+                 pep_chain_id = chr(max([ord(c) for c in chain_names]) + 1)
+                 list_blocks.append(lig_blocks)
+                 chain_names.append(pep_chain_id)
+                 out_pdb = os.path.join(out_dir, expand_ids[cnt] + '.pdb')
+                 list_blocks_to_pdb(list_blocks, chain_names, out_pdb)
+                 all_pdbs.append(out_pdb)
+                 result_summary.write(json.dumps({
+                     'id': expand_ids[cnt],
+                     'rec_chains': list(rec_chain2blocks.keys()),
+                     'pep_chain': pep_chain_id,
+                     'pep_seq': ''.join([VOCAB.abrv_to_symbol(block.abrv) for block in lig_blocks])
+                 }) + '\n')
+                 result_summary.flush()
+                 cnt += 1
+     result_summary.close()
+
+     print_log(f'Running openmm relaxation...')
+     ray.init(num_cpus=8)
+     futures = [openmm_relax.remote(path) for path in all_pdbs]
+     pbar = tqdm(total=len(futures))
+     while len(futures) > 0:
+         done_ids, futures = ray.wait(futures, num_returns=1)
+         for done_id in done_ids:
+             done_path = ray.get(done_id)
+             pbar.update(1)
+     print_log(f'Done')
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description='run pepglad for codesign or structure prediction')
+     parser.add_argument('--mode', type=str, required=True, choices=['codesign', 'struct_pred'], help='Running mode')
+     parser.add_argument('--pdb', type=str, required=True, help='Path to the PDB file of the target protein')
+     parser.add_argument('--pocket', type=str, required=True, help='Path to the pocket definition (*.json generated by detect_pocket)')
+     parser.add_argument('--n_samples', type=int, default=10, help='Number of samples')
+     parser.add_argument('--out_dir', type=str, required=True, help='Output directory')
+     parser.add_argument('--peptide_seq', type=str, required='struct_pred' in sys.argv, help='Peptide sequence for structure prediction')
+     parser.add_argument('--length_min', type=int, required='codesign' in sys.argv, help='Minimum peptide length for codesign (inclusive)')
+     parser.add_argument('--length_max', type=int, required='codesign' in sys.argv, help='Maximum peptide length for codesign (exclusive)')
+     parser.add_argument('--gpu', type=int, default=0, help='GPU to use')
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+     args = parse()
+     proj_dir = os.path.join(os.path.dirname(__file__), '..')
+     ckpt = os.path.join(proj_dir, 'checkpoints', 'fixseq.ckpt' if args.mode == 'struct_pred' else 'codesign.ckpt')
+     print_log(f'Loading checkpoint: {ckpt}')
+     design(
+         mode=args.mode,
+         ckpt=ckpt,  # path to the checkpoint of the trained model
+         gpu=args.gpu,  # the ID of the GPU to use
+         pdbs=[args.pdb],  # paths to the PDB file of each antigen
+         epitope_defs=[args.pocket],  # paths to the epitope (pocket) definitions
+         n_samples=[args.n_samples],  # number of samples for each epitope
+         out_dir=args.out_dir,  # output directory
+         identifiers=[os.path.basename(os.path.splitext(args.pdb)[0])],  # file name (name of each output candidate)
+         lengths_range=[(args.length_min, args.length_max)] if args.mode == 'codesign' else None,  # range of acceptable peptide lengths, left inclusive, right exclusive
+         seqs=[args.peptide_seq] if args.mode == 'struct_pred' else None  # peptide sequences for structure prediction
+     )
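
`design` can also be imported and called directly instead of going through the CLI; the `__main__` block above is only a thin wrapper. A sketch mirroring the README's co-design example (all per-target arguments are lists, as documented in the inline comments above):

```python
from api.run import design

# Mirrors the CLI call in the README; one target, ten samples.
design(
    mode='codesign',
    ckpt='./checkpoints/codesign.ckpt',
    gpu=0,
    pdbs=['assets/1ssc_A_B.pdb'],
    epitope_defs=['assets/1ssc_A_pocket.json'],
    n_samples=[10],
    out_dir='./output/codesign',
    lengths_range=[(8, 15)],  # left inclusive, right exclusive
)
```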
assets/1ssc_A_pocket.json ADDED
@@ -0,0 +1 @@
+ [["A", [3, " "]], ["A", [4, " "]], ["A", [5, " "]], ["A", [6, " "]], ["A", [7, " "]], ["A", [8, " "]], ["A", [9, " "]], ["A", [11, " "]], ["A", [12, " "]], ["A", [13, " "]], ["A", [43, " "]], ["A", [44, " "]], ["A", [45, " "]], ["A", [46, " "]], ["A", [47, " "]], ["A", [51, " "]], ["A", [54, " "]], ["A", [55, " "]], ["A", [56, " "]], ["A", [57, " "]], ["A", [58, " "]], ["A", [59, " "]], ["A", [63, " "]], ["A", [64, " "]], ["A", [65, " "]], ["A", [66, " "]], ["A", [67, " "]], ["A", [69, " "]], ["A", [71, " "]], ["A", [72, " "]], ["A", [73, " "]], ["A", [74, " "]], ["A", [75, " "]], ["A", [78, " "]], ["A", [79, " "]], ["A", [81, " "]], ["A", [83, " "]], ["A", [102, " "]], ["A", [103, " "]], ["A", [104, " "]], ["A", [105, " "]], ["A", [106, " "]], ["A", [107, " "]], ["A", [108, " "]], ["A", [109, " "]], ["A", [110, " "]], ["A", [111, " "]], ["A", [112, " "]]]
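
The pocket file is a JSON list of `[chain_id, [residue_number, insertion_code]]` entries (a single space means no insertion code), which is exactly what `get_epitope` in `api/run.py` reads back. A small sketch of parsing it:

```python
import json

with open('assets/1ssc_A_pocket.json') as fin:
    pocket = json.load(fin)

# Each entry is [chain_id, [residue_number, insertion_code]];
# ' ' as the insertion code means none.
for chain_id, (resnum, icode) in pocket[:3]:
    print(chain_id, resnum, repr(icode))
```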
cal_metrics.py ADDED
@@ -0,0 +1,228 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import argparse
+ import json
+ import os
+ import random
+ from copy import deepcopy
+ from collections import defaultdict
+ from tqdm import tqdm
+ from tqdm.contrib.concurrent import process_map
+ import statistics
+ import warnings
+ warnings.filterwarnings("ignore")
+
+ import numpy as np
+ from scipy.stats import spearmanr
+
+ from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
+ from evaluation import diversity
+ from evaluation.dockq import dockq
+ from evaluation.rmsd import compute_rmsd
+ from utils.random_seed import setup_seed
+ from evaluation.seq_metric import aar, slide_aar
+
+
+ def _get_ref_pdb(_id, root_dir):
+     return os.path.join(root_dir, 'references', f'{_id}_ref.pdb')
+
+
+ def _get_gen_pdb(_id, number, root_dir, use_rosetta):
+     suffix = '_rosetta' if use_rosetta else ''
+     return os.path.join(root_dir, 'candidates', _id, f'{_id}_gen_{number}{suffix}.pdb')
+
+
+ def cal_metrics(items):
+     # all of the items are conditioned on the same binding pocket
+     root_dir = items[0]['root_dir']
+     ref_pdb, rec_chain, lig_chain = items[0]['ref_pdb'], items[0]['rec_chain'], items[0]['lig_chain']
+     ref_pdb = _get_ref_pdb(items[0]['id'], root_dir)
+     seq_only, struct_only, backbone_only = items[0]['seq_only'], items[0]['struct_only'], items[0]['backbone_only']
+
+     # prepare
+     results = defaultdict(list)
+     cand_seqs, cand_ca_xs = [], []
+     rec_blocks, ref_pep_blocks = pdb_to_list_blocks(ref_pdb, [rec_chain, lig_chain])
+     ref_ca_x, ca_mask = [], []
+     for ref_block in ref_pep_blocks:
+         if ref_block.has_unit('CA'):
+             ca_mask.append(1)
+             ref_ca_x.append(ref_block.get_unit_by_name('CA').get_coord())
+         else:
+             ca_mask.append(0)
+             ref_ca_x.append([0, 0, 0])
+     ref_ca_x, ca_mask = np.array(ref_ca_x), np.array(ca_mask).astype(bool)
+
+     for item in items:
+         if not struct_only:
+             cand_seqs.append(item['gen_seq'])
+             results['Slide AAR'].append(slide_aar(item['gen_seq'], item['ref_seq'], aar))
+
+         # structure metrics
+         gen_pdb = _get_gen_pdb(item['id'], item['number'], root_dir, item['rosetta'])
+         _, gen_pep_blocks = pdb_to_list_blocks(gen_pdb, [rec_chain, lig_chain])
+         assert len(gen_pep_blocks) == len(ref_pep_blocks), f'{item}\t{len(ref_pep_blocks)}\t{len(gen_pep_blocks)}'
+
+         # CA RMSD
+         gen_ca_x = np.array([block.get_unit_by_name('CA').get_coord() for block in gen_pep_blocks])
+         cand_ca_xs.append(gen_ca_x)
+         rmsd = compute_rmsd(ref_ca_x[ca_mask], gen_ca_x[ca_mask], aligned=True)
+         results['RMSD(CA)'].append(rmsd)
+         if struct_only:
+             results['RMSD<=2.0'].append(1 if rmsd <= 2.0 else 0)
+             results['RMSD<=5.0'].append(1 if rmsd <= 5.0 else 0)
+             results['RMSD<=10.0'].append(1 if rmsd <= 10.0 else 0)
+
+         if backbone_only:
+             continue
+
+         # DockQ
+         dockq_score = dockq(gen_pdb, ref_pdb, lig_chain)
+         results['DockQ'].append(dockq_score)
+         if struct_only:
+             results['DockQ>=0.23'].append(1 if dockq_score >= 0.23 else 0)
+             results['DockQ>=0.49'].append(1 if dockq_score >= 0.49 else 0)
+             results['DockQ>=0.80'].append(1 if dockq_score >= 0.80 else 0)
+
+         # full-atom RMSD
+         if struct_only:
+             gen_all_x, ref_all_x = [], []
+             for gen_block, ref_block in zip(gen_pep_blocks, ref_pep_blocks):
+                 for ref_atom in ref_block:
+                     if gen_block.has_unit(ref_atom.name):
+                         ref_all_x.append(ref_atom.get_coord())
+                         gen_all_x.append(gen_block.get_unit_by_name(ref_atom.name).get_coord())
+             results['RMSD(full-atom)'].append(compute_rmsd(
+                 np.array(gen_all_x), np.array(ref_all_x), aligned=True
+             ))
+
+     pmets = [item['pmetric'] for item in items]
+     indexes = list(range(len(items)))
+     # aggregation
+     for name in results:
+         vals = results[name]
+         corr = spearmanr(vals, pmets, nan_policy='omit').statistic
+         if np.isnan(corr):
+             corr = 0
+         aggr_res = {
+             'max': max(vals),
+             'min': min(vals),
+             'mean': sum(vals) / len(vals),
+             'random': vals[0],
+             'max*': vals[(max if corr > 0 else min)(indexes, key=lambda i: pmets[i])],
+             'min*': vals[(min if corr > 0 else max)(indexes, key=lambda i: pmets[i])],
+             'pmet_corr': corr,
+             'individual': vals,
+             'individual_pmet': pmets
+         }
+         results[name] = aggr_res
+
+     if len(cand_seqs) > 1 and not seq_only:
+         seq_div, struct_div, co_div, consistency = diversity.diversity(cand_seqs, np.array(cand_ca_xs))
+         results['Sequence Diversity'] = seq_div
+         results['Struct Diversity'] = struct_div
+         results['Codesign Diversity'] = co_div
+         results['Consistency'] = consistency
+
+     return results
+
+
+ def cnt_aa_dist(seqs):
+     cnts = {}
+     for seq in seqs:
+         for aa in seq:
+             if aa not in cnts:
+                 cnts[aa] = 0
+             cnts[aa] += 1
+     aas = sorted(list(cnts.keys()), key=lambda aa: cnts[aa])
+     total = sum(cnts.values())
+     for aa in aas:
+         print(f'\t{aa}: {cnts[aa] / total}')
+
+
+ def main(args):
+     root_dir = os.path.dirname(args.results)
+     # load dG filter
+     if args.filter_dG is None:
+         filter_func = lambda _id, n: True
+     else:
+         dG_results = json.load(open(args.filter_dG, 'r'))
+         filter_func = lambda _id, n: dG_results[_id]['all'][str(n)] < 0
+     # load results
+     with open(args.results, 'r') as fin:
+         lines = fin.read().strip().split('\n')
+     id2items = {}
+     for line in lines:
+         item = json.loads(line)
+         _id = item['id']
+         if not filter_func(_id, item['number']):
+             continue
+         if _id not in id2items:
+             id2items[_id] = []
+         item['root_dir'] = root_dir
+         item['rosetta'] = args.rosetta
+         id2items[_id].append(item)
+     ids = list(id2items.keys())
+
+     if args.filter_dG is not None:
+         # drop targets with only one remaining sample, since diversity cannot be computed for them
+         del_ids = [_id for _id in ids if len(id2items[_id]) < 2]
+         for _id in del_ids:
+             print(f'Deleting {_id} since only one of its samples passed the filter')
+             del id2items[_id]
+         ids = list(id2items.keys())  # refresh so indices stay aligned with the metrics below
+
+     if args.num_workers > 1:
+         metrics = process_map(cal_metrics, id2items.values(), max_workers=args.num_workers, chunksize=1)
+     else:
+         metrics = [cal_metrics(inputs) for inputs in tqdm(id2items.values())]
+
+     eval_results_path = os.path.join(os.path.dirname(args.results), 'eval_report.json')
+     with open(eval_results_path, 'w') as fout:
+         for i, _id in enumerate(id2items):
+             metric = deepcopy(metrics[i])
+             metric['id'] = _id
+             fout.write(json.dumps(metric) + '\n')
+
+     # individual level results
+     print('Point-wise evaluation results:')
+     for name in metrics[0]:
+         vals = [item[name] for item in metrics]
+         if isinstance(vals[0], dict):
+             if 'RMSD' in name and '<=' not in name:
+                 aggr = 'min'
+             else:
+                 aggr = 'max'
+             aggr_vals = [val[aggr] for val in vals]
+             if '>=' in name or '<=' in name:  # percentage
+                 print(f'{name}: {sum(aggr_vals) / len(aggr_vals)}')
+             else:
+                 if 'RMSD' in name:
+                     print(f'{name}(median): {statistics.median(aggr_vals)}')  # unbounded, so extreme values affect the mean but not the median
+                 else:
+                     print(f'{name}(mean): {sum(aggr_vals) / len(aggr_vals)}')
+             lowest_i = min([i for i in range(len(aggr_vals))], key=lambda i: aggr_vals[i])
+             highest_i = max([i for i in range(len(aggr_vals))], key=lambda i: aggr_vals[i])
+             print(f'\tlowest: {aggr_vals[lowest_i]}, id: {ids[lowest_i]}', end='')
+             print(f'\thighest: {aggr_vals[highest_i]}, id: {ids[highest_i]}')
+         else:
+             print(f'{name} (mean): {sum(vals) / len(vals)}')
+             lowest_i = min([i for i in range(len(vals))], key=lambda i: vals[i])
+             highest_i = max([i for i in range(len(vals))], key=lambda i: vals[i])
+             print(f'\tlowest: {vals[lowest_i]}, id: {ids[lowest_i]}')
+             print(f'\thighest: {vals[highest_i]}, id: {ids[highest_i]}')
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description='calculate metrics')
+     parser.add_argument('--results', type=str, required=True, help='Path to the results.jsonl produced by generate.py')
+     parser.add_argument('--num_workers', type=int, default=8, help='Number of workers to use')
+     parser.add_argument('--rosetta', action='store_true', help='Use the rosetta-refined structure')
+     parser.add_argument('--filter_dG', type=str, default=None, help='Only calculate results on samples with dG<0')
+
+     return parser.parse_args()
+
+
+ if __name__ == '__main__':
+     setup_seed(0)
+     main(parse())
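
`compute_rmsd` comes from `evaluation/rmsd.py`, which is not shown in this view. For reference, a standard RMSD sketch under the assumption that `aligned=True` means the coordinates are compared as-is, without superposition:

```python
import numpy as np

def rmsd_sketch(x: np.ndarray, y: np.ndarray) -> float:
    # x, y: [N, 3] coordinates of matched atoms; no superposition is
    # performed here, which is presumably what aligned=True means above.
    assert x.shape == y.shape
    return float(np.sqrt(np.mean(np.sum((x - y) ** 2, axis=-1))))
```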
configs/pepbdb/autoencoder/train_codesign.yaml ADDED
@@ -0,0 +1,66 @@
+ dataset:
+   train:
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/pepbdb/processed
+       specify_index: ./datasets/pepbdb/processed/train_index.txt
+       backbone_only: false
+       cluster: ./datasets/pepbdb/train.cluster
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/ProtFrag/processed
+       backbone_only: false
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/valid_index.txt
+     backbone_only: false
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~24
+
+ trainer:
+   class: AutoEncoderTrainer
+   config:
+     max_epoch: 100
+     save_topk: 10
+     save_dir: ./ckpts/autoencoder_codesign_pepbdb
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.8
+   patience: 5
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: AutoEncoder
+   embed_size: 128
+   hidden_size: 128
+   latent_size: 8
+   latent_n_channel: 1
+   n_layers: 3
+   n_channel: 14 # all atom
+   h_kl_weight: 0.3
+   z_kl_weight: 0.5
+   coord_loss_ratio: 0.5
+   coord_loss_weights:
+     Xloss: 1.0
+     ca_Xloss: 1.0
+     bb_bond_lengths_loss: 1.0
+     sc_bond_lengths_loss: 1.0
+     bb_dihedral_angles_loss: 0.0
+     sc_chi_angles_loss: 0.5
+   relative_position: false
+   anchor_at_ca: true
+   mask_ratio: 0.25
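
`DynamicBatchWrapper` (defined in `data/dataset_wrapper.py`, listed above) packs variable-size complexes so that the summed per-sample cost, here `n**2`, stays below `ubound_per_batch`; that is where the "batch size ~24" estimate comes from (24 complexes of ~50 residues give 24 * 50**2 = 60000). A toy sketch of that packing rule, not the wrapper's actual interface:

```python
def pack_by_quadratic_cost(lengths, ubound=60000):
    # Greedily group sample indices so that sum(n**2) per batch <= ubound,
    # mimicking complexity: n**2 with ubound_per_batch: 60000 above.
    batches, cur, cur_cost = [], [], 0
    for i, n in enumerate(lengths):
        cost = n ** 2
        if cur and cur_cost + cost > ubound:
            batches.append(cur)
            cur, cur_cost = [], 0
        cur.append(i)
        cur_cost += cost
    if cur:
        batches.append(cur)
    return batches

print(len(pack_by_quadratic_cost([50] * 48)))  # -> 2 batches of 24 samples each
```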
configs/pepbdb/autoencoder/train_fixseq.yaml ADDED
@@ -0,0 +1,63 @@
+ dataset:
+   train:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/train_index.txt
+     backbone_only: false
+     cluster: ./datasets/pepbdb/train.cluster
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/valid_index.txt
+     backbone_only: false
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~24
+
+ trainer:
+   class: AutoEncoderTrainer
+   config:
+     max_epoch: 150
+     save_topk: 10
+     save_dir: ./ckpts/autoencoder_fixseq
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.8
+   patience: 15
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: AutoEncoder
+   embed_size: 128
+   hidden_size: 128
+   latent_size: 0
+   latent_n_channel: 1
+   n_layers: 3
+   n_channel: 14 # all atom
+   h_kl_weight: 0.0
+   z_kl_weight: 0.6
+   coord_loss_ratio: 1.0
+   coord_loss_weights:
+     Xloss: 1.0
+     ca_Xloss: 1.0
+     bb_bond_lengths_loss: 1.0
+     sc_bond_lengths_loss: 1.0
+     bb_dihedral_angles_loss: 0.0
+     sc_chi_angles_loss: 0.5
+   anchor_at_ca: true
+   mode: fixseq
+   additional_noise_scale: 1.0
configs/pepbdb/ldm/setup_latent_guidance.yaml ADDED
@@ -0,0 +1,12 @@
+ dataset:
+   test:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/train_index.txt
+     backbone_only: false
+
+ dataloader:
+   num_workers: 2
+   batch_size: 32
+
+ backbone_only: false
configs/pepbdb/ldm/train_codesign.yaml ADDED
@@ -0,0 +1,61 @@
+ dataset:
+   train:
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/pepbdb/processed
+       specify_index: ./datasets/pepbdb/processed/train_index.txt
+       backbone_only: false
+       cluster: ./datasets/pepbdb/train.cluster
+       use_covariance_matrix: true
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/valid_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~32
+
+ trainer:
+   class: LDMTrainer
+   criterion: Loss
+   config:
+     max_epoch: 500 # the best checkpoint should be obtained at around epoch 380
+     save_topk: 10
+     val_freq: 10
+     save_dir: ./ckpts/LDM_codesign
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.6
+   patience: 3
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: LDMPepDesign
+   autoencoder_ckpt: ""
+   autoencoder_no_randomness: true
+   hidden_size: 128
+   num_steps: 100
+   n_layers: 3
+   n_rbf: 32
+   cutoff: 3.0 # the coordinates are in standard space
+   dist_rbf: 32
+   dist_rbf_cutoff: 7.0
+   diffusion_opt:
+     trans_seq_type: Diffusion
+     trans_pos_type: Diffusion
+   max_gen_position: 60
configs/pepbdb/ldm/train_fixseq.yaml ADDED
@@ -0,0 +1,63 @@
+ dataset:
+   train:
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/pepbdb/processed
+       specify_index: ./datasets/pepbdb/processed/train_index.txt
+       backbone_only: false
+       cluster: ./datasets/pepbdb/train.cluster
+       use_covariance_matrix: true
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/valid_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~32
+
+ trainer:
+   class: LDMTrainer
+   criterion: RMSD
+   config:
+     max_epoch: 1000 # the best checkpoint will be obtained at about epoch 900
+     save_topk: 10
+     val_freq: 10
+     save_dir: ./ckpts/LDM_fixseq
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.6
+   patience: 3
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: LDMPepDesign
+   autoencoder_ckpt: ""
+   autoencoder_no_randomness: true
+   hidden_size: 128
+   num_steps: 100
+   n_layers: 6
+   n_rbf: 32
+   cutoff: 3.0 # the coordinates are in standard space
+   dist_rbf: 0
+   dist_rbf_cutoff: 0.0
+   diffusion_opt:
+     trans_seq_type: Diffusion
+     trans_pos_type: Diffusion
+     std: 20.0
+   mode: fixseq
+   max_gen_position: 60
configs/pepbdb/test_codesign.yaml ADDED
@@ -0,0 +1,18 @@
+ dataset:
+   test:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/test_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   num_workers: 4
+   batch_size: 64
+
+ backbone_only: false
+ n_samples: 40
+
+ sample_opt:
+   energy_func: default
+   energy_lambda: 0.8
configs/pepbdb/test_fixseq.yaml ADDED
@@ -0,0 +1,19 @@
+ dataset:
+   test:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/pepbdb/processed
+     specify_index: ./datasets/pepbdb/processed/test_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   num_workers: 4
+   batch_size: 64
+
+ backbone_only: false
+ struct_only: true
+ n_samples: 10
+
+ sample_opt:
+   energy_func: default
+   energy_lambda: 0.8
configs/pepbench/autoencoder/train_codesign.yaml ADDED
@@ -0,0 +1,66 @@
+ dataset:
+   train:
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/train_valid/processed
+       specify_index: ./datasets/train_valid/processed/train_index.txt
+       backbone_only: false
+       cluster: ./datasets/train_valid/train.cluster
+     - class: CoDesignDataset
+       mmap_dir: ./datasets/ProtFrag/processed
+       backbone_only: false
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/valid_index.txt
+     backbone_only: false
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~24
+
+ trainer:
+   class: AutoEncoderTrainer
+   config:
+     max_epoch: 100
+     save_topk: 10
+     save_dir: ./ckpts/autoencoder_codesign
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.8
+   patience: 5
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: AutoEncoder
+   embed_size: 128
+   hidden_size: 128
+   latent_size: 8
+   latent_n_channel: 1
+   n_layers: 3
+   n_channel: 14 # all atom
+   h_kl_weight: 0.3
+   z_kl_weight: 0.5
+   coord_loss_ratio: 0.5
+   coord_loss_weights:
+     Xloss: 1.0
+     ca_Xloss: 1.0
+     bb_bond_lengths_loss: 1.0
+     sc_bond_lengths_loss: 1.0
+     bb_dihedral_angles_loss: 0.0
+     sc_chi_angles_loss: 0.5
+   relative_position: false
+   anchor_at_ca: true
+   mask_ratio: 0.25
configs/pepbench/autoencoder/train_fixseq.yaml ADDED
@@ -0,0 +1,62 @@
+ dataset:
+   train:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/train_index.txt
+     backbone_only: false
+     cluster: ./datasets/train_valid/train.cluster
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/valid_index.txt
+     backbone_only: false
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~24
+
+ trainer:
+   class: AutoEncoderTrainer
+   config:
+     max_epoch: 500 # the best checkpoint should be obtained at about epoch 457
+     save_topk: 10
+     save_dir: ./ckpts/autoencoder_fixseq
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.8
+   patience: 15
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: AutoEncoder
+   embed_size: 128
+   hidden_size: 128
+   latent_size: 0
+   latent_n_channel: 1
+   n_layers: 3
+   n_channel: 14 # all atom
+   h_kl_weight: 0.0
+   z_kl_weight: 1.0
+   coord_loss_ratio: 1.0
+   coord_loss_weights:
+     Xloss: 1.0
+     ca_Xloss: 1.0
+     bb_bond_lengths_loss: 1.0
+     sc_bond_lengths_loss: 1.0
+     bb_dihedral_angles_loss: 0.0
+     sc_chi_angles_loss: 0.5
+   anchor_at_ca: true
+   mode: fixseq
configs/pepbench/ldm/setup_latent_guidance.yaml ADDED
@@ -0,0 +1,12 @@
+ dataset:
+   test:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/train_index.txt
+     backbone_only: false
+
+ dataloader:
+   num_workers: 2
+   batch_size: 32
+
+ backbone_only: false
configs/pepbench/ldm/train_codesign.yaml ADDED
@@ -0,0 +1,60 @@
+ dataset:
+   train:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/train_index.txt
+     backbone_only: false
+     cluster: ./datasets/train_valid/train.cluster
+     use_covariance_matrix: true
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/valid_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~32
+
+ trainer:
+   class: LDMTrainer
+   criterion: Loss
+   config:
+     max_epoch: 500 # the best checkpoint should be obtained at around epoch 380
+     save_topk: 10
+     val_freq: 10
+     save_dir: ./ckpts/LDM_codesign
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.6
+   patience: 3
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: LDMPepDesign
+   autoencoder_ckpt: ""
+   autoencoder_no_randomness: true
+   hidden_size: 128
+   num_steps: 100
+   n_layers: 3
+   n_rbf: 32
+   cutoff: 3.0 # the coordinates are in standard space
+   dist_rbf: 32
+   dist_rbf_cutoff: 7.0
+   diffusion_opt:
+     trans_seq_type: Diffusion
+     trans_pos_type: Diffusion
configs/pepbench/ldm/train_fixseq.yaml ADDED
@@ -0,0 +1,61 @@
+ dataset:
+   train:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/train_index.txt
+     backbone_only: false
+     cluster: ./datasets/train_valid/train.cluster
+     use_covariance_matrix: true
+   valid:
+     class: CoDesignDataset
+     mmap_dir: ./datasets/train_valid/processed
+     specify_index: ./datasets/train_valid/processed/valid_index.txt
+     backbone_only: false
+     use_covariance_matrix: true
+
+ dataloader:
+   shuffle: true
+   num_workers: 4
+   wrapper:
+     class: DynamicBatchWrapper
+     complexity: n**2
+     ubound_per_batch: 60000 # batch size ~32
+
+ trainer:
+   class: LDMTrainer
+   criterion: RMSD
+   config:
+     max_epoch: 1000 # the best checkpoint will be obtained at about epoch 720
+     save_topk: 10
+     val_freq: 10
+     save_dir: ./ckpts/LDM_fixseq
+     patience: 10
+     metric_min_better: true
+
+ optimizer:
+   class: AdamW
+   lr: 1.0e-4
+
+ scheduler:
+   class: ReduceLROnPlateau
+   factor: 0.6
+   patience: 3
+   mode: min
+   frequency: val_epoch
+   min_lr: 5.0e-6
+
+ model:
+   class: LDMPepDesign
+   autoencoder_ckpt: ""
+   autoencoder_no_randomness: true
+   hidden_size: 128
+   num_steps: 100
+   n_layers: 3
+   n_rbf: 32
+   cutoff: 3.0 # the coordinates are in standard space
+   dist_rbf: 0
+   dist_rbf_cutoff: 0.0
+   diffusion_opt:
+     trans_seq_type: Diffusion
+     trans_pos_type: Diffusion
+   mode: fixseq
configs/pepbench/test_codesign.yaml ADDED
@@ -0,0 +1,17 @@
1
+ dataset:
2
+ test:
3
+ class: CoDesignDataset
4
+ mmap_dir: ./datasets/LNR/processed
5
+ backbone_only: false
6
+ use_covariance_matrix: true
7
+
8
+ dataloader:
9
+ num_workers: 4
10
+ batch_size: 64
11
+
12
+ backbone_only: false
13
+ n_samples: 40
14
+
15
+ sample_opt:
16
+ energy_func: default
17
+ energy_lambda: 0.8
configs/pepbench/test_fixseq.yaml ADDED
@@ -0,0 +1,18 @@
1
+ dataset:
2
+ test:
3
+ class: CoDesignDataset
4
+ mmap_dir: ./datasets/LNR/processed
5
+ backbone_only: false
6
+ use_covariance_matrix: true
7
+
8
+ dataloader:
9
+ num_workers: 4
10
+ batch_size: 64
11
+
12
+ backbone_only: false
13
+ struct_only: true
14
+ n_samples: 10
15
+
16
+ sample_opt:
17
+ energy_func: default
18
+ energy_lambda: 0.5
data/__init__.py ADDED
@@ -0,0 +1,53 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ from .dataset_wrapper import MixDatasetWrapper
4
+ from .codesign import CoDesignDataset
5
+ from .resample import ClusterResampler
6
+
7
+
8
+ import torch
9
+ from torch.utils.data import DataLoader
10
+
11
+ import utils.register as R
12
+ from utils.logger import print_log
13
+
14
+ def create_dataset(config: dict):
15
+ splits = []
16
+ for split_name in ['train', 'valid', 'test']:
17
+ split_config = config.get(split_name, None)
18
+ if split_config is None:
19
+ splits.append(None)
20
+ continue
21
+ if isinstance(split_config, list):
22
+ dataset = MixDatasetWrapper(
23
+ *[R.construct(cfg) for cfg in split_config]
24
+ )
25
+ else:
26
+ dataset = R.construct(split_config)
27
+ splits.append(dataset)
28
+ return splits # train/valid/test
29
+
30
+
31
+ def create_dataloader(dataset, config: dict, n_gpu: int=1, validation: bool=False):
32
+ if 'wrapper' in config:
33
+ dataset = R.construct(config['wrapper'], dataset=dataset)
34
+ batch_size = config.get('batch_size', n_gpu) # default 1 on each gpu
35
+ if validation:
36
+ batch_size = config.get('val_batch_size', batch_size)
37
+ shuffle = config.get('shuffle', False)
38
+ num_workers = config.get('num_workers', 4)
39
+ collate_fn = dataset.collate_fn if hasattr(dataset, 'collate_fn') else None
40
+ if n_gpu > 1:
41
+ sampler = torch.utils.data.distributed.DistributedSampler(dataset, shuffle=shuffle)
42
+ batch_size = int(batch_size / n_gpu)
43
+ print_log(f'Batch size on a single GPU: {batch_size}')
44
+ else:
45
+ sampler = None
46
+ return DataLoader(
47
+ dataset=dataset,
48
+ batch_size=batch_size,
49
+ num_workers=num_workers,
50
+ shuffle=(shuffle and sampler is None),
51
+ collate_fn=collate_fn,
52
+ sampler=sampler
53
+ )
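A hedged usage sketch of `create_dataset` and `create_dataloader` above; the config dict mirrors the YAML files in this commit, and the dataset path is illustrative:

```python
from data import create_dataset, create_dataloader

dataset_config = {
    'train': {
        'class': 'CoDesignDataset',
        'mmap_dir': './datasets/train_valid/processed',  # illustrative path
        'backbone_only': False,
    },
}
train_set, valid_set, test_set = create_dataset(dataset_config)  # valid/test are None here
loader = create_dataloader(train_set, {'shuffle': True, 'batch_size': 8})
batch = next(iter(loader))
```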
data/codesign.py ADDED
@@ -0,0 +1,208 @@
1
+
2
+ import os
3
+ from typing import Optional, Any
4
+
5
+ import numpy as np
6
+ import torch
7
+ from torch.nn.utils.rnn import pad_sequence
8
+
9
+ from utils import register as R
10
+ from utils.const import sidechain_atoms
11
+
12
+ from data.converter.list_blocks_to_pdb import list_blocks_to_pdb
13
+
14
+ from .format import VOCAB, Block, Atom
15
+ from .mmap_dataset import MMAPDataset
16
+ from .resample import ClusterResampler
17
+
18
+
19
+
20
+ def calculate_covariance_matrix(point_cloud):
21
+ # Calculate the covariance matrix of the point cloud
22
+ covariance_matrix = np.cov(point_cloud, rowvar=False)
23
+ return covariance_matrix
24
+
25
+
26
+ @R.register('CoDesignDataset')
27
+ class CoDesignDataset(MMAPDataset):
28
+
29
+ MAX_N_ATOM = 14
30
+
31
+ def __init__(
32
+ self,
33
+ mmap_dir: str,
34
+ backbone_only: bool, # only backbone (N, CA, C, O) or full-atom
35
+ specify_data: Optional[str] = None,
36
+ specify_index: Optional[str] = None,
37
+ padding_collate: bool = False,
38
+ cluster: Optional[str] = None,
39
+ use_covariance_matrix: bool = False
40
+ ) -> None:
41
+ super().__init__(mmap_dir, specify_data, specify_index)
42
+ self.mmap_dir = mmap_dir
43
+ self.backbone_only = backbone_only
44
+ self._lengths = [len(prop[-1].split(',')) + int(prop[1]) for prop in self._properties]
45
+ self.padding_collate = padding_collate
46
+ self.resampler = ClusterResampler(cluster) if cluster else None # should only be used in training!
47
+ self.use_covariance_matrix = use_covariance_matrix
48
+
49
+ self.dynamic_idxs = [i for i in range(len(self))]
50
+ self.update_epoch() # should be called every epoch
51
+
52
+ def update_epoch(self):
53
+ if self.resampler is not None:
54
+ self.dynamic_idxs = self.resampler(len(self))
55
+
56
+ def get_len(self, idx):
57
+ return self._lengths[self.dynamic_idxs[idx]]
58
+
59
+ def get_summary(self, idx: int):
60
+ props = self._properties[idx]
61
+ _id = self._indexes[idx][0].split('.')[0]
62
+ ref_pdb = os.path.join(self.mmap_dir, '..', 'pdbs', _id + '.pdb')
63
+ rec_chain, lig_chain = props[4], props[5]
64
+ return _id, ref_pdb, rec_chain, lig_chain
65
+
66
+ def __getitem__(self, idx: int):
67
+ idx = self.dynamic_idxs[idx]
68
+ rec_blocks, lig_blocks = super().__getitem__(idx)
69
+ # receptor, (lig_chain_id, lig_blocks) = super().__getitem__(idx)
70
+ # pocket = {}
71
+ # for i in self._properties[idx][-1].split(','):
72
+ # chain, i = i.split(':')
73
+ # if chain not in pocket:
74
+ # pocket[chain] = []
75
+ # pocket[chain].append(int(i))
76
+ # rec_blocks = []
77
+ # for chain_id, blocks in receptor:
78
+ # for i in pocket[chain_id]:
79
+ # rec_blocks.append(blocks[i])
80
+ pocket_idx = [int(i) for i in self._properties[idx][-1].split(',')]
81
+ rec_position_ids = [i + 1 for i, _ in enumerate(rec_blocks)]
82
+ rec_blocks = [rec_blocks[i] for i in pocket_idx]
83
+ rec_position_ids = [rec_position_ids[i] for i in pocket_idx]
84
+ rec_blocks = [Block.from_tuple(tup) for tup in rec_blocks]
85
+ lig_blocks = [Block.from_tuple(tup) for tup in lig_blocks]
86
+
87
+ # for block in lig_blocks:
88
+ # block.units = [Atom('CA', [0, 0, 0], 'C')]
89
+ # if idx == 0:
90
+ # print(self._properties[idx])
91
+ # print(''.join(VOCAB.abrv_to_symbol(block.abrv) for block in lig_blocks))
92
+ # list_blocks_to_pdb([
93
+ # rec_blocks, lig_blocks
94
+ # ], ['B', 'A'], 'pocket.pdb')
95
+
96
+ mask = [0 for _ in rec_blocks] + [1 for _ in lig_blocks]
97
+ position_ids = rec_position_ids + [i + 1 for i, _ in enumerate(lig_blocks)]
98
+ X, S, atom_mask = [], [], []
99
+ for block in rec_blocks + lig_blocks:
100
+ symbol = VOCAB.abrv_to_symbol(block.abrv)
101
+ atom2coord = { unit.name: unit.get_coord() for unit in block.units }
102
+ bb_pos = np.mean(list(atom2coord.values()), axis=0).tolist()
103
+ coords, coord_mask = [], []
104
+ for atom_name in VOCAB.backbone_atoms + sidechain_atoms.get(symbol, []):
105
+ if atom_name in atom2coord:
106
+ coords.append(atom2coord[atom_name])
107
+ coord_mask.append(1)
108
+ else:
109
+ coords.append(bb_pos)
110
+ coord_mask.append(0)
111
+ n_pad = self.MAX_N_ATOM - len(coords)
112
+ for _ in range(n_pad):
113
+ coords.append(bb_pos)
114
+ coord_mask.append(0)
115
+
116
+ X.append(coords)
117
+ S.append(VOCAB.symbol_to_idx(symbol))
118
+ atom_mask.append(coord_mask)
119
+
120
+ X, atom_mask = torch.tensor(X, dtype=torch.float), torch.tensor(atom_mask, dtype=torch.bool)
121
+ mask = torch.tensor(mask, dtype=torch.bool)
122
+ if self.backbone_only:
123
+ X, atom_mask = X[:, :4], atom_mask[:, :4]
124
+
125
+ if self.use_covariance_matrix:
126
+ cov = calculate_covariance_matrix(X[~mask][:, 1][atom_mask[~mask][:, 1]].numpy()) # only use the receptor to derive the affine transformation
127
+ eps = 1e-4
128
+ cov = cov + eps * np.identity(cov.shape[0])
129
+ L = torch.from_numpy(np.linalg.cholesky(cov)).float().unsqueeze(0)
130
+ else:
131
+ L = None
132
+
133
+ item = {
134
+ 'X': X, # [N, 14] or [N, 4] if backbone_only == True
135
+ 'S': torch.tensor(S, dtype=torch.long), # [N]
136
+ 'position_ids': torch.tensor(position_ids, dtype=torch.long), # [N]
137
+ 'mask': mask, # [N], 1 for generation
138
+ 'atom_mask': atom_mask, # [N, 14] or [N, 4], 1 for having records in the PDB
139
+ 'lengths': len(S),
140
+ }
141
+ if L is not None:
142
+ item['L'] = L
143
+ return item
144
+
145
+ def collate_fn(self, batch):
146
+ if self.padding_collate:
147
+ results = {}
148
+ pad_idx = VOCAB.symbol_to_idx(VOCAB.PAD)
149
+ for key in batch[0]:
150
+ values = [item[key] for item in batch]
151
+ if values[0] is None:
152
+ results[key] = None
153
+ continue
154
+ if key == 'lengths':
155
+ results[key] = torch.tensor(values, dtype=torch.long)
156
+ elif key == 'S':
157
+ results[key] = pad_sequence(values, batch_first=True, padding_value=pad_idx)
158
+ else:
159
+ results[key] = pad_sequence(values, batch_first=True, padding_value=0)
160
+ return results
161
+ else:
162
+ results = {}
163
+ for key in batch[0]:
164
+ values = [item[key] for item in batch]
165
+ if values[0] is None:
166
+ results[key] = None
167
+ continue
168
+ if key == 'lengths':
169
+ results[key] = torch.tensor(values, dtype=torch.long)
170
+ else:
171
+ results[key] = torch.cat(values, dim=0)
172
+ return results
173
+
174
+
175
+ @R.register('ShapeDataset')
176
+ class ShapeDataset(CoDesignDataset):
177
+ def __init__(
178
+ self,
179
+ mmap_dir: str,
180
+ specify_data: Optional[str] = None,
181
+ specify_index: Optional[str] = None,
182
+ padding_collate: bool = False,
183
+ cluster: Optional[str] = None
184
+ ) -> None:
185
+ super().__init__(mmap_dir, False, specify_data, specify_index, padding_collate, cluster)
186
+ self.ca_idx = VOCAB.backbone_atoms.index('CA')
187
+
188
+ def __getitem__(self, idx: int):
189
+ item = super().__getitem__(idx)
190
+
191
+ # refine coordinates to CA and the atom furthest from CA
192
+ X = item['X'] # [N, 14, 3]
193
+ atom_mask = item['atom_mask']
194
+ ca_x = X[:, self.ca_idx].unsqueeze(1) # [N, 1, 3]
195
+ sc_x = X[:, 4:] # [N, 10, 3], sidechain atom indexes
196
+ dist = torch.norm(sc_x - ca_x, dim=-1) # [N, 10]
197
+ dist = dist.masked_fill(~atom_mask[:, 4:], 1e10)
198
+ furthest_atom_x = sc_x[torch.arange(sc_x.shape[0]), torch.argmax(dist, dim=-1)] # [N, 3]
199
+ X = torch.cat([ca_x, furthest_atom_x.unsqueeze(1)], dim=1)
200
+
201
+ item['X'] = X
202
+ return item
203
+
204
+
205
+ if __name__ == '__main__':
206
+ import sys
207
+ dataset = CoDesignDataset(sys.argv[1], backbone_only=True)
208
+ print(dataset[0])
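The covariance/Cholesky branch of `__getitem__` can be illustrated in isolation: the receptor CA coordinates define a covariance matrix whose regularized Cholesky factor `L` lets coordinates be mapped into an approximately whitened "standard space". A standalone sketch on a random anisotropic point cloud:

```python
# Standalone illustration of the whitening trick used in CoDesignDataset.__getitem__.
import numpy as np

point_cloud = np.random.randn(200, 3) * np.array([5.0, 2.0, 1.0])  # anisotropic cloud
cov = np.cov(point_cloud, rowvar=False)          # as in calculate_covariance_matrix
cov = cov + 1e-4 * np.identity(3)                # same eps regularization as above
L = np.linalg.cholesky(cov)                      # cov ~= L @ L.T

# z = L^{-1} (x - mu): whitened coordinates have roughly unit covariance
z = (point_cloud - point_cloud.mean(axis=0)) @ np.linalg.inv(L).T
print(np.cov(z, rowvar=False).round(2))          # close to the identity matrix
```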
data/converter/blocks_interface.py ADDED
@@ -0,0 +1,89 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ import numpy as np
4
+
5
+
6
+ def blocks_to_coords(blocks):
7
+ max_n_unit = 0
8
+ coords, masks = [], []
9
+ for block in blocks:
10
+ coords.append([unit.get_coord() for unit in block.units])
11
+ max_n_unit = max(max_n_unit, len(coords[-1]))
12
+ masks.append([1 for _ in coords[-1]])
13
+
14
+ for i in range(len(coords)):
15
+ num_pad = max_n_unit - len(coords[i])
16
+ coords[i] = coords[i] + [[0, 0, 0] for _ in range(num_pad)]
17
+ masks[i] = masks[i] + [0 for _ in range(num_pad)]
18
+
19
+ return np.array(coords), np.array(masks).astype('bool') # [N, M, 3], [N, M], M == max_n_unit, in mask 0 is for padding
20
+
21
+
22
+ def dist_matrix_from_coords(coords1, masks1, coords2, masks2):
23
+ dist = np.linalg.norm(coords1[:, None] - coords2[None, :], axis=-1) # [N1, N2, M]
24
+ dist = dist + np.logical_not(masks1[:, None] * masks2[None, :]) * 1e6 # [N1, N2, M]
25
+ dist = np.min(dist, axis=-1) # [N1, N2]
26
+ return dist
27
+
28
+
29
+ def dist_matrix_from_blocks(blocks1, blocks2):
30
+ blocks_coord, blocks_mask = blocks_to_coords(blocks1 + blocks2)
31
+ blocks1_coord, blocks1_mask = blocks_coord[:len(blocks1)], blocks_mask[:len(blocks1)]
32
+ blocks2_coord, blocks2_mask = blocks_coord[len(blocks1):], blocks_mask[len(blocks1):]
33
+ dist = dist_matrix_from_coords(blocks1_coord, blocks1_mask, blocks2_coord, blocks2_mask)
34
+ return dist
35
+
36
+
37
+ def blocks_interface(blocks1, blocks2, dist_th):
38
+ dist = dist_matrix_from_blocks(blocks1, blocks2)
39
+
40
+ on_interface = dist < dist_th
41
+ indexes1 = np.nonzero(on_interface.sum(axis=1) > 0)[0]
42
+ indexes2 = np.nonzero(on_interface.sum(axis=0) > 0)[0]
43
+
44
+ blocks1 = [blocks1[i] for i in indexes1]
45
+ blocks2 = [blocks2[i] for i in indexes2]
46
+
47
+ return (blocks1, blocks2), (indexes1, indexes2)
48
+
49
+
50
+ def add_cb(input_array):
51
+ # from ProteinMPNN
52
+ #The virtual Cβ coordinates were calculated using ideal angle and bond length definitions: b = Cα - N, c = C - Cα, a = cross(b, c), Cβ = -0.58273431*a + 0.56802827*b - 0.54067466*c + Cα.
53
+ N,CA,C,O = input_array
54
+ b = CA - N
55
+ c = C - CA
56
+ a = np.cross(b,c)
57
+ CB = np.around(-0.58273431*a + 0.56802827*b - 0.54067466*c + CA,3)
58
+ return CB #np.array([N,CA,C,CB,O])
59
+
60
+
61
+ def blocks_to_cb_coords(blocks):
62
+ cb_coords = []
63
+ for block in blocks:
64
+ try:
65
+ cb_coords.append(block.get_unit_by_name('CB').get_coord())
66
+ except KeyError:
67
+ tmp_coord = np.array([
68
+ block.get_unit_by_name('N').get_coord(),
69
+ block.get_unit_by_name('CA').get_coord(),
70
+ block.get_unit_by_name('C').get_coord(),
71
+ block.get_unit_by_name('O').get_coord()
72
+ ])
73
+ cb_coords.append(add_cb(tmp_coord))
74
+ return np.array(cb_coords)
75
+
76
+
77
+ def blocks_cb_interface(blocks1, blocks2, dist_th=8.0):
78
+ cb_coords1 = blocks_to_cb_coords(blocks1)
79
+ cb_coords2 = blocks_to_cb_coords(blocks2)
80
+ dist = np.linalg.norm(cb_coords1[:, None] - cb_coords2[None, :], axis=-1) # [N1, N2]
81
+
82
+ on_interface = dist < dist_th
83
+ indexes1 = np.nonzero(on_interface.sum(axis=1) > 0)[0]
84
+ indexes2 = np.nonzero(on_interface.sum(axis=0) > 0)[0]
85
+
86
+ blocks1 = [blocks1[i] for i in indexes1]
87
+ blocks2 = [blocks2[i] for i in indexes2]
88
+
89
+ return (blocks1, blocks2), (indexes1, indexes2)
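The virtual-Cβ formula in `add_cb` can be checked numerically: on a roughly idealized backbone fragment, the reconstructed CB should land about 1.5 Å from CA (the coordinates below are approximate and for illustration only):

```python
import numpy as np
from data.converter.blocks_interface import add_cb

# Rough idealized backbone geometry (angstroms); O is unused by the formula.
N  = np.array([1.458, 0.000, 0.000])
CA = np.array([0.000, 0.000, 0.000])
C  = np.array([-0.551, 1.420, 0.000])
O  = np.array([-1.692, 1.541, 0.451])

CB = add_cb(np.array([N, CA, C, O]))
print(np.linalg.norm(CB - CA))  # ~1.5, a typical CA-CB bond length
```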
data/converter/blocks_to_data.py ADDED
@@ -0,0 +1,110 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ from typing import List
4
+
5
+ import numpy as np
6
+
7
+ from data.format import VOCAB, Block
8
+ from utils import const
9
+
10
+
11
+ def blocks_to_data(*blocks_list: List[List[Block]]):
12
+ B, A, X, atom_positions, block_lengths, segment_ids = [], [], [], [], [], []
13
+ atom_mask, is_ca = [], []
14
+ topo_edge_index, topo_edge_attr, atom_names = [], [], []
15
+ last_c_node_id = None
16
+ for i, blocks in enumerate(blocks_list):
17
+ if len(blocks) == 0:
18
+ continue
19
+ cur_B, cur_A, cur_X, cur_atom_positions, cur_block_lengths = [], [], [], [], []
20
+ cur_atom_mask, cur_is_ca = [], []
21
+ # other nodes
22
+ for block in blocks:
23
+ b, symbol = VOCAB.abrv_to_idx(block.abrv), VOCAB.abrv_to_symbol(block.abrv)
24
+ x, a, positions, m, ca = [], [], [], [], []
25
+ atom2node_id = {}
26
+ if symbol == '?':
27
+ atom_missing = {}
28
+ else:
29
+ atom_missing = { atom_name: True for atom_name in const.backbone_atoms + const.sidechain_atoms[symbol] }
30
+ for atom in block:
31
+ atom2node_id[atom.name] = len(A) + len(cur_A) + len(a)
32
+ a.append(VOCAB.atom_to_idx(atom.get_element()))
33
+ x.append(atom.get_coord())
34
+ pos_code = ''.join((c for c in atom.get_pos_code() if not c.isdigit()))
35
+ positions.append(VOCAB.atom_pos_to_idx(pos_code))
36
+ if atom.name in atom_missing:
37
+ atom_missing[atom.name] = False
38
+ m.append(1)
39
+ ca.append(atom.name == 'CA')
40
+ atom_names.append(atom.name)
41
+ for atom_name in atom_missing:
42
+ if atom_missing[atom_name]:
43
+ atom2node_id[atom_name] = len(A) + len(cur_A) + len(a)
44
+ a.append(VOCAB.atom_to_idx(atom_name[0])) # only C, N, O, S in proteins
45
+ x.append([0, 0, 0])
46
+ pos_code = ''.join((c for c in atom_name[1:] if not c.isdigit()))
47
+ positions.append(VOCAB.atom_pos_to_idx(pos_code))
48
+ m.append(0)
49
+ ca.append(atom_name == 'CA')
50
+ atom_names.append(atom_name)
51
+ block_len = len(a)
52
+ cur_B.append(b)
53
+ cur_A.extend(a)
54
+ cur_X.extend(x)
55
+ cur_atom_positions.extend(positions)
56
+ cur_block_lengths.append(block_len)
57
+ cur_atom_mask.extend(m)
58
+ cur_is_ca.extend(ca)
59
+
60
+ # topology edges
61
+ for src, dst, bond_type in const.sidechain_bonds.get(VOCAB.abrv_to_symbol(block.abrv), []):
62
+ src, dst = atom2node_id[src], atom2node_id[dst]
63
+ topo_edge_index.append((src, dst)) # no direction
64
+ topo_edge_index.append((dst, src))
65
+ topo_edge_attr.append(bond_type)
66
+ topo_edge_attr.append(bond_type)
67
+ if last_c_node_id is not None and ('CA' in atom2node_id):
68
+ src, dst = last_c_node_id, atom2node_id['N']
69
+ topo_edge_index.append((src, dst)) # no direction
70
+ topo_edge_index.append((dst, src))
71
+ topo_edge_attr.append(4)
72
+ topo_edge_attr.append(4)
73
+ if 'CA' not in atom2node_id:
74
+ last_c_node_id = None
75
+ else:
76
+ last_c_node_id = atom2node_id['C']
77
+
78
+ # update coordinates of the global node to the center
79
+ # cur_X[0] = np.mean(cur_X[1:], axis=0)
80
+ cur_segment_ids = [i for _ in cur_B]
81
+
82
+ # finish these blocks
83
+ B.extend(cur_B)
84
+ A.extend(cur_A)
85
+ X.extend(cur_X)
86
+ atom_positions.extend(cur_atom_positions)
87
+ block_lengths.extend(cur_block_lengths)
88
+ segment_ids.extend(cur_segment_ids)
89
+ atom_mask.extend(cur_atom_mask)
90
+ is_ca.extend(cur_is_ca)
91
+
92
+ X = np.array(X).tolist()
93
+ topo_edge_index = np.array(topo_edge_index).T.tolist()
94
+ topo_edge_attr = (np.array(topo_edge_attr) - 1).tolist() # type starts from 0 but bond type starts from 1
95
+
96
+ data = {
97
+ 'X': X, # [Natom, 3]
98
+ 'B': B, # [Nb], block (residue) type
99
+ 'A': A, # [Natom]
100
+ 'atom_positions': atom_positions, # [Natom]
101
+ 'block_lengths': block_lengths, # [Nresidue]
102
+ 'segment_ids': segment_ids, # [Nresidue]
103
+ 'atom_mask': atom_mask, # [Natom]
104
+ 'is_ca': is_ca, # [Natom]
105
+ 'atom_names': atom_names, # [Natom]
106
+ 'topo_edge_index': topo_edge_index, # atom level
107
+ 'topo_edge_attr': topo_edge_attr
108
+ }
109
+
110
+ return data
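A short usage sketch for `blocks_to_data`, assuming chains parsed by `pdb_to_list_blocks` below (`complex.pdb` and the chain ids are placeholders):

```python
from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
from data.converter.blocks_to_data import blocks_to_data

rec_blocks, lig_blocks = pdb_to_list_blocks('complex.pdb', selected_chains=['A', 'B'])
data = blocks_to_data(rec_blocks, lig_blocks)
print(len(data['B']), len(data['A']))  # number of residues, number of atoms
```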
data/converter/list_blocks_to_pdb.py ADDED
@@ -0,0 +1,61 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ import os
4
+ from typing import List
5
+
6
+ import numpy as np
7
+
8
+ from Bio.PDB import PDBParser, PDBIO
9
+ from Bio.PDB.Structure import Structure as BStructure
10
+ from Bio.PDB.Model import Model as BModel
11
+ from Bio.PDB.Chain import Chain as BChain
12
+ from Bio.PDB.Residue import Residue as BResidue
13
+ from Bio.PDB.Atom import Atom as BAtom
14
+
15
+ from data.format import Block, Atom, VOCAB
16
+
17
+
18
+ def list_blocks_to_pdb(list_blocks: List[List[Block]], chain_names: List[str], out_path: str) -> None:
19
+ '''
20
+ Write a list of lists of blocks to a pdb file using Biopython.
21
+ Each list of blocks becomes one chain in the output.
22
+
23
+ Parameters:
24
+ list_blocks: A list of lists of blocks. Each list of blocks will be parsed into one chain in the pdb
25
+ chain_names: name of chains
26
+ out_path: Path to the pdb file
27
+
28
+ '''
29
+ pdb_id = os.path.basename(os.path.splitext(out_path)[0])
30
+ structure = BStructure(id=pdb_id)
31
+ model = BModel(id=0)
32
+ for blocks, chain_name in zip(list_blocks, chain_names):
33
+ chain = BChain(id=chain_name)
34
+ for i, block in enumerate(blocks):
35
+ chain.add(_block_to_biopython(block, i))
36
+ model.add(chain)
37
+ structure.add(model)
38
+ io = PDBIO()
39
+ io.set_structure(structure)
40
+ io.save(out_path)
41
+
42
+
43
+ def _block_to_biopython(block: Block, pos_code: int) -> BResidue:
44
+ _id = (' ', pos_code, ' ')
45
+ residue = BResidue(_id, block.abrv, ' ')
46
+ for i, atom in enumerate(block):
47
+ fullname = ' ' + atom.name
48
+ while len(fullname) < 4:
49
+ fullname += ' '
50
+ bio_atom = BAtom(
51
+ name=atom.name,
52
+ coord=np.array(atom.coordinate, dtype=np.float32),
53
+ bfactor=0,
54
+ occupancy=1.0,
55
+ altloc=' ',
56
+ fullname=fullname,
57
+ serial_number=i,
58
+ element=atom.element
59
+ )
60
+ residue.add(bio_atom)
61
+ return residue
data/converter/pdb_to_list_blocks.py ADDED
@@ -0,0 +1,99 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ from typing import Dict, List, Optional, Union
4
+
5
+ from Bio.PDB import PDBParser
6
+
7
+ from data.format import Block, Atom
8
+
9
+
10
+ def pdb_to_list_blocks(pdb: str, selected_chains: Optional[List[str]]=None, return_chain_ids: bool=False, dict_form: bool=False) -> Union[List[List[Block]], Dict[str, List[Block]]]:
11
+ '''
12
+ Convert pdb file to a list of lists of blocks using Biopython.
13
+ Each chain will be a list of blocks.
14
+
15
+ Parameters:
16
+ pdb: Path to the pdb file
17
+ selected_chains: List of selected chain ids. The returned list will be ordered
18
+ according to the ordering of chain ids in this parameter. If not specified,
19
+ all chains will be returned. e.g. ['A', 'B']
20
+ return_chain_ids: Whether to return the ids of each chain
21
+ dict_form: Whether to return chains in dict form (chain id as the key and blocks
22
+ as the value)
23
+
24
+ Returns:
25
+ A list of lists of blocks. Each chain in the pdb file will be parsed into
26
+ one list of blocks.
27
+ example:
28
+ [
29
+ [residueA1, residueA2, ...], # chain A
30
+ [residueB1, residueB2, ...] # chain B
31
+ ],
32
+ where each residue is instantiated by Block data class.
33
+ '''
34
+
35
+ parser = PDBParser(QUIET=True)
36
+ structure = parser.get_structure('anonym', pdb)
37
+
38
+ list_blocks, chain_ids, chains = [], {}, []
39
+
40
+ for model in structure.get_models(): # use model 1 only
41
+ structure = model
42
+ break
43
+
44
+ for chain in structure.get_chains():
45
+
46
+ _id = chain.get_id()
47
+ if (selected_chains is not None) and (_id not in selected_chains):
48
+ continue
49
+
50
+ residues, res_ids = [], {}
51
+
52
+ for residue in chain:
53
+ abrv = residue.get_resname()
54
+ hetero_flag, res_number, insert_code = residue.get_id()
55
+ res_id = f'{res_number}-{insert_code}'
56
+ if hetero_flag == 'W':
57
+ continue # water residue (hetero flag 'W', e.g. HOH/WAT)
58
+ if hetero_flag.strip() != '' and res_id in res_ids:
59
+ continue # the solvent (e.g. H_EDO (EDO))
60
+ if abrv in ['EDO', 'HOH', 'BME']: # solvent or other molecules
61
+ continue
62
+ if abrv == 'MSE':
63
+ abrv = 'MET' # MSE (selenomethionine) often substitutes MET in crystal structures; map it back to MET
64
+
65
+ # filter Hs because not all data include them
66
+ atoms = [ Atom(atom.get_id(), atom.get_coord().tolist(), atom.element) for atom in residue if atom.element != 'H' ]
67
+ block = Block(abrv, atoms, id=(res_number, insert_code))
68
+ if block.is_residue():
69
+ residues.append(block)
70
+ res_ids[res_id] = True
71
+
72
+ if len(residues) == 0: # not a chain
73
+ continue
74
+
75
+ chain_ids[_id] = len(list_blocks)
76
+ list_blocks.append(residues)
77
+ chains.append(_id)
78
+
79
+ # reorder
80
+ if selected_chains is not None:
81
+ list_blocks = [list_blocks[chain_ids[chain_id]] for chain_id in selected_chains]
82
+ chains = selected_chains
83
+
84
+ if dict_form:
85
+ return { chain: blocks for chain, blocks in zip(chains, list_blocks)}
86
+
87
+ if return_chain_ids:
88
+ return list_blocks, chains
89
+
90
+ return list_blocks
91
+
92
+
93
+ if __name__ == '__main__':
94
+ import sys
95
+ list_blocks = pdb_to_list_blocks(sys.argv[1])
96
+ print(f'{sys.argv[1]} parsed')
97
+ print(f'number of chains: {len(list_blocks)}')
98
+ for i, chain in enumerate(list_blocks):
99
+ print(f'chain {i} lengths: {len(chain)}')
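Together with `list_blocks_to_pdb` above, this parser supports a simple round trip (paths and chain ids are placeholders):

```python
from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
from data.converter.list_blocks_to_pdb import list_blocks_to_pdb

chains, chain_ids = pdb_to_list_blocks('input.pdb', return_chain_ids=True)
list_blocks_to_pdb(chains, chain_ids, 'roundtrip.pdb')
```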
data/dataset_wrapper.py ADDED
@@ -0,0 +1,115 @@
1
+ from typing import Callable
2
+ from tqdm import tqdm
3
+ from math import log
4
+
5
+ import numpy as np
6
+ import torch
7
+ import sympy
8
+
9
+ from utils import register as R
10
+
11
+
12
+ class MixDatasetWrapper(torch.utils.data.Dataset):
13
+ def __init__(self, *datasets, collate_fn: Callable=None) -> None:
14
+ super().__init__()
15
+ self.datasets = datasets
16
+ self.cum_len = []
17
+ self.total_len = 0
18
+ for dataset in datasets:
19
+ self.total_len += len(dataset)
20
+ self.cum_len.append(self.total_len)
21
+ self.collate_fn = self.datasets[0].collate_fn if collate_fn is None else collate_fn
22
+ if hasattr(datasets[0], '_lengths'):
23
+ self._lengths = []
24
+ for dataset in datasets:
25
+ self._lengths.extend(dataset._lengths)
26
+
27
+ def update_epoch(self):
28
+ for dataset in self.datasets:
29
+ if hasattr(dataset, 'update_epoch'):
30
+ dataset.update_epoch()
31
+
32
+ def get_len(self, idx):
33
+ return self._lengths[idx]
34
+
35
+ def __len__(self):
36
+ return self.total_len
37
+
38
+ def __getitem__(self, idx):
39
+ last_cum_len = 0
40
+ for i, cum_len in enumerate(self.cum_len):
41
+ if idx < cum_len:
42
+ return self.datasets[i].__getitem__(idx - last_cum_len)
43
+ last_cum_len = cum_len
44
+ return None # this is not possible
45
+
46
+
47
+ @R.register('DynamicBatchWrapper')
48
+ class DynamicBatchWrapper(torch.utils.data.Dataset):
49
+ def __init__(self, dataset, complexity, ubound_per_batch) -> None:
50
+ super().__init__()
51
+ self.dataset = dataset
52
+ self.indexes = [i for i in range(len(dataset))]
53
+ self.complexity = complexity
54
+ self.eval_func = sympy.lambdify('n', sympy.simplify(complexity))
55
+ self.ubound_per_batch = ubound_per_batch
56
+ self.total_size = None
57
+ self.batch_indexes = []
58
+ self._form_batch()
59
+
60
+ def __getattr__(self, attr):
61
+ if attr in self.__dict__:
62
+ return self.__dict__[attr]
63
+ elif hasattr(self.dataset, attr):
64
+ return getattr(self.dataset, attr)
65
+ else:
66
+ raise AttributeError(f"'DynamicBatchWrapper'(or '{type(self.dataset)}') object has no attribute '{attr}'")
67
+
68
+ def update_epoch(self):
69
+ if hasattr(self.dataset, 'update_epoch'):
70
+ self.dataset.update_epoch()
71
+ self._form_batch()
72
+
73
+ ########## overload with your criterion ##########
74
+ def _form_batch(self):
75
+
76
+ np.random.shuffle(self.indexes)
77
+ last_batch_indexes = self.batch_indexes
78
+ self.batch_indexes = []
79
+
80
+ cur_complexity = 0
81
+ batch = []
82
+
83
+ for i in tqdm(self.indexes):
84
+ item_len = self.eval_func(self.dataset.get_len(i))
85
+ if item_len > self.ubound_per_batch:
86
+ continue
87
+ cur_complexity += item_len
88
+ if cur_complexity > self.ubound_per_batch:
89
+ self.batch_indexes.append(batch)
90
+ batch = []
91
+ cur_complexity = item_len
92
+ batch.append(i)
93
+ self.batch_indexes.append(batch)
94
+
95
+ if self.total_size is None:
96
+ self.total_size = len(self.batch_indexes)
97
+ else:
98
+ # control the lengths of the dataset, otherwise the dataloader will raise error
99
+ if len(self.batch_indexes) < self.total_size:
100
+ num_add = self.total_size - len(self.batch_indexes)
101
+ self.batch_indexes = self.batch_indexes + last_batch_indexes[:num_add]
102
+ else:
103
+ self.batch_indexes = self.batch_indexes[:self.total_size]
104
+
105
+ def __len__(self):
106
+ return len(self.batch_indexes)
107
+
108
+ def __getitem__(self, idx):
109
+ return [self.dataset[i] for i in self.batch_indexes[idx]]
110
+
111
+ def collate_fn(self, batched_batch):
112
+ batch = []
113
+ for minibatch in batched_batch:
114
+ batch.extend(minibatch)
115
+ return self.dataset.collate_fn(batch)
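A sketch of how `DynamicBatchWrapper` is meant to be consumed; here `dataset` stands for any dataset exposing `get_len` and `collate_fn` (e.g. `CoDesignDataset`):

```python
from torch.utils.data import DataLoader
from data.dataset_wrapper import DynamicBatchWrapper

wrapped = DynamicBatchWrapper(dataset, complexity='n**2', ubound_per_batch=60000)
# each wrapped[i] is already a full variable-size batch, hence batch_size=1 outside
loader = DataLoader(wrapped, batch_size=1, collate_fn=wrapped.collate_fn)
```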
data/format.py ADDED
@@ -0,0 +1,220 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ from copy import copy
4
+ from typing import List, Tuple, Iterator, Optional
5
+
6
+ from utils import const
7
+
8
+
9
+ class MoleculeVocab:
10
+
11
+ MAX_ATOM_NUMBER = 14
12
+
13
+ def __init__(self):
14
+ self.backbone_atoms = ['N', 'CA', 'C', 'O']
15
+ self.PAD, self.MASK, self.UNK, self.LAT = '#', '*', '?', '&' # pad / mask / unk / latent node
16
+ specials = [# special added
17
+ (self.PAD, 'PAD'), (self.MASK, 'MASK'), (self.UNK, 'UNK'), # pad / mask / unk
18
+ (self.LAT, '<L>') # latent node in latent space
19
+ ]
20
+
21
+ aas = const.aas
22
+
23
+ # sms = [(e.lower(), e) for e in const.periodic_table]
24
+ sms = [] # disable small molecule vocabulary
25
+
26
+ self.atom_pad, self.atom_mask, self.atom_latent = 'pad', 'msk', 'lat' # Avoid conflict with atom P
27
+ self.atom_pos_pad, self.atom_pos_mask, self.atom_pos_latent = 'pad', 'msk', 'lat'
28
+ self.atom_pos_sm = 'sml' # small molecule
29
+
30
+ # block level vocab
31
+ self.idx2block = specials + aas + sms
32
+ self.symbol2idx, self.abrv2idx = {}, {}
33
+ for i, (symbol, abrv) in enumerate(self.idx2block):
34
+ self.symbol2idx[symbol] = i
35
+ self.abrv2idx[abrv] = i
36
+ self.special_mask = [1 for _ in specials] + [0 for _ in aas] + [0 for _ in sms]
37
+
38
+ # atom level vocab
39
+ self.idx2atom = [self.atom_pad, self.atom_mask, self.atom_latent] + const.periodic_table
40
+ self.idx2atom_pos = [self.atom_pos_pad, self.atom_pos_mask, self.atom_pos_latent, '', 'A', 'B', 'G', 'D', 'E', 'Z', 'H', 'XT', 'P', self.atom_pos_sm] # SM is for atoms in small molecule, 'P' for O1P, O2P, O3P
41
+ self.atom2idx, self.atom_pos2idx = {}, {}
42
+ self.atom2idx = {}
43
+ for i, atom in enumerate(self.idx2atom):
44
+ self.atom2idx[atom] = i
45
+ for i, atom_pos in enumerate(self.idx2atom_pos):
46
+ self.atom_pos2idx[atom_pos] = i
47
+
48
+ # block level APIs
49
+
50
+ def abrv_to_symbol(self, abrv):
51
+ idx = self.abrv_to_idx(abrv)
52
+ return None if idx is None else self.idx2block[idx][0]
53
+
54
+ def symbol_to_abrv(self, symbol):
55
+ idx = self.symbol_to_idx(symbol)
56
+ return None if idx is None else self.idx2block[idx][1]
57
+
58
+ def abrv_to_idx(self, abrv):
59
+ abrv = abrv.upper()
60
+ return self.abrv2idx.get(abrv, self.abrv2idx['UNK'])
61
+
62
+ def symbol_to_idx(self, symbol):
63
+ # symbol = symbol.upper()
64
+ return self.symbol2idx.get(symbol, self.abrv2idx['UNK'])
65
+
66
+ def idx_to_symbol(self, idx):
67
+ return self.idx2block[idx][0]
68
+
69
+ def idx_to_abrv(self, idx):
70
+ return self.idx2block[idx][1]
71
+
72
+ def get_pad_idx(self):
73
+ return self.symbol_to_idx(self.PAD)
74
+
75
+ def get_mask_idx(self):
76
+ return self.symbol_to_idx(self.MASK)
77
+
78
+ def get_special_mask(self):
79
+ return copy(self.special_mask)
80
+
81
+ # atom level APIs
82
+
83
+ def get_atom_pad_idx(self):
84
+ return self.atom2idx[self.atom_pad]
85
+
86
+ def get_atom_mask_idx(self):
87
+ return self.atom2idx[self.atom_mask]
88
+
89
+ def get_atom_latent_idx(self):
90
+ return self.atom2idx[self.atom_latent]
91
+
92
+ def get_atom_pos_pad_idx(self):
93
+ return self.atom_pos2idx[self.atom_pos_pad]
94
+
95
+ def get_atom_pos_mask_idx(self):
96
+ return self.atom_pos2idx[self.atom_pos_mask]
97
+
98
+ def get_atom_pos_latent_idx(self):
99
+ return self.atom_pos2idx[self.atom_pos_latent]
100
+
101
+ def idx_to_atom(self, idx):
102
+ return self.idx2atom[idx]
103
+
104
+ def atom_to_idx(self, atom):
105
+ atom = atom.upper()
106
+ return self.atom2idx.get(atom, self.atom2idx[self.atom_mask])
107
+
108
+ def idx_to_atom_pos(self, idx):
109
+ return self.idx2atom_pos[idx]
110
+
111
+ def atom_pos_to_idx(self, atom_pos):
112
+ return self.atom_pos2idx.get(atom_pos, self.atom_pos2idx[self.atom_pos_mask])
113
+
114
+ # sizes
115
+
116
+ def get_num_atom_type(self):
117
+ return len(self.idx2atom)
118
+
119
+ def get_num_atom_pos(self):
120
+ return len(self.idx2atom_pos)
121
+
122
+ def get_num_block_type(self):
123
+ return len(self.special_mask) - sum(self.special_mask)
124
+
125
+ def __len__(self):
126
+ return len(self.symbol2idx)
127
+
128
+ # others
129
+ @property
130
+ def ca_channel_idx(self):
131
+ return self.backbone_atoms.index('CA')
132
+
133
+
134
+ VOCAB = MoleculeVocab()
135
+
136
+
137
+ class Atom:
138
+ def __init__(self, atom_name: str, coordinate: List[float], element: str, pos_code: str=None):
139
+ self.name = atom_name
140
+ self.coordinate = coordinate
141
+ self.element = element
142
+ if pos_code is None:
143
+ pos_code = atom_name.lstrip(element)
144
+ self.pos_code = pos_code
145
+ else:
146
+ self.pos_code = pos_code
147
+
148
+ def get_element(self):
149
+ return self.element
150
+
151
+ def get_coord(self):
152
+ return copy(self.coordinate)
153
+
154
+ def get_pos_code(self):
155
+ return self.pos_code
156
+
157
+ def __str__(self) -> str:
158
+ return self.name
159
+
160
+ def __repr__(self) -> str:
161
+ return f"Atom ({self.name}): {self.element}({self.pos_code}) [{','.join(['{:.4f}'.format(num) for num in self.coordinate])}]"
162
+
163
+ def to_tuple(self):
164
+ return (
165
+ self.name,
166
+ self.coordinate,
167
+ self.element,
168
+ self.pos_code
169
+ )
170
+
171
+ @classmethod
172
+ def from_tuple(cls, data):
173
+ return Atom(
174
+ atom_name=data[0],
175
+ coordinate=data[1],
176
+ element=data[2],
177
+ pos_code=data[3]
178
+ )
179
+
180
+
181
+ class Block:
182
+ def __init__(self, abrv: str, units: List[Atom], id: Optional[any]=None) -> None:
183
+ self.abrv: str = abrv
184
+ self.units: List[Atom] = units
185
+ self._uname2idx = { unit.name: i for i, unit in enumerate(self.units) }
186
+ self.id = id
187
+
188
+ def __len__(self) -> int:
189
+ return len(self.units)
190
+
191
+ def __iter__(self) -> Iterator[Atom]:
192
+ return iter(self.units)
193
+
194
+ def get_unit_by_name(self, name: str) -> Atom:
195
+ idx = self._uname2idx[name]
196
+ return self.units[idx]
197
+
198
+ def has_unit(self, name: str) -> bool:
199
+ return name in self._uname2idx
200
+
201
+ def to_tuple(self):
202
+ return (
203
+ self.abrv,
204
+ [unit.to_tuple() for unit in self.units],
205
+ self.id
206
+ )
207
+
208
+ def is_residue(self):
209
+ return self.has_unit('CA') and self.has_unit('N') and self.has_unit('C') and self.has_unit('O')
210
+
211
+ @classmethod
212
+ def from_tuple(cls, data):
213
+ return Block(
214
+ abrv=data[0],
215
+ units=[Atom.from_tuple(unit_data) for unit_data in data[1]],
216
+ id=data[2]
217
+ )
218
+
219
+ def __repr__(self) -> str:
220
+ return f"Block ({self.abrv}):\n\t" + '\n\t'.join([repr(at) for at in self.units]) + '\n'
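A quick tour of the vocabulary and data classes above (this assumes `utils.const.aas` holds the standard one-letter/three-letter amino-acid pairs):

```python
from data.format import VOCAB, Atom, Block

idx = VOCAB.abrv_to_idx('GLY')
print(VOCAB.idx_to_symbol(idx))   # 'G'
print(VOCAB.symbol_to_abrv('G'))  # 'GLY'
print(len(VOCAB), VOCAB.get_num_atom_type())

# Blocks serialize losslessly through tuples; the mmap storage relies on this.
block = Block('GLY', [Atom('CA', [0.0, 0.0, 0.0], 'C')])
assert Block.from_tuple(block.to_tuple()).abrv == 'GLY'
```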
data/mmap_dataset.py ADDED
@@ -0,0 +1,112 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ import os
4
+ import io
5
+ import gzip
6
+ import json
7
+ import mmap
8
+ from typing import Optional
9
+ from tqdm import tqdm
10
+
11
+ import torch
12
+
13
+
14
+ def compress(x):
15
+ serialized_x = json.dumps(x).encode()
16
+ buf = io.BytesIO()
17
+ with gzip.GzipFile(fileobj=buf, mode='wb', compresslevel=6) as f:
18
+ f.write(serialized_x)
19
+ compressed = buf.getvalue()
20
+ return compressed
21
+
22
+
23
+ def decompress(compressed_x):
24
+ buf = io.BytesIO(compressed_x)
25
+ with gzip.GzipFile(fileobj=buf, mode="rb") as f:
26
+ serialized_x = f.read().decode()
27
+ x = json.loads(serialized_x)
28
+ return x
29
+
30
+
31
+ def _find_measure_unit(num_bytes):
32
+ size, measure_unit = num_bytes, 'Bytes'
33
+ for unit in ['KB', 'MB', 'GB']:
34
+ if size > 1024:
35
+ size /= 1024
36
+ measure_unit = unit
37
+ else:
38
+ break
39
+ return size, measure_unit
40
+
41
+
42
+ def create_mmap(iterator, out_dir, total_len=None, commit_batch=10000):
43
+
44
+ if not os.path.exists(out_dir):
45
+ os.makedirs(out_dir)
46
+
47
+ data_file_path = os.path.join(out_dir, 'data.bin')
48
+ data_file = open(data_file_path, 'wb')
49
+ index_file = open(os.path.join(out_dir, 'index.txt'), 'w')
50
+
51
+ i, offset, n_finished = 0, 0, 0
52
+ progress_bar = tqdm(iterator, total=total_len)
53
+ for _id, x, properties, entry_idx in iterator:
54
+ progress_bar.set_description(f'Processing {_id}')
55
+ compressed_x = compress(x)
56
+ bin_length = data_file.write(compressed_x)
57
+ properties = '\t'.join([str(prop) for prop in properties])
58
+ index_file.write(f'{_id}\t{offset}\t{offset + bin_length}\t{properties}\n') # tuple of (_id, start, end), data slice is [start, end)
59
+ offset += bin_length
60
+ i += 1
61
+
62
+ if entry_idx > n_finished:
63
+ progress_bar.update(entry_idx - n_finished)
64
+ n_finished = entry_idx
65
+ if total_len is not None:
66
+ expected_size = os.fstat(data_file.fileno()).st_size / n_finished * total_len
67
+ expected_size, measure_unit = _find_measure_unit(expected_size)
68
+ progress_bar.set_postfix({f'{i} saved; Estimated total size ({measure_unit})': expected_size})
69
+
70
+ if i % commit_batch == 0:
71
+ data_file.flush() # save from memory to disk
72
+ index_file.flush()
73
+
74
+
75
+ data_file.close()
76
+ index_file.close()
77
+
78
+
79
+ class MMAPDataset(torch.utils.data.Dataset):
80
+
81
+ def __init__(self, mmap_dir: str, specify_data: Optional[str]=None, specify_index: Optional[str]=None) -> None:
82
+ super().__init__()
83
+
84
+ self._indexes = []
85
+ self._properties = []
86
+ _index_path = os.path.join(mmap_dir, 'index.txt') if specify_index is None else specify_index
87
+ with open(_index_path, 'r') as f:
88
+ for line in f.readlines():
89
+ messages = line.strip().split('\t')
90
+ _id, start, end = messages[:3]
91
+ _property = messages[3:]
92
+ self._indexes.append((_id, int(start), int(end)))
93
+ self._properties.append(_property)
94
+ _data_path = os.path.join(mmap_dir, 'data.bin') if specify_data is None else specify_data
95
+ self._data_file = open(_data_path, 'rb')
96
+ self._mmap = mmap.mmap(self._data_file.fileno(), 0, access=mmap.ACCESS_READ)
97
+
98
+ def __del__(self):
99
+ self._mmap.close()
100
+ self._data_file.close()
101
+
102
+ def __len__(self):
103
+ return len(self._indexes)
104
+
105
+ def __getitem__(self, idx: int):
106
+ if idx < 0 or idx >= len(self):
107
+ raise IndexError(idx)
108
+
109
+ _, start, end = self._indexes[idx]
110
+ data = decompress(self._mmap[start:end])
111
+
112
+ return data
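A toy end-to-end example of the mmap helpers above; the iterator must yield `(_id, payload, properties, entry_idx)` with a JSON-serializable payload, matching `compress`/`decompress`:

```python
from data.mmap_dataset import create_mmap, MMAPDataset

def toy_iterator():
    for i in range(3):
        yield f'item{i}', {'value': i}, [i, i * 2], i + 1

create_mmap(toy_iterator(), './toy_mmap', total_len=3)

dataset = MMAPDataset('./toy_mmap')
print(len(dataset), dataset[0])  # 3 {'value': 0}
```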
data/resample.py ADDED
@@ -0,0 +1,19 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ import numpy as np
4
+
5
+
6
+ class ClusterResampler:
7
+ def __init__(self, cluster_path: str) -> None:
8
+ idx2prob = []
9
+ with open(cluster_path, 'r') as fin:
10
+ for line in fin:
11
+ cluster_n_member = int(line.strip().split('\t')[-1])
12
+ idx2prob.append(1 / cluster_n_member)
13
+ total = sum(idx2prob)
14
+ idx2prob = [p / total for p in idx2prob]
15
+ self.idx2prob = np.array(idx2prob)
16
+
17
+ def __call__(self, n_sample:int, replace: bool=False):
18
+ idxs = np.random.choice(len(self.idx2prob), size=n_sample, replace=replace, p=self.idx2prob)
19
+ return idxs
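Sampling probability is proportional to the inverse cluster size, so members of large clusters are down-weighted. A toy example (the tab-separated layout with the member count in the last column is an assumption based on the parsing above):

```python
from data.resample import ClusterResampler

with open('toy.cluster', 'w') as f:
    f.write('entry0\tclusterA\t4\n')  # cluster with 4 members
    f.write('entry1\tclusterB\t1\n')  # singleton cluster

sampler = ClusterResampler('toy.cluster')
print(sampler.idx2prob)          # [0.2, 0.8]
print(sampler(2, replace=True))
```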
env.yaml ADDED
@@ -0,0 +1,32 @@
1
+ name: PepGLAD
2
+ channels:
3
+ - pytorch
4
+ - nvidia
5
+ - bioconda
6
+ - pyg
7
+ - salilab
8
+ - conda-forge
9
+ - defaults
10
+ dependencies:
11
+ - python=3.9
12
+ - pytorch::pytorch=1.13.1
13
+ - pytorch::pytorch-cuda=11.7
14
+ - nvidia::cudatoolkit=11.7.0
15
+ - pyg::pytorch-scatter
16
+ - mkl=2024.0.0
17
+ - salilab::dssp
18
+ - anaconda::libboost=1.73.0
19
+ - mmseqs2
20
+ - openmm=8.0.0
21
+ - pdbfixer
22
+ - pip
23
+ - pip:
24
+ - biopython==1.80
25
+ - rdkit-pypi==2022.3.5
26
+ - ray
27
+ - sympy
28
+ - scipy
29
+ - freesasa
30
+ - tensorboard
31
+ - pyyaml
32
+ - tqdm
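The environment is created in the usual way, e.g. `conda env create -f env.yaml` followed by `conda activate PepGLAD`.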
evaluation/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+
evaluation/dG/RosettaFastRelaxUtil.xml ADDED
@@ -0,0 +1,190 @@
1
+ <ROSETTASCRIPTS>
2
+ <SCOREFXNS>
3
+ <ScoreFunction name="sfxn_soft" weights="beta_nov16_soft" />
4
+ <ScoreFunction name="sfxn" weights="beta_nov16" />
5
+ <ScoreFunction name="sfxn_relax" weights="beta_nov16" >
6
+ <Reweight scoretype="arg_cation_pi" weight="3" />
7
+ <Reweight scoretype="approximate_buried_unsat_penalty" weight="5" />
8
+ <Set approximate_buried_unsat_penalty_burial_atomic_depth="3.5" />
9
+ <Set approximate_buried_unsat_penalty_hbond_energy_threshold="-0.5" />
10
+ </ScoreFunction>
11
+ <ScoreFunction name="sfxn_softish" weights="beta_nov16" >
12
+ <Reweight scoretype="fa_rep" weight="0.15" />
13
+ </ScoreFunction>
14
+ <ScoreFunction name="sfxn_fa_atr" weights="empty" >
15
+ <Reweight scoretype="fa_atr" weight="1" />
16
+ </ScoreFunction>
17
+ <ScoreFunction name="vdw_sol" weights="empty" >
18
+ <Reweight scoretype="fa_atr" weight="1.0" />
19
+ <Reweight scoretype="fa_rep" weight="0.55" />
20
+ <Reweight scoretype="fa_sol" weight="1.0" />
21
+ </ScoreFunction>
22
+
23
+ </SCOREFXNS>
24
+ <RESIDUE_SELECTORS>
25
+ <Chain name="chainA" chains="A"/>
26
+ <Chain name="chainB" chains="B"/>
27
+ <Neighborhood name="interface_chA" selector="chainB" distance="8.0" />
28
+ <Neighborhood name="interface_chB" selector="chainA" distance="8.0" />
29
+ <And name="AB_interface" selectors="interface_chA,interface_chB" />
30
+ <Not name="Not_interface" selector="AB_interface" />
31
+ <And name="actual_interface_chB" selectors="AB_interface,chainB" />
32
+ <And name="not_interface_chB" selectors="Not_interface,chainB" />
33
+
34
+ <ResidueName name="apolar" residue_name3="ALA,CYS,PHE,ILE,LEU,MET,THR,PRO,VAL,TRP,TYR" />
35
+ <Not name="polar" selector="apolar" />
36
+
37
+ <True name="all" />
38
+
39
+ <ResidueName name="pro_and_gly_positions" residue_name3="PRO,GLY" />
40
+
41
+ <ResiduePDBInfoHasLabel name="HOTSPOT_res" property="HOTSPOT" />
42
+ </RESIDUE_SELECTORS>
43
+
44
+
45
+ <RESIDUE_SELECTORS>
46
+ <!-- Layer Design -->
47
+ <Layer name="surface" select_core="false" select_boundary="false" select_surface="true" use_sidechain_neighbors="true"/>
48
+ <Layer name="boundary" select_core="false" select_boundary="true" select_surface="false" use_sidechain_neighbors="true"/>
49
+ <Layer name="core" select_core="true" select_boundary="false" select_surface="false" use_sidechain_neighbors="true"/>
50
+ <SecondaryStructure name="sheet" overlap="0" minH="3" minE="2" include_terminal_loops="false" use_dssp="true" ss="E"/>
51
+ <SecondaryStructure name="entire_loop" overlap="0" minH="3" minE="2" include_terminal_loops="true" use_dssp="true" ss="L"/>
52
+ <SecondaryStructure name="entire_helix" overlap="0" minH="3" minE="2" include_terminal_loops="false" use_dssp="true" ss="H"/>
53
+ <And name="helix_cap" selectors="entire_loop">
54
+ <PrimarySequenceNeighborhood lower="1" upper="0" selector="entire_helix"/>
55
+ </And>
56
+ <And name="helix_start" selectors="entire_helix">
57
+ <PrimarySequenceNeighborhood lower="0" upper="1" selector="helix_cap"/>
58
+ </And>
59
+ <And name="helix" selectors="entire_helix">
60
+ <Not selector="helix_start"/>
61
+ </And>
62
+ <And name="loop" selectors="entire_loop">
63
+ <Not selector="helix_cap"/>
64
+ </And>
65
+
66
+ </RESIDUE_SELECTORS>
67
+
68
+ <TASKOPERATIONS>
69
+ <DesignRestrictions name="layer_design_no_core_polars">
70
+ <Action selector_logic="surface AND helix_start" aas="DEHKPQR"/>
71
+ <Action selector_logic="surface AND helix" aas="EHKQR"/>
72
+ <Action selector_logic="surface AND sheet" aas="EHKNQRST"/>
73
+ <Action selector_logic="surface AND loop" aas="DEGHKNPQRST"/>
74
+ <Action selector_logic="boundary AND helix_start" aas="ADEHIKLNPQRSTVWY"/>
75
+ <Action selector_logic="boundary AND helix" aas="ADEHIKLNQRSTVWY"/>
76
+ <Action selector_logic="boundary AND sheet" aas="DEFHIKLNQRSTVWY"/>
77
+ <Action selector_logic="boundary AND loop" aas="ADEFGHIKLNPQRSTVWY"/>
78
+ <Action selector_logic="core AND helix_start" aas="AFILMPVWY"/>
79
+ <Action selector_logic="core AND helix" aas="AFILVWY"/>
80
+ <Action selector_logic="core AND sheet" aas="FILVWY"/>
81
+ <Action selector_logic="core AND loop" aas="AFGILPVWY"/>
82
+ <Action selector_logic="helix_cap" aas="DNST"/>
83
+ </DesignRestrictions>
84
+ </TASKOPERATIONS>
85
+
86
+
87
+ <TASKOPERATIONS>
88
+ <ProteinProteinInterfaceUpweighter name="upweight_interface" interface_weight="3" />
89
+ <ProteinInterfaceDesign name="pack_long" design_chain1="0" design_chain2="0" jump="1" interface_distance_cutoff="15"/>
90
+ <InitializeFromCommandline name="init" />
91
+ <IncludeCurrent name="current" />
92
+ <LimitAromaChi2 name="limitchi2" chi2max="110" chi2min="70" include_trp="True" />
93
+ <ExtraRotamersGeneric name="ex1_ex2" ex1="1" ex2="1" />
94
+
95
+
96
+ <OperateOnResidueSubset name="restrict_target_not_interface" selector="not_interface_chB">
97
+ <PreventRepackingRLT/>
98
+ </OperateOnResidueSubset>
99
+ <OperateOnResidueSubset name="restrict2repacking" selector="all">
100
+ <RestrictToRepackingRLT/>
101
+ </OperateOnResidueSubset>
102
+ <OperateOnResidueSubset name="restrict_to_interface" selector="Not_interface">
103
+ <PreventRepackingRLT/>
104
+ </OperateOnResidueSubset>
105
+ <OperateOnResidueSubset name="restrict_target2repacking" selector="chainB">
106
+ <RestrictToRepackingRLT/>
107
+ </OperateOnResidueSubset>
108
+ <OperateOnResidueSubset name="restrict_hotspots2repacking" selector="HOTSPOT_res">
109
+ <RestrictToRepackingRLT/>
110
+ </OperateOnResidueSubset>
111
+
112
+ <DisallowIfNonnative name="disallow_GLY" resnum="0" disallow_aas="G" />
113
+ <DisallowIfNonnative name="disallow_PRO" resnum="0" disallow_aas="P" />
114
+ <SelectBySASA name="PR_monomer_core" mode="sc" state="monomer" probe_radius="2.2" core_asa="10" surface_asa="10" core="0" boundary="1" surface="1" verbose="0" />
115
+
116
+ <OperateOnResidueSubset name="restrict_PRO_GLY" selector="pro_and_gly_positions">
117
+ <PreventRepackingRLT/>
118
+ </OperateOnResidueSubset>
119
+
120
+ <!-- PruneBadRotamers name="prune_bad_rotamers" probability_cut="0.01" (task operation intentionally left disabled) -->
121
+
122
+ </TASKOPERATIONS>
123
+ <MOVERS>
124
+
125
+
126
+ <SwitchChainOrder name="chain1onlypre" chain_order="1" />
127
+ <ScoreMover name="scorepose" scorefxn="sfxn" verbose="false" />
128
+ <ParsedProtocol name="chain1only">
129
+ <Add mover="chain1onlypre" />
130
+ <Add mover="scorepose" />
131
+ </ParsedProtocol>
132
+ <TaskAwareMinMover name="min" scorefxn="sfxn" bb="0" chi="1" task_operations="pack_long" />
133
+
134
+ <DeleteRegionMover name="delete_polar" residue_selector="polar" rechain="false" />
135
+
136
+
137
+ </MOVERS>
138
+ <FILTERS>
139
+
140
+ <Time name="timed"/>
141
+
142
+ <Sasa name="interface_buried_sasa" confidence="0" />
143
+ <Ddg name="ddg" threshold="0" jump="1" repeats="1" repack="1" relax_mover="min" confidence="0" scorefxn="sfxn" />
144
+ <Ddg name="ddg_norepack" threshold="0" jump="1" repeats="1" repack="0" relax_mover="min" confidence="0" scorefxn="sfxn" />
145
+ <ShapeComplementarity name="interface_sc" verbose="0" min_sc="0.55" write_int_area="1" write_median_dist="1" jump="1" confidence="0"/>
146
+
147
+
148
+ ### score function monomer terms
149
+ <ScoreType name="total_score_MBF" scorefxn="sfxn" score_type="total_score" threshold="0" confidence="0" />
150
+ <MoveBeforeFilter name="total_score_monomer" mover="chain1only" filter="total_score_MBF" confidence="0" />
151
+ <ResidueCount name="res_count_MBF" max_residue_count="9999" confidence="0"/>
152
+ <MoveBeforeFilter name="res_count_monomer" mover="chain1only" filter="res_count_MBF" confidence="0" />
153
+
154
+
155
+ <CalculatorFilter name="score_per_res" equation="total_score_monomer / res" threshold="-3.5" confidence="0">
156
+ <Var name="total_score_monomer" filter="total_score_monomer"/>
157
+ <Var name="res" filter="res_count_monomer"/>
158
+ </CalculatorFilter>
159
+
160
+
161
+ <InterfaceHydrophobicResidueContacts name="hydrophobic_residue_contacts" target_selector="chainB" binder_selector="chainA" scorefxn="sfxn_soft" confidence="0"/>
162
+
163
+
164
+ <Ddg name="ddg_hydrophobic_pre" threshold="-10" jump="1" repeats="1" repack="0" confidence="0" scorefxn="vdw_sol" />
165
+ <MoveBeforeFilter name="ddg_hydrophobic" mover="delete_polar" filter="ddg_hydrophobic_pre" confidence="0"/>
166
+
167
+ <ContactMolecularSurface name="contact_molecular_surface" distance_weight="0.5" target_selector="chainA" binder_selector="chainB" confidence="0" />
168
+
169
+ </FILTERS>
170
+
171
+
172
+ <MOVERS>
173
+
174
+ <FastRelax name="FastRelax" scorefxn="sfxn_relax" repeats="1" batch="false" ramp_down_constraints="false" cartesian="false" bondangle="false" bondlength="false" min_type="dfpmin_armijo_nonmonotone" task_operations="current,ex1_ex2,restrict_target_not_interface,limitchi2" >
175
+ <MoveMap name="MM" >
176
+ <Chain number="1" chi="true" bb="true" />
177
+ <Chain number="2" chi="true" bb="false" />
178
+ <Jump number="1" setting="true" />
179
+ </MoveMap>
180
+ </FastRelax>
181
+
182
+ </MOVERS>
183
+ <APPLY_TO_POSE>
184
+ </APPLY_TO_POSE>
185
+
186
+ <PROTOCOLS>
187
+ </PROTOCOLS>
188
+
189
+ <OUTPUT/>
190
+ </ROSETTASCRIPTS>
evaluation/dG/base.py ADDED
@@ -0,0 +1,148 @@
1
+ #!/usr/bin/python
2
+ # -*- coding:utf-8 -*-
3
+ import os
4
+ import re
5
+ import json
6
+ from typing import Optional, Tuple, List
7
+ from dataclasses import dataclass
8
+
9
+ from .energy import pyrosetta_fastrelax, pyrosetta_interface_energy, rfdiff_refine
10
+
11
+
12
+ @dataclass
13
+ class RelaxTask:
14
+ in_path: str
15
+ current_path: str
16
+ info: dict
17
+ status: str
18
+ rec_chain: str
19
+ pep_chain: str
20
+ rfdiff_relax: bool = False
21
+ dG: Optional[float] = None
22
+
23
+ def set_dG(self, dG):
24
+ self.dG = dG
25
+
26
+ def get_in_path_with_tag(self, tag):
27
+ name, ext = os.path.splitext(self.in_path)
28
+ new_path = f'{name}_{tag}{ext}'
29
+ return new_path
30
+
31
+ def set_current_path_tag(self, tag):
32
+ new_path = self.get_in_path_with_tag(tag)
33
+ self.current_path = new_path
34
+ return new_path
35
+
36
+ def check_current_path_exists(self):
37
+ ok = os.path.exists(self.current_path)
38
+ if not ok:
39
+ self.mark_failure()
40
+ elif os.path.getsize(self.current_path) == 0: # only probe the size if the file exists
41
+ ok = False
42
+ self.mark_failure()
43
+ os.unlink(self.current_path)
44
+ return ok
45
+
46
+ def update_if_finished(self, tag):
47
+ out_path = self.get_in_path_with_tag(tag)
48
+ if os.path.exists(out_path) and os.path.getsize(out_path) > 0:
49
+ # print('Already finished', out_path)
50
+ self.set_current_path_tag(tag)
51
+ self.mark_success()
52
+ return True
53
+ return False
54
+
55
+ def can_proceed(self):
56
+ self.check_current_path_exists()
57
+ return self.status != 'failed'
58
+
59
+ def mark_success(self):
60
+ self.status = 'success'
61
+
62
+ def mark_failure(self):
63
+ self.status = 'failed'
64
+
65
+
66
+ class TaskScanner:
67
+
68
+ def __init__(self, results, n_sample, rfdiff_relax):
69
+ super().__init__()
70
+ self.results = results
71
+ self.n_sample = n_sample
72
+ self.rfdiff_relax = rfdiff_relax
73
+ self.visited = set()
74
+
75
+ def scan(self) -> List[RelaxTask]:
76
+ tasks = []
77
+ root_dir = os.path.dirname(self.results)
78
+ with open(self.results, 'r') as fin:
79
+ lines = fin.readlines()
80
+ for line in lines:
81
+ item = json.loads(line)
82
+ if item['number'] >= self.n_sample:
83
+ continue
84
+ _id = f"{item['id']}_{item['number']}"
85
+ if _id in self.visited:
86
+ continue
87
+ gen_pdb = os.path.split(item['gen_pdb'])[-1]
88
+ # subdir = gen_pdb.split('_')[0]
89
+ subdir = '_'.join(gen_pdb.split('_')[:-2])
90
+ gen_pdb = os.path.join(root_dir, 'candidates', subdir, gen_pdb)
91
+ tasks.append(RelaxTask(
92
+ in_path=gen_pdb,
93
+ current_path=gen_pdb,
94
+ info=item,
95
+ status='created',
96
+ rec_chain=item['rec_chain'],
97
+ pep_chain=item['lig_chain'],
98
+ rfdiff_relax=self.rfdiff_relax
99
+ ))
100
+ self.visited.add(_id)
101
+ return tasks
102
+
103
+ def scan_dataset(self) -> List[RelaxTask]:
104
+ tasks = []
105
+ root_dir = os.path.dirname(self.results)
106
+ with open(self.results, 'r') as fin: # index file of datasets
107
+ lines = fin.readlines()
108
+ for line in lines:
109
+ line = line.strip('\n').split('\t')
110
+ _id = line[0]
111
+ item = {
112
+ 'id': _id,
113
+ 'number': 0
114
+ }
115
+ pdb_path = os.path.join(root_dir, 'pdbs', _id + '.pdb')
116
+ tasks.append(RelaxTask(
117
+ in_path=pdb_path,
118
+ current_path=pdb_path,
119
+ info=item,
120
+ status='created',
121
+ rec_chain=line[7],
122
+ pep_chain=line[8],
123
+ rfdiff_relax=self.rfdiff_relax
124
+ ))
125
+ self.visited.add(_id)
126
+ return tasks
127
+
128
+
129
+ def run_pyrosetta(task: RelaxTask):
130
+ if not task.can_proceed() :
131
+ return task
132
+ # if task.update_if_finished('rosetta'):
133
+ # return task
134
+
135
+ out_path = task.set_current_path_tag('rosetta')
136
+ try:
137
+ if task.rfdiff_relax:
138
+ rfdiff_refine(task.in_path, out_path, task.pep_chain)
139
+ else:
140
+ pyrosetta_fastrelax(task.in_path, out_path, task.pep_chain, rfdiff_config=task.rfdiff_relax)
141
+ dG = pyrosetta_interface_energy(out_path, [task.rec_chain], [task.pep_chain])
142
+ task.mark_success()
143
+ except Exception as e:
144
+ print(e)
145
+ dG = 1e10
146
+ task.mark_failure()
147
+ task.set_dG(dG)
148
+ return task
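A hedged driver sketch for the relax pipeline above (`path/to/results.jsonl` is a placeholder; `run_pyrosetta` requires a working PyRosetta installation):

```python
from evaluation.dG.base import TaskScanner, run_pyrosetta

scanner = TaskScanner('path/to/results.jsonl', n_sample=10, rfdiff_relax=False)
for task in scanner.scan():
    task = run_pyrosetta(task)
    print(task.info['id'], task.status, task.dG)
```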
evaluation/dG/energy.py ADDED
@@ -0,0 +1,236 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ '''
+ From https://github.com/luost26/diffab/blob/main/diffab/tools/relax/pyrosetta_relaxer.py
+ '''
+ import os
+ import time
+ import pyrosetta
+ from pyrosetta.rosetta.protocols.analysis import InterfaceAnalyzerMover
+ # for fast relax
+ from pyrosetta.rosetta import protocols
+ from pyrosetta.rosetta.protocols.relax import FastRelax
+ from pyrosetta.rosetta.core.pack.task import TaskFactory
+ from pyrosetta.rosetta.core.pack.task import operation
+ from pyrosetta.rosetta.core.select import residue_selector as selections
+ from pyrosetta.rosetta.core.select.movemap import MoveMapFactory, move_map_action
+ from pyrosetta.rosetta.core.scoring import ScoreType
+
+ from Bio.PDB import PDBIO, PDBParser
+ from Bio.PDB.Structure import Structure as BStructure
+ from Bio.PDB.Model import Model as BModel
+ from Bio.PDB.Chain import Chain as BChain
+
+
+ pyrosetta.init(' '.join([
+     '-mute', 'all',
+     '-use_input_sc',
+     '-ignore_unrecognized_res',
+     '-ignore_zero_occupancy', 'false',
+     '-load_PDB_components', 'false',
+     '-relax:default_repeats', '2',
+     '-no_fconfig',
+     # below are from https://github.com/nrbennet/dl_binder_design/blob/main/mpnn_fr/dl_interface_design.py
+     # '-beta_nov16',
+     '-use_terminal_residues', 'true',
+     '-in:file:silent_struct_type', 'binary'
+ ]))
+
+
+ def current_milli_time():
+     return round(time.time() * 1000)
+
+
+ def get_scorefxn(scorefxn_name: str):
+     """
+     Gets the scorefxn with appropriate corrections.
+     Taken from: https://gist.github.com/matteoferla/b33585f3aeab58b8424581279e032550
+     """
+     corrections = {
+         'beta_july15': False,
+         'beta_nov16': False,
+         'gen_potential': False,
+         'restore_talaris_behavior': False,
+     }
+     if 'beta_july15' in scorefxn_name or 'beta_nov15' in scorefxn_name:
+         # beta_july15 is ref2015
+         corrections['beta_july15'] = True
+     elif 'beta_nov16' in scorefxn_name:
+         corrections['beta_nov16'] = True
+     elif 'genpot' in scorefxn_name:
+         corrections['gen_potential'] = True
+         pyrosetta.rosetta.basic.options.set_boolean_option('corrections:beta_july15', True)
+     elif 'talaris' in scorefxn_name:  # 2013 and 2014
+         corrections['restore_talaris_behavior'] = True
+     for corr, value in corrections.items():
+         pyrosetta.rosetta.basic.options.set_boolean_option(f'corrections:{corr}', value)
+     return pyrosetta.create_score_function(scorefxn_name)
+
+
+ class RelaxRegion(object):
+
+     def __init__(self, scorefxn='ref2015', max_iter=1000, subset='nbrs', move_bb=True, rfdiff_config=False):
+         super().__init__()
+
+         if rfdiff_config:
+             self.scorefxn = get_scorefxn('beta_nov16')
+             xml = os.path.join(os.path.dirname(__file__), 'RosettaFastRelaxUtil.xml')
+             objs = protocols.rosetta_scripts.XmlObjects.create_from_file(xml)
+             self.fast_relax = objs.get_mover('FastRelax')
+             self.fast_relax.max_iter(max_iter)
+         else:
+             self.scorefxn = get_scorefxn(scorefxn)
+             self.fast_relax = FastRelax()
+             self.fast_relax.set_scorefxn(self.scorefxn)
+             self.fast_relax.max_iter(max_iter)
+
+         assert subset in ('all', 'target', 'nbrs')
+         self.subset = subset
+         self.move_bb = move_bb
+
+     def __call__(self, pdb_path, ligand_chains):  # formerly also took flexible_residue_first / flexible_residue_last
+         pose = pyrosetta.pose_from_pdb(pdb_path)
+         start_t = current_milli_time()
+         original_pose = pose.clone()
+
+         tf = TaskFactory()
+         tf.push_back(operation.InitializeFromCommandline())
+         tf.push_back(operation.RestrictToRepacking())  # Only allow residues to repack. No design at any position.
+
+         # Create selector for the region to be relaxed
+         # Turn off design and repacking on irrelevant positions
+         if self.subset != 'all':
+             chain_selectors = [selections.ChainSelector(chain) for chain in ligand_chains]
+             if len(chain_selectors) == 1:
+                 gen_selector = chain_selectors[0]
+             else:
+                 gen_selector = selections.OrResidueSelector(chain_selectors[0], chain_selectors[1])
+                 for selector in chain_selectors[2:]:
+                     gen_selector = selections.OrResidueSelector(gen_selector, selector)
+             nbr_selector = selections.NeighborhoodResidueSelector()
+             nbr_selector.set_focus_selector(gen_selector)
+             nbr_selector.set_include_focus_in_subset(True)
+
+             if self.subset == 'nbrs':
+                 subset_selector = nbr_selector
+             elif self.subset == 'target':
+                 subset_selector = gen_selector
+
+             prevent_repacking_rlt = operation.PreventRepackingRLT()
+             prevent_subset_repacking = operation.OperateOnResidueSubset(
+                 prevent_repacking_rlt,
+                 subset_selector,
+                 flip_subset=True,
+             )
+             tf.push_back(prevent_subset_repacking)
+         else:
+             # relax the whole complex: enable movement and repacking for every residue
+             # (without this branch, gen_selector/subset_selector would be undefined below)
+             gen_selector = selections.TrueResidueSelector()
+             subset_selector = gen_selector
+
+         scorefxn = self.scorefxn
+         fr = self.fast_relax
+
+         pose = original_pose.clone()
+
+         mmf = MoveMapFactory()
+         if self.move_bb:
+             mmf.add_bb_action(move_map_action.mm_enable, gen_selector)
+         mmf.add_chi_action(move_map_action.mm_enable, subset_selector)
+         mm = mmf.create_movemap_from_pose(pose)
+
+         fr.set_movemap(mm)
+         fr.set_task_factory(tf)
+         fr.apply(pose)
+
+         e_before = scorefxn(original_pose)
+         e_relax = scorefxn(pose)
+         # print('\n\n[Finished in %.2f secs]' % ((current_milli_time() - start_t) / 1000))
+         # print(' > Energy (before): %.4f' % e_before)
+         # print(' > Energy (optimized): %.4f' % e_relax)
+         return pose, e_before, e_relax
+
+
+ def pyrosetta_fastrelax(pdb_path, out_path, pep_chain, rfdiff_config=False):
+     minimizer = RelaxRegion(rfdiff_config=rfdiff_config)
+     pose_min, _, _ = minimizer(
+         pdb_path=pdb_path,
+         ligand_chains=[pep_chain]
+     )
+     pose_min.dump_pdb(out_path)
+
+
+ def _rename_chain(pdb_path, out_path, src_pep_chain, tgt_pep_chain, tgt_rec_chain):
+     io = PDBIO()
+     parser = PDBParser()
+
+     structure = parser.get_structure('anonymous', pdb_path)
+
+     new_mapping = {}
+     pep_chain, rec_chain = BChain(id=tgt_pep_chain), BChain(id=tgt_rec_chain)
+
+     for model in structure:
+         for chain in model:
+             if chain.get_id() == src_pep_chain:
+                 new_mapping[src_pep_chain] = tgt_pep_chain
+                 for res in chain:
+                     pep_chain.add(res.copy())
+             else:
+                 new_mapping[chain.get_id()] = tgt_rec_chain
+                 for res in chain:
+                     rec_chain.add(res.copy())
+
+     structure = BStructure(id=structure.get_id())
+     model = BModel(id=0)
+     model.add(pep_chain)
+     model.add(rec_chain)
+     structure.add(model)
+
+     io.set_structure(structure)
+     io.save(out_path)
+
+     return new_mapping
+
+
+ def rfdiff_refine(pdb_path, out_path, pep_chain):
+     # rename peptide chain to A and receptor to B
+     new_mapping = _rename_chain(pdb_path, out_path, pep_chain, 'A', 'B')
+
+     # force fields from RFDiffusion
+     get_scorefxn('beta_nov16')
+     xml = os.path.join(os.path.dirname(__file__), 'RosettaFastRelaxUtil.xml')
+     objs = protocols.rosetta_scripts.XmlObjects.create_from_file(xml)
+     fastrelax = objs.get_mover('FastRelax')
+     pose = pyrosetta.pose_from_pdb(out_path)
+     fastrelax.apply(pose)
+     pose.dump_pdb(out_path)
+
+     # get back to original chain ids
+     reverse_mapping = {new_mapping[key]: key for key in new_mapping}
+     _rename_chain(out_path, out_path, 'A', reverse_mapping['A'], reverse_mapping['B'])
+
+
+ def pyrosetta_interface_energy(pdb_path, receptor_chains, ligand_chains, return_dict=False):
+     pose = pyrosetta.pose_from_pdb(pdb_path)
+     interface = ''.join(ligand_chains) + '_' + ''.join(receptor_chains)
+     mover = InterfaceAnalyzerMover(interface)
+     mover.set_pack_separated(True)
+     mover.apply(pose)
+     if return_dict:
+         return pose.scores
+     return pose.scores['dG_separated']
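A minimal sketch of using the helpers above directly, assuming hypothetical file paths and a receptor chain A / peptide chain B:

```python
from evaluation.dG.energy import pyrosetta_fastrelax, pyrosetta_interface_energy

# relax around the peptide, then score the relaxed interface
pyrosetta_fastrelax('input_complex.pdb', 'relaxed.pdb', pep_chain='B')
dG = pyrosetta_interface_energy('relaxed.pdb', receptor_chains=['A'], ligand_chains=['B'])
scores = pyrosetta_interface_energy('relaxed.pdb', ['A'], ['B'], return_dict=True)
print(dG, scores['dG_separated'])   # the two values agree by construction
```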
evaluation/dG/openmm_relaxer.py ADDED
@@ -0,0 +1,107 @@
+ import io
+ import pdbfixer
+ import openmm
+ from openmm import app as openmm_app
+ from openmm import unit
+
+ ENERGY = unit.kilocalories_per_mole
+ LENGTH = unit.angstroms
+
+
+ class ForceFieldMinimizer(object):
+
+     def __init__(self, stiffness=10.0, max_iterations=0, tolerance=2.39*unit.kilocalories_per_mole, platform='CUDA'):
+         super().__init__()
+         self.stiffness = stiffness  # plain number: interpreted in OpenMM's default MD units (kJ/mol/nm^2)
+         self.max_iterations = max_iterations
+         self.tolerance = tolerance
+         assert platform in ('CUDA', 'CPU')
+         self.platform = platform
+
+     def _fix(self, pdb_str):
+         fixer = pdbfixer.PDBFixer(pdbfile=io.StringIO(pdb_str))
+         fixer.findNonstandardResidues()
+         fixer.replaceNonstandardResidues()
+
+         fixer.findMissingResidues()
+         fixer.findMissingAtoms()
+         fixer.addMissingAtoms(seed=0)
+         fixer.addMissingHydrogens()
+
+         out_handle = io.StringIO()
+         openmm_app.PDBFile.writeFile(fixer.topology, fixer.positions, out_handle, keepIds=True)
+         return out_handle.getvalue()
+
+     def _get_pdb_string(self, topology, positions):
+         with io.StringIO() as f:
+             openmm_app.PDBFile.writeFile(topology, positions, f, keepIds=True)
+             return f.getvalue()
+
+     def _minimize(self, pdb_str):
+         pdb = openmm_app.PDBFile(io.StringIO(pdb_str))
+
+         force_field = openmm_app.ForceField("charmm36.xml")  # see http://docs.openmm.org/latest/userguide/application/02_running_sims.html
+         constraints = openmm_app.HBonds
+         system = force_field.createSystem(pdb.topology, constraints=constraints)
+
+         # Add harmonic positional restraints to all heavy atoms
+         force = openmm.CustomExternalForce("0.5 * k * ((x-x0)^2 + (y-y0)^2 + (z-z0)^2)")
+         force.addGlobalParameter("k", self.stiffness)
+         for p in ["x0", "y0", "z0"]:
+             force.addPerParticleParameter(p)
+
+         for i, a in enumerate(pdb.topology.atoms()):
+             if a.element.name != 'hydrogen':
+                 force.addParticle(i, pdb.positions[i])
+
+         system.addForce(force)
+
+         # Set up the integrator and simulation
+         integrator = openmm.LangevinIntegrator(0, 0.01, 0.0)
+         platform = openmm.Platform.getPlatformByName(self.platform)  # honor the configured platform (was hard-coded to 'CUDA')
+         simulation = openmm_app.Simulation(pdb.topology, system, integrator, platform)
+         simulation.context.setPositions(pdb.positions)
+
+         # Perform minimization
+         ret = {}
+         state = simulation.context.getState(getEnergy=True, getPositions=True)
+         ret["einit"] = state.getPotentialEnergy().value_in_unit(ENERGY)
+         ret["posinit"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
+
+         simulation.minimizeEnergy(maxIterations=self.max_iterations, tolerance=self.tolerance)
+
+         state = simulation.context.getState(getEnergy=True, getPositions=True)
+         ret["efinal"] = state.getPotentialEnergy().value_in_unit(ENERGY)
+         ret["pos"] = state.getPositions(asNumpy=True).value_in_unit(LENGTH)
+         ret["min_pdb"] = self._get_pdb_string(simulation.topology, state.getPositions())
+
+         return ret['min_pdb'], ret
+
+     def _add_energy_remarks(self, pdb_str, ret):
+         pdb_lines = pdb_str.splitlines()
+         pdb_lines.insert(1, "REMARK 1 FINAL ENERGY: {:.3f} KCAL/MOL".format(ret['efinal']))
+         pdb_lines.insert(1, "REMARK 1 INITIAL ENERGY: {:.3f} KCAL/MOL".format(ret['einit']))
+         return "\n".join(pdb_lines)
+
+     def __call__(self, pdb_str, out_path, return_info=True):
+         # accept either a PDB string or a path to a .pdb file
+         if '\n' not in pdb_str and pdb_str.lower().endswith(".pdb"):
+             with open(pdb_str) as f:
+                 pdb_str = f.read()
+
+         pdb_fixed = self._fix(pdb_str)
+         pdb_min, ret = self._minimize(pdb_fixed)
+         pdb_min = self._add_energy_remarks(pdb_min, ret)
+         with open(out_path, 'w') as f:
+             f.write(pdb_min)
+         if return_info:
+             return pdb_min, ret
+         else:
+             return pdb_min
+
+
+ if __name__ == '__main__':
+     import sys
+     force_field = ForceFieldMinimizer()
+     force_field(sys.argv[1], sys.argv[2])
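A minimal sketch of running the minimizer on CPU (relying on the `platform` argument above); the paths are hypothetical placeholders:

```python
from evaluation.dG.openmm_relaxer import ForceFieldMinimizer

minimize = ForceFieldMinimizer(platform='CPU')   # 'CUDA' by default
pdb_min, info = minimize('model.pdb', 'model_min.pdb')
print(info['einit'], info['efinal'])             # potential energy before/after, kcal/mol
```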
evaluation/dG/run.py ADDED
@@ -0,0 +1,92 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import os
+ import json
+ import argparse
+ import statistics
+
+ import ray
+
+ from utils.logger import print_log
+
+ from .base import TaskScanner, run_pyrosetta
+
+ # @ray.remote(num_gpus=1/8, num_cpus=1)
+ # def run_openmm_remote(task):
+ #     return run_openmm(task)
+
+
+ @ray.remote(num_cpus=1)
+ def run_pyrosetta_remote(task):
+     return run_pyrosetta(task)
+
+
+ @ray.remote
+ def pipeline_pyrosetta(task):
+     funcs = [
+         run_pyrosetta_remote,
+     ]
+     for fn in funcs:
+         task = fn.remote(task)
+     return ray.get(task)
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description='calculating dG using pyrosetta')
+     parser.add_argument('--results', type=str, required=True, help='Path to the summary of the results (.jsonl)')
+     parser.add_argument('--n_sample', type=int, default=float('inf'), help='Maximum number of samples for calculation')
+     parser.add_argument('--rfdiff_relax', action='store_true', help='Use rfdiff fastrelax')
+     parser.add_argument('--out_path', type=str, default=None, help='Output path, defaults to dG_report.jsonl in the same directory as the results')
+     return parser.parse_args()
+
+
+ def main(args):
+     # output summary
+     if args.out_path is None:
+         args.out_path = os.path.join(os.path.dirname(args.results), 'dG_report.jsonl')
+     results = {}
+
+     # parallel
+     ray.init()
+     scanner = TaskScanner(args.results, args.n_sample, args.rfdiff_relax)
+     if args.results.endswith('txt'):
+         tasks = scanner.scan_dataset()
+     else:
+         tasks = scanner.scan()
+     futures = [pipeline_pyrosetta.remote(t) for t in tasks]
+     if len(futures) > 0:
+         print_log(f'Submitted {len(futures)} tasks.')
+     while len(futures) > 0:
+         done_ids, futures = ray.wait(futures, num_returns=1)
+         for done_id in done_ids:
+             done_task = ray.get(done_id)
+             print_log(f'Remaining {len(futures)}. Finished {done_task.current_path}, dG {done_task.dG}')
+             _id, number = done_task.info['id'], done_task.info['number']
+             if _id not in results:
+                 results[_id] = {
+                     'min': float('inf'),
+                     'all': {}
+                 }
+             results[_id]['all'][number] = done_task.dG
+             results[_id]['min'] = min(results[_id]['min'], done_task.dG)
+
+     # write results
+     for _id in results:
+         success = 0
+         for n in results[_id]['all']:
+             if results[_id]['all'][n] < 0:
+                 success += 1
+         results[_id]['success rate'] = success / len(results[_id]['all'])
+     json.dump(results, open(args.out_path, 'w'), indent=2)
+
+     # show results
+     vals = [results[_id]['min'] for _id in results]
+     print(f'median: {statistics.median(vals)}, mean: {sum(vals) / len(vals)}')
+     success = [results[_id]['success rate'] for _id in results]
+     print(f'mean success rate: {sum(success) / len(success)}')
+
+
+ if __name__ == '__main__':
+     import random
+     random.seed(12)
+     main(parse())
evaluation/diversity.py ADDED
@@ -0,0 +1,68 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ from typing import List, Tuple
+
+ import numpy as np
+ from scipy.cluster.hierarchy import linkage, fcluster
+ from scipy.spatial.distance import squareform
+ from scipy.stats.contingency import association
+
+ from evaluation.seq_metric import align_sequences
+
+
+ def seq_diversity(seqs: List[str], th: float = 0.4) -> Tuple[float, np.ndarray]:
+     '''
+     th: distance threshold for clustering (distance = 1 - sequence identity)
+     returns (fraction of unique clusters, cluster assignments)
+     '''
+     dists = []
+     for i, seq1 in enumerate(seqs):
+         dists.append([])
+         for j, seq2 in enumerate(seqs):
+             _, sim = align_sequences(seq1, seq2)
+             dists[i].append(1 - sim)
+     dists = np.array(dists)
+     Z = linkage(squareform(dists), 'single')
+     cluster = fcluster(Z, t=th, criterion='distance')
+     return len(np.unique(cluster)) / len(seqs), cluster
+
+
+ def struct_diversity(structs: np.ndarray, th: float = 4.0) -> Tuple[float, np.ndarray]:
+     '''
+     structs: N*L*3, alpha carbon coordinates
+     th: RMSD threshold for clustering (distance < th)
+     returns (fraction of unique clusters, cluster assignments)
+     '''
+     ca_dists = np.sum((structs[:, None] - structs[None, :]) ** 2, axis=-1)  # [N, N, L]
+     rmsd = np.sqrt(np.mean(ca_dists, axis=-1))
+     Z = linkage(squareform(rmsd), 'single')  # single linkage since the distances might not be euclidean (e.g. rmsd)
+     cluster = fcluster(Z, t=th, criterion='distance')
+     return len(np.unique(cluster)) / structs.shape[0], cluster
+
+
+ def diversity(seqs: List[str], structs: np.ndarray):
+     seq_div, seq_clu = seq_diversity(seqs)
+     if structs is None:
+         return seq_div, None, seq_div, None
+     struct_div, struct_clu = struct_diversity(structs)
+     co_div = np.sqrt(seq_div * struct_div)
+
+     n_seq_clu, n_struct_clu = np.max(seq_clu), np.max(struct_clu)  # clusters start from 1
+     if n_seq_clu == 1 or n_struct_clu == 1:
+         consistency = 1.0 if n_seq_clu == n_struct_clu else 0.0
+     else:
+         table = [[0 for _ in range(n_struct_clu)] for _ in range(n_seq_clu)]
+         for seq_c, struct_c in zip(seq_clu, struct_clu):
+             table[seq_c - 1][struct_c - 1] += 1
+         consistency = association(np.array(table), method='cramer')
+
+     return seq_div, struct_div, co_div, consistency
+
+
+ if __name__ == '__main__':
+     N, L = 100, 10
+     a = np.random.randn(N, L, 3)
+     print(struct_diversity(a))
+     from utils.const import aas
+     aas = [tup[0] for tup in aas]
+     seqs = np.random.randint(0, len(aas), (N, L))
+     seqs = [''.join([aas[i] for i in idx]) for idx in seqs]
+     print(seq_diversity(seqs, 0.4))
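A minimal sketch of the joint metric on a handful of candidates; the sequences and the [N, L, 3] alpha-carbon coordinates below are hypothetical stand-ins for generated peptides:

```python
import numpy as np
from evaluation.diversity import diversity

seqs = ['PKGYAAPSA', 'PKGYAAPSV', 'KPAVYKFTL', 'KPAVYKFTV']
ca_coords = np.random.randn(4, 9, 3)   # stand-in for real alpha-carbon coordinates
seq_div, struct_div, co_div, consistency = diversity(seqs, ca_coords)
print(seq_div, struct_div, co_div, consistency)
```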
evaluation/dockq.py ADDED
@@ -0,0 +1,15 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import os
+ import re
+
+ from globals import DOCKQ_DIR
+
+
+ def dockq(mod_pdb: str, native_pdb: str, pep_chain: str):
+     p = os.popen(f'{os.path.join(DOCKQ_DIR, "DockQ.py")} {mod_pdb} {native_pdb} -model_chain1 {pep_chain} -native_chain1 {pep_chain} -no_needle')
+     text = p.read()
+     p.close()
+     res = re.search(r'DockQ\s+([0-1]\.[0-9]+)', text)
+     score = float(res.group(1))
+     return score
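Usage sketch (paths are hypothetical); note that `-no_needle` skips sequence alignment, so the model and native PDBs must share chain ids and residue numbering:

```python
from evaluation.dockq import dockq

score = dockq('candidate.pdb', 'native.pdb', pep_chain='B')
print(score)   # in [0, 1]; higher means a closer match to the native interface
```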
evaluation/rmsd.py ADDED
@@ -0,0 +1,11 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import numpy as np
+
+
+ # a: [N, 3], b: [N, 3]
+ def compute_rmsd(a, b, aligned=False):  # amino-acid-level rmsd; assumes a and b are already superimposed (aligned is unused)
+     dist = np.sum((a - b) ** 2, axis=-1)
+     rmsd = np.sqrt(dist.sum() / a.shape[0])
+     return float(rmsd)
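Since `compute_rmsd` assumes pre-superimposed coordinates, a standard Kabsch superposition can be applied first for the unaligned case. The helper below is a hypothetical sketch, not part of this module:

```python
import numpy as np

def kabsch_align(a: np.ndarray, b: np.ndarray) -> np.ndarray:
    '''Optimally rotate/translate a ([N, 3]) onto b ([N, 3]) before computing RMSD.'''
    a_c, b_c = a - a.mean(axis=0), b - b.mean(axis=0)
    U, _, Vt = np.linalg.svd(a_c.T @ b_c)      # SVD of the covariance matrix
    d = np.sign(np.linalg.det(U @ Vt))         # avoid improper rotations (reflections)
    R = U @ np.diag([1.0, 1.0, d]) @ Vt
    return a_c @ R + b.mean(axis=0)

# compute_rmsd(kabsch_align(a, b), b) then gives the aligned RMSD
```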
evaluation/seq_metric.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ from math import sqrt
+
+ from Bio.Align import substitution_matrices, PairwiseAligner
+
+
+ def aar(candidate, reference):
+     hit = 0
+     for a, b in zip(candidate, reference):
+         if a == b:
+             hit += 1
+     return hit / len(reference)
+
+
+ def align_sequences(sequence_A, sequence_B, **kwargs):
+     """
+     Performs a global pairwise alignment between two sequences
+     using the BLOSUM62 matrix and the Needleman-Wunsch algorithm
+     as implemented in Biopython. Returns the alignment and the
+     sequence identity (alignment score normalized by the geometric
+     mean of the two self-alignment scores).
+     """
+     sub_matrix = substitution_matrices.load('BLOSUM62')
+     aligner = PairwiseAligner()
+     aligner.substitution_matrix = sub_matrix
+     alns = aligner.align(sequence_A, sequence_B)
+
+     best_aln = alns[0]
+     aligned_A, aligned_B = best_aln
+
+     base = sqrt(aligner.score(sequence_A, sequence_A) * aligner.score(sequence_B, sequence_B))
+     seq_id = aligner.score(sequence_A, sequence_B) / base
+     return (aligned_A, aligned_B), seq_id
+
+
+ def slide_aar(candidate, reference, aar_func):
+     '''
+     e.g.
+     candidate: AILPV
+     reference: ILPVH
+
+     should be matched as
+     AILPV
+      ILPVH
+
+     To do this, we slide the candidate against the reference and take the maximum aar:
+     A
+     AI
+     AIL
+     AILP
+     AILPV
+     ILPV
+     LPV
+     PV
+     V
+     '''
+     special_token = ' '
+     ref_len = len(reference)
+     padded_candidate = special_token * (ref_len - 1) + candidate + special_token * (ref_len - 1)
+     value = 0
+     for start in range(len(padded_candidate) - ref_len + 1):
+         value = max(value, aar_func(padded_candidate[start:start + ref_len], reference))
+     return value
+
+
+ if __name__ == '__main__':
+     print(align_sequences('PKGYAAPSA', 'KPAVYKFTL'))
+     print(align_sequences('KPAVYKFTL', 'PKGYAAPSA'))
+     print(align_sequences('PKGYAAPSA', 'PKGYAAPSA'))
+     print(align_sequences('KPAVYKFTL', 'KPAVYKFTL'))
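For example, `aar` compares positions directly, while `slide_aar` recovers matches between offset sequences, following the docstring above:

```python
from evaluation.seq_metric import aar, slide_aar

print(aar('AILPV', 'ILPVH'))              # 0.0: no position-wise matches
print(slide_aar('AILPV', 'ILPVH', aar))   # 0.8: 'ILPV' overlaps after sliding by one
```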
generate.py ADDED
@@ -0,0 +1,235 @@
+ #!/usr/bin/python
+ # -*- coding:utf-8 -*-
+ import argparse
+ import json
+ import os
+ import pickle as pkl
+ from tqdm import tqdm
+ from copy import deepcopy
+ from multiprocessing import Pool
+
+ import yaml
+ import torch
+
+ import models
+ from utils.config_utils import overwrite_values
+ from data.converter.pdb_to_list_blocks import pdb_to_list_blocks
+ from data.converter.list_blocks_to_pdb import list_blocks_to_pdb
+ from data.format import VOCAB, Atom
+ from data import create_dataloader, create_dataset
+ from utils.logger import print_log
+ from utils.random_seed import setup_seed
+ from utils.const import sidechain_atoms
+
+
+ def get_best_ckpt(ckpt_dir):
+     with open(os.path.join(ckpt_dir, 'checkpoint', 'topk_map.txt'), 'r') as f:
+         ls = f.readlines()
+     ckpts = []
+     for l in ls:
+         k, v = l.strip().split(':')
+         k = float(k)
+         v = v.split('/')[-1]
+         ckpts.append((k, v))
+
+     # ckpts = sorted(ckpts, key=lambda x: x[0])
+     best_ckpt = ckpts[0][1]
+     return os.path.join(ckpt_dir, 'checkpoint', best_ckpt)
+
+
+ def to_device(data, device):
+     if isinstance(data, dict):
+         for key in data:
+             data[key] = to_device(data[key], device)
+     elif isinstance(data, list) or isinstance(data, tuple):
+         res = [to_device(item, device) for item in data]
+         data = type(data)(res)
+     elif hasattr(data, 'to'):
+         data = data.to(device)
+     return data
+
+
+ def clamp_coord(coord):
+     # some models (e.g. diffab) output very large coordinates (absolute value >= 1000) which corrupt the pdb file
+     new_coord = []
+     for val in coord:
+         if abs(val) >= 1000:
+             val = 0
+         new_coord.append(val)
+     return new_coord
+
+
+ def overwrite_blocks(blocks, seq=None, X=None):
+     if seq is not None:
+         assert len(blocks) == len(seq), f'{len(blocks)} {len(seq)}'
+     new_blocks = []
+     for i, block in enumerate(blocks):
+         block = deepcopy(block)
+         if seq is None:
+             abrv = block.abrv
+         else:
+             abrv = VOCAB.symbol_to_abrv(seq[i])
+         if block.abrv != abrv:
+             if X is None:
+                 # residue type changed but no new coordinates: keep only the backbone atoms
+                 block.units = [atom for atom in block.units if atom.name in VOCAB.backbone_atoms]
+         if X is not None:
+             coords = X[i]
+             atoms = VOCAB.backbone_atoms + sidechain_atoms[VOCAB.abrv_to_symbol(abrv)]
+             block.units = [
+                 Atom(atom_name, clamp_coord(coord), atom_name[0]) for atom_name, coord in zip(atoms, coords)
+             ]
+         block.abrv = abrv
+         new_blocks.append(block)
+     return new_blocks
+
+
+ def generate_wrapper(model, sample_opt={}):
+     if isinstance(model, models.AutoEncoder):
+         def wrapper(batch):
+             X, S, ppls = model.test(batch['X'], batch['S'], batch['mask'], batch['position_ids'], batch['lengths'], batch['atom_mask'])
+             return X, S, ppls
+     elif isinstance(model, models.LDMPepDesign):
+         def wrapper(batch):
+             X, S, ppls = model.sample(
+                 batch['X'], batch['S'], batch['mask'], batch['position_ids'], batch['lengths'], batch['atom_mask'],
+                 L=batch['L'] if 'L' in batch else None, sample_opt=sample_opt
+             )
+             return X, S, ppls
+     else:
+         raise NotImplementedError(f'Wrapper for {type(model)} not implemented')
+     return wrapper
+
+
+ def save_data(
+     _id, n,
+     x_pkl_file, s_pkl_file, pmetric_pkl_file,
+     ref_pdb, rec_chain, lig_chain, ref_save_dir, cand_save_dir,
+     seq_only, struct_only, backbone_only
+ ):
+     X = pkl.load(open(x_pkl_file, 'rb'))
+     S = pkl.load(open(s_pkl_file, 'rb'))
+     pmetric = pkl.load(open(pmetric_pkl_file, 'rb'))
+     for tmp_file in (x_pkl_file, s_pkl_file, pmetric_pkl_file):
+         os.remove(tmp_file)
+     if seq_only:
+         X = None
+     elif struct_only:
+         S = None
+     rec_blocks, lig_blocks = pdb_to_list_blocks(ref_pdb, selected_chains=[rec_chain, lig_chain])
+     ref_pdb = os.path.join(ref_save_dir, _id + '_ref.pdb')
+     list_blocks_to_pdb([rec_blocks, lig_blocks], [rec_chain, lig_chain], ref_pdb)
+     ref_seq = ''.join([VOCAB.abrv_to_symbol(block.abrv) for block in lig_blocks])
+     lig_blocks = overwrite_blocks(lig_blocks, S, X)
+     gen_seq = ''.join([VOCAB.abrv_to_symbol(block.abrv) for block in lig_blocks])
+     save_dir = os.path.join(cand_save_dir, _id)
+     if not os.path.exists(save_dir):
+         os.makedirs(save_dir)
+     gen_pdb = os.path.join(save_dir, _id + f'_gen_{n}.pdb')
+     list_blocks_to_pdb([rec_blocks, lig_blocks], [rec_chain, lig_chain], gen_pdb)
+
+     return {
+         'id': _id,
+         'number': n,
+         'gen_pdb': gen_pdb,
+         'ref_pdb': ref_pdb,
+         'pmetric': pmetric,
+         'rec_chain': rec_chain,
+         'lig_chain': lig_chain,
+         'ref_seq': ref_seq,
+         'gen_seq': gen_seq,
+         'seq_only': seq_only,
+         'struct_only': struct_only,
+         'backbone_only': backbone_only
+     }
+
+
+ def main(args, opt_args):
+     config = yaml.safe_load(open(args.config, 'r'))
+     config = overwrite_values(config, opt_args)
+     struct_only = config.get('struct_only', False)
+     seq_only = config.get('seq_only', False)
+     assert not (seq_only and struct_only)
+     backbone_only = config.get('backbone_only', False)
+     # load model
+     b_ckpt = args.ckpt if args.ckpt.endswith('.ckpt') else get_best_ckpt(args.ckpt)
+     ckpt_dir = os.path.split(os.path.split(b_ckpt)[0])[0]
+     print(f'Using checkpoint {b_ckpt}')
+     model = torch.load(b_ckpt, map_location='cpu')
+     device = torch.device('cpu' if args.gpu == -1 else f'cuda:{args.gpu}')
+     model.to(device)
+     model.eval()
+
+     # load data
+     _, _, test_set = create_dataset(config['dataset'])
+     test_loader = create_dataloader(test_set, config['dataloader'])
+
+     # save path
+     if args.save_dir is None:
+         save_dir = os.path.join(ckpt_dir, 'results')
+     else:
+         save_dir = args.save_dir
+     ref_save_dir = os.path.join(save_dir, 'references')
+     cand_save_dir = os.path.join(save_dir, 'candidates')
+     for directory in [ref_save_dir, cand_save_dir]:
+         if not os.path.exists(directory):
+             os.makedirs(directory)
+
+     fout = open(os.path.join(save_dir, 'results.jsonl'), 'w')
+
+     # multiprocessing
+     pool = Pool(args.n_cpu)
+
+     n_samples = config.get('n_samples', 1)
+
+     pbar = tqdm(total=n_samples * len(test_loader))
+     for n in range(n_samples):
+         item_idx = 0
+         with torch.no_grad():
+             for batch in test_loader:
+                 batch = to_device(batch, device)
+                 batch_X, batch_S, batch_pmetric = generate_wrapper(model, deepcopy(config.get('sample_opt', {})))(batch)
+
+                 # parallel saving
+                 inputs = []
+                 for X, S, pmetric in zip(batch_X, batch_S, batch_pmetric):
+                     _id, ref_pdb, rec_chain, lig_chain = test_set.get_summary(item_idx)
+                     # save temporary pickle files
+                     x_pkl_file = os.path.join(save_dir, _id + f'_gen_{n}_X.pkl')
+                     pkl.dump(X, open(x_pkl_file, 'wb'))
+                     s_pkl_file = os.path.join(save_dir, _id + f'_gen_{n}_S.pkl')
+                     pkl.dump(S, open(s_pkl_file, 'wb'))
+                     pmetric_pkl_file = os.path.join(save_dir, _id + f'_gen_{n}_pmetric.pkl')
+                     pkl.dump(pmetric, open(pmetric_pkl_file, 'wb'))
+                     inputs.append((
+                         _id, n,
+                         x_pkl_file, s_pkl_file, pmetric_pkl_file,
+                         ref_pdb, rec_chain, lig_chain, ref_save_dir, cand_save_dir,
+                         seq_only, struct_only, backbone_only
+                     ))
+                     item_idx += 1
+
+                 results = pool.starmap(save_data, inputs)
+                 for result in results:
+                     fout.write(json.dumps(result) + '\n')
+
+                 pbar.update(1)
+
+     fout.close()
+
+
+ def parse():
+     parser = argparse.ArgumentParser(description='Generate peptides given epitopes')
+     parser.add_argument('--config', type=str, required=True, help='Path to the test configuration')
+     parser.add_argument('--ckpt', type=str, required=True, help='Path to checkpoint')
+     parser.add_argument('--save_dir', type=str, default=None, help='Directory to save generated peptides')
+
+     parser.add_argument('--gpu', type=int, default=0, help='GPU to use, -1 for cpu')
+     parser.add_argument('--n_cpu', type=int, default=4, help='Number of CPUs to use (for saving generated results in parallel)')
+     return parser.parse_known_args()
+
+
+ if __name__ == '__main__':
+     args, opt_args = parse()
+     print_log(f'Overwriting args: {opt_args}')
+     setup_seed(12)
+     main(args, opt_args)
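A hypothetical invocation of the script above: e.g. `python generate.py --config configs/pepbench/test_codesign.yaml --ckpt <experiment_dir> --gpu 0 --n_cpu 4`, where the config file ships with this commit and `<experiment_dir>` is a placeholder for a training run directory containing `checkpoint/topk_map.txt`.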