angepapa committed · Commit fec9f61 · verified · 1 parent: 293e2cf

Upload 404 files

This view is limited to 50 files because it contains too many changes. See the raw diff for the full list.

Files changed (50)
  1. .gitattributes +26 -0
  2. Dockerfile +38 -0
  3. ParaSurf/create_datasets_from_csv/README.md +37 -0
  4. ParaSurf/create_datasets_from_csv/__pycache__/split_pdb2chains_only.cpython-39.pyc +0 -0
  5. ParaSurf/create_datasets_from_csv/final_dataset_preparation.py +146 -0
  6. ParaSurf/create_datasets_from_csv/process_csv_dataset.py +130 -0
  7. ParaSurf/create_datasets_from_csv/split_pdb2chains_only.py +43 -0
  8. ParaSurf/model/ParaSurf_model.py +173 -0
  9. ParaSurf/model/__pycache__/ParaSurf_model.cpython-310.pyc +0 -0
  10. ParaSurf/model/__pycache__/ParaSurf_model.cpython-39.pyc +0 -0
  11. ParaSurf/model/__pycache__/dataset.cpython-310.pyc +0 -0
  12. ParaSurf/model/__pycache__/dataset.cpython-39.pyc +0 -0
  13. ParaSurf/model/dataset.py +107 -0
  14. ParaSurf/model_weights/README.md +11 -0
  15. ParaSurf/preprocess/README.md +71 -0
  16. ParaSurf/preprocess/__pycache__/check_empty_features.cpython-310.pyc +0 -0
  17. ParaSurf/preprocess/__pycache__/check_empty_features.cpython-39.pyc +0 -0
  18. ParaSurf/preprocess/__pycache__/clean_dataset.cpython-310.pyc +0 -0
  19. ParaSurf/preprocess/__pycache__/clean_dataset.cpython-39.pyc +0 -0
  20. ParaSurf/preprocess/check_empty_features.py +68 -0
  21. ParaSurf/preprocess/check_rec_ant_touch.py +89 -0
  22. ParaSurf/preprocess/clean_dataset.py +27 -0
  23. ParaSurf/preprocess/create_input_features.py +230 -0
  24. ParaSurf/preprocess/create_proteins_file.py +23 -0
  25. ParaSurf/preprocess/create_sample_files.py +31 -0
  26. ParaSurf/preprocess/create_surfpoints.py +57 -0
  27. ParaSurf/train/V_domain_results.py +159 -0
  28. ParaSurf/train/__pycache__/V_domain_results.cpython-310.pyc +0 -0
  29. ParaSurf/train/__pycache__/V_domain_results.cpython-39.pyc +0 -0
  30. ParaSurf/train/__pycache__/bsite_extraction.cpython-310.pyc +0 -0
  31. ParaSurf/train/__pycache__/bsite_extraction.cpython-39.pyc +0 -0
  32. ParaSurf/train/__pycache__/distance_coords.cpython-310.pyc +0 -0
  33. ParaSurf/train/__pycache__/distance_coords.cpython-39.pyc +0 -0
  34. ParaSurf/train/__pycache__/features.cpython-310.pyc +0 -0
  35. ParaSurf/train/__pycache__/features.cpython-39.pyc +0 -0
  36. ParaSurf/train/__pycache__/network.cpython-310.pyc +0 -0
  37. ParaSurf/train/__pycache__/network.cpython-39.pyc +0 -0
  38. ParaSurf/train/__pycache__/protein.cpython-310.pyc +0 -0
  39. ParaSurf/train/__pycache__/protein.cpython-39.pyc +0 -0
  40. ParaSurf/train/__pycache__/utils.cpython-310.pyc +0 -0
  41. ParaSurf/train/__pycache__/utils.cpython-39.pyc +0 -0
  42. ParaSurf/train/__pycache__/validation.cpython-310.pyc +0 -0
  43. ParaSurf/train/__pycache__/validation.cpython-39.pyc +0 -0
  44. ParaSurf/train/bsite_extraction.py +48 -0
  45. ParaSurf/train/distance_coords.py +173 -0
  46. ParaSurf/train/features.py +37 -0
  47. ParaSurf/train/network.py +58 -0
  48. ParaSurf/train/protein.py +92 -0
  49. ParaSurf/train/train.py +172 -0
  50. ParaSurf/train/utils.py +497 -0
.gitattributes CHANGED
@@ -33,3 +33,29 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_codecs_cn.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_codecs_hk.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_codecs_jp.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_codecs_kr.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_codecs_tw.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/_ctypes.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/datetime.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/doc/images/flowchart.png filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libcrypto.so.1.0.0 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libexpat.so.1 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libncursesw.so.5 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libpython2.7.so.1.0 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libreadline.so.6 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libssl.so.1.0.0 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libstdc++.so.6 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libtinfo.so.5 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/libz.so.1 filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.core.multiarray.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.core.umath.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.fft.fftpack_lite.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.linalg._umath_linalg.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.linalg.lapack_lite.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/numpy.random.mtrand.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/pdb2pka._apbslib.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/pdb2pka._pMC_mult.x86_64-linux-gnu.so filter=lfs diff=lfs merge=lfs -text
+ pdb2pqr-linux-bin64-2.1.1/pdb2pqr filter=lfs diff=lfs merge=lfs -text
Dockerfile ADDED
@@ -0,0 +1,38 @@
+ FROM continuumio/miniconda3
+
+ WORKDIR /app
+ COPY . /app
+
+ # Install system dependencies
+ RUN apt-get update && apt-get install -y build-essential wget
+
+ # Create Conda environment and install dependencies
+ RUN conda create -n ParaSurf python=3.10 openbabel -c conda-forge -y
+ RUN conda run -n ParaSurf pip install -r requirements.txt
+
+ # Increase file descriptor limit to prevent FD_SETSIZE error
+ RUN echo "ulimit -n 65535" >> ~/.bashrc
+
+ # Download missing Gradio frpc binary
+ RUN wget https://cdn-media.huggingface.co/frpc-gradio-0.3/frpc_linux_amd64 -O /opt/conda/envs/ParaSurf/lib/python3.10/site-packages/gradio/frpc_linux_amd64_v0.3 && \
+     chmod +x /opt/conda/envs/ParaSurf/lib/python3.10/site-packages/gradio/frpc_linux_amd64_v0.3
+
+ # Install DMS software
+ WORKDIR /app/dms
+ RUN make install
+
+ # Ensure necessary binaries are executable
+ RUN chmod +x /app/pdb2pqr-linux-bin64-2.1.1/pdb2pqr && \
+     chmod +x /opt/conda/envs/ParaSurf/bin/* && \
+     chmod -R 755 /app
+
+ # Set writable directories for Matplotlib cache
+ ENV MPLCONFIGDIR=/tmp/matplotlib
+ ENV XDG_CACHE_HOME=/tmp
+
+ WORKDIR /app
+ EXPOSE 7860
+
+ # Run the app with higher file descriptor limits
+ CMD ["bash", "-c", "ulimit -n 65535 && conda run --no-capture-output -n ParaSurf python app.py"]
+
ParaSurf/create_datasets_from_csv/README.md ADDED
@@ -0,0 +1,37 @@
+ # **Dataset preparation**
+
+ ### Steps for Dataset Preparation
+ #### Step 1
+ Download the specific PDB files: use the process_csv_dataset.py script to download the PDB files listed in the .csv files.
+ ```bash
+ # Step 1: Download specified PDB files
+ python process_csv_dataset.py
+ ```
+
+ #### Step 2
+ Generate the final complexes: run final_dataset_preparation.py to arrange the files into complexes with the chain IDs specified in the .csv files.
+ ```bash
+ # Step 2: Organize files into final complexes
+ python final_dataset_preparation.py
+ ```
+
+ After running these scripts, you will find a test_data/pdbs folder organized as follows:
+ ```bash
+ ├── PECAN
+ │   ├── TRAIN
+ │   │   ├── 1A3R_receptor_1.pdb
+ │   │   ├── 1A3R_antigen_1_1.pdb
+ │   │   ├── ...
+ │   │   ├── 5WUX_receptor_1.pdb
+ │   │   └── 5WUX_antigen_1_1.pdb
+ │   ├── VAL
+ │   └── TEST
+ ├── Paragraph_Expanded
+ │   ├── TRAIN
+ │   ├── VAL
+ │   └── TEST
+ └── MIPE
+     ├── TRAIN_VAL
+     └── TEST
+ ```
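
For reference, both scripts read the per-split .csv files with pandas and expect the four columns that process_csv_dataset.py assigns (`pdb_code`, `Light_chain`, `Heavy_chain`, `ag`), with multiple antigen chains separated by `;`. A minimal sketch of what a row is assumed to look like (the chain values below are illustrative placeholders, not taken from the real splits):

```python
import pandas as pd

# Hypothetical example row; real rows come from the PECAN / Paragraph_Expanded / MIPE csv files.
example = pd.DataFrame(
    [{"pdb_code": "1A3R", "Light_chain": "L", "Heavy_chain": "H", "ag": "A;B"}]
)
example.to_csv("example_split.csv", index=False)

df = pd.read_csv("example_split.csv")
for _, row in df.iterrows():
    # same ';' convention that final_dataset_preparation.py uses for the 'ag' column
    antigen_chains = [c.strip() for c in row["ag"].split(";")]
    print(row["pdb_code"], row["Heavy_chain"], row["Light_chain"], antigen_chains)
```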
ParaSurf/create_datasets_from_csv/__pycache__/split_pdb2chains_only.cpython-39.pyc ADDED
Binary file (1.54 kB).
 
ParaSurf/create_datasets_from_csv/final_dataset_preparation.py ADDED
@@ -0,0 +1,146 @@
+ import os
+ import shutil
+ import pandas as pd
+ from split_pdb2chains_only import extract_chains_from_pdb
+ from tqdm import tqdm
+
+
+ def process_raw_pdb_data(info_df, initial_raw_pdb_files, final_folder):
+     """
+     Processes the raw PDB files by extracting the specific antibody and antigen chains listed in the .csv file,
+     merging them, and saving the merged files in the final train/val/test folder.
+
+     Parameters:
+     info_df (DataFrame): DataFrame containing the PDB codes, antibody chains, and antigen chains.
+     initial_raw_pdb_files (str): Path to the initial raw PDB files directory.
+     final_folder (str): Path to the folder where the processed files will be saved.
+     """
+     if not os.path.exists(final_folder):
+         os.makedirs(final_folder)
+
+     for i, row in tqdm(info_df.iterrows(), total=len(info_df)):
+         pdb_id = row['pdb_code']
+         ab_heavy_chain = row['Heavy_chain']  # use only this line to construct the heavy-chain-only dataset
+         ab_light_chain = row['Light_chain']  # use only this line to construct the light-chain-only dataset
+         ag_chain = row['ag']
+
+         pdb_file = os.path.join(initial_raw_pdb_files, pdb_id + '.pdb')
+         # Extract all the chains from the pdb file and save them to /tmp
+         chain_files, all_chains = extract_chains_from_pdb(pdb_file, '/tmp')
+
+         # Assign the correct chains
+         ab_heavy_chain_path = f'/tmp/{pdb_id}_chain{ab_heavy_chain}.pdb'
+         ab_light_chain_path = f'/tmp/{pdb_id}_chain{ab_light_chain}.pdb'
+
+         # Merge antibody chains into one file
+         receptor_output_path = f'{final_folder}/{pdb_id}_receptor_1.pdb'
+         with open(receptor_output_path, 'w') as receptor_file:
+             # drop ab_heavy_chain_path or ab_light_chain_path here when building a heavy-/light-chain-only dataset
+             for ab_file in [ab_heavy_chain_path, ab_light_chain_path]:
+                 with open(ab_file, 'r') as infile:
+                     receptor_file.write(infile.read())
+
+         print(f"Successfully merged {ab_heavy_chain} and {ab_light_chain} into {receptor_output_path}")
+
+         ag_chain_list = ag_chain.split(';')
+
+         if len(ag_chain_list) == 1:
+             # If there's only one antigen chain
+             ag_chain_1 = ag_chain_list[0].strip()
+             ag_chain_1_path = f'/tmp/{pdb_id}_chain{ag_chain_1}.pdb'
+             print(f"Handling one antigen chain: {ag_chain_1}")
+
+             # Copy the single antigen chain to the output
+             antigen_output_path = f'{final_folder}/{pdb_id}_antigen_1_1.pdb'
+             shutil.copyfile(ag_chain_1_path, antigen_output_path)
+
+             print(f"Successfully copied {ag_chain_1} to {antigen_output_path}")
+
+         elif len(ag_chain_list) == 2:
+             # If there are two antigen chains
+             ag_chain_1, ag_chain_2 = [c.strip() for c in ag_chain_list]
+             ag_chain_1_path = f'/tmp/{pdb_id}_chain{ag_chain_1}.pdb'
+             ag_chain_2_path = f'/tmp/{pdb_id}_chain{ag_chain_2}.pdb'
+             print(f"Handling two antigen chains: {ag_chain_1}, {ag_chain_2}")
+
+             # Merge the antigen chains into a single PDB file
+             antigen_output_path = f'{final_folder}/{pdb_id}_antigen_1_1.pdb'
+             with open(antigen_output_path, 'w') as outfile:
+                 for ag_file in [ag_chain_1_path, ag_chain_2_path]:
+                     with open(ag_file, 'r') as infile:
+                         outfile.write(infile.read())
+
+             print(f"Successfully merged {ag_chain_1} and {ag_chain_2} into {antigen_output_path}")
+
+         elif len(ag_chain_list) == 3:
+             # If there are three antigen chains
+             ag_chain_1, ag_chain_2, ag_chain_3 = [c.strip() for c in ag_chain_list]
+             ag_chain_1_path = f'/tmp/{pdb_id}_chain{ag_chain_1}.pdb'
+             ag_chain_2_path = f'/tmp/{pdb_id}_chain{ag_chain_2}.pdb'
+             ag_chain_3_path = f'/tmp/{pdb_id}_chain{ag_chain_3}.pdb'
+             print(f"Handling three antigen chains: {ag_chain_1}, {ag_chain_2}, {ag_chain_3}")
+
+             # Merge the antigen chains into a single PDB file
+             antigen_output_path = f'{final_folder}/{pdb_id}_antigen_1_1.pdb'
+             with open(antigen_output_path, 'w') as outfile:
+                 for ag_file in [ag_chain_1_path, ag_chain_2_path, ag_chain_3_path]:
+                     with open(ag_file, 'r') as infile:
+                         outfile.write(infile.read())
+
+             print(f"Successfully merged {ag_chain_1}, {ag_chain_2}, and {ag_chain_3} into {antigen_output_path}")
+
+         # At the end, remove all the chain pdb files from the /tmp folder
+         for chain_file in chain_files:
+             os.remove(chain_file)
+
+
+ if __name__ == '__main__':
+     user = os.getenv('USER')
+
+     datasets = ['PECAN', 'Paragraph_Expanded', 'MIPE']
+
+     for dataset in datasets:
+         if dataset == 'MIPE':  # here the split is train-val and test, following the MIPE paper
+             # csv paths
+             train_val_info = pd.read_csv(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/train_val.csv')
+             test_info = pd.read_csv(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/test_set.csv')
+
+             # paths to the initial raw PDB storage
+             init_pdb_files_train_val = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/train_val_data_initial_raw_pdb_files'
+             init_pdb_files_test = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/test_data_initial_raw_pdb_files'
+
+             # final folders
+             final_train_val_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/TRAIN_VAL'
+             final_test_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/TEST'
+
+             process_raw_pdb_data(train_val_info, init_pdb_files_train_val, final_train_val_folder)
+             process_raw_pdb_data(test_info, init_pdb_files_test, final_test_folder)
+
+             shutil.rmtree(init_pdb_files_train_val)
+             shutil.rmtree(init_pdb_files_test)
+
+         else:
+             # Paths to dataset csv files
+             train_info = pd.read_csv(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/train_set.csv')
+             val_info = pd.read_csv(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/val_set.csv')
+             test_info = pd.read_csv(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/test_set.csv')
+
+             # Paths to the initial raw pdb files
+             initial_pdb_files_train = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/train_data_initial_raw_pdb_files'
+             initial_pdb_files_val = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/val_data_initial_raw_pdb_files'
+             initial_pdb_files_test = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/test_data_initial_raw_pdb_files'
+
+             # Final folders for the merged files that contain the final PDB complexes
+             final_train_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/TRAIN'
+             final_val_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/VAL'
+             final_test_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/TEST'
+
+             # Process the train-val-test data
+             process_raw_pdb_data(train_info, initial_pdb_files_train, final_train_folder)
+             process_raw_pdb_data(val_info, initial_pdb_files_val, final_val_folder)
+             process_raw_pdb_data(test_info, initial_pdb_files_test, final_test_folder)
+
+             # Remove the initial raw pdb files
+             shutil.rmtree(initial_pdb_files_train)
+             shutil.rmtree(initial_pdb_files_val)
+             shutil.rmtree(initial_pdb_files_test)
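
The 1/2/3-antigen-chain branches in process_raw_pdb_data repeat the same merge logic. Purely as an illustration (this helper is not part of the file above; it assumes the same /tmp chain-file naming produced by extract_chains_from_pdb), the merging could be generalized to any number of ';'-separated chains:

```python
def merge_antigen_chains(pdb_id, ag_chain, final_folder, tmp_dir="/tmp"):
    """Sketch: merge an arbitrary number of ';'-separated antigen chains into one PDB file."""
    chains = [c.strip() for c in ag_chain.split(';')]
    antigen_output_path = f"{final_folder}/{pdb_id}_antigen_1_1.pdb"
    with open(antigen_output_path, "w") as outfile:
        for chain_id in chains:
            # chain files written to tmp_dir by extract_chains_from_pdb
            chain_path = f"{tmp_dir}/{pdb_id}_chain{chain_id}.pdb"
            with open(chain_path, "r") as infile:
                outfile.write(infile.read())
    return antigen_output_path
```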
ParaSurf/create_datasets_from_csv/process_csv_dataset.py ADDED
@@ -0,0 +1,130 @@
+ import os
+ import pandas as pd
+ from Bio.PDB import PDBList
+ from tqdm import tqdm
+
+
+ def add_headers_if_not_present(csv_file, headerlist):
+     """
+     Add headers to the CSV file if they are not already present.
+
+     Parameters:
+     csv_file (str): Path to the CSV file.
+     headerlist (list): List of headers to add.
+     """
+     # Read the first row to check for headers
+     first_row = pd.read_csv(csv_file, nrows=1)
+
+     # Check if the first row contains the expected headers
+     if list(first_row.columns) != headerlist:
+         print(f"Headers not found in {csv_file}. Adding headers...")
+         # Load the full data without headers
+         data = pd.read_csv(csv_file, header=None)
+         # Assign the correct headers
+         data.columns = headerlist
+         # Save the file with the correct headers
+         data.to_csv(csv_file, header=True, index=False)
+         print(f"Headers added to {csv_file}")
+     else:
+         print(f"Headers already present in {csv_file}. No changes made.")
+
+
+ def download_pdb(pdb_code, output_dir):
+     pdbl = PDBList()
+     pdbl.retrieve_pdb_file(pdb_code, pdir=output_dir, file_format='pdb')
+
+
+ def download_and_rename_pdb_files(pdb_list, folder):
+     """
+     Downloads PDB files from the provided list and renames them from `.ent` to `{pdb_code}.pdb`.
+
+     Parameters:
+     pdb_list (list): List of PDB codes to be downloaded.
+     folder (str): Directory where the PDB files will be saved and renamed.
+     """
+     # Download PDB files
+     for pdb_code in pdb_list:
+         download_pdb(pdb_code, folder)
+
+     # Rename files to {pdb_code}.pdb
+     for pdb_file in os.listdir(folder):
+         if pdb_file.endswith('.ent'):
+             old_file_path = os.path.join(folder, pdb_file)
+             new_file_name = pdb_file.split('.')[0][-4:].upper() + '.pdb'  # upper case because the csv gives the PDB codes in capitals
+             new_file_path = os.path.join(folder, new_file_name)
+             os.rename(old_file_path, new_file_path)
+             print(f"Renamed {old_file_path} to {new_file_path}")
+
+
+ def process_dataset(csv_file, folder):
+     """
+     Processes a dataset by adding headers, extracting PDB codes, and downloading/renaming PDB files.
+
+     Parameters:
+     csv_file (str): Path to the CSV file.
+     folder (str): Directory where the PDB files will be saved and renamed.
+     """
+     # Add headers if not present
+     add_headers_if_not_present(csv_file, headerlist)
+
+     # Read the CSV file
+     dataset = pd.read_csv(csv_file)
+
+     # Create folder if it doesn't exist
+     if not os.path.exists(folder):
+         os.makedirs(folder)
+
+     # Initialize the PDB list
+     pdb_list = []
+
+     # Process each row
+     for i, row in dataset.iterrows():
+         pdb_list.append(row['pdb_code'])
+
+     # Download and rename PDB files
+     download_and_rename_pdb_files(pdb_list, folder)
+
+
+ if __name__ == '__main__':
+
+     # ALL datasets follow the same process
+     user = os.getenv('USER')
+     datasets = ['PECAN', 'Paragraph_Expanded', 'MIPE']
+
+     # Define the correct headers
+     headerlist = ['pdb_code', 'Light_chain', 'Heavy_chain', 'ag']
+
+     for dataset in datasets:
+
+         if dataset == 'MIPE':  # here the split is train-val and test, following the MIPE paper
+             # csv paths
+             train_val = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/train_val.csv'
+             test = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/test_set.csv'
+
+             # paths to the initial raw PDB storage
+             train_val_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/train_val_data_initial_raw_pdb_files'
+             test_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/test_data_initial_raw_pdb_files'
+
+             process_dataset(train_val, train_val_folder)
+             process_dataset(test, test_folder)
+
+         else:
+             # Paths to your CSV files. Download the dataset from here: https://github.com/oxpig/Paragraph/tree/main/training_data/Expanded
+             train = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/train_set.csv'
+             val = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/val_set.csv'
+             test = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/training_data/{dataset}/test_set.csv'
+
+             # Paths for the initial raw PDB file storage
+             train_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/train_data_initial_raw_pdb_files'
+             val_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/val_data_initial_raw_pdb_files'
+             test_folder = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/{dataset}/test_data_initial_raw_pdb_files'
+
+             # Process each dataset split
+             process_dataset(train, train_folder)
+             process_dataset(val, val_folder)
+             process_dataset(test, test_folder)
ParaSurf/create_datasets_from_csv/split_pdb2chains_only.py ADDED
@@ -0,0 +1,43 @@
+ import os
+
+
+ def extract_chains_from_pdb(pdb_file, output_dir):
+     """
+     Extract and save the chains from a PDB file as separate chain-specific PDB files.
+
+     Args:
+         pdb_file (str): Path to the PDB file.
+         output_dir (str): Path to the directory where the chain-specific files should be saved.
+
+     Returns:
+         tuple: (paths to the chain-specific PDB files, corresponding chain IDs).
+     """
+     chain_dict = {}
+
+     with open(pdb_file, 'r') as f:
+         for line in f:
+             if line.startswith('ATOM'):
+                 chain_id = line[21]
+                 if chain_id in chain_dict:
+                     chain_dict[chain_id].append(line)
+                 else:
+                     chain_dict[chain_id] = [line]
+
+     chain_files = []
+     for chain_id, lines in chain_dict.items():
+         chain_file = os.path.join(output_dir, f"{os.path.splitext(os.path.basename(pdb_file))[0]}_chain{chain_id}.pdb")
+         with open(chain_file, 'w') as f:
+             f.writelines(lines)
+         # print(f'Chain {chain_id} saved as {chain_file}.')
+         chain_files.append(chain_file)
+
+     chain_ids = [chain.split("/")[-1].split(".")[0][-1] for chain in chain_files]
+
+     return chain_files, chain_ids
+
+
+ if __name__ == '__main__':
+     pdb_file = '/home/angepapa/PycharmProjects/DeepSurf2.0/3bgf.pdb'
+     output_dir = "/".join(pdb_file.split('/')[:-1])
+     chain_files, chain_ids = extract_chains_from_pdb(pdb_file, output_dir)
+     print(chain_files)
+     print(chain_ids)
ParaSurf/model/ParaSurf_model.py ADDED
@@ -0,0 +1,173 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ from torchsummary import summary
+ import time
+
+
+ class GeM(nn.Module):
+     def __init__(self, p=3.0, eps=1e-6):
+         super(GeM, self).__init__()
+         # Initialize p as a learnable parameter
+         self.p = nn.Parameter(torch.ones(1) * p)
+         self.eps = eps
+
+     def forward(self, x):
+         return self.gem(x, self.p, self.eps)
+
+     def gem(self, x, p, eps):
+         # Clamp all elements in x to a minimum of eps and then raise them to the power of p.
+         # Apply avg_pool3d with the kernel size equal to the spatial dimensions of the feature map (entire depth, height, width).
+         # Finally, take the power of 1/p to invert the earlier power of p operation.
+         return F.avg_pool3d(x.clamp(min=eps).pow(p), (x.size(2), x.size(3), x.size(4))).pow(1. / p)
+
+     def __repr__(self):
+         # This helps in identifying the layer characteristics when printing the model or layer
+         return self.__class__.__name__ + '(' + 'p=' + '{:.4f}'.format(self.p.data.tolist()[0]) + ', eps=' + str(
+             self.eps) + ')'
+
+
+ # Define a custom Bottleneck module with optional dilation
+ class DilatedBottleneck(nn.Module):
+     expansion = 4
+
+     def __init__(self, in_planes, planes, stride=1, dilation=1, dropout_prob=0.25):
+         super(DilatedBottleneck, self).__init__()
+         self.conv1 = nn.Conv3d(in_planes, planes, kernel_size=1, bias=False)
+         self.bn1 = nn.BatchNorm3d(planes)
+         self.dropout1 = nn.Dropout3d(dropout_prob)
+         self.conv2 = nn.Conv3d(planes, planes, kernel_size=3, stride=stride, padding=dilation, dilation=dilation,
+                                bias=False)
+         self.bn2 = nn.BatchNorm3d(planes)
+         self.dropout2 = nn.Dropout3d(dropout_prob)
+         self.conv3 = nn.Conv3d(planes, self.expansion * planes, kernel_size=1, bias=False)
+         self.bn3 = nn.BatchNorm3d(self.expansion * planes)
+
+         self.shortcut = nn.Sequential()
+         if stride != 1 or in_planes != self.expansion * planes:
+             self.shortcut = nn.Sequential(
+                 nn.Conv3d(in_planes, self.expansion * planes, kernel_size=1, stride=stride, bias=False),
+                 nn.BatchNorm3d(self.expansion * planes)
+             )
+
+     def forward(self, x):
+         out = self.dropout1(F.relu(self.bn1(self.conv1(x))))
+         out = self.dropout2(F.relu(self.bn2(self.conv2(out))))
+         out = self.bn3(self.conv3(out))
+         out += self.shortcut(x)
+         out = F.relu(out)
+         return out
+
+
+ # Define the Transformer Block
+ class TransformerBlock(nn.Module):
+     def __init__(self, feature_size, nhead, num_layers):
+         super(TransformerBlock, self).__init__()
+         self.transformer = nn.TransformerEncoder(
+             nn.TransformerEncoderLayer(d_model=feature_size, nhead=nhead),
+             num_layers=num_layers
+         )
+
+     def forward(self, x):
+         orig_shape = x.shape  # Save original shape
+         x = x.flatten(2)  # Flatten spatial dimensions
+         x = x.permute(2, 0, 1)  # Reshape for the transformer (Seq, Batch, Features)
+         x = self.transformer(x)
+         x = x.permute(1, 2, 0).view(*orig_shape)  # Restore original shape
+         return x
+
+
+ # Define a Compression Layer
+ class CompressionLayer(nn.Module):
+     def __init__(self, in_channels, out_channels):
+         super(CompressionLayer, self).__init__()
+         self.conv1x1 = nn.Conv3d(in_channels, out_channels, kernel_size=1)
+         self.bn = nn.BatchNorm3d(out_channels)
+         self.relu = nn.ReLU(inplace=True)
+
+     def forward(self, x):
+         x = self.conv1x1(x)
+         x = self.bn(x)
+         x = self.relu(x)
+         return x
+
+
+ # Define the enhanced ResNet with a hybrid CNN-Transformer architecture
+ class ResNet3D_Transformer(nn.Module):
+     def __init__(self, in_channels, block, num_blocks, num_classes=1, dropout_prob=0.1):
+         super(ResNet3D_Transformer, self).__init__()
+         self.in_planes = 64
+
+         self.initial_layers = nn.Sequential(
+             nn.Conv3d(in_channels, 64, kernel_size=7, stride=2, padding=3, bias=False),
+             nn.BatchNorm3d(64),
+             nn.ReLU(inplace=True),
+             nn.MaxPool3d(kernel_size=3, stride=2, padding=1)
+         )
+
+         self.layer1 = self._make_layer(block, 64, num_blocks[0], stride=1)
+         self.layer2 = self._make_layer(block, 128, num_blocks[1], stride=2)
+         self.layer3 = self._make_layer(block, 256, num_blocks[2], stride=2, dilation=2)
+         self.layer4 = self._make_layer(block, 512, num_blocks[3], stride=2)
+
+         self.compression = CompressionLayer(512 * block.expansion, 256)
+         self.transformer_block = TransformerBlock(feature_size=256, nhead=8, num_layers=1)  # change to 4
+         # self.gem_pooling = GeM(p=3.0, eps=1e-6)
+         self.dropout = nn.Dropout(dropout_prob)
+
+         self.classifier = nn.Linear(256, num_classes)
+
+     def _make_layer(self, block, planes, num_blocks, stride, dilation=1):
+         strides = [stride] + [1] * (num_blocks - 1)
+         layers = []
+         for s in strides:
+             layers.append(block(self.in_planes, planes, s, dilation))
+             self.in_planes = planes * block.expansion
+         return nn.Sequential(*layers)
+
+     def forward(self, x):
+         x = x.permute(0, 4, 3, 2, 1)
+         x = self.initial_layers(x)
+         x = self.layer1(x)
+         x = self.layer2(x)
+         x = self.layer3(x)
+         x = self.layer4(x)
+         # Compress and transform
+         x = self.compression(x)
+         x = self.transformer_block(x)
+         # Global average pooling
+         x = torch.mean(x, dim=[2, 3, 4])
+
+         # Classify
+         x = self.dropout(x)  # Apply dropout before classification
+         x = self.classifier(x)
+         return x
+
+
+ def count_parameters(model):
+     return sum(p.numel() for p in model.parameters() if p.requires_grad)
+
+
+ if __name__ == "__main__":
+     # device = 'cuda' if torch.cuda.is_available() else 'cpu'
+     device = 'cpu'
+     num_classes = 1
+     num_input_channels = 20  # Number of input channels
+     model = ResNet3D_Transformer(num_input_channels, DilatedBottleneck, [3, 4, 6, 3], num_classes=num_classes).to(device)
+     grid_size = 41  # Assuming the input grid size (for example, 41x41x41x20)
+
+     start = time.time()
+     num_params = count_parameters(model)
+     print(f"Number of parameters in the model: {num_params}")
+     print(model)
+
+     dummy_input = torch.randn(64, grid_size, grid_size, grid_size, num_input_channels).to(device)
+     dummy_input = dummy_input.float().to(device)
+
+     output = model(dummy_input)
+
+     print("Output shape:", output.shape)
+     print(output)
+     print(f'total time: {(time.time() - start) / 60} mins')
+
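
For reference, the GeM module defined at the top of this file (it is only instantiated in the commented-out `gem_pooling` line of ResNet3D_Transformer) implements generalized-mean pooling over the spatial dimensions. The clamp / power / average-pool / root sequence in `gem()` corresponds to:

```latex
\mathrm{GeM}_p(X) \;=\; \left( \frac{1}{|X|} \sum_{x \in X} \max(x,\, \epsilon)^{\,p} \right)^{1/p}
```

which reduces to average pooling for p = 1 and approaches max pooling as p grows; here p is a learnable `nn.Parameter` initialized to 3.0.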
ParaSurf/model/__pycache__/ParaSurf_model.cpython-310.pyc ADDED
Binary file (6.29 kB).
 
ParaSurf/model/__pycache__/ParaSurf_model.cpython-39.pyc ADDED
Binary file (6.36 kB).
 
ParaSurf/model/__pycache__/dataset.cpython-310.pyc ADDED
Binary file (2.76 kB).
 
ParaSurf/model/__pycache__/dataset.cpython-39.pyc ADDED
Binary file (2.76 kB).
 
ParaSurf/model/dataset.py ADDED
@@ -0,0 +1,107 @@
+ import numpy as np
+ import random, os
+ from scipy import sparse
+ from torch.utils.data import Dataset
+ from tqdm import tqdm
+ from torch.utils.data import DataLoader
+ import h5py
+
+
+ class dataset(Dataset):
+     def __init__(self, train_file, batch_size, data_path, grid_size, training, feature_vector_lentgh, feature_names=['deepsite']):
+
+         super(dataset, self).__init__()
+         self.training = training
+         self.feature_vector_lentgh = feature_vector_lentgh
+         # in testing mode the training file is not read
+         if self.training:
+             with open(train_file) as f:
+                 self.train_lines = f.readlines()
+             random.shuffle(self.train_lines)
+         else:
+             self.train_lines = []
+
+         random.shuffle(self.train_lines)
+
+         self.pointer_tr = 0
+         self.pointer_val = 0
+
+         self.batch_size = batch_size
+         self.data_path = data_path
+         self.grid_size = grid_size
+         self.feature_names = feature_names
+         # if added_features is None:  # resolved outside
+         self.nAtomTypes = 0
+
+         self.nfeats = {
+             'deepsite': 8,
+             'kalasanty': feature_vector_lentgh,
+             'kalasanty_with_force_fields': feature_vector_lentgh,
+             'kalasanty_norotgrid': 18,
+             'spat_protr': 1,
+             'spat_protr_norotgrid': 1
+         }
+
+         for name in feature_names:
+             self.nAtomTypes += self.nfeats[name]
+
+     def __len__(self):
+         if self.training:
+             return len(self.train_lines)
+
+     def __getitem__(self, index):
+         if self.training:
+             samples = self.train_lines
+
+         label, sample_file = samples[index].split()
+         label = int(label)
+         base_name, prot, sample = sample_file.split('/')
+
+         feats = np.zeros((self.grid_size, self.grid_size, self.grid_size, self.nAtomTypes))
+         feat_cnt = 0
+
+         for name in self.feature_names:
+             if 'deepsite' == name:
+                 data = np.load(os.path.join(self.data_path, base_name + '_' + name, prot, sample), allow_pickle=True)
+             elif 'kalasanty' == name:
+                 data = sparse.load_npz(os.path.join(self.data_path, base_name, prot, sample[:-1] + 'z'))
+                 data = np.reshape(np.array(data.todense()), (self.grid_size, self.grid_size, self.grid_size, self.nfeats['kalasanty']))
+             elif 'kalasanty_with_force_fields' == name:
+                 data = sparse.load_npz(os.path.join(self.data_path, base_name, prot, sample[:-1] + 'z'))
+                 data = np.reshape(np.array(data.todense()), (self.grid_size, self.grid_size, self.grid_size, self.nfeats['kalasanty_with_force_fields']))
+             elif 'spat_protr' in name:
+                 data = np.load(os.path.join(self.data_path, base_name + '_' + name, prot, sample), allow_pickle=True)
+             else:
+                 print('unknown feat')
+
+             if len(data) == 3:
+                 data = data[2]  # note: only for scPDB for now (the kalasanty features do not store points/normals)
+
+             feats[:, :, :, feat_cnt:feat_cnt + self.nfeats[name]] = data
+             feat_cnt += self.nfeats[name]
+
+         if feat_cnt != self.nAtomTypes:
+             print('error !')
+
+         # Random 90-degree rotation augmentation, followed by an explicit contiguous copy,
+         # because PyTorch cannot handle the negative strides that np.rot90 produces
+         if self.training:
+             rot_axis = random.randint(1, 3)
+             feats_copy = feats.copy()
+             if rot_axis == 1:
+                 feats_copy = np.rot90(feats_copy, random.randint(0, 3), axes=(0, 1))
+             elif rot_axis == 2:
+                 feats_copy = np.rot90(feats_copy, random.randint(0, 3), axes=(0, 2))
+             elif rot_axis == 3:
+                 feats_copy = np.rot90(feats_copy, random.randint(0, 3), axes=(1, 2))
+             feats = np.ascontiguousarray(feats_copy)
+
+         if np.isnan(np.sum(feats)):
+             print('nan input')
+
+         return feats, label
+
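
Since `__getitem__` returns a (grid_size, grid_size, grid_size, channels) NumPy grid plus an integer label, the class plugs directly into a standard PyTorch DataLoader. A minimal usage sketch (the file paths and batch size below are placeholders; the actual values are configured in the training script, which is not shown in this 50-file view):

```python
from torch.utils.data import DataLoader
# from ParaSurf.model.dataset import dataset

train_set = dataset(
    train_file="test_data/datasets/PECAN_TRAIN.samples",  # placeholder path to a .samples file
    batch_size=64,
    data_path="test_data/feats",                          # placeholder path to the feature grids
    grid_size=41,
    training=True,
    feature_vector_lentgh=22,                             # (sic) keyword as spelled in the class above
    feature_names=["kalasanty_with_force_fields"],
)
loader = DataLoader(train_set, batch_size=64, shuffle=True, num_workers=4)

for feats, labels in loader:
    feats = feats.float()  # numpy float64 grids are collated as double tensors; cast for the model
    break
```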
ParaSurf/model_weights/README.md ADDED
@@ -0,0 +1,11 @@
+ ## *ParaSurf Model Weights*
+
+ [Download ParaSurf model weights](https://drive.google.com/drive/folders/1Kpehru9SnWsl7_Wq93WuI_o7f8wrPgpI?usp=drive_link)
+
+ Best model weights for the 3 benchmark datasets:
+ * PECAN Dataset
+ * Paragraph Expanded
+ * Paragraph Expanded (Heavy Chains Only)
+ * Paragraph Expanded (Light Chains Only)
+ * MIPE Dataset
+
ParaSurf/preprocess/README.md ADDED
@@ -0,0 +1,71 @@
+ # **Feature Extraction - Preprocessing phase**
+
+ This guide outlines the steps needed to generate the ParaSurf 41x41x41x22 input feature vector for training. By following these steps, you will create a dataset ready for training, organized in the specified folder structure.
+ ### Step 1: Clean the Antibody-Antigen Complex
+ Remove ions, ligands, and water molecules from the antibody-antigen complex and rearrange atom IDs within the PDB structure.
+ ```bash
+ # Clean the antibody-antigen complex
+ python clean_dataset.py
+ ```
+
+ ### Step 2: Sanity Check for Interaction
+ Verify that at least one antibody heavy atom is within 4.5 Å of any antigen heavy atom, ensuring proximity-based interactions.
+ ```bash
+ # Run sanity check
+ python check_rec_ant_touch.py
+ ```
+ ### Step 3: Generate Molecular Surface Points
+ Create the molecular surface for each receptor in the training folder using the DMS software. These surface points will serve as a basis for feature extraction.
+ ```bash
+ # Generate molecular surface points
+ python create_surfpoints.py
+ ```
+
+ ### Step 4: Generate ParaSurf Input Feature Grids (41x41x41x22)
+ Create the 3D feature grids for each surface point generated in Step 3. Each feature grid includes 22 channels with essential structural and electrostatic information.
+ ```bash
+ # Create the 41x41x41x22 input feature grids
+ python create_input_features.py
+ ```
+
+ ### Step 5: Prepare .proteins Files
+ Generate .proteins files for training, validation, and testing. These files list all receptors (antibodies) to be used in each dataset split.
+ ```bash
+ # Create train/val/test .proteins files
+ python create_proteins_file.py
+ ```
+
+ ### Step 6: Create .samples Files
+ Generate .samples files, each listing paths to the feature files created in Step 4. These files act as a link between the features and the training pipeline.
+ ```bash
+ # Generate .samples files for network training
+ python create_sample_files.py
+ ```
+
+ ## **Folder Structure After Preprocessing**
+
+ After completing the above steps, the resulting folder structure should be organized as follows:
+ ```bash
+ ├── test_data
+ │   ├── datasets
+ │   │   ├── PECAN_TRAIN.samples
+ │   │   ├── PECAN_TRAIN.proteins
+ │   │   ├── PECAN_VAL.proteins
+ │   │   ├── PECAN_TEST.proteins
+ │   │   └── ...
+ │   ├── feats
+ │   │   ├── PECAN_22
+ │   │   ├── Paragraph_Expanded_22
+ │   │   └── MIPE_22
+ │   ├── surfpoints
+ │   │   ├── PECAN
+ │   │   │   └── TRAIN
+ │   │   ├── Paragraph_Expanded
+ │   │   │   └── TRAIN
+ │   │   └── MIPE
+ │   │       └── TRAIN
+ │   └── pdbs   # already created from ParaSurf/create_datasets_from_csv
+ ```
+
+ Now we are ready for training!
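
Steps 1–6 are separate scripts, each with its input/output paths configured at the top of the file. If you prefer to run the whole preprocessing pass in one go, a simple wrapper like the following is one option (a sketch only; it assumes the scripts are already configured for your paths and that it is executed from ParaSurf/preprocess):

```python
import subprocess

# Order matters: each step consumes the output of the previous one.
steps = [
    "clean_dataset.py",          # Step 1: clean the antibody-antigen complexes
    "check_rec_ant_touch.py",    # Step 2: sanity-check receptor-antigen contacts
    "create_surfpoints.py",      # Step 3: DMS molecular surface points
    "create_input_features.py",  # Step 4: 41x41x41x22 feature grids
    "create_proteins_file.py",   # Step 5: .proteins files
    "create_sample_files.py",    # Step 6: .samples files
]

for script in steps:
    print(f"Running {script} ...")
    subprocess.run(["python", script], check=True)  # stop on the first failing step
```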
ParaSurf/preprocess/__pycache__/check_empty_features.cpython-310.pyc ADDED
Binary file (2.72 kB).
 
ParaSurf/preprocess/__pycache__/check_empty_features.cpython-39.pyc ADDED
Binary file (2.53 kB).
 
ParaSurf/preprocess/__pycache__/clean_dataset.cpython-310.pyc ADDED
Binary file (1.04 kB).
 
ParaSurf/preprocess/__pycache__/clean_dataset.cpython-39.pyc ADDED
Binary file (1.01 kB).
 
ParaSurf/preprocess/check_empty_features.py ADDED
@@ -0,0 +1,68 @@
+ import os
+
+ def remove_empty_features(feats_folder, pdbs_path, surf_path, log_file_path="removed_complexes_log.txt"):
+     """
+     Checks each subfolder in the features folder for files. If a subfolder is empty, removes it along with the
+     associated files from `pdbs_path` and `surf_path` and logs the removals.
+
+     Parameters:
+     - feats_folder (str): The main directory containing subfolders with the features to check.
+     - pdbs_path (str): Path where receptor and antigen PDB files are located.
+     - surf_path (str): Path where surface points files are located.
+     - log_file_path (str): Path for the log file to track removed folders and files. Default is 'removed_complexes_log.txt'.
+
+     Returns:
+     - total_empty_folders (int): Count of empty folders removed.
+     """
+
+     # Identify all subfolders in the base folder
+     subfolders = [d for d in os.listdir(feats_folder) if os.path.isdir(os.path.join(feats_folder, d))]
+     empty_folders = []
+
+     # Open log file to record removed folders
+     with open(log_file_path, 'w') as log_file:
+         log_file.write("Log of Removed Folders and Files\n")
+         log_file.write("=" * 30 + "\n")
+
+         # Check each subfolder and remove if empty
+         for folder in subfolders:
+             path = os.path.join(feats_folder, folder)
+             if not any(os.path.isfile(os.path.join(path, i)) for i in os.listdir(path)):
+                 empty_folders.append(folder)
+                 pdb_code = folder.split('_')[0]
+
+                 # Define paths to the files to be removed
+                 rec_file = os.path.join(pdbs_path, pdb_code + '_receptor_1.pdb')
+                 antigen_file = os.path.join(pdbs_path, pdb_code + '_antigen_1_1.pdb')
+                 surf_file = os.path.join(surf_path, pdb_code + '_receptor_1.surfpoints')
+
+                 # Remove the empty folder and associated files
+                 os.rmdir(path)
+                 if os.path.exists(rec_file):
+                     os.remove(rec_file)
+                 if os.path.exists(antigen_file):
+                     os.remove(antigen_file)
+                 if os.path.exists(surf_file):
+                     os.remove(surf_file)
+
+                 # Log each removal
+                 log_file.write(f"{pdb_code} complex removed since no features found.\n")
+
+     total_empty_folders = len(empty_folders)
+     # Delete the log file if no folders were removed
+     if total_empty_folders == 0:
+         os.remove(log_file_path)
+         print("\nAll complexes have features!!!")
+     else:
+         print(f"Total empty folders removed: {total_empty_folders}")
+         print(f"Details logged in {log_file_path}")
+
+     return total_empty_folders
+
+
+ # Example usage
+ if __name__ == '__main__':
+     user = os.getenv('USER')
+     pdbs_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/TRAIN'
+     surf_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/surf_points/eraseme/TRAIN'
+     feats_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/feats/eraseme_22'
+     remove_empty_features(feats_path, pdbs_path, surf_path)
ParaSurf/preprocess/check_rec_ant_touch.py ADDED
@@ -0,0 +1,89 @@
+ import os
+ import math
+ from tqdm import tqdm
+
+
+ def locate_receptor_binding_site_atoms(receptor_pdb_file, antigen_pdb_file, distance_cutoff=4):
+     rec_coordinates = []
+     with open(receptor_pdb_file, 'r') as file:
+         for line in file:
+             if line.startswith("ATOM"):
+                 x = float(line[30:38].strip())
+                 y = float(line[38:46].strip())
+                 z = float(line[46:54].strip())
+                 rec_coordinates.append((x, y, z))
+
+     ant_coordinates = []
+     with open(antigen_pdb_file, 'r') as file:
+         for line in file:
+             if line.startswith("ATOM"):
+                 x = float(line[30:38].strip())
+                 y = float(line[38:46].strip())
+                 z = float(line[46:54].strip())
+                 ant_coordinates.append((x, y, z))
+
+     # Create a list to store the final coordinates
+     final_coordinates = []
+
+     # Compare each coordinate from rec_coordinates with each coordinate from ant_coordinates
+     for rec_coord in rec_coordinates:
+         for ant_coord in ant_coordinates:
+             if math.dist(rec_coord, ant_coord) < distance_cutoff:
+                 final_coordinates.append(rec_coord)
+                 break  # Break the inner loop if a match is found to avoid duplicate entries
+
+     # sanity check
+     for coor in final_coordinates:
+         if coor not in rec_coordinates:
+             print('BINDING SITE COORDINATE NOT IN RECEPTOR COORDINATES!!!!!!')
+     return final_coordinates, rec_coordinates
+
+
+ def check_receptor_antigen_interactions(pdb_dir, distance_cutoff=6, log_file="interaction_issues.txt"):
+     """
+     :param pdb_dir: directory with receptor and antigen pdb files
+     :param distance_cutoff: the distance cutoff for the binding site
+     :param log_file: the file where issues will be logged
+     :return: checks whether each receptor and its antigen are in contact with each other
+     """
+     all_successful = True  # A flag to track if all pairs are correct
+
+     # Open the log file for writing
+     with open(log_file, 'w') as log:
+         log.write("Receptor-Antigen Interaction Issues Log\n")
+         log.write("=====================================\n")
+
+         non_interacting_pdbs = 0
+         for pdb_file in tqdm(os.listdir(pdb_dir)):
+             pdb_id = pdb_file.split('_')[0]
+             cur_rec_pdb = os.path.join(pdb_dir, f'{pdb_id}_receptor_1.pdb')
+             cur_ant_pdb = os.path.join(pdb_dir, f'{pdb_id}_antigen_1_1.pdb')
+
+             if os.path.exists(cur_rec_pdb) and os.path.exists(cur_ant_pdb):
+                 final, rec = locate_receptor_binding_site_atoms(cur_rec_pdb, cur_ant_pdb, distance_cutoff)
+                 if len(final) == 0:
+                     non_interacting_pdbs += 1
+                     log.write(f'\nNON-INTERACTING PAIR!!!: problem with {pdb_id}.pdb. {pdb_id}_receptor_1.pdb and '
+                               f'{pdb_id}_antigen_1_1.pdb files are removed.\n')
+                     os.remove(cur_rec_pdb)
+                     os.remove(cur_ant_pdb)
+                     all_successful = False  # Mark as unsuccessful if any issue is found
+
+         # Check if everything was successful
+         if all_successful:
+             print("Success! All receptors interact with their associated antigens.")
+             # since no issues were found we can remove the log file
+             os.remove(log_file)
+         else:
+             print(f'\n ~~~~~ Total pdbs found with issues: {non_interacting_pdbs}; they were removed from the folder ~~~~~\n')
+             log.write(f'\n\n ~~~~~ Total pdbs found with issues: {non_interacting_pdbs} ~~~~~')
+             print(f"Some receptors do not interact with their antigens. Issues logged in {log_file}.")
+
+
+ # example usage
+ if __name__ == '__main__':
+     user = os.getenv('USER')
+     pdb_dir = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/TRAIN'
+     index = pdb_dir.split('/')[-1]
+     check_receptor_antigen_interactions(pdb_dir, distance_cutoff=4.5, log_file=f'{pdb_dir}/{index}_interaction_issues.txt')
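
The nested loop in locate_receptor_binding_site_atoms compares every receptor atom against every antigen atom, which is O(N·M) per complex. Purely for illustration (this helper is not part of the repository; the SciPy k-d tree approach is a swapped-in alternative), the same contact test can be vectorized:

```python
import numpy as np
from scipy.spatial import cKDTree

def contact_atoms_kdtree(rec_coords, ant_coords, cutoff=4.5):
    """Sketch: return the receptor atoms lying within `cutoff` Å of any antigen atom.

    rec_coords / ant_coords are (N, 3) and (M, 3) coordinate arrays, parsed the same
    way as in locate_receptor_binding_site_atoms above.
    """
    rec = np.asarray(rec_coords, dtype=float)
    tree = cKDTree(np.asarray(ant_coords, dtype=float))  # index antigen atoms once
    dists, _ = tree.query(rec)                           # nearest antigen atom per receptor atom
    mask = dists < cutoff                                # contact if closer than the cutoff
    return rec[mask], mask
```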
ParaSurf/preprocess/clean_dataset.py ADDED
@@ -0,0 +1,27 @@
+ import os
+ from ParaSurf.utils.remove_hydrogens_from_pdb import remove_hydrogens_from_pdb_folder
+ from ParaSurf.utils.remove_HETATMS_from_receptors import remove_hetatm_from_pdb_folder
+ from ParaSurf.utils.reaarange_atom_id import process_pdb_files_in_folder
+
+
+ def clean_dataset(dataset_path_with_pdbs):
+     """
+     :param dataset_path_with_pdbs: folder with the raw PDB complexes
+     :return: a cleaned dataset ready to be processed for training purposes, after 3 filtering steps
+     """
+     data_path = dataset_path_with_pdbs
+
+     # step 1: remove hydrogens
+     remove_hydrogens_from_pdb_folder(input_folder=data_path,
+                                      output_folder=data_path)
+
+     # step 2: remove HETATMs, only from the receptors
+     remove_hetatm_from_pdb_folder(input_folder=data_path,
+                                   output_folder=data_path)
+
+     # step 3: re-arrange the atom_id of each pdb
+     process_pdb_files_in_folder(folder_path=data_path)
+
+
+ if __name__ == "__main__":
+     user = os.getenv('USER')
+     clean_dataset(f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/TRAIN')
ParaSurf/preprocess/create_input_features.py ADDED
@@ -0,0 +1,230 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from multiprocessing import Pool
2
+ from multiprocessing import Lock
3
+ import time, os
4
+ import numpy as np
5
+ from Bio.PDB.PDBParser import PDBParser
6
+ from ParaSurf.utils.bsite_lib import readSurfpoints, readSurfpoints_with_residues, dist_point_from_lig
7
+ from ParaSurf.utils.features import KalasantyFeaturizer, KalasantyFeaturizer_with_force_fields
8
+ from scipy import sparse
9
+ from tqdm import tqdm
10
+ import warnings
11
+ from ParaSurf.utils.distance_coords import locate_receptor_binding_site_residues
12
+ from check_empty_features import remove_empty_features
13
+
14
+ # Ignore warnings
15
+ warnings.filterwarnings('ignore')
16
+
17
+
18
+ lock = Lock() # Instantiate a Lock for thread safety.
19
+
20
+
21
+ def balanced_sampling(surf_file, protein_file, lig_files, cutoff=4.5):
22
+ """
23
+ Returns a subset of equal positive and negative samples from surface points in `surf_file`, with positive samples
24
+ selected from residues close to the antigen.
25
+
26
+ Parameters:
27
+ - surf_file: Path to the file with protein surface points.
28
+ - protein_file: Path to the protein structure file (e.g., PDB format).
29
+ - lig_files: List of ligand (antigen) structure file paths.
30
+ - cutoff: Distance cutoff in Ångstroms for defining binding residues (default is 4).
31
+
32
+ Returns:
33
+ - Balanced samples including features and labels for the selected surface points.
34
+ """
35
+ all_lig_coords = []
36
+ for lig_file in lig_files:
37
+ with lock: # Locks the parser to ensure thread safety.
38
+ lig = parser.get_structure('antigen', lig_file)
39
+ lig_coords = np.array([atom.get_coord() for atom in lig.get_atoms()])
40
+ all_lig_coords.append(lig_coords)
41
+
42
+ points, normals = readSurfpoints(surf_file) # modified by me
43
+ # create the residue groups for the whole protein
44
+ all_rec_residues = readSurfpoints_with_residues(surf_file)
45
+
46
+
47
+ # find the bind site residues
48
+ bind_site_rec_residues = locate_receptor_binding_site_residues(protein_file, lig_file, distance_cutoff=cutoff)
49
+
50
+ # gather all distances
51
+ # Create an array to store the minimum distance of each point to any ligand atom
52
+ dist_from_lig = np.full(len(points), np.inf)
53
+ near_lig = np.zeros(len(points), dtype=bool)
54
+
55
+ # Update distances only for points in bind site residues
56
+ bind_site_indices = [item for i in bind_site_rec_residues for item in all_rec_residues[i]['idx']]
57
+ bind_site_indices_set = set(bind_site_indices) # Convert to set for fast lookup
58
+
59
+ # Loop through ligand coordinates and update distances for binding site points
60
+ # IMPORTANT Step because if a residue belongs to the binding site that DOES NOT mean that all the atom of this
61
+ # residue belongs the binding site (<6 armstrong to the ligand). So here we check from the binding site residues which
62
+ # atoms actually bind (<6 armstrong to the ligand)
63
+ for lig_coords in all_lig_coords:
64
+ for i, p in enumerate(points):
65
+ if i in bind_site_indices_set:
66
+ dist = dist_point_from_lig(p, lig_coords) # Adjust this function if necessary
67
+ if dist < dist_from_lig[i]:
68
+ dist_from_lig[i] = dist
69
+ near_lig[i] = dist < cutoff
70
+
71
+ # Filter positive indices to include only those near a ligand
72
+ pos_idxs = np.array([idx for idx in bind_site_indices if near_lig[idx]])
73
+
74
+ # If there are more positive indices than allowed, select the best ones based on the distance
75
+ if len(pos_idxs) > maxPosSamples:
76
+ pos_idxs = pos_idxs[np.argsort(dist_from_lig[pos_idxs])[:maxPosSamples]]
77
+
78
+ # Select the negative samples
79
+ all_neg_samples = [idx for idx, i in enumerate(points) if idx not in pos_idxs]
80
+
81
+ # Calculate number of negative samples to match the number of positive samples
82
+ num_neg_samples = min(len(all_neg_samples), len(pos_idxs))
83
+
84
+ neg_idxs = np.array(all_neg_samples)
85
+ if len(neg_idxs) > num_neg_samples:
86
+ neg_downsampled = np.random.choice(neg_idxs, num_neg_samples, replace=False)
87
+ else:
88
+ neg_downsampled = neg_idxs
89
+
90
+ # Concatenate positive and negative indices
91
+ sample_idxs = np.concatenate((pos_idxs, neg_downsampled))
92
+
93
+ # Shuffle the indices to ensure randomness
94
+ np.random.shuffle(sample_idxs)
95
+
96
+ # create the sample labels
97
+ # Convert pos_idxs to a set for faster membership testing
98
+ pos_set = set(pos_idxs)
99
+
100
+ # Use list comprehension to create labels
101
+ sample_labels = [i in pos_set for i in sample_idxs]
102
+ if feat_type == 'kalasanty':
103
+ featurizer = KalasantyFeaturizer(protein_file, protonate, gridSize, voxelSize, use_protrusion, protr_radius)
104
+ elif feat_type == 'kalasanty_with_force_fields':
105
+ featurizer = KalasantyFeaturizer_with_force_fields(protein_file, protonate, gridSize, voxelSize, use_protrusion, protr_radius,
106
+ add_atom_radius_features=add_atoms_radius_ff_features)
107
+
108
+ feature_vector_length = featurizer.channels.shape[1]
109
+ with open(feature_vector_length_tmp_path, 'w') as file:
110
+ file.write(str(feature_vector_length))
111
+
112
+ for i, sample in enumerate(sample_idxs):
113
+ features = featurizer.grid_feats(points[sample], normals[sample], rotate_grid)
114
+ if np.count_nonzero(features) == 0:
115
+ print('Zero features', protein_file.rsplit('/', 1)[1][:-4], i, points[sample], normals[sample])
116
+
117
+ yield features, sample_labels[i], points[sample], normals[sample]
118
+
119
+
120
+ def samples_per_prot(prot):
121
+ """
122
+ Generates and saves balanced surface point samples for a given protein.
123
+
124
+ Parameters:
125
+ - prot: Protein identifier for which features are being generated.
126
+
127
+ Saves each sample as a sparse matrix or NumPy array in `feats_path`.
128
+ """
129
+ prot_path = os.path.join(feats_path, prot)
130
+
131
+ # Check if directory exists and has files, if not create it
132
+ if not os.path.exists(prot_path):
133
+ os.makedirs(prot_path)
134
+ elif os.listdir(prot_path):
135
+ return
136
+
137
+ surf_file = os.path.join(surf_path, f"{prot}.surfpoints")
138
+ protein_file = os.path.join(pdbs_path, f"{prot}.pdb")
139
+ if not os.path.exists(protein_file):
140
+ protein_file = os.path.join(pdbs_path, f"{prot}.mol2")
141
+
142
+ receptor_id = prot_path.split('_')[-1]
143
+ antigen_prefix = prot.split('_')[0]
144
+
145
+ # Using set for faster membership checks
146
+ files_set = set(os.listdir(pdbs_path))
147
+ lig_files = [os.path.join(pdbs_path, f) for f in files_set if f"{antigen_prefix}_antigen_{receptor_id}" in f]
148
+
149
+ try:
150
+ cnt = 0
151
+ for features, y, point, normal in balanced_sampling(surf_file, protein_file, lig_files, cutoff=cutoff):
152
+ samples_file_name = os.path.join(prot_path, f"sample{cnt}_{int(y)}")
153
+
154
+ if feat_type == 'deepsite':
155
+ with open(f"{samples_file_name}.npy", 'w') as f:
156
+ np.save(f, (point, normal, features.astype(np.float16)))
157
+ elif feat_type == 'kalasanty' or feat_type == 'kalasanty_with_force_fields':
158
+ sparse_mat = sparse.coo_matrix(features.flatten())
159
+ sparse.save_npz(f"{samples_file_name}.npz", sparse_mat)
160
+
161
+ cnt += 1
162
+
163
+ print(f'Saved "{cnt}" samples for "{prot}".')
164
+
165
+ except Exception as e:
166
+ print(f'Exception occurred while processing "{prot}". Error message: "{e}".')
167
+
168
+
169
+ seed = 10 # random seed
170
+ num_cores = 6 # Set this to the number of cores you wish to use
171
+ maxPosSamples = 800 # maximum number of positive samples per protein
172
+ gridSize = 41 # size of grid (16x16x16)
173
+ voxelSize = 1 # size of voxel, e.g. 1 angstrom, if 2A we lose details, so leave it to 1
174
+ cutoff = 4.5 # cutoff threshold in Armstrong's 6 for general PPIs, 4.5 for antibody antigen databases
175
+ feature_vector_length_tmp_path = '/tmp/feature_vector_length.txt'
176
+ # feat_type = 'kalasanty' # select featurizer
177
+ feat_type = 'kalasanty_with_force_fields'
178
+ add_atoms_radius_ff_features = True # If you want to add the atom radius features that correspond to the force fields
179
+ rotate_grid = True # whether to rotate the grid (ignore)
180
+ use_protrusion = False # ignore
181
+ protr_radius = 10 # ignore
182
+ protonate = True # if protein pdbs are not protonated (do not have Hydrogens) set it to True
183
+
184
+ user = os.getenv('USER')
185
+ pdbs_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/TRAIN' # input folder with protein pdbs for training
186
+ surf_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/surfpoints/eraseme/TRAIN' # input folder with surface points for training
187
+ feats_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/feats/eraseme' # training features folder
188
+
189
+
190
+ if not os.path.exists(feats_path):
191
+ os.makedirs(feats_path)
192
+
193
+ np.random.seed(seed)
194
+
195
+ all_proteins = [f.rsplit('.', 1)[0] for f in os.listdir(surf_path)]
196
+ #
197
+ # in case the procedure stacks use the 3 lines below
198
+ completed = [f.rsplit('.', 1)[0] for f in os.listdir(feats_path)]
199
+ all_proteins = [i for i in all_proteins if i not in completed]
200
+ print(len(all_proteins))
201
+
202
+
203
+ parser = PDBParser(PERMISSIVE=1) # PERMISSIVE=1 allows more flexibility in handling non-standard or problematic entries in PDB files during parsing.
204
+
205
+ start = time.time()
206
+ with Pool(num_cores) as pool: # Use a specified number of CPU cores
207
+ list(tqdm(pool.imap(samples_per_prot, all_proteins), total=len(all_proteins)))
208
+ print(f'Total preprocess time: {(time.time() - start)/60} mins')
209
+
210
+ ###################################################################################
211
+ # Instead of using Pool and imap, iterate through all_proteins with a for loop for easy debugging
212
+ # for prot in all_proteins:
213
+ # try:
214
+ # samples_per_prot(prot)
215
+ # except Exception as e:
216
+ # print(f'Error processing protein {prot}: {e}')
217
+ # break
218
+
219
+
220
+ # the number appended to the features folder name is the length of the feature vector
221
+ if os.path.exists(feature_vector_length_tmp_path):
222
+ with open(feature_vector_length_tmp_path, 'r') as file:
223
+ feature_vector_length = int(file.read().strip())
224
+ feats_path_new = f'{feats_path}_{feature_vector_length}'
225
+ os.rename(feats_path, feats_path_new)
226
+ os.remove(feature_vector_length_tmp_path)
227
+
228
+
229
+ # remove empty features if found
230
+ remove_empty_features(feats_folder=feats_path_new, pdbs_path=pdbs_path, surf_path=surf_path)
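For reference, a minimal sketch of loading a saved sample back into its dense grid, assuming gridSize=41, a channels-last layout, and that the channel count matches the number appended to the features folder name (e.g. '_22'); the path below is hypothetical:

    import numpy as np
    from scipy import sparse

    grid_size, n_channels = 41, 22                                       # assumed values
    sample_file = 'feats/eraseme_22/example_receptor_1/sample0_1.npz'    # hypothetical sample path
    flat = sparse.load_npz(sample_file).toarray()                        # features were saved flattened
    grid = flat.reshape(grid_size, grid_size, grid_size, n_channels)     # assumed channels-last grid
    label = int(sample_file.rsplit('_', 1)[-1].split('.')[0])            # class label encoded in the file name
    print(grid.shape, label)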
ParaSurf/preprocess/create_proteins_file.py ADDED
@@ -0,0 +1,23 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+
3
+
4
+ # Create a .proteins file listing all the proteins (i.e. receptors) we are working with.
5
+ cases = ['TRAIN', 'VAL', 'TEST'] # change to ['TRAIN_VAL', 'TEST'] for MIPE
6
+ user = os.getenv('USER')
7
+ for case in cases:
8
+ pdbs_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/{case}'
9
+ proteins_file = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/datasets/eraseme_{case}.proteins' # run for train val and test
10
+
11
+ # Create directories if they don't exist
12
+ os.makedirs(pdbs_path, exist_ok=True)
13
+ os.makedirs(os.path.dirname(proteins_file), exist_ok=True)
14
+
15
+ receptors = []
16
+
17
+ for prot in os.listdir(pdbs_path):
18
+ prot_name = prot.split('.')[0]
19
+ if 'rec' in prot:
20
+ receptors.append(prot_name + '\n')
21
+
22
+ with open(proteins_file,'w') as f:
23
+ f.writelines(receptors)
ParaSurf/preprocess/create_sample_files.py ADDED
@@ -0,0 +1,31 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, random
2
+
3
+
4
+ user = os.getenv('USER')
5
+
6
+ feats_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/feats/eraseme_22' # input folder with protein grids (training features)
7
+ proteins_file = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/datasets/eraseme_TRAIN.proteins' # input file with a list of train proteins
8
+ samples_file = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/datasets/eraseme_TRAIN.samples' # output file with respective training samples info (class_label + sample_path)
9
+ seed = 1
10
+
11
+ with open(proteins_file, 'r') as f:
12
+ proteins = f.readlines()
13
+
14
+ sample_lines = []
15
+ feats_prefix = feats_path.rsplit('/')[-1]
16
+
17
+ for prot in proteins:
18
+ prot = prot[:-1]
19
+ prot_feats_path = os.path.join(feats_path, prot)
20
+ if not os.path.isdir(prot_feats_path):
21
+ print('No features for ', prot)
22
+ continue
23
+ for sample in os.listdir(prot_feats_path):
24
+ cls_idx = sample[-5]
25
+ sample_lines.append(cls_idx + ' ' + feats_prefix + '/' + prot + '/' + sample + '\n')
26
+
27
+ random.seed(seed)
28
+ random.shuffle(sample_lines)
29
+
30
+ with open(samples_file, 'w') as f:
31
+ f.writelines(sample_lines)
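Each line of the resulting .samples file pairs a class label with a sample path relative to the features root. An illustrative (hypothetical) excerpt:

    1 eraseme_22/1abc_receptor_1/sample17_1.npz
    0 eraseme_22/1abc_receptor_1/sample42_0.npz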
ParaSurf/preprocess/create_surfpoints.py ADDED
@@ -0,0 +1,57 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import time
2
+ import os
3
+ from tqdm import tqdm
4
+ from ParaSurf.utils.fix_surfpoints_format_issues import process_surfpoints_directory
5
+
6
+ def generate_molecular_surface(input_path, out_path):
7
+ """
8
+ Generates the molecular surface for protein structures in PDB files using the DMS tool.
9
+
10
+ Parameters:
11
+ - input_path (str): Path to the input directory containing protein PDB files.
12
+ - out_path (str): Path to the output directory where generated surface points files will be saved.
13
+
14
+ Process:
15
+ - The function iterates over receptor PDB files in the input path.
16
+ - For each receptor file, it checks if a corresponding surface points file already exists in the output directory.
17
+ - If the surface points file does not exist, it generates the file using the DMS tool with a density of 0.5 Å.
18
+
19
+ Outputs:
20
+ - Each receptor file generates a surface points file saved in `out_path`.
21
+ """
22
+
23
+ if not os.path.exists(out_path):
24
+ os.makedirs(out_path)
25
+
26
+ start = time.time()
27
+ for f in tqdm(os.listdir(input_path), desc="Generating surface points"):
28
+ if 'antigen' in f:
29
+ continue
30
+
31
+ surfpoints_file = os.path.join(out_path, f[:-3] + 'surfpoints')
32
+ if os.path.exists(surfpoints_file):
33
+ continue
34
+
35
+ print(f"Processing {f}")
36
+ os.system(f'dms {os.path.join(input_path, f)} -d 0.5 -n -o {surfpoints_file}')
37
+
38
+ # Calculate and print statistics
39
+ rec_count = sum(1 for receptor in os.listdir(input_path) if 'receptor' in receptor)
40
+ total_time = (time.time() - start) / 60 # Convert time to minutes
41
+ print(f'Total time to create surfpoints for {rec_count} receptors: {total_time:.2f} mins')
42
+
43
+
44
+ # Example usage
45
+ if __name__ == '__main__':
46
+ user = os.getenv('USER')
47
+ pdbs_path = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/pdbs/eraseme/TRAIN' # input folder with protein pdbs for training
48
+ surfpoints_path =f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data/surfpoints/eraseme/TRAIN'
49
+
50
+ # create the molecular surface
51
+ generate_molecular_surface(
52
+ input_path= pdbs_path,
53
+ out_path= surfpoints_path
54
+ )
55
+
56
+ # fix some format issues with the .surfpoints files
57
+ process_surfpoints_directory(surfpoints_path)
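Because the surface generation shells out to the external DMS tool via os.system, a missing binary fails silently; a minimal sketch of a preflight check (not part of the original script):

    import shutil

    # os.system does not raise if the command is missing, so check for DMS explicitly.
    if shutil.which('dms') is None:
        raise RuntimeError('The "dms" executable was not found on PATH; install DMS before creating surface points.')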
ParaSurf/train/V_domain_results.py ADDED
@@ -0,0 +1,159 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ from utils import calculate_metrics
4
+ import os, re
5
+
6
+
7
+ def calculate_Fv_and_cdr_regions(residues_best, gt_true_label_residues, rec_name, output_path, epoch, test_csv=None,
8
+ thres=0.5):
9
+ # Load the CSV file to identify heavy and light chains, if provided
10
+ heavy_chain_name, light_chain_name = None, None
11
+ calculate_individual_cdrs = False
12
+
13
+ if test_csv:
14
+ test_csv = pd.read_csv(test_csv)
15
+ rec_info = test_csv[test_csv['pdb_code'] == rec_name]
16
+ if not rec_info.empty:
17
+ heavy_chain_name = rec_info['Heavy_chain'].iloc[0]
18
+ light_chain_name = rec_info['Light_chain'].iloc[0]
19
+
20
+ calculate_individual_cdrs = True
21
+ else:
22
+ print(f"Receptor {rec_name} not found in the test CSV.")
23
+
24
+ # Define the CDR+-2 ranges for heavy and light chains
25
+ cdr1 = list(range(25, 41)) # CDR-H/L1: 25-40
26
+ cdr2 = list(range(54, 68)) # CDR-H/L2: 54-67
27
+ cdr3 = list(range(103, 120)) # CDR-H/L3: 103-119
28
+ framework_ranges = list(range(1, 25)) + list(range(41, 54)) + list(range(68, 103)) + list(range(120, 129))
29
+
30
+ # Initialize dictionaries
31
+ CDRH1, CDRH2, CDRH3 = {}, {}, {}
32
+ CDRL1, CDRL2, CDRL3 = {}, {}, {}
33
+ FRAMEWORK = {}
34
+
35
+ # Loop over the predictions to populate the dictionaries
36
+ for residue, data in residues_best.items():
37
+ # Split the residue into components (e.g., '30_L_C' -> ['30', 'L', 'C'])
38
+ residue_parts = residue.split('_')
39
+ residue_num = int(re.findall(r'\d+', residue_parts[0])[0])
40
+ chain_name = residue_parts[1]
41
+
42
+ # Assign residue to the corresponding CDR or FRAMEWORK based on chain and residue number
43
+ if (not heavy_chain_name or chain_name == heavy_chain_name): # If no csv or matching heavy chain
44
+ if residue_num in cdr1:
45
+ CDRH1[residue] = data
46
+ elif residue_num in cdr2:
47
+ CDRH2[residue] = data
48
+ elif residue_num in cdr3:
49
+ CDRH3[residue] = data
50
+ elif residue_num in framework_ranges:
51
+ FRAMEWORK[residue] = data
52
+
53
+ if (not light_chain_name or chain_name == light_chain_name): # If no csv or matching light chain
54
+ if residue_num in cdr1:
55
+ CDRL1[residue] = data
56
+ elif residue_num in cdr2:
57
+ CDRL2[residue] = data
58
+ elif residue_num in cdr3:
59
+ CDRL3[residue] = data
60
+ elif residue_num in framework_ranges:
61
+ FRAMEWORK[residue] = data
62
+
63
+ # Helper function to calculate and save metrics for each CDR and FRAMEWORK
64
+ def calculate_and_save_metrics(cdr_dict, cdr_name, threshold=thres):
65
+ if len(cdr_dict) > 0: # To check if CDR exists in the antibody
66
+ pred_scores = np.array([[i[1]['scores']] for i in cdr_dict.items()])
67
+ pred_labels = (pred_scores > threshold).astype(int)
68
+ gt_labels = np.array([1 if residue in gt_true_label_residues else 0 for residue in cdr_dict.keys()])
69
+
70
+ if len(np.unique(gt_labels)) > 1: # Ensure both classes are present
71
+ output_results_path = os.path.join(output_path, f'{cdr_name}_results_epoch_{epoch}_{threshold}.txt')
72
+ auc_roc, accuracy, precision, recall, f1, auc_pr, conf_matrix, mcc, _, _, _ = \
73
+ calculate_metrics(gt_labels, pred_labels, pred_scores, to_save_metrics_path=output_results_path)
74
+ return auc_roc, accuracy, precision, recall, f1, auc_pr, conf_matrix, mcc
75
+ return None
76
+
77
+ # Calculate and save metrics for each CDR and FRAMEWORK only if .csv is provided
78
+ if calculate_individual_cdrs:
79
+ calculate_and_save_metrics(CDRH1, 'CDRH1')
80
+ calculate_and_save_metrics(CDRH2, 'CDRH2')
81
+ calculate_and_save_metrics(CDRH3, 'CDRH3')
82
+ calculate_and_save_metrics(CDRL1, 'CDRL1')
83
+ calculate_and_save_metrics(CDRL2, 'CDRL2')
84
+ calculate_and_save_metrics(CDRL3, 'CDRL3')
85
+ calculate_and_save_metrics(FRAMEWORK, 'FRAMEWORK')
86
+
87
+ # Calculate the metrics for the CDR+-2 region (CDRH1 + CDRH2 + CDRH3 + CDRL1 + CDRL2 + CDRL3)
88
+ cdr_plus_minus_2 = {**CDRH1, **CDRH2, **CDRH3, **CDRL1, **CDRL2, **CDRL3}
89
+ calculate_and_save_metrics(cdr_plus_minus_2, 'CDR_plus_minus_2')
90
+
91
+ # Calculate the metrics for the Fv region (CDRs + FRAMEWORK)
92
+ fv_region = {**CDRH1, **CDRH2, **CDRH3, **CDRL1, **CDRL2, **CDRL3, **FRAMEWORK}
93
+ calculate_and_save_metrics(fv_region, 'Fv')
94
+
95
+
96
+ def calculate_Fv_and_cdr_regions_only_one_chain(residues_best, gt_true_label_residues, rec_name, output_path, epoch, thres=0.5):
97
+ """
98
+ This function calculates metrics for the Fv and CDR+-2 regions, but only for a PDB file with one chain.
99
+ The CSV file is not needed in this case, as there is only one chain.
100
+
101
+ Args:
102
+ - residues_best: Dictionary containing residue information.
103
+ - gt_true_label_residues: List of ground truth binding residues.
104
+ - rec_name: Name of the receptor.
105
+ - output_path: Directory to save output results.
106
+ - epoch: Current epoch for model validation.
107
+ - thres: Threshold for classification (default is 0.5).
108
+
109
+ Returns:
110
+ - Metrics calculated and saved for CDR+-2 and Fv regions.
111
+ """
112
+
113
+ # Define the CDR+-2 and framework ranges for the single chain
114
+ cdr1 = list(range(25, 41)) # CDR1: 25-40
115
+ cdr2 = list(range(54, 68)) # CDR2: 54-67
116
+ cdr3 = list(range(103, 120)) # CDR3: 103-119
117
+ framework_ranges = list(range(1, 25)) + list(range(41, 54)) + list(range(68, 103)) + list(range(120, 129))
118
+
119
+ # Initialize dictionaries
120
+ CDR1, CDR2, CDR3 = {}, {}, {}
121
+ FRAMEWORK = {}
122
+
123
+ # Loop over the predictions to populate the dictionaries
124
+ for residue, data in residues_best.items():
125
+ # Split the residue into components (e.g., '30_L_C' -> ['30', 'L', 'C'])
126
+ residue_parts = residue.split('_')
127
+ residue_num = int(re.findall(r'\d+', residue_parts[0])[0])
128
+
129
+ # Assign residue to the corresponding CDR or FRAMEWORK based on residue number
130
+ if residue_num in cdr1:
131
+ CDR1[residue] = data
132
+ elif residue_num in cdr2:
133
+ CDR2[residue] = data
134
+ elif residue_num in cdr3:
135
+ CDR3[residue] = data
136
+ elif residue_num in framework_ranges:
137
+ FRAMEWORK[residue] = data
138
+
139
+ # Helper function to calculate and save metrics for each CDR and FRAMEWORK
140
+ def calculate_and_save_metrics(cdr_dict, cdr_name, threshold=thres):
141
+ if len(cdr_dict) > 0: # Check if CDR exists in the antibody
142
+ pred_scores = np.array([[i[1]['scores']] for i in cdr_dict.items()])
143
+ pred_labels = (pred_scores > threshold).astype(int)
144
+ gt_labels = np.array([1 if residue in gt_true_label_residues else 0 for residue in cdr_dict.keys()])
145
+
146
+ if len(np.unique(gt_labels)) > 1: # Ensure both classes are present
147
+ output_results_path = os.path.join(output_path, f'{cdr_name}_results_epoch_{epoch}_{threshold}.txt')
148
+ auc_roc, accuracy, precision, recall, f1, auc_pr, conf_matrix, mcc, _, _, _ = \
149
+ calculate_metrics(gt_labels, pred_labels, pred_scores, to_save_metrics_path=output_results_path)
150
+ return auc_roc, accuracy, precision, recall, f1, auc_pr, conf_matrix, mcc
151
+ return None
152
+
153
+ # Calculate the metrics for the CDR+-2 region (CDR1 + CDR2 + CDR3)
154
+ cdr_plus_minus_2 = {**CDR1, **CDR2, **CDR3}
155
+ calculate_and_save_metrics(cdr_plus_minus_2, 'CDR_plus_minus_2')
156
+
157
+ # Calculate the metrics for the Fv region (CDR1 + CDR2 + CDR3 + FRAMEWORK)
158
+ fv_region = {**CDR1, **CDR2, **CDR3, **FRAMEWORK}
159
+ calculate_and_save_metrics(fv_region, 'Fv')
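A minimal usage sketch, assuming it runs alongside V_domain_results.py so the function above is in scope; residue keys follow the '<residue_number>_<chain>' convention produced by receptor_info() in ParaSurf/train/utils.py, and every name and value below is illustrative:

    import os

    residues_best = {'105_H': {'scores': 0.91},   # residue 105 falls in the CDR3 range (103-119)
                     '31_L': {'scores': 0.12}}    # residue 31 falls in the CDR1 range (25-40)
    gt_true_label_residues = ['105_H']            # ground-truth binding residues
    output_path = '/tmp/parasurf_cdr_results'     # hypothetical output folder
    os.makedirs(output_path, exist_ok=True)

    # Without a test CSV only the pooled CDR+-2 and Fv metrics are written.
    calculate_Fv_and_cdr_regions(residues_best, gt_true_label_residues,
                                 rec_name='1abc', output_path=output_path,
                                 epoch=1, test_csv=None, thres=0.5)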
ParaSurf/train/__pycache__/V_domain_results.cpython-310.pyc ADDED
Binary file (4.54 kB). View file
 
ParaSurf/train/__pycache__/V_domain_results.cpython-39.pyc ADDED
Binary file (4.81 kB). View file
 
ParaSurf/train/__pycache__/bsite_extraction.cpython-310.pyc ADDED
Binary file (1.82 kB). View file
 
ParaSurf/train/__pycache__/bsite_extraction.cpython-39.pyc ADDED
Binary file (1.79 kB). View file
 
ParaSurf/train/__pycache__/distance_coords.cpython-310.pyc ADDED
Binary file (4.55 kB). View file
 
ParaSurf/train/__pycache__/distance_coords.cpython-39.pyc ADDED
Binary file (4.6 kB). View file
 
ParaSurf/train/__pycache__/features.cpython-310.pyc ADDED
Binary file (1.91 kB). View file
 
ParaSurf/train/__pycache__/features.cpython-39.pyc ADDED
Binary file (1.9 kB). View file
 
ParaSurf/train/__pycache__/network.cpython-310.pyc ADDED
Binary file (1.93 kB). View file
 
ParaSurf/train/__pycache__/network.cpython-39.pyc ADDED
Binary file (1.94 kB). View file
 
ParaSurf/train/__pycache__/protein.cpython-310.pyc ADDED
Binary file (4.47 kB). View file
 
ParaSurf/train/__pycache__/protein.cpython-39.pyc ADDED
Binary file (4.5 kB). View file
 
ParaSurf/train/__pycache__/utils.cpython-310.pyc ADDED
Binary file (14 kB). View file
 
ParaSurf/train/__pycache__/utils.cpython-39.pyc ADDED
Binary file (14 kB). View file
 
ParaSurf/train/__pycache__/validation.cpython-310.pyc ADDED
Binary file (7.12 kB). View file
 
ParaSurf/train/__pycache__/validation.cpython-39.pyc ADDED
Binary file (7.19 kB). View file
 
ParaSurf/train/bsite_extraction.py ADDED
@@ -0,0 +1,48 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ from sklearn.cluster import MeanShift
3
+
4
+
5
+ class Bsite_extractor():
6
+ def __init__(self, lig_thres=0.9, bw=15):
7
+ self.T = lig_thres
8
+ self.ms = MeanShift(bandwidth=bw,bin_seeding=True,cluster_all=False,n_jobs=4)
9
+
10
+ def _cluster_points(self,prot,lig_scores):
11
+ T_new = self.T
12
+ while sum(lig_scores>=T_new) < 10 and T_new>0.3001: # at least 10 points with prob>P and P>=0.3
13
+ T_new -= 0.1
14
+
15
+ # filtered_points = prot.surf_points[lig_scores>T_new]
16
+ filtered_points = prot.surf_points[lig_scores.flatten() > T_new]
17
+ filtered_scores = lig_scores[lig_scores>T_new]
18
+ if len(filtered_points)<5:
19
+ return ()
20
+
21
+ clustering = self.ms.fit(filtered_points)
22
+ labels = clustering.labels_
23
+
24
+ unique_l,freq = np.unique(labels,return_counts=True)
25
+
26
+ if len(unique_l[freq>=5])!=0:
27
+ unique_l = unique_l[freq>=5] # keep clusters with 5 points and more
28
+ else:
29
+ return ()
30
+
31
+ if unique_l[0]==-1: # discard the "unclustered" cluster
32
+ unique_l = unique_l[1:]
33
+
34
+ clusters = [(filtered_points[labels==l],filtered_scores[labels==l]) for l in unique_l]
35
+
36
+ return clusters
37
+
38
+ def extract_bsites(self,prot,lig_scores):
39
+ clusters = self._cluster_points(prot,lig_scores)
40
+ if len(clusters)==0:
41
+ print('No binding site found!!!')
42
+ return
43
+ for cluster in clusters:
44
+ prot.add_bsite(cluster)
45
+ prot.sort_bsites()
46
+ prot.write_bsites()
47
+
48
+
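A short usage sketch: lig_scores would normally come from Network.get_lig_scores; the random scores and file paths below are only illustrative of the expected shapes.

    import numpy as np
    from ParaSurf.train.bsite_extraction import Bsite_extractor
    from ParaSurf.train.protein import Protein_pred

    prot = Protein_pred('/path/to/receptor.pdb', save_path='/tmp/parasurf_pockets')  # hypothetical paths
    lig_scores = np.random.rand(len(prot.surf_points), 1)   # placeholder: one score per surface point

    extractor = Bsite_extractor(lig_thres=0.9, bw=15)
    extractor.extract_bsites(prot, lig_scores)   # writes centers.txt and pocket*.pdb under prot.save_path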
ParaSurf/train/distance_coords.py ADDED
@@ -0,0 +1,173 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import math
2
+ import re
3
+ import numpy as np
4
+
5
+ def locate_surface_binding_site_atoms(receptor_surf_file, antigen_pdb_file, distance_cutoff=4):
6
+ rec_coordinates = []
7
+ with open(receptor_surf_file, 'r') as file:
8
+ for line in file:
9
+ parts = line.split()
10
+
11
+ # Check for the presence of a numeric value in the 3rd element of parts
12
+ match = re.search(r'([-+]?\d*\.\d+|\d+)(?=\.)', parts[2])
13
+ if match:
14
+ numeric_value = match.group(0)
15
+ non_numeric_value = parts[2].replace(numeric_value, "")
16
+
17
+ # Update the 'parts' list
18
+ parts[2:3] = [non_numeric_value, numeric_value]
19
+
20
+ if len(parts) >= 7: # Since we added an extra element to parts, its length increased by 1
21
+ x = float(parts[3])
22
+ y = float(parts[4])
23
+ z = float(parts[5])
24
+ rec_coordinates.append((x, y, z))
25
+
26
+ ant_coordinates = []
27
+ with open(antigen_pdb_file, 'r') as file:
28
+ for line in file:
29
+ if line.startswith("ATOM"):
30
+ x = float(line[30:38].strip())
31
+ y = float(line[38:46].strip())
32
+ z = float(line[46:54].strip())
33
+ ant_coordinates.append((x, y, z))
34
+
35
+ # Create a list to store the final coordinates
36
+ final_coordinates = []
37
+
38
+ # Compare each coordinate from rec_coordinates with each coordinate from ant_coordinates
39
+ for rec_coord in rec_coordinates:
40
+ for ant_coord in ant_coordinates:
41
+ if math.dist(rec_coord, ant_coord) < distance_cutoff:
42
+ final_coordinates.append(rec_coord)
43
+ break # Break the inner loop if a match is found to avoid duplicate entries
44
+
45
+ # sanity check
46
+ for coor in final_coordinates:
47
+ if coor not in rec_coordinates:
48
+ print('BINDING SITE COORDINATE NOT IN RECEPTORs COORDINATES!!!!!!')
49
+
50
+ return final_coordinates, rec_coordinates
51
+
52
+
53
+ def locate_receptor_binding_site_atoms(receptor_pdb_file, antigen_pdb_file, distance_cutoff=4):
54
+ rec_coordinates = []
55
+ with open(receptor_pdb_file, 'r') as file:
56
+ for line in file:
57
+ if line.startswith("ATOM"):
58
+ x = float(line[30:38].strip())
59
+ y = float(line[38:46].strip())
60
+ z = float(line[46:54].strip())
61
+ rec_coordinates.append((x, y, z))
62
+
63
+ ant_coordinates = []
64
+ with open(antigen_pdb_file, 'r') as file:
65
+ for line in file:
66
+ if line.startswith("ATOM"):
67
+ x = float(line[30:38].strip())
68
+ y = float(line[38:46].strip())
69
+ z = float(line[46:54].strip())
70
+ ant_coordinates.append((x, y, z))
71
+
72
+ # Create a list to store the final coordinates
73
+ final_coordinates = []
74
+
75
+ # Compare each coordinate from rec_coordinates with each coordinate from ant_coordinates
76
+ for rec_coord in rec_coordinates:
77
+ for ant_coord in ant_coordinates:
78
+ if math.dist(rec_coord, ant_coord) < distance_cutoff:
79
+ final_coordinates.append(rec_coord)
80
+ break # Break the inner loop if a match is found to avoid duplicate entries
81
+
82
+ # sanity check
83
+ for coor in final_coordinates:
84
+ if coor not in rec_coordinates:
85
+ print('BINDING SITE COORDINATE NOT IN RECEPTORs COORDINATES!!!!!!')
86
+ return final_coordinates, rec_coordinates
87
+
88
+
89
+ def coords2pdb(coordinates, tosavepath):
90
+ with open(tosavepath, 'w') as pdb_file:
91
+ atom_number = 1
92
+ for coord in coordinates:
93
+ x, y, z = coord
94
+ pdb_file.write(f"ATOM {atom_number:5} DUM DUM A{atom_number:4} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00\n")
95
+
96
+ atom_number += 1
97
+ if atom_number == 9999:
98
+ atom_number = 1
99
+ pdb_file.write("END")
100
+
101
+
102
+ def locate_receptor_binding_site_atoms_residue_level(receptor_file, antigen_pdb_file, distance_cutoff=4):
103
+ rec_atoms = []
104
+ chain_elements = []
105
+ with open(receptor_file, 'r') as file:
106
+ for line in file:
107
+ if line.startswith("ATOM"):
108
+ atom_id = line[6:11].strip()
109
+ atom_type = line[12:16].strip()
110
+ res_id = line[22:26].strip()
111
+ # check if there is Code for insertions of residues
112
+ insertion_code = line[26].strip()
113
+ if insertion_code:
114
+ res_id = res_id + insertion_code
115
+ res_name = line[17:20].strip()
116
+ chain_id = line[21].strip()
117
+ x = float(line[30:38].strip())
118
+ y = float(line[38:46].strip())
119
+ z = float(line[46:54].strip())
120
+ rec_atoms.append((atom_id, atom_type, res_id, res_name, chain_id, x, y, z))
121
+ chain_elements.append((atom_id, atom_type, res_id, chain_id))
122
+
123
+ ant_atoms = []
124
+ with open(antigen_pdb_file, 'r') as file:
125
+ for line in file:
126
+ if line.startswith("ATOM"):
127
+ atom_id = line[6:11].strip()
128
+ atom_type = line[12:16].strip()
129
+ res_id = line[22:26].strip()
130
+ res_name = line[17:20].strip()
131
+ chain_id = line[21].strip()
132
+ x = float(line[30:38].strip())
133
+ y = float(line[38:46].strip())
134
+ z = float(line[46:54].strip())
135
+ ant_atoms.append((atom_id, atom_type, res_id, res_name, chain_id, x, y, z))
136
+
137
+ final_atoms = []
138
+
139
+ for rec_atom in rec_atoms:
140
+ for ant_atom in ant_atoms:
141
+ if math.dist(rec_atom[5:], ant_atom[5:]) < distance_cutoff:
142
+ final_atoms.append(rec_atom)
143
+ break
144
+
145
+ rec_atoms = np.array([atom[5:] for atom in rec_atoms])
146
+ final_atoms_ = np.array([atom[5:] for atom in final_atoms])
147
+ final_elements = np.array([atom[:5] for atom in final_atoms])
148
+
149
+ return final_atoms_, rec_atoms, final_elements
150
+
151
+
152
+ def coords2pdb_residue_level(coordinates, tosavepath, elements):
153
+ with open(tosavepath, 'w') as pdb_file:
154
+ for i, atom in enumerate(coordinates):
155
+ atom_id, atom_type, res_id, res_name, chain_id = elements[i]
156
+
157
+ # Separate the numeric part from the insertion code (if any)
158
+ if res_id[-1].isalpha(): # Check if the last character is an insertion code
159
+ res_num = res_id[:-1] # Numeric part of the residue
160
+ insertion_code = res_id[-1] # Insertion code (e.g., 'A' in '30A')
161
+ else:
162
+ res_num = res_id
163
+ insertion_code = " " # No insertion code
164
+
165
+ x, y, z = atom
166
+
167
+ # Write to the PDB file with the correct formatting
168
+ pdb_file.write(
169
+ f"ATOM {int(atom_id):5} {atom_type:<4} {res_name} {chain_id}{int(res_num):4}{insertion_code:1} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00\n")
170
+
171
+ pdb_file.write("END\n")
172
+
173
+
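A minimal sketch (hypothetical file paths): extract the receptor atoms lying within 4.5 A of any antigen atom, as done for the antibody-antigen ground truth, and dump them to a dummy PDB for inspection.

    from ParaSurf.train.distance_coords import (locate_receptor_binding_site_atoms_residue_level,
                                                coords2pdb_residue_level)

    gt_atoms, rec_atoms, gt_elements = locate_receptor_binding_site_atoms_residue_level(
        '/path/to/receptor.pdb', '/path/to/antigen.pdb', distance_cutoff=4.5)
    coords2pdb_residue_level(gt_atoms, '/tmp/gt_binding_site.pdb', gt_elements)
    print(f'{len(gt_atoms)} of {len(rec_atoms)} receptor atoms lie within the cutoff')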
ParaSurf/train/features.py ADDED
@@ -0,0 +1,37 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from ParaSurf.utils import bio_data_featurizer
2
+ from ParaSurf.train.utils import rotation
3
+ import numpy as np
4
+
5
+
6
+ class KalasantyFeaturizer:
7
+ def __init__(self, gridSize, voxelSize):
8
+ grid_limit = (gridSize / 2 - 0.5) * voxelSize
9
+ grid_radius = grid_limit * np.sqrt(3)
10
+ self.neigh_radius = 4 + grid_radius # 4 > 2*R_vdw
11
+ # self.neigh_radius = 2*grid_radius # 4 > 2*R_vdw
12
+ self.featurizer = bio_data_featurizer.Featurizer(save_molecule_codes=False)
13
+ self.grid_resolution = voxelSize
14
+ self.max_dist = (gridSize - 1) * voxelSize / 2
15
+
16
+ def get_channels(self, mol, add_forcefields, add_atom_radius_features=False):
17
+ if not add_forcefields:
18
+ self.coords, self.channels = self.featurizer.get_features(mol) # returns only heavy atoms
19
+ else:
20
+ self.coords, self.channels = self.featurizer.get_features_with_force_fields(mol, add_atom_radius=add_atom_radius_features) # returns only heavy atoms
21
+
22
+
23
+ def get_channels_with_forcefields(self, mol):
24
+ self.coords, self.channels = self.featurizer.get_features_with_force_fields(mol, add_atom_radius=True) # returns only heavy atoms
25
+
26
+ def grid_feats(self, point, normal, mol_coords):
27
+ neigh_atoms = np.sqrt(np.sum((mol_coords - point) ** 2, axis=1)) < self.neigh_radius
28
+ Q = rotation(normal)
29
+ Q_inv = np.linalg.inv(Q)
30
+ transf_coords = np.transpose(mol_coords[neigh_atoms] - point)
31
+ rotated_mol_coords = np.matmul(Q_inv, transf_coords)
32
+ features = \
33
+ bio_data_featurizer.make_grid(np.transpose(rotated_mol_coords), self.channels[neigh_atoms], self.grid_resolution,
34
+ self.max_dist)[0]
35
+
36
+ return features
37
+
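For the default ParaSurf settings (gridSize=41, voxelSize=1 A) the geometry above works out as follows; this is only a worked example of the constructor arithmetic.

    import numpy as np

    gridSize, voxelSize = 41, 1
    grid_limit = (gridSize / 2 - 0.5) * voxelSize   # 20.0 A: half-width of the cube
    grid_radius = grid_limit * np.sqrt(3)           # ~34.64 A: half-diagonal of the cube
    neigh_radius = 4 + grid_radius                  # ~38.64 A: atoms inside this sphere get voxelized
    max_dist = (gridSize - 1) * voxelSize / 2       # 20.0 A: passed to make_grid
    print(grid_limit, round(grid_radius, 2), round(neigh_radius, 2), max_dist)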
ParaSurf/train/network.py ADDED
@@ -0,0 +1,58 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import os
3
+ import torch
4
+ from ParaSurf.train.features import KalasantyFeaturizer
5
+ from ParaSurf.model import ParaSurf_model
6
+
7
+
8
+
9
+ class Network:
10
+ def __init__(self, model_path, gridSize, feature_channels, voxelSize=1, device="cuda"):
11
+ self.gridSize = gridSize  # grid edge length in voxels (41 is used throughout ParaSurf)
12
+
13
+ if device == 'cuda' and torch.cuda.is_available():
14
+ self.device = torch.device("cuda")
15
+ else:
16
+ self.device = torch.device("cpu")
17
+
18
+ # load model
19
+ self.model = ParaSurf_model.ResNet3D_Transformer(in_channels=feature_channels, block=ParaSurf_model.DilatedBottleneck,
20
+ num_blocks=[3, 4, 6, 3], num_classes=1)
21
+
22
+ # load weights
23
+ self.model.load_state_dict(torch.load(model_path, map_location=self.device)) #
24
+ # model to eval mode and to device
25
+ self.model = self.model.to(self.device).eval()
26
+
27
+ self.featurizer = KalasantyFeaturizer(gridSize, voxelSize)  # defines how each surface point is turned into a voxel grid
28
+ self.feature_channels = feature_channels
29
+
30
+ def get_lig_scores(self, prot, batch_size, add_forcefields, add_atom_radius_features):
31
+
32
+ self.featurizer.get_channels(prot.mol, add_forcefields, add_atom_radius_features)
33
+
34
+
35
+ lig_scores = []
36
+ input_data = torch.zeros((batch_size, self.gridSize, self.gridSize, self.gridSize, self.feature_channels), device=self.device)
37
+
38
+ batch_cnt = 0
39
+ for p, n in zip(prot.surf_points, prot.surf_normals):
40
+ input_data[batch_cnt,:,:,:,:] = torch.tensor(self.featurizer.grid_feats(p, n, prot.heavy_atom_coords), device=self.device)
41
+ batch_cnt += 1
42
+ if batch_cnt == batch_size:
43
+ with torch.no_grad():
44
+ output = self.model(input_data)
45
+ output = torch.sigmoid(output)
46
+ lig_scores.extend(output.cpu().numpy())
47
+ batch_cnt = 0
48
+
49
+ if batch_cnt > 0:
50
+ with torch.no_grad():
51
+ output = self.model(input_data[:batch_cnt])
52
+ output = torch.sigmoid(output)
53
+ lig_scores.extend(output.cpu().numpy())
54
+
55
+ print(np.array(lig_scores).shape)
56
+ return np.array(lig_scores)
57
+
58
+
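A minimal inference sketch: the weight path is hypothetical and feature_channels=22 is an assumption taken from the example features folder suffix; it must match the feature vector length the model was trained with.

    from ParaSurf.train.network import Network
    from ParaSurf.train.protein import Protein_pred

    prot = Protein_pred('/path/to/receptor.pdb', save_path='/tmp/parasurf_pred')    # hypothetical path
    net = Network('/path/to/model_weights.pth', gridSize=41, feature_channels=22)   # assumed channel count
    lig_scores = net.get_lig_scores(prot, batch_size=64,
                                    add_forcefields=True, add_atom_radius_features=True)
    print(lig_scores.shape)   # one sigmoid score per surface point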
ParaSurf/train/protein.py ADDED
@@ -0,0 +1,92 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os, numpy as np
2
+ import shutil
3
+ # import pybel
4
+ from openbabel import pybel
5
+ from ParaSurf.train.utils import simplify_dms
6
+ from ParaSurf.utils.fix_surfpoints_format_issues import process_surfpoints_directory
7
+
8
+
9
+ class Protein_pred:
10
+ def __init__(self, prot_file, save_path, seed=None, atom_points_threshold=5, locate_only_surface=False):
11
+
12
+ prot_id = prot_file.split('/')[-1].split('.')[0]
13
+ self.save_path = os.path.join(save_path, prot_id)
14
+
15
+ if not os.path.exists(self.save_path):
16
+ os.makedirs(self.save_path)
17
+
18
+ self.mol = next(pybel.readfile(prot_file.split('.')[-1], prot_file))
19
+ self.atom_points_thresh = atom_points_threshold
20
+
21
+ surfpoints_file = os.path.join(self.save_path, prot_id + '.surfpoints')
22
+
23
+ # we have all the surfpoints ready from the preprocessing step
24
+ if not os.path.exists(surfpoints_file):
25
+ os.system('dms ' + prot_file + ' -d 0.1 -n -o ' + surfpoints_file) #default value for d is 0.2
26
+ # fix any format issues
27
+ print('\nfixing surfpoints format ...')
28
+ process_surfpoints_directory(self.save_path)
29
+ # raise Exception('probably DMS not installed')
30
+
31
+ # locate surface: decide whether the final coordinates include all receptor atoms or only the surface points
32
+ self.surf_points, self.surf_normals = simplify_dms(surfpoints_file, seed=seed,
33
+ locate_surface=locate_only_surface)
34
+
35
+ self.heavy_atom_coords = np.array([atom.coords for atom in self.mol.atoms if atom.atomicnum > 1])
36
+
37
+ self.binding_sites = []
38
+ if prot_file.endswith('pdb'):
39
+ with open(prot_file, 'r') as f:
40
+ lines = f.readlines()
41
+ self.heavy_atom_lines = [line for line in lines if line[:4] == 'ATOM' and line.split()[2][0] != 'H']
42
+ if len(self.heavy_atom_lines) != len(self.heavy_atom_coords):
43
+ ligand_in_pdb = len([line for line in lines if line.startswith('HETATM')]) > 0
44
+ if ligand_in_pdb:
45
+ raise Exception('Ligand found in PDB file. Please remove it to proceed.')
46
+ else:
47
+ raise Exception('Inconsistency between Coords and PDBLines')
48
+ else:
49
+ raise IOError('Protein file should be .pdb')
50
+
51
+ def _surfpoints_to_atoms(self, surfpoints):
52
+ close_atoms = np.zeros(len(surfpoints), dtype=int)
53
+ for p, surf_coord in enumerate(surfpoints):
54
+ dist = np.sqrt(np.sum((self.heavy_atom_coords - surf_coord) ** 2, axis=1))
55
+ close_atoms[p] = np.argmin(dist)
56
+
57
+ return np.unique(close_atoms)
58
+
59
+ def add_bsite(self, cluster): # cluster -> tuple: (surf_points,scores)
60
+ atom_idxs = self._surfpoints_to_atoms(cluster[0])
61
+ self.binding_sites.append(Bsite(self.heavy_atom_coords, atom_idxs, cluster[1]))
62
+
63
+ def sort_bsites(self):
64
+ avg_scores = np.array([bsite.score for bsite in self.binding_sites])
65
+ sorted_idxs = np.flip(np.argsort(avg_scores), axis=0)
66
+ self.binding_sites = [self.binding_sites[idx] for idx in sorted_idxs]
67
+
68
+ def write_bsites(self):
69
+ if not os.path.exists(self.save_path):
70
+ os.makedirs(self.save_path)
71
+
72
+ centers = np.array([bsite.center for bsite in self.binding_sites])
73
+ np.savetxt(os.path.join(self.save_path, 'centers.txt'), centers, delimiter=' ', fmt='%10.3f')
74
+
75
+ pocket_count = 0
76
+ for i, bsite in enumerate(self.binding_sites):
77
+ outlines = [self.heavy_atom_lines[idx] for idx in bsite.atom_idxs]
78
+ if len(outlines) > self.atom_points_thresh:
79
+ pocket_count += 1
80
+ with open(os.path.join(self.save_path, 'pocket' + str(pocket_count) + '.pdb'), 'w') as f:
81
+ f.writelines(outlines)
82
+
83
+
84
+
85
+
86
+ class Bsite:
87
+ def __init__(self, mol_coords, atom_idxs, scores):
88
+ self.coords = mol_coords[atom_idxs]
89
+ self.center = np.average(self.coords, axis=0)
90
+ self.score = np.average(scores)
91
+ self.atom_idxs = atom_idxs
92
+
ParaSurf/train/train.py ADDED
@@ -0,0 +1,172 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import time, random
3
+ import numpy as np
4
+ import torch
5
+ import torch.nn as nn
6
+ import torch.optim as optim
7
+ from torch.utils.data import DataLoader
8
+ from torch.optim.lr_scheduler import StepLR
9
+ from ParaSurf.model import ParaSurf_model
10
+ from ParaSurf.model.dataset import dataset
11
+ from validation import validate_residue_level
12
+ import wandb
13
+ from tqdm import tqdm
14
+
15
+
16
+ user = os.getenv('USER')
17
+ base_dir = f'/home/{user}/PycharmProjects/github_projects/ParaSurf/test_data'
18
+ CFG = {
19
+ 'name': 'ParaSurf train dummy eraseme folder',
20
+ 'initial_lr': 0.0001,
21
+ 'epochs': 100,
22
+ 'batch_size': 64,
23
+ 'grid': 41, # don't change
24
+ 'seed': 42,
25
+ 'wandb': False,
26
+ 'debug': False,
27
+ 'model_weights': None, # if '' or None is given, training starts from scratch
28
+ 'num_workers': 8,
29
+ 'feat_type': ['kalasanty_with_force_fields'],
30
+ 'feats_path': os.path.join(base_dir, 'feats'),
31
+ 'TRAIN_samples': os.path.join(base_dir, 'datasets/eraseme_TRAIN.samples'),
32
+ 'VAL_proteins_list': os.path.join(base_dir, 'datasets/eraseme_VAL.proteins'),
33
+ 'VAL_proteins': os.path.join(base_dir, 'pdbs/eraseme/VAL'),
34
+ 'save_dir': f'/home/{user}/PycharmProjects/github_projects/ParaSurf/ParaSurf/train/eraseme/model_weights'
35
+ }
36
+
37
+ if CFG['wandb']:
38
+ wandb.init(project='ParaSurf', entity='your_project_name', config=CFG, name=CFG['name'])
39
+
40
+
41
+ # Set random seed for repeatability
42
+ def set_seed(seed_value):
43
+ """Set seed for reproducibility."""
44
+ random.seed(seed_value)
45
+ np.random.seed(seed_value)
46
+ torch.manual_seed(seed_value)
47
+ if torch.cuda.is_available():
48
+ torch.cuda.manual_seed_all(seed_value)
49
+
50
+
51
+ set_seed(CFG['seed'])
52
+
53
+ with open(CFG['TRAIN_samples']) as f:
54
+ lines = f.readlines()
55
+ feature_vector_lentgh = int(lines[0].split()[1].split('/')[0].split('_')[-1])
56
+
57
+ # model
58
+ device = 'cuda' if torch.cuda.is_available() else 'cpu'
59
+ model = ParaSurf_model.ResNet3D_Transformer(in_channels=feature_vector_lentgh,
60
+ block=ParaSurf_model.DilatedBottleneck,
61
+ num_blocks=[3, 4, 6, 3], num_classes=1).to(device)
62
+ print(model)
63
+ criterion = nn.BCEWithLogitsLoss()
64
+ optimizer = optim.Adam(model.parameters(), lr=CFG['initial_lr'])
65
+
66
+ scheduler = StepLR(optimizer, step_size=5, gamma=0.1)
67
+
68
+ # Load Dataset
69
+ train_set = dataset(CFG['TRAIN_samples'], CFG['batch_size'], CFG['feats_path'], CFG['grid'], True,
70
+ feature_vector_lentgh, CFG['feat_type'])
71
+ train_loader = DataLoader(dataset=train_set, batch_size=CFG['batch_size'], shuffle=True,
72
+ num_workers=CFG['num_workers'])
73
+
74
+ # Training
75
+ if not os.path.exists(CFG['save_dir']):
76
+ os.makedirs(CFG['save_dir'])
77
+
78
+ # check whether pretrained weights are loaded and resume from the corresponding epoch
79
+ if CFG['model_weights'] and os.path.exists(CFG['model_weights']):
80
+ model.load_state_dict(torch.load(CFG['model_weights']))
81
+ start_epoch = int(CFG['model_weights'].split('/')[-1].split('.')[0].split('_')[1]) + 1
82
+ print(f"\nLoading weights from epoch {start_epoch-1} ...\n")
83
+ print(f"Start training for epoch {start_epoch} ...")
84
+ else:
85
+ print('\nStart training from scratch ...')
86
+ start_epoch = 0
87
+
88
+
89
+ train_losses = [] # to keep track of training losses
90
+
91
+ for epoch in range(start_epoch, CFG['epochs']):
92
+ start = time.time()
93
+ model.train()
94
+ total_loss = 0.0
95
+
96
+ correct_train_predictions = 0 # Reset for each epoch
97
+ total_train_samples = 0 # Reset for each epoch
98
+
99
+ for i, (inputs, labels) in tqdm(enumerate(train_loader), total=len(train_loader)):
100
+ inputs, labels = inputs.float().to(device), labels.to(device).unsqueeze(1)
101
+ total_train_samples += labels.shape[0]
102
+ optimizer.zero_grad()
103
+
104
+ # scaler option
105
+ # with torch.cuda.amp.autocast():
106
+ outputs = model(inputs)
107
+ loss = criterion(outputs, labels.float())
108
+
109
+ loss.backward()
110
+
111
+ # Apply gradient clipping
112
+ torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1)
113
+
114
+ optimizer.step()
115
+
116
+ total_loss += loss.item()
117
+
118
+ predicted_train = torch.sigmoid(outputs) > 0.5
119
+ correct_train_predictions += (predicted_train == labels).sum().item()
120
+
121
+ # Print the training loss every 100 batches
122
+ if (i + 1) % 100 == 0:
123
+ print(f"Epoch: {epoch + 1} Batch: {i + 1} Train Loss: {loss.item():.3f}")
124
+
125
+ # if CFG['wandb']:
126
+ # wandb.log({'Mini Batch Train Loss': loss.item()})
127
+
128
+ if CFG['debug']:
129
+ break
130
+
131
+
132
+ avg_train_loss = total_loss / len(train_loader)
133
+ train_accuracy = correct_train_predictions / total_train_samples # Calculate training accuracy
134
+
135
+ train_losses.append(avg_train_loss)
136
+
137
+ cur_model_weight_path = os.path.join(CFG['save_dir'], f'epoch_{epoch}.pth')
138
+ torch.save(model.state_dict(), cur_model_weight_path)
139
+
140
+ avg_auc_roc, avg_precision, avg_recall, avg_auc_pr, avg_f1 = validate_residue_level(valset=CFG['VAL_proteins_list'],
141
+ modelweights=cur_model_weight_path,
142
+ test_folder=CFG['VAL_proteins'],
143
+ epoch=epoch + 1,
144
+ feat_type=CFG['feat_type'],
145
+ feature_vector_lentgh=feature_vector_lentgh)
146
+
147
+
148
+ print(
149
+ f"Epoch {epoch + 1}/{CFG['epochs']} - Train Loss: {avg_train_loss:.3f}, Train Accuracy: {train_accuracy:.3f},"
150
+ f"Val_AUC-ROC: {avg_auc_roc:.3f}, Val_Precision: {avg_precision:.3f}, Val_Recall: {avg_recall:.3f},"
151
+ f" Val_AUC_pr: {avg_auc_pr:.3f}, Val_F1: {avg_f1}")
152
+
153
+ print(f"Total epoch time: {(time.time() - start) / 60:.3f} mins")
154
+
155
+ if CFG['wandb']:
156
+ wandb.log({'Epoch': epoch,
157
+ 'Train Loss': avg_train_loss,
158
+ 'Train Accuracy': train_accuracy,
159
+ 'Valid AUC-ROC': avg_auc_roc,
160
+ 'Valid Precision': avg_precision,
161
+ 'Valid Recall': avg_recall,
162
+ 'Valid AUC-pr': avg_auc_pr,
163
+ 'Valid F1': avg_f1
164
+ })
165
+
166
+
167
+ # Step the scheduler
168
+ scheduler.step()
169
+
170
+ # # Finish the wandb run at the end of all epochs for the current iteration
171
+ if CFG['wandb']:
172
+ wandb.finish()
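The resume logic above derives the starting epoch purely from the checkpoint file name saved by this script (epoch_{N}.pth); a small worked example of that parsing with a hypothetical path:

    weights = '/path/to/model_weights/epoch_7.pth'   # hypothetical checkpoint
    start_epoch = int(weights.split('/')[-1].split('.')[0].split('_')[1]) + 1
    print(start_epoch)   # 8: the training loop resumes at epoch index 8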
ParaSurf/train/utils.py ADDED
@@ -0,0 +1,497 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import warnings, os
2
+ import numpy as np
3
+ from scipy.spatial.distance import euclidean
4
+ from sklearn.cluster import KMeans
5
+ import matplotlib.pyplot as plt
6
+ from sklearn.metrics import roc_auc_score, roc_curve
7
+ from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, auc, precision_recall_curve, \
8
+ confusion_matrix, matthews_corrcoef
9
+
10
+
11
+ def mol2_reader(mol_file): # does not handle H2
12
+ if mol_file[-4:] != 'mol2':
13
+ raise Exception("File's extension is not .mol2")
14
+
15
+ with open(mol_file, 'r') as f:
16
+ lines = f.readlines()
17
+
18
+ for i, line in enumerate(lines):
19
+ if '@<TRIPOS>ATOM' in line:
20
+ first_atom_idx = i + 1
21
+ if '@<TRIPOS>BOND' in line:
22
+ last_atom_idx = i - 1
23
+
24
+ return lines[first_atom_idx:last_atom_idx + 1]
25
+
26
+
27
+ # maybe change to read_surfpoints_new ??
28
+ def readSurfPoints(surf_file):
29
+ with open(surf_file, 'r') as f:
30
+ lines = f.readlines()
31
+
32
+ lines = [l for l in lines if len(l.split()) > 7]
33
+
34
+ if len(lines) > 100000:
35
+ warnings.warn('{} has too many points'.format(surf_file))
36
+ return
37
+ if len(lines) == 0:
38
+ warnings.warn('{} is empty'.format(surf_file))
39
+ return
40
+
41
+ coords = np.zeros((len(lines), 3))
42
+ normals = np.zeros((len(lines), 3))
43
+ for i, l in enumerate(lines):
44
+ parts = l.split()
45
+
46
+ try:
47
+ coords[i, 0] = float(parts[3])
48
+ coords[i, 1] = float(parts[4])
49
+ coords[i, 2] = float(parts[5])
50
+ normals[i, 0] = float(parts[8])
51
+ normals[i, 1] = float(parts[9])
52
+ normals[i, 2] = float(parts[10])
53
+ except:
54
+ coords[i, 0] = float(parts[2][-8:])
55
+ coords[i, 1] = float(parts[3])
56
+ coords[i, 2] = float(parts[4])
57
+ normals[i, 0] = float(parts[7])
58
+ normals[i, 1] = float(parts[8])
59
+ normals[i, 2] = float(parts[9])
60
+
61
+ return coords, normals
62
+
63
+
64
+ def readSurfPoints_with_receptor_atoms(surf_file):
65
+ with open(surf_file, 'r') as f:
66
+ lines = f.readlines()
67
+
68
+ # lines = [l for l in lines if len(l.split()) > 7]
69
+ lines = [l for l in lines]
70
+ if len(lines) > 100000:
71
+ warnings.warn('{} has too many points'.format(surf_file))
72
+ return
73
+ if len(lines) == 0:
74
+ warnings.warn('{} is empty'.format(surf_file))
75
+ return
76
+
77
+ coords = np.zeros((len(lines), 3))
78
+ normals = np.zeros((len(lines), 3))
79
+
80
+ # First, ensure each line has at least 11 parts by filling with zeros
81
+ for i in range(len(lines)):
82
+ parts = lines[i].split()
83
+ while len(parts) < 11:
84
+ # Fill with '0' initially
85
+ parts.append('0')
86
+ lines[i] = ' '.join(parts)
87
+
88
+ # Modify lines according to the specified rules
89
+ for i in range(len(lines)):
90
+ parts = lines[i].split()
91
+ # Check if there are zeros that need to be replaced
92
+ if '0' in parts:
93
+ if i > 0: # Use previous line if not the first line
94
+ prev_parts = lines[i - 1].split()
95
+ parts = [prev_parts[j] if part == '0' else part for j, part in enumerate(parts)]
96
+ elif i < len(lines) - 1: # Use next line if not the last line
97
+ next_parts = lines[i + 1].split()
98
+ parts = [next_parts[j] if part == '0' else part for j, part in enumerate(parts)]
99
+ lines[i] = ' '.join(parts)
100
+
101
+ try:
102
+ coords[i, 0] = float(parts[3])
103
+ coords[i, 1] = float(parts[4])
104
+ coords[i, 2] = float(parts[5])
105
+ normals[i, 0] = float(parts[8])
106
+ normals[i, 1] = float(parts[9])
107
+ normals[i, 2] = float(parts[10])
108
+ except:
109
+ coords[i, 0] = float(parts[2][-8:])
110
+ coords[i, 1] = float(parts[3])
111
+ coords[i, 2] = float(parts[4])
112
+ normals[i, 0] = float(parts[7])
113
+ normals[i, 1] = float(parts[8])
114
+ normals[i, 2] = float(parts[9])
115
+
116
+ return coords, normals
117
+
118
+
119
+ def simplify_dms(init_surf_file, seed=None, locate_surface=True):
120
+ # Here we decide if we want the final coordinates to have the receptor atoms or we want just
121
+ # the surface atoms
122
+ if locate_surface:
123
+ coords, normals = readSurfPoints(init_surf_file)
124
+ else:
125
+ coords, normals = readSurfPoints_with_receptor_atoms(init_surf_file) # to also get the receptor points
126
+
127
+ return coords, normals  # NOTE: returns here, so the K-means simplification below is currently bypassed
128
+
129
+ nCl = len(coords)
130
+
131
+ kmeans = KMeans(n_clusters=nCl, max_iter=300, n_init=1, random_state=seed).fit(coords)
132
+ point_labels = kmeans.labels_
133
+ centers = kmeans.cluster_centers_
134
+ cluster_idx, freq = np.unique(point_labels, return_counts=True)
135
+ if len(cluster_idx) != nCl:
136
+ raise Exception('Number of created clusters should be equal to nCl')
137
+
138
+ idxs = []
139
+ for cl in cluster_idx:
140
+ cluster_points_idxs = np.where(point_labels == cl)[0]
141
+ closest_idx_to_center = np.argmin([euclidean(centers[cl], coords[idx]) for idx in cluster_points_idxs])
142
+ idxs.append(cluster_points_idxs[closest_idx_to_center])
143
+
144
+ return coords[idxs], normals[idxs]
145
+
146
+
147
+ def rotation(n):
148
+ if n[0] == 0.0 and n[1] == 0.0:
149
+ if n[2] == 1.0:
150
+ return np.identity(3)
151
+ elif n[2] == -1.0:
152
+ Q = np.identity(3)
153
+ Q[0, 0] = -1
154
+ return Q
155
+ else:
156
+ print('not possible')
157
+
158
+ rx = -n[1] / np.sqrt(n[0] * n[0] + n[1] * n[1])
159
+ ry = n[0] / np.sqrt(n[0] * n[0] + n[1] * n[1])
160
+ rz = 0
161
+ th = np.arccos(n[2])
162
+
163
+ q0 = np.cos(th / 2)
164
+ q1 = np.sin(th / 2) * rx
165
+ q2 = np.sin(th / 2) * ry
166
+ q3 = np.sin(th / 2) * rz
167
+
168
+ Q = np.zeros((3, 3))
169
+ Q[0, 0] = q0 * q0 + q1 * q1 - q2 * q2 - q3 * q3
170
+ Q[0, 1] = 2 * (q1 * q2 - q0 * q3)
171
+ Q[0, 2] = 2 * (q1 * q3 + q0 * q2)
172
+ Q[1, 0] = 2 * (q1 * q2 + q0 * q3)
173
+ Q[1, 1] = q0 * q0 - q1 * q1 + q2 * q2 - q3 * q3
174
+ Q[1, 2] = 2 * (q3 * q2 - q0 * q1)
175
+ Q[2, 0] = 2 * (q1 * q3 - q0 * q2)
176
+ Q[2, 1] = 2 * (q3 * q2 + q0 * q1)
177
+ Q[2, 2] = q0 * q0 - q1 * q1 - q2 * q2 + q3 * q3
178
+
179
+ return Q
180
+
181
+
182
+ def TP_TN_FP_FN_visualization2pdb(gt_binding_site_coordinates, lig_scores, to_save_path, gt_indexes):
183
+ '''
184
+ Create dummy PDB files to visualize the results (TP, TN, FP, FN) on the receptor PDB file
185
+ '''
186
+ threshold = 0.5
187
+
188
+ # Initialize lists
189
+ TP_coords = []
190
+ FP_coords = []
191
+ TN_coords = []
192
+ FN_coords = []
193
+
194
+ for i, score in enumerate(lig_scores):
195
+ # If the atom is a true binding site
196
+ if i in gt_indexes:
197
+ if score > threshold:
198
+ TP_coords.append(gt_binding_site_coordinates[i])
199
+ else:
200
+ FN_coords.append(gt_binding_site_coordinates[i])
201
+ # If the atom is not a binding site
202
+ else:
203
+ if score > threshold:
204
+ FP_coords.append(gt_binding_site_coordinates[i])
205
+ else:
206
+ TN_coords.append(gt_binding_site_coordinates[i])
207
+
208
+ def generate_pdb_file(coordinates, file_name):
209
+ """Generate a dummy PDB file using the provided coordinates."""
210
+ with open(os.path.join(to_save_path, file_name), 'w') as pdb_file:
211
+ atom_number = 1
212
+ for coord in coordinates:
213
+ x, y, z = coord
214
+ pdb_file.write(
215
+ f"ATOM {atom_number:5} DUM DUM A{atom_number:4} {x:8.3f}{y:8.3f}{z:8.3f} 1.00 0.00\n")
216
+ atom_number += 1
217
+ if atom_number == 9999:
218
+ atom_number = 1
219
+ pdb_file.write("END")
220
+
221
+ # Generate PDB files for TP, FP, TN, and FN
222
+ generate_pdb_file(TP_coords, "TP_atoms.pdb")  # generate_pdb_file already joins to_save_path
223
+ generate_pdb_file(FP_coords, "FP_atoms.pdb")
224
+ generate_pdb_file(TN_coords, "TN_atoms.pdb")
225
+ generate_pdb_file(FN_coords, "FN_atoms.pdb")
226
+
227
+ print('TP:', len(TP_coords), 'FP:', len(FP_coords), 'FN:', len(FN_coords), 'TN:', len(TN_coords))
228
+
229
+
230
+ def visualize_TP_TN_FP_FN_residue_level(lig_scores, gt_indexes, residues, receptor_path, tosavepath):
231
+ threshold = 0.5
232
+
233
+ # Initialize lists
234
+ tp_list = []
235
+ fp_list = []
236
+ tn_list = []
237
+ fn_list = []
238
+
239
+ with open(receptor_path, 'r') as f:
240
+ lines = f.readlines()
241
+
242
+ res_atoms = [len(i[1]['atoms']) for i in residues.items()]
243
+
244
+ for i, score in enumerate(lig_scores):
245
+ # If the atom is a true binding site
246
+ lines2add = res_atoms[i]
247
+ if i in gt_indexes:
248
+ if score > threshold:
249
+ tp_list.append(lines[:lines2add])
250
+ del lines[:lines2add]
251
+ else:
252
+ fn_list.append(lines[:lines2add])
253
+ del lines[:lines2add]
254
+ # If the atom is not a binding site
255
+ else:
256
+ if score > threshold:
257
+ fp_list.append(lines[:lines2add])
258
+ del lines[:lines2add]
259
+ else:
260
+ tn_list.append(lines[:lines2add])
261
+ del lines[:lines2add]
262
+
263
+ # Generate PDB files for TP, FP, TN, and FN
264
+ with open(os.path.join(tosavepath, 'TP_residues.pdb'), 'w') as f:
265
+ for l in tp_list:
266
+ for item in l:
267
+ f.write(item)
268
+ with open(os.path.join(tosavepath, 'FP_residues.pdb'), 'w') as f:
269
+ for l in fp_list:
270
+ for item in l:
271
+ f.write(item)
272
+ with open(os.path.join(tosavepath, 'FN_residues.pdb'), 'w') as f:
273
+ for l in fn_list:
274
+ for item in l:
275
+ f.write(item)
276
+ with open(os.path.join(tosavepath, 'TN_residues.pdb'), 'w') as f:
277
+ for l in tn_list:
278
+ for item in l:
279
+ f.write(item)
280
+
281
+ # print('TP:', len(tp_list), 'FP:', len(fp_list), 'FN:', len(fn_list), 'TN:', len(tn_list))
282
+
283
+
284
+ def calculate_TP_TN_FP_FN(lig_scores, gt_indexes):
285
+ threshold = 0.5
286
+
287
+ # Initialize lists
288
+ TP = 0
289
+ FP = 0
290
+ TN = 0
291
+ FN = 0
292
+
293
+ for i, score in enumerate(lig_scores):
294
+ # If the atom is a true binding site
295
+ if i in gt_indexes:
296
+ if score > threshold:
297
+ TP += 1
298
+ else:
299
+ FN += 1
300
+ # If the atom is not a binding site
301
+ else:
302
+ if score > threshold:
303
+ FP += 1
304
+ else:
305
+ TN += 1
306
+
307
+ print('TP:', TP, 'FP:', FP, 'FN:', FN, 'TN:', TN)
308
+
309
+
310
+ def show_roc_curve(true_labels, lig_scores, auc_roc):
311
+ # Calculate ROC curve
312
+ fpr, tpr, thresholds = roc_curve(true_labels, lig_scores)
313
+
314
+ # Plot ROC curve
315
+ plt.figure(figsize=(8, 6))
316
+ plt.plot(fpr, tpr, color='darkorange', lw=2, label=f'ROC curve (area = {auc_roc:.2f})')
317
+ plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
318
+ plt.xlim([0.0, 1.0])
319
+ plt.ylim([0.0, 1.05])
320
+ plt.xlabel('False Positive Rate')
321
+ plt.ylabel('True Positive Rate')
322
+ plt.title('Receiver Operating Characteristic (ROC) Curve')
323
+ plt.legend(loc="lower right")
324
+ plt.show()
325
+
326
+
327
+ def calculate_metrics(true_labels, predicted_labels, lig_scores, to_save_metrics_path):
328
+ auc_roc = roc_auc_score(true_labels, lig_scores)
329
+ accuracy = accuracy_score(true_labels, predicted_labels)
330
+ precision = precision_score(true_labels, predicted_labels)
331
+ recall = recall_score(true_labels, predicted_labels)
332
+ f1 = f1_score(true_labels, predicted_labels)
333
+ pr, re, _ = precision_recall_curve(true_labels, lig_scores)
334
+ auc_pr = auc(re, pr)
335
+ conf_matrix = confusion_matrix(true_labels, predicted_labels)
336
+ mcc = matthews_corrcoef(true_labels, predicted_labels)
337
+
338
+ tn, fp, fn, tp = conf_matrix.ravel()
339
+ fpr = fp / (fp + tn)
340
+ tpr = tp / (tp + fn) # True positive rate == sensitivity == recall
341
+ npv = tn / (tn + fn)
342
+ spc = tn / (fp + tn) # Specificity or True Negative Rate
343
+
344
+ with open(to_save_metrics_path, 'w') as f:
345
+ print(f"AUC-ROC: {auc_roc:.4f}", file=f)
346
+ print(f"Accuracy: {accuracy:.4f}", file=f)
347
+ print(f"Precision: {precision:.4f}", file=f)
348
+ print(f"Recall: {recall:.4f}", file=f)
349
+ print(f"F1 Score: {f1:.4f}", file=f)
350
+ print(f"AUC-PR: {auc_pr:.4f}", file=f)
351
+ print(f"Confusion Matrix:\n {conf_matrix}", file=f)
352
+ print(f"Matthews Correlation Coefficient: {mcc:.4f}", file=f)
353
+ print(f"False Positive Rate (FPR): {fpr:.4f}", file=f)
354
+ print(f"Negative Predictive Value (NPV): {npv:.4f}", file=f)
355
+ print(f"Specificity (SPC): {spc:.4f}", file=f)
356
+
357
+ return auc_roc, accuracy, precision, recall, f1, auc_pr, conf_matrix, mcc, fpr, npv, spc
358
+
359
+
360
+ def filter_out_HETATMs(pdb_file_path):
361
+ with open(pdb_file_path, 'r') as infile:
362
+ lines = infile.readlines()
363
+
364
+ # Filter out lines starting with 'HETATM'
365
+ filtered_lines = [line for line in lines if not line.startswith('HETATM')]
366
+
367
+ # Write the filtered lines back to the file
368
+ with open(pdb_file_path, 'w') as outfile:
369
+ outfile.writelines(filtered_lines)
370
+
371
+
372
+ def write_residue_prediction_pdb(receptor, output_pdb_path, residues_best):
373
+ """
374
+ :param receptor: original receptor pdb file path
375
+ :param output_pdb_path: path where the residue-level prediction PDB file is written
376
+ :param residues_best: the residues dict with scores
377
+ :return: Write the prediction PDB file with scores at residue level (replaces B-factor for residues).
378
+ """
379
+ # rec_name = receptor.split('/')[-1].split('_')[0]
380
+ # output_pdb_path = os.path.join(results_save_path, f'{rec_name}_pred.pdb')
381
+ #
382
+ # # Ensure the directory exists
383
+ # os.makedirs(results_save_path, exist_ok=True)
384
+
385
+ # Open the original receptor PDB file and the output PDB file for writing the predictions
386
+ with open(receptor, 'r') as original_pdb, open(output_pdb_path, 'w') as pred_pdb:
387
+ for line in original_pdb:
388
+ if line.startswith("ATOM") or line.startswith("HETATM"): # Process only ATOM and HETATM records
389
+ # Extract residue info (residue number, chain ID, and insertion code)
390
+ chain_id = line[21]
391
+ res_num = line[22:26].strip()
392
+ insertion_code = line[26].strip()
393
+
394
+ # Create the residue ID in the same format as in residues_best
395
+ res_id = f'{res_num}_{chain_id}'
396
+ if insertion_code:
397
+ res_id = f'{res_id}_{insertion_code}'
398
+
399
+ # Check if the residue exists in residues_best
400
+ if res_id in residues_best:
401
+ # Extract the prediction score
402
+ pred_score = residues_best[res_id]['scores']
403
+
404
+ # Modify the line to replace the B-factor (position 61-66) with the prediction score
405
+ new_b_factor = f'{pred_score:6.3f}' # Format the prediction score with 3 decimal places
406
+ new_line = f'{line[:60]}{new_b_factor:>6}{line[66:]}'
407
+
408
+ # Write the modified line to the new PDB file
409
+ pred_pdb.write(new_line)
410
+ else:
411
+ # If no prediction score is found, write the original line
412
+ pred_pdb.write(line)
413
+ else:
414
+ # Write lines that do not start with ATOM or HETATM (like headers and footers) unchanged
415
+ pred_pdb.write(line)
416
+
417
+ print(f"Residue-level prediction PDB file saved as {output_pdb_path}")
418
+
419
+
420
+ def write_atom_prediction_pdb(receptor, results_save_path, lig_scores_only_receptor_atoms):
421
+ """
422
+ :param receptor: original receptor pdb file path
423
+ :param results_save_path: directory where the per-atom prediction PDB file is saved
424
+ :param lig_scores_only_receptor_atoms: per-atom ligandability scores, in the same order as the ATOM/HETATM records
425
+ :return: Write the prediction PDB file with scores at atom level (replaces B-factor for each atom).
426
+ """
427
+
428
+ rec_name = receptor.split('/')[-1].split('_')[0]
429
+ output_pdb_path = os.path.join(results_save_path, f'{rec_name}_pred_per_atom.pdb')
430
+
431
+ # Make sure the length of lig_scores matches the number of atoms in the PDB
432
+ assert len(lig_scores_only_receptor_atoms) == sum(1 for line in open(receptor) if line.startswith("ATOM") or line.startswith("HETATM")), \
433
+ "Number of scores doesn't match the number of atoms in the PDB file"
434
+
435
+ # Open the original receptor PDB file and the output PDB file for writing the predictions
436
+ with open(receptor, 'r') as original_pdb, open(output_pdb_path, 'w') as pred_pdb2:
437
+ atom_index = 0 # To track which score corresponds to which atom
438
+ for line in original_pdb:
439
+ if line.startswith("ATOM") or line.startswith("HETATM"): # Process only ATOM and HETATM records
440
+ # Extract the prediction score for the current atom
441
+ pred_score = lig_scores_only_receptor_atoms[atom_index][0] # Get the prediction score for this atom
442
+
443
+ # Modify the line to replace the B-factor (position 61-66) with the prediction score
444
+ new_b_factor = f'{pred_score:6.3f}' # Format the prediction score with 3 decimal places
445
+ new_line = f'{line[:60]}{new_b_factor:>6}{line[66:]}' # Insert the prediction score at the correct position
446
+
447
+ # Write the modified line to the new PDB file
448
+ pred_pdb2.write(new_line)
449
+
450
+ # Increment the atom index
451
+ atom_index += 1
452
+ else:
453
+ # Write lines that do not start with ATOM or HETATM (like headers and footers) unchanged
454
+ pred_pdb2.write(line)
455
+
456
+ print(f"Per-atom prediction PDB file saved as {output_pdb_path}")
457
+
458
+
459
+ def receptor_info(receptor, lig_scores_only_receptor_atoms):
460
+ """
461
+ Extract residue groups and compute the best scores for each residue in the receptor.
462
+
463
+ Args:
464
+ receptor (str): The path to the receptor PDB file.
465
+ lig_scores_only_receptor_atoms (ndarray): List of ligandability scores for each atom.
466
+
467
+ Returns:
468
+ residues (dict): Dictionary containing atom information and ligand scores for each residue.
469
+ residues_best (dict): Dictionary containing the best ligand score for each residue.
470
+ """
471
+ # Create the residue groups for the whole protein
472
+ residues = {}
473
+ with open(receptor, 'r') as file:
474
+ for line in file:
475
+ if line.startswith("ATOM"):
476
+ chain_id = line[21] # Extract chain identifier
477
+ atom_id = line[6:11].strip()
478
+ res_id = f'{line[22:26].strip()}_{chain_id}' # Concatenate residue ID with chain ID
479
+ insertion_code = line[26].strip()
480
+ if insertion_code:
481
+ res_id = f'{res_id}_{insertion_code}'
482
+ if res_id not in residues:
483
+ residues[res_id] = {"atoms": [], 'scores': []}
484
+ residues[res_id]["atoms"].append(atom_id)
485
+
486
+ atom2check = int(atom_id) - 1
487
+ residues[res_id]['scores'].append(lig_scores_only_receptor_atoms[atom2check][0])
488
+
489
+ # Take the best scores for the whole protein
490
+ residues_best = {}
491
+ for res_id, res_data in residues.items():
492
+ residues_best[res_id] = {'scores': []}
493
+ check_best = res_data['scores']
494
+ best_atom = check_best.index(max(check_best)) # We take the best atom score of the residue
495
+ residues_best[res_id]['scores'] = check_best[best_atom]
496
+
497
+ return residues, residues_best
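A small numerical check (illustrative) of rotation(): it builds the matrix Q that rotates the z-axis onto a unit surface normal, which is how grid_feats in features.py aligns each local grid before voxelization.

    import numpy as np
    from ParaSurf.train.utils import rotation

    n = np.array([0.6, 0.0, 0.8])                           # a unit-length surface normal
    Q = rotation(n)
    print(np.allclose(Q @ np.array([0.0, 0.0, 1.0]), n))    # True: Q maps +z onto n
    print(np.allclose(Q @ Q.T, np.eye(3)))                  # True: Q is orthogonal, so inv(Q) == Q.T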