Spaces:
Running
on
T4
Running
on
T4
File size: 2,920 Bytes
4a3f787 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 |
import os
from argparse import FileType, ArgumentParser
import numpy as np
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm
# Command-line interface of the script: where the per-complex folders live,
# which receptor file variant to read, and where the FASTA output is written.
parser = ArgumentParser(
    description='Extract one amino-acid sequence per protein chain from the '
                'receptor PDB files of a dataset and write them to a FASTA file.'
)
parser.add_argument(
    '--data_dir', type=str, default='data/PDBBind_processed',
    help='Directory containing one sub-folder per complex; each sub-folder '
         'holds the receptor file <name>_protein_processed.pdb and/or '
         '<name>_protein.pdb.',
)
parser.add_argument(
    '--chain_cutoff', type=int, default=10,
    help='If greater than 10, read <name>_protein_obabel_reduce.pdb instead of '
         '<name>_protein_processed.pdb (both fall back to <name>_protein.pdb '
         'when the preferred file is missing).',
)
parser.add_argument(
    '--out_file', type=str, default="data/pdbbind_sequences.fasta",
    help='Path of the FASTA file to write (one record per chain).',
)
args = parser.parse_args()

cutoff = args.chain_cutoff
data_dir = args.data_dir
# Every directory entry is treated as the name of one complex.
names = os.listdir(data_dir)
#%%
from Bio import SeqIO
# Single Bio.PDB parser instance, reused to load every receptor structure.
biopython_parser = PDBParser()
# Lookup table: three-letter residue name -> one-letter sequence code.
# Residue names that are missing from this table are not translatable and
# have to be handled by the code doing the lookup.
three_to_one = dict(
    ALA='A', ARG='R', ASN='N', ASP='D', CYS='C',
    GLN='Q', GLU='E', GLY='G', HIS='H', ILE='I',
    LEU='L', LYS='K', MET='M',
    MSE='M',  # almost the same amino acid as MET; the sulfur is replaced by selenium
    PHE='F', PRO='P', PYL='O', SER='S', SEC='U',
    THR='T', TRP='W', TYR='Y', VAL='V',
    ASX='B', GLX='Z', XAA='X', XLE='J',
)
# Walk every complex in `names`, parse its receptor PDB file and collect one
# amino-acid sequence string per chain.
# `sequences[k]` and `ids[k]` always describe the same chain; an id has the
# form '<complex name>_chain_<chain index>'.
backbone_atom_names = {'N', 'CA', 'C'}
sequences = []
ids = []
for name in tqdm(names):
    if name == '.DS_Store':  # stray non-complex entry in the data directory
        continue

    # Choose the receptor file: the obabel-reduced variant when cutoff > 10,
    # the processed variant otherwise; fall back to the plain
    # '<name>_protein.pdb' whenever the preferred file does not exist.
    complex_dir = os.path.join(data_dir, name)
    if cutoff > 10:
        rec_path = os.path.join(complex_dir, f'{name}_protein_obabel_reduce.pdb')
    else:
        rec_path = os.path.join(complex_dir, f'{name}_protein_processed.pdb')
    if not os.path.exists(rec_path):
        rec_path = os.path.join(complex_dir, f'{name}_protein.pdb')

    structure = biopython_parser.get_structure('random_id', rec_path)
    structure = structure[0]  # only the first model of the file is used
    for i, chain in enumerate(structure):
        letters = []
        for residue in chain:
            resname = residue.get_resname()
            if resname == 'HOH':  # water
                continue
            # Only append the residue if it has the N, CA and C backbone
            # atoms, i.e. it is an amino acid and not some other residue.
            if not backbone_atom_names.issubset(atom.name for atom in residue):
                continue
            try:
                letters.append(three_to_one[resname])
            except KeyError:
                # Keep the sequence position but mark the residue as unknown.
                letters.append('-')
                print("encountered unknown AA: ", resname, ' in the complex ', name, '. Replacing it with a dash - .')
        # A chain without any backbone-complete residue still yields an
        # (empty) entry, so chain indices stay aligned with the structure.
        seq = ''.join(letters)
        sequences.append(seq)
        ids.append(f'{name}_chain_{i}')
# Wrap every collected sequence in a SeqRecord identified by its chain id,
# with an empty description, then write all records to the output file in
# FASTA format.
records = [
    SeqRecord(Seq(chain_seq), str(chain_id), description='')
    for chain_id, chain_seq in zip(ids, sequences)
]
SeqIO.write(records, args.out_file, "fasta")
|