File size: 2,736 Bytes
4a3f787
 
 
 
 
 
 
 
 
 
 
 
 
486fd8a
 
4a3f787
486fd8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4a3f787
486fd8a
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
import os
from argparse import FileType, ArgumentParser

import numpy as np
import pandas as pd
from Bio.PDB import PDBParser
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from tqdm import tqdm
from Bio import SeqIO



# Three-letter residue names mapped to the one-letter codes written to FASTA.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "ASN": "N",
    "ASP": "D",
    "CYS": "C",
    "GLN": "Q",
    "GLU": "E",
    "GLY": "G",
    "HIS": "H",
    "ILE": "I",
    "LEU": "L",
    "LYS": "K",
    "MET": "M",
    "MSE": "M",  # selenomethionine: MET with the sulfur replaced by selenium
    "PHE": "F",
    "PRO": "P",
    "PYL": "O",
    "SER": "S",
    "SEC": "U",
    "THR": "T",
    "TRP": "W",
    "TYR": "Y",
    "VAL": "V",
    "ASX": "B",
    "GLX": "Z",
    "XAA": "X",
    "XLE": "J",
}

# A residue is only treated as an amino acid if all backbone atoms are present.
_BACKBONE_ATOMS = frozenset({"CA", "N", "C"})


def _chain_to_sequence(chain, file_path):
    """Return the one-letter sequence of the amino-acid residues in *chain*.

    Waters ("HOH") and residues lacking any of the N, CA, C backbone atoms are
    skipped. Residues with an unrecognised name are written as "-" and a
    warning mentioning *file_path* is printed.
    """
    letters = []
    for residue in chain:
        resname = residue.get_resname()
        if resname == "HOH":
            continue
        atom_names = {atom.name for atom in residue}
        if not _BACKBONE_ATOMS <= atom_names:
            # Not an amino acid (ligand, ion, incomplete residue): ignore it.
            continue
        letter = _THREE_TO_ONE.get(resname)
        if letter is None:
            letter = "-"
            print(
                "encountered unknown AA: ",
                resname,
                " in the complex ",
                file_path,
                ". Replacing it with a dash - .",
            )
        letters.append(letter)
    return "".join(letters)


def esm_embedding_prep(out_file, protein_path):
    """Write the chain sequences of a PDB structure to a FASTA file.

    The structure at *protein_path* is parsed with Biopython's ``PDBParser``
    and only its first model is used. Each chain of that model yields one
    FASTA record whose id is ``<basename of protein_path>_chain_<index>``,
    where ``index`` is the chain's position in the model, and whose
    description is empty.

    Args:
        out_file: Destination passed straight to ``SeqIO.write``.
        protein_path: Path of the PDB file to read.
    """
    biopython_parser = PDBParser()

    file_paths = [protein_path]
    sequences = []
    ids = []
    for file_path in tqdm(file_paths):
        structure = biopython_parser.get_structure("random_id", file_path)
        first_model = structure[0]
        base_name = os.path.basename(file_path)
        for i, chain in enumerate(first_model):
            sequences.append(_chain_to_sequence(chain, file_path))
            ids.append(f"{base_name}_chain_{i}")

    records = []
    for record_id, seq in zip(ids, sequences):
        record = SeqRecord(Seq(seq), str(record_id))
        record.description = ""
        records.append(record)
    SeqIO.write(records, out_file, "fasta")