# ASCARIS / code / get_alphafoldStructures.py
# Duplicated from fatmacankara/ASCARIS (revision c2a02c6).
import tarfile, glob, os
from biopandas.pdb import PandasPdb
import argparse
import numpy as np
# Command-line interface: a single option naming the tar archive (or glob
# pattern) to look for inside input_files/.
parser = argparse.ArgumentParser(description='ASCARIS')
parser.add_argument(
    '-file_name', '--file_name',
    help='Name of the tar file inside input_files/ to untar',
    # NOTE(review): the default is the integer 1, which yields the glob
    # pattern 'input_files/1' -- confirm whether this option should be
    # required instead.
    default=1,
)
args = parser.parse_args()
# Archive name that create_file() interpolates into its glob pattern.
alphafold = args.file_name
# Three-letter residue names and the one-letter codes this script emits.
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "VAL": "V",
    "GLU": "E",
    "PRO": "P",
    "LEU": "L",
    "GLY": "G",
    "ASN": "N",
    "SER": "S",
    "GLN": "Q",
    "THR": "T",
    "MET": "M",
    "LYS": "K",
    "ASP": "D",
    "ILE": "I",
    "PHE": "F",
    "TRP": "W",
    "TYR": "Y",
    "HIS": "H",
    "CYS": "C",
    "UNK": "X",
    # NOTE(review): IUPAC uses 'B' for ASX (Asp or Asn) and 'O' for
    # pyrrolysine; 'O' is kept so existing output does not change --
    # confirm against the consumers of this code.
    "ASX": "O",
}


def threeToOne(variant):
    """Convert a three-letter residue name to its one-letter code.

    Args:
        variant: Residue name such as "ALA". Matching is exact and
            case-sensitive.

    Returns:
        The one-letter code when ``variant`` is in the lookup table;
        otherwise ``variant`` itself, unchanged.
    """
    try:
        return _THREE_TO_ONE.get(variant, variant)
    except TypeError:
        # Unhashable values cannot be residue names; pass them through.
        return variant
# Unpack AlphaFold structures
def _extract_archives(pattern, destination):
    """Unpack every tar archive whose path matches ``pattern`` into ``destination``."""
    for archive_path in glob.glob(pattern):
        with tarfile.open(archive_path) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Extraction filters are available: reject members that
                # would be written outside ``destination``.
                tar.extractall(destination, filter='data')
            else:
                # NOTE(review): this Python has no extraction filters, so
                # members are not validated -- only use trusted archives.
                tar.extractall(destination)


def _summary_row(pdb_path):
    """Return ``(uniprot_id, chain_id, sequence, model_no)`` for one structure file."""
    atoms = PandasPdb().read_pdb(pdb_path).df['ATOM']
    # Keep only the CA atom rows; .copy() so the column assignment below
    # works on an independent frame rather than a slice of ``atoms``.
    ca_atoms = atoms.loc[
        atoms.atom_name == 'CA',
        ['residue_name', 'residue_number', 'chain_id'],
    ].copy()
    ca_atoms['residue_name'] = ca_atoms['residue_name'].apply(threeToOne)
    ca_atoms = ca_atoms.drop_duplicates(['residue_name', 'residue_number'])
    sequence = ''.join(ca_atoms.residue_name.to_list())

    # Split the file name only, so hyphens in directory names cannot shift
    # the fields. Assumes names shaped like 'AF-<uniprotID>-F<model>-...'
    # -- TODO confirm against the archive contents.
    name_fields = os.path.basename(pdb_path).split('-')
    uniprot_id = name_fields[1].strip()
    model_no = name_fields[2].strip()[1:]  # drop the leading letter

    # First chain in file order, so the choice is deterministic.
    chain_id = ca_atoms.chain_id.iloc[0]
    return uniprot_id, chain_id, sequence, model_no


def create_file():
    """Unpack the AlphaFold archive(s) and write a tab-separated summary.

    Archives matching ``input_files/<alphafold>`` (``alphafold`` is the
    module-level value taken from ``--file_name``) are extracted into
    ``input_files/alphafold_structures/``. Every file there whose name
    contains ``pdb`` then contributes one row to
    ``input_files/alphafold_summary.txt`` with the columns
    ``uniprotID``, ``chain``, ``sequence`` and ``model_num``.

    Raises:
        IndexError: if a structure file name has fewer than three
            '-'-separated fields, or the structure has no CA atoms.
    """
    structures_dir = 'input_files/alphafold_structures/'
    os.makedirs(structures_dir, exist_ok=True)
    _extract_archives(f'input_files/{alphafold}', structures_dir)

    # Create summary file
    # glob order is arbitrary; sort so the rows come out reproducibly.
    structure_paths = sorted(glob.glob(structures_dir + '*pdb*'))
    with open('input_files/alphafold_summary.txt', 'w') as summary_file:
        summary_file.write('uniprotID\tchain\tsequence\tmodel_num\n')
        for pdb_path in structure_paths:
            summary_file.write('\t'.join(_summary_row(pdb_path)) + '\n')
if __name__ == '__main__':
    # Only build the structures directory and summary when run as a script.
    create_file()