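# Spaces: Sleeping
# NOTE(review): the line above looks like page-header residue from a web
# capture of this script, not part of the program; kept only as a comment.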
import argparse
import glob
import os
import tarfile

import numpy as np
from biopandas.pdb import PandasPdb
# Command-line interface: a single option naming the tar archive to unpack.
parser = argparse.ArgumentParser(description='ASCARIS')
parser.add_argument(
    '-file_name',
    '--file_name',
    # NOTE(review): the default is the integer 1, so running without the
    # option makes create_file() glob for 'input_files/1' -- confirm intent.
    default=1,
    help='Enter the file tar file name to untar',
)
args = parser.parse_args()

# Read by create_file(), which globs for this name under input_files/.
alphafold = args.file_name
# Three-letter residue code -> one-letter code, as used by threeToOne().
_THREE_TO_ONE = {
    "ALA": "A",
    "ARG": "R",
    "VAL": "V",
    "GLU": "E",
    "PRO": "P",
    "LEU": "L",
    "GLY": "G",
    "ASN": "N",
    "SER": "S",
    "GLN": "Q",
    "THR": "T",
    "MET": "M",
    "LYS": "K",
    "ASP": "D",
    "ILE": "I",
    "PHE": "F",
    "TRP": "W",
    "TYR": "Y",
    "HIS": "H",
    "CYS": "C",
    "UNK": "X",
    "ASX": "O",
}


def threeToOne(variant):
    """Translate a three-letter residue code into its one-letter code.

    Matching is exact and case-sensitive. Besides the twenty standard
    residues, 'UNK' maps to 'X' and 'ASX' maps to 'O'.

    Args:
        variant: Residue code to translate.

    Returns:
        The one-letter code, or ``variant`` itself when it is not one of
        the known three-letter codes.
    """
    return _THREE_TO_ONE.get(variant, variant)
def _parse_alphafold_name(path):
    """Return ``(uniprot_id, model_number)`` parsed from a structure path.

    Only the file name is split on '-', so hyphens in directory names
    cannot shift the fields. For a name shaped like
    ``AF-P12345-F1-model_v4.pdb.gz`` the result is ``('P12345', '1')``.
    """
    fields = os.path.basename(path).split('-')
    # The third field carries a one-character prefix before the number.
    return fields[1].strip(), fields[2].strip()[1:]


def _read_ca_residues(path):
    """Return one alpha-carbon row per residue from the PDB file at ``path``.

    The returned frame has the columns ``residue_name`` (converted with
    threeToOne), ``residue_number`` and ``chain_id``.
    """
    atoms = PandasPdb().read_pdb(path).df['ATOM']
    # .copy() so the column assignment below modifies an independent frame
    # instead of a slice of the parsed structure.
    ca_atoms = atoms.loc[
        atoms.atom_name == 'CA',
        ['residue_name', 'residue_number', 'chain_id'],
    ].copy()
    ca_atoms['residue_name'] = ca_atoms['residue_name'].apply(threeToOne)
    # Keep a single row when the same residue appears more than once.
    return ca_atoms.drop_duplicates(['residue_name', 'residue_number'])


def create_file():
    """Unpack the AlphaFold archive(s) and write a per-structure summary.

    Every archive matching ``input_files/<alphafold>`` (the module-level
    name taken from the command line) is extracted into
    ``input_files/alphafold_structures/``. Each extracted file whose name
    contains ``pdb`` then contributes one tab-separated line to
    ``input_files/alphafold_summary.txt`` with the columns
    ``uniprotID``, ``chain``, ``sequence`` and ``model_num``.

    Raises:
        IndexError: If a structure has no CA atoms, or its file name does
            not contain at least three '-'-separated fields.
    """
    structures_dir = 'input_files/alphafold_structures/'
    os.makedirs(structures_dir, exist_ok=True)

    # Unzip AlphaFold structures
    for archive in glob.glob(f'input_files/{alphafold}'):
        with tarfile.open(archive) as tar:
            if hasattr(tarfile, 'data_filter'):
                # Refuse members that would escape the target directory on
                # interpreters that support extraction filters.
                tar.extractall(structures_dir, filter='data')
            else:
                tar.extractall(structures_dir)

    # Create summary file; the context manager guarantees it is flushed and
    # closed even if parsing one of the structures fails.
    with open('input_files/alphafold_summary.txt', 'w') as summary:
        summary.write('uniprotID\tchain\tsequence\tmodel_num\n')
        # Sorted so the row order does not depend on directory listing order.
        for pdb_path in sorted(glob.glob('input_files/alphafold_structures/*pdb*')):
            residues = _read_ca_residues(pdb_path)
            uniprot_id, model_no = _parse_alphafold_name(pdb_path)
            sequence = ''.join(residues.residue_name.to_list())
            # First chain in file order (deterministic, unlike set ordering).
            chain_id = residues.chain_id.iloc[0]
            summary.write(f'{uniprot_id}\t{chain_id}\t{sequence}\t{model_no}\n')
# Script entry point: build the summary only when executed directly.
if __name__ == '__main__':
    create_file()