# IMPORT NECESSARY MODULES AND LIBRARIES
from timeit import default_timer as timer
import xml.etree.ElementTree as ET
from collections import Counter
from bs4 import BeautifulSoup
from io import StringIO
from decimal import *
import pandas as pd
import requests
import os.path as op
import subprocess
import shutil
import ssbio.utils
import warnings
import sys
import pathlib
from pathlib import Path
import os, glob
import math
import ssbio
import ssl
from Bio.Align import substitution_matrices
from Bio.PDB.Polypeptide import *
from Bio.PDB import PDBList
from Bio import Align
from Bio import SeqIO
from Bio.PDB import *
from Bio.PDB import PDBParser, PPBuilder
import streamlit as st

warnings.filterwarnings("ignore")
start = timer()
# FUNCTIONS
from calc_pc_property import *
from add_domains import *
from add_annotations import *
from add_sequence import *
from add_structure import *
from add_alignment import *
from manage_files import *
from add_3Dalignment import *
from add_sasa import *
from standard import *
from add_interface_pos import *
from uniprotSequenceMatch import uniprotSequenceMatch
from process_input import clean_data
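# NOTE: helper functions used below, such as threeToOne(), get_uniprot_seq(), get_isoforms(),
# get_pdb_ids(), final_stage() and alignment(), are assumed to be provided by the star-imported
# project modules above (e.g. standard, add_sequence, add_alignment); they are not Biopython APIs.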
def pdb(input_set, mode, impute):
    """
    STEP 1
    Get the input data from the console.
    Add a datapoint identifier and remove non-standard input.
    """
    aligner = Align.PairwiseAligner()
    data = clean_data(input_set)
    path_to_input_files, path_to_output_files, path_to_domains, fisher_path, path_to_interfaces, buffer = manage_files(
        mode)
    out_path = path_to_output_files / 'log.txt'
    print('Creating directories...')
    annotation_list = ['disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                       'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
                       'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain', 'caBinding',
                       'bindingSite', 'region', 'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
                       'coiledCoil', 'peptide', 'transitPeptide', 'glycosylation', 'propeptide']
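    # UniProt sequence-feature categories tracked by the pipeline; each category later receives
    # a positional column plus a matching '<name>Binary' presence column.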
    print('Feature vector generation started...\n')
    if len(data) == 0:
        print('Feature vector generation terminated.')
    else:
        """
        STEP 2
        Add physicochemical properties.
        """
        print('Adding physicochemical properties...\n')
        data = add_physicochemical(data)
        """
        STEP 3
        Add domain-related information.
        """
        print('Adding domains\n')
        data = add_domains(data, path_to_domains)
        data = data.astype(str)
        data = data.replace({'NaN': 'nan'})
        data.domain = data.domain.replace({'nan': '-1'})
        data.domStart = data.domStart.replace({'nan': '-1'})
        data.domEnd = data.domEnd.replace({'nan': '-1'})
        data.distance = data.distance.replace({'nan': '-1'})
        """
        STEP 4
        Retrieve canonical and isoform UniProt sequences.
        Add to the data frame.
        """
        print('Retrieving UniProt sequences...\n')
        canonical_fasta = pd.DataFrame(columns=['uniprotID', 'uniprotSequence'])
        up_list = list(set(data['uniprotID'].to_list()))
        for i in range(len(up_list)):
            canonical_fasta.at[i, 'uniprotSequence'] = get_uniprot_seq(up_list[i])
            canonical_fasta.at[i, 'uniprotID'] = up_list[i]
        canonical_fasta = canonical_fasta.drop_duplicates()
        isoform_fasta = pd.DataFrame(columns=['uniprotID', 'isoformSequence'])
        iso_dict = []
        for i in range(len(up_list)):
            iso_dict.append(get_isoforms(up_list[i]))
        index = 0
        for i in iso_dict:
            for key, val in i.items():
                isoform_fasta.at[index, 'uniprotID'] = key
                isoform_fasta.at[index, 'isoformSequence'] = val
                index += 1
        isoform_fasta = isoform_fasta.drop_duplicates()
        for i in isoform_fasta.index:
            isoform_fasta.at[i, 'whichIsoform'] = isoform_fasta.at[i, 'uniprotID'][7:10].strip()
            isoform_fasta.at[i, 'uniprotID'] = isoform_fasta.at[i, 'uniprotID'][0:6]
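        # Isoform accessions are assumed to look like 'P12345-2': characters [0:6] hold the
        # 6-character primary accession and the token after position 7 holds the isoform
        # number, hence the [7:10] / [0:6] slices above.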
        print('Sequence files created...\n')
        data = data.merge(canonical_fasta, on='uniprotID', how='left')
        data = data.astype(str)
        data['whichIsoform'] = 'nan'
        data.replace({'': 'nan'}, inplace=True)
        data['wt_sequence_match'] = ''
        for i in data.index:
            if len(data.at[i, 'uniprotSequence']) >= int(data.at[i, 'pos']):
                wt = data.at[i, 'wt']
                can = str(data.at[i, 'uniprotSequence'])[int(data.at[i, 'pos']) - 1]
                if wt == can:
                    data.at[i, 'wt_sequence_match'] = 'm'
                elif wt != can:
                    isoList = isoform_fasta[
                        isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                    for k in isoList:
                        if len(k) >= int(data.at[i, 'pos']):
                            resInIso = k[int(data.at[i, 'pos']) - 1]
                            if wt == resInIso:
                                whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
                                data.at[i, 'wt_sequence_match'] = 'i'
                                data.at[i, 'whichIsoform'] = whichIsoform
                                break
            elif len(data.at[i, 'uniprotSequence']) < int(data.at[i, 'pos']):
                isoList = isoform_fasta[isoform_fasta['uniprotID'] == data.at[i, 'uniprotID']].isoformSequence.to_list()
                for k in isoList:
                    if len(k) >= int(data.at[i, 'pos']):
                        resInIso = k[int(data.at[i, 'pos']) - 1]
                        wt = data.at[i, 'wt']
                        if wt == resInIso:
                            whichIsoform = isoform_fasta[isoform_fasta.isoformSequence == k].whichIsoform.to_list()[0]
                            data.at[i, 'wt_sequence_match'] = 'i'
                            data.at[i, 'whichIsoform'] = whichIsoform
                            break
        data.wt_sequence_match = data.wt_sequence_match.astype('str')
        data.replace({'': 'nan'}, inplace=True)
        data_size = len(data.drop_duplicates(['datapoint']))
        not_match_in_uniprot = data[(data.uniprotSequence == 'nan') | (data.wt_sequence_match == 'nan')]
        uniprot_matched = data[(data.uniprotSequence != 'nan') & (data.wt_sequence_match != 'nan')]
        data = None
        print('You have %d data points that failed to match a UniProt sequence.\nProceeding with the %d remaining...\n'
              % (len(not_match_in_uniprot.drop_duplicates(['datapoint'])),
                 len(uniprot_matched.drop_duplicates(['datapoint']))))
        """
        STEP 5
        Retrieve related PDB entries and extract their sequences.
        Add to the data frame.
        """
        from urllib.error import HTTPError
        pdb_fasta = pd.DataFrame(columns=['pdbID', 'chain', 'pdbSequence'])
        pdb_info = pd.DataFrame(columns=['uniprotID', 'pdbID', 'chain', 'resolution'])
        print('Retrieving PDB structures...\n')
        pdbs = []
        protein = uniprot_matched.uniprotID.to_list()
        protein = list(set(protein))
        for prot in protein:
            pdbs.append(get_pdb_ids(prot))
        if len(pdbs) >= 1:
            pdbs = [item for sublist in pdbs for item in sublist]
        else:
            pdbs = []
        print('Processing PDB structures...\n')
        if pdbs == []:
            print('No PDB structure found for the query.')
        print('Starting PDB structures download...\n')
        pdbs = list(filter(None, pdbs))
        pdbs = set(pdbs)
        pdbs = [i.lower() for i in pdbs]
        pdbl = PDBList()
        parser = PDBParser()
        index = 0
        try:
            shutil.rmtree('obsolete')
        except OSError:
            pass
        cnt = 0
        pdbs = [i.upper() for i in pdbs]

        def fetch_uniprot_ids(pdb_code):
            response = requests.get(f"https://www.ebi.ac.uk/pdbe/api/mappings/uniprot/{pdb_code}")
            response.raise_for_status()  # Check for a successful response
            data = response.json()
            return list(list(list(data.values())[0].values())[0].keys())
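        # fetch_uniprot_ids assumes the PDBe SIFTS "mappings/uniprot" JSON is shaped like
        # {"<pdb_id>": {"UniProt": {"P12345": {...}, ...}}}, so the nested values()/keys()
        # chain extracts the UniProt accessions mapped to the entry.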
        for search in pdbs:
            # Step 1: Fetch the PDB file
            pdb_url = f"https://files.rcsb.org/download/{search}.pdb"
            try:
                response = requests.get(pdb_url)
                response.raise_for_status()  # Check for a successful response
            except requests.exceptions.RequestException:
                continue  # Skip to the next PDB code if fetching fails
            # Step 2: Parse the PDB file from memory
            pdb_data = response.text
            pdb_parser = PDBParser(QUIET=True)  # QUIET=True suppresses warnings
            pdb_file_content = StringIO(pdb_data)
            structure = pdb_parser.get_structure(search, pdb_file_content)
            ppb = PPBuilder()
            pdb_data_list = pdb_data.split('\n')
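            # SEQRES records list each chain's residues: after whitespace-splitting, token [2]
            # holds the chain ID and tokens [4:] the three-letter residue names, so per-chain
            # sequences can be rebuilt by concatenation.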
            pdb_data_list_sequence = [i for i in pdb_data_list if i.startswith('SEQRES')]
            pdb_data_list_sequence = [list(filter(None, i.split(' '))) for i in pdb_data_list_sequence]
            seqs = {}
            for i in pdb_data_list_sequence:
                if i[2] in seqs.keys():
                    seqs[i[2]] += i[4:]
                else:
                    seqs[i[2]] = i[4:]
            for key, val in seqs.items():
                seqs[key] = ''.join([threeToOne(i) for i in val])
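            # DBREF records link chains to external databases; only those whose database field
            # is 'UNP' (UniProt) are kept. After splitting, token [2] is the chain ID and
            # token [6] the UniProt accession.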
            pdb_data_list = [i for i in pdb_data_list if i.startswith('DBREF')]
            pdb_data_list = [[list(filter(None, i.split(' '))) for j in i.split(' ') if j == 'UNP'] for i in pdb_data_list]
            pdb_data_list = [i for i in pdb_data_list if i != []]
            pdb_data_list_uniprot = [[j[6] for j in i] for i in pdb_data_list]
            # pdb_data_list = [[list(filter(None, j)) for j in i] for i in pdb_data_list]
            pdb_data_list = [[j[2] for j in i] for i in pdb_data_list]
            pdb_data_list = [i[0] for i in pdb_data_list]
            for model in structure:
                for pp in ppb.build_peptides(model):
                    sequence = pp.get_sequence()
                for chain, up in zip(model, pdb_data_list_uniprot):
                    chain_id = chain.get_id()
                    # Extract the UniProt IDs mapped to this entry, if available
                    uniprot_ids = fetch_uniprot_ids(search)
                    # Get the resolution from the PDB header
                    header = structure.header
                    resolution = header.get('resolution', 'N/A')
                    if chain_id in pdb_data_list:
                        # Record UniProt IDs, chain ID, sequence and resolution for the current chain
                        pdb_fasta.at[index, 'pdbID'] = search
                        pdb_fasta.at[index, 'chain'] = chain_id
                        pdb_fasta.at[index, 'pdbSequence'] = str(seqs[chain_id])
                        pdb_info.at[index, 'uniprotID'] = ', '.join(up)
                        pdb_info.at[index, 'pdbID'] = search
                        pdb_info.at[index, 'chain'] = chain_id
                        pdb_info.at[index, 'resolution'] = resolution
                        index += 1
        print('PDB file processing finished..')
        for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
            try:
                filename_replace_ext = filename.with_suffix(".pdb")
                filename.rename(filename_replace_ext)
            except FileNotFoundError:
                pass
        for filename in list(Path(path_to_output_files / 'pdb_structures').glob("*")):
            try:
                if filename.stem.startswith("pdb"):
                    filename_replace_ext = filename.with_name(filename.stem[3:])
                    filename.rename(filename_replace_ext.with_suffix('.pdb'))
            except FileNotFoundError:
                pass
        uniprot_matched = pd.merge(uniprot_matched, pdb_info, on='uniprotID', how='left')
        uniprot_matched = uniprot_matched.astype(str)
        uniprot_matched = uniprot_matched.drop_duplicates()
        uniprot_matched = uniprot_matched.merge(pdb_fasta, on=['pdbID', 'chain'], how='left')
        uniprot_matched = uniprot_matched.astype(str)
        with_pdb = uniprot_matched[(uniprot_matched.pdbID != 'nan') & (
                (uniprot_matched.resolution != 'nan') & (uniprot_matched.resolution != 'OT') & (
                uniprot_matched.resolution != 'None'))].drop_duplicates()
        no_pdb = uniprot_matched[(uniprot_matched.pdbID == 'nan') | (
                (uniprot_matched.resolution == 'nan') | (uniprot_matched.resolution == 'OT') | (
                uniprot_matched.resolution == 'None'))]
        no_pdb = no_pdb[~no_pdb.datapoint.isin(with_pdb.datapoint.to_list())]
        no_pdb.drop(columns=['chain', 'pdbID', 'pdbSequence', 'resolution'], inplace=True)
        print(
            'PDB information successfully added...\nPDB structures were found for %d of %d.\n%d of %d failed to match a PDB structure.\n'
            % (len(with_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint'])),
               len(no_pdb.drop_duplicates(['datapoint'])), len(uniprot_matched.drop_duplicates(['datapoint']))))
        with_pdb = with_pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
        with_pdb = with_pdb.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first')
        with_pdb.replace({'': 'nan'}, inplace=True)
        if len(with_pdb) == 0:
            with_pdb['pdbInfo'] = ''
        else:
            for i in with_pdb.index:
                try:
                    res = str(with_pdb.at[i, 'resolution'])
                    chain = with_pdb.at[i, 'chain']
                    new = with_pdb.at[i, 'pdbID'] + ':' + chain + ':' + res
                    with_pdb.at[i, 'pdbInfo'] = new
                except TypeError:
                    with_pdb.at[i, 'pdbInfo'] = 'nan'
        with_pdb = with_pdb[['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence', 'pdbSequence',
                             'wt_sequence_match',
                             'whichIsoform', 'pdbID', 'resolution', 'chain', 'pdbInfo', 'datapoint']]
        # Data points in the not_match_in_uniprot frame yield no results.
        # Data points in the no_pdb frame are searched in the ModBase and SWISS-MODEL steps.
        # Data points in the with_pdb frame are processed in the following steps.
        """
        STEP 6
        Retrieve sequence annotations.
        Add to the data frame.
        """
        if len(with_pdb) > 0:
            with_pdb = add_annotations(with_pdb)
        else:
            new_cols = with_pdb.columns.to_list() + ['disulfide', 'intMet', 'intramembrane', 'naturalVariant',
                                                     'dnaBinding', 'activeSite',
                                                     'nucleotideBinding', 'lipidation', 'site', 'transmembrane',
                                                     'crosslink', 'mutagenesis', 'strand',
                                                     'helix', 'turn', 'metalBinding', 'repeat', 'topologicalDomain',
                                                     'caBinding', 'bindingSite', 'region',
                                                     'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif',
                                                     'coiledCoil', 'peptide',
                                                     'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary',
                                                     'intMetBinary', 'intramembraneBinary',
                                                     'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                                                     'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                                                     'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                                                     'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                                                     'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                                                     'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                                                     'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                                                     'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                                                     'glycosylationBinary', 'propeptideBinary']
            with_pdb = pd.DataFrame(columns=new_cols)
        try:
            with_pdb.whichIsoform = with_pdb.whichIsoform.astype('str')
        except AttributeError:
            with_pdb['whichIsoform'] = ''
        with_pdb = with_pdb.astype(str)
        with_pdb = with_pdb.replace({'NaN': 'nan'})
        with_pdb.replace({'[]': 'nan'}, inplace=True)
        with_pdb.replace({'nan-nan': 'nan'}, inplace=True)
        with_pdb.replace({'': 'nan'}, inplace=True)
""" | |
STEP 7 | |
Do alignment for PDB | |
""" | |
# Canonical matches, i.e. labelled as m, canonical sequences will be aligned with PDB sequences. | |
# Isoform matches, i.e. labelled as i, isoform sequences will be aligned with PDB sequences. | |
with_pdb['uniprotSequence'] = with_pdb['uniprotSequence'].str.replace('U', 'C') | |
with_pdb['pdbSequence'] = with_pdb['pdbSequence'].str.replace('U', 'C') | |
dfM = with_pdb[with_pdb.wt_sequence_match == 'm'] | |
dfM = dfM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) | |
dfM = dfM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') | |
dfNM = with_pdb[with_pdb.wt_sequence_match == 'i'] | |
dfNM = dfNM.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True) | |
dfNM = dfNM.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'pdbSequence'], keep='first') | |
dfNM.rename(columns={'isoformSequence': 'uniprotSequence'}, inplace=True) | |
dfM = dfM.astype(str) | |
dfNM = dfNM.astype(str) | |
dfM.reset_index(inplace=True) | |
dfM.drop(['index'], axis=1, inplace=True) | |
dfNM.reset_index(inplace=True) | |
dfNM.drop(['index'], axis=1, inplace=True) | |
uniprot_matched_size = len(uniprot_matched.drop_duplicates(['datapoint'])) | |
uniprot_matched = None | |
pdb_fasta = None | |
pdb_info = None | |
pdbs = None | |
existing_pdb = None | |
with_pdb_size = len(with_pdb.drop_duplicates(['datapoint'])) | |
with_pdb = None | |
print('Aligning sequences...\n') | |
aligned_m = final_stage(dfM, annotation_list, Path(path_to_output_files / 'alignment_files')) | |
aligned_nm = final_stage(dfNM, annotation_list, Path(path_to_output_files / 'alignment_files')) | |
        # When the PDB sequence is nan, it is wrongly aligned to the UniProt sequence; fix those rows.
        for i in aligned_m.index:
            if aligned_m.at[i, 'pdbSequence'] == 'nan':
                aligned_m.at[i, 'mutationPositionOnPDB'] = 'nan'
                aligned_m.at[i, 'domainStartonPDB'] = 'nan'
                aligned_m.at[i, 'domainEndonPDB'] = 'nan'
                aligned_m.at[i, 'pdb_alignStatus'] = 'nan'
        for i in aligned_nm.index:
            if aligned_nm.at[i, 'pdbSequence'] == 'nan':
                aligned_nm.at[i, 'mutationPositionOnPDB'] = 'nan'
                aligned_nm.at[i, 'domainStartonPDB'] = 'nan'
                aligned_nm.at[i, 'domainEndonPDB'] = 'nan'
                aligned_nm.at[i, 'pdb_alignStatus'] = 'nan'
        # Check that both frames have the same column names before merging.
        aligned_m = aligned_m.astype(str)
        aligned_nm = aligned_nm.astype(str)
        frames = [aligned_m, aligned_nm]
        after_up_pdb_alignment = pd.concat(frames, sort=False)
        if len(after_up_pdb_alignment) == 0:
            after_up_pdb_alignment['pdb_alignStatus'] = ''
            after_up_pdb_alignment['mutationPositionOnPDB'] = ''
            after_up_pdb_alignment['domainStartonPDB'] = ''
            after_up_pdb_alignment['domainEndonPDB'] = ''
        after_up_pdb_alignment = after_up_pdb_alignment.sort_values(
            by=['uniprotID', 'wt', 'mut', 'pos', 'pdb_alignStatus', 'resolution', 'chain'],
            ascending=[True, True, True, True, True, True, True])
        after_up_pdb_alignment = after_up_pdb_alignment.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos'], keep='first')
        after_up_pdb_alignment = after_up_pdb_alignment.astype('str')
        pdb_aligned = after_up_pdb_alignment[
            (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB != 'nan')]
        yes_pdb_no_match = after_up_pdb_alignment[
            (after_up_pdb_alignment.pdbID != 'nan') & (after_up_pdb_alignment.mutationPositionOnPDB == 'nan')]
        no_pdb = no_pdb.copy()
        print('PDB matching is completed...\n')
        print('SUMMARY')
        print('-------')
        print('%d data points that failed to match a UniProt sequence are discarded.' % len(
            not_match_in_uniprot.drop_duplicates(['datapoint'])))
        print('Of the remaining %d:' % uniprot_matched_size)
        print('--%d of %d successfully aligned with PDB structures.' % (
            len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
        print('--%d of %d fall outside the region covered by the structure.' % (
            len(yes_pdb_no_match.drop_duplicates(['datapoint'])), with_pdb_size))
        print('--PDB structures were not found for %d data points.' % len(no_pdb.drop_duplicates(['datapoint'])))
        print('--%d will be searched in the SWISS-MODEL database.\n' % (
            len(yes_pdb_no_match.drop_duplicates(['datapoint'])) + len(no_pdb.drop_duplicates(['datapoint']))))
        dfM = None
        dfNM = None
        aligned_nm = None
        aligned_m = None
        after_up_pdb_alignment = None
        print('Proceeding to SWISS-MODEL search...')
        print('------------------------------------\n')
        # At this point we have four data frames:
        # 1. after_up_pdb_alignment --- the result of the PDB sequence alignment. Mutations that could not be
        #    matched after the alignment will also be searched in the other databases.
        #    1a. pdb_aligned --- done; no further search needed.
        #    1b. yes_pdb_no_match --- have PDB structures but did not match; searched in the other databases.
        # 2. not_match_in_uniprot --- cannot be aligned with anything because these proteins lack a matching
        #    UniProt sequence; only basic info is present.
        # 3. no_pdb --- no PDB structures were found; searched in the other databases.
        """
        STEP 8
        Neutralize data points that are to be searched in SWISS-MODEL.
        Note that the annotations of yes_pdb_no_match were adjusted to the PDB structures they matched earlier;
        they need to be converted back to their original UniProt annotation positions.
        """
        yes_pdb_no_match.drop(['disulfide', 'intMet',
                               'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                               'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                               'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                               'caBinding', 'topologicalDomain', 'bindingSite', 'region',
                               'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                               'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfideBinary',
                               'intMetBinary', 'intramembraneBinary',
                               'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                               'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                               'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                               'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                               'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                               'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                               'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                               'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                               'glycosylationBinary', 'propeptideBinary', 'pdbSequence', 'pdbInfo', 'pdbID',
                               'chain', 'resolution', 'pdb_alignStatus', 'mutationPositionOnPDB',
                               'domainStartonPDB', 'domainEndonPDB'], axis=1, inplace=True)
        to_swiss = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']), no_pdb.drop_duplicates(['datapoint'])])
        no_pdb = None
        to_swiss.reset_index(inplace=True)
        to_swiss.drop(['index'], axis=1, inplace=True)
        to_swiss = to_swiss.astype('str')
        to_swiss = to_swiss.replace({'NaN': 'nan'})
        # Create the model summary data frame.
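        # 'swissmodel_structures.txt' is assumed to be the tab-separated INDEX file shipped with the
        # SWISS-MODEL Repository metadata; its columns are mapped via the 'names' argument below.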
        if len(to_swiss) != 0:
            print('Generating SwissModel file...\n')
            swiss_model = pd.read_csv(Path(path_to_input_files / 'swissmodel_structures.txt'), sep='\t',
                                      dtype=str, header=None, skiprows=1,
                                      names=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5',
                                             'coordinate_id', 'provider', 'from', 'to', 'template', 'qmean',
                                             'qmean_norm', 'seqid', 'url'])
        else:
            swiss_model = pd.DataFrame(
                columns=['UniProtKB_ac', 'iso_id', 'uniprot_seq_length', 'uniprot_seq_md5', 'coordinate_id',
                         'provider', 'from', 'to', 'template', 'qmean', 'qmean_norm', 'seqid', 'url', 'whichIsoform'])
        swiss_model = swiss_model.astype('str')
        try:
            swiss_model.iso_id = swiss_model.iso_id.astype('str')
        except AttributeError:
            swiss_model['iso_id'] = 'nan'
        swiss_model = swiss_model[swiss_model.UniProtKB_ac != 'nan']
        for ind in swiss_model.index:
            swiss_model.at[ind, 'UniProtKB_ac'] = swiss_model.at[ind, 'UniProtKB_ac'].split('-')[0]
            if swiss_model.at[ind, 'iso_id'] != 'nan':
                swiss_model.at[ind, 'whichIsoform'] = swiss_model.at[ind, 'iso_id'].split('-')[1]
            else:
                swiss_model.at[ind, 'whichIsoform'] = 'nan'
        # swiss_model.drop(['input'], axis=1, inplace=True)
        swiss_model = swiss_model[swiss_model.provider == 'SWISSMODEL']
        print('Index file processed...\n')
        # Keep the relevant columns.
        swiss_model = swiss_model[
            ['UniProtKB_ac', 'from', 'to', 'template', 'qmean_norm', 'seqid', 'url', 'whichIsoform']]
        # Sort models by QMEAN score and identity. Some proteins have more than one model; we will pick one.
        swiss_model = swiss_model.sort_values(by=['UniProtKB_ac', 'qmean_norm', 'seqid'], ascending=False)
        swiss_model.reset_index(inplace=True)
        swiss_model.drop(['index'], axis=1, inplace=True)
        # Get the protein IDs for which models exist.
        swiss_model_ids = set(swiss_model.UniProtKB_ac.to_list())
        to_swiss = to_swiss.astype(str)
        no_swiss_models = pd.DataFrame()
        for i in to_swiss.index:
            if to_swiss.at[i, 'uniprotID'] not in swiss_model_ids:
                k = pd.Series(to_swiss.iloc[i])
                # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
                no_swiss_models = pd.concat([no_swiss_models, k.to_frame().T], ignore_index=True)
        no_swiss_models = no_swiss_models.astype(str)
        if len(no_swiss_models) == 0:
            no_swiss_models = pd.DataFrame(columns=to_swiss.columns)
        else:
            no_swiss_models = no_swiss_models[to_swiss.columns]
        no_swiss_models.reset_index(inplace=True)
        no_swiss_models.drop('index', axis=1, inplace=True)
        with_swiss_models = pd.concat([to_swiss, no_swiss_models]).drop_duplicates(['datapoint'], keep=False)
        with_swiss_models = with_swiss_models[to_swiss.columns]
        # Add model info.
        with_swiss_models = with_swiss_models.astype(str)
        swiss_model = swiss_model.astype(str)
        swiss_models_with_data = pd.merge(with_swiss_models, swiss_model, left_on=['uniprotID', 'whichIsoform'],
                                          right_on=['UniProtKB_ac', 'whichIsoform'],
                                          how='left')
        swiss_models_with_data = swiss_models_with_data.astype(str)
        swiss_models_with_data = swiss_models_with_data.sort_values(by=['uniprotID', 'wt', 'mut', 'pos', 'qmean_norm'],
                                                                    ascending=False)
        swiss_models_with_data = swiss_models_with_data.drop_duplicates()
        swiss_models_with_data = swiss_models_with_data.drop(['UniProtKB_ac', 'seqid'], axis=1)
        swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
        swiss_models_with_data = swiss_models_with_data.astype(str)
        # Collect the entries that are in the list but have no model URL; they go to the ModBase step.
        url_nan = swiss_models_with_data[swiss_models_with_data.url == 'nan']
        # Append these nan entries to no_swiss_models: without URLs they must be searched in ModBase.
        url_nan = url_nan.drop(['from', 'qmean_norm', 'template', 'to', 'url'], axis=1)
        no_swiss_models_2 = pd.concat([no_swiss_models, url_nan])
        swiss_models_with_data = swiss_models_with_data[swiss_models_with_data.url != 'nan']
        for i in swiss_models_with_data.index:
            try:
                swiss_models_with_data.at[i, 'chain'] = swiss_models_with_data.at[i, 'template'].split('.')[2]
                swiss_models_with_data.at[i, 'template'] = swiss_models_with_data.at[i, 'template'].split('.')[0]
            except IndexError:
                pass
        if len(swiss_models_with_data) == 0:
            swiss_models_with_data['chain'] = ''
            swiss_models_with_data['template'] = ''
        swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('str')
        swiss_models_with_data.chain = swiss_models_with_data.chain.astype('str')
        swiss_models_with_data['qmean_norm'] = swiss_models_with_data.qmean_norm.apply(lambda x: round(float(x), 2))
        swiss_models_with_data = swiss_models_with_data.astype(str)
        # swiss_models_with_data: these data points will be aligned with their corresponding model sequences.
        # Add sequences.
        no_swiss_models_2.reset_index(inplace=True)
        no_swiss_models_2.drop('index', axis=1, inplace=True)
        swiss_models_with_data.reset_index(inplace=True)
        swiss_models_with_data.drop('index', axis=1, inplace=True)
        swiss_model_ids = None
        with_swiss_models = None
        swiss_model = None
        no_swiss_models = None
        url_nan = None
        # At this point we have:
        # pdb_aligned --- aligned in the PDB phase.
        # not_match_in_uniprot --- cannot be aligned with anything (no matching UniProt sequence); basic info only.
        # to_swiss (no_pdb + yes_pdb_no_match) --- to be searched in the SWISS-MODEL database,
        #   split into with_swiss_models and no_swiss_models.
        # swiss_models_with_data --- SWISS-MODEL models were found for these.
        # no_swiss_models_2 (no_swiss_models + url_nan) --- to be searched in ModBase (entries whose SWISS-MODEL
        #   models do not cover the mutation, and broken_swiss, are added here later).
""" | |
STEP 9 | |
Associated model IDs are added. | |
Download model files. | |
""" | |
print('Beginning SwissModel files download...') | |
existing_swiss = list(Path(path_to_output_files / 'swissmodel_structures').glob("*")) | |
existing_swiss = [str(i) for i in existing_swiss] | |
existing_swiss = ['.'.join(i.split('/')[-1].split('.')[:-1]) for i in existing_swiss] | |
swissmodels_fasta = pd.DataFrame() | |
for i in swiss_models_with_data.index: | |
protein = swiss_models_with_data.at[i, 'uniprotID'] | |
template = swiss_models_with_data.at[i, 'template'].split('.')[0] | |
qmean_norm = str(round(float(swiss_models_with_data.at[i, 'qmean_norm']), 2)) | |
if protein + '_' + template + '_' + qmean_norm not in existing_swiss: | |
url = swiss_models_with_data.at[i, 'url'].strip('\"').strip('}').replace('\\', '').strip('\"').replace( | |
'https', | |
'https:') | |
req = requests.get(url) | |
name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') | |
print('Downloading for Protein:', protein + ' Model: ' + template) | |
with open(name, 'wb') as f: | |
f.write(req.content) | |
else: | |
print('Model exists.') | |
name = Path(path_to_output_files / 'swissmodel_structures' / f'{protein}_{template}_{qmean_norm}.txt') | |
with open(name, encoding="utf8") as f: | |
fasta = '' | |
lines = f.readlines() | |
chain = '' | |
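                # Rebuild each chain's sequence from CA ATOM records using fixed PDB columns
                # (17-20 = residue name, 20-22 = chain ID); a TER record closes the current
                # chain and flushes it as one row.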
                for row in lines:
                    if row[0:4] == 'ATOM' and row[13:15] == 'CA':
                        chain = row[20:22].strip()
                        fasta += threeToOne(row[17:20])
                    if row[0:3] == 'TER':
                        k = pd.Series([protein, template, qmean_norm, chain.upper(), fasta])
                        # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
                        swissmodels_fasta = pd.concat([swissmodels_fasta, k.to_frame().T], ignore_index=True)
                        fasta = ''
        if len(swissmodels_fasta) == 0:
            swissmodels_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta'])
        else:
            swissmodels_fasta.columns = ['uniprotID', 'template', 'qmean_norm', 'chain', 'fasta']
        swissmodels_fasta = swissmodels_fasta.astype(str)
        swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype(float)
        swissmodels_fasta.qmean_norm = swissmodels_fasta.qmean_norm.astype(float)
        swissmodels_fasta = swissmodels_fasta.sort_values(['uniprotID', 'template', 'qmean_norm', 'chain'],
                                                          axis=0)  # example = 3gdh
        swissmodels_fasta.reset_index(inplace=True)
        swissmodels_fasta.drop(['index'], axis=1, inplace=True)
        swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'qmean_norm', 'chain'])
        swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'chain', 'fasta'])
        swissmodels_fasta = swissmodels_fasta.drop_duplicates(['uniprotID', 'template', 'fasta'])
        # Some files were broken, so their PDBs could not be recorded.
        swissmodels_fasta = swissmodels_fasta.drop_duplicates()
        swissmodels_fasta = swissmodels_fasta.astype(str)
        swiss_models_with_data = swiss_models_with_data.astype(str)
        swissmodels_fasta = swissmodels_fasta.astype(str)
        swiss_models_with_data1 = swiss_models_with_data.merge(swissmodels_fasta,
                                                               on=['uniprotID', 'template', 'qmean_norm', 'chain'])
        swiss_models_with_data1 = swiss_models_with_data1.sort_values(['datapoint', 'fasta'], axis=0,
                                                                      ascending=[True, False])
        swiss_models_with_data1 = swiss_models_with_data1.drop_duplicates(['datapoint', 'template'])
        swiss_models_with_data1_dp = list(set(swiss_models_with_data1.datapoint.to_list()))
        swiss_models_with_data.reset_index(inplace=True)
        swiss_models_with_data.drop(['index'], axis=1, inplace=True)
        broken_swiss = pd.DataFrame()
        c = 0
        for i in swiss_models_with_data.index:  # present in the original frame but missing from the model results
            if swiss_models_with_data.at[i, 'datapoint'] not in swiss_models_with_data1_dp:
                k = pd.Series(swiss_models_with_data.iloc[i])
                broken_swiss = pd.concat([broken_swiss, k.to_frame().T], ignore_index=True)
                c += 1
        if len(broken_swiss) == 0:
            broken_swiss = pd.DataFrame(columns=swiss_models_with_data.columns.to_list())
        swiss_models_with_data = swiss_models_with_data1.copy()
        swiss_models_with_data.qmean_norm = swiss_models_with_data.qmean_norm.astype('float')
        swiss_models_with_data = swiss_models_with_data.sort_values(['uniprotID', 'wt', 'mut', 'qmean_norm'],
                                                                    axis=0, ascending=[True, True, True, False])
        # Drop identical model sequences with lower quality.
        swiss_models_with_data = swiss_models_with_data.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
                                                                        keep='first')
        swiss_models_with_data.uniprotSequence = swiss_models_with_data.uniprotSequence.astype('str')
        swiss_models_with_data.pos = swiss_models_with_data.pos.astype('int')
        # Sanity check (left as a bare, side-effect-free expression in the original):
        # len(swiss_models_with_data.drop_duplicates(['datapoint'])) + len(
        #     broken_swiss.drop_duplicates(['datapoint'])) + len(
        #     no_swiss_models_2.drop_duplicates(['datapoint'])) == len(to_swiss.drop_duplicates(['datapoint']))
        # The data here include all candidate models of different qualities,
        # because we may get a hit in any of them.
        swiss_models_with_data.rename({'fasta': 'pdbSequence'}, axis=1, inplace=True)  # for convenience
        # NOW DO THE ALIGNMENT HERE
        swiss_models_with_data = swiss_models_with_data.replace({'[\'?\']': 'nan'})
        swiss_models_with_data = swiss_models_with_data.replace({'[]': 'nan'})
        swiss_models_with_data.rename({'template': 'pdbID'}, axis=1,
                                      inplace=True)  # only so the alignment code above can be reused
        swiss_models_with_data = swiss_models_with_data.astype(str)
        swiss_models_with_data.pdbSequence = swiss_models_with_data.pdbSequence.astype('str')
        swiss_models_with_data = add_annotations(swiss_models_with_data)
        swiss_models_with_data = swiss_models_with_data.astype(str)
        swiss_models_with_data.replace({'NaN': 'nan'}, inplace=True)
        swiss_models_with_data_copy = swiss_models_with_data.copy()
        swiss_models_with_data1_dp = None
        swiss_models_with_data1 = None
        existing_swiss = None
        swissmodels_fasta = None
        print('Aligning sequences...\n')
        swiss_models_with_data['uniprotSequence'] = swiss_models_with_data['uniprotSequence'].str.replace('U', 'C')
        swiss_models_with_data['pdbSequence'] = swiss_models_with_data['pdbSequence'].str.replace('U', 'C')
        swiss_model_aligned = alignment(swiss_models_with_data, annotation_list,
                                        path_to_output_files / 'alignment_files')
if len(swiss_model_aligned) == 0: | |
swiss_model_aligned = pd.DataFrame(columns=pdb_aligned.columns) | |
swiss_model_aligned['qmean_norm'] = 'nan' | |
else: | |
swiss_model_aligned = swiss_model_aligned.astype(str) | |
swiss_model_aligned.replace({'NaN': 'nan'}, inplace=True) | |
# Some datapoints appear in both nan and not_nan. If not_nan we take it only once. | |
nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB == 'nan'] | |
not_nan = swiss_model_aligned[swiss_model_aligned.mutationPositionOnPDB != 'nan'] | |
not_nan.qmean_norm = not_nan.qmean_norm.astype('float') | |
not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'qmean_norm'], ascending=[True, True, False], inplace=True) | |
which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first') | |
swiss_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan'] | |
swiss_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan'] | |
swiss_match.qmean_norm = swiss_match.qmean_norm.astype('float') | |
swiss_match.sort_values(['uniprotID', 'wt', 'pos', 'mut', 'pdb_alignStatus', 'qmean_norm'], | |
ascending=[True, True, True, True, True, False], inplace=True) | |
swiss_match.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut'], keep='first', inplace=True) | |
swiss_not_match = swiss_not_match[no_swiss_models_2.columns] | |
broken_swiss = broken_swiss[no_swiss_models_2.columns] | |
swiss_not_match = swiss_not_match.drop_duplicates(['datapoint']) | |
broken_swiss = broken_swiss.drop_duplicates(['datapoint']) | |
to_modbase = pd.concat([no_swiss_models_2, broken_swiss]).drop_duplicates() | |
to_modbase = pd.concat([to_modbase, swiss_not_match]).drop_duplicates() | |
to_modbase = to_modbase.astype(str) | |
to_swiss_columns = to_swiss.columns | |
to_swiss_size = len(to_swiss.drop_duplicates(['datapoint'])) | |
to_swiss = None | |
        # CONTROL
        """
        # These sanity checks should account for the whole data set:
        # len(swiss_match.drop_duplicates(['datapoint'])) + len(pdb_aligned.drop_duplicates(['datapoint']))
        #     + len(to_modbase.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) == data_size
        # len(pdb_aligned.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint']))
        #     + len(to_swiss.drop_duplicates(['datapoint'])) == data_size
        """
        print('SwissModel matching is completed...\n')
        print('SUMMARY')
        print('-------')
        print('%d data points that failed to match a UniProt sequence are discarded.' % len(
            not_match_in_uniprot.drop_duplicates(['datapoint'])))
        print('Of the remaining %d:' % uniprot_matched_size)
        print('--%d of %d successfully aligned with PDB structures.' % (
            len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
        print('--%d of %d successfully aligned with SWISS-MODEL structures.' % (
            len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size))
        print('--%d will be searched in the ModBase database.\n' % len(to_modbase.drop_duplicates(['datapoint'])))
        print('Proceeding to ModBase search...')
        print('------------------------------------\n')
        no_swiss_models_2 = None
        broken_swiss = None
        swiss_model_aligned = None
        nan = None
        not_nan = None
        which_ones_are_match = None
        swiss_not_match = None
        # STEP 10: GO TO MODBASE
        # Should not include anything related to the previous models.
        if len(to_modbase) != 0:
            to_modbase = to_modbase.astype(str)
            # GET MODBASE MODELS
            # Get IDs from the data to retrieve only their models from ModBase.
            to_modbase.reset_index(inplace=True)
            to_modbase.drop(['index'], axis=1, inplace=True)
            existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
            existing_modbase_models = [str(i) for i in existing_modbase_models]
            existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
            existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
            existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
            existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
            modbase_reduced = pd.DataFrame()
            modbase_fasta = pd.DataFrame()
            print('Retrieving ModBase models...\n')
            # Get the model files associated with each UniProt ID.
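            # Models are fetched in bulk from the ModBase 'retrieve/modbase' endpoint keyed by
            # UniProt accession; the response is an XML-like document whose <pdbfile> elements
            # each wrap one model, parsed below with BeautifulSoup.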
            for protein in list(set(to_modbase.uniprotID.to_list())):
                if protein not in existing_modbase_models:
                    print('Downloading Modbase models for ', protein)
                    url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
                    req = requests.get(url)
                    name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
                    with open(name, 'wb') as f:
                        f.write(req.content)
                else:
                    print('Model exists for', protein)
                    name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
                with open(name, encoding="utf8") as f:
                    a = f.read()  # the original reopened the file here; reading the open handle is equivalent
                    soup = BeautifulSoup(a, 'lxml')
                    for pdb in soup.findAll('pdbfile'):
                        model_id = str(pdb.contents[1])[10:-11]
                        if model_id not in existing_modbase_models_ind:
                            with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w',
                                      encoding="utf8") as individual:
                                individual.write(str('UniProt ID: ' + protein))
                                individual.write('\n')
                                individual.write(str(pdb.contents[3])[10:-11].strip())
                        with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt',
                                  encoding="utf8") as f:
                            fasta = ''
                            chain = ''
                            template_chain = ''
                            score = -999
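                            # ModBase model headers carry metadata in fixed-width 'REMARK 220' lines;
                            # the [40:43] / [40:] slices below assume the value field starts at column 40.
                            # The chain sequence is again rebuilt from CA ATOM records and flushed on TER/END.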
                            for ind_line in f.readlines():
                                if ind_line[0:10] == 'UniProt ID':
                                    uniprot_id = ind_line.split(':')[1].strip()
                                if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
                                    target_begin = ind_line[40:43].strip()
                                if ind_line[0:21] == 'REMARK 220 TARGET END':
                                    target_end = ind_line[40:43].strip()
                                if ind_line[0:25] == 'REMARK 220 TEMPLATE BEGIN':
                                    pdb_begin = ind_line[40:43].strip()
                                if ind_line[0:23] == 'REMARK 220 TEMPLATE END':
                                    pdb_end = ind_line[40:43].strip()
                                if ind_line[0:23] == 'REMARK 220 TEMPLATE PDB':
                                    pdb_code = ind_line[40:43].strip()
                                if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
                                    pdb_chain = ind_line[40:43].strip()
                                if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
                                    quality_score = ind_line[40:].strip()
                                if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
                                    model_id = ind_line[40:].strip()
                                if ind_line[0:25] == 'REMARK 220 TEMPLATE CHAIN':
                                    template_chain = ind_line[40:42].strip()
                                if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
                                    fasta += threeToOne(ind_line[17:20])
                                if ind_line[0:32] == 'REMARK 220 ModPipe Quality Score':
                                    try:
                                        score = ind_line[40:].strip()
                                    except ValueError:
                                        score = -999
                                if ind_line[0:3] == 'TER' or ind_line[0:3] == 'END':
                                    k = pd.Series([uniprot_id, model_id, str(score), template_chain, fasta])
                                    # DataFrame.append was removed in pandas 2.0; concat is the equivalent.
                                    modbase_fasta = pd.concat([modbase_fasta, k.to_frame().T], ignore_index=True)
                                    fasta = ''
                                    try:
                                        k = pd.Series(
                                            [uniprot_id, target_begin, target_end, pdb_code, pdb_chain, pdb_begin,
                                             pdb_end, quality_score, model_id])
                                        modbase_reduced = pd.concat([modbase_reduced, k.to_frame().T],
                                                                    ignore_index=True)
                                    except NameError:
                                        print('This file does not have a quality score. Using -999 instead:', model_id)
                                        quality_score = -999
            print()
            if len(modbase_fasta) != 0:
                modbase_fasta.columns = ['uniprotID', 'template', 'score', 'chain', 'fasta']
            else:
                modbase_fasta = pd.DataFrame(columns=['uniprotID', 'template', 'score', 'chain', 'fasta'])
            modbase_fasta = modbase_fasta.astype(str)
            modbase_fasta = modbase_fasta.replace({'': 'nan'})
            modbase_fasta = modbase_fasta.replace({'NaN': 'nan'})
            modbase_fasta = modbase_fasta[modbase_fasta.fasta != 'nan']
            print('Modbase model frame constructed.\n')
            if len(modbase_reduced) != 0:
                modbase_reduced.columns = ['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin',
                                           'PDBEnd', 'ModPipeQualityScore', 'ModelID']
            else:
                modbase_reduced = pd.DataFrame(
                    columns=['UniprotID', 'TargetBeg', 'TargetEnd', 'PDBCode', 'PDBChain', 'PDBBegin', 'PDBEnd',
                             'ModPipeQualityScore', 'ModelID'])
            to_modbase = add_annotations(to_modbase)
            to_modbase = to_modbase.astype(str)
            to_modbase.fillna('nan', inplace=True)
            to_modbase = to_modbase.replace({'NaN': 'nan'})
            to_modbase.replace({'[]': 'nan'}, inplace=True)
            to_modbase.replace({'nan-nan': 'nan'}, inplace=True)
            to_modbase.replace({'': 'nan'}, inplace=True)
            model_info_added = to_modbase.merge(modbase_reduced, right_on='UniprotID', left_on='uniprotID',
                                                how='left')
            modbase_reduced = None
            existing_modbase_models = None
            existing_modbase_models_ind = None
            model_info_added = model_info_added.drop(['UniprotID'], axis=1)
            model_info_added = model_info_added.rename(columns={'TargetBeg': 'from', 'TargetEnd': 'to',
                                                                'PDBCode': 'template', 'PDBChain': 'chain',
                                                                'ModPipeQualityScore': 'score',
                                                                'ModelID': 'pdbID'})
            model_info_added.drop(['PDBEnd', 'PDBBegin'], axis=1, inplace=True)
            model_info_added.score = model_info_added.score.astype(float)
            model_info_added = model_info_added.sort_values(by=['datapoint', 'score'],
                                                            ascending=False)
            model_info_added.reset_index(inplace=True)
            model_info_added.drop(['index'], axis=1, inplace=True)
            model_info_added = model_info_added.drop_duplicates()
            model_info_added = model_info_added.astype(str)
            model_info_added = model_info_added.replace({'NaN': 'nan'})
            no_info = model_info_added[model_info_added.pdbID == 'nan']
            with_modbase_info = model_info_added[model_info_added.pdbID != 'nan']
            model_info_added = None
            # Sanity checks (left as bare, side-effect-free expressions in the original):
            # len(no_info.drop_duplicates(['datapoint'])), len(with_modbase_info.drop_duplicates(['datapoint']))
            # len(no_info.drop_duplicates(['datapoint'])) + len(with_modbase_info.drop_duplicates(['datapoint'])) == len(
            #     to_modbase.drop_duplicates(['datapoint']))
            # Add no_info to the rest further below!
            no_info = no_info[to_swiss_columns]
            with_modbase_info.score = with_modbase_info.score.astype(float)
            modbase_fasta.score = modbase_fasta.score.astype(float)
            modbase_fasta = modbase_fasta.sort_values(['uniprotID', 'score', 'template', 'chain'],
                                                      ascending=[True, False, True, True], axis=0)  # example = 3gdh
            # Merge the newly downloaded models into the main model frame.
            modbase_fasta = modbase_fasta.rename(columns={'template': 'pdbID'})
            with_modbase_info.pos = with_modbase_info.pos.astype('int')
            with_modbase_info.score = with_modbase_info.score.astype(float)
            with_modbase_info.score = with_modbase_info.score.apply(lambda x: round(x, 2))
            modbase_fasta.score = modbase_fasta.score.astype(float)
            modbase_fasta.score = modbase_fasta.score.apply(lambda x: round(x, 2))
            with_modbase_info = with_modbase_info.merge(modbase_fasta, on='pdbID', how='left')
            with_modbase_info.drop(['score_y'], axis=1, inplace=True)
            with_modbase_info.rename(columns={'score_x': 'score'}, inplace=True)
            with_modbase_info.drop(['uniprotID_y', 'chain_y'], axis=1, inplace=True)
            with_modbase_info.rename(columns={'uniprotID_x': 'uniprotID', 'chain_x': 'chain'}, inplace=True)
            with_modbase_info.score = with_modbase_info.score.astype('float')
            with_modbase_info = with_modbase_info.sort_values(['uniprotID', 'wt', 'mut', 'pos', 'score', 'from', 'to'],
                                                              axis=0,
                                                              ascending=[True, True, True, True, False, True, False])
            with_modbase_info = with_modbase_info.drop_duplicates(['uniprotID', 'wt', 'mut', 'pos', 'fasta'],
                                                                  keep='first')
            with_modbase_info = with_modbase_info.replace({'[\'?\']': 'nan'})
            with_modbase_info = with_modbase_info.replace({'[]': 'nan'})
            with_modbase_info = with_modbase_info.replace({'\'?\', ': ''})
            with_modbase_info = with_modbase_info.replace({', \'?\'': ''})
            with_modbase_info = with_modbase_info.replace({'(': ''})
            with_modbase_info = with_modbase_info.replace({')': ''})
            with_modbase_info = with_modbase_info.astype(str)
            with_modbase_info.fasta = with_modbase_info.fasta.astype('str')
            with_modbase_info.reset_index(inplace=True)
            with_modbase_info.drop('index', axis=1, inplace=True)
            align = with_modbase_info[with_modbase_info.fasta != 'nan']
            yes_pdb_no_match = with_modbase_info[with_modbase_info.fasta == 'nan']
            yes_pdb_no_match = yes_pdb_no_match[~yes_pdb_no_match.datapoint.isin(align.datapoint.to_list())]
            align.rename(columns={'fasta': 'pdbSequence'}, inplace=True)
            align['uniprotSequence'] = align['uniprotSequence'].str.replace('U', 'C')
            align['pdbSequence'] = align['pdbSequence'].str.replace('U', 'C')
            to_modbase_size = len(to_modbase.drop_duplicates(['datapoint']))
            modbase_fasta = None
            to_modbase = None
            print('Aligning sequences...\n')
            modbase_aligned = alignment(align, annotation_list, path_to_output_files / 'alignment_files')
            modbase_aligned = modbase_aligned.astype(str)
            modbase_aligned = modbase_aligned.replace({'NaN': 'nan'})
            # Collect the ones whose models could not be found and add them to no_modbase
            # (i.e., nothing matched at all any more).
            if len(with_modbase_info) != 0:
                not_in_aligned = pd.concat([modbase_aligned.drop_duplicates(['datapoint']),
                                            with_modbase_info.drop_duplicates(['datapoint'])]).drop_duplicates(
                    ['datapoint'], keep=False)
            else:
                not_in_aligned = pd.DataFrame(
                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'disulfide',
                             'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide', 'from',
                             'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
            with_modbase_info = None
            if len(not_in_aligned) != 0:
                not_models = pd.concat([yes_pdb_no_match.drop_duplicates(['datapoint']),
                                        not_in_aligned.drop_duplicates(['datapoint'])]).drop_duplicates(['datapoint'],
                                                                                                        keep='first')
                # Retain the best model among the aligned ones.
            else:
                not_models = pd.DataFrame(columns=not_in_aligned.columns)
            yes_pdb_no_match = None
            # Some data points appear in both nan and not_nan; if a not_nan row exists, we take it only once.
            modbase_aligned = modbase_aligned.astype(str)
            if len(modbase_aligned) != 0:
                nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB == 'nan']
                not_nan = modbase_aligned[modbase_aligned.mutationPositionOnPDB != 'nan']
                not_nan.score = not_nan.score.astype(float)
                not_nan.sort_values(['datapoint', 'pdb_alignStatus', 'score'], ascending=[True, True, False],
                                    inplace=True)
                not_nan = not_nan.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                              ascending=[True, True, False])
                not_nan = not_nan.drop_duplicates(['datapoint'], keep='first')
            else:
                nan = pd.DataFrame(columns=modbase_aligned.columns)
                not_nan = pd.DataFrame(columns=modbase_aligned.columns)
            modbase_aligned = None
            which_ones_are_match = pd.concat([not_nan, nan]).drop_duplicates(['datapoint'], keep='first')
            if len(which_ones_are_match) == 0:
                which_ones_are_match = pd.DataFrame(
                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                             'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                             'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                             'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                             'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                             'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                             'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                             'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
                             'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                             'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                             'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                             'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                             'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                             'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                             'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                             'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                             'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                             'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
                             'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
                             'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
                modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan']
                modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan']
            else:
                modbase_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB != 'nan']
                modbase_not_match = which_ones_are_match[which_ones_are_match.mutationPositionOnPDB == 'nan']
            which_ones_are_match = None
            modbase_match.score = modbase_match.score.astype('float')
            modbase_match = modbase_match.sort_values(['datapoint', 'mutationPositionOnPDB', 'score'],
                                                      ascending=[True, True, False])
            modbase_match.drop_duplicates(['datapoint'], keep='first', inplace=True)
            not_nan = None
            nan = None
            # Merge not_in_aligned and modbase_not_match, as both were excluded from the ModBase match.
            # No model:
            no_info = no_info[to_swiss_columns]
            no_info = no_info.drop_duplicates()
            # Model present, no sequence:
            not_models = not_models[to_swiss_columns]
            not_models = not_models.drop_duplicates()
            # ModBase model and sequence present, but no match:
            modbase_not_match = modbase_not_match[to_swiss_columns]
            modbase_not_match = modbase_not_match.drop_duplicates()
            if len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) != 0:
                rest = pd.concat([not_in_aligned, modbase_not_match, no_info])
            elif len(not_in_aligned) != 0 and len(modbase_not_match) != 0 and len(no_info) == 0:
                rest = pd.concat([not_in_aligned, modbase_not_match])
            elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) != 0:
                rest = pd.concat([modbase_not_match, no_info])
            elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                rest = pd.concat([not_in_aligned, no_info])
            elif len(not_in_aligned) != 0 and len(modbase_not_match) == 0 and len(no_info) == 0:
                rest = not_in_aligned
            elif len(not_in_aligned) == 0 and len(modbase_not_match) != 0 and len(no_info) == 0:
                rest = modbase_not_match
            elif len(not_in_aligned) == 0 and len(modbase_not_match) == 0 and len(no_info) != 0:
                rest = no_info
            else:
                rest = pd.DataFrame(
                    columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                             'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                             'wt_sequence_match', 'whichIsoform', 'datapoint'])
            rest = rest[to_swiss_columns]
            rest = rest.drop_duplicates()
            rest.reset_index(inplace=True)
            rest.drop(['index'], axis=1, inplace=True)
            rest = rest.astype('str')
        else:
            modbase_match = pd.DataFrame(
                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                         'glycosylationBinary', 'propeptideBinary', 'from', 'to', 'template',
                         'chain', 'score', 'pdbID', 'pdbSequence', 'pdb_alignStatus',
                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB'])
            not_in_aligned = pd.DataFrame(
                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                         'wt_sequence_match', 'whichIsoform', 'datapoint', 'disulfide', 'intMet',
                         'intramembrane', 'naturalVariant', 'dnaBinding', 'activeSite',
                         'nucleotideBinding', 'lipidation', 'site', 'transmembrane', 'crosslink',
                         'mutagenesis', 'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                         'glycosylationBinary', 'propeptideBinary', 'from',
                         'to', 'template', 'chain', 'score', 'pdbID', 'pdbSequence', 'fasta'])
            no_info = pd.DataFrame(
                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
            rest = pd.DataFrame(
                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                         'wt_sequence_match', 'whichIsoform', 'datapoint'])
            rest = rest[to_swiss_columns]
            rest = rest.drop_duplicates()
            rest.reset_index(drop=True, inplace=True)
            rest = rest.astype('str')
            to_modbase_size = 0
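        # The empty placeholder frames above keep the summary below and the later
        # concatenation steps uniform when no Modbase candidates exist.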
        print('Modbase matching completed...\n')
        print('SUMMARY')
        print('-------')
        print('%d data points that failed to match a UniProt sequence were discarded.' % len(
            not_match_in_uniprot.drop_duplicates(['datapoint'])))
        print('Of the remaining %d:' % uniprot_matched_size)
        print('--%d of %d successfully aligned with PDB structures.' % (
            len(pdb_aligned.drop_duplicates(['datapoint'])), with_pdb_size))
        print('--%d of %d successfully aligned with SwissModel structures.' % (
            len(swiss_match.drop_duplicates(['datapoint'])), to_swiss_size))
        print('--%d of %d successfully aligned with Modbase structures.\n' % (
            len(modbase_match.drop_duplicates(['datapoint'])), to_modbase_size))
        print('--The remaining %d did not match any model.' % len(rest.drop_duplicates(['datapoint'])))
        print('--A total of %d data points will not be evaluated.\n' % (
            len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint']))))
        # Sanity check: the partitions above should add up to the original input size.
        print('FOR CHECKING : ',
              len(rest.drop_duplicates(['datapoint'])) + len(not_match_in_uniprot.drop_duplicates(['datapoint'])) + len(
                  pdb_aligned.drop_duplicates(['datapoint'])) + len(swiss_match.drop_duplicates(['datapoint'])) + len(
                  modbase_match.drop_duplicates(['datapoint'])) == data_size)
        # Release intermediates that are no longer needed.
        no_info = None
        align = None
        not_in_aligned = None
        not_models = None
        modbase_not_match = None
        # Final corrections; now the 3D alignment.
        pdb = pdb_aligned.copy()
        swiss = swiss_match.copy()
        modbase = modbase_match.copy()
        pdb_aligned = None
        swiss_match = None
        modbase_match = None
        """
        WHAT DO WE HAVE NOW?
        - uniprot sequence not found
        - pdb aligned
        - swiss aligned
        - modbase aligned
        - not aligned with anything (rest)
        """
        # Align the columns and merge all sources into one frame.
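        # The unified 'score' column holds the crystallographic resolution for PDB
        # entries and the normalized QMEAN ('qmean_norm') for SwissModel and Modbase models.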
        pdb.drop(['pdbInfo'], axis=1, inplace=True)
        pdb.rename(columns={'resolution': 'score'}, inplace=True)
        swiss.rename(columns={'qmean_norm': 'score'}, inplace=True)
        modbase.rename(columns={'qmean_norm': 'score'}, inplace=True)
        swiss = swiss[pdb.columns]
        modbase = modbase[pdb.columns]
        pdb['source'] = 'PDB'
        swiss['source'] = 'SWISSMODEL'
        modbase['source'] = 'MODBASE'
        data = pd.concat([swiss, modbase, pdb])
        data.reset_index(drop=True, inplace=True)
        data = data.astype('str')
        data_spare = pd.concat([not_match_in_uniprot, rest])
        not_match_in_uniprot = None
        pdb = None
        swiss = None
        modbase = None
        rest = None
        print('Generating FreeSASA files...')
        print('------------------------------------\n')
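        # RSA (relative solvent accessibility) is computed with FreeSASA once per
        # structure; files already present under 'freesasa_files' are skipped.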
        # Folder for calculated RSA values.
        existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob('*'))
        existing_free_sasa = [Path(i).stem for i in existing_free_sasa]
        print('Calculating RSA for PDB structure files...\n')
        pdb_only = data[data.source == 'PDB']
        for pdbID in pdb_only.pdbID.to_list():
            if pdbID not in existing_free_sasa:
                run_freesasa(Path(path_to_output_files / 'pdb_structures' / f'{pdbID.lower()}.pdb'),
                             Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
                             include_hetatms=True,
                             outdir=None, force_rerun=False, file_type='pdb')
        print('Calculating RSA for SwissModel files...\n')
        swiss_only = data[data.source == 'SWISSMODEL']
        swiss_dp = []
        for i in swiss_only.index:
            swiss_dp.append(swiss_only.at[i, 'uniprotID'] + '_' + swiss_only.at[i, 'pdbID'].lower() + '_' + str(
                round(float(swiss_only.at[i, 'score']), 2)))
        for pdbID in swiss_dp:
            if pdbID not in existing_free_sasa:
                run_freesasa(Path(path_to_output_files / 'swissmodel_structures' / f'{pdbID}.txt'),
                             Path(path_to_output_files / 'freesasa_files' / f'{pdbID}.txt'), include_hetatms=True,
                             outdir=None, force_rerun=False, file_type='pdb')
        print('Calculating RSA for Modbase model files...\n')
        modbase_only = data[data.source == 'MODBASE']
        for pdbID in modbase_only.pdbID.to_list():
            if pdbID not in existing_free_sasa:
                run_freesasa(Path(path_to_output_files / 'modbase_structures_individual' / f'{pdbID.lower()}.txt'),
                             Path(path_to_output_files / 'freesasa_files' / f'{pdbID.lower()}.txt'),
                             include_hetatms=True,
                             outdir=None, force_rerun=False, file_type='pdb')
        # This annotation list differs from the previous one; keep it.
        annotation_list += ['domainStartonPDB', 'domainEndonPDB']
        folder_path = path_to_output_files / 'freesasa_files'
        aligner = Align.PairwiseAligner()
        print('Proceeding to 3D distance calculation...\n')
        data.domainEndonPDB = data.domainEndonPDB.astype(str)
        data.domainStartonPDB = data.domainStartonPDB.astype(str)
        existing_free_sasa = None
        swiss_dp = None
        pdb_only = None
        swiss_only = None
        modbase_only = None
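        # Selenocysteine (U) is not covered by standard substitution matrices,
        # so it is treated as cysteine (C) before alignment.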
        data['uniprotSequence'] = data['uniprotSequence'].str.replace('U', 'C')
        data['pdbSequence'] = data['pdbSequence'].str.replace('U', 'C')
        for i in data.index:
            id_ = data.at[i, 'pdbID'].lower()
            up_id_ = data.at[i, 'uniprotID']
            score_ = str(data.at[i, 'score'])
            if data.at[i, 'source'] == 'PDB':
                pdb_path = Path(path_to_output_files / 'pdb_structures' / f'{id_}.pdb')
            elif data.at[i, 'source'] == 'MODBASE':
                pdb_path = Path(path_to_output_files / 'modbase_structures_individual' / f'{id_}.txt')
            elif data.at[i, 'source'] == 'SWISSMODEL':
                pdb_path = Path(path_to_output_files / 'swissmodel_structures' / f'{up_id_}_{id_}_{score_}.txt')
            pdbSequence = data.at[i, 'pdbSequence']
            source = data.at[i, 'source']
            chain = data.at[i, 'chain']
            uniprotID = data.at[i, 'uniprotID']
            pdbID = data.at[i, 'pdbID']
            alignments = get_alignments_3D(uniprotID, 'nan', pdb_path, pdbSequence, source, chain, pdbID, mode,
                                           Path(path_to_output_files / '3D_alignment'), file_format='gzip')
            mutPos = data.at[i, 'mutationPositionOnPDB']
            try:
                coordMut = get_coords(mutPos, alignments, 'nan', 'nan', mode)[0]
            except ValueError:
                coordMut = 'nan'
            try:
                sasa_pos = get_coords(mutPos, alignments, 'nan', 'nan', mode)[2]
                data.at[i, 'sasa'] = sasa(data.at[i, 'source'], data.at[i, 'pdbID'], data.at[i, 'uniprotID'], sasa_pos,
                                          data.at[i, 'wt'], mode, path_to_output_files, file_type='pdb')
            except ValueError:
                data.at[i, 'sasa'] = 'nan'  # mutation position is nan
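            # For every annotation category, compute the 3D distance between the
            # mutated residue and each annotated position (single positions and
            # 'start-end' ranges alike), and keep the minimum.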
            for annot in annotation_list:
                annotx = []
                try:
                    positions_of_annotations = data.at[i, annot].split(',')
                    for pos in positions_of_annotations:
                        pos = pos.strip().strip('\'').strip('[\'').strip('\']')
                        try:
                            if '-' not in pos:
                                pos = int(float(pos))
                                coordAnnot = get_coords(pos, alignments, 'nan', 'nan', mode)[0]
                                try:
                                    annotx.append(find_distance(coordMut, coordAnnot))
                                except ValueError:
                                    pass
                            else:
                                for r in range(int(pos.split('-')[0]), int(pos.split('-')[1]) + 1):
                                    coordAnnot = get_coords(r, alignments, 'nan', 'nan', mode)[0]
                                    annotx.append(find_distance(coordMut, coordAnnot))
                        except ValueError:
                            pass
                    try:
                        data.at[i, annot] = min([float(d) for d in annotx])
                    except ValueError:
                        data.at[i, annot] = 'nan'
                except ValueError:
                    pass
            if (str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan')) and (
                    str(data.at[i, 'domainEndonPDB']) not in ('NaN', 'nan')):
                data.at[i, 'domainStartonPDB'] = 100000
            elif (str(data.at[i, 'domainEndonPDB']) in ('NaN', 'nan')) and (
                    str(data.at[i, 'domainStartonPDB']) not in ('NaN', 'nan')):
                data.at[i, 'domainEndonPDB'] = 100000
            if str(data.at[i, 'domainStartonPDB']) in ('NaN', 'nan') and str(
                    data.at[i, 'domainEndonPDB']) in ('NaN', 'nan'):
                data.at[i, 'domaindistance3D'] = 'nan'
            else:
                data.at[i, 'domaindistance3D'] = min(float(data.at[i, 'domainStartonPDB']),
                                                     float(data.at[i, 'domainEndonPDB']))
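            # 100000 is a sentinel for an unmapped domain boundary: min() then picks
            # the mapped end, and the sentinel is converted back to 'nan' after the
            # loop (see the replace({'100000': 'nan'}) further below).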
        data = data.astype(str)
        data.replace({'NaN': 'nan'}, inplace=True)
        # Now unify all three sources: entries with PDB structures, SwissModel and Modbase
        # models, plus the ones that matched nothing and the ones without a WT sequence match.
        # Get interface positions from ECLAIR (the downloaded high-quality (HQ) human set).
        print()
        print('Assigning surface regions...')
        print('------------------------------------\n')
        print('Extracting interface residues...\n')
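        # The interfaces file is read as a TSV with two protein columns ('P1', 'P2');
        # interface positions are aggregated per UniProt ID.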
        data_interface = pd.read_csv(path_to_interfaces, sep='\t')
        positions = get_interface_positions(data_interface, 'P1', 'P2')
        # Build the interface table from (uniprotID, positions) pairs.
        interface_rows = [(key, str(list(set(val)))) for key, val in positions.items()]
        interface_dataframe = pd.DataFrame(interface_rows, columns=['uniprotID', 'positions'])
        if len(data) == 0:
            data = pd.DataFrame(
                columns=['uniprotID', 'wt', 'mut', 'pos', 'composition', 'polarity', 'volume', 'granthamScore',
                         'domain', 'domStart', 'domEnd', 'distance', 'uniprotSequence',
                         'pdbSequence', 'wt_sequence_match', 'whichIsoform', 'pdbID', 'score',
                         'chain', 'datapoint', 'disulfide', 'intMet', 'intramembrane',
                         'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
                         'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
                         'strand', 'helix', 'turn', 'metalBinding', 'repeat',
                         'topologicalDomain', 'caBinding', 'bindingSite', 'region',
                         'signalPeptide', 'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil',
                         'peptide', 'transitPeptide', 'glycosylation', 'propeptide',
                         'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                         'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                         'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                         'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                         'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                         'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                         'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                         'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                         'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                         'glycosylationBinary', 'propeptideBinary', 'pdb_alignStatus',
                         'mutationPositionOnPDB', 'domainStartonPDB', 'domainEndonPDB',
                         'source', 'sasa', 'domaindistance3D', 'threeState_trsh4_HQ', 'domain_fisher'])
        else:
            data.sasa = data.sasa.astype('str')
            for i in data.index:
                if '*' in data.at[i, 'sasa']:
                    data.at[i, 'sasa'] = data.at[i, 'sasa'].split('*')[0]
            data.sasa = data.sasa.replace({'N/A': 'nan'})
            data.sasa = data.sasa.replace({'None': 'nan'})
            data.replace({' N/A': 'nan'}, inplace=True)
            data.replace({'None': 'nan'}, inplace=True)
            data.sasa = data.sasa.astype(float)
            data = data.astype(str)
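            # Classify burial state from RSA: below 5 counts as buried ('core'),
            # 5 or above as exposed ('surface'); unresolved values stay 'nan'.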
            for i in data.index:
                if data.at[i, 'sasa'] == 'nan':
                    data.at[i, 'trsh4'] = 'nan'
                elif float(data.at[i, 'sasa']) < 5:
                    data.at[i, 'trsh4'] = 'core'
                else:
                    data.at[i, 'trsh4'] = 'surface'
            data = data.merge(interface_dataframe, on='uniprotID', how='left')
            data.positions = data.positions.astype('str')
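            # Three-state location combines burial state with interface membership:
            # an exposed residue at a known interface position becomes 'interface',
            # and a buried residue listed at an interface is flagged as 'conflict'.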
            for i in data.index:
                # Parse the stringified position list so the membership test is an
                # exact token match rather than a substring match.
                interface_positions = [p.strip() for p in str(data.at[i, 'positions']).strip('[]').split(',')]
                at_interface = str(data.at[i, 'pos']) in interface_positions
                if data.at[i, 'trsh4'] == 'surface':
                    data.at[i, 'threeState_trsh4_HQ'] = 'interface' if at_interface else 'surface'
                elif data.at[i, 'trsh4'] == 'core':
                    data.at[i, 'threeState_trsh4_HQ'] = 'conflict' if at_interface else 'core'
                elif data.at[i, 'trsh4'] == 'nan':
                    data.at[i, 'threeState_trsh4_HQ'] = 'nan'
            data.drop(['positions'], axis=1, inplace=True)
        # OPTIONAL
        # DOMAIN SELECTION
        # Next step: collapse all other domains into 'NULL'. R can handle at most 53 factor
        # levels, so we keep the 52 most significant domains and use 'NULL' as the 53rd category.
        fisherResult = pd.read_csv(fisher_path, sep='\t')
        significant_domains = fisherResult.domain.to_list()
        for i in data.index:
            if data.at[i, 'domain'] in significant_domains:
                data.at[i, 'domain_fisher'] = data.at[i, 'domain']
            else:
                data.at[i, 'domain_fisher'] = 'NULL'
        # Recode the binary annotations into three classes:
        # nan --> 0, 0 --> 1, and 1 --> 2.
        print('Final adjustments are being done...\n')
        binaryCols = ['disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary',
                      'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary', 'lipidationBinary',
                      'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                      'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                      'repeatBinary', 'caBindingBinary', 'topologicalDomainBinary',
                      'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                      'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                      'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                      'glycosylationBinary', 'propeptideBinary']
        data = data.astype(str)
        data.replace({'NaN': 'nan'}, inplace=True)
        for j in binaryCols:
            data[j] = data[j].astype('str')
        for i in data.index:
            for j in binaryCols:
                if data.at[i, j] in ('0', '0.0'):
                    data.at[i, j] = '1'
                elif data.at[i, j] == 'nan':
                    data.at[i, j] = '0'
                elif data.at[i, j] in ('1', '1.0'):
                    data.at[i, j] = '2'
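        # A vectorized sketch of the same recoding (assumes string dtype throughout;
        # DataFrame.replace maps each value once, so results are not re-replaced):
        # data[binaryCols] = data[binaryCols].replace(
        #     {'nan': '0', '0': '1', '0.0': '1', '1': '2', '1.0': '2'})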
        annotCols = ['disulfide', 'intMet', 'intramembrane',
                     'naturalVariant', 'dnaBinding', 'activeSite', 'nucleotideBinding',
                     'lipidation', 'site', 'transmembrane', 'crosslink', 'mutagenesis',
                     'strand', 'helix', 'turn', 'metalBinding', 'repeat', 'caBinding',
                     'topologicalDomain', 'bindingSite', 'region', 'signalPeptide',
                     'modifiedResidue', 'zincFinger', 'motif', 'coiledCoil', 'peptide',
                     'transitPeptide', 'glycosylation', 'propeptide']
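        # A recoded binary value of '2' means the annotation sits on the mutated
        # residue itself, so the corresponding 3D distance is set to zero.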
        for i in data.index:
            for annot in annotCols:
                binaryName = str(annot) + 'Binary'
                if data.at[i, binaryName] == '2':
                    data.at[i, annot] = '0.0'
        data.replace({'100000': 'nan'}, inplace=True)
        data = add_physicochemical(data)
        data.rename(
            columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
                     'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
                     'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
                     'domaindistance3D': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
                     'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
                     'intramembraneBinary': 'intramembrane_bin', 'naturalVariantBinary': 'naturalVariant_bin',
                     'dnaBindingBinary': 'dnaBinding_bin', 'activeSiteBinary': 'activeSite_bin',
                     'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
                     'siteBinary': 'site_bin', 'transmembraneBinary': 'transmembrane_bin',
                     'crosslinkBinary': 'crosslink_bin', 'mutagenesisBinary': 'mutagenesis_bin',
                     'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
                     'metalBindingBinary': 'metalBinding_bin', 'repeatBinary': 'repeat_bin',
                     'topologicalDomainBinary': 'topologicalDomain_bin', 'caBindingBinary': 'caBinding_bin',
                     'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
                     'signalPeptideBinary': 'signalPeptide_bin', 'modifiedResidueBinary': 'modifiedResidue_bin',
                     'zincFingerBinary': 'zincFinger_bin', 'motifBinary': 'motif_bin',
                     'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
                     'transitPeptideBinary': 'transitPeptide_bin', 'glycosylationBinary': 'glycosylation_bin',
                     'propeptideBinary': 'propeptide_bin',
                     'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist', 'intramembrane': 'intramembrane_dist',
                     'naturalVariant': 'naturalVariant_dist', 'dnaBinding': 'dnaBinding_dist',
                     'activeSite': 'activeSite_dist', 'nucleotideBinding': 'nucleotideBinding_dist',
                     'lipidation': 'lipidation_dist', 'site': 'site_dist', 'transmembrane': 'transmembrane_dist',
                     'crosslink': 'crosslink_dist', 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist',
                     'helix': 'helix_dist', 'turn': 'turn_dist', 'metalBinding': 'metalBinding_dist',
                     'repeat': 'repeat_dist', 'topologicalDomain': 'topologicalDomain_dist',
                     'caBinding': 'caBinding_dist', 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
                     'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
                     'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
                     'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
                     'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
        data = data[
            ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'meta_merged', 'composition', 'polarity',
             'volume', 'granthamScore', 'domains_all', 'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state',
             'disulfide_bin', 'intMet_bin', 'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
             'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
             'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
             'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
             'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
             'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
             'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
             'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin',
             'disulfide_dist', 'intMet_dist', 'intramembrane_dist',
             'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
             'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
             'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
             'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
             'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
             'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
             'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
             'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
             'glycosylation_dist', 'propeptide_dist']]
        ready = data.copy()
        # Imputation
        if str(impute).lower() == 'true':
            filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9,
                      15.99, 16.82, 20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81,
                      25.12, 20.33, 22.36]
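            # One filler per *_dist column (the last 30 columns), applied positionally;
            # these appear to be precomputed defaults (assumption: training-set values).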
            for col_, filler_value in zip(ready.columns[-30:], filler):
                ready[col_] = ready[col_].fillna(filler_value)
                ready[col_] = ready[col_].replace({'nan': filler_value})
            ready['domains_3Ddist'] = ready['domains_3Ddist'].fillna(24.5)
            ready['sasa'] = ready['sasa'].fillna(29.5)
            ready['location_3state'] = ready['location_3state'].fillna('unknown')
        ready = ready.replace({'nan': np.nan})
        ready.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
        if len(ready) == 0:
            print('No feature vector could be produced for the input data. '
                  'Please check that a structure exists for the input proteins.')
        else:
            print(ready)
            print('Feature vector successfully created...')
        end = timer()
        hours, rem = divmod(end - start, 3600)
        minutes, seconds = divmod(rem, 60)
        print('Time passed: {:0>2}:{:0>2}:{:05.2f}'.format(int(hours), int(minutes), seconds))
        return ready
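
# Hypothetical usage sketch: the datapoint format ('UniProtID-wt-position-mut') and
# the argument values below are assumptions for illustration, not taken from this file.
# feature_vector = pdb({'P04637-R-175-H'}, mode=1, impute='True')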