Spaces:
Sleeping
Sleeping
# IMPORT NECESSARY MODULES AND LIBRARIES | |
from timeit import default_timer as timer | |
import xml.etree.ElementTree as ET | |
from collections import Counter | |
from bs4 import BeautifulSoup | |
from io import StringIO | |
from decimal import * | |
import pandas as pd | |
import requests | |
import os.path as op | |
import subprocess | |
import shutil | |
import ssbio.utils | |
import warnings | |
import sys | |
import pathlib | |
from pathlib import Path | |
import os, glob | |
import math | |
import ssbio | |
import ssl | |
import numpy as np | |
from Bio.Align import substitution_matrices | |
from Bio.PDB.Polypeptide import * | |
from Bio.PDB import PDBList | |
from Bio import Align | |
from Bio import SeqIO | |
from Bio.PDB import * | |
warnings.filterwarnings("ignore") | |
start = timer() | |
# FUNCTIONS | |
from calc_pc_property import * | |
from add_domains import * | |
from retrieveUniprotSequences import * | |
from add_annotations import * | |
from add_sequence import * | |
from add_structure import * | |
from manage_files import * | |
from add_sasa import * | |
from standard import * | |
from add_interface_pos import * | |
from standard import * | |
from utils import * | |
from pdbMapping import * | |
from uniprotSequenceMatch import uniprotSequenceMatch | |
from process_input import clean_data | |
from urllib.error import HTTPError | |
from swissModelAdd import * | |
from modbaseModelAdd import * | |
import streamlit as st | |
def _position_key(value):
    """Return a canonical string for a residue position.

    ``12``, ``'12'`` and ``12.0`` all map to ``'12'`` so that positions coming
    from differently typed sources can be compared for exact equality.
    Values that are not numeric are returned as ``str(value)``.
    """
    try:
        number = float(value)
    except (TypeError, ValueError):
        return str(value)
    if number.is_integer():
        return str(int(number))
    return str(value)


def _report_structure_mapping(unmapped_count, total_count):
    """Write to the Streamlit page how many datapoints ended up without a structure."""
    if unmapped_count > 0 and unmapped_count != total_count:
        st.write(f'{unmapped_count} of {total_count} datapoins could not be mapped to any structures.')
        st.write(f'{total_count - unmapped_count} of {total_count} datapoins were mapped to a structure.')
    elif unmapped_count == total_count:
        # Every datapoint failed, so the count reported is the total (the
        # message used to claim that "0" datapoints had failed).
        st.write(f'{total_count} of {total_count} datapoins could not be mapped to any structures. Feature vector could not be created.')


def pdb(input_set, mode, impute):
    """Build the structure-based feature vector for a set of variants.

    The pipeline cleans the input, keeps the datapoints whose wild-type residue
    matches the canonical UniProt sequence, adds physicochemical, domain and
    annotation columns, and then looks for a structure in three stages: PDB,
    then SwissModel for whatever PDB could not cover, then Modbase for whatever
    SwissModel could not cover. The rows that obtained a structure are
    post-processed, written to ``featurevector_pdb.txt`` in the output
    directory and returned. Progress is printed to stdout and summarised on the
    Streamlit page.

    Args:
        input_set: Raw user query; handed unchanged to ``clean_data``.
        mode: Handed unchanged to ``manage_files`` to resolve the working paths.
        impute: ``'True'``, ``'true'`` or ``True`` enables filling of missing
            distance values with fixed constants; anything else disables it.

    Returns:
        The final feature ``DataFrame`` when at least one datapoint obtained a
        structure, otherwise ``None``.
    """
    # Fill empty dataframes with SIMPLE_COLS
    SIMPLE_COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity',
                   'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
                   'intMet', 'naturalVariant', 'activeSite', 'crosslink', 'mutagenesis',
                   'strand', 'helix', 'turn', 'region', 'modifiedResidue', 'motif',
                   'metalBinding', 'lipidation', 'glycosylation', 'topologicalDomain',
                   'nucleotideBinding', 'bindingSite', 'transmembrane', 'transitPeptide',
                   'repeat', 'site', 'peptide', 'signalPeptide', 'disulfide', 'coiledCoil',
                   'intramembrane', 'zincFinger', 'caBinding', 'propeptide', 'dnaBinding',
                   'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                   'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                   'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                   'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                   'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                   'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                   'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                   'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                   'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                   'glycosylationBinary', 'propeptideBinary']
    # The last 60 entries are the 30 annotation columns followed by their 30
    # '<name>Binary' companions.
    UNIPROT_ANNOTATION_COLS = SIMPLE_COLS[-60:]
    KEY_COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint']
    # Placeholders that stand for "no value" in the intermediate tables.
    EMPTY_MARKERS = {'[]': np.nan, 'nan-nan': np.nan, '': np.nan}

    (path_to_input_files, path_to_output_files, path_to_domains, fisher_path,
     path_to_interfaces, _buffer) = manage_files(mode)

    data = clean_data(input_set)
    if len(data) == 0:
        st.write('Feature vectore generation terminated. Please enter a query or check your input format.')
        return None

    # ---- Sequence matching -------------------------------------------------
    data = add_uniprot_sequence(data)
    match = data[(data.wt_sequence_match == 'm')]
    org_len = len(match)
    iso = data[(data.wt_sequence_match == 'i')]
    noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]
    if len(noMatch) == len(data):
        st.write('Aminoacid at the position could not be mapped to canonical or isoform sequence. Please check the input amino acid.')
    elif len(noMatch) > 0:
        st.write(
            f'{len(noMatch)} of {len(data)} datapoints has not been mapped to any sequence. These datapoints are omitted.')
    if len(iso) > 0:
        st.write(f'{len(iso)} of {len(data)} datapoints has been mapped to isoform sequences. These datapoints are omitted.')
    if len(match) == 0:
        st.write('Feature generation terminated due to failed mapping of input amino acid to UniProt sequence.')
        return None

    st.write(f'{len(match)} of {len(data)} datapoints has been mapped to canonical sequences. Proceeding with these datapoins.')
    if (len(iso) != 0) | (len(noMatch) != 0):
        st.write('Omitted datapoints are:', noMatch.datapoint.to_list() + iso.datapoint.to_list())
    st.write('\n')
    st.write('Check log file for updates.')

    # ---- Sequence-level features -------------------------------------------
    data = match[KEY_COLS]
    print('>> Feature vector generation started...\n')
    print('\n>> Creating directories...')
    print('\n>> Adding physicochemical properties...\n')
    data = add_physicochemical(data)
    print('\n>> Adding domains\n')
    data = add_domains(data, path_to_domains)
    print('\n>> Adding sequence annotations...\n')
    data = add_annotations(data)

    # ---- Stage 1: PDB ------------------------------------------------------
    print('\n>> Retrieving PDB structure information...\n')
    pdb_info = addPDBinfo(data, path_to_output_files)
    if len(pdb_info) != 0:
        data = pd.merge(data, pdb_info, on='uniprotID', how='left')
        # Spare datapoint if there is no associated PDB.
        no_pdb = data[data.pdbID.isna()].drop_duplicates()
        pdb = data[~data.pdbID.isna()].drop_duplicates()
        # Spare datapoint if associated PDB does not cover mutated area.
        for bound in ('pos', 'start', 'end'):
            pdb[bound] = pdb[bound].apply(int)
        # NOTE(review): both comparisons are strict, so a variant sitting exactly
        # on the first or last covered residue counts as not covered - confirm
        # that this is intended.
        covers_mutation = (pdb['pos'] > pdb['start']) & (pdb['pos'] < pdb['end'])
        no_pdb_add = pdb[~covers_mutation]
        pdb = pdb[covers_mutation].reset_index(drop=True)
        # Delete spared datapoint from no_pdb list if it has any other PDB that spans the mutated area.
        no_pdb_add = no_pdb_add[~no_pdb_add.datapoint.isin(pdb.datapoint.to_list())]
        # Final collection of datapoints without PDB association.
        no_pdb = pd.concat([no_pdb, no_pdb_add])
        no_pdb = no_pdb[SIMPLE_COLS]
        no_pdb = no_pdb.drop_duplicates()
        pdb = pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
        pdb = pdb.reset_index(drop=True)
        pdb = pdb.fillna(np.nan)
        # Get position mapping from added structures
        print('\n>> Adding structure residue positions...\n')
        if len(pdb) > 0:  # there are mapped structures, and some of them span the mutated area.
            pdb = pdb.replace(EMPTY_MARKERS)
            pdb = pdbMapping(pdb, Path(path_to_output_files / 'pdb_structures'))
            pdb.reset_index(drop=True, inplace=True)
            pdb = pdb.fillna(np.nan)
            # Rows whose residue could not be located on the structure go on to
            # the model searches together with the rows that had no PDB at all.
            no_pdb_add_ = pdb[pdb.AAonPDB.isna()]
            no_pdb_add = pdb[pdb.MATCHDICT.isna()]
            no_pdb = pd.concat([no_pdb_add_, no_pdb, no_pdb_add])
            no_pdb.reset_index(inplace=True, drop=True)
            pdb = pdb[~(pdb.MATCHDICT.isna())]
            pdb = pdb[~(pdb.AAonPDB.isna())]
            if len(pdb) > 0:
                print('\n>> Mapping to PDB residues...\n')
                pdb = changeUPtoPDB(pdb)
                pdb.reset_index(drop=True, inplace=True)
                print('\n>> Calculating 3D distances for PDB structures...\n')
                pdb = isZeroDistance(pdb)
                pdb = processFile(pdb, path_to_output_files)
                pdb = match3D(pdb)
                pdb = selectMaxAnnot(pdb)
                # Keep a single row per datapoint: the first one in this ordering.
                pdb = pdb.sort_values(by=['datapoint', 'resolution', 'annotTotal'], ascending=[True, True, True])
                pdb = pdb.drop_duplicates(['datapoint'])
                pdb = pdb.replace({'[]': np.nan, 'hit': 0.0})
                print('\n>> PDB matching is completed...\n')
            else:
                # There was no residue match in the associated PDB. So we cannot use PDB data.
                pdb = pdb[SIMPLE_COLS]
                print('\n>>> No PDB structure could be matched.')
        else:
            pdb = pdb[SIMPLE_COLS]
            print('\n>>> No PDB structure could be matched.')
    else:
        pdb = pd.DataFrame(columns=SIMPLE_COLS)
        print('\n>>> No PDB structure could be matched.')
        no_pdb = data.copy()
        no_pdb = no_pdb[SIMPLE_COLS]
    print(
        'PDB phase is finished...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
        % (len(pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])),
           len(no_pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint']))))

    # ---- Stage 2: SwissModel (for datapoints PDB could not cover) ----------
    print('\n>>> Proceeding to SwissModel search...')
    print('------------------------------------\n')
    swiss = no_pdb.copy()
    if len(swiss) > 0:
        print('\n>> Adding SwissModel residue positions...\n')
        swiss = swiss.replace(EMPTY_MARKERS)
        swiss = swiss.fillna(np.nan)
        swiss, no_swiss_models = addSwissModels(swiss, path_to_input_files, path_to_output_files)
        print('\n>> Mapping to SwissModels...\n')
        if len(swiss) > 0:
            swiss.reset_index(drop=True, inplace=True)
            swiss = changeUPtoModels(swiss)
            swiss.reset_index(drop=True, inplace=True)
            print('\n>> Calculating 3D distances for SwissModels...\n')
            swiss = isZeroDistance(swiss)
            swiss = match3DModels(swiss)
            swiss = selectMaxAnnot(swiss)
            swiss = swiss.sort_values(by=['datapoint', 'qmean_norm', 'distance', 'hitTotal', 'annotTotal'],
                                      ascending=[True, False, True, False, True])
            swiss = swiss.drop_duplicates(['datapoint'])
            swiss = swiss.replace({'[]': np.nan, 'hit': 0.0})
        else:
            swiss = swiss[SIMPLE_COLS]
        if len(no_swiss_models) > 0:
            no_swiss_models = no_swiss_models[SIMPLE_COLS]
            no_swiss_models = no_swiss_models.reset_index(drop=True)
    else:
        swiss = swiss[SIMPLE_COLS]
        no_swiss_models = no_pdb.copy()

    # ---- Stage 3: Modbase (for datapoints SwissModel could not cover) ------
    if len(no_swiss_models) > 0:
        modbase = no_swiss_models.copy()
        print('Proceeding to Modbase search...')
        print('------------------------------------\n')
        modbase = modbase[SIMPLE_COLS].replace(EMPTY_MARKERS)
        modbase = modbase.fillna(np.nan)
        print('\n>> Adding Modbase residue positions...\n')
        modbase_simple = modbase[KEY_COLS]
        modbase_simple = modbase_simple.drop_duplicates(KEY_COLS)
        modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
        if len(modbaseOut) > 0:
            modbase = modbase.merge(modbaseOut, on=KEY_COLS, how='left')
            no_modbase_models_updated['sasa'] = np.nan
            modbase.reset_index(inplace=True, drop=True)
            # Rows without model coordinates join the "no structure" collection.
            no_modbase_add = modbase[pd.isna(modbase.coordinates)]
            modbase = modbase[~pd.isna(modbase.coordinates)]
            no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
            print('\n>> Mapping to Modbase models...\n')
            modbase = changeUPtoModels(modbase)
            print('\n>> Calculating 3D distances for Modbase models...\n')
            modbase = isZeroDistance(modbase)
            modbase = match3DModels(modbase)
            modbase = selectMaxAnnot(modbase)
            modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance', 'hitTotal', 'annotTotal'],
                                          ascending=[True, False, True, False, True])
            modbase = modbase.drop_duplicates(['datapoint'])
            modbase = modbase.replace({'[]': np.nan, 'hit': 0.0})
        else:
            modbase = pd.DataFrame(columns=SIMPLE_COLS)
    else:
        no_modbase_models_updated = pd.DataFrame(columns=SIMPLE_COLS)
        modbase = pd.DataFrame(columns=SIMPLE_COLS)

    COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
            'region', 'crosslink', 'peptide', 'disulfide', 'signalPeptide', 'propeptide', 'naturalVariant', 'nucleotideBinding', 'modifiedResidue', 'site',
            'caBinding', 'turn', 'transmembrane', 'repeat', 'glycosylation', 'intramembrane', 'metalBinding', 'bindingSite', 'dnaBinding', 'activeSite',
            'coiledCoil', 'helix', 'mutagenesis', 'zincFinger', 'transitPeptide', 'intMet', 'strand', 'lipidation', 'motif', 'topologicalDomain',
            'disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary',
            'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
            'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
            'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']

    # ---- Datapoints left without any structure -----------------------------
    if len(no_modbase_models_updated) == 0:
        no_modbase_models_updated = pd.DataFrame(columns=SIMPLE_COLS)
    no_modbase_models_updated = no_modbase_models_updated[
        ~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
    no_modbase_models_updated = no_modbase_models_updated[KEY_COLS].copy()
    no_modbase_models_updated['pos'] = no_modbase_models_updated['pos'].astype(int)
    no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()
    unmapped_count = len(no_modbase_models_updated)

    # ---- Combine the three sources -----------------------------------------
    pdb = pdb[COLS].assign(Source='PDB') if len(pdb) > 0 else pd.DataFrame()
    swiss = swiss[COLS].assign(Source='SWISS-Model') if len(swiss) > 0 else pd.DataFrame()
    modbase = modbase[COLS].assign(Source='Modbase') if len(modbase) > 0 else pd.DataFrame()

    allData = pd.concat([pdb, swiss, modbase])
    allData.reset_index(inplace=True, drop=True)
    allData = allData.replace({np.nan: ''})
    if len(allData) == 0:
        _report_structure_mapping(unmapped_count, org_len)
        return None

    # Assign back instead of replacing in place on the attribute-accessed
    # column, which is not guaranteed to modify the frame.
    allData['distance'] = allData['distance'].replace({-1000: ''})

    # ---- Surface / core / interface assignment -----------------------------
    # Get interface positions from ECLAIR. Download HQ human
    print()
    print('Assigning surface regions...')
    print('------------------------------------\n')
    print('Extracting interface residues...\n')
    data_interface = pd.read_csv(path_to_interfaces, sep='\t')
    positions = get_interface_positions(data_interface, 'P1', 'P2')
    # One set of canonical position strings per protein, so membership is an
    # exact match. Searching the stringified list instead would let position 2
    # match "[12, 345]".
    interface_positions = {
        protein: {_position_key(residue) for residue in residues}
        for protein, residues in positions.items()
    }
    final_data = finalTouch(allData).reset_index(drop=True)
    no_interface = frozenset()
    for i in final_data.index:
        protein_interface = interface_positions.get(final_data.at[i, 'uniprotID'], no_interface)
        at_interface = _position_key(final_data.at[i, 'pos']) in protein_interface
        exposure = final_data.at[i, 'trsh4']
        if exposure == 'surface':
            final_data.at[i, 'threeState_trsh4_HQ'] = 'interface' if at_interface else 'surface'
        elif exposure == 'core':
            # A buried residue that is also listed as interface is contradictory.
            final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict' if at_interface else 'core'
        elif exposure == 'nan':
            final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'

    # ---- Significant domains -----------------------------------------------
    fisherResult = pd.read_csv(fisher_path, sep='\t')
    significant_domains = set(fisherResult.domain.to_list())
    for i in final_data.index:
        domain = final_data.at[i, 'domain']
        final_data.at[i, 'domain_fisher'] = domain if domain in significant_domains else 'NULL'

    # ---- Recode annotation columns -----------------------------------------
    print('Final adjustments are being done...\n')
    binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
    annotCols = UNIPROT_ANNOTATION_COLS[:30]
    final_data = final_data.astype(str)
    final_data = final_data.replace({'NaN': 'nan'})
    # Each cell is looked up exactly once, so '0' -> '1' is never promoted
    # again to '2'. Values outside the table are left as they are.
    binary_recode = {'0': '1', '0.0': '1', 'nan': '0', '1': '2', '1.0': '2'}
    for col in binaryCols:
        final_data[col] = final_data[col].astype('str').map(lambda cell: binary_recode.get(cell, cell))
    for annot in annotCols:
        final_data.loc[final_data[annot + 'Binary'] == '2', annot] = '0.0'

    # ---- Output naming and column order ------------------------------------
    rename_map = {'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
                  'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
                  'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
                  'distance': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state'}
    rename_map.update({annot + 'Binary': annot + '_bin' for annot in annotCols})
    rename_map.update({annot: annot + '_dist' for annot in annotCols})
    final_data = final_data.rename(columns=rename_map)

    # Order of the annotation features in the output. The imputation constants
    # further down are positional and follow this order.
    output_annotation_order = [
        'disulfide', 'intMet', 'intramembrane', 'naturalVariant', 'dnaBinding',
        'activeSite', 'nucleotideBinding', 'lipidation', 'site',
        'transmembrane', 'crosslink', 'mutagenesis', 'strand',
        'helix', 'turn', 'metalBinding', 'repeat',
        'caBinding', 'topologicalDomain', 'bindingSite',
        'region', 'signalPeptide', 'modifiedResidue',
        'zincFinger', 'motif', 'coiledCoil', 'peptide',
        'transitPeptide', 'glycosylation', 'propeptide']
    output_columns = (
        ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'Source', 'meta_merged',
         'composition', 'polarity', 'volume', 'granthamScore', 'domains_all',
         'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state']
        + [name + '_bin' for name in output_annotation_order]
        + [name + '_dist' for name in output_annotation_order])
    final_data = final_data[output_columns].copy()

    # ---- Imputation --------------------------------------------------------
    if (impute == 'True') or (impute == 'true') or (impute == True):
        filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
                  16.82,
                  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
        # The last 30 columns are the '<name>_dist' columns.
        for col_, fill_value in zip(final_data.columns[-30:], filler):
            final_data[col_] = final_data[col_].fillna(fill_value)
            final_data[col_] = final_data[col_].replace({'nan': fill_value})
        # NOTE(review): the frame was converted to str above, so missing values
        # are the text 'nan' and these three fillna calls appear to have nothing
        # to fill - confirm whether 'nan' should be replaced here as well.
        final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
        final_data['sasa'] = final_data['sasa'].fillna(29.5)
        final_data['location_3state'] = final_data['location_3state'].fillna('unknown')
    final_data = final_data.replace({'nan': np.nan})
    # NOTE(review): after the str conversion a missing domain would be the text
    # '-1', which the integer key below does not match - confirm.
    final_data['domains_all'] = final_data['domains_all'].replace({-1: 'NULL'})

    # ---- Write out and report ----------------------------------------------
    if len(final_data) == 0:
        print(
            'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
    final_data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
    print('Feature vector successfully created...')
    end = timer()
    # `start` is taken once at module import time.
    hours, rem = divmod(end - start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
    _report_structure_mapping(unmapped_count, org_len)
    return final_data