Spaces:
Sleeping
Sleeping
| # IMPORT NECESSARY MODULES AND LIBRARIES | |
| from timeit import default_timer as timer | |
| import xml.etree.ElementTree as ET | |
| from collections import Counter | |
| from bs4 import BeautifulSoup | |
| from io import StringIO | |
| from decimal import * | |
| import pandas as pd | |
| import requests | |
| import os.path as op | |
| import subprocess | |
| import shutil | |
| import ssbio.utils | |
| import warnings | |
| import sys | |
| import pathlib | |
| from pathlib import Path | |
| import os, glob | |
| import math | |
| import ssbio | |
| import ssl | |
| import numpy as np | |
| from Bio.Align import substitution_matrices | |
| from Bio.PDB.Polypeptide import * | |
| from Bio.PDB import PDBList | |
| from Bio import Align | |
| from Bio import SeqIO | |
| from Bio.PDB import * | |
| warnings.filterwarnings("ignore") | |
| start = timer() | |
| # FUNCTIONS | |
| from calc_pc_property import * | |
| from add_domains import * | |
| from retrieveUniprotSequences import * | |
| from add_annotations import * | |
| from add_sequence import * | |
| from add_structure import * | |
| from add_alignment import * | |
| from manage_files import * | |
| from add_3Dalignment import * | |
| from add_sasa import * | |
| from standard import * | |
| from add_interface_pos import * | |
| from standard import * | |
| from utils import * | |
| from pdbMapping import * | |
| from uniprotSequenceMatch import uniprotSequenceMatch | |
| from process_input import clean_data | |
| from urllib.error import HTTPError | |
| from swissModelAdd import * | |
| from modbaseModelAdd import * | |
| import streamlit as st | |
def pdb(input_set, mode, impute):
    """Build the structure-based feature vector for the given datapoints.

    The input is cleaned and matched against UniProt sequences; datapoints
    that match the canonical sequence are annotated (physicochemical
    properties, domains, sequence annotations) and then looked up, in this
    order, in PDB structures, SwissModel models and Modbase models.  The
    rows that obtained a structure are merged, post-processed and written to
    ``featurevector_pdb.txt`` inside the output directory returned by
    ``manage_files``.

    Progress messages go to stdout via ``print``; user-facing status
    messages are emitted with ``st.write``.

    Args:
        input_set: Raw query, passed unchanged to ``clean_data``.
        mode: Passed unchanged to ``manage_files`` to resolve the working
            paths.
        impute: When ``'True'``, ``'true'`` or ``True``, fixed filler values
            are substituted for missing values in the last 30 output
            columns.

    Returns:
        The final feature DataFrame when at least one datapoint could be
        associated with a structure; otherwise ``None`` (empty query, no
        canonical sequence match, or no structure found).
    """
    import re  # local import: only needed for the interface-position match

    # Time the run from the call itself rather than from module import.
    run_start = timer()

    # Fill empty dataframes with SIMPLE_COLS
    SIMPLE_COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity',
                   'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
                   'intMet', 'naturalVariant', 'activeSite', 'crosslink', 'mutagenesis',
                   'strand', 'helix', 'turn', 'region', 'modifiedResidue', 'motif',
                   'metalBinding', 'lipidation', 'glycosylation', 'topologicalDomain',
                   'nucleotideBinding', 'bindingSite', 'transmembrane', 'transitPeptide',
                   'repeat', 'site', 'peptide', 'signalPeptide', 'disulfide', 'coiledCoil',
                   'intramembrane', 'zincFinger', 'caBinding', 'propeptide', 'dnaBinding',
                   'disulfideBinary', 'intMetBinary', 'intramembraneBinary',
                   'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary',
                   'nucleotideBindingBinary', 'lipidationBinary', 'siteBinary',
                   'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary',
                   'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
                   'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary',
                   'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary',
                   'modifiedResidueBinary', 'zincFingerBinary', 'motifBinary',
                   'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary',
                   'glycosylationBinary', 'propeptideBinary']
    # Last 60 entries: the 30 annotation columns followed by their 30 '*Binary' twins.
    UNIPROT_ANNOTATION_COLS = SIMPLE_COLS[-60:]

    (path_to_input_files, path_to_output_files, path_to_domains,
     fisher_path, path_to_interfaces, _buffer) = manage_files(mode)

    data = clean_data(input_set)
    data = add_uniprot_sequence(data)
    match = data[(data.wt_sequence_match == 'm')]
    org_len = len(match)
    iso = data[(data.wt_sequence_match == 'i')]
    noMatch = data[(data.wt_sequence_match != 'm') & (data.wt_sequence_match != 'i')]

    if len(data) == 0:
        st.write('Feature vectore generation terminated. Please enter a query.')
        return None

    if len(noMatch) == len(data):
        st.write('Aminoacid at the position could not be mapped to canonical or isoform sequence. Please check the input amino acid.')
    elif len(noMatch) > 0:
        st.write(
            f'{len(noMatch)} of {len(data)} datapoints has not been mapped to any sequence. These datapoints are omitted.')
    if len(iso) > 0:
        st.write(f'{len(iso)} of {len(data)} datapoints has been mapped to isoform sequences. These datapoints are omitted.')
    if len(match) == 0:
        st.write('Feature generation terminated due to failed mapping of input amino acid to UniProt sequence.')
        return None

    st.write(f'{len(match)} of {len(data)} datapoints has been mapped to canonical sequences. Proceeding with these datapoins.')
    if len(iso) != 0 or len(noMatch) != 0:
        st.write('Omitted datapoints are:', noMatch.datapoint.to_list() + iso.datapoint.to_list())
    st.write('\n')
    st.write('Check log file for updates.')

    data = match[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
    print('>> Feature vector generation started...\n')
    print('\n>> Creating directories...')
    print('\n>> Adding physicochemical properties...\n')
    data = add_physicochemical(data)
    print('\n>> Adding domains\n')
    data = add_domains(data, path_to_domains)
    print('\n>> Adding sequence annotations...\n')
    data = add_annotations(data)

    # ------------------------------------------------------------------
    # PDB phase
    # ------------------------------------------------------------------
    print('\n>> Retrieving PDB structure information...\n')
    pdb_info = addPDBinfo(data, path_to_output_files)
    if len(pdb_info) != 0:
        data = pd.merge(data, pdb_info, on='uniprotID', how='left')
        # Spare datapoint if there is no associated PDB.
        no_pdb = data[data.pdbID.isna()].drop_duplicates()
        pdb = data[~data.pdbID.isna()].drop_duplicates()
        # Spare datapoint if associated PDB does not cover mutated area.
        pdb['pos'] = pdb['pos'].apply(int)
        pdb['start'] = pdb['start'].apply(int)
        pdb['end'] = pdb['end'].apply(int)
        # NOTE(review): the comparison is strict on both sides, so a mutation
        # located exactly at `start` or `end` is treated as not covered.
        # Confirm whether the bounds are meant to be inclusive.
        covers_mutation = (pdb['pos'] > pdb['start']) & (pdb['pos'] < pdb['end'])
        no_pdb_add = pdb[~covers_mutation]
        pdb = pdb[covers_mutation]  # do not change order
        pdb.reset_index(drop=True, inplace=True)
        # Delete spared datapoint from no_pdb list if it has any other PDB that spans the mutated area.
        no_pdb_add = no_pdb_add[~no_pdb_add.datapoint.isin(pdb.datapoint.to_list())]
        # Final collection of datapoints without PDB associaton.
        no_pdb = pd.concat([no_pdb, no_pdb_add])
        no_pdb = no_pdb[SIMPLE_COLS]
        no_pdb = no_pdb.drop_duplicates()
        pdb = pdb.sort_values(['uniprotID', 'resolution'], axis=0, ascending=True)
        pdb.reset_index(drop=True, inplace=True)
        pdb = pdb.fillna(np.nan)
        # Get position mapping from added structures
        print('\n>> Adding structure residue positions...\n')
        if len(pdb) > 0:  # there are mapped structures, and some of them span the mutated area.
            pdb.replace({'[]': np.nan, 'nan-nan': np.nan, '': np.nan}, inplace=True)
            pdb = pdbMapping(pdb, Path(path_to_output_files / 'pdb_structures'))
            pdb.reset_index(drop=True, inplace=True)
            pdb = pdb.fillna(np.nan)
            # Rows lacking a residue on the structure or a match dictionary
            # are handed over to the model-based phases.
            no_pdb_add_ = pdb[pdb.AAonPDB.isna()]
            no_pdb_add = pdb[pdb.MATCHDICT.isna()]
            no_pdb = pd.concat([no_pdb_add_, no_pdb, no_pdb_add])
            no_pdb.reset_index(inplace=True, drop=True)
            pdb = pdb[~(pdb.MATCHDICT.isna())]
            pdb = pdb[~(pdb.AAonPDB.isna())]
            if len(pdb) > 0:
                print('\n>> Mapping to PDB residues...\n')
                pdb = changeUPtoPDB(pdb)
                pdb.reset_index(drop=True, inplace=True)
                print('\n>> Calculating 3D distances for PDB structures...\n')
                pdb = isZeroDistance(pdb)
                pdb = processFile(pdb, path_to_output_files)
                pdb = match3D(pdb)
                pdb = selectMaxAnnot(pdb)
                # Keep one row per datapoint: the first after sorting.
                pdb = pdb.sort_values(by=['datapoint', 'resolution', 'annotTotal'], ascending=[True, True, True])
                pdb = pdb.drop_duplicates(['datapoint'])
                pdb.replace({'[]': np.nan, 'hit': 0.0}, inplace=True)
                print('\n>> PDB matching is completed...\n')
            else:
                # There was no residue match in the associated PDB. So we cannot use PDB data.
                pdb = pdb[SIMPLE_COLS]
                print('\n>>> No PDB structure could be matched.')
        else:
            pdb = pdb[SIMPLE_COLS]
            print('\n>>> No PDB structure could be matched.')
    else:
        pdb = pd.DataFrame(columns=SIMPLE_COLS)
        print('\n>>> No PDB structure could be matched.')
        no_pdb = data.copy()
    no_pdb = no_pdb[SIMPLE_COLS]
    print(
        'PDB phase is finished...\nPDB structures are found for %d of %d.\n%d of %d failed to match with PDB structure.\n'
        % (len(pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint'])),
           len(no_pdb.drop_duplicates(['datapoint'])), len(data.drop_duplicates(['datapoint']))))

    # ------------------------------------------------------------------
    # SwissModel phase (datapoints left without a PDB structure)
    # ------------------------------------------------------------------
    print('\n>>> Proceeding to SwissModel search...')
    print('------------------------------------\n')
    swiss = no_pdb.copy()
    if len(swiss) > 0:
        print('\n>> Adding SwissModel residue positions...\n')
        swiss.replace({'[]': np.nan, 'nan-nan': np.nan, '': np.nan}, inplace=True)
        swiss = swiss.fillna(np.nan)
        swiss, no_swiss_models = addSwissModels(swiss, path_to_input_files, path_to_output_files)
        print('\n>> Mapping to SwissModels...\n')
        if len(swiss) > 0:
            swiss.reset_index(drop=True, inplace=True)
            swiss = changeUPtoModels(swiss)
            swiss.reset_index(drop=True, inplace=True)
            print('\n>> Calculating 3D distances for SwissModels...\n')
            swiss = isZeroDistance(swiss)
            swiss = match3DModels(swiss)
            swiss = selectMaxAnnot(swiss)
            swiss = swiss.sort_values(by=['datapoint', 'qmean_norm', 'distance', 'hitTotal', 'annotTotal'],
                                      ascending=[True, False, True, False, True])
            swiss = swiss.drop_duplicates(['datapoint'])
            swiss.replace({'[]': np.nan, 'hit': 0.0}, inplace=True)
        else:
            swiss = swiss[SIMPLE_COLS]
        if len(no_swiss_models) > 0:
            no_swiss_models = no_swiss_models[SIMPLE_COLS]
            no_swiss_models.reset_index(inplace=True, drop=True)
    else:
        swiss = swiss[SIMPLE_COLS]
        no_swiss_models = no_pdb.copy()

    # ------------------------------------------------------------------
    # Modbase phase (datapoints left without a SwissModel)
    # ------------------------------------------------------------------
    if len(no_swiss_models) > 0:
        modbase = no_swiss_models.copy()
        print('Proceeding to Modbase search...')
        print('------------------------------------\n')
        modbase = modbase[SIMPLE_COLS]
        modbase.replace({'[]': np.nan, 'nan-nan': np.nan, '': np.nan}, inplace=True)
        modbase = modbase.fillna(np.nan)
        print('\n>> Adding Modbase residue positions...\n')
        modbase_simple = modbase[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']]
        modbase_simple = modbase_simple.drop_duplicates(['uniprotID', 'wt', 'pos', 'mut', 'datapoint'])
        modbaseOut, no_modbase_models_updated = addModbaseModels(modbase_simple, path_to_input_files, path_to_output_files)
        if len(modbaseOut) > 0:
            modbase = modbase.merge(modbaseOut, on=['uniprotID', 'wt', 'pos', 'mut', 'datapoint'], how='left')
            no_modbase_models_updated['sasa'] = np.nan
            modbase.reset_index(inplace=True, drop=True)
            # Rows without coordinates after the merge count as unmodelled.
            no_modbase_add = modbase[pd.isna(modbase.coordinates)]
            modbase = modbase[~pd.isna(modbase.coordinates)]
            no_modbase_models_updated = pd.concat([no_modbase_models_updated, no_modbase_add])
            print('\n>> Mapping to Modbase models...\n')
            modbase = changeUPtoModels(modbase)
            print('\n>> Calculating 3D distances for Modbase models...\n')
            modbase = isZeroDistance(modbase)
            modbase = match3DModels(modbase)
            modbase = selectMaxAnnot(modbase)
            modbase = modbase.sort_values(by=['datapoint', 'quality_score', 'distance', 'hitTotal', 'annotTotal'],
                                          ascending=[True, False, True, False, True])
            modbase = modbase.drop_duplicates(['datapoint'])
            modbase.replace({'[]': np.nan, 'hit': 0.0}, inplace=True)
        else:
            modbase = pd.DataFrame(columns=SIMPLE_COLS)
    else:
        no_modbase_models_updated = pd.DataFrame(columns=SIMPLE_COLS)
        modbase = pd.DataFrame(columns=SIMPLE_COLS)

    COLS = ['uniprotID', 'wt', 'pos', 'mut', 'datapoint', 'composition', 'polarity', 'volume', 'granthamScore', 'domain', 'domStart', 'domEnd', 'distance',
            'region', 'crosslink', 'peptide', 'disulfide', 'signalPeptide', 'propeptide', 'naturalVariant', 'nucleotideBinding', 'modifiedResidue', 'site',
            'caBinding', 'turn', 'transmembrane', 'repeat', 'glycosylation', 'intramembrane', 'metalBinding', 'bindingSite', 'dnaBinding', 'activeSite',
            'coiledCoil', 'helix', 'mutagenesis', 'zincFinger', 'transitPeptide', 'intMet', 'strand', 'lipidation', 'motif', 'topologicalDomain',
            'disulfideBinary', 'intMetBinary', 'intramembraneBinary', 'naturalVariantBinary', 'dnaBindingBinary', 'activeSiteBinary', 'nucleotideBindingBinary',
            'lipidationBinary', 'siteBinary', 'transmembraneBinary', 'crosslinkBinary', 'mutagenesisBinary', 'strandBinary', 'helixBinary', 'turnBinary', 'metalBindingBinary',
            'repeatBinary', 'topologicalDomainBinary', 'caBindingBinary', 'bindingSiteBinary', 'regionBinary', 'signalPeptideBinary', 'modifiedResidueBinary', 'zincFingerBinary',
            'motifBinary', 'coiledCoilBinary', 'peptideBinary', 'transitPeptideBinary', 'glycosylationBinary', 'propeptideBinary', 'sasa']

    # Datapoints that ended up without any structure, used for the final report.
    if len(no_modbase_models_updated) == 0:
        no_modbase_models_updated = pd.DataFrame(columns=SIMPLE_COLS)
    no_modbase_models_updated = no_modbase_models_updated[~no_modbase_models_updated.datapoint.isin(modbase.datapoint.to_list())]
    no_modbase_models_updated = no_modbase_models_updated[['uniprotID', 'wt', 'pos', 'mut', 'datapoint']].copy()
    no_modbase_models_updated['pos'] = no_modbase_models_updated['pos'].astype(int)
    no_modbase_models_updated = no_modbase_models_updated.drop_duplicates()

    def report_structure_coverage():
        """Tell the user how many datapoints obtained a structure."""
        unmapped = len(no_modbase_models_updated)
        if unmapped > 0 and unmapped != org_len:
            st.write(f'{unmapped} of {org_len} datapoins could not be mapped to any structures.')
            st.write(f'{org_len-unmapped} of {org_len} datapoins were mapped to a structure.')
        elif unmapped == org_len:
            # Every datapoint failed, so report the full count (the previous
            # text claimed "0 of N ... could not be mapped").
            st.write(f'{org_len} of {org_len} datapoins could not be mapped to any structures. Feature vector could not be created.')

    # Tag each non-empty result set with its origin; empty ones become
    # column-less frames so they do not contribute to the concatenation.
    if len(pdb) > 0:
        pdb = pdb[COLS].copy()
        pdb['Source'] = 'PDB'
    else:
        pdb = pd.DataFrame()
    if len(swiss) > 0:
        swiss = swiss[COLS].copy()
        swiss['Source'] = 'SWISS-Model'
    else:
        swiss = pd.DataFrame()
    if len(modbase) > 0:
        modbase = modbase[COLS].copy()
        modbase['Source'] = 'Modbase'
    else:
        modbase = pd.DataFrame()

    allData = pd.concat([pdb, swiss, modbase])
    allData.reset_index(inplace=True, drop=True)
    allData.replace({np.nan: ''}, inplace=True)

    if len(allData) == 0:
        report_structure_coverage()
        return None

    # Assign back instead of replacing in place on the attribute-accessed
    # column, which is not guaranteed to write through to the frame.
    allData['distance'] = allData['distance'].replace({-1000: ''})

    # Get interface positions from ECLAIR. Download HQ human
    print()
    print('Assigning surface regions...')
    print('------------------------------------\n')
    print('Extracting interface residues...\n')
    data_interface = pd.read_csv(path_to_interfaces, sep='\t')
    positions = get_interface_positions(data_interface, 'P1', 'P2')
    # One row per protein; the residue list is stored as its string form.
    # Built in a single call because DataFrame.append no longer exists in
    # pandas >= 2.0; this also copes with an empty `positions` mapping.
    interface_dataframe = pd.DataFrame(
        [(key, str(list(set(val)))) for key, val in positions.items()],
        columns=['uniprotID', 'positions'])

    final_data = finalTouch(allData)
    final_data = final_data.merge(interface_dataframe, on='uniprotID', how='left')
    final_data.positions = final_data.positions.astype('str')
    for i in final_data.index:
        # Match the position as a whole number token: a plain substring test
        # would report position 23 as present in '[123, 230]'.
        pos_token = re.escape(str(final_data.at[i, 'pos']))
        at_interface = re.search(rf'(?<!\d){pos_token}(?!\d)', final_data.at[i, 'positions']) is not None
        burial = final_data.at[i, 'trsh4']
        if burial == 'surface':
            final_data.at[i, 'threeState_trsh4_HQ'] = 'interface' if at_interface else 'surface'
        elif burial == 'core':
            final_data.at[i, 'threeState_trsh4_HQ'] = 'conflict' if at_interface else 'core'
        elif burial == 'nan':
            final_data.at[i, 'threeState_trsh4_HQ'] = 'nan'
    final_data.drop(['positions'], axis=1, inplace=True)

    fisherResult = pd.read_csv(fisher_path, sep='\t')
    significant_domains = fisherResult.domain.to_list()
    for i in final_data.index:
        if final_data.at[i, 'domain'] in significant_domains:
            final_data.at[i, 'domain_fisher'] = final_data.at[i, 'domain']
        else:
            final_data.at[i, 'domain_fisher'] = 'NULL'

    print('Final adjustments are being done...\n')
    binaryCols = UNIPROT_ANNOTATION_COLS[-30:]
    final_data = final_data.astype(str)
    final_data.replace({'NaN': 'nan'}, inplace=True)
    # Shift the binary flags: 0 -> 1, 1 -> 2, and 'nan' -> 0.  Each cell is
    # recoded exactly once; any other value is left as it is.
    binary_recode = {'0': '1', '0.0': '1', 'nan': '0', '1': '2', '1.0': '2'}
    for col in binaryCols:
        final_data[col] = final_data[col].astype('str').map(lambda v: binary_recode.get(v, v))
    annotCols = UNIPROT_ANNOTATION_COLS[:30]
    # Wherever the recoded flag is '2', the paired annotation column is set to '0.0'.
    for annot in annotCols:
        final_data.loc[final_data[str(annot) + 'Binary'] == '2', annot] = '0.0'

    final_data.rename(
        columns={'uniprotID': 'prot_uniprotAcc', 'wt': 'wt_residue', 'pos': 'position', 'mut': 'mut_residue',
                 'datapoint': 'meta_merged', 'datapoint_disease': 'meta-lab_merged', 'label': 'source_db',
                 'family': 'prot_family', 'domain': 'domains_all', 'domain_fisher': 'domains_sig',
                 'distance': 'domains_3Ddist', 'threeState_trsh4_HQ': 'location_3state',
                 'disulfideBinary': 'disulfide_bin', 'intMetBinary': 'intMet_bin',
                 'intramembraneBinary': 'intramembrane_bin',
                 'naturalVariantBinary': 'naturalVariant_bin', 'dnaBindingBinary': 'dnaBinding_bin',
                 'activeSiteBinary': 'activeSite_bin',
                 'nucleotideBindingBinary': 'nucleotideBinding_bin', 'lipidationBinary': 'lipidation_bin',
                 'siteBinary': 'site_bin',
                 'transmembraneBinary': 'transmembrane_bin', 'crosslinkBinary': 'crosslink_bin',
                 'mutagenesisBinary': 'mutagenesis_bin',
                 'strandBinary': 'strand_bin', 'helixBinary': 'helix_bin', 'turnBinary': 'turn_bin',
                 'metalBindingBinary': 'metalBinding_bin',
                 'repeatBinary': 'repeat_bin', 'topologicalDomainBinary': 'topologicalDomain_bin',
                 'caBindingBinary': 'caBinding_bin',
                 'bindingSiteBinary': 'bindingSite_bin', 'regionBinary': 'region_bin',
                 'signalPeptideBinary': 'signalPeptide_bin',
                 'modifiedResidueBinary': 'modifiedResidue_bin', 'zincFingerBinary': 'zincFinger_bin',
                 'motifBinary': 'motif_bin',
                 'coiledCoilBinary': 'coiledCoil_bin', 'peptideBinary': 'peptide_bin',
                 'transitPeptideBinary': 'transitPeptide_bin',
                 'glycosylationBinary': 'glycosylation_bin', 'propeptideBinary': 'propeptide_bin',
                 'disulfide': 'disulfide_dist', 'intMet': 'intMet_dist',
                 'intramembrane': 'intramembrane_dist', 'naturalVariant': 'naturalVariant_dist',
                 'dnaBinding': 'dnaBinding_dist', 'activeSite': 'activeSite_dist',
                 'nucleotideBinding': 'nucleotideBinding_dist', 'lipidation': 'lipidation_dist',
                 'site': 'site_dist',
                 'transmembrane': 'transmembrane_dist', 'crosslink': 'crosslink_dist',
                 'mutagenesis': 'mutagenesis_dist', 'strand': 'strand_dist', 'helix': 'helix_dist',
                 'turn': 'turn_dist',
                 'metalBinding': 'metalBinding_dist', 'repeat': 'repeat_dist',
                 'topologicalDomain': 'topologicalDomain_dist', 'caBinding': 'caBinding_dist',
                 'bindingSite': 'bindingSite_dist', 'region': 'region_dist',
                 'signalPeptide': 'signalPeptide_dist', 'modifiedResidue': 'modifiedResidue_dist',
                 'zincFinger': 'zincFinger_dist', 'motif': 'motif_dist', 'coiledCoil': 'coiledCoil_dist',
                 'peptide': 'peptide_dist', 'transitPeptide': 'transitPeptide_dist',
                 'glycosylation': 'glycosylation_dist', 'propeptide': 'propeptide_dist'}, inplace=True)
    final_data = final_data[
        ['prot_uniprotAcc', 'wt_residue', 'mut_residue', 'position', 'Source', 'meta_merged', 'composition', 'polarity',
         'volume',
         'granthamScore', 'domains_all',
         'domains_sig', 'domains_3Ddist', 'sasa', 'location_3state', 'disulfide_bin', 'intMet_bin',
         'intramembrane_bin', 'naturalVariant_bin', 'dnaBinding_bin',
         'activeSite_bin', 'nucleotideBinding_bin', 'lipidation_bin', 'site_bin',
         'transmembrane_bin', 'crosslink_bin', 'mutagenesis_bin', 'strand_bin',
         'helix_bin', 'turn_bin', 'metalBinding_bin', 'repeat_bin',
         'caBinding_bin', 'topologicalDomain_bin', 'bindingSite_bin',
         'region_bin', 'signalPeptide_bin', 'modifiedResidue_bin',
         'zincFinger_bin', 'motif_bin', 'coiledCoil_bin', 'peptide_bin',
         'transitPeptide_bin', 'glycosylation_bin', 'propeptide_bin', 'disulfide_dist', 'intMet_dist',
         'intramembrane_dist',
         'naturalVariant_dist', 'dnaBinding_dist', 'activeSite_dist',
         'nucleotideBinding_dist', 'lipidation_dist', 'site_dist',
         'transmembrane_dist', 'crosslink_dist', 'mutagenesis_dist',
         'strand_dist', 'helix_dist', 'turn_dist', 'metalBinding_dist',
         'repeat_dist', 'caBinding_dist', 'topologicalDomain_dist',
         'bindingSite_dist', 'region_dist', 'signalPeptide_dist',
         'modifiedResidue_dist', 'zincFinger_dist', 'motif_dist',
         'coiledCoil_dist', 'peptide_dist', 'transitPeptide_dist',
         'glycosylation_dist', 'propeptide_dist']]

    # Imputation
    if impute in ('True', 'true', True):
        # One filler per column, aligned with the last 30 output columns.
        filler = [17.84, 30.8, 24.96, 13.12, 23.62, 18.97, 20.87, 29.59, 20.7, 12.7, 22.85, 17.21, 9.8, 9, 15.99,
                  16.82,
                  20.46, 24.58, 9.99, 17.43, 20.08, 30.91, 20.86, 22.14, 21.91, 28.45, 17.81, 25.12, 20.33, 22.36]
        for col_, fill_value in zip(final_data.columns[-30:], filler):
            final_data[col_] = final_data[col_].fillna(fill_value)
            final_data[col_] = final_data[col_].replace({'nan': fill_value})
        # NOTE(review): the frame was cast to str above, so missing values are
        # the string 'nan' and these three fillna calls have nothing to fill.
        # Confirm whether 'nan' should be replaced here as well.
        final_data['domains_3Ddist'] = final_data['domains_3Ddist'].fillna(24.5)
        final_data['sasa'] = final_data['sasa'].fillna(29.5)
        final_data['location_3state'] = final_data['location_3state'].fillna('unknown')

    final_data = final_data.replace({'nan': np.nan})
    final_data.domains_all = final_data.domains_all.replace({-1: 'NULL'})
    if len(final_data) == 0:
        print(
            'No feature vector could be produced for input data. Please check the presence of a structure for the input proteins.')
    final_data.to_csv(path_to_output_files / 'featurevector_pdb.txt', sep='\t', index=False)
    print('Feature vector successfully created...')

    end = timer()
    hours, rem = divmod(end - run_start, 3600)
    minutes, seconds = divmod(rem, 60)
    print("Time passed: {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

    report_structure_coverage()
    return final_data