import ast
import codecs
import glob
import gzip
import io
import json
import os
import shutil
import tarfile
import time
from pathlib import Path

import numpy as np
import pandas as pd
import requests
import streamlit as st

from utils import threeToOne


def uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path):
    """
    Residue-wise mapping between UniProt and PDB numbering, read from the
    PDBe SIFTS XML of the given PDB entry.
    """
    ascaris = {}
    full_ascaris = {}
    # Ask the PDBe download API for the SIFTS archive of this entry, then
    # stream and unpack the tar.gz into save_path.
    res = requests.get(f'https://www.ebi.ac.uk/pdbe/download/api/pdb/entry/sifts?id={pdb_id}')
    url = json.loads(res.text)['url']
    response = requests.get(url, stream=True)
    file = tarfile.open(fileobj=response.raw, mode="r|gz")
    file.extractall(path=save_path)  # creates another gz file
    existing_pdb = [str(i) for i in Path(save_path).glob("*")]
    # The extracted file is usually named with the lower-case PDB id.
    try:
        with gzip.open(f'{save_path}/{pdb_id.lower()}.xml.gz', 'rt') as f:
            file_content = f.read()
    except FileNotFoundError:
        with gzip.open(f'{save_path}/{pdb_id}.xml.gz', 'rt') as f:
            file_content = f.read()
    content = file_content.split('\n')
    # Collect the line indices of the <listResidue> ... </listResidue> blocks.
    index = [idx for idx, s in enumerate(content) if 'listResidue' in s]
    listResidues = []
    for ind in range(0, len(index), 2):
        try:
            if (content[index[ind]].strip() == '<listResidue>'
                    and content[index[ind + 1]].strip() == '</listResidue>'):
                listResidues.append(content[index[ind]:index[ind + 1]])
        except IndexError:
            pass
    for true_content in listResidues:
        for sub_content in true_content:
            if f'dbAccessionId="{uniprot_id}"' in sub_content:
                lines = [i.strip() for i in true_content]
                # Keep only the PDB / UniProt cross-reference lines; they come
                # in pairs (one PDB line, one UniProt line per residue).
                sel = [i for i in lines if
                       ('<crossRefDb dbSource="PDB"' in i or '<crossRefDb dbSource="UniProt"' in i)]
                matching_dict = {}
                if len(sel) % 2 == 0:  # only complete PDB/UniProt pairs
                    dbAccessionId = [i.split('dbAccessionId')[1].split(' ')[0].split('=')[1].strip('"').upper()
                                     for i in sel]
                    dbSource = [i.split('dbSource')[1].split(' ')[0].split('=')[1].strip('"').upper() for i in sel]
                    dbResNum = [i.split('dbResNum')[1].split(' ')[0].split('=')[1].strip('"') for i in sel]
                    dbResName = [i.split('dbResName')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"')
                                 for i in sel]
                    dbChainName = [i.split('dbChainId')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"')
                                   for i in sel if 'crossRefDb dbSource="PDB' in i]
                    # Even indices in sel hold the PDB line, odd indices the
                    # matching UniProt line of each residue pair.
                    for k, j, m in zip(range(0, len(dbAccessionId), 2),
                                       range(1, len(dbAccessionId), 2),
                                       range(len(dbChainName))):
                        if dbResName[j] == threeToOne(dbResName[k]) and dbAccessionId[j] == uniprot_id:
                            uniprot_key = dbSource[j] + '_' + dbAccessionId[j] + '_' + dbResNum[j] + '_' + dbResName[j]
                            pdb_val = (dbSource[k] + '_' + dbAccessionId[k] + '_' + dbResNum[k] + '_'
                                       + threeToOne(dbResName[k]) + '_' + dbChainName[m])
                            matching_dict[uniprot_key] = pdb_val
                only_residues = {k.split('_')[2]: v.split('_')[2] for k, v in matching_dict.items()}
                # Key both result dicts by PDB id + chain (e.g. '1A3NA').
                for k, v in matching_dict.items():
                    if v.split('_')[1] + v.split('_')[-1] not in ascaris:
                        ascaris[v.split('_')[1] + v.split('_')[-1]] = only_residues
                for k, v in matching_dict.items():
                    if v.split('_')[1] + v.split('_')[-1] not in full_ascaris:
                        full_ascaris[v.split('_')[1] + v.split('_')[-1]] = matching_dict
    return ascaris, full_ascaris
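

# A minimal usage sketch for the mapper above; the identifiers and the target
# directory are illustrative assumptions, not values taken from this project:
#
#   residue_map, full_map = uniprot_pdb_residue_mapping('1A3N', 'P68871', '/tmp/sifts')
#   # residue_map is keyed by PDB id + chain (e.g. '1A3NA') and maps UniProt
#   # residue numbers to PDB residue numbers; full_map keeps the full
#   # 'source_accession_resnum_resname' strings (plus the chain on the PDB side).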


def pdbMapping(data, save_path):
    # Dev note (translated from Turkish): which DataFrame is passed here,
    # the full one or the reduced one?
    # Adds a match dictionary holding, per row, the annotation positions that
    # could be mapped onto the PDB chain of that row.
    for i in data.index:
        posOnPDB = {}
        uniprot_id = data.at[i, 'uniprotID']
        pdb_id = data.at[i, 'pdbID']
        pos = data.at[i, 'pos']
        wt = data.at[i, 'wt']
        data.at[i, 'AAonPDB'] = np.nan
        data.at[i, 'pdbinfo'] = pdb_id + data.at[i, 'chain']
        allMatchesForDP, full_ascaris = uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path)
        # Look up the PDB residue sitting at the mutated UniProt position.
        for key, val in full_ascaris[data.at[i, 'pdbinfo']].items():
            if int(key.split('_')[2]) == int(pos):
                data.at[i, 'AAonPDB'] = val.split('_')[3]
                break
        if data.at[i, 'AAonPDB'] == wt:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'aligned'
        else:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'notAligned'
        keep = allMatchesForDP[data.at[i, 'pdbinfo']]
        # Record every annotation position that has a PDB counterpart.
        for annot_pos in ast.literal_eval(data.at[i, 'POSITIONS']):
            try:
                if keep[str(annot_pos)] != 'null':
                    posOnPDB[str(annot_pos)] = keep[str(annot_pos)]
            except KeyError:
                pass
        data.at[i, 'MATCHDICT'] = str(posOnPDB)
    data = data.drop(columns=['POSITIONS'])
    return data
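

# A minimal usage sketch, assuming `data` carries the columns read above
# ('uniprotID', 'pdbID', 'pos', 'wt', 'chain', 'POSITIONS'); the row values
# below are made up for illustration:
#
#   sample = pd.DataFrame([{'uniprotID': 'P68871', 'pdbID': '1A3N', 'pos': 7,
#                           'wt': 'E', 'chain': 'A', 'POSITIONS': '[5, 7, 20]'}])
#   mapped = pdbMapping(sample, '/tmp/sifts')
#   # adds 'pdbinfo', 'AAonPDB', 'PDB_ALIGN_STATUS' and 'MATCHDICT' columns
#   # and drops 'POSITIONS'.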


def processAnnotation(annot_positions):
    # Normalise a stringified position list, e.g. "['5', '12']" -> ['5', '12'].
    annot_positions = str(annot_positions).replace("'", '').replace('[', '').replace(']', '')
    positionList_perAnnotation = [h.strip() for h in annot_positions.split(',')]
    return positionList_perAnnotation
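
# Example (behaviour of the helper above, with the return value assumed):
#   processAnnotation("['5', '12', '40']")  ->  ['5', '12', '40']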