Spaces:
Sleeping
Sleeping
File size: 5,615 Bytes
3810f27 |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import numpy as np
import pandas as pd
import os
import requests
import json
import tarfile, gzip
import time, glob
from utils import threeToOne
import streamlit as st
from pathlib import Path
import gzip
import shutil
import codecs
import io
def uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path):
    """
    Residue-wise mapping between UniProt and PDB numbering via SIFTS.

    Downloads the SIFTS XML archive for *pdb_id* from PDBe, extracts the
    per-residue ``<listResidue>`` records and pairs every UniProt residue
    with its PDB counterpart (kept only where the one-letter residue codes
    agree and the accession matches *uniprot_id*).

    Parameters
    ----------
    pdb_id : str
        Four-character PDB identifier.
    uniprot_id : str
        UniProt accession to map against.
    save_path : str
        Directory into which the downloaded SIFTS archive is extracted.

    Returns
    -------
    tuple[dict, dict]
        ``(ascaris, full_ascaris)``, both keyed by ``uniprot_id + chain``:
        *ascaris* maps UniProt residue numbers to PDB residue numbers;
        *full_ascaris* keeps the full ``SOURCE_ACC_NUM_RES[_CHAIN]`` strings.
    """
    ascaris = {}
    full_ascaris = {}
    res = requests.get(f'https://www.ebi.ac.uk/pdbe/download/api/pdb/entry/sifts?id={pdb_id}')
    url = json.loads(res.text)['url']
    response = requests.get(url, stream=True)
    # NOTE(review): extractall on a downloaded archive allows path traversal
    # for untrusted input; PDBe is trusted here, but consider tarfile's
    # `filter="data"` argument on Python >= 3.12.
    with tarfile.open(fileobj=response.raw, mode="r|gz") as tar:
        tar.extractall(path=save_path)  # produces <pdb_id>.xml.gz
    # The extracted member may be lower- or upper-case depending on the entry.
    try:
        with gzip.open(f'{save_path}/{pdb_id.lower()}.xml.gz', 'rt') as f:
            file_content = f.read()
    except FileNotFoundError:
        with gzip.open(f'{save_path}/{pdb_id}.xml.gz', 'rt') as f:
            file_content = f.read()
    content = file_content.split('\n')
    # Line indices of every <listResidue> / </listResidue> tag.
    index = [idx for idx, s in enumerate(content) if 'listResidue' in s]
    listResidues = []
    for ind in range(0, len(index), 2):
        # Fixed: the original had a bare `except:` whose body was the no-op
        # expression `IndexError`, silently swallowing *every* exception.
        try:
            if content[index[ind]].strip() == '<listResidue>' and \
                    content[index[ind + 1]].strip() == '</listResidue>':
                listResidues.append(content[index[ind]:index[ind + 1]])
        except IndexError:
            pass
    for true_content in listResidues:
        for sub_content in true_content:
            if f'dbAccessionId="{uniprot_id}"' in sub_content:
                content = [i.strip() for i in true_content]
                sel = [i for i in content if
                       ('<crossRefDb dbSource="PDB"' in i or '<crossRefDb dbSource="UniProt"' in i)]
                matching_dict = {}
                if len(sel) % 2 == 0:  # PDB/UniProt cross-refs come in pairs
                    dbAccessionId = [i.split('dbAccessionId')[1].split(' ')[0].split('=')[1].strip('"').upper() for i
                                     in sel]
                    dbSource = [i.split('dbSource')[1].split(' ')[0].split('=')[1].strip('"').upper() for i in sel]
                    dbResNum = [i.split('dbResNum')[1].split(' ')[0].split('=')[1].strip('"') for i in sel]
                    dbResName = [i.split('dbResName')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"') for i in
                                 sel]
                    # Chain id exists only on the PDB cross-ref lines.
                    dbChainName = [i.split('dbChainId')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"') for i
                                   in sel if 'crossRefDb dbSource="PDB' in i]
                    # Even indices hold the PDB record, odd indices the UniProt
                    # record of the same residue.  Fixed off-by-one: the old
                    # `range(1, len(dbAccessionId) - 1, 2)` dropped the last
                    # residue pair of every chain.
                    for k, j, m in zip(range(0, len(dbAccessionId), 2),
                                       range(1, len(dbAccessionId), 2),
                                       range(len(dbChainName))):
                        if dbResName[j] == threeToOne(dbResName[k]) and dbAccessionId[j] == uniprot_id:
                            matching_dict[
                                dbSource[j] + '_' + dbAccessionId[j] + '_' + dbResNum[j] + '_' + dbResName[j]] = \
                                dbSource[k] + '_' + dbAccessionId[k] + '_' + dbResNum[k] + '_' + threeToOne(
                                    dbResName[k]) + '_' + dbChainName[m]
                # UniProt residue number -> PDB residue number.
                only_residues = {k.split('_')[2]: v.split('_')[2] for k, v in matching_dict.items()}
                # Keep only the first mapping seen per accession+chain key.
                for k, v in matching_dict.items():
                    if v.split('_')[1] + v.split('_')[-1] not in ascaris.keys():
                        ascaris[v.split('_')[1] + v.split('_')[-1]] = only_residues
                for k, v in matching_dict.items():
                    if v.split('_')[1] + v.split('_')[-1] not in full_ascaris.keys():
                        full_ascaris[v.split('_')[1] + v.split('_')[-1]] = matching_dict
    return ascaris, full_ascaris
import ast
def pdbMapping(data, save_path):
    """
    Annotate each variant row with its UniProt -> PDB residue mapping.

    For every row, the SIFTS mapping for (uniprotID, pdbID) is fetched via
    ``uniprot_pdb_residue_mapping`` and the following columns are written
    in-place:

    * ``pdbinfo``          -- pdbID concatenated with the chain identifier
    * ``AAonPDB``          -- one-letter residue found on the PDB chain at ``pos``
                              (NaN when no mapping exists for that position)
    * ``PDB_ALIGN_STATUS`` -- 'aligned' when AAonPDB equals ``wt``, else 'notAligned'
    * ``MATCHDICT``        -- str(dict) of annotation positions -> PDB positions

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain 'uniprotID', 'pdbID', 'pos', 'wt', 'chain' and
        'POSITIONS' (a string literal of a position list) columns.
    save_path : str
        Directory used for the downloaded SIFTS files.

    Returns
    -------
    pandas.DataFrame
        The same frame with the columns above filled and 'POSITIONS' dropped.
    """
    for i in data.index:
        posOnPDB = {}
        uniprot_id = data.at[i, 'uniprotID']
        pdb_id = data.at[i, 'pdbID']
        pos = data.at[i, 'pos']
        wt = data.at[i, 'wt']
        data.at[i, 'AAonPDB'] = np.nan  # np.NaN was removed in NumPy 2.0
        data.at[i, 'pdbinfo'] = pdb_id + data.at[i, 'chain']
        allMatchesForDP, full_ascaris = uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path)
        # Find the PDB residue sitting at the variant's UniProt position.
        for key, val in full_ascaris[data.at[i, 'pdbinfo']].items():
            if int(key.split('_')[2]) == int(pos):
                data.at[i, 'AAonPDB'] = val.split('_')[3]
                break
        if data.at[i, 'AAonPDB'] == wt:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'aligned'
        else:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'notAligned'
        keep = allMatchesForDP[data.at[i, 'pdbinfo']]
        # Distinct loop variable: the original reused `pos`, shadowing the
        # variant position read above within the same iteration.
        for annot_pos in ast.literal_eval(data.at[i, 'POSITIONS']):
            try:
                if keep[str(annot_pos)] != 'null':
                    posOnPDB[str(annot_pos)] = keep[str(annot_pos)]
            except KeyError:
                # Annotation position has no PDB counterpart; skip it.
                pass
        data.at[i, 'MATCHDICT'] = str(posOnPDB)
    data = data.drop(columns=['POSITIONS'])
    return data
def processAnnotation(annot_positions):
annot_positions = str(annot_positions).replace("'", '')
annot_positions = str(annot_positions).replace('[', '')
annot_positions = str(annot_positions).replace("]", '')
positionList_perAnnotation = annot_positions.split(',')
positionList_perAnnotation = [h.strip() for h in positionList_perAnnotation] |