fatmacankara committed on
Commit
3810f27
1 Parent(s): b24bdaf

Create pdbMapping.py

Browse files
Files changed (1) hide show
  1. code/pdbMapping.py +129 -0
code/pdbMapping.py ADDED
@@ -0,0 +1,129 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import numpy as np
2
+ import pandas as pd
3
+ import os
4
+ import requests
5
+ import json
6
+ import tarfile, gzip
7
+ import time, glob
8
+ from utils import threeToOne
9
+ import streamlit as st
10
+ from pathlib import Path
11
+ import gzip
12
+ import shutil
13
+ import codecs
14
+ import io
15
+
16
+ def uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path):
17
+
18
+ """
19
+ This code does residue-wise mapping between UniProt and PDB residues.
20
+ """
21
+ ascaris = {}
22
+ full_ascaris = {}
23
+
24
+ res = requests.get(f'https://www.ebi.ac.uk/pdbe/download/api/pdb/entry/sifts?id={pdb_id}')
25
+ url = json.loads(res.text)['url']
26
+ response = requests.get(url, stream=True)
27
+
28
+ file = tarfile.open(fileobj=response.raw, mode="r|gz")
29
+ file.extractall(path=save_path) # Creates another gz file
30
+
31
+
32
+ existing_pdb = list(Path(save_path).glob("*"))
33
+ existing_pdb = [str(i) for i in existing_pdb]
34
+
35
+ try:
36
+ with gzip.open(f'{save_path}/{pdb_id.lower()}.xml.gz', 'rt') as f:
37
+ file_content = f.read()
38
+ except FileNotFoundError:
39
+ with gzip.open(f'{save_path}/{pdb_id}.xml.gz', 'rt') as f:
40
+ file_content = f.read()
41
+ content = file_content.split('\n')
42
+ index = [idx for idx, s in enumerate(content) if 'listResidue' in s]
43
+ listResidues = []
44
+ for ind in range(0, len(index), 2):
45
+ try:
46
+ if ((content[index[ind]]).strip() == '<listResidue>') & (
47
+ (content[index[ind + 1]]).strip() == '</listResidue>'):
48
+ listResidues.append(content[index[ind]:index[ind + 1]])
49
+ except:
50
+ IndexError
51
+ for true_content in listResidues:
52
+ for sub_content in true_content:
53
+ if f'dbAccessionId="{uniprot_id}"' in sub_content:
54
+ content = [i.strip() for i in true_content]
55
+ sel = [i for i in content if
56
+ ('<crossRefDb dbSource="PDB"' in i or '<crossRefDb dbSource="UniProt"' in i)]
57
+ matching_dict = {}
58
+ if len(sel) % 2 == 0: # if correct residues
59
+ dbAccessionId = [i.split('dbAccessionId')[1].split(' ')[0].split('=')[1].strip('"').upper() for i
60
+ in sel]
61
+ dbSource = [i.split('dbSource')[1].split(' ')[0].split('=')[1].strip('"').upper() for i in sel]
62
+ dbResNum = [i.split('dbResNum')[1].split(' ')[0].split('=')[1].strip('"') for i in sel]
63
+ dbResName = [i.split('dbResName')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"') for i in
64
+ sel]
65
+ dbChainName = [i.split('dbChainId')[1].split(' ')[0].split('=')[1].split('/')[0].strip('"') for i
66
+ in sel if 'crossRefDb dbSource="PDB' in i]
67
+
68
+ for k, j, m in zip(range(0, len(dbAccessionId), 2), range(1, len(dbAccessionId) - 1, 2), range(len(dbChainName))):
69
+ # try:
70
+ if dbResName[j] == threeToOne(dbResName[k]) and dbAccessionId[j] == uniprot_id:
71
+ matching_dict[
72
+ dbSource[j] + '_' + dbAccessionId[j] + '_' + dbResNum[j] + '_' + dbResName[j]] = \
73
+ dbSource[k] + '_' + dbAccessionId[k] + '_' + dbResNum[k] + '_' + threeToOne(
74
+ dbResName[k]) + '_' + dbChainName[m]
75
+ # except:
76
+ # KeyError
77
+
78
+ only_residues = {k.split('_')[2]: v.split('_')[2] for k, v in matching_dict.items()}
79
+ for k, v in matching_dict.items():
80
+ if v.split('_')[1] + v.split('_')[-1] not in ascaris.keys():
81
+ ascaris[v.split('_')[1] + v.split('_')[-1]] = only_residues
82
+ for k, v in matching_dict.items():
83
+ if v.split('_')[1] + v.split('_')[-1] not in full_ascaris.keys():
84
+ full_ascaris[v.split('_')[1] + v.split('_')[-1]] = matching_dict
85
+
86
+ return ascaris ,full_ascaris
87
+
88
+ import ast
def pdbMapping(data, save_path):
    """
    Annotate each variant row with its UniProt→PDB residue mapping.

    For every row, fetches/parses the SIFTS mapping for the row's
    (pdbID, uniprotID) pair, records the PDB residue at the variant
    position ('AAonPDB'), flags whether it matches the expected wild-type
    residue ('PDB_ALIGN_STATUS'), and stores a UniProt→PDB position
    dictionary for the row's annotation positions ('MATCHDICT').

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain columns 'uniprotID', 'pdbID', 'pos', 'wt', 'chain'
        and 'POSITIONS' (a stringified list of annotation positions).
    save_path : str
        Directory handed to uniprot_pdb_residue_mapping for downloads.

    Returns
    -------
    pandas.DataFrame
        The input frame with mapping columns added and 'POSITIONS' dropped.
    """
    for i in data.index:
        posOnPDB = {}
        uniprot_id = data.at[i, 'uniprotID']
        pdb_id = data.at[i, 'pdbID']
        pos = data.at[i, 'pos']
        wt = data.at[i, 'wt']
        data.at[i, 'AAonPDB'] = np.NaN
        data.at[i, 'pdbinfo'] = pdb_id + data.at[i, 'chain']
        allMatchesForDP, full_ascaris = uniprot_pdb_residue_mapping(pdb_id, uniprot_id, save_path)

        # Find the one-letter PDB residue sitting at the variant position.
        for key, val in full_ascaris[data.at[i, 'pdbinfo']].items():
            if int(key.split('_')[2]) == int(pos):
                data.loc[i, 'AAonPDB'] = val.split('_')[3]
                break

        if data.at[i, 'AAonPDB'] == wt:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'aligned'
        else:
            data.at[i, 'PDB_ALIGN_STATUS'] = 'notAligned'

        # Collect UniProt→PDB residue-number pairs for every annotated
        # position of this row. (Loop variable renamed: it previously
        # shadowed the variant position 'pos' above.)
        keep = allMatchesForDP[data.at[i, 'pdbinfo']]
        for annot_pos in ast.literal_eval(data.at[i, 'POSITIONS']):
            try:
                if keep[str(annot_pos)] != 'null':
                    posOnPDB[str(annot_pos)] = keep[str(annot_pos)]
            except KeyError:
                # Position has no PDB counterpart — best-effort, skip it.
                pass

        data.at[i, 'MATCHDICT'] = str(posOnPDB)

    data = data.drop(columns=['POSITIONS'])
    return data
122
+
123
+
def processAnnotation(annot_positions):
    """
    Normalize an annotation-position field into a list of position tokens.

    Accepts a stringified list such as "['1', '5', '12']" (or any value
    whose str() representation looks like one) and returns the cleaned,
    comma-separated tokens.

    Parameters
    ----------
    annot_positions : object
        Raw annotation positions; converted with str() before cleaning.

    Returns
    -------
    list of str
        Position tokens with quotes, brackets and surrounding whitespace
        removed.
    """
    # Single C-level pass removes quotes and brackets (instead of three
    # chained .replace() calls). The original also computed this list but
    # never returned it — the missing return is restored here.
    cleaned = str(annot_positions).translate(str.maketrans('', '', "'[]"))
    return [h.strip() for h in cleaned.split(',')]