fatmacankara committed on
Commit
771222f
1 Parent(s): f9b9a8c

Update code/modbaseModelAdd.py

Browse files
Files changed (1) hide show
  1. code/modbaseModelAdd.py +0 -149
code/modbaseModelAdd.py CHANGED
@@ -1,152 +1,3 @@
1
- # import requests
2
- # import numpy as np
3
- # import pandas as pd
4
- # from utils import *
5
- # from pathlib import Path
6
- # from bs4 import BeautifulSoup
7
- # from add_sasa import *
8
- # def addModbaseModels(dataframe, path_to_input_files, path_to_output_files):
9
- # if len(dataframe) != 0:
10
- # # GET MODBASE MODELS
11
- # # Get IDs from data to retrieve only their models from MODBASE
12
- # dataframe.reset_index(inplace=True, drop=True)
13
- # existing_modbase_models = list(Path(path_to_output_files / 'modbase_structures').glob("*"))
14
- # existing_modbase_models = [str(i) for i in existing_modbase_models]
15
- # existing_modbase_models = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models]
16
-
17
- # existing_modbase_models_ind = list(Path(path_to_output_files / 'modbase_structures_individual').glob("*"))
18
- # existing_modbase_models_ind = [str(i) for i in existing_modbase_models_ind]
19
- # existing_modbase_models_ind = [i.split('/')[-1].split('.')[0] for i in existing_modbase_models_ind]
20
-
21
- # modbase_reduced = pd.DataFrame(columns = ['uniprotID', 'target_begin', 'target_end', 'quality_score',
22
- # 'model_id', 'coordinates','AAonPDB', 'coordVAR'])
23
- # print('Retrieving ModBase models...\n')
24
- # modbase = pd.DataFrame(
25
- # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
26
- # 'coordinates', 'AAonPDB', 'coordVAR'])
27
- # no_modbase = pd.DataFrame(
28
- # columns=['uniprotID', 'target_begin', 'target_end', 'quality_score', 'model_id',
29
- # 'coordinates', 'AAonPDB', 'coordVAR'])
30
- # # Get model files associated with each UniProtID
31
- # existing_free_sasa = list(Path(path_to_output_files / 'freesasa_files').glob("*"))
32
- # existing_free_sasa = [str(i) for i in existing_free_sasa]
33
- # existing_free_sasa = [i.split('/')[-1].split('.')[0] for i in existing_free_sasa]
34
- # keep_cols = dataframe.columns
35
- # for i in dataframe.index:
36
- # coordDict = {}
37
- # protein = dataframe.at[i, 'uniprotID']
38
- # varPos = int(dataframe.at[i, 'pos'])
39
- # wt = dataframe.at[i, 'wt']
40
- # mut = dataframe.at[i, 'mut']
41
- # datapoint = dataframe.at[i, 'datapoint']
42
-
43
- # if protein not in existing_modbase_models:
44
- # print('Downloading Modbase models for ', protein)
45
- # url = 'https://salilab.org/modbase/retrieve/modbase/?databaseID=' + protein
46
- # req = requests.get(url)
47
- # name = path_to_output_files / 'modbase_structures' / f'{protein}.txt'
48
- # with open(name, 'wb') as f:
49
- # f.write(req.content)
50
- # else:
51
- # print('Model exists for', protein)
52
- # name = Path(path_to_output_files / 'modbase_structures' / f'{protein}.txt')
53
-
54
- # with open(name, encoding="utf8") as f:
55
- # a = open(name, 'r').read()
56
- # soup = BeautifulSoup(a, 'lxml')
57
- # if soup.findAll('pdbfile') != []:
58
- # for pdb in soup.findAll('pdbfile'):
59
- # model_id = str(pdb.contents[1])[10:-11]
60
- # if model_id not in existing_modbase_models_ind:
61
- # with open(path_to_output_files / 'modbase_structures_individual' / f'{model_id}.txt', 'w', encoding="utf8") as individual:
62
- # individual.write(str('UniProt ID: ' + protein))
63
- # individual.write('\n')
64
- # individual.write(str(pdb.contents[3])[10:-11].strip())
65
- # run_freesasa(
66
- # Path(path_to_output_files / 'modbase_structures_individual' / f'{model_id.lower()}.txt'),
67
- # Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt'),
68
- # include_hetatms=True,
69
- # outdir=None, force_rerun=False, file_type='pdb')
70
- # filename = Path(path_to_output_files / 'freesasa_files' / f'{model_id.lower()}.txt')
71
- # st.write('filename', filename)
72
- # st.write('varPos', varPos)
73
- # st.write('wt', wt)
74
- # st.write('protein', protein)
75
- # st.write('path_to_output_files', path_to_output_files)
76
- # dataframe.loc[i, 'sasa'] = sasa(protein, varPos, wt, 1, filename, path_to_output_files, file_type='pdb')
77
- # st.write('sasa', dataframe.loc[i, 'sasa'] )
78
- # st.write('model_id', model_id)
79
- # with open(path_to_output_files / 'modbase_structures_individual'/ f'{model_id}.txt', encoding="utf8") as m:
80
-
81
- # lines = m.readlines()
82
- # quality_score = -999
83
- # for ind_line in lines:
84
- # if ind_line[0:10] == 'UniProt ID':
85
- # uniprot_id = ind_line.split(':')[1].strip()
86
- # if ind_line[0:23] == 'REMARK 220 TARGET BEGIN':
87
- # target_begin = ind_line[40:43].strip()
88
- # if ind_line[0:21] == 'REMARK 220 TARGET END':
89
- # target_end = ind_line[40:43].strip()
90
- # coordDict, AAonPDB, coordVAR = {},np.NaN,np.NaN
91
- # if (int(varPos) > int(target_begin)) & (int(varPos) < int(target_end)):
92
- # coordDict = {}
93
- # for ind_line in lines:
94
- # if ind_line[0:27] == 'REMARK 220 MODPIPE MODEL ID':
95
- # model_id = ind_line[40:].strip()
96
- # if ind_line[0:15].strip() == 'REMARK 220 MPQS':
97
- # quality_score = ind_line[40:].strip()
98
- # if ind_line[0:4] == 'ATOM' and ind_line[13:15] == 'CA':
99
- # position = int(ind_line[22:26].strip())
100
- # chain = ind_line[20:22].strip()
101
- # aminoacid = threeToOne(ind_line[17:20])
102
- # coords = [ind_line[31:38].strip(), ind_line[39:46].strip(), ind_line[47:54].strip()]
103
- # coordDict[position] = coords
104
- # if position == int(varPos):
105
- # AAonPDB = aminoacid
106
- # coordVAR = str(coords)
107
- # if ind_line[0:3] == 'TER':
108
- # break
109
- # try:
110
- # k = pd.Series(
111
- # [uniprot_id, target_begin, target_end,quality_score, model_id, coordDict, AAonPDB, coordVAR])
112
- # new_row = {'uniprotID': uniprot_id, 'target_begin': target_begin,
113
- # 'target_end': target_end, 'quality_score': quality_score,
114
- # 'model_id': model_id, 'coordinates': coordDict,
115
- # 'AAonPDB': AAonPDB, 'coordVAR': coordVAR}
116
- # modbase_reduced = modbase_reduced.append(new_row, ignore_index=True)
117
- # modbase_reduced = modbase_reduced[['uniprotID', 'quality_score', 'model_id', 'coordinates', 'AAonPDB', 'coordVAR']]
118
- # modbase = dataframe.merge(modbase_reduced, on='uniprotID', how='left')
119
- # modbase.quality_score = modbase.quality_score.astype(float)
120
- # modbase = modbase.sort_values(by=['datapoint', 'quality_score'], ascending=False)
121
- # modbase.reset_index(inplace=True, drop=True)
122
- # modbase.fillna(np.NaN, inplace=True)
123
- # modbase.replace({'\'?\', ': '',
124
- # ', \'?\'': '',
125
- # '(': '', ')': '',
126
- # '[\'?\']': np.NaN,
127
- # '[]': np.NaN,
128
- # 'nan-nan': np.NaN,
129
- # '': np.NaN}, inplace=True)
130
- # except NameError:
131
- # print('This file doesnt have Quality Score. Replacer: -999', model_id)
132
- # else:
133
- # new_row = {'uniprotID': uniprot_id, 'wt': wt,
134
- # 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
135
- # no_modbase = no_modbase.append(new_row, ignore_index=True)
136
-
137
- # else:
138
- # new_row = {'uniprotID': uniprot_id, 'wt': wt,
139
- # 'pos': varPos, 'mut': mut, 'datapoint': datapoint }
140
- # no_modbase = no_modbase.append(new_row, ignore_index=True)
141
-
142
-
143
-
144
- # no_modbase_no_Coord = modbase[pd.isna(modbase['coordVAR'])]
145
- # no_modbase = pd.concat([no_modbase, no_modbase_no_Coord])
146
- # modbase = modbase[~pd.isna(modbase['coordVAR'])]
147
- # no_modbase = no_modbase[keep_cols]
148
- # return modbase, no_modbase
149
-
150
  import requests
151
  import numpy as np
152
  import pandas as pd
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  import requests
2
  import numpy as np
3
  import pandas as pd