mishtert committed on
Commit
32da6be
1 Parent(s): ac27aa9

Upload dtxutils.py

Browse files
Files changed (1) hide show
  1. dtxutils.py +343 -0
dtxutils.py ADDED
@@ -0,0 +1,343 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.pharmap_utils.meshutils import nct_to_mesh_term, mesh_term_to_id, df_mesh, df_mesh_ct
2
+ from utils.pharmap_utils.cid import CaseInsensitiveDict
3
+ from utils.pharmap_utils.dictutils import *
4
+ import re
5
+ import streamlit as st
6
+
7
+
8
+ # mesh list extract
9
def meshtrm_lst_xtract(nct_value):
    """Return the MeSH terms stored for ``nct_value`` as a list.

    The value is looked up in the imported ``nct_to_mesh_term`` mapping and
    converted with ``list()``.

    Args:
        nct_value: Key to look up (presumably an NCT identifier string --
            confirm against how ``nct_to_mesh_term`` is built).

    Returns:
        A new list built from the mapped value, or ``None`` when the key is
        missing/unhashable or the mapped value cannot be iterated.
    """
    try:
        return list(nct_to_mesh_term[nct_value])
    except (KeyError, TypeError):
        # Best-effort lookup: an unknown identifier is an expected outcome, so
        # signal "nothing found" instead of raising. Unlike the previous bare
        # ``except``, unrelated errors are no longer hidden.
        return None
16
+
17
+
18
# NOTE(review): ``st.cache`` is deprecated in recent Streamlit releases in
# favour of ``st.cache_data`` -- confirm the pinned Streamlit version before
# switching decorators.
@st.cache(suppress_st_warning=True, allow_output_mutation=True)
def type_extract(mesh_term_list):
    """Map MeSH terms to the names found for their 3-character number prefixes.

    The lookup runs in three steps:

    1. Each term is translated to a unique id through ``mesh_term_to_id``.
    2. For each id, the first matching ``mesh_number`` cell of ``df_mesh``
       (matched on the ``ui`` column) is read. A cell may hold several
       comma-separated numbers; every one of them is kept.
    3. The first three characters of each number are looked up again in
       ``df_mesh['mesh_number']`` and the matching ``name`` is collected.

    Terms, ids or prefixes that cannot be resolved are skipped silently.

    Args:
        mesh_term_list: A single term (``str``), an iterable of terms, or
            ``None``.

    Returns:
        A list of distinct names in arbitrary order; empty when nothing could
        be resolved or the input is ``None``.
    """
    if mesh_term_list is None:
        return []
    if isinstance(mesh_term_list, str):
        mesh_term_list = [mesh_term_list]

    # Step 1: term -> unique id. Unknown (or unhashable) terms are skipped.
    uid_lst = []
    for term in mesh_term_list:
        try:
            uid_lst.append(mesh_term_to_id[term])
        except (KeyError, TypeError):
            continue

    # Step 2: unique id -> mesh numbers. The previous implementation only ever
    # split the *first* collected entry and could discard entries gathered for
    # later ids; here every cell is split on its own.
    mesh_numbers = []
    for uid in uid_lst:
        try:
            raw = df_mesh.loc[df_mesh['ui'] == uid, 'mesh_number'].iloc[0]
        except (IndexError, KeyError):
            # No row for this id (or the column is unavailable): skip it.
            continue
        if not isinstance(raw, str):
            # e.g. a missing value; it cannot be split or sliced.
            continue
        mesh_numbers.extend(
            part.strip() for part in raw.split(',') if part.strip()
        )

    # Step 3: 3-character prefix -> name. Prefixes are de-duplicated first so
    # each one triggers a single DataFrame scan.
    l2_names = set()
    for prefix in {number[:3] for number in mesh_numbers}:
        try:
            l2_names.add(
                df_mesh.loc[df_mesh['mesh_number'] == prefix, 'name'].iloc[0]
            )
        except (IndexError, KeyError):
            continue

    return list(l2_names)
75
+
76
+
77
def split_values(col_val):
    """Re-join a delimited value using ``', '`` as the only separator.

    The value is converted with ``str()`` and checked against a fixed list of
    separators. When several separators occur, only the one that appears
    *last in that list* is used for the split; the pieces are then joined
    with ``', '``.

    Args:
        col_val: The value to normalise. ``None`` and ``''`` are returned
            unchanged.

    Returns:
        The ``', '``-joined string, or ``col_val`` unchanged when it is
        ``None``, empty, or contains none of the separators. (Previously a
        value without any separator fell through and produced ``None``, and a
        ``None`` input raised ``TypeError``.)
    """
    if col_val is None or col_val == '':
        return col_val

    # Order matters: the last separator of this list found in the text wins.
    separators = ['|', ',', '/', '.', ';', './', ',/', '/ ', ' /']
    colstring = str(col_val)
    found = [sep for sep in separators if sep in colstring]
    if not found:
        return col_val
    return ', '.join(colstring.split(found[-1]))
99
+
100
+
101
def map_entry_terms(myText):
    """Replace known entry terms in ``myText`` and split the result on ``/``.

    Every key of ``entry_dict`` that occurs in the text (case-insensitively,
    and not directly adjacent to another word character) is substituted by
    its mapped value, looked up through a ``CaseInsensitiveDict``.

    Args:
        myText: The text to rewrite.

    Returns:
        The rewritten text, stripped of surrounding whitespace and split on
        ``'/'`` into a list of strings.
    """
    lookup = CaseInsensitiveDict(entry_dict)
    alternatives = '|'.join(re.escape(term) for term in lookup.keys())
    matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

    def _substitute(match):
        # The matched text may differ in case from the stored key; the
        # case-insensitive mapping resolves it.
        return lookup[match.group()]

    replaced = matcher.sub(_substitute, myText)
    return replaced.strip().split('/')
107
+
108
+
109
def remove_none(some_list):
    """Return ``some_list`` without its ``None`` entries.

    Args:
        some_list: An iterable of values, a single string (treated as a
            one-element list), or ``None``.

    Returns:
        A new list with every ``None`` removed, or ``None`` when the input
        itself is ``None``.
    """
    if some_list is None:
        return None
    if isinstance(some_list, str):
        some_list = [some_list]
    # Identity check instead of ``!= None`` so objects with a custom
    # ``__ne__`` cannot distort the filter.
    return [item for item in some_list if item is not None]
114
+
115
+
116
def retain_all_ta(some_list):
    """Collapse the input to ``['all_ta']`` when it contains ``'all_ta'``.

    Args:
        some_list: A collection of tags, a single string (treated as a
            one-element list), or ``None``.

    Returns:
        ``['all_ta']`` if that tag is present, otherwise the (possibly
        wrapped) input unchanged; ``None`` when the input is ``None``.
    """
    marker = 'all_ta'
    if isinstance(some_list, str):
        some_list = [some_list]
    if some_list is None:
        return None
    return [marker] if marker in some_list else some_list
127
+
128
+
129
def unique_list(l):
    """Strip each string in ``l`` and drop duplicates, keeping first-seen order.

    Args:
        l: An iterable of strings.

    Returns:
        A list of the stripped strings with later duplicates removed.
    """
    # dict keys preserve insertion order, giving an ordered de-duplication in
    # linear time instead of a side-effect comprehension with repeated list
    # membership scans.
    return list(dict.fromkeys(map(str.strip, l)))
135
+
136
+
137
def split_for_type_extract(my_list, char):
    """Keep only the part of each string that precedes ``char``.

    Args:
        my_list: An iterable of strings, a single string (treated as a
            one-element list), or ``None``.
        char: Separator passed to ``str.split``.

    Returns:
        A list holding the first split segment of every item, or ``None``
        when the input is ``None`` or any item cannot be split (non-string
        item, non-iterable input, empty separator, or no segment produced).
    """
    if my_list is None:
        return None
    if isinstance(my_list, str):
        my_list = [my_list]
    try:
        return [item.split(char)[0] for item in my_list]
    except (AttributeError, TypeError, ValueError, IndexError):
        # Best-effort contract kept from the original: malformed input yields
        # None rather than an exception, but only for the failures that
        # splitting can actually produce.
        return None
148
+
149
+
150
def special_ask(col_value):
    """Return the hard-coded tag list for a handful of special conditions.

    The comparison is case-insensitive and requires an exact match of the
    whole value.

    Args:
        col_value: Condition text; must support ``.lower()``.

    Returns:
        A one-element list with the tag for a recognised condition, or
        ``None`` for anything else.
    """
    tag_by_condition = {
        'obesity': 'met',
        'healthy subject': 'all_ta',
        'healthy subjects': 'all_ta',
        'healthy participants': 'all_ta',
        'healthy participant': 'all_ta',
        'inflammation': 'ai',
    }
    tag = tag_by_condition.get(col_value.lower())
    if tag is None:
        return None
    # ``split()`` yields a fresh list on every call.
    return tag.split()
172
+
173
+
174
def remove_stopwords(query):
    """Remove a fixed set of stopwords and stop phrases from ``query``.

    The query is split on whitespace and compared case-insensitively. Stop
    phrases made of several words (``'diseases of the'``, ``'- 19'``) are
    matched as consecutive tokens; previously they were compared against
    single tokens and therefore never matched.

    Args:
        query: Text to clean, or ``None``.

    Returns:
        The remaining words joined by single spaces; ``''`` when ``query`` is
        ``None`` (the original ``else`` branch evaluated ``''`` without
        returning it).
    """
    stopwords = ['acute-on-chronic', 'acute', 'chronic',
                 'diseases of the', '-19', '- 19', '19', '.']
    if query is None:
        return ''

    # Represent every stopword as a tuple of tokens; try longer phrases first
    # so that '- 19' wins over the single token '19'.
    stop_sequences = sorted(
        (tuple(entry.split()) for entry in stopwords), key=len, reverse=True
    )

    words = query.split()
    lowered = [word.lower() for word in words]
    kept = []
    index = 0
    while index < len(words):
        for sequence in stop_sequences:
            if tuple(lowered[index:index + len(sequence)]) == sequence:
                index += len(sequence)
                break
        else:
            kept.append(words[index])
            index += 1
    return ' '.join(kept)
184
+
185
+
186
def gb_2_us(text, mydict):
    """Replace British spellings in ``text`` with their US equivalents.

    Args:
        text: The string to convert.
        mydict: Mapping whose keys are US spellings and whose values are the
            British spellings to replace.

    Returns:
        The converted text, or ``''`` when ``text`` or ``mydict`` does not
        support the required operations (for example when either is
        ``None``).
    """
    try:
        for us, gb in mydict.items():
            text = text.replace(gb, us)
    except (AttributeError, TypeError):
        # Same fallback value as before, but only for unusable input rather
        # than for every possible exception.
        return ''
    return text
193
+
194
+
195
def fix_text_with_dict(text, mydict):
    """Replace each ``', '``-separated item of ``text`` using ``mydict``.

    Items without an entry in the mapping are kept as they are. The result is
    joined with ``','`` (no space), as before.

    Args:
        text: String of items separated by ``', '``.
        mydict: Replacement mapping. The original body ignored this argument
            and always used the module-level ``repl_dict``; that mapping is
            still used as the fallback when ``mydict`` is ``None``.

    Returns:
        The ``','``-joined string of replaced items.
    """
    mapping = repl_dict if mydict is None else mydict
    return ','.join(mapping.get(part, part) for part in text.split(', '))
198
+
199
+
200
def replace_text(mytext):
    """Map free text to a canonical label using ordered keyword rules.

    The text is lower-cased once and tested against each rule in order; a
    rule fires when any of its keywords occurs as a *substring*. The first
    rule that fires decides the result, so rule order is significant.

    Args:
        mytext: Text to classify.

    Returns:
        The label of the first matching rule; ``mytext`` unchanged when no
        rule matches; ``None`` when ``mytext`` is falsy (``None`` or ``''``);
        ``''`` when ``mytext`` is truthy but not string-like.
    """
    # (keywords, label) pairs in the original evaluation order. The second,
    # unreachable 'parkinson' check of the original has been dropped.
    rules = (
        (('cancer', 'neoplasm', 'carcinoma', 'lymphoma', 'adenoma', 'myoma',
          'meningioma', 'malignancy', 'tumor', 'malignancies', 'chemotherapy'),
         'neoplasms'),
        (('heart failure', 'cardiac'), 'cardiovascular diseases'),
        (('covid',), 'covid-19'),
        (('prostatectomy',), 'urogenital surgical procedures'),
        (('transplant',), 'body regions'),
        (('healthy',), 'healthy volunteers'),
        (('allergy', 'allergic'), 'immune system diseases'),
        (('parkinson',), 'parkinson disease'),
        (('virus',), 'viruses'),
        (('cornea', 'eye', 'ocular', 'macular'), 'eye diseases'),
        (('vaccines',), 'vaccines'),
        (('ureter',), 'ureter'),
        (('mutation',), 'mutation'),
        (('stem cells',), 'stem cells'),
        (('behavior', 'depressive', 'depression', 'anxiety', 'satisfaction',
          'grief'), 'behavior'),
        (('molar', 'dental', 'maxillary'), 'molar'),
        (('diet',), 'diet'),
        (('biopsy',), 'biopsy'),
        (('physiology',), 'physiology'),
        (('infection', 'bacteremia', 'fungemia'), 'infections'),
        (('pregnancy', 'pregnant', 'labor', 'birth'),
         'reproductive and urinary physiological phenomena'),
        (('x-ray', 'imaging', 'mri'), 'diagnosis'),
        (('surgery',), 'medicine'),
        (('angina',), 'angina pectoris'),
        (('use disorder', 'obsessive', 'panic', 'posttraumatic stress',
          'post-traumatic stress', 'schizophrenia'), 'mental disorders'),
    )

    if not mytext:
        return None
    try:
        lowered = mytext.lower()
        for keywords, label in rules:
            if any(keyword in lowered for keyword in keywords):
                return label
    except (AttributeError, TypeError):
        # Non-string input: keep the original fallback value.
        return ''
    return mytext
314
+
315
+
316
+ # For studies in CTgov
317
def is_nct(col_value):
    """Resolve a value starting with ``'NCT'`` through the trial tables.

    When ``col_value`` starts with ``'NCT'`` and is present in
    ``df_mesh_ct.values``, its MeSH terms are fetched with
    ``meshtrm_lst_xtract`` and passed to ``type_extract``.

    Args:
        col_value: Candidate identifier. Non-string values (including
            ``None``, which previously raised ``TypeError``) are treated as
            "not an NCT id".

    Returns:
        The result of ``type_extract`` for a known identifier, otherwise
        ``None``.
    """
    if not isinstance(col_value, str) or not col_value.startswith('NCT'):
        return None
    try:
        if col_value in df_mesh_ct.values:
            return type_extract(meshtrm_lst_xtract(col_value))
    except Exception:
        # Deliberate best-effort boundary kept from the original: a failed
        # lookup must not break the caller. ``Exception`` (rather than a bare
        # ``except``) lets KeyboardInterrupt/SystemExit propagate.
        pass
    # NOTE(review): the original contained the bare string 'Study Not in
    # Database, Please enter condition or conditions treated' here; it was
    # never returned or displayed. Callers still receive None.
    return None
331
+
332
+
333
+ # For studies not in CTgov
334
def is_not_nct(col_value):
    """Run ``type_extract`` directly on a condition value.

    Args:
        col_value: A term or collection of terms to hand to
            ``type_extract``, or ``None``.

    Returns:
        Whatever ``type_extract`` returns for ``col_value``, or ``None`` when
        ``col_value`` is ``None``.
    """
    if col_value is None:
        return None
    return type_extract(col_value)