mishtert committed on
Commit
e067ea9
1 Parent(s): 10b8968

Upload summ_utils.py

Browse files
Files changed (1) hide show
  1. summ_utils.py +411 -0
summ_utils.py ADDED
@@ -0,0 +1,411 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from utils.pharmap_utils.cid import CaseInsensitiveDict
2
+ import re
3
+
4
+
5
+
6
+ ##==============================================================================
7
+ #load mapping terms
8
def _load_mapping(path):
    """Read a ``key,value``-per-line text file into a dict.

    Each line is right-stripped and split on its first comma only, so the
    value part may itself contain commas.  Blank lines are skipped: an empty
    line splits into a one-item list, which would make ``dict()`` raise
    ``ValueError`` and abort the import of this module.  A non-blank line
    without a comma still raises, so a malformed file is not silently
    accepted.
    """
    with open(path) as f:
        return dict(line.rstrip().split(',', 1) for line in f if line.strip())


mt_dict = _load_mapping('utils/summarize_utils/map_terms.txt')

##==============================================================================
#load number mapping terms to convert numbers in words appearing before weeks
# to number
num_dict = _load_mapping('utils/summarize_utils/map_nums.txt')
##==============================================================================
## load stop words
# Whitespace-separated tokens; the file is decoded with the 'unicode_escape'
# codec exactly as before.
with open('utils/summarize_utils/stopwords-en.txt','r',encoding='unicode_escape') as f:
    stopwords = f.read().split()
20
+ ##==============================================================================
21
def get_first_word(alloc,masking,status):
    """Return the opening words of the summary sentence.

    The article is 'An ' when ``alloc`` is 'n/a' and ``masking`` is
    'none (open label)' (both compared case-insensitively), otherwise 'A '.
    When ``status`` is exactly 'Active, not recruiting' or 'Recruiting' the
    lower-cased article is prefixed with 'It is in '.
    """
    print('Getting first word..')
    open_label_no_alloc = (alloc.lower() == 'n/a'
                           and masking.lower() == 'none (open label)')
    article = 'An ' if open_label_no_alloc else 'A '
    if status in ('Active, not recruiting', 'Recruiting'):
        return 'It is in ' + article.lower()
    return article
53
+
54
+ ##==============================================================================
55
+ #get masking type
56
def get_mask(masking):
    """Translate a masking value into the phrase used in the summary.

    'double', 'none (open label)' and 'quadruple' (case-insensitive, exact
    match) map to 'double-blind, ', 'open-label, ' and 'quadruple-blind, '.
    Any other string is returned unchanged.  A value without ``.lower()``
    (e.g. ``None``) yields ``None``, as before.

    The quadruple case used to be tested with ``in 'quadruple'``, a substring
    test, so '' or any fragment such as 'a' was reported as quadruple-blind.
    It is now an exact comparison like the other two.
    """
    print('Getting mask..')
    try:
        key = masking.lower()
    except AttributeError:
        # Non-string input: keep the historical "no result" contract.
        return None
    phrases = {
        'double': 'double-blind, ',
        'none (open label)': 'open-label, ',
        'quadruple': 'quadruple-blind, ',
    }
    return str(phrases.get(key, masking))
70
+ ##==============================================================================
71
+ #get study type
72
def get_stype(stype):
    """Return 'interventional study ' when ``stype`` is 'interventional'
    (case-insensitive); every other value yields 'observational study '.
    """
    print('Getting study type...')
    is_interventional = stype.lower() == 'interventional'
    return 'interventional study ' if is_interventional else 'observational study '
80
+ ##==============================================================================
81
+ # get intervention model
82
def get_imodel(imodel):
    """Return the lower-cased intervention model followed by ', '.

    Returns ``None`` when ``imodel`` is ``None``.  The previous guard tested
    ``imodel.lower() is not None``, which is always true for a string and
    raises ``AttributeError`` for ``None``, so the "no value" branch could
    never run.
    """
    print('Getting imodel...')
    if imodel is None:
        return None
    return imodel.lower() + ', '
89
+ ##==============================================================================
90
+ #get objective
91
def get_obj(otitle,bsumm, ddesc):
    """Extract an objective phrase from the title, summary or description.

    The three texts are checked in that order against a fixed keyword list,
    and the first keyword (in list order) found in a text selects the branch:

    * title (case-sensitive): the sentences containing the keyword are
      joined, the text from the keyword onwards is kept without its final
      character, and the result is passed through ``non_abbr``.  Returns
      ``None`` if that text cannot be located.
    * summary / description: detection runs on the lower-cased text, and the
      sentences containing the keyword are returned through ``non_abbr``
      (joined with ', ' for the summary and '' for the description).

    Returns 'No Objective Found' when no keyword occurs anywhere.

    Changes from the previous version: the sentence filter for the summary
    and description now compares against the lower-cased sentence, matching
    the lower-cased detection.  Before, a keyword present only in capitalised
    form was detected but then matched no sentence, so an empty string was
    returned.  The keyword is escaped before being placed in the regex, and
    the blanket ``except: pass`` is replaced by an explicit no-match check.
    """
    print('Getting objective for..')
    keywords = ['to Demonstrate',
                'to Evaluate',
                'to Investigate',
                'to Assess',
                'to Determine',
                'Placebo','Purpose','aim','purpose','main purpose',
                'Aim','Objective', 'objective', 'Main Objective', 'Selection Study',
                'Main Purpose', 'Main Aim','Study', 'STUDY', 'study',
                'Ascending Multiple-dose','Adaptive','Dose Escalation',
                'assess', 'Bioavailability','investigate','Investigating'
                ]

    otitle_result = [ele for ele in keywords if ele in otitle]
    print('otitle_result:', otitle_result)
    # Only the lower-case keywords can match below because the text is lowered.
    bsumm_result = [ele for ele in keywords if ele in bsumm.lower()]
    print('bsumm_result:', bsumm_result)
    ddesc_result = [ele for ele in keywords if ele in ddesc.lower()]
    print('ddesc_result:',ddesc_result)

    if otitle_result:
        print('im in otitle')
        word = otitle_result[0]
        print('word in otitle:', word)
        matched = [sentence + '.' for sentence in otitle.split('. ') if word in sentence]
        sobj = ''.join(matched)
        print('matched sobj',sobj)
        # The trailing '.' is a regex wildcard: it consumes the last character
        # of sobj, which is the period appended above.
        result = re.search(re.escape(word) + '(.*).', sobj)
        print('result of pattern search:',result)
        if result is None:
            # E.g. the keyword is directly followed by a line break.
            return None
        result = word + result.group(1)
        print('result group:',result)
        result = non_abbr(result)
        print('non-abbr result:',result)
        return result

    if bsumm_result:
        print('im in bsumm')
        word = bsumm_result[0]
        matched = [sentence + '.' for sentence in bsumm.split('. ')
                   if word in sentence.lower()]
        return non_abbr(', '.join(matched))

    if ddesc_result:
        word = ddesc_result[0]
        matched = [sentence + '.' for sentence in ddesc.split('. ')
                   if word in sentence.lower()]
        # NOTE(review): joined without a separator, unlike the summary
        # branch above; confirm whether ', ' was intended here as well.
        return non_abbr(''.join(matched))

    return 'No Objective Found'
154
+
155
+ ##==============================================================================
156
+ # other study id extract
157
def get_osid(osid,sid):
    """Format the '|'-separated study identifiers as a parenthesised list.

    * both given   -> '(<osid parts joined by "; ">; <sid parts joined by ", ">) '
    * only ``osid`` -> '(<osid parts joined by "; ">) '
    * only ``sid``  -> '(<sid parts joined by "; ">) '
    * neither       -> ``None``

    ``None`` and the empty string both count as "not given".  Previously an
    empty ``osid`` combined with a real ``sid`` produced '(; X) ', and the
    single-identifier cases were not handled reliably.
    """
    print('Getting Study Ids...')
    if osid and sid:
        # NOTE(review): sid parts are joined with ', ' here but with '; '
        # when sid stands alone; kept as-is, confirm which one is intended.
        return '(' + '; '.join(osid.split('|')) + '; ' + ', '.join(sid.split('|')) + ') '
    if osid:
        return '(' + '; '.join(osid.split('|')) + ') '
    if sid:
        return '(' + '; '.join(sid.split('|')) + ') '
    return None
175
+
176
+ ##==============================================================================
177
+ # get locations
178
def join_and(items):
    """Join ``items`` with ', ', placing ', and ' before the last item.

    Sequences with fewer than two items are joined with ', ' alone.
    """
    if len(items) <= 1:
        return ', '.join(items)
    leading, final = items[:-1], items[-1]
    return ', '.join(leading) + ', and ' + final
183
+
184
def get_locs(locations):
    """Turn a '|'-separated location string into ' in <list>, '.

    Duplicate entries are dropped and the remainder sorted before being
    joined by ``join_and``.  A value without '|' is used as-is.  An empty
    result is returned unchanged (no ' in ' prefix).
    """
    print('Getting Locations...')
    print(locations)
    print(len(locations))
    if locations == '':
        res = locations
        print('outside location split else:', res)
    else:
        print('location is not empty')
        if '|' not in locations:
            res = locations
            print('inside location split else:', res)
        else:
            unique_sorted = sorted(set(locations.split('|')))
            res = join_and(unique_sorted)
            print('inside location split if:', res)
    if res != '':
        res = ' in ' + res + ', '
    return res
205
+
206
+ ##==============================================================================
207
+ # status extract
208
# Phrase appended to the summary for each recruitment status.
status_dict = {'Not yet recruiting':', is planned ',
               # 'Recruiting':', is active ',
               'Active, not recruiting':' (enrollment complete) ',
               'Completed' :', is complete ',
               'Terminated':', has been terminated',
               'Suspended' :', has been suspended',
               'Withdrawn' :', has been withdrawn'
               }


def get_status(status):
    """Return the summary phrase for ``status`` from ``status_dict``.

    ``status`` is matched as a case-sensitive substring of the dictionary
    keys, as before.  The phrase is returned when exactly one key matches;
    an empty, unknown or ambiguous status yields ''.  Non-string input
    yields ``None``.

    The previous implementation stringified the list of matches and stripped
    "['" / "']".  That returned the literal '[]' for an unknown status (for
    example 'Recruiting', which has no entry) and list-repr fragments when
    the status was empty or matched several keys.
    """
    print('Getting trial type...')
    if not isinstance(status, str):
        return None
    if not status:
        # '' is a substring of every key; treat it as "no status".
        return ''
    matches = [val for key, val in status_dict.items() if status in key]
    return matches[0] if len(matches) == 1 else ''
227
+
228
+ ##==============================================================================
229
+ # lower non abbr word for ystop
230
def non_abbr(string):
    """Lower-case every space-separated token that is not fully upper-case.

    Tokens for which ``str.isupper()`` is true (e.g. abbreviations) are kept
    untouched; all others are lower-cased.  The tokens are re-joined with
    single spaces, so the spacing of the input is preserved.
    """
    tokens = string.split(' ')
    return ' '.join(tok if tok.isupper() else tok.lower() for tok in tokens)
244
+ ##==============================================================================
245
+ # reason for stop extract
246
def get_ystop(ystop):
    """Return ', due to <reason>' with the reason passed through ``non_abbr``.

    Returns ``None`` when ``ystop`` is the empty string.
    """
    print('Getting ystop...')
    if ystop == '':
        return None
    reason = non_abbr(ystop)
    return ', ' + 'due to ' + reason
254
+ ##==============================================================================
255
+ #get age
256
def get_age(minage,maxage):
    """Describe the age range of participants.

    With a non-empty ``maxage`` the result is
    'aged between <minage> and <maxage>'; otherwise
    'with minimum age of <minage>'.
    """
    if maxage == '':
        return 'with minimum age of ' + minage
    return 'aged between ' + minage + ' and ' + maxage
264
+ ##==============================================================================
265
+
266
+ # get link
267
def get_url(nctid,lupd):
    """Build the trailing ' (ClinicalTrials.gov, <lupd>, <url>)' citation,
    where the URL is the ct2 'show' address followed by ``nctid``.
    """
    print('Cooking up final url...')
    base = 'https://clinicaltrials.gov/ct2/show/'
    pieces = [' (', 'ClinicalTrials.gov, ', lupd, ', ', base, nctid, ')']
    return ''.join(pieces)
272
+ ##==============================================================================
273
+ #map week numbers
274
def map_week_num(myText):
    """Replace whole-word occurrences of the ``num_dict`` keys in ``myText``
    with their mapped values, matching case-insensitively.

    Keys are tried longest first.  Regex alternation accepts the first
    branch that matches, so a short key listed before a longer key starting
    with the same words would otherwise win and leave the rest of the longer
    phrase untranslated.  With an empty mapping the text is returned
    unchanged; the empty alternation would match the empty string and look
    up '' in the mapping.
    """
    # presumably CaseInsensitiveDict resolves the matched text regardless of
    # its case -- verify against utils.pharmap_utils.cid
    obj = CaseInsensitiveDict(num_dict)
    keys = sorted(obj.keys(), key=len, reverse=True)
    if not keys:
        return myText
    pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in keys) + r')(?!\w)',flags=re.IGNORECASE)
    return pattern.sub(lambda x: obj[x.group()], myText)
280
+ ##==============================================================================
281
+ #map terms
282
def map_terms(myText):
    """Replace whole-word occurrences of the ``mt_dict`` keys in ``myText``
    with their mapped values, matching case-insensitively.

    Keys are tried longest first.  Regex alternation accepts the first
    branch that matches, so a short key listed before a longer key starting
    with the same words would otherwise win and leave the rest of the longer
    phrase untranslated.  With an empty mapping the text is returned
    unchanged; the empty alternation would match the empty string and look
    up '' in the mapping.
    """
    # presumably CaseInsensitiveDict resolves the matched text regardless of
    # its case -- verify against utils.pharmap_utils.cid
    obj = CaseInsensitiveDict(mt_dict)
    keys = sorted(obj.keys(), key=len, reverse=True)
    if not keys:
        return myText
    pattern = re.compile(r'(?<!\w)(' + '|'.join(re.escape(key) for key in keys) + r')(?!\w)',flags=re.IGNORECASE)
    return pattern.sub(lambda x: obj[x.group()], myText)
288
+ ##==============================================================================
289
+ # adjust space, period, comma
290
def remove_period_spaces(text):
    """Apply the fixed clean-up substitutions to a generated summary and
    collapse all runs of whitespace to single spaces.

    The substitutions run strictly in the order listed because later ones
    operate on the output of earlier ones.
    """
    # (old, new, count) -- count -1 means "replace every occurrence".
    substitutions = (
        ('||', '', -1),
        ('Korea, Republic of', 'S Korea', -1),
        ('[]', '', -1),
        (', This', ', this', -1),
        (') The', ') the', -1),
        ('in The The', 'in the', -1),
        ('The', 'the', -1),
        ('the the', 'the', -1),
        ('this is a', '', -1),
        ('.,', ',', -1),
        ('., ', ',', -1),
        (',', ', ', -1),
        ("due to", "because of", 1),  # first occurrence only
        ("male subjects", "male participants", -1),
        ("female subjects", "female participants", -1),
    )
    for old, new, count in substitutions:
        text = text.replace(old, new, count)
    return " ".join(text.split())
309
+ ##==============================================================================
310
+ # remove duplicate words
311
def unique_list(text_str):
    """Drop repeated whitespace-separated words, keeping the first
    occurrence of each in its original position, and re-join with spaces.
    """
    # dict keys keep insertion order, so this de-duplicates order-preservingly.
    return ' '.join(dict.fromkeys(text_str.split()))
318
+ #===============================================================================
319
+ #reposition the condition in the summary
320
def repos_condition(my_string):
    """Move the 'with <condition>' clause directly after '<N> subjects '.

    The condition is the text after the first 'with' up to the next comma
    (or, when no comma follows, up to the next period).  Every
    '<digits> subjects ' occurrence is replaced by the first such occurrence
    followed by 'with<condition> ', and the original 'with<condition>,'
    clause is then removed.

    Prints 'not found' and returns ``None`` when the input is not a string,
    contains no 'with' clause, or contains no '<digits> subjects ' phrase.

    The condition text is now handled literally.  It used to be spliced into
    a regex pattern and a regex replacement string, so parentheses, '+' or
    backslashes in it (e.g. 'lung cancer (NSCLC)') stopped the original
    clause from being removed.
    """
    if not isinstance(my_string, str):
        print("not found")
        return None

    subjects = re.search('with(.*),', my_string)
    if subjects:
        fs = subjects.group(1).split(',')[0]
    else:
        # NOTE(review): the final '.' is a regex wildcard, so when the text
        # has no period at all the last character of the condition is lost.
        subjects = re.search('with(.*).', my_string)
        if subjects is None:
            print("not found")
            return None
        fs = subjects.group(1).split('.')[0]

    a = re.search(r"\d+\s+subjects\s", my_string)
    if a is None:
        print("not found")
        return None

    insertion = a.group(0) + "with" + fs + " "
    # A callable replacement is inserted verbatim (no backslash processing).
    r = re.sub(r"\d+\s+subjects\s", lambda _match: insertion, my_string)
    result = r.replace("with" + fs + ",", "")
    print("--------------")
    return result
341
+
342
+ #================================================================================
343
+
344
+ #reposition the additional study_design words
345
def repos_study_design(text):
    """Move the words between 'subjects' and 'study' in front of 'phase'.

    The span from just after 'subjects' up to and including the last 'study'
    (matched case-insensitively) is removed from ``text``, and the words
    before 'study' are re-inserted immediately before the first 'phase'.
    ``text`` is returned unchanged when there is no 'subjects ... study'
    span, or when no 'phase' remains after the removal.  Non-string input
    prints 'nothing happened' and returns ``None``.

    The search used to run on ``text.lower()`` while the removal ran on the
    original text.  A span containing capitals was therefore never removed,
    yet a lower-cased copy was still inserted, which duplicated the words.
    The match is now made on the original text with ``re.IGNORECASE``.
    """
    if not isinstance(text, str):
        print("nothing happened")
        return None

    match = re.search('subjects(.*)study', text, flags=re.IGNORECASE)
    if not match:
        return text

    design = match.group(1)
    # Exact original-case slice: the design words plus the word 'study'.
    removed = text[match.start(1):match.end()]
    newtext = text.replace(removed, '')

    idx = newtext.lower().find('phase')
    if idx == -1:
        return text
    return newtext[:idx] + design + newtext[idx:]
361
+ #================================================================================
362
+ #identify purpose issues
363
def purpose_issue(summary):
    """Flag summaries containing any of a fixed set of tell-tale phrases.

    Returns the 'Yes - ...' message when at least one phrase occurs in
    ``summary`` (case-sensitive substring test), otherwise 'No'.
    """
    flag_words = ('will also be evaluated', 'will be evaluated',
                  'No Objective Found', 'subjects), is',
                  'subjects, is complete')
    for phrase in flag_words:
        if phrase in summary:
            return "Yes - Grammar/Endpoint related Mistakes in Summary"
    return "No"
369
+ #================================================================================
370
+ # duplicate words check
371
def dupe_check(text,rr_value,stopwords=stopwords):
    """Flag a summary that repeats a non-stop-word.

    Only runs while ``rr_value`` is still 'No'; any other value is returned
    untouched.  Stop words are removed case-insensitively, and the remaining
    words are compared case-sensitively.

    The lower-cased stop words are now built once as a set.  They were
    previously regenerated and scanned linearly for every token of the text.
    """
    if rr_value != 'No':
        return rr_value
    stop_set = {x.lower() for x in stopwords}
    kept = ' '.join(tok for tok in text.split(' ') if tok.lower() not in stop_set)
    # Re-split on any whitespace so tabs/newlines inside tokens also separate words.
    words = kept.split()
    if len(words) > len(set(words)):
        # NOTE(review): the leading space differs from the other
        # 'Yes - ...' messages; kept because callers may compare on it.
        return " Yes - Duplicate Words maybe found in Summary"
    return rr_value
383
+ #================================================================================
384
+ #count all cap words
385
def count_caps(summary,rr_value):
    """Flag a summary containing more than ten upper-case words.

    Only runs while ``rr_value`` is still 'No'; any other value is returned
    untouched.  Words are counted inside the runs of capital letters and
    whitespace found by the regex below.
    """
    if rr_value != 'No':
        return rr_value
    upper_runs = re.findall(r"\b[A-Z\s]+\b", summary)
    upper_word_count = len(' '.join(upper_runs).split())
    if upper_word_count > 10:
        return 'Yes - Summary May Contain Lot of Words in Upper Case'
    return rr_value
395
+ #================================================================================
396
+ #identify route/dose misses
397
def route_miss(summary,rr_value,int_dec):
    """Flag a summary that shares no word with the intervention description.

    Only runs while ``rr_value`` is still 'No'; any other value is returned
    untouched.  Stop words are removed from both texts case-insensitively.
    The result is 'No' when at least one remaining summary word also occurs
    (case-sensitively) in ``int_dec``, otherwise the 'Yes - ...' message.

    The lower-cased stop words and the description words are now held in
    sets.  The stop-word list was previously re-lowercased and scanned for
    every token.
    """
    if rr_value != 'No':
        return rr_value

    stop_set = {x.lower() for x in stopwords}

    def _content_words(raw):
        # Drop stop words, then re-split on any whitespace.
        kept = ' '.join(tok for tok in raw.split(' ') if tok.lower() not in stop_set)
        return kept.split()

    summ_list = _content_words(summary)
    int_words = set(_content_words(int_dec))
    if any(check in int_words for check in summ_list):
        return "No"
    return "Yes - Route/Dose info might have been missed"
411
+