Upload summ_utils.py
Browse files- summ_utils.py +411 -0
summ_utils.py
ADDED
@@ -0,0 +1,411 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from utils.pharmap_utils.cid import CaseInsensitiveDict
|
2 |
+
import re
|
3 |
+
|
4 |
+
|
5 |
+
|
6 |
+
##==============================================================================
|
7 |
+
#load mapping terms
|
8 |
+
def _load_mapping(path):
    """Read a ``key,value`` text file into a dict.

    Each line is right-stripped and split on its first comma only, so the
    value part may itself contain commas. Blank / whitespace-only lines
    (for example a trailing empty line) are skipped instead of aborting the
    import with ``ValueError``. A non-blank line without a comma still raises
    ``ValueError`` because it cannot be turned into a key/value pair.
    """
    with open(path) as f:
        return dict(line.rstrip().split(',', 1) for line in f if line.strip())


# Term mapping, used by map_terms().
mt_dict = _load_mapping('utils/summarize_utils/map_terms.txt')

##==============================================================================
#load number mapping terms to convert numbers in words appearing before weeks
# to number (used by map_week_num()).
num_dict = _load_mapping('utils/summarize_utils/map_nums.txt')
##==============================================================================
## load stop words
# The file is read as whitespace-separated tokens, decoded with the
# 'unicode_escape' codec exactly as before.
with open('utils/summarize_utils/stopwords-en.txt', 'r', encoding='unicode_escape') as f:
    stopwords = f.read().split()
|
20 |
+
##==============================================================================
|
21 |
+
def get_first_word(alloc, masking, status):
    """Return the opening word(s) of the summary sentence.

    The article is ``'An '`` when ``alloc`` is ``'n/a'`` and ``masking`` is
    ``'none (open label)'`` (both compared case-insensitively), otherwise
    ``'A '``. For the statuses ``'Active, not recruiting'`` and
    ``'Recruiting'`` (exact match) the lower-cased article is prefixed with
    ``'It is in '``; for every other status the article is returned as is.
    """
    print('Getting first word..')
    # `masking` is only inspected when `alloc` already matched (short-circuit).
    uses_an = alloc.lower() == 'n/a' and masking.lower() == 'none (open label)'
    article = 'An ' if uses_an else 'A '
    if status in ('Active, not recruiting', 'Recruiting'):
        return 'It is in ' + article.lower()
    return article
|
53 |
+
|
54 |
+
##==============================================================================
|
55 |
+
#get masking type
|
56 |
+
def get_mask(masking):
    """Translate a masking value into the phrase used in the summary.

    ``'double'``, ``'none (open label)'`` and ``'quadruple'`` (compared
    case-insensitively, exact match) become ``'double-blind, '``,
    ``'open-label, '`` and ``'quadruple-blind, '``. Any other string is
    returned unchanged. A non-string value (e.g. ``None``) yields ``None``,
    which is what the former blanket ``except`` produced.

    The quadruple case used to be a substring test against the literal
    ``'quadruple'``, so values such as ``''`` or ``'a'`` were wrongly reported
    as quadruple-blind; it is now an equality test like the other cases.
    """
    print('Getting mask..')
    if not isinstance(masking, str):
        return None
    phrases = {
        'double': 'double-blind, ',
        'none (open label)': 'open-label, ',
        'quadruple': 'quadruple-blind, ',
    }
    return phrases.get(masking.lower(), masking)
|
70 |
+
##==============================================================================
|
71 |
+
#get study type
|
72 |
+
def get_stype(stype):
    """Return the study-type phrase for the summary.

    ``'interventional'`` (case-insensitive) gives ``'interventional study '``;
    every other value gives ``'observational study '``.
    """
    print('Getting study type...')
    is_interventional = stype.lower() == 'interventional'
    return 'interventional study ' if is_interventional else 'observational study '
|
80 |
+
##==============================================================================
|
81 |
+
# get intervention model
|
82 |
+
def get_imodel(imodel):
    """Return the lower-cased intervention model followed by ``', '``.

    Returns ``None`` when ``imodel`` is ``None``. The previous guard tested
    ``imodel.lower() is not None``, which is always true for a string and
    raises ``AttributeError`` for ``None``, so the "no value" branch could
    never run.
    """
    print('Getting imodel...')
    if imodel is None:
        return None
    return imodel.lower() + ', '
|
89 |
+
##==============================================================================
|
90 |
+
#get objective
|
91 |
+
def get_obj(otitle, bsumm, ddesc):
    """Extract an objective sentence from the title, summary or description.

    The three texts are tried in that order; the first one containing any of
    the keywords below wins, and the first matching keyword (in list order)
    is used to pick sentences (text is split on ``'. '``).

    * Title: matched case-sensitively. The result is the keyword plus
      everything following its first occurrence in the matching sentence(s),
      without the final character, passed through ``non_abbr``.
    * Brief summary / detailed description: keywords are searched in the
      lower-cased text, and the sentences containing the keyword are returned
      (joined with ``', '`` for the summary, ``''`` for the description) after
      ``non_abbr``. Sentences are now filtered case-insensitively as well;
      previously a keyword found only via lower-casing (e.g. ``'aim'`` in
      ``'Aim'``) selected no sentence and an empty string was returned.

    ``None`` inputs are treated as empty text. Returns
    ``'No Objective Found'`` when no keyword matches anywhere, and ``None``
    when the title pattern search fails (the outcome the former blanket
    ``except`` produced).
    """
    print('Getting objective for..')
    keywords = ['to Demonstrate',
                'to Evaluate',
                'to Investigate',
                'to Assess',
                'to Determine',
                'Placebo', 'Purpose', 'aim', 'purpose', 'main purpose',
                'Aim', 'Objective', 'objective', 'Main Objective', 'Selection Study',
                'Main Purpose', 'Main Aim', 'Study', 'STUDY', 'study',
                'Ascending Multiple-dose', 'Adaptive', 'Dose Escalation',
                'assess', 'Bioavailability', 'investigate', 'Investigating'
                ]

    # Missing fields behave like empty text instead of raising.
    otitle = otitle or ''
    bsumm = bsumm or ''
    ddesc = ddesc or ''

    otitle_result = [ele for ele in keywords if ele in otitle]
    print('otitle_result:', otitle_result)
    bsumm_lower = bsumm.lower()
    bsumm_result = [ele for ele in keywords if ele in bsumm_lower]
    print('bsumm_result:', bsumm_result)
    ddesc_lower = ddesc.lower()
    ddesc_result = [ele for ele in keywords if ele in ddesc_lower]
    print('ddesc_result:', ddesc_result)

    if otitle_result:
        print('im in otitle')
        word = otitle_result[0]
        print('word in otitle:', word)
        matched = [sentence + '.' for sentence in otitle.split('. ') if word in sentence]
        sobj = ''.join(matched)
        print('matched sobj', sobj)
        # The trailing '.' is a regex wildcard: it consumes the last character
        # of the line (normally the period appended above), so group(1) is the
        # text after the keyword without that final character.
        result = re.search(re.escape(word) + '(.*)' + '.', sobj)
        print('result of pattern search:', result)
        if result is None:
            return None
        result = word + result.group(1)
        print('result group:', result)
        result = non_abbr(result)
        print('non-abbr result:', result)
        return result

    if bsumm_result:
        print('im in bsumm')
        word = bsumm_result[0]
        # `word` was found in the lower-cased text, so compare the same way.
        matched = [sentence + '.' for sentence in bsumm.split('. ')
                   if word in sentence.lower()]
        return non_abbr(', '.join(matched))

    if ddesc_result:
        word = ddesc_result[0]
        matched = [sentence + '.' for sentence in ddesc.split('. ')
                   if word in sentence.lower()]
        return non_abbr(''.join(matched))

    return 'No Objective Found'
|
154 |
+
|
155 |
+
##==============================================================================
|
156 |
+
# other study id extract
|
157 |
+
def get_osid(osid, sid):
    """Format the study identifiers as a parenthesised list.

    Both arguments are ``'|'``-separated id strings (or ``None``).

    * both present: ``'(<osid parts joined by "; ">; <sid parts joined by ", ">) '``
    * only one present: ``'(<parts joined by "; ">) '``
    * neither present: ``None`` when both arguments are ``None`` (unchanged),
      otherwise ``''``

    "Present" means neither ``None`` nor empty. Earlier, a present ``osid``
    with ``sid=None`` returned ``None``, an empty ``osid`` produced a dangling
    separator (``'(; X) '``), and two empty strings produced ``'() '``.
    """
    print('Getting Study Ids...')
    if osid is None and sid is None:
        return None
    if osid and sid:
        return '(' + '; '.join(osid.split('|')) + '; ' + ', '.join(sid.split('|')) + ') '
    if osid:
        return '(' + '; '.join(osid.split('|')) + ') '
    if sid:
        return '(' + '; '.join(sid.split('|')) + ') '
    return ''
|
175 |
+
|
176 |
+
##==============================================================================
|
177 |
+
# get locations
|
178 |
+
def join_and(items):
    """Join strings with commas, putting ``', and '`` before the last one.

    With two or more items the result is ``'a, b, and c'`` (two items give
    ``'a, and b'``). With one item that item is returned; with none, ``''``.
    """
    if len(items) <= 1:
        return ', '.join(items)
    leading = ', '.join(items[:-1])
    return leading + ', and ' + items[-1]
|
183 |
+
|
184 |
+
def get_locs(locations):
    """Build the ``' in <locations>, '`` fragment of the summary.

    A ``'|'``-separated value is de-duplicated, sorted and joined with
    ``join_and``; a value without ``'|'`` is used as is. An empty string is
    returned unchanged (no ``' in '`` prefix is added).
    """
    print('Getting Locations...')
    print(locations)
    print(len(locations))
    if locations == '':
        res = locations
        print('outside location split else:', res)
    else:
        print('location is not empty')
        if '|' in locations:
            unique_places = set(locations.split('|'))
            res = join_and(sorted(unique_places))
            print('inside location split if:', res)
        else:
            res = locations
            print('inside location split else:', res)
    if res != '':
        res = ' in ' + res + ', '
    return res
|
205 |
+
|
206 |
+
##==============================================================================
|
207 |
+
# status extract
|
208 |
+
# Summary phrase for each recruitment status. 'Recruiting' is intentionally
# left without an entry (it was commented out in the original table).
status_dict = {'Not yet recruiting': ', is planned ',
               # 'Recruiting':', is active ',
               'Active, not recruiting': ' (enrollment complete) ',
               'Completed': ', is complete ',
               'Terminated': ', has been terminated',
               'Suspended': ', has been suspended',
               'Withdrawn': ', has been withdrawn'
               }


def get_status(status):
    """Return the summary phrase for a recruitment status.

    The status is looked up by exact key in ``status_dict``. A status without
    an entry yields ``''``; a non-string value (e.g. ``None``) yields ``None``
    as before.

    The earlier implementation matched ``status`` as a *substring* of every
    key and stringified the resulting list, so ``''`` matched all entries and
    an unknown status returned the literal text ``'[]'``.
    """
    print('Getting trial type...')
    if not isinstance(status, str):
        return None
    return status_dict.get(status, '')
|
227 |
+
|
228 |
+
##==============================================================================
|
229 |
+
# lower non abbr word for ystop
|
230 |
+
def non_abbr(string):
    """Lower-case every word that is not written entirely in upper case.

    The text is split on single spaces. Tokens for which ``str.isupper()`` is
    true (e.g. ``'ABC-123'``) are kept unchanged; all other tokens are
    lower-cased. The tokens are joined back with single spaces.
    """
    tokens = string.split(' ')
    return ' '.join(
        token if token.isupper() else token.lower()
        for token in tokens
    )
|
244 |
+
##==============================================================================
|
245 |
+
# reason for stop extract
|
246 |
+
def get_ystop(ystop):
    """Return ``', due to <reason>'`` for a non-empty stop reason.

    The reason is normalised with ``non_abbr`` first. An empty string gives
    ``None``.
    """
    print('Getting ystop...')
    if ystop == '':
        return None
    reason = non_abbr(ystop)
    return ', ' + 'due to ' + reason
|
254 |
+
##==============================================================================
|
255 |
+
#get age
|
256 |
+
def get_age(minage, maxage):
    """Return the age phrase for the summary.

    With a non-empty ``maxage`` the result is
    ``'aged between <minage> and <maxage>'``; otherwise
    ``'with minimum age of <minage>'``.
    """
    if maxage == '':
        return 'with minimum age of ' + minage
    return 'aged between ' + minage + ' and ' + maxage
|
264 |
+
##==============================================================================
|
265 |
+
|
266 |
+
# get link
|
267 |
+
def get_url(nctid, lupd):
    """Return the trailing source citation for the summary.

    The result has the form
    ``' (ClinicalTrials.gov, <lupd>, https://clinicaltrials.gov/ct2/show/<nctid>)'``.
    """
    print('Cooking up final url...')
    study_link = 'https://clinicaltrials.gov/ct2/show/' + nctid
    return ' (' + 'ClinicalTrials.gov, ' + lupd + ', ' + study_link + ')'
|
272 |
+
##==============================================================================
|
273 |
+
#map week numbers
|
274 |
+
def map_week_num(myText):
    """Replace every key of ``num_dict`` found in ``myText`` by its value.

    Keys are matched case-insensitively and only when not directly preceded
    or followed by a word character. Replacement values are looked up through
    a ``CaseInsensitiveDict`` built from ``num_dict``.
    """
    lookup = CaseInsensitiveDict(num_dict)
    alternatives = '|'.join(re.escape(key) for key in lookup.keys())
    matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

    def _swap(match):
        # The whole match is the key as written in the text.
        return lookup[match.group()]

    return matcher.sub(_swap, myText)
|
280 |
+
##==============================================================================
|
281 |
+
#map terms
|
282 |
+
def map_terms(myText):
    """Replace every key of ``mt_dict`` found in ``myText`` by its value.

    Keys are matched case-insensitively and only when not directly preceded
    or followed by a word character. Replacement values are looked up through
    a ``CaseInsensitiveDict`` built from ``mt_dict``.
    """
    lookup = CaseInsensitiveDict(mt_dict)
    alternatives = '|'.join(re.escape(key) for key in lookup.keys())
    matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

    def _swap(match):
        # The whole match is the key as written in the text.
        return lookup[match.group()]

    return matcher.sub(_swap, myText)
|
288 |
+
##==============================================================================
|
289 |
+
# adjust space, period, comma
|
290 |
+
def remove_period_spaces(text):
    """Clean up punctuation, spacing and a few fixed phrases in a summary.

    The substitutions are applied strictly in the order listed below (later
    ones rely on the output of earlier ones), then runs of whitespace are
    collapsed to single spaces and the ends are trimmed.

    The stand-alone word ``The`` is lower-cased using a word-boundary match.
    The previous plain substring replacement also rewrote ``The`` inside
    longer words, e.g. ``Therapy`` became ``therapy``.
    """
    before_article = (
        ('||', ''),
        ('Korea, Republic of', 'S Korea'),
        ('[]', ''),
        (', This', ', this'),
        (') The', ') the'),
        ('in The The', 'in the'),
    )
    after_article = (
        ('the the', 'the'),
        ('this is a', ''),
        ('.,', ','),
        ('., ', ','),
        (',', ', '),
    )
    for old, new in before_article:
        text = text.replace(old, new)
    # Whole word only, so words that merely start with "The" are untouched.
    text = re.sub(r'\bThe\b', 'the', text)
    for old, new in after_article:
        text = text.replace(old, new)
    # Only the first "due to" is reworded.
    text = text.replace("due to", "because of", 1)
    text = text.replace("male subjects", "male participants")
    text = text.replace("female subjects", "female participants")
    return " ".join(text.split())
|
309 |
+
##==============================================================================
|
310 |
+
# remove duplicate words
|
311 |
+
def unique_list(text_str):
    """Remove repeated words, keeping the first occurrence of each.

    The text is split on whitespace, duplicates are dropped (comparison is
    case-sensitive) and the remaining words are joined with single spaces.
    ``dict.fromkeys`` keeps insertion order and gives linear-time
    de-duplication instead of a list scan per word.
    """
    return ' '.join(dict.fromkeys(text_str.split()))
|
318 |
+
#===============================================================================
|
319 |
+
#reposition the condition in the summary
|
320 |
+
def repos_condition(my_string):
    """Move the ``with <condition>`` phrase next to the subject count.

    The condition is the text after the first matching ``with`` up to the
    next comma (or, when no comma follows, up to the next period / end of
    line). It is inserted after every ``<number> subjects `` occurrence and
    each ``with<condition>,`` occurrence is removed.

    Returns the rewritten string, or ``None`` (after printing ``not found``)
    when the input is not a string, has no ``with`` phrase or has no
    ``<number> subjects `` phrase, which is the same outcome the former
    blanket ``except`` gave.

    Fixes: the condition text is now treated literally (it used to be
    interpolated into a regex pattern and a replacement template, so
    characters such as parentheses stopped the removal from matching), and
    the no-comma fallback no longer drops the last character of the text.
    """
    if not isinstance(my_string, str):
        print("not found")
        return None

    subjects = re.search(r'with(.*),', my_string)
    if subjects:
        fs = subjects.group(1).split(',')[0]
    else:
        subjects = re.search(r'with(.*)', my_string)
        if subjects is None:
            print("not found")
            return None
        fs = subjects.group(1).split('.')[0]

    count_pattern = r"\d+\s+subjects\s"
    a = re.search(count_pattern, my_string)
    if a is None:
        print("not found")
        return None

    # Every "<n> subjects " is replaced by the first one plus the condition.
    # A callable replacement keeps backslashes in `fs` from being interpreted.
    insertion = a.group(0) + "with" + fs + " "
    r = re.sub(count_pattern, lambda _match: insertion, my_string)
    # Literal removal of the original "with<condition>," phrase.
    result = r.replace("with" + fs + ",", "")
    print("--------------")
    return result
|
341 |
+
|
342 |
+
#================================================================================
|
343 |
+
|
344 |
+
#reposition the additional study_design words
|
345 |
+
def repos_study_design(text):
    """Move the words between ``subjects`` and ``study`` in front of ``phase``.

    The span from the first ``subjects`` to the last ``study`` on the same
    line is located case-insensitively. The words in between, plus the
    trailing ``study``, are removed from the text and the in-between words
    are re-inserted immediately before the first ``phase``.

    Returns ``text`` unchanged when either marker pattern is absent. A
    non-string input prints ``nothing happened`` and returns ``None``, as the
    former blanket ``except`` did.

    Fix: the span used to be located in a lower-cased copy but removed from
    the original text, so any capital letter inside the span made the removal
    a no-op while the lower-cased words were still inserted, duplicating
    them. The span is now taken from the original text, preserving its case.
    """
    if not isinstance(text, str):
        print("nothing happened")
        return None

    found = re.search(r'subjects(.*)study', text, flags=re.IGNORECASE)
    if not found:
        return text

    design = found.group(1)
    # Design words plus the closing "study", exactly as written in `text`.
    removable = text[found.start(1):found.end()]
    newtext = text.replace(removable, '')

    phase = re.search(r'phase', newtext, flags=re.IGNORECASE)
    if phase is None:
        return text
    idx = phase.start()
    return newtext[:idx] + design + newtext[idx:]
|
361 |
+
#================================================================================
|
362 |
+
#identify purpose issues
|
363 |
+
def purpose_issue(summary):
    """Flag summaries containing phrases that indicate a purpose problem.

    Returns ``"Yes - Grammar/Endpoint related Mistakes in Summary"`` when any
    of the marker phrases occurs in ``summary`` (case-sensitive substring
    test), otherwise ``"No"``.
    """
    marker_phrases = (
        'will also be evaluated',
        'will be evaluated',
        'No Objective Found',
        'subjects), is',
        'subjects, is complete',
    )
    for phrase in marker_phrases:
        if phrase in summary:
            return "Yes - Grammar/Endpoint related Mistakes in Summary"
    return "No"
|
369 |
+
#================================================================================
|
370 |
+
# duplicate words check
|
371 |
+
def dupe_check(text, rr_value, stopwords=stopwords):
    """Flag a summary that repeats a non-stop-word.

    Only runs when ``rr_value`` is ``'No'`` (no earlier check has flagged the
    summary); any other ``rr_value`` is returned unchanged.

    ``text`` is split on single spaces, tokens whose lower-case form is a stop
    word are dropped, and the rest is re-split on whitespace. If any remaining
    word occurs more than once (case-sensitive) the duplicate-words message is
    returned, otherwise ``rr_value``.

    The lower-cased stop words are collected into a set once, instead of
    re-lowercasing and scanning the whole list for every token.
    """
    if rr_value != 'No':
        return rr_value
    stop_set = {word.lower() for word in stopwords}
    clean_text = ' '.join(
        token for token in text.split(' ') if token.lower() not in stop_set
    )
    words = clean_text.split()
    if len(words) > len(set(words)):
        return " Yes - Duplicate Words maybe found in Summary"
    return rr_value
|
383 |
+
#================================================================================
|
384 |
+
#count all cap words
|
385 |
+
def count_caps(summary, rr_value):
    """Flag a summary that contains many upper-case words.

    Only runs when ``rr_value`` is ``'No'``; any other value is returned
    unchanged. Runs of capital letters and whitespace delimited by word
    boundaries are collected, and the words inside them are counted. More
    than 10 such words gives the upper-case warning, otherwise ``rr_value``.
    """
    if rr_value != 'No':
        return rr_value
    upper_runs = re.findall(r"\b[A-Z\s]+\b", summary)
    upper_word_count = sum(len(run.split()) for run in upper_runs)
    if upper_word_count > 10:
        return 'Yes - Summary May Contain Lot of Words in Upper Case'
    return rr_value
|
395 |
+
#================================================================================
|
396 |
+
#identify route/dose misses
|
397 |
+
def route_miss(summary, rr_value, int_dec):
    """Flag a summary that shares no content word with the intervention text.

    Only runs when ``rr_value`` is ``'No'``; any other value is returned
    unchanged. Both ``summary`` and ``int_dec`` are reduced to their
    non-stop-word tokens (using the module-level ``stopwords``). If the two
    token sets have at least one word in common (case-sensitive) ``"No"`` is
    returned, otherwise the route/dose warning.

    The lower-cased stop words are collected into a set once, instead of
    re-lowercasing and scanning the whole list for every token of both texts.
    """
    if rr_value != 'No':
        return rr_value

    stop_set = {word.lower() for word in stopwords}

    def _content_words(raw):
        # Same pipeline as before: split on single spaces, drop stop words,
        # then re-split on any whitespace.
        kept = ' '.join(
            token for token in raw.split(' ') if token.lower() not in stop_set
        )
        return set(kept.split())

    if _content_words(summary) & _content_words(int_dec):
        return "No"
    return "Yes - Route/Dose info might have been missed"
|
411 |
+
|