File size: 14,218 Bytes
bf98c4e
e067ea9
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
from cid import CaseInsensitiveDict
import re



##==============================================================================
# Load the term-mapping table consumed by map_terms().
# Every line of the file is split at its FIRST comma into a (key, value) pair;
# a line that contains no comma (e.g. a blank line) makes dict() raise
# ValueError while this module is being imported.
# The path is relative, so it is resolved against the current working directory.
with open('utils/summarize_utils/map_terms.txt') as f:
  mt_dict = dict(x.rstrip().split(',', 1) for x in f)

##==============================================================================
# Load the number-mapping table consumed by map_week_num(); it is meant to
# convert numbers written as words (appearing before "weeks") into digits.
# Same "key,value" one-pair-per-line format as the table above.
with open('utils/summarize_utils/map_nums.txt') as f:
  num_dict = dict(x.rstrip().split(',', 1) for x in f)
##==============================================================================
# Load the stop-word list: the whole file is read and split on whitespace.
# NOTE(review): the file is decoded with the 'unicode_escape' codec, so any
# backslash sequence in it is interpreted as an escape -- confirm this is
# intended rather than a plain UTF-8 read.
with open('utils/summarize_utils/stopwords-en.txt','r',encoding='unicode_escape') as f:
  stopwords = f.read().split()
##==============================================================================
def get_first_word(alloc,masking,status):
  """Return the opening words of a summary sentence.

  The article is 'An ' when `alloc` is 'n/a' and `masking` is
  'none (open label)' (both compared case-insensitively); otherwise it is
  'A '.  When `status` is exactly 'Active, not recruiting' or 'Recruiting'
  the lower-cased article is prefixed with 'It is in '.
  """
  print('Getting first word..')
  # `masking` is only inspected when `alloc` already matched (short-circuit).
  open_label_single_arm = (alloc.lower() == 'n/a'
                           and masking.lower() == 'none (open label)')
  article = 'An ' if open_label_single_arm else 'A '
  if status in ('Active, not recruiting', 'Recruiting'):
    return 'It is in ' + article.lower()
  return article

##==============================================================================
#get masking type
def get_mask(masking):
  """Translate a masking value into the phrase used in the summary.

  'double', 'none (open label)' and 'quadruple' (case-insensitive, exact
  match) map to 'double-blind, ', 'open-label, ' and 'quadruple-blind, '.
  Any other string is returned unchanged.  Values without a `.lower()`
  method (e.g. None) yield None.

  The quadruple case used to be a substring test (`x in 'quadruple'`), which
  also matched the empty string and fragments such as 'quad'; it is now an
  exact comparison like the other two.
  """
  print('Getting mask..')
  phrases = {
      'double': 'double-blind, ',
      'none (open label)': 'open-label, ',
      'quadruple': 'quadruple-blind, ',
  }
  try:
    key = masking.lower()
  except AttributeError:
    # Non-string input (typically None): keep the historical "return None".
    return None
  return phrases.get(key, str(masking))
##==============================================================================
#get study type
def get_stype(stype):
  """Return the study-type phrase for the summary.

  'interventional' (case-insensitive) gives 'interventional study ';
  every other value gives 'observational study '.
  """
  print('Getting study type...')
  is_interventional = stype.lower() == 'interventional'
  return 'interventional study ' if is_interventional else 'observational study '
##==============================================================================
# get intervention model
def get_imodel(imodel):
  """Return the lower-cased intervention model followed by ', '.

  Returns None when `imodel` is None.  The previous check tested
  `imodel.lower() is not None`, which is always true for a string and raised
  AttributeError for None before the check could take effect.
  """
  print('Getting imodel...')
  if imodel is None:
    return None
  return imodel.lower() + ', '
##==============================================================================
#get objective
def get_obj(otitle,bsumm, ddesc):
  """Extract an objective sentence from the title, summary or description.

  The three texts are tried in that order; the first one that contains any
  entry of `keywords` wins and the first matching keyword (in list order)
  is used to pick sentences (texts are split on '. ').

  Args:
    otitle: title text, searched case-sensitively.
    bsumm: summary text; keywords are looked up in its lower-cased form.
    ddesc: description text; keywords are looked up in its lower-cased form.

  Returns:
    The extracted text passed through non_abbr(), the literal
    'No Objective Found' when no keyword matches anywhere, or None when any
    exception occurs inside the extraction (the bare `except` swallows it).
  """
  print('Getting objective for..')
  # print(string)
  # keywords = ['purpose','objective','evaluated','aim','assess','pharmcokinetic',
  #             'pharmacodynamic','safety','immunogenecity']'Study to Evaluate'
  # Order matters: the first keyword found decides which sentences are kept.
  keywords = ['to Demonstrate', 
              'to Evaluate',
              'to Investigate',
              'to Assess',
              'to Determine',
              # 'Investigating',
              'Placebo','Purpose','aim','purpose','main purpose',
              'Aim','Objective', 'objective', 'Main Objective', 'Selection Study',
              'Main Purpose', 'Main Aim','Study', 'STUDY', 'study',
              'Ascending Multiple-dose','Adaptive','Dose Escalation',
              'assess', 'Bioavailability','investigate','Investigating'
              ]

  # Title is matched as-is; summary and description are lower-cased first, so
  # only the all-lower-case keywords can ever match in those two texts.
  otitle_result = [ele for ele in keywords if(ele in otitle)]
  print('otitle_result:', otitle_result)
  bsumm_result = [ele for ele in keywords if(ele in bsumm.lower())]
  print('bsumm_result:', bsumm_result)
  ddesc_result = [ele for ele in keywords if(ele in ddesc.lower())]
  print('ddesc_result:',ddesc_result)
  # print(otitle_result)
  try:
    if len(otitle_result)>0:
      print('im in otitle')
      # ''.join() over a single string is a no-op; `word` is the keyword itself.
      word = ''.join(otitle_result[0])
      print('word in otitle:', word)
      matched = [sentence + '.' for sentence in otitle.split('. ') if word in sentence]
      sobj = ''.join(matched)
      print('matched sobj',sobj)
      # result = re.sub(r'^.*?to', 'to', sobj)
      # Keep everything from the keyword onward; the trailing '.' in the
      # pattern is unescaped, so it consumes whatever the last character is.
      pattern=word+'(.*)'+'.'
      result = re.search(pattern, sobj)
      print('result of pattern search:',result)
      result = word+result.group(1)
      print('result group:',result)
      result = non_abbr(result)
      print('non-abbr result:',result)
      return result
    elif len(bsumm_result)>0:
      print('im in bsumm')
      # print(bsumm_result)
      word = ''.join(bsumm_result[0])
      # print(word)
      # NOTE(review): sentences are tested in their original case although the
      # keyword was found in the lower-cased text, so `matched` may be empty.
      matched = [sentence + '.' for sentence in bsumm.split('. ') if word in sentence]
      sobj = ', '.join(matched)
      sobj = non_abbr(sobj)
      return sobj
    elif len(ddesc_result)>0:
      # print('im in ddesc')
      word = ''.join(ddesc_result[0])
      # Same original-case caveat as the summary branch; sentences are joined
      # here without a separator.
      matched = [sentence + '.' for sentence in ddesc.split('. ') if word in sentence]
      sobj = ''.join(matched)
      sobj = non_abbr(sobj)
      return sobj
    else:
      sobj = 'No Objective Found'
      return sobj
  except:
      # Any failure above is silenced and the function implicitly returns None.
      pass
  
##==============================================================================
# other study id extract
def get_osid(osid,sid):
  """Format the study identifiers as a parenthesised group ending in a space.

  Both arguments are '|'-separated id strings (or None).

  * both present: '(' + osid ids joined by '; ' + '; ' + sid ids joined by
    ', ' + ') '
  * only `osid` present (sid is None or ''): '(' + osid ids joined by '; ' + ') '
  * only `sid` present (osid is None): '(' + sid ids joined by '; ' + ') '
  * nothing usable (osid is None and sid is None or ''): None

  The single-id branches used to sit inside the "neither is None" guard, so
  the sid-only branch was unreachable and a single missing id discarded the
  other one.
  """
  print('Getting Study Ids...')
  if osid is None:
    if not sid:
      return None
    return '(' + '; '.join(sid.split('|')) + ') '

  osid_part = '; '.join(osid.split('|'))
  if sid is None or sid == '':
    return '(' + osid_part + ') '
  # The secondary ids keep their historical ', ' separator in the combined form.
  return '(' + osid_part + '; ' + ', '.join(sid.split('|')) + ') '

##==============================================================================
# get locations
def join_and(items):
  """Join `items` with ', ', putting ', and ' before the last one.

  Zero or one item is returned as a plain ', '.join (i.e. '' or the item).
  """
  if len(items) <= 1:
    return ', '.join(items)
  leading = ', '.join(items[:-1])
  return leading + ', and ' + items[-1]
    
def get_locs(locations):
  """Turn a '|'-separated location string into an ' in <places>, ' phrase.

  Multiple locations are de-duplicated, sorted and joined with join_and();
  a single location is used as-is.  An empty result is returned unchanged
  (no ' in ' prefix).
  """
  print('Getting Locations...')
  print(locations)
  print(len(locations))
  if locations == '':
    res = locations
    print('outside location split else:', res)
  else:
    print('location is not empty')
    if '|' not in locations:
      res = locations
      print('inside location split else:', res)
    else:
      distinct_sorted = sorted(set(locations.split('|')))
      res = join_and(distinct_sorted)
      print('inside location split if:', res)
  if res == '':
    return res
  return ' in ' + res + ', '

##==============================================================================
# status extract
# Phrase appended to the summary for each recruitment status.
# 'Recruiting' intentionally has no entry (it was commented out originally:
#   'Recruiting': ', is active ').
status_dict = {
    'Not yet recruiting': ', is planned ',
    'Active, not recruiting': ' (enrollment complete) ',
    'Completed': ', is complete ',
    'Terminated': ', has been terminated',
    'Suspended': ', has been suspended',
    'Withdrawn': ', has been withdrawn',
}


def get_status(status):
  """Return the summary phrase for a recruitment status.

  `status` is matched as a case-sensitive substring of the keys of
  `status_dict`.  With exactly one match the phrase itself is returned.
  With no match the literal '[]' is returned (the repr of an empty list,
  which remove_period_spaces() in this file strips out).  Non-string input
  returns None.

  An empty status used to be a substring of every key and produced a
  mangled repr of all phrases; it is now treated as "no match".
  """
  print('Getting trial type...')
  if not isinstance(status, str):
    # Previously a TypeError swallowed by a bare except; same result.
    return None
  if status == '':
    return '[]'
  phrases = [phrase for label, phrase in status_dict.items() if status in label]
  # The list repr is kept on purpose so existing outputs stay byte-identical.
  return str(phrases).replace("['", '').replace("']", '')

##==============================================================================
# lower non abbr word for ystop
def non_abbr(string):
  """Lower-case every word except those written entirely in upper case.

  The text is split on single spaces (so runs of spaces are preserved) and
  re-joined with single spaces.  Words for which `str.isupper()` is true are
  treated as abbreviations and kept unchanged.
  """
  return ' '.join(
      token if token.isupper() else token.lower()
      for token in string.split(' ')
  )
##==============================================================================
# reason for stop extract
def get_ystop(ystop):
  """Return ', due to <reason>' for a non-empty stop reason, else None.

  The reason text is normalised with non_abbr() before being appended.
  """
  print('Getting ystop...')
  if ystop == '':
    return None
  reason = non_abbr(ystop)
  return ', ' + 'due to ' + reason
##==============================================================================
#get age
def get_age(minage,maxage):
  """Describe the eligible age range.

  With an empty `maxage` only the minimum age is mentioned; otherwise both
  bounds are.
  """
  if maxage == '':
    return 'with minimum age of ' + minage
  return 'aged between ' + minage + ' and ' + maxage
##==============================================================================

# get link
def get_url(nctid,lupd):
  """Build the trailing ' (ClinicalTrials.gov, <date>, <url>)' citation.

  `lupd` is inserted verbatim after the site name and `nctid` is appended to
  the fixed study-page URL prefix.
  """
  print('Cooking up final url...')
  study_page_prefix = 'https://clinicaltrials.gov/ct2/show/'
  pieces = (' (', 'ClinicalTrials.gov, ', lupd, ', ', study_page_prefix, nctid, ')')
  return ''.join(pieces)
##==============================================================================
#map week numbers
def map_week_num(myText):
  """Replace every key of `num_dict` found in `myText` by its mapped value.

  Keys are matched case-insensitively and only where they are neither
  preceded nor followed by a word character.  The replacement is looked up
  through a CaseInsensitiveDict built from `num_dict` (presumably so the
  matched casing does not matter -- confirm against the `cid` package).
  """
  lookup = CaseInsensitiveDict(num_dict)
  alternatives = '|'.join(re.escape(key) for key in lookup.keys())
  matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

  def _mapped(match):
    return lookup[match.group()]

  return matcher.sub(_mapped, myText)
##==============================================================================
#map terms
def map_terms(myText):
  """Replace every key of `mt_dict` found in `myText` by its mapped value.

  Keys are matched case-insensitively and only where they are neither
  preceded nor followed by a word character.  The replacement is looked up
  through a CaseInsensitiveDict built from `mt_dict` (presumably so the
  matched casing does not matter -- confirm against the `cid` package).
  """
  lookup = CaseInsensitiveDict(mt_dict)
  alternatives = '|'.join(re.escape(key) for key in lookup.keys())
  matcher = re.compile(r'(?<!\w)(' + alternatives + r')(?!\w)', flags=re.IGNORECASE)

  def _mapped(match):
    return lookup[match.group()]

  return matcher.sub(_mapped, myText)
##==============================================================================
# adjust space, period, comma
def remove_period_spaces(text):
  """Apply the fixed clean-up substitutions and collapse whitespace.

  The substitutions run strictly in the order listed below (later ones see
  the output of earlier ones).  Only the first 'due to' is reworded.
  Finally every run of whitespace becomes a single space and the ends are
  trimmed.
  """
  # NOTE: 'The' -> 'the' is a plain substring replace, so it also affects
  # words that merely start with 'The'.
  ordered_substitutions = (
      ('||', ''),
      ('Korea, Republic of', 'S Korea'),
      ('[]', ''),
      (', This', ', this'),
      (') The', ') the'),
      ('in The The', 'in the'),
      ('The', 'the'),
      ('the the', 'the'),
      ('this is a', ''),
      ('.,', ','),
      ('., ', ','),
      (',', ', '),
  )
  for old, new in ordered_substitutions:
    text = text.replace(old, new)

  text = text.replace("due to", "because of", 1)

  for old, new in (("male subjects", "male participants"),
                   ("female subjects", "female participants")):
    text = text.replace(old, new)

  return " ".join(text.split())
##==============================================================================
# remove duplicate words
def unique_list(text_str):
    """Remove repeated words from `text_str`, keeping first occurrences.

    The text is split on whitespace; the surviving words are joined with
    single spaces in their original order.  Comparison is case-sensitive.
    """
    # dict.fromkeys keeps insertion order and de-duplicates in one linear
    # pass, replacing the former quadratic "x not in list" scan.
    return ' '.join(dict.fromkeys(text_str.split()))
#===============================================================================
#reposition the condition in the summary
def repos_condition(my_string):
  """Move the 'with <condition>' phrase right after '<N> subjects'.

  The condition is the text following the first 'with' up to the first
  comma (or, when no comma follows on that line, up to the first period).
  Every '<digits> subjects ' occurrence is replaced by the first such
  occurrence followed by 'with<condition> ', and the original
  'with<condition>,' is then removed.

  Returns the rewritten string, or None (after printing "not found") when
  the input is not text, contains no usable 'with ...' phrase, or contains
  no '<digits> subjects ' phrase.

  The condition text is handled as literal text: it was previously spliced
  into a regex pattern and a regex replacement template, so conditions
  containing characters such as '(', ')', '+' or backslashes were not
  removed (or raised and were reported as "not found").
  """
  try:
    with_comma = re.search('with(.*),', my_string)
    if with_comma:
      fs = with_comma.group(1).split(',')[0]
    else:
      # NOTE(review): the final '.' is an unescaped "any character", kept
      # as-is to preserve existing results -- confirm whether a literal
      # period was intended.
      with_tail = re.search('with(.*).', my_string)
      fs = with_tail.group(1).split('.')[0] if with_tail else None
    count = re.search(r"\d+\s+subjects\s", my_string)
  except TypeError:
    # Non-text input (e.g. None).
    fs = count = None

  if fs is None or count is None:
    print("not found")
    return None

  moved = count.group(0) + "with" + fs + " "
  # A callable replacement keeps `moved` literal (no backslash processing).
  r = re.sub(r"\d+\s+subjects\s", lambda _match: moved, my_string)
  result = r.replace("with" + fs + ",", "")
  print("--------------")
  return result

#================================================================================

#reposition the additional study_design words
def repos_study_design(text):
    """Move the words between 'subjects' and 'study' in front of 'phase'.

    The span from the first 'subjects' to the last 'study' (case-insensitive)
    is located; the words in between plus that 'study' are removed from the
    text and the in-between words are re-inserted immediately before the
    first 'phase' (case-insensitive).

    Returns `text` unchanged when the span or 'phase' is missing, and None
    (after printing "nothing happened") for non-text input.

    The span used to be found in the lower-cased text but removed from the
    original text, so a span containing capitals was inserted before 'phase'
    without being removed from its old position.  Matching is now done on
    the original text and the moved words keep their original casing.
    """
    try:
        span = re.search('subjects(.*)study', text, flags=re.IGNORECASE)
    except TypeError:
        print("nothing happened")
        return None
    if not span:
        return text

    design_words = span.group(1)
    # Exact original-case text of "<design words>study" to strip out.
    removable = text[span.start(1):span.end()]
    stripped = text.replace(removable, '')

    phase_at = stripped.lower().find('phase')
    if phase_at == -1:
        return text
    return stripped[:phase_at] + design_words + stripped[phase_at:]
#================================================================================
#identify purpose issues
def purpose_issue(summary):
  """Flag summaries that contain one of the known problem phrases.

  Returns the 'Yes - ...' message when any phrase occurs in `summary`
  (case-sensitive substring test), otherwise 'No'.
  """
  problem_phrases = (
      'will also be evaluated',
      'will be evaluated',
      'No Objective Found',
      'subjects), is',
      'subjects, is complete',
  )
  for phrase in problem_phrases:
    if phrase in summary:
      return "Yes - Grammar/Endpoint related Mistakes in Summary"
  return "No"
#================================================================================
# duplicate words check
def dupe_check(text,rr_value,stopwords=stopwords):
  """Flag a summary in which a non-stop-word occurs more than once.

  Only runs when `rr_value` is 'No' (no earlier check has flagged the
  summary); any other `rr_value` is passed through unchanged.

  Args:
    text: summary text.
    rr_value: result of the previous review check.
    stopwords: words to ignore (compared case-insensitively); defaults to
      the module-level stop-word list.

  Returns:
    The duplicate-words message when a repeated word is found (compared
    case-sensitively), otherwise `rr_value`.
  """
  if rr_value != 'No':
    return rr_value

  # Build the lower-cased stop-word set once instead of once per token.
  ignored = {word.lower() for word in stopwords}
  kept = (token for token in text.split(' ') if token.lower() not in ignored)
  # Join + split normalises any remaining whitespace inside the tokens.
  words = ' '.join(kept).split()

  if len(words) > len(set(words)):
    return " Yes - Duplicate Words maybe found in Summary"
  return rr_value
#================================================================================  
#count all cap words
def count_caps(summary,rr_value):
  """Flag a summary that contains many upper-case words.

  Only runs when `rr_value` is 'No'; any other value is passed through.
  Runs of capital letters and whitespace delimited by word boundaries are
  collected, and the summary is flagged when they contain more than ten
  whitespace-separated words.
  """
  if rr_value != 'No':
    return rr_value
  upper_runs = re.findall(r"\b[A-Z\s]+\b", summary)
  upper_words = ' '.join(upper_runs).split()
  if len(upper_words) > 10:
    return 'Yes - Summary May Contain Lot of Words in Upper Case'
  return rr_value
#================================================================================
#identify route/dose misses
def route_miss(summary,rr_value,int_dec):
    """Flag a summary that shares no word with the intervention description.

    Only runs when `rr_value` is 'No'; any other value is passed through.
    Stop words (module-level `stopwords`, compared case-insensitively) are
    dropped from both texts; the remaining words are compared
    case-sensitively.

    Returns:
        'No' when at least one summary word also appears in `int_dec`,
        otherwise the route/dose warning message.
    """
    if rr_value != 'No':
        return rr_value

    # Build the lower-cased stop-word set once instead of once per token.
    ignored = {word.lower() for word in stopwords}

    def _content_words(text):
        # Split on single spaces, drop stop words, then normalise whitespace.
        kept = (token for token in text.split(' ') if token.lower() not in ignored)
        return ' '.join(kept).split()

    intervention_words = set(_content_words(int_dec))
    if any(word in intervention_words for word in _content_words(summary)):
        return "No"
    return "Yes - Route/Dose info might have been missed"