##==============================================================================
# statuses whose summary sentence opens with "It is in a/an ..."
_ONGOING_STATUSES = ('Active, not recruiting', 'Recruiting')


def get_first_word(alloc, masking, status):
    """Return the opening words of the summary sentence.

    The article is 'An ' when ``alloc`` is 'n/a' and ``masking`` is
    'none (open label)' (both compared case-insensitively); otherwise it
    is 'A '.  For the statuses 'Active, not recruiting' and 'Recruiting'
    the lower-cased article is prefixed with 'It is in '.
    """
    print('Getting first word..')
    open_label_no_alloc = (alloc.lower() == 'n/a'
                           and masking.lower() == 'none (open label)')
    article = 'An ' if open_label_no_alloc else 'A '
    if status in _ONGOING_STATUSES:
        return 'It is in ' + article.lower()
    return article


##==============================================================================
#get masking type
_MASK_PHRASES = {
    'double': 'double-blind, ',
    'none (open label)': 'open-label, ',
    'quadruple': 'quadruple-blind, ',
}


def get_mask(masking):
    """Translate a masking value into the phrase used in the summary.

    Known values (matched case-insensitively and exactly) are mapped
    through ``_MASK_PHRASES``; any other string is returned unchanged.
    A non-string value (e.g. ``None``) yields ``None``.
    """
    print('Getting mask..')
    if not isinstance(masking, str):
        return None
    # Exact lookup: a substring test against 'quadruple' would also accept
    # '' or fragments such as 'quad'.
    return _MASK_PHRASES.get(masking.lower(), masking)


##==============================================================================
#get study type
def get_stype(stype):
    """Return 'interventional study ' for an interventional study type
    (case-insensitive) and 'observational study ' for anything else."""
    print('Getting study type...')
    if stype.lower() == 'interventional':
        return 'interventional study '
    return 'observational study '


##==============================================================================
# get intervention model
def get_imodel(imodel):
    """Return the lower-cased intervention model followed by ', '.

    Returns ``None`` when ``imodel`` is ``None``.
    """
    print('Getting imodel...')
    if imodel is None:
        return None
    return imodel.lower() + ', '


##==============================================================================
#get objective
# Order matters: the first keyword found decides which sentences are used.
_OBJECTIVE_KEYWORDS = (
    'to Demonstrate', 'to Evaluate', 'to Investigate', 'to Assess',
    'to Determine', 'Placebo', 'Purpose', 'aim', 'purpose', 'main purpose',
    'Aim', 'Objective', 'objective', 'Main Objective', 'Selection Study',
    'Main Purpose', 'Main Aim', 'Study', 'STUDY', 'study',
    'Ascending Multiple-dose', 'Adaptive', 'Dose Escalation', 'assess',
    'Bioavailability', 'investigate', 'Investigating',
)


def _sentences_with(text, word, ignore_case=False):
    """Return the '. '-separated sentences of *text* that contain *word*,
    each terminated by exactly one period."""
    picked = []
    for sentence in text.split('. '):
        haystack = sentence.lower() if ignore_case else sentence
        if word in haystack:
            # The last sentence usually keeps its own period; do not add a
            # second one.
            picked.append(sentence if sentence.endswith('.') else sentence + '.')
    return picked


def get_obj(otitle, bsumm, ddesc):
    """Extract the study objective from title, brief summary or description.

    The three texts are tried in that order; the first one containing a
    keyword from ``_OBJECTIVE_KEYWORDS`` wins:

    * title   - keywords are matched case-sensitively; the result is the
      text from the first keyword occurrence to the end of the matching
      sentence(s), without the trailing period.
    * summary - matching sentences joined with ', '.
    * description - matching sentences concatenated.

    The selected text is passed through ``non_abbr``.  ``None`` inputs are
    treated as empty strings.  Returns 'No Objective Found' when no keyword
    is present anywhere.
    """
    print('Getting objective for..')
    otitle = otitle or ''
    bsumm = bsumm or ''
    ddesc = ddesc or ''

    title_hits = [kw for kw in _OBJECTIVE_KEYWORDS if kw in otitle]
    # NOTE(review): summary and description are searched in lower case, so
    # only the all-lower-case keywords can hit there; kept as in the original.
    summary_hits = [kw for kw in _OBJECTIVE_KEYWORDS if kw in bsumm.lower()]
    descr_hits = [kw for kw in _OBJECTIVE_KEYWORDS if kw in ddesc.lower()]

    if title_hits:
        word = title_hits[0]
        sobj = ''.join(_sentences_with(otitle, word))
        # Keep everything from the keyword onwards and drop the final period.
        return non_abbr(sobj[sobj.find(word):-1])
    if summary_hits:
        # The hit was found in lower-cased text, so select the sentences
        # case-insensitively as well.
        matched = _sentences_with(bsumm, summary_hits[0], ignore_case=True)
        return non_abbr(', '.join(matched))
    if descr_hits:
        matched = _sentences_with(ddesc, descr_hits[0], ignore_case=True)
        return non_abbr(''.join(matched))
    return 'No Objective Found'


##==============================================================================
# other study id extract
def get_osid(osid, sid):
    """Format the '|'-separated study ids as a parenthesised list.

    * both given and ``sid`` non-empty: '(<osid parts joined by "; ">; '
      '<sid parts joined by ", ">) '
    * only ``osid`` usable (``sid`` is ``None`` or ''): '(<osid parts>) '
    * only ``sid`` given: '(<sid parts joined by "; ">) '
    * both ``None``: ``None``
    """
    print('Getting Study Ids...')
    if osid is not None and sid is not None and sid != '':
        return ('(' + '; '.join(osid.split('|')) + '; '
                + ', '.join(sid.split('|')) + ') ')
    if osid is not None:
        # Also reached when sid == '' (previously this case returned None).
        return '(' + '; '.join(osid.split('|')) + ') '
    if sid is not None:
        return '(' + '; '.join(sid.split('|')) + ') '
    return None


##==============================================================================
# get locations
def join_and(items):
    """Join *items* with ', ', putting ', and ' before the last element
    when there is more than one."""
    if len(items) > 1:
        return ', '.join(items[:-1]) + ', and ' + items[-1]
    return ', '.join(items)


def get_locs(locations):
    """Build the ' in <locations>, ' phrase from a '|'-separated string.

    Duplicate entries are removed and the remainder sorted before joining
    with ``join_and``.  An empty or ``None`` value yields ''.
    """
    print('Getting Locations...')
    if not locations:
        return ''
    if '|' in locations:
        res = join_and(sorted(set(locations.split('|'))))
    else:
        res = locations
    if res == '':
        return res
    return ' in ' + res + ', '


##==============================================================================
# status extract
# 'Recruiting' has no entry here (its entry was commented out in the original).
status_dict = {'Not yet recruiting': ', is planned ',
               'Active, not recruiting': ' (enrollment complete) ',
               'Completed': ', is complete ',
               'Terminated': ', has been terminated',
               'Suspended': ', has been suspended',
               'Withdrawn': ', has been withdrawn'
               }


def get_status(status):
    """Return the summary phrase for a recruitment status.

    The status must equal a key of ``status_dict``; any other string yields
    '' and a non-string value yields ``None``.
    """
    print('Getting trial type...')
    if not isinstance(status, str):
        return None
    # Exact lookup: substring matching let '' or partial statuses match
    # several keys, and no match at all produced the literal text '[]'.
    return status_dict.get(status, '')


##==============================================================================
# lower non abbr word for ystop
def non_abbr(string):
    """Lower-case every space-separated word except those for which
    ``str.isupper()`` is true (treated as abbreviations)."""
    return ' '.join(word if word.isupper() else word.lower()
                    for word in string.split(' '))


##==============================================================================
# reason for stop extract
def get_ystop(ystop):
    """Return ', due to <reason>' with the reason passed through
    ``non_abbr``; returns ``None`` when ``ystop`` is ''."""
    print('Getting ystop...')
    if ystop == '':
        return None
    return ', ' + 'due to ' + non_abbr(ystop)


##==============================================================================
#get age
def get_age(minage, maxage):
    """Return the age phrase: a range when ``maxage`` is non-empty,
    otherwise only the minimum age."""
    if maxage != '':
        return 'aged between ' + minage + ' and ' + maxage
    return 'with minimum age of ' + minage


##==============================================================================
# get link
def get_url(nctid, lupd):
    """Return the parenthesised ClinicalTrials.gov citation built from the
    last-update text ``lupd`` and the record URL for ``nctid``."""
    print('Cooking up final url...')
    urll = 'https://clinicaltrials.gov/ct2/show/'
    return ' (' + 'ClinicalTrials.gov, ' + lupd + ', ' + urll + nctid + ')'
#================================================================================
#count all cap words
def count_caps(summary, rr_value):
    """Flag a summary that contains more than ten upper-case words.

    Only runs when ``rr_value`` is 'No'; any other value is returned
    unchanged.  Returns the warning text when more than ten words are found
    in the runs matched by ``\\b[A-Z\\s]+\\b``, otherwise ``rr_value``.
    """
    if rr_value != 'No':
        return rr_value
    upper_runs = re.findall(r"\b[A-Z\s]+\b", summary)
    upper_word_count = len(' '.join(upper_runs).split())
    if upper_word_count > 10:
        return 'Yes - Summary May Contain Lot of Words in Upper Case'
    return rr_value


#================================================================================
#identify route/dose misses
def _content_words(text, stop_set):
    """Return the whitespace-separated words of *text* after dropping the
    space-separated tokens whose lower-case form is in *stop_set*."""
    kept = ' '.join(tok for tok in text.split(' ') if tok.lower() not in stop_set)
    return kept.split()


def route_miss(summary, rr_value, int_dec):
    """Check whether the summary shares any word with the intervention text.

    Only runs when ``rr_value`` is 'No'; any other value is returned
    unchanged.  Stop words (module-level ``stopwords``, compared in lower
    case) are removed from both texts first.  Returns "No" when at least
    one remaining summary word also occurs in ``int_dec`` (case-sensitive),
    otherwise the route/dose warning text.
    """
    if rr_value != 'No':
        return rr_value
    # Build the lower-cased stop-word set once instead of once per token.
    stop_set = {word.lower() for word in stopwords}
    summary_words = _content_words(summary, stop_set)
    intervention_words = set(_content_words(int_dec, stop_set))
    if any(word in intervention_words for word in summary_words):
        return "No"
    return "Yes - Route/Dose info might have been missed"