wzkariampuzha committed on
Commit
d5406d4
1 Parent(s): 31ca6c1

Update classify_abs.py

Browse files
Files changed (1) hide show
  1. classify_abs.py +100 -0
classify_abs.py CHANGED
@@ -277,6 +277,106 @@ def search_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int
277
 
278
  return pmid_abs
279
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
280
  # Generate predictions for a PubMed Id
281
  # nlp: en_core_web_lg
282
  # nlpSci: en_ner_bc5cdr_md
 
277
 
278
  return pmid_abs
279
 
280
def streamlist_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:int, filtering:str) -> Dict[str,str]:
    """Search PubMed and Europe PMC for the given terms and return their abstracts.

    Streamlit variant of the abstract gatherer: it renders a progress bar
    while abstracts are being downloaded.

    Args:
        searchterm_list: A single search term, or an iterable of search terms.
            Non-string items (e.g. ints) are converted with ``str``.
        maxResults: Upper bound on the number of distinct PMIDs collected
            across all terms and both search services. Zero or a negative
            value collects nothing.
        filtering: ``'strict'`` keeps an abstract only if a full search term
            occurs in it (case-insensitive substring); ``'none'`` keeps every
            abstract; any other value is treated as ``'lenient'`` and keeps an
            abstract if any search term or non-stopword key word matches one
            of its tokens (case-insensitive).

    Returns:
        Dictionary ``{pmid: abstract}`` of the abstracts that passed filtering.

    Raises:
        xml.etree.ElementTree.ParseError: If a search service answers with
            something that is not well-formed XML.
        Whatever ``requests.get`` or ``PMID_getAb`` raise is propagated.
    """
    # Function-scope import so this block does not depend on the file's
    # top-level import list.
    from urllib.parse import quote

    # Input normalisation: accept a bare string or any iterable of terms, and
    # coerce every term to str so the advertised List[int] input works with
    # the string operations below.
    if isinstance(searchterm_list, str):
        searchterm_list = [searchterm_list]
    searchterm_list = [str(term) for term in searchterm_list]

    # set of all pmids
    pmids = set()

    # dictionary {pmid: abstract}
    pmid_abs = {}

    my_bar = st.progress(0)

    # gathers pmids into a set first
    for dz in searchterm_list:
        if len(pmids) >= maxResults:
            break

        # Collapse runs of whitespace and percent-encode the whole term, so
        # characters such as '&' or '#' cannot alter the request URL.
        query = quote(' '.join(dz.split()), safe='')

        ## get pmid results from searching for disease name through PubMed API
        url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=' + query
        root = ET.fromstring(requests.get(url).content)

        # Add ids one at a time so the maxResults cap is actually honoured.
        for id_list in root.iter('IdList'):
            for id_node in id_list.iter('Id'):
                if len(pmids) >= maxResults:
                    break
                if id_node.text:
                    pmids.add(id_node.text)

        if len(pmids) >= maxResults:
            break

        ## get results from searching for disease name through EBI API
        url = 'https://www.ebi.ac.uk/europepmc/webservices/rest/search?query=' + query + '&resulttype=core'
        root = ET.fromstring(requests.get(url).content)

        # loop over resulting articles
        for result in root.iter('result'):
            if len(pmids) >= maxResults:
                break
            # Only the first <id> of each result is considered, and only when
            # it starts with a digit; other identifiers are skipped.
            # can also gather abstract and title here but for some reason did
            # not work as intended the first time. Optimize in future versions
            # to reduce latency.
            first_id = next(result.iter('id'), None)
            id_text = first_id.text if first_id is not None else None
            if id_text and id_text[0].isdigit():
                pmids.add(id_text)

    # Construct the term set for lenient filtering (right before adding
    # abstracts to pmid_abs). This is a second check of the abstracts that
    # filters out any abstract unrelated to the search terms.
    # It is only needed when filtering is 'lenient' (the default), i.e.
    # neither 'none' nor 'strict'.
    filter_terms = set()
    if filtering not in ('none', 'strict'):
        # Key words: every word of every search term, commas removed,
        # lower-cased, minus the stopwords.
        # NOTE(review): assumes STOPWORDS holds lower-case words - confirm.
        joined = re.sub(',', '', ' '.join(searchterm_list)).lower()
        key_words = set(joined.split()).difference(STOPWORDS)
        # If any token of the abstract intersects with these terms then the
        # abstract is good to go.
        filter_terms = {term.lower() for term in searchterm_list}.union(key_words)

    ## get abstracts from EBI PMID API and output a dictionary
    total = len(pmids)
    for done, pmid in enumerate(pmids, start=1):
        abstract = PMID_getAb(pmid)
        # Anything this short is treated as "no abstract available".
        if len(abstract) > 5:
            if filtering == 'strict':
                uncased_ab = abstract.lower()
                if any(term.lower() in uncased_ab for term in searchterm_list):
                    pmid_abs[pmid] = abstract
            elif filtering == 'none':
                pmid_abs[pmid] = abstract
            # Default filtering is 'lenient'. Tokens are lower-cased so this
            # mode is never stricter about letter case than 'strict'.
            elif filter_terms.intersection(token.lower() for token in word_tokenize(abstract)):
                pmid_abs[pmid] = abstract

        # Advance the progress bar by the share of PMIDs processed so far.
        my_bar.progress(int(100 * done / total))

    print('Found',len(pmids),'PMIDs. Gathered',len(pmid_abs),'Relevant Abstracts.')

    return pmid_abs
379
+
380
  # Generate predictions for a PubMed Id
381
  # nlp: en_core_web_lg
382
  # nlpSci: en_ner_bc5cdr_md