wzkariampuzha committed on
Commit
0bc8dab
1 Parent(s): e7caceb

Update extract_abs.py

Browse files
Files changed (1) hide show
  1. extract_abs.py +47 -0
extract_abs.py CHANGED
@@ -19,6 +19,7 @@ import json
19
  import codecs
20
  from unidecode import unidecode
21
  from collections import OrderedDict
 
22
  from typing import (
23
  Dict,
24
  List,
@@ -275,6 +276,52 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
275
 
276
  print(len(results),'abstracts classified as epidemiological.')
277
  return results.sort_values('EPI_PROB', ascending=False)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
278
 
279
  #Identical to search_term_extraction, except it returns a JSON object instead of a df
280
  def API_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
 
19
  import codecs
20
  from unidecode import unidecode
21
  from collections import OrderedDict
22
+ import streamlit as st
23
  from typing import (
24
  Dict,
25
  List,
 
276
 
277
  print(len(results),'abstracts classified as epidemiological.')
278
  return results.sort_values('EPI_PROB', ascending=False)
279
+
280
+ #Returns a Pandas dataframe
281
def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                         NER_pipeline:Any, entity_classes:Union[Set[str],List[str]], #for biobert extraction
                         extract_diseases:bool, GARD_dict:Dict[str,str], max_length:int, #for disease extraction
                         classify_model_vars:Tuple[Any,Any,Any,Any,Any]) -> Any: #for classification
    """Search for abstracts, classify them, and extract entities, reporting progress via Streamlit.

    Status messages and a progress bar are written to the Streamlit app
    (``st.write`` / ``st.progress``) instead of stdout.

    Args:
        search_term: Term handed to ``autosearch`` together with ``GARD_dict``
            to build the list of terms that is actually searched.
        maxResults: Forwarded to ``classify_abs.search_getAbs``.
        filtering: Forwarded to ``classify_abs.search_getAbs``.
        NER_pipeline: Callable applied to every sentence of an abstract that
            was classified as epidemiological.
        entity_classes: Entity labels; ``order_labels`` turns them into the
            trailing output columns.
        extract_diseases: When true, the ``'IDS'`` and ``'DIS'`` columns are
            added to the output; also forwarded to ``parse_info``.
        GARD_dict: Forwarded to ``autosearch`` and ``parse_info``.
        max_length: Forwarded to ``parse_info``.
        classify_model_vars: Forwarded to ``classify_abs.getTextPredictions``.

    Returns:
        A pandas DataFrame with one row per abstract that was classified as
        epidemiological AND produced a non-empty extraction, sorted by
        ``'EPI_PROB'`` in descending order. The frame is empty (but carries
        the expected columns) when nothing qualifies.
    """
    #Format of Output
    ordered_labels = order_labels(entity_classes)
    if extract_diseases:
        columns = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi','IDS','DIS']+ordered_labels
    else:
        columns = ['PMID', 'ABSTRACT','EPI_PROB','IsEpi']+ordered_labels

    ##Check to see if search term maps to anything in the GARD dictionary, if so it pulls up all synonyms for the search
    search_term_list = autosearch(search_term, GARD_dict)
    if len(search_term_list)>1:
        st.write("SEARCH TERM MATCHED TO GARD DICTIONARY. SEARCHING FOR: "+ str(search_term_list))
    else:
        st.write("SEARCHING FOR: "+ str(search_term_list))

    #Gather title+abstracts into a dictionary {pmid:abstract}
    pmid_abs = classify_abs.search_getAbs(search_term_list, maxResults, filtering)
    total = len(pmid_abs)
    st.write("GATHERED " +str(total)+" PubMed IDs.")

    my_bar = st.progress(0)
    #Rows are collected in a list and turned into a DataFrame once at the end;
    #appending to a DataFrame per row is quadratic and DataFrame.append no longer exists in pandas 2.x
    rows = []
    #When no abstracts were gathered the loop body never runs, so `total` is never used as a divisor
    for done, (pmid, abstract) in enumerate(pmid_abs.items(), start=1):
        epi_prob, isEpi = classify_abs.getTextPredictions(abstract, classify_model_vars)
        if isEpi:
            #Preprocessing Functions for Extraction
            sentences = str2sents(abstract)
            model_outputs = [NER_pipeline(sent) for sent in sentences]
            extraction = parse_info(sentences, model_outputs, entity_classes, extract_diseases, GARD_dict, max_length)
            if extraction:
                extraction.update({'PMID':pmid, 'ABSTRACT':abstract, 'EPI_PROB':epi_prob, 'IsEpi':isEpi})
                rows.append(extraction)
        #st.progress takes a float only as a fraction in [0.0, 1.0] (ints are 0-100),
        #so report the completed fraction rather than a float percentage
        my_bar.progress(done/total)

    results = pd.DataFrame(rows)
    #Keep the declared column order first; any additional keys returned by parse_info follow
    extra_columns = [col for col in results.columns if col not in columns]
    results = results.reindex(columns=columns+extra_columns)

    st.write(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
325
 
326
  #Identical to search_term_extraction, except it returns a JSON object instead of a df
327
  def API_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search