Updated classification model and GARD_search data

#1
Files changed (1)
  1. epi_pipeline.py +52 -92
epi_pipeline.py CHANGED
@@ -221,69 +221,38 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
     return pmid_abs, (found, relevant)
 
 ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import tokenizer_from_json
-import tensorflow as tf
-import numpy as np
-import spacy
-import json
 
 class Classify_Pipeline:
-    def __init__(self, model_name:str='LSTM_RNN_Model'):
-        #Load spaCy models
-        self.nlp = spacy.load('en_core_web_lg')
-        self.nlpSci = spacy.load("en_ner_bc5cdr_md")
-        self.nlpSci2 = spacy.load('en_ner_bionlp13cg_md')
-        # load the tokenizer
-        with open(model_name+'/tokenizer.json') as f:
-            self.classify_tokenizer = tokenizer_from_json(json.load(f))
-        #OLD Code - used pickle which is unsafe
-        #with open(model+'/tokenizer.pickle', 'rb') as handle:
-        #    import pickle
-        #    self.classify_tokenizer = pickle.load(handle)
-        # Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
-        # but can also be run on any other tf.keras model
-
-        # load the model
-        self.classify_model = tf.keras.models.load_model(model_name)
-        # for preprocessing
-        from nltk.corpus import stopwords
-        self.STOPWORDS = set(stopwords.words('english'))
-        # Modes
-        self.max_length = 300
-        self.trunc_type = 'post'
-        self.padding_type = 'post'
-
     def __str__(self) -> str:
-        return "Instantiation: epi_classify = Classify_Pipeline(path_to_model_folder)" + "\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
 
     def __call__(self, abstract:str) -> Tuple[float,bool]:
         return self.getTextPredictions(abstract)
 
     def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
         if len(abstract)>5:
-            # remove stopwords
-            for word in self.STOPWORDS:
-                token = ' ' + word + ' '
-                abstract = abstract.replace(token, ' ')
-                abstract = abstract.replace('  ', ' ')
-
-            # preprocess abstract
-            abstract_standard = [self.standardizeAbstract(self.standardizeSciTerms(abstract))]
-            sequence = self.classify_tokenizer.texts_to_sequences(abstract_standard)
-            padded = pad_sequences(sequence, maxlen=self.max_length, padding=self.padding_type, truncating=self.trunc_type)
-
-            y_pred1 = self.classify_model.predict(padded) # generate prediction
-            y_pred = np.argmax(y_pred1, axis=1) # get binary prediction
-
-            prob = y_pred1[0][1]
-            if y_pred == 1:
-                isEpi = True
-            else:
-                isEpi = False
-
             return prob, isEpi
         else:
             return 0.0, False
@@ -292,36 +261,6 @@ class Classify_Pipeline:
         abstract = PMID_getAb(pmid)
         prob, isEpi = self.getTextPredictions(abstract)
         return abstract, prob, isEpi
-
-    # Standardize the abstract by replacing all named entities with their entity label.
-    # Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
-    # expects the spaCy model en_core_web_lg as input
-    def standardizeAbstract(self, abstract:str) -> str:
-        doc = self.nlp(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
-                start = e.start_char
-                end = start + len(e.text)
-                newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract
-
-    # Same as above but replaces biomedical named entities from scispaCy models
-    # Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
-    def standardizeSciTerms(self, abstract:str) -> str:
-        doc = self.nlpSci(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-
-        doc = self.nlpSci2(newAbstract)
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract
 
 ## Section: GARD SEARCH
 # can identify rare diseases in text using the GARD dictionary from neo4j
@@ -331,11 +270,13 @@ class GARD_Search:
     def __init__(self):
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
-        with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
-            diseases = json.load(f)
        from nltk.corpus import stopwords
        STOPWORDS = set(stopwords.words('english'))
 
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
@@ -356,6 +297,7 @@ class GARD_Search:
                GARD_dict[s] = entry['gard_id']
                max_length = max(max_length,len(s.split()))
 
        self.GARD_dict = GARD_dict
        self.max_length = max_length
 
@@ -444,6 +386,12 @@ class GARD_Search:
            print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
            return [searchterm]
 
 ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
 from nltk import tokenize as nltk_tokenize
 from dataclasses import dataclass
@@ -455,6 +403,7 @@ import re
 from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
 from unidecode import unidecode
 from collections import OrderedDict
 import pandas as pd
 from more_itertools import pairwise
 
@@ -855,6 +804,11 @@ def API_search_classification(search_term:Union[int,str], maxResults:int,
 
    return results
 
 def API_text_classification(text:str, epi_classify:Classify_Pipeline) -> Dict[str,str]:
    epi_prob, isEpi = epi_classify(text)
    return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
@@ -900,7 +854,7 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
    print(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
 
-#Returns a Pandas dataframe
 def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                          epi_ner:NER_Pipeline, #for biobert extraction
                          GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
@@ -1021,7 +975,7 @@ def API_text_extraction(text:str, #Text to be extracted
    else:
        json_output = ['ABSTRACT']+ordered_labels
 
-    results = {'entries':[]}
    #Do the extraction
    if extract_diseases:
        extraction = epi_ner(text, GARD_Search)
@@ -1031,15 +985,17 @@ def API_text_extraction(text:str, #Text to be extracted
    if extraction:
        #Re-order the dictionary into desired JSON output
        extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-        results['entries'].append(extraction)
 
-    return results
 
 def API_text_classification_extraction(text:str, #Text to be extracted
                                        epi_ner:NER_Pipeline, #for biobert extraction
                                        GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                        epi_classify:Classify_Pipeline) -> Dict[str,str]:
-
    #Format of Output
    ordered_labels = order_labels(epi_ner.labels)
    if extract_diseases:
@@ -1061,7 +1017,11 @@ def API_text_classification_extraction(text:str, #Text to be extracted
 
        #Re-order the dictionary into desired JSON output
        output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-    return output
 
 ## Section: Deprecated Functions
 import requests
 
epi_pipeline.py (updated file)
     return pmid_abs, (found, relevant)
 
 ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
 
+# Imports
+from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig
 class Classify_Pipeline:
+    def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiClassify4GARD"):
+        #Initialize tokenizer and model
+        self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
+        self.tokenizer = BertTokenizer.from_pretrained(self.config._name_or_path, model_max_length=self.config.max_position_embeddings)
+        self.model = AutoModelForSequenceClassification.from_pretrained(name_or_path_to_model_folder, config=self.config)
+
+        #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
 
     def __str__(self) -> str:
+        return "Instantiation: epi_classify = Classify_Pipeline(name_or_path_to_model_folder)" + "\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
 
     def __call__(self, abstract:str) -> Tuple[float,bool]:
         return self.getTextPredictions(abstract)
 
     def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
         if len(abstract)>5:
+            #Tokenize the abstract; pad/truncate to the model's maximum input length
+            input_ids = self.tokenizer(text=abstract, max_length=self.config.max_position_embeddings, padding="max_length", truncation=True, return_tensors='pt')
+            if input_ids['input_ids'].shape[1] > self.config.max_position_embeddings:
+                raise ValueError(f"Token embeddings of length {input_ids['input_ids'].shape[1]} exceed the maximum model embedding input {self.config.max_position_embeddings}.")
+            #TODO: split into sentences?
+            # The model output holds raw logits for the two classes: [[logit_is_False, logit_is_True]]
+            output = self.model(**input_ids)
+            # True = 1, False = 0
+            isEpi = bool(output.logits.argmax().item())
+            # softmax converts the logits to a Torch Tensor of two probabilities: [[prob_is_False, prob_is_True]]
+            prob_tensor = output.logits.softmax(dim=-1)
+            # We only want to return the probability that it is true
+            prob = prob_tensor[0][1].item()
             return prob, isEpi
         else:
             return 0.0, False
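To make the logit-to-probability step concrete, here is a minimal standalone sketch (the logit values are illustrative, not real model outputs):

    import torch
    logits = torch.tensor([[-1.2, 2.3]])   # illustrative logits for [False, True]
    probs = logits.softmax(dim=-1)         # tensor([[0.0293, 0.9707]])
    isEpi = bool(logits.argmax().item())   # argmax index 1 -> True
    prob = probs[0][1].item()              # ~0.97, probability of the True class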
 
         abstract = PMID_getAb(pmid)
         prob, isEpi = self.getTextPredictions(abstract)
         return abstract, prob, isEpi
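For reference, a minimal usage sketch of the new pipeline, matching the calls documented in __str__ (the input text and PMID below are illustrative):

    epi_classify = Classify_Pipeline()   # pulls ncats/EpiClassify4GARD from the Hugging Face Hub by default
    prob, isEpi = epi_classify("The birth prevalence of the syndrome was estimated at 1 in 40,000 live births in Finland.")
    abstract, prob, isEpi = epi_classify.getPMIDPredictions("12345678")   # illustrative PMID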
 
 ## Section: GARD SEARCH
 # can identify rare diseases in text using the GARD dictionary from neo4j
 
     def __init__(self):
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
+        r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
+        diseases = json.loads(r.content)
        from nltk.corpus import stopwords
        STOPWORDS = set(stopwords.words('english'))
 
+        #This should be a list of all GARD IDs for purposes like random choice
+        GARD_id_list = [entry['gard_id'] for entry in diseases]
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
 
            GARD_dict[s] = entry['gard_id']
            max_length = max(max_length,len(s.split()))
 
+        self.GARD_id_list = GARD_id_list
        self.GARD_dict = GARD_dict
        self.max_length = max_length
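For intuition, the resulting structures map every disease name and synonym to its GARD ID, alongside the new ID list (entries below are purely illustrative, not real GARD records):

    GARD_dict = {'example disease': 'GARD-0001', 'example syndrome': 'GARD-0001'}   # illustrative only
    GARD_id_list = ['GARD-0001', 'GARD-0002']
    max_length = 2   # longest name/synonym seen, in tokens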
 
 
            print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
            return [searchterm]
 
+    # Return a random GARD_ID search term list
+    def random_disease(self) -> List[str]:
+        import random
+        gard_id = random.choice(self.GARD_id_list)
+        return self.autosearch(gard_id)
+
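A quick sketch of how the new helper composes with autosearch (the instance name is arbitrary):

    rd_search = GARD_Search()
    search_terms = rd_search.random_disease()   # e.g. a randomly chosen disease name plus its synonyms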
 ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
 from nltk import tokenize as nltk_tokenize
 from dataclasses import dataclass
 
 from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
 from unidecode import unidecode
 from collections import OrderedDict
+import json
 import pandas as pd
 from more_itertools import pairwise
 
 
    return results
 
+def API_PMID_classification(pmid:Union[int,str], epi_classify:Classify_Pipeline) -> Dict[str,str]:
+    text = PMID_getAb(pmid)
+    epi_prob, isEpi = epi_classify(text)
+    return {'PMID':pmid, 'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
+
 def API_text_classification(text:str, epi_classify:Classify_Pipeline) -> Dict[str,str]:
    epi_prob, isEpi = epi_classify(text)
    return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
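Both classification endpoints return flat JSON-style dicts; an illustrative API_PMID_classification payload (all values made up):

    {'PMID': '12345678', 'ABSTRACT': 'We estimated the prevalence of ...', 'EPI_PROB': '0.97', 'IsEpi': True}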
 
    print(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
 
+#Returns a Pandas dataframe
 def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                          epi_ner:NER_Pipeline, #for biobert extraction
                          GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
 
    else:
        json_output = ['ABSTRACT']+ordered_labels
 
+    extraction = dict()
    #Do the extraction
    if extract_diseases:
        extraction = epi_ner(text, GARD_Search)
 
    if extraction:
        #Re-order the dictionary into desired JSON output
        extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
+    else:
+        #This may return JSONs of a different length than above
+        extraction = OrderedDict([(term, []) for term in json_output])
 
+    return extraction
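With this change, a text that yields no extraction now returns an OrderedDict with the expected keys but empty values, instead of the old {'entries': []} wrapper; with illustrative NER labels:

    OrderedDict([('ABSTRACT', []), ('LOC', []), ('EPI', []), ('STAT', [])])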
 
 
 def API_text_classification_extraction(text:str, #Text to be extracted
                                        epi_ner:NER_Pipeline, #for biobert extraction
                                        GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                        epi_classify:Classify_Pipeline) -> Dict[str,str]:
+
    #Format of Output
    ordered_labels = order_labels(epi_ner.labels)
    if extract_diseases:
 
        #Re-order the dictionary into desired JSON output
        output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
+    else:
+        #This may return JSONs of a different length than above
+        output = OrderedDict([(term, []) for term in json_output])
+
+    return output
 
 ## Section: Deprecated Functions
 import requests