Updated classification model and GARD_search data

#1
Files changed (1)
  1. epi_pipeline.py +52 -92
epi_pipeline.py CHANGED
@@ -221,69 +221,38 @@ def streamlit_getAbs(searchterm_list:Union[List[str],List[int],str], maxResults:
     return pmid_abs, (found, relevant)
 
 ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
-import os
-os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'
-from tensorflow.keras.preprocessing.sequence import pad_sequences
-from tensorflow.keras.preprocessing.text import tokenizer_from_json
-import tensorflow as tf
-import numpy as np
-import spacy
-import json
 
 class Classify_Pipeline:
-    def __init__(self, model_name:str='LSTM_RNN_Model'):
-        #Load spaCy models
-        self.nlp = spacy.load('en_core_web_lg')
-        self.nlpSci = spacy.load("en_ner_bc5cdr_md")
-        self.nlpSci2 = spacy.load('en_ner_bionlp13cg_md')
-        # load the tokenizer
-        with open(model_name+'/tokenizer.json') as f:
-            self.classify_tokenizer = tokenizer_from_json(json.load(f))
-        #OLD Code - used pickle which is unsafe
-        #with open(model+'/tokenizer.pickle', 'rb') as handle:
-        #    import pickle
-        #    self.classify_tokenizer = pickle.load(handle)
-        # Defaults to load my_model_orphanet_final, the most up-to-date version of the classification model,
-        # but can also be run on any other tf.keras model
-
-        # load the model
-        self.classify_model = tf.keras.models.load_model(model_name)
-        # for preprocessing
-        from nltk.corpus import stopwords
-        self.STOPWORDS = set(stopwords.words('english'))
-        # Modes
-        self.max_length = 300
-        self.trunc_type = 'post'
-        self.padding_type = 'post'
-
     def __str__(self) -> str:
-        return "Instantiation: epi_classify = Classify_Pipeline(path_to_model_folder)" + "\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
 
     def __call__(self, abstract:str) -> Tuple[float,bool]:
         return self.getTextPredictions(abstract)
 
     def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
         if len(abstract)>5:
-            # remove stopwords
-            for word in self.STOPWORDS:
-                token = ' ' + word + ' '
-                abstract = abstract.replace(token, ' ')
-                abstract = abstract.replace('  ', ' ')
-
-            # preprocess abstract
-            abstract_standard = [self.standardizeAbstract(self.standardizeSciTerms(abstract))]
-            sequence = self.classify_tokenizer.texts_to_sequences(abstract_standard)
-            padded = pad_sequences(sequence, maxlen=self.max_length, padding=self.padding_type, truncating=self.trunc_type)
-
-            y_pred1 = self.classify_model.predict(padded) # generate prediction
-            y_pred = np.argmax(y_pred1, axis=1) # get binary prediction
-
-            prob = y_pred1[0][1]
-            if y_pred == 1:
-                isEpi = True
-            else:
-                isEpi = False
-
             return prob, isEpi
         else:
             return 0.0, False
@@ -292,36 +261,6 @@ class Classify_Pipeline:
         abstract = PMID_getAb(pmid)
         prob, isEpi = self.getTextPredictions(abstract)
         return abstract, prob, isEpi
-
-    # Standardize the abstract by replacing all named entities with their entity label.
-    # Eg. 3 patients reported at a clinic in England --> CARDINAL patients reported at a clinic in GPE
-    # expects the spaCy model en_core_web_lg as input
-    def standardizeAbstract(self, abstract:str) -> str:
-        doc = self.nlp(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            if e.label_ in {'PERCENT','CARDINAL','GPE','LOC','DATE','TIME','QUANTITY','ORDINAL'}:
-                start = e.start_char
-                end = start + len(e.text)
-                newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract
-
-    # Same as above but replaces biomedical named entities from scispaCy models
-    # Expects as input en_ner_bc5cdr_md and en_ner_bionlp13cg_md
-    def standardizeSciTerms(self, abstract:str) -> str:
-        doc = self.nlpSci(abstract)
-        newAbstract = abstract
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-
-        doc = self.nlpSci2(newAbstract)
-        for e in reversed(doc.ents):
-            start = e.start_char
-            end = start + len(e.text)
-            newAbstract = newAbstract[:start] + e.label_ + newAbstract[end:]
-        return newAbstract
 
 ## Section: GARD SEARCH
 # can identify rare diseases in text using the GARD dictionary from neo4j
@@ -331,11 +270,13 @@ class GARD_Search:
     def __init__(self):
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
-        with codecs.open('gard-id-name-synonyms.json', 'r', 'utf-8-sig') as f:
-            diseases = json.load(f)
        from nltk.corpus import stopwords
        STOPWORDS = set(stopwords.words('english'))
 
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
@@ -356,6 +297,7 @@ class GARD_Search:
                GARD_dict[s] = entry['gard_id']
                max_length = max(max_length,len(s.split()))
 
        self.GARD_dict = GARD_dict
        self.max_length = max_length
 
@@ -444,6 +386,12 @@ class GARD_Search:
            print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
            return [searchterm]
 
 ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
 from nltk import tokenize as nltk_tokenize
 from dataclasses import dataclass
@@ -455,6 +403,7 @@ import re
 from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
 from unidecode import unidecode
 from collections import OrderedDict
 import pandas as pd
 from more_itertools import pairwise
 
@@ -855,6 +804,11 @@ def API_search_classification(search_term:Union[int,str], maxResults:int,
 
    return results
 
 def API_text_classification(text:str, epi_classify:Classify_Pipeline) -> Dict[str,str]:
    epi_prob, isEpi = epi_classify(text)
    return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
@@ -900,7 +854,7 @@ def search_term_extraction(search_term:Union[int,str], maxResults:int, filtering
    print(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
 
-#Returns a Pandas dataframe
 def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                          epi_ner:NER_Pipeline, #for biobert extraction
                          GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
@@ -1021,7 +975,7 @@ def API_text_extraction(text:str, #Text to be extracted
    else:
        json_output = ['ABSTRACT']+ordered_labels
 
-    results = {'entries':[]}
    #Do the extraction
    if extract_diseases:
        extraction = epi_ner(text, GARD_Search)
@@ -1031,15 +985,17 @@ def API_text_extraction(text:str, #Text to be extracted
    if extraction:
        #Re-order the dictionary into desired JSON output
        extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-        results['entries'].append(extraction)
 
-    return results
 
 def API_text_classification_extraction(text:str, #Text to be extracted
                                        epi_ner:NER_Pipeline, #for biobert extraction
                                        GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                        epi_classify:Classify_Pipeline) -> Dict[str,str]:
-
    #Format of Output
    ordered_labels = order_labels(epi_ner.labels)
    if extract_diseases:
@@ -1061,7 +1017,11 @@ def API_text_classification_extraction(text:str, #Text to be extracted
 
        #Re-order the dictionary into desired JSON output
        output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
-    return output
 
 ## Section: Deprecated Functions
 import requests
 
epi_pipeline.py (updated file)
     return pmid_abs, (found, relevant)
 
 ## Section: LSTM RNN Epi Classification Model (EpiClassify4GARD)
 
+# Imports
+from transformers import AutoModelForSequenceClassification, BertTokenizer, BertConfig
 class Classify_Pipeline:
+    def __init__(self, name_or_path_to_model_folder:str = "ncats/EpiClassify4GARD"):
+        #Initialize tokenizer and model
+        self.config = BertConfig.from_pretrained(name_or_path_to_model_folder)
+        self.tokenizer = BertTokenizer.from_pretrained(self.config._name_or_path, model_max_length=self.config.max_position_embeddings)
+        self.model = AutoModelForSequenceClassification.from_pretrained(name_or_path_to_model_folder, config=self.config)
+
+        #Custom pipeline by WKariampuzha @NCATS (not Huggingface/Google/NVIDIA copyright)
 
     def __str__(self) -> str:
+        return "Instantiation: epi_classify = Classify_Pipeline(name_or_path_to_model_folder)" + "\n Calling: prob, isEpi = epi_classify(text) \n PubMed ID Predictions: abstracts, prob, isEpi = epi_classify.getPMIDPredictions(pmid)"
 
     def __call__(self, abstract:str) -> Tuple[float,bool]:
         return self.getTextPredictions(abstract)
 
     def getTextPredictions(self, abstract:str) -> Tuple[float,bool]:
         if len(abstract)>5:
+            #Tokenize the abstract; pad/truncate to the model's maximum input length
+            input_ids = self.tokenizer(text=abstract, max_length=self.config.max_position_embeddings, padding="max_length", truncation=True, return_tensors='pt')
+            if input_ids['input_ids'].shape[1] > self.config.max_position_embeddings:
+                raise ValueError(f"Token embeddings of length {input_ids['input_ids'].shape[1]} exceed the maximum model embedding input {self.config.max_position_embeddings}.")
+            #TODO: split into sentences?
+            # The model output holds raw logits for the two classes: [[logit_is_False, logit_is_True]]
+            output = self.model(**input_ids)
+            # True = 1, False = 0
+            isEpi = bool(output.logits.argmax().item())
+            # softmax converts the logits to a Torch Tensor of two probabilities: [[prob_is_False, prob_is_True]]
+            prob_tensor = output.logits.softmax(dim=-1)
+            # We only want to return the probability that it is true
+            prob = prob_tensor[0][1].item()
             return prob, isEpi
         else:
             return 0.0, False
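To make the logit-to-probability step concrete, here is a minimal standalone sketch (the logit values are illustrative, not real model outputs):

    import torch
    logits = torch.tensor([[-1.2, 2.3]])   # illustrative logits for [False, True]
    probs = logits.softmax(dim=-1)         # tensor([[0.0293, 0.9707]])
    isEpi = bool(logits.argmax().item())   # argmax index 1 -> True
    prob = probs[0][1].item()              # ~0.97, probability of the True class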
 
         abstract = PMID_getAb(pmid)
         prob, isEpi = self.getTextPredictions(abstract)
         return abstract, prob, isEpi
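For reference, a minimal usage sketch of the new pipeline, matching the calls documented in __str__ (the input text and PMID below are illustrative):

    epi_classify = Classify_Pipeline()   # pulls ncats/EpiClassify4GARD from the Hugging Face Hub by default
    prob, isEpi = epi_classify("The birth prevalence of the syndrome was estimated at 1 in 40,000 live births in Finland.")
    abstract, prob, isEpi = epi_classify.getPMIDPredictions("12345678")   # illustrative PMID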
 
 ## Section: GARD SEARCH
 # can identify rare diseases in text using the GARD dictionary from neo4j
 
     def __init__(self):
        import json, codecs
        #These are opened locally so that garbage collection removes them from memory
+        r = requests.get('https://raw.githubusercontent.com/ncats/epi4GARD/master/EpiExtract4GARD/gard-id-name-synonyms.json')
+        diseases = json.loads(r.content)
        from nltk.corpus import stopwords
        STOPWORDS = set(stopwords.words('english'))
 
+        #This should be a list of all GARD IDs for purposes like random choice
+        GARD_id_list = [entry['gard_id'] for entry in diseases]
        #keys are going to be disease names, values are going to be the GARD ID, set up this way bc dictionaries are faster lookup than lists
        GARD_dict = {}
        #Find out what the length of the longest disease name sequence is, of all names and synonyms. This is used by get_diseases
 
            GARD_dict[s] = entry['gard_id']
            max_length = max(max_length,len(s.split()))
 
+        self.GARD_id_list = GARD_id_list
        self.GARD_dict = GARD_dict
        self.max_length = max_length
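For intuition, the resulting structures map every disease name and synonym to its GARD ID, alongside the new ID list (entries below are purely illustrative, not real GARD records):

    GARD_dict = {'example disease': 'GARD-0001', 'example syndrome': 'GARD-0001'}   # illustrative only
    GARD_id_list = ['GARD-0001', 'GARD-0002']
    max_length = 2   # longest name/synonym seen, in tokens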
 
 
            print("SEARCH TERM DID NOT MATCH TO GARD DICTIONARY. SEARCHING BY USER INPUT")
            return [searchterm]
 
+    # Return a random GARD_ID search term list
+    def random_disease(self) -> List[str]:
+        import random
+        gard_id = random.choice(self.GARD_id_list)
+        return self.autosearch(gard_id)
+
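A quick sketch of how the new helper composes with autosearch (the instance name is arbitrary):

    rd_search = GARD_Search()
    search_terms = rd_search.random_disease()   # e.g. a randomly chosen disease name plus its synonyms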
 ## Section: BioBERT-based epidemiology NER Model (EpiExtract4GARD)
 from nltk import tokenize as nltk_tokenize
 from dataclasses import dataclass
 
 from transformers import BertConfig, AutoModelForTokenClassification, BertTokenizer, Trainer
 from unidecode import unidecode
 from collections import OrderedDict
+import json
 import pandas as pd
 from more_itertools import pairwise
 
 
    return results
 
+def API_PMID_classification(pmid:Union[int,str], epi_classify:Classify_Pipeline) -> Dict[str,str]:
+    text = PMID_getAb(pmid)
+    epi_prob, isEpi = epi_classify(text)
+    return {'PMID':pmid, 'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
+
 def API_text_classification(text:str, epi_classify:Classify_Pipeline) -> Dict[str,str]:
    epi_prob, isEpi = epi_classify(text)
    return {'ABSTRACT':text, 'EPI_PROB':str(epi_prob), 'IsEpi':isEpi}
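Both classification endpoints return flat JSON-style dicts; an illustrative API_PMID_classification payload (all values made up):

    {'PMID': '12345678', 'ABSTRACT': 'We estimated the prevalence of ...', 'EPI_PROB': '0.97', 'IsEpi': True}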
 
    print(len(results),'abstracts classified as epidemiological.')
    return results.sort_values('EPI_PROB', ascending=False)
 
+#Returns a Pandas dataframe
 def streamlit_extraction(search_term:Union[int,str], maxResults:int, filtering:str, #for abstract search
                          epi_ner:NER_Pipeline, #for biobert extraction
                          GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
 
    else:
        json_output = ['ABSTRACT']+ordered_labels
 
+    extraction = dict()
    #Do the extraction
    if extract_diseases:
        extraction = epi_ner(text, GARD_Search)
 
    if extraction:
        #Re-order the dictionary into desired JSON output
        extraction = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
+    else:
+        #This may return JSONs of a different length than above
+        extraction = OrderedDict([(term, []) for term in json_output])
 
+    return extraction
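With this change, a text that yields no extraction now returns an OrderedDict with the expected keys but empty values, instead of the old {'entries': []} wrapper; with illustrative NER labels:

    OrderedDict([('ABSTRACT', []), ('LOC', []), ('EPI', []), ('STAT', [])])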
 
 
 def API_text_classification_extraction(text:str, #Text to be extracted
                                        epi_ner:NER_Pipeline, #for biobert extraction
                                        GARD_Search:GARD_Search, extract_diseases:bool, #for disease extraction
                                        epi_classify:Classify_Pipeline) -> Dict[str,str]:
+
    #Format of Output
    ordered_labels = order_labels(epi_ner.labels)
    if extract_diseases:
 
        #Re-order the dictionary into desired JSON output
        output = OrderedDict([(term, extraction[term]) for term in json_output if term in extraction.keys()])
+    else:
+        #This may return JSONs of a different length than above
+        output = OrderedDict([(term, []) for term in json_output])
+
+    return output
 
 ## Section: Deprecated Functions
 import requests