lingbionlp commited on
Commit
4dc59ae
1 Parent(s): 82ee352

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +215 -116
app.py CHANGED
@@ -1,157 +1,256 @@
1
  # -*- coding: utf-8 -*-
2
  """
3
- Created on Tue Nov 22 09:54:41 2022
4
 
5
  @author: luol2
6
  """
7
 
8
-
9
-
10
  import streamlit as st
11
- import argparse
12
- from src.nn_model import bioTag_CNN,bioTag_BERT,bioTag_Bioformer
13
  from src.dic_ner import dic_ont
14
  from src.tagging_text import bioTag
15
  import os
16
- import time
17
  import json
18
- import sys
19
- import nltk
20
- nltk.download('punkt')
21
- nltk.download('averaged_perceptron_tagger')
22
- nltk.download('wordnet')
23
 
24
  st.set_page_config(
25
  page_title="PhenoTagger",
26
- page_icon=":shark:",
27
- # layout="wide",
28
- initial_sidebar_state="expanded",
29
  menu_items={
30
- 'Get Help': 'https://www.extremelycoolapp.com/help',
31
- 'Report a bug': "https://www.extremelycoolapp.com/bug",
32
- 'About': "# This is a header. This is an *extremely* cool app!"
33
  }
34
  )
35
- st.title('PhenoTagger Demo')
36
 
37
 
 
 
 
 
 
 
 
 
 
 
 
 
38
 
39
 
40
-
41
 
 
42
 
43
- # with st.spinner('Model is being loaded..'):
44
-
45
- # print('load model done!')
46
-
47
 
 
48
 
49
-
 
 
 
 
 
 
 
 
 
 
 
50
  with st.form(key="my_form"):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
51
 
52
- @st.cache(allow_output_mutation=True)
53
- def load_model():
54
- ontfiles={'dic_file':'./dict_new/noabb_lemma.dic',
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
55
  'word_hpo_file':'./dict_new/word_id_map.json',
56
  'hpo_word_file':'./dict_new/id_word_map.json'}
57
 
58
- # if para_set['model_type']=='cnn':
59
- # vocabfiles={'w2vfile':'../vocab/bio_embedding_intrinsic.d200',
60
- # 'charfile':'../vocab/char.vocab',
61
- # 'labelfile':'../dict_new/lable.vocab',
62
- # 'posfile':'../vocab/pos.vocab'}
63
- # modelfile='../models/cnn_p5n5_b128_95_hponew1.h5'
64
 
65
- # elif para_set['model_type']=='bioformer':
66
- vocabfiles={'labelfile':'./dict_new/lable.vocab',
67
- 'config_path':'./vocab/bioformer-cased-v1.0/bert_config.json',
68
- 'checkpoint_path':'./vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
69
- 'vocab_path':'./vocab/bioformer-cased-v1.0/vocab.txt'}
70
- modelfile='./vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'
71
- # else:
72
- # print('Model type is wrong, please select cnn or bioformer.')
73
- # sys.exit()
74
 
75
-
76
- biotag_dic=dic_ont(ontfiles)
77
-
78
- # if para_set['model_type']=='cnn':
79
- # nn_model=bioTag_CNN(vocabfiles)
80
- # nn_model.load_model(modelfile)
81
- # elif para_set['model_type']=='bioformer':
82
- nn_model=bioTag_Bioformer(vocabfiles)
83
- session=nn_model.load_model(modelfile)
84
- test_tag='1232'
85
- return nn_model,biotag_dic,test_tag,session
86
-
87
-
88
- #hyper-parameter
89
- st.sidebar.header("Hyperparameter Settings")
90
- sbform = st.sidebar.form("Hyper-paramiters")
91
- # para_model=sbform.selectbox('Model', ['cnn', 'bioformer'])
92
- para_overlap=sbform.selectbox('Return overlapping concepts', ['True', 'False'])
93
- para_abbr=sbform.selectbox('Identify abbreviations', ['True', 'False'])
94
- para_threshold = sbform.slider('Threshold:', min_value=0.5, max_value=0.95, value=0.95, step=0.05)
95
- sbform.form_submit_button("Setting")
96
-
97
- st.write('parameters:', para_overlap,para_abbr,para_threshold)
98
- nn_model,biotag_dic,test_tag,session=load_model()
99
-
100
-
101
- input_text = st.text_area(
102
- "Paste your text below (max 500 words)",
103
- height=510,
104
- )
105
 
106
- MAX_WORDS = 500
107
- import re
108
- res = len(re.findall(r"\w+", input_text))
109
- if res > MAX_WORDS:
110
- st.warning(
111
- "⚠️ Your text contains "
112
- + str(res)
113
- + " words."
114
- + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
115
  )
116
 
117
- input_text = input_text[:MAX_WORDS]
 
 
 
 
 
 
 
 
 
 
 
 
 
118
 
119
- submit_button = st.form_submit_button(label="✨ Get me the data!")
120
-
121
- if para_overlap=='True':
122
- para_overlap=True
123
- else:
124
- para_overlap=False
125
- if para_abbr=='True':
126
- para_abbr=True
127
- else:
128
- para_abbr=False
129
- para_set={
130
- #model_type':para_model, # cnn or bioformer
131
- 'onlyLongest':para_overlap, # False: return overlap concepts, True only longgest
132
- 'abbrRecog':para_abbr,# False: don't identify abbr, True: identify abbr
133
- 'ML_Threshold':para_threshold,# the Threshold of deep learning model
134
- }
135
-
136
-
137
 
138
  if not submit_button:
139
  st.stop()
140
-
141
 
142
- st.markdown(f"""**Results:**\n""")
 
 
 
 
 
 
 
 
 
 
 
 
143
  # print('dic...........:',biotag_dic.keys())
144
- print('........:',test_tag)
145
- print('........!!!!!!:',input_text)
146
- print('...input:',input_text)
147
- tag_result=bioTag(session,input_text,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])
148
- for ele in tag_result:
149
- start = ele[0]
150
- last = ele[1]
151
- mention = input_text[int(ele[0]):int(ele[1])]
152
- type='Phenotype'
153
- id=ele[2]
154
- score=ele[3]
155
- output=start+"\t"+last+"\t"+mention+"\t"+id+'\t'+score+"\n"
156
- st.info(output)
157
-
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
  # -*- coding: utf-8 -*-
2
  """
3
+ Created on Mon Nov 21 16:21:25 2022
4
 
5
  @author: luol2
6
  """
7
 
 
 
8
  import streamlit as st
9
+ from src.nn_model import bioTag_CNN,bioTag_Bioformer
 
10
  from src.dic_ner import dic_ont
11
  from src.tagging_text import bioTag
12
  import os
 
13
  import json
14
+ from pandas import DataFrame
 
 
 
 
15
 
16
  st.set_page_config(
17
  page_title="PhenoTagger",
18
+ page_icon="🎈",
19
+ layout="wide",
 
20
  menu_items={
21
+ 'Get Help': 'https://www.ncbi.nlm.nih.gov/research/bionlp/',
22
+ 'About': "PhenoTagger v1.1"
 
23
  }
24
  )
 
25
 
26
 
27
+ # def _max_width_():
28
+ # max_width_str = f"max-width: 2400px;"
29
+ # st.markdown(
30
+ # f"""
31
+ # <style>
32
+ # .reportview-container .main .block-container{{
33
+ # {max_width_str}
34
+ # }}
35
+ # </style>
36
+ # """,
37
+ # unsafe_allow_html=True,
38
+ # )
39
 
40
 
41
+ # _max_width_()
42
 
43
+ # c30, c31, c32 = st.columns([2.5, 1, 3])
44
 
45
+ # with c30:
46
+ # # st.image("logo.png", width=400)
47
+ st.title("👨‍⚕️ PhenoTagger Demo")
 
48
 
49
+ with st.expander("ℹ️ - About this app", expanded=True):
50
 
51
+ st.write(
52
+ """
53
+ - This app is an easy-to-use interface built in Streamlit for [PhenoTagger](https://github.com/ncbi-nlp/PhenoTagger) library!
54
+ - PhenoTagger is a hybrid method that combines dictionary and deep learning-based methods to recognize Human Phenotype Ontology (HPO) concepts in unstructured biomedical text. Please refer to [our paper](https://doi.org/10.1093/bioinformatics/btab019) for more details.
55
+ - Contact: [NLM/NCBI BioNLP Research Group](https://www.ncbi.nlm.nih.gov/research/bionlp/)
56
+ """
57
+ )
58
+
59
+ st.markdown("")
60
+
61
+ st.markdown("")
62
+ st.markdown("## 📌 Paste document ")
63
  with st.form(key="my_form"):
64
+
65
+
66
+ ce, c1, ce, c2, c3 = st.columns([0.07, 1, 0.07, 4, 0.07])
67
+ with c1:
68
+ ModelType = st.radio(
69
+ "Choose your model",
70
+ ["Bioformer(Default)", "CNN"],
71
+ help="Bioformer is more precise, CNN is more efficient",
72
+ )
73
+
74
+ if ModelType == "Bioformer(Default)":
75
+ # kw_model = KeyBERT(model=roberta)
76
+
77
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the HPO dictionary matcher and the Bioformer tagger.

    Cached by Streamlit so the (slow) model load happens once per session,
    not on every widget interaction / rerun.
    """
    # Ontology dictionary resources used by the dictionary-based matcher.
    ont_files = {
        'dic_file': './dict_new/noabb_lemma.dic',
        'word_hpo_file': './dict_new/word_id_map.json',
        'hpo_word_file': './dict_new/id_word_map.json',
    }

    # Bioformer vocabulary/config plus the fine-tuned HPO tagging weights.
    vocab_files = {
        'labelfile': './dict_new/lable.vocab',
        'config_path': './vocab/bioformer-cased-v1.0/bert_config.json',
        'checkpoint_path': './vocab/bioformer-cased-v1.0/bioformer-cased-v1.0-model.ckpt-2000000',
        'vocab_path': './vocab/bioformer-cased-v1.0/vocab.txt',
    }
    model_file = './vocab/bioformer_p5n5_b64_1e-5_95_hponew3.h5'

    dictionary = dic_ont(ont_files)
    tagger = bioTag_Bioformer(vocab_files)
    tagger.load_model(model_file)
    return tagger, dictionary

nn_model, biotag_dic = load_model()
98
+
99
+ else:
100
@st.cache(allow_output_mutation=True)
def load_model():
    """Build the HPO dictionary matcher and the CNN tagger.

    Cached by Streamlit so the model is loaded once per session.
    """
    # Ontology dictionary resources used by the dictionary-based matcher.
    ont_files = {
        'dic_file': './dict_new/noabb_lemma.dic',
        'word_hpo_file': './dict_new/word_id_map.json',
        'hpo_word_file': './dict_new/id_word_map.json',
    }

    # CNN vocabularies (word embeddings, chars, labels, POS) and weights.
    vocab_files = {
        'w2vfile': './vocab/bio_embedding_intrinsic.d200',
        'charfile': './vocab/char.vocab',
        'labelfile': './dict_new/lable.vocab',
        'posfile': './vocab/pos.vocab',
    }
    model_file = './models/cnn_p5n5_b128_95_hponew1.h5'

    dictionary = dic_ont(ont_files)
    tagger = bioTag_CNN(vocab_files)
    tagger.load_model(model_file)
    return tagger, dictionary

nn_model, biotag_dic = load_model()
 
121
 
122
# Tagging options shown inside the form; values are read after submission.
para_overlap = st.checkbox(
    "Overlap concept",
    value=True,
    help="Tick this box to identify overlapping concepts",
)
para_abbr = st.checkbox(
    "Abbreviations",  # typo fix: label was "Abbreviaitons"
    value=True,
    help="Tick this box to identify abbreviations",
)

para_threshold = st.slider(
    "Threshold",
    min_value=0.5,
    max_value=0.95,
    value=0.95,
    step=0.05,
    # typo fix: was "Retrun the preditions which socre over the threshold."
    help="Return the predictions whose score is over the threshold.",
)
141
+
 
 
 
 
 
 
 
 
 
 
142
 
143
+
144
+
145
+ with c2:
146
+ doc = st.text_area(
147
+ "Paste your text below",
148
+ height=400,
 
 
 
149
  )
150
 
151
+ # MAX_WORDS = 500
152
+ # import re
153
+ # res = len(re.findall(r"\w+", doc))
154
+ # if res > MAX_WORDS:
155
+ # st.warning(
156
+ # "⚠️ Your text contains "
157
+ # + str(res)
158
+ # + " words."
159
+ # + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
160
+ # )
161
+
162
+ # doc = doc[:MAX_WORDS]
163
+
164
+ submit_button = st.form_submit_button(label="✨ Submit!")
165
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
166
 
167
  if not submit_button:
168
  st.stop()
 
169
 
170
+
171
# Collect the tagging hyper-parameters chosen in the form.
para_set={
    # 'model_type':para_model, # cnn or bioformer
    'onlyLongest':para_overlap,    # False: return overlapping concepts, True: only longest
    'abbrRecog':para_abbr,         # False: don't identify abbreviations, True: identify them
    'ML_Threshold':para_threshold, # score threshold of the deep-learning model
    }
st.markdown("")
st.markdown("## 💡 Tagging results:")
with st.spinner('Wait for tagging...'):
    # bioTag returns a list of [start, end, hpo_id, score] spans over `doc`
    # (offsets arrive as strings — TODO confirm against src/tagging_text).
    tag_result=bioTag(doc,biotag_dic,nn_model,onlyLongest=para_set['onlyLongest'], abbrRecog=para_set['abbrRecog'],Threshold=para_set['ML_Threshold'])

st.markdown('<font style="color: rgb(128, 128, 128);">Move the mouse over the entity to display the HPO id.</font>', unsafe_allow_html=True)

html_results=''
text_results=doc+'\n'
entity_end=0
hpoid_count={}   # HPO id -> number of mentions found in the document
# BUG FIX: the original guard was `len(tag_result)>=0`, which is always true,
# so the `else` fallback below was dead code. Test for a non-empty result
# instead (the rendered output is identical for an empty result either way).
if tag_result:
    for ele in tag_result:
        entity_start=int(ele[0])
        html_results+=doc[entity_end:entity_start]   # untagged text before this entity
        entity_end=int(ele[1])
        entity_id=ele[2]
        # One TSV row per entity: start, end, mention, HPO id, score (2 d.p.).
        text_results+=ele[0]+'\t'+ele[1]+'\t'+doc[entity_start:entity_end]+'\t'+ele[2]+'\t'+format(float(ele[3]),'.2f')+'\n'
        hpoid_count[entity_id]=hpoid_count.get(entity_id,0)+1
        # Highlight the mention; the HPO id shows up as a hover tooltip.
        html_results+='<font style="background-color: rgb(255, 204, 0)'+';" title="'+entity_id+'">'+doc[entity_start:entity_end]+'</font>'
    html_results+=doc[entity_end:]   # trailing text after the last entity
else:
    html_results=doc

st.markdown('<table border="1"><tr><td>'+html_results+'</td></tr></table>', unsafe_allow_html=True)
210
+
211
+
212
# Summary table: one [hpo_id, preferred term name, mention count] row
# per distinct HPO concept found in the document.
data_entity = [
    [hpo_id, biotag_dic.hpo_word[hpo_id][0], hpoid_count[hpo_id]]
    for hpo_id in hpoid_count
]

st.markdown("")
st.markdown("")

# Most frequent concepts first; 1-based index for display.
df = (
    DataFrame(data_entity, columns=["HPO_id", "Term name", "Frequency"])
    .sort_values(by="Frequency", ascending=False)
    .reset_index(drop=True)
)
df.index += 1

c1, c2, c3 = st.columns([1, 4, 1])
with c2:
    st.table(df)

# Centered download button for the TSV annotation dump built above.
c1, c2, c3 = st.columns([1, 1, 1])
with c2:
    st.download_button('Download annotations', text_results)
256
+