Spaces:

d4data
/

Biomedical-Epidemiology-NER-App

Running

File size: 9,517 Bytes

# -*- coding: utf-8 -*-
"""
Created on Mon Jul  4 08:43:02 2022

@author: dreji18
"""

import streamlit as st
import hydralit_components as hc
import datetime
import time
from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
from functionforDownloadButtons import download_button
import fitz
import pandas as pd
import base64
import tempfile
import os
import streamlit.components.v1 as components

# Page-level Streamlit settings: wide layout, sidebar collapsed on first load.
st.set_page_config(
    layout='wide',
    initial_sidebar_state='collapsed',
)

# Colour overrides passed to the Hydralit navbar via `navbar_theme`.
over_theme = {
    'txc_inactive': '#FFFFFF',
    'menu_background': '#696969',
    'txc_active': 'black',
}

# app page setup
import hydralit as hy
# Build the multi-page Hydralit application object that the page functions
# below register themselves on via `@app.addapp`.
app = hy.HydraApp(title='Biomedical Epidemiology NER App',
                  nav_container= None,
                  # Was `bool` (the builtin type object), which only worked
                  # because a class is truthy; pass a real boolean instead.
                  nav_horizontal=True,
                  layout='wide', 
                  #favicon = "🧊",
                  use_navbar=True,
                  navbar_theme=over_theme,
                  navbar_sticky=True,
                  navbar_mode='pinned',
                  use_loader=True,
                  use_cookie_cache=True,
                  sidebar_state = 'auto',
                  navbar_animation=True,
                  allow_url_nav=False,
                  hide_streamlit_markers = True,
                  #use_banner_images=["./background.png",None,{'header':"<h1 style='text-align:center;padding: 10px 10px;color:black;font-size:200%;'>Biomedical Epidemiology Entity Recognizer</h1><br>"},None,"./background.png"],
                  #banner_spacing=[5,30,60,30,5], 
                  clear_cross_app_sessions=True, 
                  session_params=None
                  )


# individual pages
@app.addapp(is_home=True)
def my_home():
    """Render the home page: a centred heading, two paragraphs and an image."""
    heading_html = "<h3 style='text-align: center; color: black;'>Biomedical Epidemiology Named Entity Recognition System </h3>"
    overview = """This application presents a generalizable ML pipeline capable of identifying and recognizing many biomedical named entities in texts. In three significant ways, this pipeline improves on previous efforts. First, it can recognize over 50 different entity types, including clinical entities (disease, symptoms, risks, effects, drugs, diabetes, respiration, vital signs, and others), as well as non-clinical entities, such as event-based data, social factors that are not clinical factors but are related to health outcomes. Second, with no code changes, this pipeline is simple to use and adaptable to individual methods for a given data type, task, or domain of application. Third, this pipeline can take any free texts, for example, in the form of text or PDF files and parse them for scientific texts. We hope that this application will provide a more transparent and customizable solution for the healthcare industry, helping to educate and encourage more rigorous applications of ML to biomedical analyses."""
    implications = """The implications of this application in the context of healthcare are multi-facet. For example, these biomedical entity types can help doctors, nurses, and other healthcare professionals align symptoms to diagnosis, treatment, and follow-up. There are also opportunities for policymakers to understand the value that is within electronic and clinical medical records to understand the cost-effectiveness and cost-saving planning. For example, knowing the number of clinically informative, human diagnoses within population groups can assist learning health systems in planning strategies. Tracking social determinants can lead to reducing biases in the health data. This research can also be used to translate the clinical data into knowledge, evidence, and clinical impact."""

    # Heading first, then the two body paragraphs separated by a blank write.
    hy.markdown(heading_html, unsafe_allow_html=True)
    st.write(overview)
    st.write("\n")
    st.write(implications)

    # Illustration shown below the text.
    hy.image("Epidemiologist.jpeg")

def _truncate_to_words(text, max_words):
    """Return ``(text, word_count)`` with *text* cut after its first *max_words* words.

    A word is a run of regex word characters. ``word_count`` is the number of
    words in the ORIGINAL text, so callers can tell whether truncation happened.
    """
    import re  # local import: `re` is not imported at module level

    words = list(re.finditer(r"\w+", text))
    if len(words) > max_words:
        # Cut right after the last permitted word, keeping the original
        # spacing and punctuation of everything before it.
        text = text[:words[max_words - 1].end()]
    return text, len(words)


def _annotate_pdf(document):
    """Run NER on every page of *document* and mark the hits in place.

    For each page, entities with a score above 0.5 and a value longer than two
    characters get a rectangle annotation (titled with the entity group) on
    every occurrence found by ``search_for``.

    Returns a list of ``[page_number, entity_group, value, score]`` rows, with
    1-based page numbers.
    """
    rows = []
    for page_index in range(document.page_count):
        current_page = document[page_index]
        page_text = document.get_page_text(page_index)
        predictions = ner_prediction(corpus=page_text, compute='cpu')
        unique_predictions = predictions.drop_duplicates(subset=["value"], keep='first')

        for _, row in unique_predictions.iterrows():
            text = row['value']
            # Keep only confident predictions; very short values are skipped
            # to avoid false positives.
            if not (row["score"] > 0.5 and len(text) > 2):
                continue

            rows.append([page_index + 1, row['entity_group'], text, row['score']])

            # Mark every occurrence of the entity text on this page.
            for rect in current_page.search_for(text) or ():
                annot = current_page.add_rect_annot(rect)
                info = annot.info
                info["title"] = row['entity_group']
                annot.set_info(info)
                annot.update()
    return rows


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the 'Entity Recognizer' page.

    The page has two forms:

    * a text form, whose input is limited to ``MAX_WORDS`` words and passed to
      ``ner_prediction``; the resulting table is shown with a CSV download;
    * a PDF form, whose uploaded file is run through NER page by page,
      annotated, saved to the temp directory and offered for download.

    Raises:
        Exception: any error raised while processing the uploaded PDF is
            printed and then re-raised unchanged.
    """
    MAX_WORDS = 500
    column_widths = [0.07, 1, 0.07, 4, 1.5]

    # ------------------------------------------------------------ text form
    hy.subheader("NER from text corpus")
    with hy.form(key="text_form"):
        _, c1, _, c2, c3 = hy.columns(column_widths)
        with c1:
            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )

            # Truncate by WORDS; the previous `doc[:MAX_WORDS]` cut the text
            # to 500 characters, contradicting the warning shown to the user.
            doc, word_count = _truncate_to_words(doc, MAX_WORDS)
            if word_count > MAX_WORDS:
                st.warning(
                    "⚠️ Your text contains "
                    + str(word_count)
                    + " words."
                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
                )

            st.form_submit_button(label="🍃 Get me the data!")

    if doc:
        pred_df = ner_prediction(corpus=doc, compute='cpu') #pass compute='gpu' if using gpu
        with c3:
            st.dataframe(pred_df)
            download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

    hy.markdown(" ")
    hy.markdown(" ")
    hy.markdown(" ")

    # ------------------------------------------------------------- PDF form
    hy.subheader("NER from Pdf Reports")
    annotated_pdf_path = os.path.join(tempfile.gettempdir(), "annott.pdf")
    pdf_ready = False  # set once an annotated PDF has been written this run
    with hy.form(key="pdf_form"):
        _, c1, _, c2, c3 = hy.columns(column_widths)
        with c1:
            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
            st.form_submit_button(label="🍃 Get me the data!")

        if uploaded_file is not None:
            try:
                # The context manager closes the document once it is saved.
                with fitz.open(stream=uploaded_file.read(), filetype="pdf") as document:
                    rows = _annotate_pdf(document)

                    if rows:
                        final_df = pd.DataFrame(rows, columns=["Page", "Entity Group", "Value", "Score"])
                        final_df['Pdf File'] = uploaded_file.name
                        final_df = final_df[['Entity Group', 'Value', 'Score', 'Page', 'Pdf File']]
                        with c2:
                            st.dataframe(final_df)
                            download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
                    else:
                        print("No Entities Extracted!!!")

                    # NOTE(review): fixed file name in the shared temp dir;
                    # concurrent sessions would overwrite each other - confirm
                    # whether this app is expected to serve several users.
                    document.save(annotated_pdf_path)
                pdf_ready = True

            except Exception as e:
                print("Error occured: {}".format(e))
                raise

    # ------------------------------------------- annotated PDF download/view
    _, _, _, c2, _ = hy.columns(column_widths)
    with c2:
        if pdf_ready:
            with open(annotated_pdf_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

            # Drop the original extension so the download is not named
            # "<name>.pdf_annotated.pdf".
            download_name = os.path.splitext(uploaded_file.name)[0] + "_annotated.pdf"
            hy.download_button(label="📥 Download Annotated PDF",
                    data=pdf_bytes,
                    file_name=download_name,
                    mime='application/octet-stream')

            # NOTE(review): the iframe source is a server-side filesystem
            # path, which a browser presumably cannot load - confirm and
            # consider embedding or serving the bytes instead.
            components.iframe(annotated_pdf_path, width=800, height=800)
    
    #with open((tempfile.gettempdir()+"/annott.pdf"),"rb") as f:
    #    base64_pdf = base64.b64encode(f.read()).decode('utf-8')
    #pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf">'
    #pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'
    

    


# Run the HydraApp instance now that the `@app.addapp` page functions above
# have been defined.
app.run()