# -*- coding: utf-8 -*-
"""
Streamlit/Hydralit front end for the Biomedical Epidemiology NER model.

The app has two pages: a static home page describing the project, and an
"Entity Recognizer" page that runs ``ner_prediction`` either on pasted text
or on every page of an uploaded PDF (which is also annotated and offered
for download).

Created on Mon Jul 4 08:43:02 2022

@author: dreji18
"""

import base64
import datetime
import os
import re
import tempfile
import time

import fitz
import hydralit_components as hc
import pandas as pd
import streamlit as st
import streamlit.components.v1 as components

from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
from functionforDownloadButtons import download_button

# set page size wide and theme
st.set_page_config(layout='wide', initial_sidebar_state='collapsed',)
over_theme = {'txc_inactive': '#FFFFFF', 'menu_background': '#696969', 'txc_active': 'black'}

# app page setup
# Kept after st.set_page_config(), in the same position as in the original script.
import hydralit as hy  # noqa: E402

# Maximum number of words of pasted text sent to the model.
MAX_WORDS = 500
# An entity is kept only if its score is above this threshold ...
MIN_ENTITY_SCORE = 0.5
# ... and its text is longer than this many characters (avoids false positives).
MIN_ENTITY_CHARS = 2
# Relative widths of the five-column layout shared by every section of the page.
_COLUMN_LAYOUT = [0.07, 1, 0.07, 4, 1.5]


def _truncate_to_words(text, max_words):
    """Count the words in *text* and cut it after its first *max_words* words.

    A "word" is a ``\\w+`` match, the same definition used for the count that
    is shown to the user.

    Returns:
        A ``(word_count, text)`` tuple. ``word_count`` is the number of words
        in the original text; ``text`` is unchanged when it has at most
        *max_words* words, otherwise it ends right after word *max_words*.
    """
    words = list(re.finditer(r"\w+", text))
    if len(words) > max_words:
        # Slice at the end offset of the last allowed word so that the original
        # spacing and punctuation between the kept words are preserved.
        text = text[:words[max_words - 1].end()]
    return len(words), text


def _annotate_entities(document):
    """Run NER on every page of *document* and mark the hits in the PDF.

    For each page the page text is passed to ``ner_prediction``; duplicate
    values are dropped, and every remaining entity whose score and length pass
    the thresholds is recorded and outlined on the page with a rectangle
    annotation whose title is the entity group.

    Args:
        document: an open ``fitz`` document; it is modified in place.

    Returns:
        A DataFrame with the columns ``Page`` (1-based), ``Entity Group``,
        ``Value`` and ``Score`` -- one row per accepted entity.
    """
    rows = []
    for page_number in range(len(document)):
        current_page = document[page_number]
        page_text = document.get_page_text(page_number)
        predictions = ner_prediction(corpus=page_text, compute='cpu')
        predictions = predictions.drop_duplicates(subset=["value"], keep='first')

        for _, row in predictions.iterrows():
            text = row['value']
            if not (row["score"] > MIN_ENTITY_SCORE and len(text) > MIN_ENTITY_CHARS):
                continue
            rows.append([page_number + 1, row['entity_group'], row['value'], row['score']])

            # Mark every occurrence of the entity text on this page.
            for inst in current_page.search_for(text) or []:
                annot = current_page.add_rect_annot(inst)
                info = annot.info
                info["title"] = row['entity_group']
                annot.set_info(info)
                annot.update()

    return pd.DataFrame(rows, columns=["Page", "Entity Group", "Value", "Score"])


def _pdf_to_bytes(document):
    """Save *document* and return the resulting PDF file content as bytes.

    The file is written into a private temporary directory that is removed as
    soon as the bytes are read, so simultaneous sessions cannot overwrite each
    other's output.
    """
    with tempfile.TemporaryDirectory() as tmp_dir:
        pdf_path = os.path.join(tmp_dir, "annott.pdf")
        document.save(pdf_path)
        with open(pdf_path, "rb") as pdf_file:
            return pdf_file.read()


app = hy.HydraApp(title='Biomedical Epidemiology NER App',
                  nav_container=None,
                  nav_horizontal=True,
                  layout='wide',
                  #favicon = "🧊",
                  use_navbar=True,
                  navbar_theme=over_theme,
                  navbar_sticky=True,
                  navbar_mode='pinned',
                  use_loader=True,
                  use_cookie_cache=True,
                  sidebar_state='auto',
                  navbar_animation=True,
                  allow_url_nav=False,
                  hide_streamlit_markers=True,
                  #use_banner_images=["./background.png",None,{'header':"Biomedical Epidemiology Entity Recognizer"},None,"./background.png"],
                  #banner_spacing=[5,30,60,30,5],
                  clear_cross_app_sessions=True,
                  session_params=None
                  )


# individual pages
@app.addapp(is_home=True)
def my_home():
    """Render the home page: title, two descriptive paragraphs and an image."""
    # NOTE(review): this heading is passed with unsafe_allow_html=True but holds
    # no markup -- its HTML wrapper appears to be missing; confirm and restore.
    hy.markdown("\n\nBiomedical Epidemiology Named Entity Recognition System\n\n", unsafe_allow_html=True)

    st.write("""This application presents a generalizable ML pipeline capable of identifying and recognizing many biomedical named entities in texts. In three significant ways, this pipeline improves on previous efforts. First, it can recognize over 50 different entity types, including clinical entities (disease, symptoms, risks, effects, drugs, diabetes, respiration, vital signs, and others), as well as non-clinical entities, such as event-based data, social factors that are not clinical factors but are related to health outcomes. Second, with no code changes, this pipeline is simple to use and adaptable to individual methods for a given data type, task, or domain of application. Third, this pipeline can take any free texts, for example, in the form of text or PDF files and parse them for scientific texts. We hope that this application will provide a more transparent and customizable solution for the healthcare industry, helping to educate and encourage more rigorous applications of ML to biomedical analyses.""")
    st.write("\n")
    st.write(
        "The implications of this application in the context of healthcare are multi-facet. For example, these biomedical entity types can help doctors, nurses, and other healthcare professionals align symptoms to diagnosis, treatment, and follow-up. There are also opportunities for policymakers to understand the value that is within electronic and clinical medical records to understand the cost-effectiveness and cost-saving planning. For example, knowing the number of clinically informative, human diagnoses within population groups can assist learning health systems in planning strategies. Tracking social determinants can lead to reducing biases in the health data. \n"
        "This research can also be used to translate the clinical data into knowledge, evidence, and clinical impact."
    )

    hy.image("Epidemiologist.jpeg")


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the "Entity Recognizer" page.

    The page has two forms:

    * a text form -- the pasted text (limited to ``MAX_WORDS`` words) is sent
      to ``ner_prediction`` and the resulting table is shown with a CSV
      download button;
    * a PDF form -- every page of the uploaded file is analysed, accepted
      entities are listed and outlined in the PDF, and the annotated PDF is
      offered for download and shown in an embedded preview.
    """
    # ---- NER from pasted text -------------------------------------------
    hy.subheader("NER from text corpus")
    with hy.form(key="text_form"):
        _, c1, _, c2, c3 = hy.columns(_COLUMN_LAYOUT)
        with c1:
            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )
            word_count, doc = _truncate_to_words(doc, MAX_WORDS)
            if word_count > MAX_WORDS:
                st.warning(
                    "⚠️ Your text contains "
                    + str(word_count)
                    + " words."
                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
                )
            st.form_submit_button(label="🍃 Get me the data!")

        if doc:
            pred_df = ner_prediction(corpus=doc, compute='cpu')  # pass compute='gpu' if using gpu
            with c3:
                st.dataframe(pred_df)
                download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

    hy.markdown(" ")
    hy.markdown(" ")
    hy.markdown(" ")

    # ---- NER from an uploaded PDF ---------------------------------------
    hy.subheader("NER from Pdf Reports")
    annotated_pdf = None  # bytes of the annotated PDF once processing succeeded
    with hy.form(key="pdf_form"):
        _, c1, _, c2, _ = hy.columns(_COLUMN_LAYOUT)
        with c1:
            hy.write("You can upload your biomedical report here. \nThe Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
            st.form_submit_button(label="🍃 Get me the data!")

        if uploaded_file is not None:
            try:
                document = fitz.open(stream=uploaded_file.read(), filetype="pdf")
                try:
                    final_df = _annotate_entities(document)

                    if not final_df.empty:
                        final_df['Pdf File'] = uploaded_file.name
                        final_df = final_df[['Entity Group', 'Value', 'Score', 'Page', 'Pdf File']]
                        with c2:
                            st.dataframe(final_df)
                            download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
                    else:
                        print("No Entities Extracted!!!")

                    annotated_pdf = _pdf_to_bytes(document)
                finally:
                    # Release the document whether or not processing succeeded.
                    document.close()
            except Exception as e:
                print("Error occured: {}".format(e))
                raise

    # The download button and the preview are rendered in columns created after
    # the form block, i.e. outside the form.
    _, _, _, c2, _ = hy.columns(_COLUMN_LAYOUT)
    with c2:
        if annotated_pdf is not None:
            hy.download_button(label="📥 Download Annotated PDF",
                               data=annotated_pdf,
                               file_name=uploaded_file.name + "_annotated.pdf",
                               mime='application/octet-stream')

            base64_pdf = base64.b64encode(annotated_pdf).decode('utf-8')
            # f-string: the encoded PDF must be interpolated into the data URI.
            components.iframe(f"data:application/pdf;base64,{base64_pdf}", width=800, height=800)


app.run()