Spaces:

d4data
/

Biomedical-Epidemiology-NER-App

Sleeping

App Files Files Community

Biomedical-Epidemiology-NER-App / app.py

dreji18

Update app.py

82bf7a9 about 2 years ago

raw

history blame

No virus

9.52 kB

	# -*- coding: utf-8 -*-
	"""
	Created on Mon Jul 4 08:43:02 2022

	@author: dreji18
	"""

	import streamlit as st
	import hydralit_components as hc
	import datetime
	import time
	from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
	from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
	from functionforDownloadButtons import download_button
	import fitz
	import pandas as pd
	import base64
	import tempfile
	import os
	import streamlit.components.v1 as components

	# Use the full browser width and start with the sidebar collapsed.
	st.set_page_config(layout="wide", initial_sidebar_state="collapsed")

	# Colour overrides handed to the Hydralit navbar (see the HydraApp setup).
	over_theme = {
	    "txc_inactive": "#FFFFFF",
	    "menu_background": "#696969",
	    "txc_active": "black",
	}

	# app page setup
	import hydralit as hy

	# Build the multi-page Hydralit application object. Pages are registered
	# on it further down with the @app.addapp decorator and it is started with
	# app.run() at the end of the script.
	app = hy.HydraApp(
	    title='Biomedical Epidemiology NER App',
	    nav_container=None,
	    # The original passed the built-in type `bool` here, which only behaved
	    # like "on" because a type object is truthy; pass the boolean itself.
	    nav_horizontal=True,
	    layout='wide',
	    #favicon = "🧊",
	    use_navbar=True,
	    navbar_theme=over_theme,
	    navbar_sticky=True,
	    navbar_mode='pinned',
	    use_loader=True,
	    use_cookie_cache=True,
	    sidebar_state='auto',
	    navbar_animation=True,
	    allow_url_nav=False,
	    hide_streamlit_markers=True,
	    #use_banner_images=["./background.png",None,{'header':"<h1 style='text-align:center;padding: 10px 10px;color:black;font-size:200%;'>Biomedical Epidemiology Entity Recognizer</h1><br>"},None,"./background.png"],
	    #banner_spacing=[5,30,60,30,5],
	    clear_cross_app_sessions=True,
	    session_params=None,
	)


	# individual pages
	@app.addapp(is_home=True)
	def my_home():
	    """Render the home page: a centred heading, two descriptive paragraphs and an image."""
	    heading_html = "<h3 style='text-align: center; color: black;'>Biomedical Epidemiology Named Entity Recognition System </h3>"
	    overview = """This application presents a generalizable ML pipeline capable of identifying and recognizing many biomedical named entities in texts. In three significant ways, this pipeline improves on previous efforts. First, it can recognize over 50 different entity types, including clinical entities (disease, symptoms, risks, effects, drugs, diabetes, respiration, vital signs, and others), as well as non-clinical entities, such as event-based data, social factors that are not clinical factors but are related to health outcomes. Second, with no code changes, this pipeline is simple to use and adaptable to individual methods for a given data type, task, or domain of application. Third, this pipeline can take any free texts, for example, in the form of text or PDF files and parse them for scientific texts. We hope that this application will provide a more transparent and customizable solution for the healthcare industry, helping to educate and encourage more rigorous applications of ML to biomedical analyses."""
	    implications = """The implications of this application in the context of healthcare are multi-facet. For example, these biomedical entity types can help doctors, nurses, and other healthcare professionals align symptoms to diagnosis, treatment, and follow-up. There are also opportunities for policymakers to understand the value that is within electronic and clinical medical records to understand the cost-effectiveness and cost-saving planning. For example, knowing the number of clinically informative, human diagnoses within population groups can assist learning health systems in planning strategies. Tracking social determinants can lead to reducing biases in the health data. This research can also be used to translate the clinical data into knowledge, evidence, and clinical impact."""

	    # Heading is raw HTML, so HTML rendering has to be switched on explicitly.
	    hy.markdown(heading_html, unsafe_allow_html=True)

	    st.write(overview)
	    st.write("\n")  # spacer between the two paragraphs

	    st.write(implications)
	    hy.image("Epidemiologist.jpeg")

	def _truncate_to_words(text, max_words):
	    """Count the words in ``text`` and cut it off right after the ``max_words``-th one.

	    A word is a run of regex word characters, the same definition the page
	    uses for its word counter.

	    Args:
	        text: The raw text pasted by the user.
	        max_words: Maximum number of words to keep.

	    Returns:
	        A ``(word_count, kept_text)`` tuple. ``word_count`` is the number of
	        words in the *original* text; ``kept_text`` is ``text`` unchanged when
	        it is within the limit, otherwise ``text`` sliced at the end of its
	        ``max_words``-th word.
	    """
	    import re

	    words = list(re.finditer(r"\w+", text))
	    if len(words) > max_words:
	        # Slice by word boundary; slicing by ``max_words`` directly would
	        # keep that many *characters*, not words.
	        text = text[:words[max_words - 1].end()]
	    return len(words), text


	def _annotate_pdf_entities(document, min_score=0.5, min_length=3):
	    """Run NER on every page of ``document``, box the kept entities and tabulate them.

	    For each page the text is sent to ``ner_prediction``; predictions are
	    de-duplicated on their ``value`` column, filtered, and every occurrence of
	    a kept entity on that page gets a rectangle annotation whose title is the
	    entity group. ``document`` is modified in place.

	    Args:
	        document: An open PyMuPDF document.
	        min_score: Predictions must score strictly above this to be kept.
	        min_length: Minimum entity text length; shorter values are dropped to
	            avoid false positives.

	    Returns:
	        A DataFrame with columns ``Page`` (1-based), ``Entity Group``,
	        ``Value`` and ``Score``, one row per kept entity; empty when nothing
	        was kept.
	    """
	    rows = []
	    for page_index in range(document.page_count):
	        page_text = document.get_page_text(page_index)
	        if not page_text.strip():
	            # Nothing to recognise on a page without text.
	            continue

	        current_page = document[page_index]
	        predictions = ner_prediction(corpus=page_text, compute='cpu')
	        unique_predictions = predictions.drop_duplicates(subset=["value"], keep='first')

	        for _, row in unique_predictions.iterrows():
	            text = row['value']
	            if not (row["score"] > min_score and len(text) >= min_length):
	                continue

	            rows.append([page_index + 1, row['entity_group'], text, row['score']])

	            # Mark every place the entity text appears on this page.
	            for inst in current_page.search_for(text) or []:
	                annot = current_page.add_rect_annot(inst)
	                info = annot.info
	                info["title"] = row['entity_group']
	                annot.set_info(info)
	                annot.update()

	    return pd.DataFrame(rows, columns=["Page", "Entity Group", "Value", "Score"])


	@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
	def app2():
	    """Render the 'Entity Recognizer' page.

	    The page has two independent forms:

	    * a text form, where pasted text (limited to the first 500 words) is run
	      through ``ner_prediction`` and the result is shown with a CSV download;
	    * a PDF form, where an uploaded PDF is processed page by page, the
	      entities are tabulated with a CSV download, and an annotated copy of
	      the PDF is saved to the system temp directory and offered for download.

	    Raises:
	        Exception: Any error raised while processing the uploaded PDF is
	            printed and re-raised.
	    """
	    MAX_WORDS = 500
	    column_widths = [0.07, 1, 0.07, 4, 1.5]  # spacer, side panel, spacer, main, results

	    # ---- NER from pasted text -------------------------------------------
	    hy.subheader("NER from text corpus")
	    with hy.form(key="text_form"):
	        _, c1, _, c2, c3 = hy.columns(column_widths)
	        with c1:
	            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
	            hy.image("medical care logo template social media.png")

	        with c2:
	            doc = st.text_area(
	                "Paste your text below (max 500 words)",
	                height=310,
	            )

	            res, doc = _truncate_to_words(doc, MAX_WORDS)
	            if res > MAX_WORDS:
	                st.warning(
	                    "⚠️ Your text contains "
	                    + str(res)
	                    + " words."
	                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
	                )

	            st.form_submit_button(label="🍃 Get me the data!")

	    if doc:
	        pred_df = ner_prediction(corpus=doc, compute='cpu')  # pass compute='gpu' if using gpu
	        with c3:
	            st.dataframe(pred_df)
	            download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

	    # Vertical spacing between the two sections.
	    for _ in range(3):
	        hy.markdown(" ")

	    # ---- NER from an uploaded PDF ---------------------------------------
	    hy.subheader("NER from Pdf Reports")
	    annotated_path = os.path.join(tempfile.gettempdir(), "annott.pdf")
	    annotated_ready = False  # set once the annotated PDF has been written
	    with hy.form(key="pdf_form"):
	        _, c1, _, c2, _ = hy.columns(column_widths)
	        with c1:
	            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
	            hy.image("medical care logo template social media.png")

	        with c2:
	            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
	            st.form_submit_button(label="🍃 Get me the data!")

	    if uploaded_file is not None:
	        try:
	            # The context manager closes the document even if processing fails.
	            with fitz.open(stream=uploaded_file.read(), filetype="pdf") as document:
	                final_df = _annotate_pdf_entities(document)

	                if not final_df.empty:
	                    final_df['Pdf File'] = uploaded_file.name
	                    final_df = final_df[['Entity Group', 'Value', 'Score', 'Page', 'Pdf File']]
	                    with c2:
	                        st.dataframe(final_df)
	                        download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
	                else:
	                    # Tell the user, not just the server console.
	                    st.info("No Entities Extracted!!!")

	                # NOTE(review): the file name is fixed, so concurrent sessions
	                # overwrite each other's annotated PDF.
	                document.save(annotated_path)
	                annotated_ready = True

	        except Exception as e:
	            print("Error occured: {}".format(e))
	            raise

	    # The download button must live outside the forms above.
	    c2 = hy.columns(column_widths)[3]
	    with c2:
	        if annotated_ready:
	            with open(annotated_path, "rb") as pdf_file:
	                PDFbyte = pdf_file.read()

	            # Strip the original extension so the name is "report_annotated.pdf"
	            # rather than "report.pdf_annotated.pdf".
	            base_name = os.path.splitext(uploaded_file.name)[0]
	            hy.download_button(label="📥 Download Annotated PDF",
	                               data=PDFbyte,
	                               file_name=base_name + "_annotated.pdf",
	                               mime='application/octet-stream')

	            # NOTE(review): components.iframe takes a URL; a local filesystem
	            # path is unlikely to resolve in the browser. The base64 embed
	            # kept below looks like the intended alternative - confirm.
	            components.iframe(annotated_path, width=800, height=800)

	            #with open((tempfile.gettempdir()+"/annott.pdf"),"rb") as f:
	            # base64_pdf = base64.b64encode(f.read()).decode('utf-8')
	            #pdf_display = f'<embed src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf">'
	            #pdf_display = f'<iframe src="data:application/pdf;base64,{base64_pdf}" width="800" height="800" type="application/pdf"></iframe>'





	# Start the Hydralit application built above; the pages were registered
	# with @app.addapp.
	# NOTE(review): presumably this renders the navbar and the selected page on
	# every Streamlit rerun - confirm against the Hydralit docs.
	app.run()