File size: 9,517 Bytes
946703c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928a017
 
86f369d
946703c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
f1f7cd6
 
0029965
f1f7cd6
946703c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d31e05
946703c
 
 
 
 
 
 
 
 
39ad2bf
946703c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
9d31e05
946703c
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
928a017
39ad2bf
 
b82085d
946703c
 
 
39ad2bf
8c9d6ab
 
 
 
 
 
 
 
a5e67b4
8c9d6ab
86f369d
82bf7a9
f61237c
7155227
 
5c0f1a9
5a1f5ea
d32fbac
0b9d4d4
7155227
5a1f5ea
946703c
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
# -*- coding: utf-8 -*-
"""
Created on Mon Jul  4 08:43:02 2022

@author: dreji18
"""

import streamlit as st
import hydralit_components as hc
import datetime
import time
from Bio_Epidemiology_NER.bio_recognizer import ner_prediction
from Bio_Epidemiology_NER.bio_recognizer import pdf_annotate_streamlit
from functionforDownloadButtons import download_button
import fitz
import pandas as pd
import base64
import tempfile
import os
import streamlit.components.v1 as components

# Page-level Streamlit configuration: wide layout, sidebar collapsed on load.
st.set_page_config(layout='wide', initial_sidebar_state='collapsed',)
# Colour overrides handed to the Hydralit navbar via `navbar_theme` below.
over_theme = {'txc_inactive': '#FFFFFF','menu_background':'#696969','txc_active':'black'}

# app page setup
import hydralit as hy
# Single HydraApp instance; the page functions further down register
# themselves on it through the `@app.addapp` decorator.
app = hy.HydraApp(title='Biomedical Epidemiology NER App',
                  nav_container= None,
                  # Pass a real boolean. The previous value was the `bool` type
                  # object itself, which only behaved like True because a
                  # class object is truthy.
                  nav_horizontal=True,
                  layout='wide', 
                  #favicon = "🧊",
                  use_navbar=True,
                  navbar_theme=over_theme,
                  navbar_sticky=True,
                  navbar_mode='pinned',
                  use_loader=True,
                  use_cookie_cache=True,
                  sidebar_state = 'auto',
                  navbar_animation=True,
                  allow_url_nav=False,
                  hide_streamlit_markers = True,
                  #use_banner_images=["./background.png",None,{'header':"<h1 style='text-align:center;padding: 10px 10px;color:black;font-size:200%;'>Biomedical Epidemiology Entity Recognizer</h1><br>"},None,"./background.png"],
                  #banner_spacing=[5,30,60,30,5], 
                  clear_cross_app_sessions=True, 
                  session_params=None
                  )


# individual pages
@app.addapp(is_home=True)
def my_home():
    """Render the home page: a centred heading, two descriptive paragraphs and an image."""
    heading_html = "<h3 style='text-align: center; color: black;'>Biomedical Epidemiology Named Entity Recognition System </h3>"
    overview_text = """This application presents a generalizable ML pipeline capable of identifying and recognizing many biomedical named entities in texts. In three significant ways, this pipeline improves on previous efforts. First, it can recognize over 50 different entity types, including clinical entities (disease, symptoms, risks, effects, drugs, diabetes, respiration, vital signs, and others), as well as non-clinical entities, such as event-based data, social factors that are not clinical factors but are related to health outcomes. Second, with no code changes, this pipeline is simple to use and adaptable to individual methods for a given data type, task, or domain of application. Third, this pipeline can take any free texts, for example, in the form of text or PDF files and parse them for scientific texts. We hope that this application will provide a more transparent and customizable solution for the healthcare industry, helping to educate and encourage more rigorous applications of ML to biomedical analyses."""
    implications_text = """The implications of this application in the context of healthcare are multi-facet. For example, these biomedical entity types can help doctors, nurses, and other healthcare professionals align symptoms to diagnosis, treatment, and follow-up. There are also opportunities for policymakers to understand the value that is within electronic and clinical medical records to understand the cost-effectiveness and cost-saving planning. For example, knowing the number of clinically informative, human diagnoses within population groups can assist learning health systems in planning strategies. Tracking social determinants can lead to reducing biases in the health data. This research can also be used to translate the clinical data into knowledge, evidence, and clinical impact."""

    # Heading is raw HTML, so it has to go through markdown with HTML enabled.
    hy.markdown(heading_html, unsafe_allow_html=True)

    st.write(overview_text)
    # A lone newline is written between the two paragraphs as a spacer.
    st.write("\n")
    st.write(implications_text)

    hy.image("Epidemiologist.jpeg")

def _annotate_pdf_entities(document):
    """Run NER on every page of ``document`` and mark the hits in the PDF.

    Each page's text is passed to ``ner_prediction``. Duplicate values on a
    page are dropped, and only predictions scoring above 0.5 whose text is
    longer than two characters are kept. Every kept value is searched for on
    its page and each occurrence receives a rectangle annotation whose title
    is the entity group.

    Args:
        document: an opened PyMuPDF document; it is annotated in place.

    Returns:
        A list of ``(entity_group, value, score, page_number)`` tuples, one
        per kept prediction, with 1-based page numbers.
    """
    rows = []
    for page_index in range(document.page_count):
        current_page = document[page_index]
        page_text = document.get_page_text(page_index)
        predictions = ner_prediction(corpus=page_text, compute='cpu')
        unique_predictions = predictions.drop_duplicates(subset=["value"], keep='first')

        for _, row in unique_predictions.iterrows():
            text = row['value']
            # Keep only predictions above the 0.5 score threshold, and skip
            # strings shorter than 3 characters to avoid false positives.
            if row["score"] > 0.5 and len(text) > 2:
                rows.append((row['entity_group'], row['value'], row['score'], page_index + 1))

                # Each search hit is the area of one occurrence on the page.
                for inst in current_page.search_for(text) or ():
                    annot = current_page.add_rect_annot(inst)
                    info = annot.info
                    info["title"] = row['entity_group']
                    annot.set_info(info)
                    annot.update()
    return rows


@app.addapp(title='Entity Recognizer', icon="far fa-copy",)
def app2():
    """Render the 'Entity Recognizer' page.

    The page has two sections:

    * a text form: the pasted text is limited to ``MAX_WORDS`` words, passed
      to ``ner_prediction`` and the resulting table is shown with a CSV
      download link;
    * a PDF form: the uploaded PDF is processed page by page, recognised
      entities are annotated in the document, the table of entities is shown
      with a CSV download link, and the annotated PDF is offered for download
      and displayed in an iframe.
    """
    import re  # local import: `re` is not imported at module level

    MAX_WORDS = 500
    column_widths = [0.07, 1, 0.07, 4, 1.5]
    # NOTE(review): one fixed file name in the shared temp directory, so
    # concurrent sessions would presumably overwrite each other's file - confirm.
    annotated_path = os.path.join(tempfile.gettempdir(), "annott.pdf")

    # ---- NER from pasted text -------------------------------------------
    hy.subheader("NER from text corpus")
    with hy.form(key="text_form"):
        _, c1, _, c2, c3 = hy.columns(column_widths)
        with c1:
            hy.write("You can paste your biomedical data here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            doc = st.text_area(
                "Paste your text below (max 500 words)",
                height=310,
            )

            # End offset of every word, so the text can be cut after the
            # MAX_WORDS-th *word* (slicing by MAX_WORDS would cut characters).
            word_ends = [match.end() for match in re.finditer(r"\w+", doc)]
            if len(word_ends) > MAX_WORDS:
                st.warning(
                    "⚠️ Your text contains "
                    + str(len(word_ends))
                    + " words."
                    + " Only the first 500 words will be reviewed. Stay tuned as increased allowance is coming! 😊"
                )
                doc = doc[:word_ends[MAX_WORDS - 1]]

            # NOTE(review): the label's leading characters look like a
            # mis-encoded emoji; the intended one cannot be determined from
            # this file, so the label is left unchanged.
            st.form_submit_button(label="πŸƒ Get me the data!")

    if doc:
        pred_df = ner_prediction(corpus=doc, compute='cpu')  # pass compute='gpu' if using gpu
        with c3:
            st.dataframe(pred_df)
            download_button(pred_df, "key-value-content.csv", "📥 Download (.csv)")

    # Vertical spacing between the two sections.
    for _ in range(3):
        hy.markdown(" ")

    # ---- NER from an uploaded PDF ---------------------------------------
    hy.subheader("NER from Pdf Reports")
    annotated = False
    with hy.form(key="pdf_form"):
        _, c1, _, c2, _ = hy.columns(column_widths)
        with c1:
            hy.write("You can upload your biomedical report here. The Named Entity Recognition model will identify the required entities")
            hy.image("medical care logo template social media.png")

        with c2:
            uploaded_file = st.file_uploader('Choose your .pdf file', type=["pdf"])
            st.form_submit_button(label="πŸƒ Get me the data!")

        if uploaded_file is not None:
            try:
                # getvalue() returns the whole upload regardless of the
                # current stream position; the context manager closes the
                # document once it has been saved.
                with fitz.open(stream=uploaded_file.getvalue(), filetype="pdf") as document:
                    rows = _annotate_pdf_entities(document)

                    if rows:
                        final_df = pd.DataFrame(
                            rows,
                            columns=['Entity Group', 'Value', 'Score', 'Page'],
                        )
                        final_df['Pdf File'] = uploaded_file.name
                        with c2:
                            st.dataframe(final_df)
                            download_button(final_df, "key-value-pdf.csv", "📥 Download (.csv)")
                    else:
                        print("No Entities Extracted!!!")

                    document.save(annotated_path)
                annotated = True

            except Exception as e:
                print("Error occured: {}".format(e))
                raise

    # Fresh columns created outside the form: the annotated-PDF download
    # button and the preview are rendered here rather than inside the form.
    _, _, _, download_col, _ = hy.columns(column_widths)
    with download_col:
        if annotated:
            with open(annotated_path, "rb") as pdf_file:
                pdf_bytes = pdf_file.read()

            hy.download_button(label="📥 Download Annotated PDF",
                    data=pdf_bytes,
                    file_name=uploaded_file.name+"_annotated.pdf",
                    mime='application/octet-stream')

            # NOTE(review): the iframe source is a server-side filesystem
            # path - confirm the browser can actually load it.
            components.iframe(annotated_path, width=800, height=800)
    

    


# Script entry point: call run() on the HydraApp instance the pages were
# registered on (presumably this renders the navbar and the selected page -
# see the Hydralit documentation).
app.run()