File size: 11,813 Bytes
8da5944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
0ea3651
 
 
8da5944
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
# -*- coding: utf-8 -*-
"""
Created on Tue Jun  6 10:31:42 2023

@author: sanmpaul
"""

#Import the required Libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
# Fetch only the NLTK resources this script uses (tokenizer models, stop words,
# WordNet) instead of the entire 'all' collection, which is several GB and is
# re-verified on every start. quiet=True keeps the downloader from spamming the
# log; an id unknown to the installed NLTK version is skipped (download()
# returns False rather than raising).
for _nltk_resource in ("punkt", "punkt_tab", "stopwords", "wordnet"):
    nltk.download(_nltk_resource, quiet=True)
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re
import pathlib
import time
import spacy
import s3fs
from ast import literal_eval
from operator import itemgetter
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForTokenClassification
import json

# SECURITY: credentials must never be committed to source control. They are
# read from the standard AWS environment variables instead. Any key, secret or
# session token that has ever appeared in this file's history must be treated
# as compromised and revoked.
import os

aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

# When a variable is unset, None is passed through and s3fs falls back to the
# default botocore credential resolution (shared config files, instance role...).
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)

# Location of the ICD reference table (diseases already extracted per ICD9 code).
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'

# Page header: title followed by a one-line description of the app
st.title('Medical Coding Explorer')
st.text('This is a web app to allow exploration of the capabilities of Medical coding')

# Widget through which the user supplies the notes file
upload_file = st.file_uploader('Upload a file containing medical data')

# Tokens discarded during text cleaning: English stop words plus punctuation characters
custom = set(stopwords.words('english')) | set(punctuation)

# Column names an uploaded file must contain to be accepted.
to_check = ["TEXT","DESCRIPTION","CATEGORY","ROW_ID","SUBJECT_ID","HADM_ID","CHARTDATE","TEXT"]
def checkcols(x):
    """Return True when every name in ``to_check`` appears in ``x.columns``, else False."""
    missing = set(to_check) - set(x.columns)
    return not missing

# Section headings whose following text is pulled out of each note.
headings = ["Discharge Diagnosis:","Final Diagnosis:","Final Report:","FINAL DIAGNOSES:","DISCHARGE DIAGNOSES:","PAST MEDICAL HISTORY:"]
# Case-insensitive: a heading at the start of the text or of a line, then the
# shortest run of text up to a blank line followed by a word character, or the
# end of the text.
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format("|".join(re.escape(heading) for heading in headings))
def extractSummary(df):
    """Fill ``summaryInfo`` and ``cleanedTxt`` on ``df`` from its ``TEXT`` column.

    For every row, all sections introduced by one of ``headings`` are extracted
    from ``TEXT``, flattened to a single line and joined with spaces into
    ``summaryInfo``. ``cleanedTxt`` holds the lower-cased alphabetic tokens of
    that summary that are longer than five characters and not in ``custom``.

    The frame is modified in place and nothing is returned. Columns are
    addressed by name and assigned as whole columns, so the result does not
    depend on column order, on the index being 0..n-1, or on the dtype the
    columns were created with. Rows whose ``TEXT`` is not a string (e.g. NaN)
    get empty strings in both columns.
    """
    summaries = []
    cleaned_texts = []
    for text in df['TEXT']:
        if not isinstance(text, str):
            text = ""
        sections = [match.strip().replace("\n", " ") for match in re.findall(pattern, text)]
        extracted_txt = " ".join(sections)
        summaries.append(extracted_txt)
        # Lower-case BEFORE the stop-word test: the stop-word list is lower
        # case, so capitalised stop words would otherwise slip through.
        tokens = (word.lower() for word in word_tokenize(extracted_txt))
        cleaned_texts.append(
            " ".join(word for word in tokens if word.isalpha() and word not in custom and len(word) > 5)
        )
    df['summaryInfo'] = summaries
    df['cleanedTxt'] = cleaned_texts

# spaCy pipeline used for entity recognition; entities labelled 'DISEASE' are kept.
sci_nlp = spacy.load('en_ner_bc5cdr_md')
# Whole words dropped from a recognised disease phrase.
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]
# Single characters stripped from a recognised disease phrase.
chars_to_remove = ["#", ".", ")", "(", "[", "]"]
# Runs of digits are removed from a recognised disease phrase.
numeric_pattern = r'\d+'
# Column of the extracted frame that the NER step reads.
text_col = "summaryInfo"

# Function to extract all diseases
def extract_diseases(text):
    """Return the distinct, cleaned disease mentions found in ``text``.

    Each entity labelled ``DISEASE`` by ``sci_nlp`` is lower-cased, stripped of
    the words in ``abbreviations_to_remove``, of digit runs and of the
    characters in ``chars_to_remove``. Whitespace left behind by those removals
    is collapsed, phrases that end up empty are dropped, and duplicates are
    removed while keeping first-seen order so the result is reproducible.

    Returns a list of strings; an empty list for non-string or blank input.
    """
    if not isinstance(text, str) or not text.strip():
        return []

    docx = sci_nlp(text)
    cleaned = []
    for ent in docx.ents:
        if ent.label_ != 'DISEASE':
            continue
        words = [word for word in ent.text.lower().split() if word not in abbreviations_to_remove]
        phrase = re.sub(numeric_pattern, '', ' '.join(words))
        phrase = "".join(c for c in phrase if c not in chars_to_remove)
        # Removing digits/characters can leave double or trailing spaces.
        phrase = ' '.join(phrase.split())
        if phrase:
            cleaned.append(phrase)

    # dict.fromkeys de-duplicates while preserving insertion order.
    return list(dict.fromkeys(cleaned))

def convert_df(df):
    """Serialise ``df`` to CSV text without the index and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

# Make sure the "button pressed" flag of every pipeline step exists in the
# session state; each starts as False and is set by the matching callback.
for _flag in ("extract_button", "ner_button", "icd_button"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False

def extractCallback():
    """Callback for the 'Extract' button: mark the extract step and its spinner as active."""
    state = st.session_state
    state["extract_button"] = True
    state["extract_spinner"] = True
    
def nercallback():
    """Callback for the 'Run NER Model' button.

    Marks both the extract and NER steps as pressed, and moves the spinner
    flag from the extract step to the NER step.
    """
    state = st.session_state
    state["extract_button"] = True
    state["ner_button"] = True
    state["extract_spinner"] = False
    state["ner_spinner"] = True
    
def icdcallback():
    """Callback for the ICD step.

    Marks the ICD, extract and NER steps as pressed, and moves the spinner
    flag from the NER step to the ICD step.
    """
    state = st.session_state
    state["icd_button"] = True
    state["extract_button"] = True
    state["ner_button"] = True
    state["icd_spinner"] = True
    state["ner_spinner"] = False
    

# Hugging Face checkpoint whose tokenizer is used for the comparison below.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def compareWith_ICD(row1,row2):
    """Score one disease phrase against one ICD reference row.

    ``row1`` is the phrase to look up; ``row2`` is a row providing 'DISEASES'
    (the reference phrase) and 'ICD9_CODE'. Both phrases are tokenized
    together (padded to equal length, truncated at 512 tokens) and the score
    is the cosine similarity of the two token-id vectors, rounded to two
    decimals.

    Returns a dict with keys 'ICD9_CODE' and 'score'.

    NOTE(review): the similarity is computed on raw token ids, not on model
    embeddings, so it presumably tracks vocabulary-id overlap rather than
    meaning - confirm whether embedding-based similarity was intended.
    """
    reference_text = row2['DISEASES']
    icd_code = row2['ICD9_CODE']
    encoded = tokenizer([row1, reference_text], padding=True, truncation=True, max_length=512)
    query_ids, reference_ids = encoded['input_ids'][0], encoded['input_ids'][1]
    similarity = cosine_similarity([query_ids], [reference_ids])
    return {'ICD9_CODE': icd_code, 'score': round(similarity[0][0], 2)}

def compare(row):
    """Score ``row`` against every row of ``icd_df1``.

    Returns a JSON string: the list of ``compareWith_ICD`` results ordered by
    'score', highest first.
    """
    scored = icd_df1.apply(lambda icd_row: compareWith_ICD(row, icd_row), axis=1).to_list()
    scored.sort(key=lambda entry: entry['score'], reverse=True)
    return json.dumps(scored)

def top_icd(row):
    """Return the five best-scoring distinct ICD9 codes from a ``compare`` result.

    ``row`` is a JSON string holding a list of ``{'ICD9_CODE': ..., 'score': ...}``
    objects. For each code only its highest-scoring entry is kept (the first
    one on ties); the survivors are ordered by score, highest first, and the
    top five are returned as a JSON string.

    The input is parsed with ``json.loads`` because it is produced by
    ``json.dumps``: JSON tokens such as ``null`` or ``NaN`` are not valid
    Python literals and would make ``ast.literal_eval`` fail.
    """
    candidates = json.loads(row)

    # Single pass: remember the best entry per code, in first-seen order.
    best_by_code = {}
    for candidate in candidates:
        code = candidate['ICD9_CODE']
        current = best_by_code.get(code)
        if current is None or candidate['score'] > current['score']:
            best_by_code[code] = candidate

    ranked = sorted(best_by_code.values(), key=itemgetter('score'), reverse=True)
    return json.dumps(ranked[:5])


# Load the ICD reference table from S3 into a DataFrame
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)

# DISEASES is stored as the text of a Python list; turn it back into a list
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)

# One row per (ICD9 code, disease phrase): explode the lists, discard rows
# without a phrase and the columns not needed for matching, de-duplicate,
# then renumber the index.
icd_df1 = (
    icd_df.explode('DISEASES')
    .loc[lambda frame: frame['DISEASES'].notna()]
    .drop(columns=['SEQ_NUM','SHORT_TITLE','LONG_TITLE'])
    .drop_duplicates(subset=["ICD9_CODE", "DISEASES"])
    .reset_index(drop=True)
)
      
# Main page flow. Runs only once a file has been uploaded and consists of three
# steps, each behind a button: (1) extract the discharge-summary sections,
# (2) run NER over them, (3) map the recognised diseases to ICD9 codes.
# Streamlit re-executes the whole script on every interaction, so each step
# also runs when its session flag is set; the callbacks of later steps set the
# flags of the earlier ones, which guarantees their inputs exist.
if upload_file is not None:
    # Accept the extension regardless of case (.csv / .CSV)
    file_extension = pathlib.Path(upload_file.name).suffix.lower()
    if file_extension != '.csv':
        st.error('Upload a csv file', icon="🚨")
    else:
        # Read the file to a dataframe using pandas
        notevents_df = pd.read_csv(upload_file)
        if not checkcols(notevents_df):
            st.error('Invalid csv file', icon="🚨")
        else:
            # Shape of Data Frame
            st.caption(f'Shape of data: {notevents_df.shape}')

            # Create a section for the dataframe header
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())

            # ---- Step 1: keep discharge-summary reports and extract their sections
            st.subheader('Extract Discharge Summary info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                with st.spinner('Extracting...'):
                    is_discharge_report = (
                        (notevents_df['DESCRIPTION'] == 'Report')
                        & (notevents_df['CATEGORY'] == 'Discharge summary')
                    )
                    # .copy() so the columns added below do not write into a
                    # view of the uploaded frame.
                    notevents_df1 = notevents_df.loc[
                        is_discharge_report,
                        ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT'],
                    ].copy()

                    # Text columns filled by extractSummary; created as strings
                    # so string values can be stored without a dtype change.
                    notevents_df1['summaryInfo'] = ""
                    notevents_df1['cleanedTxt'] = ""
                    notevents_df1.reset_index(inplace=True, drop=True)

                    extractSummary(notevents_df1)

                    # Drop notes without usable text and repeated summaries;
                    # the raw TEXT is no longer needed afterwards.
                    notevents_df2 = (
                        notevents_df1.loc[notevents_df1['cleanedTxt'] != ""]
                        .drop_duplicates(subset=["summaryInfo"], keep="first")
                        .drop(columns=['TEXT'])
                    )

                    st.caption(f'Shape of extracted data: {notevents_df2.shape}')

                    # Create a section for the dataframe header
                    st.info('Display top 5 rows of extracted data')
                    st.write(notevents_df2.head())

            # ---- Step 2: recognise disease mentions in the extracted summaries
            st.subheader('Extract Disease Information')
            if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                with st.spinner('Running NER...'):
                    st.text(f"notevents_df2-------{notevents_df2.shape}")

                    notevents_df2['DISEASES'] = notevents_df2[text_col].apply(extract_diseases)
                    st.subheader('Display top 5 rows after running NER Model')
                    st.write(notevents_df2.head())

                    csv = convert_df(notevents_df2)
                    st.download_button(
                        "Press to Download",
                        csv,
                        "NER_diseases.csv",
                        "text/csv",
                        key='download-csv',
                    )

            # ---- Step 3: map each distinct disease phrase to its best ICD9 codes
            st.subheader('Map Disease to ICD9 Codes')
            # icdcallback flags the two earlier steps as pressed, so they run
            # first in the same rerun and notevents_df2['DISEASES'] exists here.
            # The icd_button flag keeps the result on screen across reruns.
            if st.button('Run ICD Model', on_click=icdcallback) or st.session_state.icd_button:
                with st.spinner('icd model running...'):
                    # One note per admission, then one row per disease phrase
                    per_admission = notevents_df2.drop_duplicates(subset=['HADM_ID'])
                    ex_disease_df1 = per_admission.explode('DISEASES')

                    ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'].notna()]
                    ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'] != ""]
                    ex_disease_df1 = ex_disease_df1.drop(
                        columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE', 'summaryInfo', 'cleanedTxt']
                    )
                    ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ", regex=False)
                    ex_disease_df1 = ex_disease_df1.drop_duplicates(
                        subset=["DISEASES"], keep="first"
                    ).reset_index(drop=True)

                    # Score every phrase against the ICD reference table and
                    # keep only the five best distinct codes per phrase.
                    ex_disease_df1['top5_icd'] = ex_disease_df1['DISEASES'].apply(compare).apply(top_icd)

                    st.info('Display top 5 rows of icd mapping')
                    st.write(ex_disease_df1.head())

                    csv = convert_df(ex_disease_df1)
                    st.download_button(
                        "Press to Download",
                        csv,
                        "top5icd.csv",
                        "text/csv",
                        key='top5icd-csv',
                    )