# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 10:31:42 2023
@author: sanmpaul
"""
# Import the required libraries
import os
import streamlit as st
import pandas as pd
import numpy as np
import nltk
# Download only the NLTK data this app actually uses, not the full 'all' bundle
nltk.download('punkt')
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation
import re
import pathlib
import time
import spacy
import s3fs
from ast import literal_eval
from operator import itemgetter
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer
import json
# Never hard-code AWS credentials in source; read them from the standard AWS environment variables
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'
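# s3_file_path points at a precomputed reference table mapping ICD9 codes to extracted disease mentions; it is loaded further below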
# Add a title and intro text
st.title('Medical Coding Explorer')
st.text('This web app lets you explore automated medical coding of clinical discharge notes')
# Create file uploader object
upload_file = st.file_uploader('Upload a CSV file containing medical note data')
custom = set(stopwords.words('english') + list(punctuation))
# Columns the uploaded NOTEEVENTS-style CSV must contain
to_check = ["TEXT", "DESCRIPTION", "CATEGORY", "ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTDATE"]
def checkcols(x):
    # Verify that the uploaded dataframe contains every required column
    return set(to_check).issubset(set(x.columns))
headings = ["Discharge Diagnosis:","Final Diagnosis:","Final Report:","FINAL DIAGNOSES:","DISCHARGE DIAGNOSES:","PAST MEDICAL HISTORY:"]
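# Case-insensitive pattern: capture the text that follows any of the headings above, up to the next blank line that starts a new section (or the end of the note)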
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format("|".join(re.escape(heading) for heading in headings))
def extractSummary(df):
    # Pull the diagnosis sections out of each note and keep a cleaned, lowercased copy
    for idx, rows in df.iterrows():
        text = rows['TEXT']
        matches = re.findall(pattern, text)
        ext_txt_lst = [match.strip().replace("\n", " ") for match in matches]
        extracted_txt = " ".join(ext_txt_lst)
        df.at[idx, 'summaryInfo'] = extracted_txt
        df.at[idx, 'cleanedTxt'] = " ".join(
            word.lower() for word in word_tokenize(extracted_txt)
            if word.isalpha() and word.lower() not in custom and len(word) > 5
        )
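# en_ner_bc5cdr_md is a scispaCy NER model trained on the BC5CDR corpus; it labels DISEASE and CHEMICAL entities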
sci_nlp = spacy.load('en_ner_bc5cdr_md')
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]
chars_to_remove = ["#", ".", ")", "(", "[", "]"]
numeric_pattern = r'\d+'
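# Column (filled by extractSummary) whose text is fed to the NER step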
text_col = "summaryInfo"
# Function to extract all diseases
def extract_diseases(text):
    # Run the scispaCy model and keep the unique, cleaned DISEASE mentions
    docx = sci_nlp(text)
    results_diseases = [ent.text.lower() for ent in docx.ents if ent.label_ == 'DISEASE']
    # Strip clinical abbreviations, digits and stray punctuation from each mention
    results_diseases2 = [' '.join(word for word in disease.split() if word not in abbreviations_to_remove)
                         for disease in results_diseases]
    results_diseases3 = [re.sub(numeric_pattern, '', phrase) for phrase in results_diseases2]
    results_diseases4 = ["".join(c for c in phrase if c not in chars_to_remove) for phrase in results_diseases3]
    return list(set(results_diseases4))
def convert_df(df):
    # Serialise a dataframe to UTF-8-encoded CSV bytes for st.download_button
    return df.to_csv(index=False).encode('utf-8')
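# Streamlit reruns the whole script on every widget interaction, so these session_state
# flags remember which pipeline steps have already been triggered across reruns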
if "extract_button" not in st.session_state:
st.session_state.extract_button=False
if "ner_button" not in st.session_state:
st.session_state.ner_button=False
if "icd_button" not in st.session_state:
st.session_state.icd_button=False
def extractCallback():
    st.session_state.extract_button = True
    st.session_state.extract_spinner = True
def nercallback():
    st.session_state.extract_button = True
    st.session_state.ner_button = True
    st.session_state.extract_spinner = False
    st.session_state.ner_spinner = True
def icdcallback():
    st.session_state.icd_button = True
    st.session_state.extract_button = True
    st.session_state.ner_button = True
    st.session_state.icd_spinner = True
    st.session_state.ner_spinner = False
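# Only the Bio_ClinicalBERT tokenizer is needed: diseases are compared below via cosine
# similarity of their token ids, so no model forward pass is ever run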
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def compareWith_ICD(row1, row2):
    # Score one extracted disease (row1) against one reference disease/ICD9 pair (row2)
    scobj = dict()
    txt2 = row2['DISEASES']
    code = row2['ICD9_CODE']
    tkns = tokenizer([row1, txt2], padding=True, truncation=True, max_length=512)
    # Cosine similarity over the padded token-id sequences (a rough lexical match)
    score = cosine_similarity([tkns['input_ids'][0]], [tkns['input_ids'][1]])
    scobj['ICD9_CODE'] = code
    scobj['score'] = round(score[0][0], 2)
    return scobj
def compare(row):
    # Compare a disease against every reference row; return all matches sorted by score
    obj = icd_df1.apply(lambda x: compareWith_ICD(row, x), axis=1)
    return json.dumps(sorted(obj.to_list(), key=itemgetter('score'), reverse=True))
def top_icd(row):
    # Keep the best score per ICD9 code, then return the five highest-scoring codes
    sorted_lst = literal_eval(row)
    codes = [x['ICD9_CODE'] for x in sorted_lst]
    new_vals = []
    for code in Counter(codes):
        matches_for_code = [x for x in sorted_lst if x['ICD9_CODE'] == code]
        new_vals.append(max(matches_for_code, key=lambda x: x['score']))
    return json.dumps(sorted(new_vals, key=itemgetter('score'), reverse=True)[:5])
# Read the file from S3 using s3fs and store it in a Pandas DataFrame
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)
# Explode the per-code disease lists into one row per (ICD9_CODE, DISEASES) pair
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)
icd_df1 = icd_df.explode('DISEASES')
icd_df1 = icd_df1[icd_df1['DISEASES'].notna()]
icd_df1.drop(columns=['SEQ_NUM', 'SHORT_TITLE', 'LONG_TITLE'], inplace=True)
icd_df1.drop_duplicates(subset=["ICD9_CODE", "DISEASES"], inplace=True)
icd_df1.reset_index(drop=True, inplace=True)
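# --- Streamlit workflow: upload CSV -> extract discharge summaries -> NER diseases -> map to ICD9 codes ---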
# Check to see if a file has been uploaded
if upload_file is not None:
    file_extension = pathlib.Path(upload_file.name).suffix
    if file_extension == '.csv':
        # Read the uploaded file into a dataframe using pandas
        notevents_df = pd.read_csv(upload_file)
        if checkcols(notevents_df):
            st.caption(f'Shape of data: {notevents_df.shape}')
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())
            # Step 1: extract the diagnosis sections from the discharge summaries
            st.subheader('Extract Discharge Summary Info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                with st.spinner('Extracting...'):
                    time.sleep(5)
                    notevents_df1 = notevents_df.loc[(notevents_df['DESCRIPTION'] == 'Report') &
                                                     (notevents_df['CATEGORY'] == 'Discharge summary'),
                                                     ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT']]
                    notevents_df1['summaryInfo'] = np.nan
                    notevents_df1['cleanedTxt'] = np.nan
                    notevents_df1.reset_index(inplace=True, drop=True)
                    extractSummary(notevents_df1)
                    # Work on a copy so later in-place edits do not hit pandas SettingWithCopy warnings
                    notevents_df2 = notevents_df1.loc[notevents_df1['cleanedTxt'] != ""].copy()
                    notevents_df2.drop_duplicates(subset=["summaryInfo"], keep="first", inplace=True)
                    notevents_df2.drop(columns=['TEXT'], inplace=True)
                st.caption(f'Shape of extracted data: {notevents_df2.shape}')
                st.info('Display top 5 rows of extracted data')
                st.write(notevents_df2.head())
                # Step 2: run the scispaCy NER model over the extracted summaries
                st.subheader('Extract Disease Information')
                if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                    with st.spinner('Running NER...'):
                        time.sleep(15)
                        notevents_df2['DISEASES'] = notevents_df2[text_col].apply(extract_diseases)
                    st.subheader('Display top 5 rows after running NER Model')
                    st.write(notevents_df2.head())
                    csv = convert_df(notevents_df2)
                    st.download_button(
                        "Press to Download",
                        csv,
                        "NER_diseases.csv",
                        "text/csv",
                        key='download-csv'
                    )
                    # Step 3: map every extracted disease to its closest ICD9 codes
                    st.subheader('Map Diseases to ICD9 Codes')
                    if st.button('Run ICD Model', on_click=icdcallback) or st.session_state.icd_button:
                        with st.spinner('Running ICD mapping...'):
                            time.sleep(15)
                            notevents_df2.drop_duplicates(subset=['HADM_ID'], inplace=True)
                            ex_disease_df1 = notevents_df2.explode('DISEASES')
                            ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'].notna()]
                            ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'] != ""]
                            ex_disease_df1.drop(columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE', 'summaryInfo', 'cleanedTxt'], inplace=True)
                            ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ")
                            ex_disease_df1.drop_duplicates(subset=["DISEASES"], keep="first", inplace=True)
                            ex_disease_df1.reset_index(drop=True, inplace=True)
                            ex_disease_df1['icd_map'] = ex_disease_df1.apply(lambda x: compare(x['DISEASES']), axis=1)
                            ex_disease_df1['top5_icd'] = ex_disease_df1.apply(lambda x: top_icd(x['icd_map']), axis=1)
                            ex_disease_df1.drop(columns=['icd_map'], inplace=True)
                        st.info('Display top 5 rows of ICD mapping')
                        st.write(ex_disease_df1.head())
                        csv = convert_df(ex_disease_df1)
                        st.download_button(
                            "Press to Download",
                            csv,
                            "top5icd.csv",
                            "text/csv",
                            key='top5icd-csv'
                        )
        else:
            st.error('Uploaded CSV does not contain the required columns', icon="🚨")
    else:
        st.error('Please upload a .csv file', icon="🚨")