|
|
|
""" |
|
Created on Tue Jun 6 10:31:42 2023 |
|
|
|
@author: sanmpaul |
|
""" |
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import nltk |
|
nltk.download('all') |
|
from nltk.tokenize import word_tokenize, sent_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
from nltk.corpus import stopwords |
|
from string import punctuation |
|
import re |
|
import pathlib |
|
import time |
|
import spacy |
|
import s3fs |
|
from ast import literal_eval |
|
from operator import itemgetter |
|
from collections import Counter |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
import json |
|
|
|
import os

# SECURITY: an AWS access key, secret key and session token were previously
# hardcoded here (committed to source). Those credentials must be treated as
# compromised and rotated. Read them from the standard AWS environment
# variables instead; when unset (None), s3fs falls back to its default
# credential chain.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

# S3 filesystem handle used below to load the ICD9 disease reference table.
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)

# Location of the pre-extracted ICD9 disease reference CSV on S3.
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'
|
|
|
|
|
st.title('Medical Coding Explorer')
st.text('This is a web app to allow exploration of the capabilities of Medical coding')

# The uploaded file is expected to be a NOTEEVENTS-style CSV (validated below).
upload_file = st.file_uploader('Upload a file containing medical data')

# Tokens to drop during text cleaning: English stopwords plus punctuation.
custom = set(stopwords.words('english') + list(punctuation))

# Columns an uploaded CSV must contain before we process it.
# (The original list repeated "TEXT"; the duplicate was redundant for the
# issubset check and has been removed.)
to_check = ["TEXT", "DESCRIPTION", "CATEGORY", "ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTDATE"]
|
def checkcols(x):
    """Return True when every required column in ``to_check`` is present in *x*."""
    return set(to_check).issubset(x.columns)
|
|
|
# Section headers that introduce diagnosis summaries in discharge notes.
headings = ["Discharge Diagnosis:", "Final Diagnosis:", "Final Report:", "FINAL DIAGNOSES:", "DISCHARGE DIAGNOSES:", "PAST MEDICAL HISTORY:"]

# Case-insensitive pattern capturing the text that follows any heading above,
# up to the next blank-line-then-word boundary or the end of the note.
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format("|".join(map(re.escape, headings)))
|
def extractSummary(df):
    """Populate the 'summaryInfo' and 'cleanedTxt' columns of *df* in place.

    For every row, all diagnosis sections matched by the module-level
    ``pattern`` are pulled out of the raw note in row['TEXT'] and joined into
    'summaryInfo'; 'cleanedTxt' is a lowercased rendering of that summary
    keeping only alphabetic tokens longer than 5 characters that are not
    stopwords/punctuation (``custom``).
    """
    for idx, row in df.iterrows():
        text = row['TEXT']
        # Each regex match is one diagnosis section; flatten newlines so the
        # summary reads as a single line of text.
        sections = [m.strip().replace("\n", " ") for m in re.findall(pattern, text)]
        extracted_txt = " ".join(sections)
        # Label-based assignment instead of positional column indices: the
        # original wrote iloc[idx, 5]/[idx, 6], which silently corrupts data
        # if the column order ever changes.
        df.at[idx, 'summaryInfo'] = extracted_txt
        # Lowercase BEFORE the stopword test: ``custom`` holds lowercase
        # stopwords, so checking the raw token (as the original did) let
        # capitalized stopwords such as "Because" slip through.
        cleaned = [
            w for w in (tok.lower() for tok in word_tokenize(extracted_txt))
            if w.isalpha() and w not in custom and len(w) > 5
        ]
        df.at[idx, 'cleanedTxt'] = " ".join(cleaned)
|
|
|
# SciSpacy biomedical NER model (BC5CDR) used to pull DISEASE entities out of
# the extracted summaries.
sci_nlp = spacy.load('en_ner_bc5cdr_md')

# Clinical shorthand tokens to strip from recognized disease phrases.
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]

# Stray punctuation characters to delete from disease phrases.
chars_to_remove = ["#", ".", ")", "(", "[", "]"]

# Matches runs of digits so numeric noise can be removed from phrases.
numeric_pattern = r'\d+'

# DataFrame column the NER step reads its input text from.
text_col = "summaryInfo"
|
|
|
|
|
def extract_diseases(text):
    """Run the biomedical NER model over *text* and return a de-duplicated
    list of cleaned, lowercased DISEASE entity strings."""
    doc = sci_nlp(text)
    diseases = [ent.text.lower() for ent in doc.ents if ent.label_ == 'DISEASE']

    cleaned = []
    for phrase in diseases:
        # Drop clinical shorthand tokens (s/p, d/c'd, w/).
        phrase = ' '.join(word for word in phrase.split() if word not in abbreviations_to_remove)
        # Strip digit runs, then stray punctuation characters.
        phrase = re.sub(numeric_pattern, '', phrase)
        phrase = "".join(c for c in phrase if c not in chars_to_remove)
        cleaned.append(phrase)

    # set() removes duplicate phrases (ordering is not relied on downstream).
    return list(set(cleaned))
|
|
|
def convert_df(df):
    """Serialize *df* to UTF-8 CSV bytes (no index column) for st.download_button."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
|
|
# Streamlit re-runs the whole script on every interaction; these flags record
# which pipeline stages the user has already triggered so results persist
# across re-runs.
for _flag in ("extract_button", "ner_button", "icd_button"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False
|
|
|
def extractCallback():
    """Button callback: mark the extract stage as triggered and show its spinner."""
    st.session_state["extract_button"] = True
    st.session_state["extract_spinner"] = True
|
|
|
def nercallback():
    """Button callback: trigger the NER stage (keeping the extract stage
    active) and move the spinner from extract to NER."""
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    st.session_state["extract_spinner"] = False
    st.session_state["ner_spinner"] = True
|
|
|
def icdcallback():
    """Button callback: trigger the ICD stage while keeping the earlier
    stages active, and move the spinner from NER to ICD."""
    st.session_state["icd_button"] = True
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    st.session_state["icd_spinner"] = True
    st.session_state["ner_spinner"] = False
|
|
|
|
|
# Clinical-domain BERT tokenizer; only its token IDs are used downstream by
# compareWith_ICD for the similarity scoring (the model itself is not loaded).
model_name = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(model_name)
|
def compareWith_ICD(row1, row2):
    """Score *row1* (a disease string) against one ICD9 reference row.

    Returns ``{'ICD9_CODE': ..., 'score': ...}`` where the score is the
    cosine similarity of the two texts' padded token-id vectors, rounded to
    two decimals. NOTE(review): similarity over raw token ids is a crude
    proxy for semantic similarity — kept as-is for behavioral parity.
    """
    reference_text = row2['DISEASES']
    encoded = tokenizer([row1, reference_text], padding=True, truncation=True, max_length=512)
    ids_a, ids_b = encoded['input_ids']
    similarity = cosine_similarity([ids_a], [ids_b])
    return {'ICD9_CODE': row2['ICD9_CODE'], 'score': round(similarity[0][0], 2)}
|
|
|
def compare(row):
    """Score *row* (one disease string) against every ICD9 reference row and
    return the results as a JSON list sorted by score, highest first."""
    scores = icd_df1.apply(lambda ref: compareWith_ICD(row, ref), axis=1).to_list()
    scores.sort(key=itemgetter('score'), reverse=True)
    return json.dumps(scores)
|
|
|
def top_icd(row):
    """Reduce a JSON score list to the top 5 distinct ICD9 codes.

    *row* is the JSON string produced by ``compare()``: a list of
    ``{'ICD9_CODE', 'score'}`` dicts. For each code only its best score is
    kept; the survivors are returned as a JSON list of the 5 highest-scoring
    entries, descending.
    """
    entries = literal_eval(row)

    # Keep the best-scoring entry per code in a single pass. The original
    # re-scanned the whole list for every distinct code (O(n*k)) and shadowed
    # the builtin `all` while doing so.
    best = {}
    for entry in entries:
        code = entry['ICD9_CODE']
        if code not in best or entry['score'] > best[code]['score']:
            best[code] = entry

    return json.dumps(sorted(best.values(), key=itemgetter('score'), reverse=True)[:5])
|
|
|
|
|
|
|
# Load the ICD9 disease reference table from S3 and flatten it to one
# (ICD9_CODE, disease string) pair per row.
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)

# The DISEASES column is stored as a stringified Python list; parse it back.
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)

icd_df1 = icd_df.explode('DISEASES')
icd_df1 = icd_df1[icd_df1['DISEASES'].notna()]
icd_df1 = icd_df1.drop(columns=['SEQ_NUM', 'SHORT_TITLE', 'LONG_TITLE'])
icd_df1 = icd_df1.drop_duplicates(subset=["ICD9_CODE", "DISEASES"])
icd_df1 = icd_df1.reset_index(drop=True)
|
|
|
|
|
# Main UI pipeline. Indentation reconstructed from the callback wiring:
# nercallback re-asserts extract_button and icdcallback re-asserts both, which
# only makes sense if the three stages are nested — TODO confirm against the
# original layout. Each stage persists via session-state flags so results
# survive Streamlit's script re-runs.
if upload_file is not None:
    file_extension = pathlib.Path(upload_file.name).suffix
    if file_extension == '.csv':

        notevents_df = pd.read_csv(upload_file)
        cols = notevents_df.columns
        chk = checkcols(notevents_df)
        if chk:
            st.caption(f'Shape of data: {notevents_df.shape}')
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())

            # Stage 1: pull the diagnosis summary out of each discharge note.
            st.subheader('Extract Discharge Summary info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                with st.spinner('Extracting...'):
                    time.sleep(5)

                # Keep only discharge-summary report rows and the columns we need.
                notevents_df1 = notevents_df.loc[
                    (notevents_df['DESCRIPTION'] == 'Report') &
                    (notevents_df['CATEGORY'] == 'Discharge summary'),
                    ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT']]

                notevents_df1['summaryInfo'] = np.nan
                notevents_df1['cleanedTxt'] = np.nan
                notevents_df1.reset_index(inplace=True, drop=True)

                # Fills summaryInfo / cleanedTxt in place.
                extractSummary(notevents_df1)

                notevents_df2 = notevents_df1.loc[notevents_df1['cleanedTxt'] != ""]
                notevents_df2.drop_duplicates(subset=["summaryInfo"], keep="first", inplace=True)
                notevents_df2.drop(columns=['TEXT'], inplace=True)

                st.caption(f'Shape of extracted data: {notevents_df2.shape}')
                st.info('Display top 5 rows of extracted data')
                st.write(notevents_df2.head())

                # Stage 2: NER over the summaries to get disease mentions.
                st.subheader('Extract Disease Information')
                if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                    with st.spinner('Running NER...'):
                        time.sleep(15)

                    st.text(f"notevents_df2-------{notevents_df2.shape}")

                    notevents_df2['DISEASES'] = notevents_df2[text_col].apply(lambda x: extract_diseases(x))

                    st.subheader('Display top 5 rows after running NER Model')
                    st.write(notevents_df2.head())

                    csv = convert_df(notevents_df2)
                    st.download_button(
                        "Press to Download",
                        csv,
                        "NER_diseases.csv",
                        "text/csv",
                        key='download-csv'
                    )

                    # Stage 3: map each disease mention to its closest ICD9 codes.
                    st.subheader('Map Disease to ICD9 Codes')
                    # FIX: this button previously had no on_click callback and no
                    # session-state check, unlike the two buttons above, so any
                    # re-run (e.g. clicking a download button) discarded the ICD
                    # results. Wire it up like the other stages (icdcallback was
                    # already defined but never used).
                    if st.button('Run ICD Model', on_click=icdcallback) or st.session_state.icd_button:
                        with st.spinner('icd model running...'):
                            time.sleep(15)

                        # One row per admission, then one row per disease mention.
                        notevents_df2.drop_duplicates(subset=['HADM_ID'], inplace=True)
                        ex_disease_df1 = notevents_df2.explode('DISEASES')

                        ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'].notna()]
                        ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'] != ""]
                        ex_disease_df1.drop(columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE', 'summaryInfo', 'cleanedTxt'], inplace=True)
                        ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ")
                        ex_disease_df1.drop_duplicates(subset=["DISEASES"], keep="first", inplace=True)
                        ex_disease_df1.reset_index(drop=True, inplace=True)

                        # Full score list per disease, then reduce to the top 5 codes.
                        ex_disease_df1['icd_map'] = ex_disease_df1.apply(lambda x: compare(x['DISEASES']), axis=1)
                        ex_disease_df1['top5_icd'] = ex_disease_df1.apply(lambda x: top_icd(x['icd_map']), axis=1)

                        ex_disease_df1.drop(columns=['icd_map'], inplace=True)

                        st.info('Display top 5 rows of icd mapping')
                        st.write(ex_disease_df1.head())

                        csv = convert_df(ex_disease_df1)
                        st.download_button(
                            "Press to Download",
                            csv,
                            "top5icd.csv",
                            "text/csv",
                            key='top5icd-csv'
                        )
        else:
            st.error('Invalid csv file', icon="🚨")
    else:
        st.error('Upload a csv file', icon="🚨")
|
|
|
|