# -*- coding: utf-8 -*-
"""
Created on Tue Jun 6 10:31:42 2023

@author: sanmpaul
"""

# Import the required libraries
import os
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re
import pathlib
import time
import spacy
import s3fs
from ast import literal_eval
from operator import itemgetter
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForTokenClassification
import json

# AWS credentials are read from the environment (they could equally come from
# st.secrets); they must never be hardcoded in source control.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'

# Add a title and intro text
st.title('Medical Coding Explorer')
st.text('This is a web app to explore the capabilities of automated medical coding')

# Create file uploader object
upload_file = st.file_uploader('Upload a file containing medical data')

# English stopwords plus punctuation, used when cleaning the extracted summary text
custom = set(stopwords.words('english') + list(punctuation))

# Columns the uploaded NOTEEVENTS-style CSV must contain
to_check = ["TEXT", "DESCRIPTION", "CATEGORY", "ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTDATE"]

def checkcols(x):
    if not set(to_check).issubset(set(x.columns)):
        return False
    return True

# Section headings that mark the diagnosis information inside a discharge summary
headings = ["Discharge Diagnosis:", "Final Diagnosis:", "Final Report:",
            "FINAL DIAGNOSES:", "DISCHARGE DIAGNOSES:", "PAST MEDICAL HISTORY:"]
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format(
    "|".join(re.escape(heading) for heading in headings))

def extractSummary(df):
    # Pull the diagnosis sections out of each note and write them into
    # 'summaryInfo' (positional column 5) and the cleaned, lowercased
    # 'cleanedTxt' (positional column 6).
    for idx, rows in df.iterrows():
        # print(f"ROW_ID--------------{rows['ROW_ID']}")
        text = rows['TEXT']
        matches = re.findall(pattern, text)
        ext_txt_lst = []
        for match in matches:
            ext_txt_lst.append(match.strip().replace("\n", " "))
        extracted_txt = " ".join([ts for ts in ext_txt_lst])
        df.iloc[idx, 5] = extracted_txt
        df.iloc[idx, 6] = " ".join([word.lower() for word in word_tokenize(extracted_txt)
                                    if word.isalpha() and word not in custom and len(word) > 5])

# scispaCy biomedical NER model (BC5CDR: chemicals and diseases)
sci_nlp = spacy.load('en_ner_bc5cdr_md')
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]
chars_to_remove = ["#", ".", ")", "(", "[", "]"]
numeric_pattern = r'\d+'
text_col = "summaryInfo"

# Function to extract all diseases mentioned in a piece of text
def extract_diseases(text):
    docx = sci_nlp(text)
    results_diseases = [ent.text.lower() for ent in docx.ents if ent.label_ == 'DISEASE']
    results_diseases2 = [' '.join(word for word in disease.split()
                                  if word not in abbreviations_to_remove)
                         for disease in results_diseases]
    results_diseases3 = [re.sub(numeric_pattern, '', phrase) for phrase in results_diseases2]
    results_diseases4 = ["".join(c for c in phrase if c not in chars_to_remove)
                         for phrase in results_diseases3]
    results_diseases_cleaned = list(set(results_diseases4))
    return results_diseases_cleaned
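
# A minimal, hedged usage sketch (not wired into the Streamlit flow): shows how
# extract_diseases() is meant to be called. The sample sentence and the expected
# output in the comment are assumptions for illustration only -- the entities
# actually returned depend on the en_ner_bc5cdr_md model version installed.
def _demo_extract_diseases():
    sample = "Patient with congestive heart failure s/p CABG and type 2 diabetes mellitus."
    diseases = extract_diseases(sample)
    # Expected to resemble: ['congestive heart failure', 'diabetes mellitus'] (order not guaranteed)
    return diseases
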
def convert_df(df):
    # Serialize a dataframe to CSV bytes for st.download_button
    return df.to_csv(index=False).encode('utf-8')

# Session-state flags keep each step's output visible across Streamlit reruns
if "extract_button" not in st.session_state:
    st.session_state.extract_button = False
if "ner_button" not in st.session_state:
    st.session_state.ner_button = False
if "icd_button" not in st.session_state:
    st.session_state.icd_button = False

def extractCallback():
    st.session_state.extract_button = True
    st.session_state.extract_spinner = True

def nercallback():
    st.session_state.extract_button = True
    st.session_state.ner_button = True
    st.session_state.extract_spinner = False
    st.session_state.ner_spinner = True

def icdcallback():
    st.session_state.icd_button = True
    st.session_state.extract_button = True
    st.session_state.ner_button = True
    st.session_state.icd_spinner = True
    st.session_state.ner_spinner = False

model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def compareWith_ICD(row1, row2):
    # Score one extracted disease string (row1) against one ICD-9 reference row (row2)
    # using cosine similarity over the padded tokenizer input_ids, i.e. a rough
    # token-overlap measure rather than a semantic embedding comparison.
    scobj = dict()
    txt2 = row2['DISEASES']
    code = row2['ICD9_CODE']
    tkns = tokenizer([row1, txt2], padding=True, truncation=True, max_length=512)
    score = cosine_similarity([tkns['input_ids'][0]], [tkns['input_ids'][1]])
    scobj['ICD9_CODE'] = code
    scobj['score'] = round(score[0][0], 2)
    return scobj

def compare(row):
    # Score one disease string against every ICD-9 reference row, best match first
    obj = icd_df1.apply(lambda x: compareWith_ICD(row, x), axis=1)
    return json.dumps(sorted(obj.to_list(), key=itemgetter('score'), reverse=True))

def top_icd(row):
    # Keep the best score per ICD-9 code and return the top five as JSON
    sorted_lst = literal_eval(row)
    k = [x['ICD9_CODE'] for x in sorted_lst]
    new_vals = []
    for i in Counter(k):
        candidates = [x for x in sorted_lst if x['ICD9_CODE'] == i]
        new_vals.append(max(candidates, key=lambda x: x['score']))
    return json.dumps(sorted(new_vals, key=itemgetter('score'), reverse=True)[:5])

# Read the ICD-9 reference file from S3 using s3fs and store it in a pandas DataFrame
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)

icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)
icd_df1 = icd_df.explode('DISEASES')
icd_df1 = icd_df1[icd_df1['DISEASES'].notna()]
icd_df1.drop(columns=['SEQ_NUM', 'SHORT_TITLE', 'LONG_TITLE'], inplace=True)
icd_df1.drop_duplicates(subset=["ICD9_CODE", "DISEASES"], inplace=True)
icd_df1.reset_index(drop=True, inplace=True)
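
# A minimal, hedged usage sketch (not wired into the Streamlit flow): walks one
# hypothetical disease string through compare() and top_icd(). The sample string
# is an assumption for illustration; the scores returned depend entirely on the
# ICD-9 reference rows loaded into icd_df1 above.
def _demo_icd_mapping():
    sample_disease = "congestive heart failure"   # hypothetical input
    ranked_json = compare(sample_disease)          # JSON list of {'ICD9_CODE', 'score'}, best first
    top5_json = top_icd(ranked_json)               # best score per code, trimmed to the top five
    return json.loads(top5_json)
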
# Check to see if a file has been uploaded
if upload_file is not None:
    file_extension = pathlib.Path(upload_file.name).suffix
    if file_extension == '.csv':
        # Read the file into a dataframe using pandas
        notevents_df = pd.read_csv(upload_file)
        cols = notevents_df.columns
        chk = checkcols(notevents_df)
        if chk:
            # Shape of the dataframe
            st.caption(f'Shape of data: {notevents_df.shape}')
            # Create a section for the dataframe header
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())

            # Filter the dataframe down to discharge summary reports
            st.subheader('Extract Discharge Summary info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                # if st.session_state.extract_spinner:
                with st.spinner('Extracting...'):
                    time.sleep(5)
                    notevents_df1 = notevents_df.loc[(notevents_df['DESCRIPTION'] == 'Report') &
                                                     (notevents_df['CATEGORY'] == 'Discharge summary'),
                                                     ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT']]
                    notevents_df1['summaryInfo'] = np.nan
                    notevents_df1['cleanedTxt'] = np.nan
                    notevents_df1.reset_index(inplace=True, drop=True)
                    # notevents_df1['HADM_ID']=notevents_df1['HADM_ID'].astype('int')
                    extractSummary(notevents_df1)
                    # Copy the non-empty rows so the later in-place edits act on an independent frame
                    notevents_df2 = notevents_df1.loc[notevents_df1['cleanedTxt'] != ""].copy()
                    notevents_df2.drop_duplicates(subset=["summaryInfo"], keep="first", inplace=True)
                    notevents_df2.drop(columns=['TEXT'], inplace=True)
                    st.caption(f'Shape of extracted data: {notevents_df2.shape}')
                    # Create a section for the dataframe header
                    st.info('Display top 5 rows of extracted data')
                    st.write(notevents_df2.head())

                st.subheader('Extract Disease Information')
                if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                    with st.spinner('Running NER...'):
                        time.sleep(15)
                        st.text(f"notevents_df2-------{notevents_df2.shape}")
                        notevents_df2['DISEASES'] = notevents_df2[text_col].apply(lambda x: extract_diseases(x))
                        # notevents_df2['DISEASES'] = notevents_df2['DISEASES'].apply(literal_eval)
                        st.subheader('Display top 5 rows after running NER Model')
                        st.write(notevents_df2.head())
                        csv = convert_df(notevents_df2)
                        st.download_button(
                            "Press to Download",
                            csv,
                            "NER_diseases.csv",
                            "text/csv",
                            key='download-csv'
                        )

                    st.subheader('Map Disease to ICD9 Codes')
                    if st.button('Run ICD Model', on_click=icdcallback) or st.session_state.icd_button:
                        # if st.session_state.icd_spinner:
                        with st.spinner('icd model running...'):
                            time.sleep(15)
                            notevents_df2.drop_duplicates(subset=['HADM_ID'], inplace=True)
                            ex_disease_df1 = notevents_df2.explode('DISEASES')
                            ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'].notna()]
                            ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'] != ""]
                            ex_disease_df1.drop(columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE',
                                                         'summaryInfo', 'cleanedTxt'], inplace=True)
                            ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ")
                            ex_disease_df1.drop_duplicates(subset=["DISEASES"], keep="first", inplace=True)
                            ex_disease_df1.reset_index(drop=True, inplace=True)
                            ex_disease_df1['icd_map'] = ex_disease_df1.apply(lambda x: compare(x['DISEASES']), axis=1)
                            ex_disease_df1['top5_icd'] = ex_disease_df1.apply(lambda x: top_icd(x['icd_map']), axis=1)
                            ex_disease_df1.drop(columns=['icd_map'], inplace=True)
                            st.info('Display top 5 rows of icd mapping')
                            st.write(ex_disease_df1.head())
                            csv = convert_df(ex_disease_df1)
                            st.download_button(
                                "Press to Download",
                                csv,
                                "top5icd.csv",
                                "text/csv",
                                key='top5icd-csv'
                            )
        else:
            st.error('Invalid csv file', icon="🚨")
    else:
        st.error('Upload a csv file', icon="🚨")
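
# A minimal, hedged sketch (not wired into the Streamlit flow): a single-row example
# of the NOTEEVENTS-style CSV the uploader expects. The values are invented for
# illustration; only the column names and the 'Discharge summary' / 'Report'
# filter values come from the checks above.
def _demo_expected_upload_schema():
    return pd.DataFrame([{
        "ROW_ID": 1,
        "SUBJECT_ID": 2,
        "HADM_ID": 3,
        "CHARTDATE": "2100-01-01",
        "CATEGORY": "Discharge summary",
        "DESCRIPTION": "Report",
        "TEXT": "Discharge Diagnosis:\nCongestive heart failure\n\nFollowup instructions: ...",
    }])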