Spaces:

sanmoy27
/

test_app

Sleeping

File size: 11,813 Bytes

# -*- coding: utf-8 -*-
"""
Created on Tue Jun  6 10:31:42 2023

@author: sanmpaul
"""

#Import the required Libraries
import streamlit as st
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import nltk
nltk.download('all')
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import re
import pathlib
import time
import spacy
import s3fs
from ast import literal_eval
from operator import itemgetter
from collections import Counter
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForTokenClassification
import json

# AWS credentials must never be committed to source control. They are read
# from the standard AWS environment variables instead; any key that was ever
# stored in this file should be treated as compromised and revoked/rotated.
import os

# Each value is None when the variable is unset. s3fs then falls back to the
# default botocore credential chain (shared config files, instance role, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)

# Location of the reference CSV inside the bucket
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'

# Page header: the app title followed by a one-line description
st.title('Medical Coding Explorer')
st.text('This is a web app to allow exploration of the capabilities of Medical coding')

# Upload widget for the notes file processed further down the script
upload_file = st.file_uploader('Upload a file containing medical data')

# Tokens discarded during text cleaning: English stopwords plus punctuation characters
custom = set(stopwords.words('english')).union(punctuation)

# Column names an uploaded frame has to provide
to_check = ["TEXT","DESCRIPTION","CATEGORY","ROW_ID","SUBJECT_ID","HADM_ID","CHARTDATE","TEXT"]


def checkcols(x):
    """Return True when every name in ``to_check`` is a column of ``x``, else False."""
    required = frozenset(to_check)
    available = frozenset(x.columns)
    return required <= available

# Section headings whose following text is pulled out of each note
headings = ["Discharge Diagnosis:","Final Diagnosis:","Final Report:","FINAL DIAGNOSES:","DISCHARGE DIAGNOSES:","PAST MEDICAL HISTORY:"]
# Case-insensitive: a heading at the start of a line, then lazily everything up
# to the next blank line followed by a word character (or the end of the text).
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format("|".join(re.escape(heading) for heading in headings))
def extractSummary(df):
    """Fill ``summaryInfo`` and ``cleanedTxt`` on ``df`` from its ``TEXT`` column.

    For every row, the text following any of ``headings`` is collected (newlines
    flattened to spaces) and joined into ``summaryInfo``. ``cleanedTxt`` holds
    the lower-cased alphabetic tokens of that summary that are longer than five
    characters and are not in ``custom``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``TEXT`` column. It is modified in place; the two result
        columns are created if missing and overwritten otherwise.

    Returns
    -------
    None
    """
    summaries = []
    cleaned = []
    for text in df['TEXT']:
        # Missing notes come through as NaN (a float); treat them as empty
        # instead of letting re.findall raise a TypeError.
        if not isinstance(text, str):
            text = ""
        sections = [found.strip().replace("\n", " ") for found in re.findall(pattern, text)]
        extracted_txt = " ".join(sections)
        summaries.append(extracted_txt)

        # Lower-case BEFORE the stopword test: the stopword list is lower-case,
        # so a capitalised stopword would otherwise slip through the filter.
        tokens = (word.lower() for word in word_tokenize(extracted_txt))
        cleaned.append(" ".join(
            token for token in tokens
            if token.isalpha() and len(token) > 5 and token not in custom
        ))

    # Assign by column name rather than by positional index so the result does
    # not depend on column order or on the frame having a 0..n-1 index.
    df['summaryInfo'] = summaries
    df['cleanedTxt'] = cleaned

# spaCy pipeline used for entity recognition
sci_nlp = spacy.load('en_ner_bc5cdr_md')
# Whole words dropped from a recognised phrase
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]
# Single characters stripped from a recognised phrase
chars_to_remove = ["#", ".", ")", "(", "[", "]"]
# Runs of digits are deleted from a recognised phrase
numeric_pattern = r'\d+'
# Column holding the text that NER is run on
text_col = "summaryInfo"

# Function to extract all diseases
def extract_diseases(text):
    """Return the distinct, cleaned DISEASE entities found in ``text``.

    Each entity labelled ``DISEASE`` by ``sci_nlp`` is lower-cased, has the
    words in ``abbreviations_to_remove`` dropped, digit runs deleted and the
    characters in ``chars_to_remove`` stripped. Duplicates are collapsed
    through a set, so the order of the returned list is not defined.
    """
    phrases = []
    for entity in sci_nlp(text).ents:
        if entity.label_ != 'DISEASE':
            continue
        kept_words = [w for w in entity.text.lower().split() if w not in abbreviations_to_remove]
        phrase = re.sub(numeric_pattern, '', ' '.join(kept_words))
        phrase = "".join(ch for ch in phrase if ch not in chars_to_remove)
        phrases.append(phrase)

    return list(set(phrases))

def convert_df(df):
    """Serialise ``df`` to CSV without the index and return it as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')

# Create each step flag as False the first time the script runs in a session;
# flags that already exist are left untouched.
for _flag in ("extract_button", "ner_button", "icd_button"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False

def extractCallback():
    """Set the ``extract_button`` and ``extract_spinner`` session flags to True."""
    state = st.session_state
    state["extract_button"] = True
    state["extract_spinner"] = True
    
def nercallback():
    """Turn on the extract and NER step flags and hand the spinner flag over to NER."""
    new_values = {
        "extract_button": True,
        "ner_button": True,
        "extract_spinner": False,
        "ner_spinner": True,
    }
    for flag_name, flag_value in new_values.items():
        st.session_state[flag_name] = flag_value
    
def icdcallback():
    """Turn on all three step flags and the ICD spinner flag; clear the NER spinner flag."""
    state = st.session_state
    for switched_on in ("icd_button", "extract_button", "ner_button", "icd_spinner"):
        state[switched_on] = True
    state["ner_spinner"] = False
    

# Hugging Face checkpoint whose tokenizer is used for the comparison
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
def compareWith_ICD(row1,row2):
    """Score one disease string against one ICD reference row.

    ``row1`` is the text to compare; ``row2`` must provide ``'DISEASES'`` (the
    reference text) and ``'ICD9_CODE'``. Both texts are tokenized together
    (padded to a common length, truncated at 512 tokens) and the cosine
    similarity of the two token-id sequences is rounded to two decimals.

    Returns a dict with the keys ``'ICD9_CODE'`` and ``'score'``.
    """
    reference_text = row2['DISEASES']
    icd_code = row2['ICD9_CODE']

    encoded = tokenizer([row1, reference_text],padding=True,truncation=True,max_length=512)
    first_ids, second_ids = encoded['input_ids'][0], encoded['input_ids'][1]

    # NOTE(review): the similarity is taken over raw token ids, not over model
    # embeddings -- confirm this is the intended measure.
    similarity = cosine_similarity([first_ids], [second_ids])

    return {'ICD9_CODE': icd_code, 'score': round(similarity[0][0],2)}

def compare(row):
    """Score ``row`` against every row of ``icd_df1``.

    Returns a JSON string: the list of ``compareWith_ICD`` results ordered by
    ``'score'`` from highest to lowest.
    """
    def score_against(icd_row):
        return compareWith_ICD(row, icd_row)

    scored = icd_df1.apply(score_against, axis=1).to_list()
    scored.sort(key=itemgetter('score'), reverse=True)
    return json.dumps(scored)

def top_icd(row, top_n=5):
    """Return the best-scoring entry per ICD9 code, limited to the top ``top_n``.

    Parameters
    ----------
    row : str
        Serialised list of ``{'ICD9_CODE': ..., 'score': ...}`` dicts. JSON is
        the expected format; a Python-literal string is still accepted for
        backward compatibility.
    top_n : int, optional
        Maximum number of entries to keep (default 5).

    Returns
    -------
    str
        JSON list with one entry per distinct code (its highest-scoring one),
        ordered by score from highest to lowest and cut to ``top_n`` entries.
    """
    # The input is produced with json.dumps, so parse it as JSON: literal_eval
    # cannot read JSON tokens such as null/true/false/NaN.
    try:
        entries = json.loads(row)
    except ValueError:
        entries = literal_eval(row)

    # Single pass keeping the first highest-scoring entry of every code. Dict
    # insertion order keeps codes in order of first appearance, so ties sort
    # exactly as they did with the per-code rescans.
    best_by_code = {}
    for entry in entries:
        code = entry['ICD9_CODE']
        current = best_by_code.get(code)
        if current is None or entry['score'] > current['score']:
            best_by_code[code] = entry

    ranked = sorted(best_by_code.values(), key=itemgetter('score'), reverse=True)
    return json.dumps(ranked[:top_n])


# Load the reference CSV from S3 into a DataFrame
with fs.open(s3_file_path, 'rb') as s3_handle:
    icd_df = pd.read_csv(s3_handle)

# DISEASES is stored as text; parse each cell back into a Python object
# (presumably a list of strings, since it is exploded below)
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)

# One row per (code, disease) pair: explode, drop rows without a disease, drop
# the title/sequence columns, de-duplicate and renumber the index from zero
icd_df1 = (
    icd_df.explode('DISEASES')
    .loc[lambda frame: frame['DISEASES'].notna()]
    .drop(columns=['SEQ_NUM','SHORT_TITLE','LONG_TITLE'])
    .drop_duplicates(subset=["ICD9_CODE", "DISEASES"])
    .reset_index(drop=True)
)
      
# Main page flow: validate the upload, then run the three steps in order
# (extract discharge summaries -> NER -> ICD9 mapping). Streamlit re-executes
# the whole script on every interaction, so the Extract and NER steps are
# replayed from their session flags, and every later step checks that the step
# it depends on has actually produced its data during this run.
if upload_file is not None:
    # Compare the extension case-insensitively so e.g. "NOTES.CSV" is accepted
    file_extension = pathlib.Path(upload_file.name).suffix.lower()
    if file_extension == '.csv':

        # Read the file to a dataframe using pandas
        notevents_df = pd.read_csv(upload_file)
        if checkcols(notevents_df):

            # Shape of Data Frame
            st.caption(f'Shape of data: {notevents_df.shape}')

            # Create a section for the dataframe header
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())

            # Outputs of the earlier steps; they stay unset until the step has run
            notevents_df2 = None
            ner_done = False

            # Filter Data Frame
            st.subheader('Extract Discharge Summary info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                with st.spinner('Extracting...'):
                    # .copy() gives an independent frame, so the column
                    # assignments below do not write into a slice of the upload.
                    # Rows without text cannot yield a summary and are dropped.
                    notevents_df1 = notevents_df.loc[
                        (notevents_df['DESCRIPTION'] == 'Report')
                        & (notevents_df['CATEGORY'] == 'Discharge summary'),
                        ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT'],
                    ].dropna(subset=['TEXT']).copy()

                    # String placeholders: both columns end up holding text
                    notevents_df1['summaryInfo'] = ""
                    notevents_df1['cleanedTxt'] = ""
                    notevents_df1.reset_index(inplace=True, drop=True)

                    extractSummary(notevents_df1)

                    # Keep notes that produced cleaned text, one row per
                    # distinct summary, without the raw note body
                    notevents_df2 = (
                        notevents_df1.loc[notevents_df1['cleanedTxt'] != ""]
                        .drop_duplicates(subset=["summaryInfo"], keep="first")
                        .drop(columns=['TEXT'])
                    )

                    st.caption(f'Shape of extracted data: {notevents_df2.shape}')

                    # Create a section for the dataframe header
                    st.info('Display top 5 rows of extracted data')
                    st.write(notevents_df2.head())

            st.subheader('Extract Disease Information')
            if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                if notevents_df2 is None:
                    st.warning('Run "Extract" before running the NER model')
                else:
                    with st.spinner('Running NER...'):
                        notevents_df2['DISEASES'] = notevents_df2[text_col].apply(extract_diseases)
                        ner_done = True

                        st.subheader('Display top 5 rows after running NER Model')
                        st.write(notevents_df2.head())

                        ner_csv = convert_df(notevents_df2)
                        st.download_button(
                            "Press to Download",
                            ner_csv,
                            "NER_diseases.csv",
                            "text/csv",
                            key='download-csv'
                        )

            st.subheader('Map Disease to ICD9 Codes')
            if st.button('Run ICD Model'):
                if not ner_done:
                    # Without the NER output there is no DISEASES column to map
                    st.warning('Run "Extract" and "Run NER Model" before mapping to ICD9 codes')
                else:
                    with st.spinner('icd model running...'):
                        # One row per admission, then one row per disease
                        ex_disease_df1 = (
                            notevents_df2.drop_duplicates(subset=['HADM_ID'])
                            .explode('DISEASES')
                        )

                        ex_disease_df1 = ex_disease_df1[
                            ex_disease_df1['DISEASES'].notna()
                            & (ex_disease_df1['DISEASES'] != "")
                        ]
                        ex_disease_df1 = ex_disease_df1.drop(
                            columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE', 'summaryInfo', 'cleanedTxt']
                        )
                        ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ", regex=False)
                        ex_disease_df1 = (
                            ex_disease_df1.drop_duplicates(subset=["DISEASES"], keep="first")
                            .reset_index(drop=True)
                        )

                        # Series.apply (instead of DataFrame.apply with axis=1)
                        # also works when no diseases were found
                        ex_disease_df1['icd_map'] = ex_disease_df1['DISEASES'].apply(compare)
                        ex_disease_df1['top5_icd'] = ex_disease_df1['icd_map'].apply(top_icd)

                        ex_disease_df1 = ex_disease_df1.drop(columns=['icd_map'])

                        st.info('Display top 5 rows of icd mapping')
                        st.write(ex_disease_df1.head())

                        icd_csv = convert_df(ex_disease_df1)
                        st.download_button(
                            "Press to Download",
                            icd_csv,
                            "top5icd.csv",
                            "text/csv",
                            key='top5icd-csv'
                        )

        else:
            st.error('Invalid csv file', icon="🚨")

    else:
        st.error('Upload a csv file', icon="🚨")