|
|
|
""" |
|
Created on Tue Jun 6 10:31:42 2023 |
|
|
|
@author: sanmpaul |
|
""" |
|
|
|
|
|
import json
import os
import pathlib
import re
import time
from ast import literal_eval
from collections import Counter
from io import StringIO
from operator import itemgetter
from string import punctuation

import matplotlib.pyplot as plt
import pandas as pd
import s3fs
import spacy
import streamlit as st
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize, sent_tokenize
from sklearn.metrics.pairwise import cosine_similarity
from transformers import AutoTokenizer, AutoModelForTokenClassification
|
|
|
# SECURITY: credentials must never be committed to source control. They are
# read from the standard AWS environment variables instead. Any credentials
# that were previously hard-coded here should be treated as leaked and
# revoked/rotated.
#
# A variable that is not set yields None; s3fs then defers to the default
# botocore credential resolution (shared config files, instance role, ...).
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

# Filesystem handle used further down to read the ICD lookup CSV from S3.
fs = s3fs.S3FileSystem(
    key=aws_access_key_id,
    secret=aws_secret_access_key,
    token=aws_session_token,
)
|
|
|
# Location of the ICD lookup CSV, expressed as "<bucket>/<folder>/<file>".
bucket_name = "mimic3data"
folder_path = "cleanedmimic"
file_name = "extracted_diseases_DIAGNOSES.csv"
s3_file_path = "/".join((bucket_name, folder_path, file_name))
|
|
|
|
|
|
|
# Page header: title first, then a one-line description underneath it.
st.title("Medical Coding Explorer")
st.text(
    "This is a web app to allow exploration of the capabilities of Medical coding"
)
|
|
|
# Section headings that introduce a diagnosis block in an uploaded note.
# Matching is case-insensitive (see the "(?i)" flag in `pattern`).
headings = [
    "Discharge Diagnosis:",
    "Final Diagnosis:",
    "Final Report:",
    "FINAL DIAGNOSES:",
    "DISCHARGE DIAGNOSES:",
    "[**Hospital **] Diagnosis:",
    # Same heading once "**" has been removed from the text: the upload
    # handler strips "**" before calling extractSummary, so without this
    # entry the heading above could never match the cleaned text.
    "[Hospital ] Diagnosis:",
]

# A heading at the start of the text or right after a newline, followed by the
# shortest body that runs up to a blank line followed by a word character, or
# to the end of the text.
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format(
    "|".join(re.escape(heading) for heading in headings)
)

# Compiled once at import time; extractSummary is called per uploaded file.
_SUMMARY_RE = re.compile(pattern)


def extractSummary(text):
    """Return the text found under every recognised diagnosis heading.

    Each matched section is stripped, its newlines are replaced by single
    spaces, and the sections are joined with one space in document order.

    Args:
        text: Full text of the note to search.

    Returns:
        The concatenated section bodies, or an empty string when no heading
        from ``headings`` occurs in ``text``.
    """
    sections = (
        body.strip().replace("\n", " ") for body in _SUMMARY_RE.findall(text)
    )
    return " ".join(sections)
|
|
|
|
|
# Text-cleanup constants: clinical shorthand tokens, punctuation characters
# and a regex matching runs of digits.
abbreviations_to_remove = [
    "s/p",
    "d/c'd",
    "w/",
]
chars_to_remove = [
    "#",
    ".",
    ")",
    "(",
    "[",
    "]",
]
numeric_pattern = r"\d+"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
def convert_df(df):
    """Serialise ``df`` to CSV text without the index column, as UTF-8 bytes."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
|
|
# Give both button flags a False default the first time the script runs in a
# session; values that are already present are left untouched.
for _flag in ("extract_button", "ner_button"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False
|
|
|
def extractCallback():
    """Flag the extract step as triggered and switch its spinner on."""
    st.session_state["extract_button"] = True
    st.session_state["extract_spinner"] = True
|
|
|
def nercallback():
    """Flag the extract and NER steps as triggered; move the spinner to NER."""
    # Button flags.
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    # Spinner flags: extract off, NER on.
    st.session_state["extract_spinner"] = False
    st.session_state["ner_spinner"] = True
|
|
|
def icdcallback():
    """Flag all three steps as triggered; move the spinner from NER to ICD."""
    # Button flags.
    st.session_state["icd_button"] = True
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    # Spinner flags: ICD on, NER off.
    st.session_state["icd_spinner"] = True
    st.session_state["ner_spinner"] = False
|
|
|
|
|
# Hugging Face checkpoint whose tokenizer is used by compareWith_ICD.
# NOTE(review): this runs at module level, so it presumably re-executes on
# every Streamlit rerun - consider caching; confirm against the deployed
# Streamlit version.
model_name = "emilyalsentzer/Bio_ClinicalBERT"
tokenizer = AutoTokenizer.from_pretrained(model_name)
|
def compareWith_ICD(row1, row2):
    """Score one piece of text against one ICD lookup row.

    Args:
        row1: Text to compare.
        row2: Mapping/row exposing ``'DISEASES'`` (text) and ``'ICD9_CODE'``.

    Returns:
        ``{'ICD9_CODE': <code>, 'score': <similarity rounded to 2 places>}``.
    """
    disease_text = row2['DISEASES']
    icd_code = row2['ICD9_CODE']

    # Both texts are tokenised together so padding gives equal-length id lists.
    encoded = tokenizer(
        [row1, disease_text],
        padding=True,
        truncation=True,
        max_length=512,
    )
    ids_a, ids_b = encoded['input_ids'][0], encoded['input_ids'][1]

    # NOTE(review): the cosine similarity is taken over the raw token-id
    # vectors, not over model embeddings - confirm this is the intended metric.
    similarity = cosine_similarity([ids_a], [ids_b])[0][0]

    return {'ICD9_CODE': icd_code, 'score': round(similarity, 2)}
|
|
|
def compare(row):
    """Score ``row`` against every row of ``icd_df1``.

    Returns:
        A JSON string holding the list of ``compareWith_ICD`` results, ordered
        by ``'score'`` from highest to lowest.
    """
    scored = icd_df1.apply(
        lambda icd_row: compareWith_ICD(row, icd_row), axis=1
    ).to_list()
    scored.sort(key=itemgetter('score'), reverse=True)
    return json.dumps(scored)
|
|
|
def top_icd(row):
    """Return the five best-scoring distinct ICD codes as a JSON string.

    Args:
        row: String encoding a list of ``{'ICD9_CODE': ..., 'score': ...}``
            dicts, such as the JSON text produced by ``compare``.

    Returns:
        JSON text of at most five dicts, one per distinct ``ICD9_CODE`` (the
        highest-scoring entry for that code; the first one wins ties), ordered
        by ``'score'`` from highest to lowest.

    Raises:
        ValueError, SyntaxError: if ``row`` is neither valid JSON nor a valid
            Python literal.
    """
    # The input is JSON, so parse it as JSON: literal_eval rejects JSON-only
    # tokens such as null/true/false. Python-literal strings (for example
    # single-quoted ones), which literal_eval used to accept, still work
    # through the fallback.
    try:
        candidates = json.loads(row)
    except json.JSONDecodeError:
        candidates = literal_eval(row)

    # Single pass: keep the best entry per code. Dict insertion order keeps the
    # codes in first-seen order, and the strict ">" keeps the earliest entry on
    # equal scores.
    best_by_code = {}
    for candidate in candidates:
        code = candidate['ICD9_CODE']
        if (
            code not in best_by_code
            or candidate['score'] > best_by_code[code]['score']
        ):
            best_by_code[code] = candidate

    ranked = sorted(best_by_code.values(), key=itemgetter('score'), reverse=True)
    return json.dumps(ranked[:5])
|
|
|
|
|
|
|
# Read the ICD lookup table from S3.
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)

# The DISEASES cells are stored as Python-literal text; parse them back into
# objects so that explode() can expand them.
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)

# One row per (code, disease) pair: expand, drop rows with a missing disease,
# drop the columns that are not needed, de-duplicate and renumber the index.
icd_df1 = (
    icd_df.explode('DISEASES')
    .loc[lambda frame: frame['DISEASES'].notna()]
    .drop(columns=['SEQ_NUM', 'SHORT_TITLE', 'LONG_TITLE'])
    .drop_duplicates(subset=["ICD9_CODE", "DISEASES"])
    .reset_index(drop=True)
)
|
|
|
|
|
# File-upload entry point: accept a .txt note, decode it, and show the text
# found under its diagnosis headings.
upload_file = st.file_uploader('Upload a file containing medical data')

if upload_file is not None:
    file_extension = pathlib.Path(upload_file.name).suffix
    # Compare case-insensitively so that e.g. "NOTE.TXT" is accepted as well.
    if file_extension.lower() == '.txt':
        # NOTE(review): fixed 15 s pause kept from the original code; nothing
        # visible here depends on it - confirm whether it is still wanted.
        time.sleep(15)

        bytes_data = upload_file.getvalue()
        try:
            string_data = bytes_data.decode('utf-8')
        except UnicodeDecodeError:
            # A .txt file in another encoding used to crash the script run.
            st.error('Upload a UTF-8 encoded txt file', icon="🚨")
        else:
            # Remove the "**" markers before looking for section headings.
            string_data = string_data.replace('**', '')

            ex_txt = extractSummary(string_data)
            st.subheader('Extracted Discharge Summary information')
            st.write(ex_txt)
    else:
        # The icon literal was mojibake ("π¨"); st.error needs a real emoji.
        st.error('Upload a txt file', icon="🚨")
else:
    st.error('Upload a file', icon="🚨")
|
|
|
|
|
|
|
|
|
|