|
|
|
""" |
|
Created on Tue Jun 6 10:31:42 2023 |
|
|
|
@author: sanmpaul |
|
""" |
|
|
|
|
|
import streamlit as st |
|
import pandas as pd |
|
import numpy as np |
|
import matplotlib.pyplot as plt |
|
import nltk |
|
nltk.download('all') |
|
from nltk.tokenize import word_tokenize, sent_tokenize |
|
from nltk.stem import WordNetLemmatizer |
|
from nltk.corpus import stopwords |
|
from string import punctuation |
|
import re |
|
import pathlib |
|
import time |
|
import spacy |
|
import s3fs |
|
from ast import literal_eval |
|
from operator import itemgetter |
|
from collections import Counter |
|
from sklearn.metrics.pairwise import cosine_similarity |
|
from transformers import AutoTokenizer, AutoModelForTokenClassification |
|
import json |
|
|
|
import os

# SECURITY: an AWS access key, secret key and session token were previously
# hardcoded here (committed to source). Those credentials must be treated as
# compromised and rotated. Read them from the standard AWS environment
# variables instead; when unset (None), s3fs falls back to its default
# credential chain.
aws_access_key_id = os.environ.get("AWS_ACCESS_KEY_ID")
aws_secret_access_key = os.environ.get("AWS_SECRET_ACCESS_KEY")
aws_session_token = os.environ.get("AWS_SESSION_TOKEN")

# S3 filesystem handle used below to load the ICD9 disease reference table.
fs = s3fs.S3FileSystem(key=aws_access_key_id, secret=aws_secret_access_key, token=aws_session_token)

# Location of the pre-extracted ICD9 disease reference CSV on S3.
bucket_name = 'mimic3data'
folder_path = 'cleanedmimic'
file_name = 'extracted_diseases_DIAGNOSES.csv'
s3_file_path = f'{bucket_name}/{folder_path}/{file_name}'
|
|
|
|
|
st.title('Medical Coding Explorer')
st.text('This is a web app to allow exploration of the capabilities of Medical coding')

# The uploaded file is expected to be a NOTEEVENTS-style CSV (validated below).
upload_file = st.file_uploader('Upload a file containing medical data')

# Tokens to drop during text cleaning: English stopwords plus punctuation.
custom = set(stopwords.words('english') + list(punctuation))

# Columns an uploaded CSV must contain before we process it.
# (The original list repeated "TEXT"; the duplicate was redundant for the
# issubset check and has been removed.)
to_check = ["TEXT", "DESCRIPTION", "CATEGORY", "ROW_ID", "SUBJECT_ID", "HADM_ID", "CHARTDATE"]
|
def checkcols(x):
    """Return True when every required column in ``to_check`` is present in *x*."""
    return set(to_check).issubset(x.columns)
|
|
|
# Section headers that introduce diagnosis summaries in discharge notes.
headings = ["Discharge Diagnosis:", "Final Diagnosis:", "Final Report:", "FINAL DIAGNOSES:", "DISCHARGE DIAGNOSES:", "PAST MEDICAL HISTORY:"]

# Case-insensitive pattern capturing the text that follows any heading above,
# up to the next blank-line-then-word boundary or the end of the note.
pattern = r"(?i)(?:^|\n)(?:{})\s*([\s\S]*?)(?=\n\n\w|$)".format("|".join(map(re.escape, headings)))
|
def extractSummary(df):
    """Populate the 'summaryInfo' and 'cleanedTxt' columns of *df* in place.

    For every row, all diagnosis sections matched by the module-level
    ``pattern`` are pulled out of the raw note in row['TEXT'] and joined into
    'summaryInfo'; 'cleanedTxt' is a lowercased rendering of that summary
    keeping only alphabetic tokens longer than 5 characters that are not
    stopwords/punctuation (``custom``).
    """
    for idx, row in df.iterrows():
        text = row['TEXT']
        # Each regex match is one diagnosis section; flatten newlines so the
        # summary reads as a single line of text.
        sections = [m.strip().replace("\n", " ") for m in re.findall(pattern, text)]
        extracted_txt = " ".join(sections)
        # Label-based assignment instead of positional column indices: the
        # original wrote iloc[idx, 5]/[idx, 6], which silently corrupts data
        # if the column order ever changes.
        df.at[idx, 'summaryInfo'] = extracted_txt
        # Lowercase BEFORE the stopword test: ``custom`` holds lowercase
        # stopwords, so checking the raw token (as the original did) let
        # capitalized stopwords such as "Because" slip through.
        cleaned = [
            w for w in (tok.lower() for tok in word_tokenize(extracted_txt))
            if w.isalpha() and w not in custom and len(w) > 5
        ]
        df.at[idx, 'cleanedTxt'] = " ".join(cleaned)
|
|
|
# SciSpacy biomedical NER model (BC5CDR) used to pull DISEASE entities out of
# the extracted summaries.
sci_nlp = spacy.load('en_ner_bc5cdr_md')

# Clinical shorthand tokens to strip from recognized disease phrases.
abbreviations_to_remove = ["s/p", "d/c'd", "w/"]

# Stray punctuation characters to delete from disease phrases.
chars_to_remove = ["#", ".", ")", "(", "[", "]"]

# Matches runs of digits so numeric noise can be removed from phrases.
numeric_pattern = r'\d+'

# DataFrame column the NER step reads its input text from.
text_col = "summaryInfo"
|
|
|
|
|
def extract_diseases(text):
    """Run the biomedical NER model over *text* and return a de-duplicated
    list of cleaned, lowercased DISEASE entity strings."""
    doc = sci_nlp(text)
    diseases = [ent.text.lower() for ent in doc.ents if ent.label_ == 'DISEASE']

    cleaned = []
    for phrase in diseases:
        # Drop clinical shorthand tokens (s/p, d/c'd, w/).
        phrase = ' '.join(word for word in phrase.split() if word not in abbreviations_to_remove)
        # Strip digit runs, then stray punctuation characters.
        phrase = re.sub(numeric_pattern, '', phrase)
        phrase = "".join(c for c in phrase if c not in chars_to_remove)
        cleaned.append(phrase)

    # set() removes duplicate phrases (ordering is not relied on downstream).
    return list(set(cleaned))
|
|
|
def convert_df(df):
    """Serialize *df* to UTF-8 CSV bytes (no index column) for st.download_button."""
    csv_text = df.to_csv(index=False)
    return csv_text.encode('utf-8')
|
|
|
# Streamlit re-runs the whole script on every interaction; these flags record
# which pipeline stages the user has already triggered so results persist
# across re-runs.
for _flag in ("extract_button", "ner_button", "icd_button"):
    if _flag not in st.session_state:
        st.session_state[_flag] = False
|
|
|
def extractCallback():
    """Button callback: mark the extract stage as triggered and show its spinner."""
    st.session_state["extract_button"] = True
    st.session_state["extract_spinner"] = True
|
|
|
def nercallback():
    """Button callback: trigger the NER stage (keeping the extract stage
    active) and move the spinner from extract to NER."""
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    st.session_state["extract_spinner"] = False
    st.session_state["ner_spinner"] = True
|
|
|
def icdcallback():
    """Button callback: trigger the ICD stage while keeping the earlier
    stages active, and move the spinner from NER to ICD."""
    st.session_state["icd_button"] = True
    st.session_state["extract_button"] = True
    st.session_state["ner_button"] = True
    st.session_state["icd_spinner"] = True
    st.session_state["ner_spinner"] = False
|
|
|
|
|
# Clinical-domain BERT tokenizer; only its token IDs are used downstream by
# compareWith_ICD for the similarity scoring (the model itself is not loaded).
model_name = "emilyalsentzer/Bio_ClinicalBERT"

tokenizer = AutoTokenizer.from_pretrained(model_name)
|
def compareWith_ICD(row1, row2):
    """Score *row1* (a disease string) against one ICD9 reference row.

    Returns ``{'ICD9_CODE': ..., 'score': ...}`` where the score is the
    cosine similarity of the two texts' padded token-id vectors, rounded to
    two decimals. NOTE(review): similarity over raw token ids is a crude
    proxy for semantic similarity — kept as-is for behavioral parity.
    """
    reference_text = row2['DISEASES']
    encoded = tokenizer([row1, reference_text], padding=True, truncation=True, max_length=512)
    ids_a, ids_b = encoded['input_ids']
    similarity = cosine_similarity([ids_a], [ids_b])
    return {'ICD9_CODE': row2['ICD9_CODE'], 'score': round(similarity[0][0], 2)}
|
|
|
def compare(row):
    """Score *row* (one disease string) against every ICD9 reference row and
    return the results as a JSON list sorted by score, highest first."""
    scores = icd_df1.apply(lambda ref: compareWith_ICD(row, ref), axis=1).to_list()
    scores.sort(key=itemgetter('score'), reverse=True)
    return json.dumps(scores)
|
|
|
def top_icd(row):
    """Reduce a JSON score list to the top 5 distinct ICD9 codes.

    *row* is the JSON string produced by ``compare()``: a list of
    ``{'ICD9_CODE', 'score'}`` dicts. For each code only its best score is
    kept; the survivors are returned as a JSON list of the 5 highest-scoring
    entries, descending.
    """
    entries = literal_eval(row)

    # Keep the best-scoring entry per code in a single pass. The original
    # re-scanned the whole list for every distinct code (O(n*k)) and shadowed
    # the builtin `all` while doing so.
    best = {}
    for entry in entries:
        code = entry['ICD9_CODE']
        if code not in best or entry['score'] > best[code]['score']:
            best[code] = entry

    return json.dumps(sorted(best.values(), key=itemgetter('score'), reverse=True)[:5])
|
|
|
|
|
|
|
# Load the ICD9 disease reference table from S3 and flatten it to one
# (ICD9_CODE, disease string) pair per row.
with fs.open(s3_file_path, 'rb') as file:
    icd_df = pd.read_csv(file)

# The DISEASES column is stored as a stringified Python list; parse it back.
icd_df['DISEASES'] = icd_df['DISEASES'].apply(literal_eval)

icd_df1 = icd_df.explode('DISEASES')
icd_df1 = icd_df1[icd_df1['DISEASES'].notna()]
icd_df1 = icd_df1.drop(columns=['SEQ_NUM', 'SHORT_TITLE', 'LONG_TITLE'])
icd_df1 = icd_df1.drop_duplicates(subset=["ICD9_CODE", "DISEASES"])
icd_df1 = icd_df1.reset_index(drop=True)
|
|
|
|
|
# Main UI pipeline. Indentation reconstructed from the callback wiring:
# nercallback re-asserts extract_button and icdcallback re-asserts both, which
# only makes sense if the three stages are nested — TODO confirm against the
# original layout. Each stage persists via session-state flags so results
# survive Streamlit's script re-runs.
if upload_file is not None:
    file_extension = pathlib.Path(upload_file.name).suffix
    if file_extension == '.csv':

        notevents_df = pd.read_csv(upload_file)
        cols = notevents_df.columns
        chk = checkcols(notevents_df)
        if chk:
            st.caption(f'Shape of data: {notevents_df.shape}')
            st.info('Display top 5 rows of data')
            st.write(notevents_df.head())

            # Stage 1: pull the diagnosis summary out of each discharge note.
            st.subheader('Extract Discharge Summary info')
            if st.button('Extract', on_click=extractCallback) or st.session_state.extract_button:
                with st.spinner('Extracting...'):
                    time.sleep(5)

                # Keep only discharge-summary report rows and the columns we need.
                notevents_df1 = notevents_df.loc[
                    (notevents_df['DESCRIPTION'] == 'Report') &
                    (notevents_df['CATEGORY'] == 'Discharge summary'),
                    ['ROW_ID', 'SUBJECT_ID', 'HADM_ID', 'CHARTDATE', 'TEXT']]

                notevents_df1['summaryInfo'] = np.nan
                notevents_df1['cleanedTxt'] = np.nan
                notevents_df1.reset_index(inplace=True, drop=True)

                # Fills summaryInfo / cleanedTxt in place.
                extractSummary(notevents_df1)

                notevents_df2 = notevents_df1.loc[notevents_df1['cleanedTxt'] != ""]
                notevents_df2.drop_duplicates(subset=["summaryInfo"], keep="first", inplace=True)
                notevents_df2.drop(columns=['TEXT'], inplace=True)

                st.caption(f'Shape of extracted data: {notevents_df2.shape}')
                st.info('Display top 5 rows of extracted data')
                st.write(notevents_df2.head())

                # Stage 2: NER over the summaries to get disease mentions.
                st.subheader('Extract Disease Information')
                if st.button('Run NER Model', on_click=nercallback) or st.session_state.ner_button:
                    with st.spinner('Running NER...'):
                        time.sleep(15)

                    st.text(f"notevents_df2-------{notevents_df2.shape}")

                    notevents_df2['DISEASES'] = notevents_df2[text_col].apply(lambda x: extract_diseases(x))

                    st.subheader('Display top 5 rows after running NER Model')
                    st.write(notevents_df2.head())

                    csv = convert_df(notevents_df2)
                    st.download_button(
                        "Press to Download",
                        csv,
                        "NER_diseases.csv",
                        "text/csv",
                        key='download-csv'
                    )

                    # Stage 3: map each disease mention to its closest ICD9 codes.
                    st.subheader('Map Disease to ICD9 Codes')
                    # FIX: this button previously had no on_click callback and no
                    # session-state check, unlike the two buttons above, so any
                    # re-run (e.g. clicking a download button) discarded the ICD
                    # results. Wire it up like the other stages (icdcallback was
                    # already defined but never used).
                    if st.button('Run ICD Model', on_click=icdcallback) or st.session_state.icd_button:
                        with st.spinner('icd model running...'):
                            time.sleep(15)

                        # One row per admission, then one row per disease mention.
                        notevents_df2.drop_duplicates(subset=['HADM_ID'], inplace=True)
                        ex_disease_df1 = notevents_df2.explode('DISEASES')

                        ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'].notna()]
                        ex_disease_df1 = ex_disease_df1[ex_disease_df1['DISEASES'] != ""]
                        ex_disease_df1.drop(columns=['ROW_ID', 'SUBJECT_ID', 'CHARTDATE', 'summaryInfo', 'cleanedTxt'], inplace=True)
                        ex_disease_df1['DISEASES'] = ex_disease_df1['DISEASES'].str.replace("/", " ")
                        ex_disease_df1.drop_duplicates(subset=["DISEASES"], keep="first", inplace=True)
                        ex_disease_df1.reset_index(drop=True, inplace=True)

                        # Full score list per disease, then reduce to the top 5 codes.
                        ex_disease_df1['icd_map'] = ex_disease_df1.apply(lambda x: compare(x['DISEASES']), axis=1)
                        ex_disease_df1['top5_icd'] = ex_disease_df1.apply(lambda x: top_icd(x['icd_map']), axis=1)

                        ex_disease_df1.drop(columns=['icd_map'], inplace=True)

                        st.info('Display top 5 rows of icd mapping')
                        st.write(ex_disease_df1.head())

                        csv = convert_df(ex_disease_df1)
                        st.download_button(
                            "Press to Download",
                            csv,
                            "top5icd.csv",
                            "text/csv",
                            key='top5icd-csv'
                        )
        else:
            st.error('Invalid csv file', icon="🚨")
    else:
        st.error('Upload a csv file', icon="🚨")
|
|
|
|