Thiago committed on
Commit 05b0e9e
1 Parent(s): 71186ae

Move application to root dir

__pycache__/app.cpython-38.pyc ADDED
Binary file (4.36 kB).
 
__pycache__/config.cpython-37.pyc ADDED
Binary file (3.68 kB).
 
__pycache__/config.cpython-38.pyc ADDED
Binary file (3.65 kB).
 
__pycache__/download_models.cpython-37.pyc ADDED
Binary file (5.51 kB).
 
__pycache__/pipeline.cpython-37.pyc ADDED
Binary file (20 kB).
 
__pycache__/pipeline.cpython-38.pyc ADDED
Binary file (18.2 kB).
 
__pycache__/pipeline.cpython-39.pyc ADDED
Binary file (11.7 kB).
 
__pycache__/text_cleaning.cpython-37.pyc ADDED
Binary file (7.86 kB).
 
__pycache__/text_cleaning.cpython-38.pyc ADDED
Binary file (8.01 kB).
 
__pycache__/text_cleaning_transforerms.cpython-37.pyc ADDED
Binary file (6.09 kB).
 
__pycache__/text_cleaning_transforerms.cpython-38.pyc ADDED
Binary file (6.5 kB).
 
app.py ADDED
@@ -0,0 +1,576 @@
1
+ # Copyright (C) 2021, Mindee.
2
+
3
+ # This program is licensed under the Apache License version 2.
4
+ # See LICENSE or go to <https://www.apache.org/licenses/LICENSE-2.0.txt> for full license details.
5
+
6
+ import os
7
+ import streamlit as st
8
+ import streamlit.components.v1 as components
9
+ import time
10
+ import matplotlib.pyplot as plt
11
+ import pandas as pd
12
+ from pipeline import Pipeline
13
+ import html
14
+ from IPython.core.display import display, HTML
15
+ import json
16
+ from PIL import Image
17
+ from tqdm import tqdm
18
+ import logging
19
+ from htbuilder import HtmlElement, div, ul, li, br, hr, a, p, img, styles, classes, fonts
20
+ from htbuilder.units import percent, px
21
+ from htbuilder.funcs import rgba, rgb
22
+ import copy
23
+ from download_models import check_if_exist
24
+ import re
25
+ import numpy as np
26
+ from sklearn.manifold import TSNE
27
+ from sklearn.decomposition import PCA
28
+ import plotly.express as plotpx
29
+ import umap
30
+
31
+ def image(src_as_string, **style):
32
+ return img(src=src_as_string, style=styles(**style))
33
+
34
+
35
+ def link(link, text, **style):
36
+ return a(_href=link, _target="_blank", style=styles(**style))(text)
37
+
38
+ def update_highlight(current,old):
39
+ out = current
40
+ matches_background_new = [(m.start(0), m.end(0)) for m in re.finditer("background-color:rgba\\(234, 131, 4,", out)]
41
+ matches_background_old = [(m.start(0), m.end(0)) for m in re.finditer("background-color:rgba\\(234, 131, 4,", old)]
42
+ for x,y in zip(matches_background_old,matches_background_new):
43
+ try:
44
+ old_importance = re.search("\\d+\\.\\d+",old[x[1]:x[1]+20])
45
+ new_importance = re.search("\\d+\\.\\d+",current[y[1]:y[1]+20])
46
+
47
+ if int(out[y[1]]) ==0 and float(old[x[1]]) != 0:
48
+ out = out[0:y[1]] + str(old_importance.group(0)) + out[y[1]:]
49
+ return False,out
50
+ if float(out[y[1]]) !=0 and float(old[x[1]]) != 0:
51
+ if float(old[x[1]]) > float(out[y[1]]):
52
+ out = out[0:y[1]] + str(old_importance.group(0))[0] + out[y[1]:]
53
+ return False,out
54
+ except Exception as e:
55
+ return True, out
56
+
57
+ return True,out
58
+
59
+ def hide_menu():
60
+
61
+ footer_style = """<style>
62
+ footer {
63
+ visibility: hidden;
64
+ }
65
+ footer:after {
66
+ content:"An end-to-end Breast Pathology Classification System to infer Breast Cancer Diagnosis and Severity";
67
+ visibility: visible;
68
+ display: block;
69
+ position: center;
70
+ #background-color: red;
71
+ padding: 5px;
72
+ top: 2px;
73
+ }
74
+ </style>
75
+ """
76
+
77
+ st.markdown(footer_style, unsafe_allow_html=True)
78
+
79
+ def main(myargs):
80
+ project_dir = os.path.dirname(os.path.abspath(__file__))
81
+
82
+
83
+ def add_content(columns):
84
+ if 'hg_df' in st.session_state:
85
+ columns[1].dataframe(st.session_state.hg_df)
86
+ if 'all_l' in st.session_state:
87
+ columns[2].dataframe(st.session_state.all_l)
88
+
89
+ if "highlight_samples" in st.session_state:
90
+
91
+ if "selected_indices" in st.session_state:
92
+ if len(st.session_state.selected_indices) >0:
93
+ out = ""
94
+ l = st.session_state.selected_indices
95
+ l.sort()
96
+ for ind in l:
97
+ out += st.session_state.highlight_samples[ind] + "<br><br>"
98
+ components.html(out,scrolling=True)
99
+ else:
100
+ components.html(st.session_state.highlight_samples[0])
101
+ else:
102
+ components.html(st.session_state.highlight_samples[0])
103
+
104
+
105
+ # Add Plot - Only for File version
106
+ if st.session_state['input_type'] == 'File' and "embeddings_all" in st.session_state and st.session_state.embeddings_plot in ["2D", "3D"]:
107
+ indices = [x for x in range(st.session_state.data_df[st.session_state.input_column].values.shape[0])]
108
+ if "selected_indices" in st.session_state:
109
+ if len(st.session_state.selected_indices) >=4:
110
+ l = st.session_state.selected_indices
111
+ l.sort()
112
+ indices = l
113
+
114
+ if st.session_state.data_df[st.session_state.input_column].values.shape[0] >=2:
115
+ sub_embeddings = st.session_state.embeddings_all[indices]
116
+ sentences = st.session_state.data_df[st.session_state.input_column].values[indices]
117
+ sentences_parses = []
118
+ break_size = 20
119
+ for data in sentences:
120
+ d = data.split()
121
+ size_sentence = len(d)
122
+ if len(d) >break_size:
123
+ out = ""
124
+ for lower_bound in range(0,size_sentence, break_size):
125
+ upper_bound = lower_bound + break_size if lower_bound + break_size <= size_sentence else size_sentence
126
+ out += " ".join(x for x in d[lower_bound:upper_bound]) + "<br>"
127
+ sentences_parses.append(out)
128
+ else:
129
+ sentences_parses.append(data)
130
+
131
+
132
+
133
+ prediction_label = st.session_state.hg_df["Prediction"].values[indices]
134
+ prediction_worst_label = []
135
+ for pred in prediction_label:
136
+ preds = pred.split(" && ")
137
+ if len(preds) ==1:
138
+ prediction_worst_label.extend(preds)
139
+ else:
140
+ worst_index = min([st.session_state.predictor.bert_model.config['worst_rank'].index(x) for x in preds])
141
+ prediction_worst_label.append(st.session_state.predictor.bert_model.config['worst_rank'][worst_index])
142
+
143
+
144
+ if st.session_state.embeddings_type == "PCA":
145
+
146
+ low_dim_embeddings = PCA(n_components=3).fit_transform(sub_embeddings)
147
+ elif st.session_state.embeddings_type == "TSNE":
148
+ low_dim_embeddings = TSNE(n_components=3,init="pca",perplexity=st.session_state.perplexity,learning_rate=st.session_state.learning_rate).fit_transform(sub_embeddings)
149
+
150
+ else:
151
+ n_neighbors = min(st.session_state.n_neighbors, len(sub_embeddings)-1 )
152
+ low_dim_embeddings = umap.UMAP(n_neighbors=n_neighbors, min_dist=st.session_state.min_dist,n_components=3).fit(sub_embeddings).embedding_
153
+
154
+ df_embeddings = pd.DataFrame(low_dim_embeddings)
155
+ df_embeddings = df_embeddings.rename(columns={0:'x',1:'y',2:'z'})
156
+ df_embeddings = df_embeddings.assign(severity=prediction_worst_label)
157
+ df_embeddings = df_embeddings.assign(text=sentences_parses)
158
+ df_embeddings = df_embeddings.assign(data_index=indices)
159
+ df_embeddings = df_embeddings.assign(all_predictions=prediction_label)
160
+
161
+
162
+ if st.session_state.embeddings_plot == "2D":
163
+ # 2D
164
+ plot = plotpx.scatter(
165
+ df_embeddings, x='x', y='y',
166
+ color='severity', labels={'color': 'severity'},
167
+ hover_data=['text','all_predictions','data_index'],title = 'BERT Embeddings Visualization - Please select rows (at least 4) to display specific examples'
168
+ )
169
+
170
+ else:
171
+ # 3D
172
+ plot = plotpx.scatter_3d(
173
+ df_embeddings, x='x', y='y', z='z',
174
+ color='severity', labels={'color': 'severity'},
175
+ hover_data=['text','all_predictions','data_index'],title = 'BERT Embeddings Visualization - Please select rows (at least 4) to display specific examples'
176
+ )
177
+
178
+ st.plotly_chart(plot,use_container_width=True,)
179
+
180
+
181
+ #worst_rank_ind = [classes.index(x) for x in worst_rank]
182
+
183
+ if 'bert_lime_output' in st.session_state and st.session_state.bert_lime:
184
+ if len(st.session_state.bert_lime_output) >0: # need to re-run prediction
185
+ st.markdown("BERT Interpretability")
186
+ components.html(st.session_state.bert_lime_output[0])
187
+
188
+ if 'json_output' in st.session_state and st.session_state.json_out:
189
+
190
+ st.markdown("Here are your analysis results in JSON format:")
191
+ out = {}
192
+ if "selected_indices" in st.session_state:
193
+
194
+ if len(st.session_state.selected_indices) >0:
195
+ l = st.session_state.selected_indices
196
+ l.sort()
197
+ for ind in l:
198
+ out['sample_'+str(ind)] = st.session_state.json_output['sample_'+str(ind)]
199
+ st.json(out)
200
+ else:
201
+ out['sample_'+str(0)] = st.session_state.json_output['sample_'+str(0)]
202
+ st.json(out)
203
+ else:
204
+ # Display JSON
205
+ out['sample_'+str(0)] = st.session_state.json_output['sample_'+str(0)]
206
+ st.json(out)
207
+
208
+
209
+ def delete_var_session(keys:list):
210
+ for key in keys:
211
+ if key in st.session_state:
212
+ del st.session_state[key]
213
+
214
+ im = Image.open(os.path.join(project_dir, "imgs/icon.png"))
215
+
216
+
217
+ # Wide mode
218
+ st.set_page_config(page_title='HCSBC', layout = 'wide',page_icon=im,menu_items={
219
+ 'Get Help': 'https://github.com/thiagosantos1/BreastPathologyClassificationSystem',
220
+ 'Report a bug': "https://github.com/thiagosantos1/BreastPathologyClassificationSystem",
221
+ 'About': "An end-to-end breast pathology classification system https://github.com/thiagosantos1/BreastPathologyClassificationSystem"
222
+ })
223
+ st.sidebar.image(os.path.join(project_dir,"imgs/doctor.png"),use_column_width=False)
224
+
225
+ # Designing the interface
226
+ st.markdown("<h1 style='text-align: center; color: black;'>HCSBC: Hierarchical Classification System for Breast Cancer</h1>", unsafe_allow_html=True)
227
+ st.markdown("System Pipeline: Pathology Emory PubMed BERT + 6 independent Machine Learning discriminators")
228
+ # For newline
229
+ st.write('\n')
230
+ # Instructions
231
+ st.markdown("*Hint: click on the top-right corner to enlarge it!*")
232
+ # Set the columns
233
+
234
+ cols = st.columns((1, 1, 1))
235
+ #cols = st.columns(4)
236
+ cols[0].subheader("Input Data")
237
+ cols[1].subheader("Severity Predictions")
238
+ cols[2].subheader("Diagnosis Predictions")
239
+
240
+ # Sidebar
241
+ # File selection
242
+ st.sidebar.title("Data Selection")
243
+
244
+ st.session_state['input_type'] = st.sidebar.radio("Input Selection", ('File', 'Text'), key="data_format")
245
+ if "prev_input_type" not in st.session_state:
246
+ st.session_state['prev_input_type'] = st.session_state.input_type
247
+
248
+ st.write('<style>div.row-widget.stRadio > div{flex-direction:row;}</style>', unsafe_allow_html=True)
249
+
250
+
251
+ # Disabling warning
252
+ st.set_option('deprecation.showfileUploaderEncoding', False)
253
+
254
+
255
+ if st.session_state['input_type'] == 'File':
256
+ if st.session_state['prev_input_type'] == 'Text':
257
+ delete_var_session(keys=["data_df","data_columns","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
258
+ st.session_state['prev_input_type'] = "File"
259
+
260
+ # Choose your own file
261
+ new_file = st.sidebar.file_uploader("Upload Document", type=['xlsx','csv'])
262
+ if 'uploaded_file' in st.session_state and st.session_state.uploaded_file != None and new_file != None:
263
+ if st.session_state.uploaded_file.name != new_file.name and st.session_state.uploaded_file.id != new_file.id:
264
+ delete_var_session(keys=["data_df","data_columns","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
265
+
266
+ st.session_state['uploaded_file'] = new_file
267
+
268
+ data_columns = ['Input']
269
+ if 'data_columns' not in st.session_state:
270
+ st.session_state['data_columns'] = data_columns
271
+
272
+ if st.session_state.uploaded_file is not None:
273
+ if 'data_df' not in st.session_state:
274
+ if st.session_state.uploaded_file.name.endswith('.xlsx'):
275
+ df = pd.read_excel(st.session_state.uploaded_file)
276
+ else:
277
+ df = pd.read_csv(st.session_state.uploaded_file)
278
+
279
+ df = df.loc[:, ~df.columns.str.contains('^Unnamed')]
280
+ df = df.fillna("NA")
281
+ data_columns = df.columns.values
282
+ st.session_state['data_df'] = df
283
+ st.session_state['data_columns'] = data_columns
284
+ else:
285
+ if st.session_state['prev_input_type'] == 'File':
286
+ delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
287
+ st.session_state['prev_input_type'] = "Text"
288
+
289
+ input_column = "Input"
290
+ data = st.sidebar.text_area("Please enter a breast cancer pathology diagnosis")
291
+ if "user_input" in st.session_state:
292
+ if data != st.session_state.user_input:
293
+ delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
294
+
295
+ st.session_state['user_input'] = data
296
+ if len(st.session_state.user_input.split()) >0:
297
+ st.session_state['data_df'] = pd.DataFrame([st.session_state['user_input']], columns =[input_column])
298
+ st.session_state['input_column'] = input_column
299
+ st.session_state['uploaded_file'] = True
300
+ else:
301
+ delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
302
+
303
+
304
+ if 'data_df' in st.session_state:
305
+ cols[0].dataframe(st.session_state.data_df)
306
+
307
+
308
+ if st.session_state['input_type'] == 'File':
309
+ # Columns selection
310
+ st.sidebar.write('\n')
311
+ st.sidebar.title("Column For Prediction")
312
+ input_column = st.sidebar.selectbox("Columns", st.session_state.data_columns)
313
+
314
+ st.session_state['input_column'] = input_column
315
+
316
+
317
+ st.sidebar.write('\n')
318
+ st.sidebar.title("Severity Model")
319
+ input_higher = st.sidebar.selectbox("Model", ["PathologyEmoryPubMedBERT"])
320
+ st.session_state['input_higher'] = input_higher
321
+
322
+ if "prev_input_higher" not in st.session_state:
323
+ st.session_state['prev_input_higher'] = st.session_state.input_higher
324
+ st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
325
+ st.session_state['load_new_higher_model'] = True
326
+ elif st.session_state.prev_input_higher != st.session_state.input_higher:
327
+ st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
328
+ st.session_state['prev_input_higher'] = st.session_state.input_higher
329
+ st.session_state['load_new_higher_model'] = True
330
+ delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
331
+
332
+
333
+ st.sidebar.write('\n')
334
+ st.sidebar.title("Diagnosis Model")
335
+ input_all_labels = st.sidebar.selectbox("Model", ['single_vectorizer', 'branch_vectorizer'])
336
+ st.session_state['input_all_labels'] = input_all_labels
337
+
338
+ if "prev_input_all_labels" not in st.session_state:
339
+ st.session_state['prev_input_all_labels'] = st.session_state.input_all_labels
340
+ st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
341
+ st.session_state['load_new_all_label_model'] = True
342
+ elif st.session_state.prev_input_all_labels != st.session_state.input_all_labels:
343
+ st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
344
+ st.session_state['prev_input_all_labels'] = st.session_state.input_all_labels
345
+ st.session_state['load_new_all_label_model'] = True
346
+ delete_var_session(keys=["data_df","input_column","user_input","hg_df","all_l","highlight_samples","selected_indices","json_output","bert_lime_output","embeddings_all"])
347
+
348
+
349
+ # For newline
350
+ st.sidebar.write('\n')
351
+ st.sidebar.title("Analysis Options")
352
+
353
+ predictions, json_output, higher_order_pred,all_labels_pred,higher_order_prob,all_labels_prob = {},[],[],[],[],[]
354
+ hg_df, all_l,highlight_samples, bert_lime_output, embeddings_all= [],[],[],[],[]
355
+
356
+
357
+ if st.session_state['input_type'] == 'File':
358
+ embeddings_plot = st.sidebar.radio('Display embeddings plot',
359
+ ['2D',
360
+ '3D',
361
+ 'Dont Display'],index=1)
362
+
363
+ st.session_state['embeddings_plot'] = embeddings_plot
364
+
365
+ else:
366
+ st.session_state['embeddings_plot'] = 'Dont Display'
367
+
368
+ if st.session_state['input_type'] == 'File':
369
+ embeddings_type = st.sidebar.radio('Dimensionality Reduction',
370
+ ['PCA',
371
+ 'TSNE','UMAP'],index=0)
372
+
373
+ st.session_state['embeddings_type'] = embeddings_type
374
+
375
+ if st.session_state.embeddings_type == "TSNE":
376
+ perplexity = st.sidebar.slider("Perplexity", min_value=5, max_value=100, step=5, value=30)
377
+ st.session_state['perplexity'] = perplexity
378
+
379
+ learning_rate = st.sidebar.slider("Learning Rate", min_value=10, max_value=1000, step=10, value=100)
380
+ st.session_state['learning_rate'] = learning_rate
381
+
382
+ if st.session_state.embeddings_type == "UMAP":
383
+ n_neighbors = st.sidebar.slider("Neighbors", min_value=2, max_value=100, step=1, value=2)
384
+ st.session_state['n_neighbors'] = n_neighbors
385
+
386
+ min_dist = st.sidebar.slider("Minimal Distance", min_value=0.1, max_value=0.99, step=0.05, value=0.1)
387
+ st.session_state['min_dist'] = min_dist
388
+
389
+ json_out = st.sidebar.checkbox('Display Json',value = True,key='check3')
390
+ st.session_state['json_out'] = json_out
391
+
392
+ if st.session_state['input_type'] == 'Text':
393
+ bert_lime = st.sidebar.checkbox('Display BERT Interpretability',value = False,key='check4') # distinct key; 'check3' is already used by the Display Json checkbox above
394
+ st.session_state['bert_lime'] = bert_lime
395
+ else:
396
+ st.session_state['bert_lime'] = False
397
+
398
+
399
+ # For newline
400
+ st.sidebar.write('\n')
401
+ st.sidebar.title("Prediction")
402
+
403
+
404
+ if st.sidebar.button("Run Prediction"):
405
+
406
+ if st.session_state.uploaded_file is None:
407
+ st.sidebar.write("Please upload your data")
408
+
409
+ else:
410
+ st.session_state['input_all_labels_exist'] = check_if_exist(st.session_state.input_all_labels)
411
+ if not st.session_state.input_all_labels_exist:
412
+ st.sidebar.write("Please Download Model: " + str(st.session_state.input_all_labels))
413
+
414
+ st.session_state['input_higher_exist'] = check_if_exist(st.session_state.input_higher)
415
+ if not st.session_state.input_higher_exist:
416
+ st.sidebar.write("Please Download Model: " + str(st.session_state.input_higher))
417
+
418
+ if st.session_state.input_all_labels_exist and st.session_state.input_higher_exist:
419
+ if "predictor" not in st.session_state or st.session_state.load_new_higher_model or st.session_state.load_new_all_label_model:
420
+ with st.spinner('Loading model...'):
421
+ print("\n\tLoading Model")
422
+ st.session_state["predictor"] = Pipeline(bert_option=str(st.session_state.input_higher), branch_option=str(st.session_state.input_all_labels))
423
+ st.session_state['load_new_higher_model'] = False
424
+ st.session_state['load_new_all_label_model'] = False
425
+
426
+ with st.spinner('Transforming Data...'):
427
+ data = st.session_state.data_df[st.session_state.input_column].values
428
+
429
+ with st.spinner('Analyzing...'):
430
+ time.sleep(0.1)
431
+ prog_bar = st.progress(0)
432
+ logging.info("Running Predictions for data size of: " + str(len(data)))
433
+ logging.info("\n\tRunning Predictions with: " + str(st.session_state.input_higher) + str(st.session_state.input_all_labels))
434
+ for index in tqdm(range(len(data))):
435
+ d = data[index]
436
+ time.sleep(0.1)
437
+ prog_bar.progress(int( (100/len(data)) * (index+1) ))
438
+ # refactor json
439
+ preds,embeddings_output = st.session_state.predictor.run(d)
440
+ embeddings = embeddings_output.tolist()
441
+ embeddings_all.append(embeddings[0])
442
+ if st.session_state.bert_lime:
443
+ logging.info("Running BERT LIME Interpretability Predictions")
444
+ bert_lime_output.append(st.session_state.predictor.bert_interpretability(d))
445
+
446
+ predictions["sample_" + str(index)] = {}
447
+ for ind,pred in enumerate(preds):
448
+ predictions["sample_" + str(index)]["prediction_" + str(ind)] = pred
449
+
450
+
451
+ prog_bar.progress(100)
452
+ time.sleep(0.1)
453
+
454
+ for key,sample in predictions.items():
455
+ higher,all_p, prob_higher, prob_all = [],[],[],[]
456
+ for key,pred in sample.items():
457
+ for higher_order, sub_arr in pred.items():
458
+ higher.append(higher_order)
459
+ prob_higher.append(round(sub_arr["probability"], 2))
460
+ for label,v in sub_arr['labels'].items():
461
+ all_p.append(label)
462
+ prob_all.append(round(v["probability"], 2))
463
+
464
+ higher_order_pred.append(" && ".join(x for x in higher))
465
+ all_labels_pred.append(" && ".join(x for x in all_p))
466
+
467
+ higher_order_prob.append(" && ".join(str(x) for x in prob_higher))
468
+ all_labels_prob.append(" && ".join(str(x) for x in prob_all))
469
+
470
+ predictions_refact = copy.deepcopy(predictions)
471
+
472
+ for index in tqdm(range(len(data))):
473
+ highlights = ""
474
+ key = "sample_" + str(index)
475
+ for k,v in predictions[key].items():
476
+ for k_s, v_s in v.items():
477
+ predictions_refact["sample_" + str(index)]["data"] = v_s['data']
478
+ predictions_refact["sample_" + str(index)]["transformer_data"] = v_s['transformer_data']
479
+ predictions_refact["sample_" + str(index)]["discriminator_data"] = v_s['word_analysis']['discriminator_data']
480
+ highlight = v_s['word_analysis']['highlighted_html_text']
481
+
482
+ if len(highlights) >0:
483
+ done = False
484
+ merged = highlight
485
+ while not done:
486
+ done,merged = update_highlight(merged,highlights)
487
+
488
+ highlights = merged
489
+ else:
490
+ highlights = highlight
491
+
492
+ del predictions_refact[key][k][k_s]['data']
493
+ del predictions_refact[key][k][k_s]['transformer_data']
494
+ del predictions_refact[key][k][k_s]['word_analysis']['discriminator_data']
495
+
496
+ highlight_samples.append(highlights)
497
+
498
+ json_output = predictions_refact
499
+
500
+ hg_df = pd.DataFrame(list(zip(higher_order_pred, higher_order_prob)), columns =['Prediction', "Probability"])
501
+ all_l = pd.DataFrame(list(zip(all_labels_pred,all_labels_prob)), columns =['Prediction',"Probability"])
502
+ all_preds = pd.DataFrame(list(zip(higher_order_pred, all_labels_pred)), columns =['Severity Prediction',"Diagnosis Prediction"])
503
+
504
+ st.session_state['hg_df'] = hg_df
505
+ st.session_state['all_l'] = all_l
506
+ st.session_state['all_preds'] = all_preds
507
+ st.session_state['json_output'] = json_output
508
+ st.session_state['highlight_samples'] = highlight_samples
509
+ st.session_state['highlight_samples_df'] = pd.DataFrame(highlight_samples, columns =["HTML Word Importance"])
510
+ st.session_state['bert_lime_output'] = bert_lime_output
511
+ st.session_state['embeddings_all'] = np.asarray(embeddings_all)
512
+
513
+ if 'data_df' in st.session_state and 'json_output' in st.session_state:
514
+ st.markdown("<h1 style='text-align: center; color: purple;'>Model Analysis</h1>", unsafe_allow_html=True)
515
+ selected_indices = st.multiselect('Select Rows to Display Word Importance, Embeddings Visualization, and Json Analysis:', [x for x in range(len(st.session_state.data_df))])
516
+ st.session_state['selected_indices'] = selected_indices
517
+
518
+ add_content(cols)
519
+
520
+
521
+ if 'json_output' in st.session_state:
522
+ st.sidebar.write('\n')
523
+ st.sidebar.title("Save Results")
524
+
525
+ st.sidebar.write('\n')
526
+ st.sidebar.download_button(
527
+ label="Download Output Json",
528
+ data=str(st.session_state.json_output),
529
+ file_name="output.json",
530
+ )
531
+ st.sidebar.download_button(
532
+ label="Download Predictions",
533
+ data=st.session_state.all_preds.to_csv(),
534
+ file_name="predictions.csv",
535
+ )
536
+ st.sidebar.download_button(
537
+ label="Download Data + Predictions",
538
+ data = pd.concat([st.session_state.data_df, st.session_state.all_preds,st.session_state.highlight_samples_df], axis=1, join='inner').to_csv(),
539
+ file_name="data_predictions.csv",
540
+ )
541
+
542
+ st.sidebar.write('\n')
543
+ st.sidebar.title("Contact Me")
544
+ sub_colms = st.sidebar.columns([1, 1, 1])
545
+ sub_colms[0].markdown('''<a href="https://github.com/thiagosantos1/BreastPathologyClassificationSystem">
546
+ <img src="https://img.icons8.com/fluency/48/000000/github.png" /></a>''',unsafe_allow_html=True)
547
+ sub_colms[1].markdown('''<a href="https://twitter.com/intent/follow?original_referer=https%3A%2F%2Fgithub.com%2Ftsantos_maia&screen_name=tsantos_maia">
548
+ <img src="https://img.icons8.com/color/48/000000/twitter--v1.png" /></a>''',unsafe_allow_html=True)
549
+ sub_colms[2].markdown('''<a href="https://www.linkedin.com/in/thiagosantos-cs/">
550
+ <img src="https://img.icons8.com/color/48/000000/linkedin.png" /></a>''',unsafe_allow_html=True)
551
+
552
+
553
+ hide_menu()
554
+
555
+
556
+
557
+
558
+ if __name__ == '__main__':
559
+
560
+ myargs = [
561
+ "Made in ",
562
+ image('https://avatars3.githubusercontent.com/u/45109972?s=400&v=4',
563
+ width=px(25), height=px(25)),
564
+ " with ❤️ by ",
565
+ link("https://www.linkedin.com/in/thiagosantos-cs/", "@thiagosantos-cs"),
566
+ br(),
567
+ link("https://www.linkedin.com/in/thiagosantos-cs/", image('https://img.icons8.com/color/48/000000/twitter--v1.png')),
568
+ link("https://github.com/thiagosantos1/BreastPathologyClassificationSystem", image('https://img.icons8.com/fluency/48/000000/github.png')),
569
+ ]
570
+ logging.basicConfig(
571
+ format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
572
+ datefmt="%d/%m/%Y %H:%M:%S",
573
+ level=logging.INFO)
574
+ main(myargs)
575
+
576
+
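The file above is a standard Streamlit entry point, so a minimal local launch would look like the sketch below (assuming the Python dependencies are installed and the models referenced in the sidebar have already been fetched with download_models.py further down):

    streamlit run app.py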
config.py ADDED
@@ -0,0 +1,221 @@
1
+ """
2
+ Input config for pipeline
3
+ """
4
+
5
+ def config_file() -> dict:
6
+ config = {
7
+ "BERT_config": {
8
+ "model_emb": 'bert',
9
+
10
+ "model_option": {
11
+ "PathologyEmoryPubMedBERT": {
12
+ "model_folder":"models/higher_order_hierarchy/PathologyEmoryPubMedBERT/"
13
+ },
14
+ "PathologyEmoryBERT": {
15
+ "model_folder":"models/higher_order_hierarchy/PathologyEmoryBERT/"
16
+ },
17
+ "ClinicalBERT": {
18
+ "model_folder":"models/higher_order_hierarchy/ClinicalBERT/"
19
+ },
20
+ "BlueBERT": {
21
+ "model_folder":"models/higher_order_hierarchy/BlueBERT/"
22
+ },
23
+ "BioBERT": {
24
+ "model_folder":"models/higher_order_hierarchy/BioBERT/"
25
+ },
26
+ "BERT": {
27
+ "model_folder":"models/higher_order_hierarchy/BERT/"
28
+ },
29
+
30
+ },
31
+ "max_seq_length": "64",
32
+ "threshold_prediction":0.5,
33
+ "classes": ['Invasive breast cancer-IBC','Non-breast cancer-NBC','In situ breast cancer-ISC',
34
+ 'Borderline lesion-BLL','High risk lesion-HRL','Benign-B','Negative'],
35
+ "worst_rank" : ['Invasive breast cancer-IBC', 'In situ breast cancer-ISC', 'High risk lesion-HRL',
36
+ 'Borderline lesion-BLL','Benign-B','Non-breast cancer-NBC','Negative']
37
+ },
38
+
39
+
40
+ "ibc_config": {
41
+
42
+ "model_option": {
43
+ "single_tfidf": {
44
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
45
+ "model": "ibc_xgboost_classifier.pkl",
46
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
47
+ "vectorizer":"vectorizer_all_branches.pkl",
48
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
49
+ "bigrams":"best_bigrams.csv",
50
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
51
+ "phrase_bigrams" : "phrase_bigrams.pkl"
52
+ },
53
+
54
+ "branch_tfidf": {
55
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
56
+ "model": "ibc_xgboost_classifier.pkl",
57
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
58
+ "vectorizer":"ibc_vectorizer.pkl",
59
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
60
+ "bigrams":"best_bigrams.csv",
61
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
62
+ "phrase_bigrams" : "phrase_bigrams.pkl"
63
+ }
64
+ },
65
+
66
+ "classes": ['apocrine carcinoma','grade i','grade ii','grade iii','invasive ductal carcinoma','invasive lobular carcinoma','medullary carcinoma','metaplastic carcinoma','mucinous carcinoma','tubular carcinoma','lymph node - metastatic']
67
+
68
+ },
69
+
70
+ "isc_config": {
71
+ "model_option": {
72
+ "single_tfidf": {
73
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
74
+ "model": "isc_xgboost_classifier.pkl",
75
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
76
+ "vectorizer":"vectorizer_all_branches.pkl",
77
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
78
+ "bigrams":"best_bigrams.csv",
79
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
80
+ "phrase_bigrams" : "phrase_bigrams.pkl"
81
+ },
82
+
83
+ "branch_tfidf": {
84
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
85
+ "model": "isc_xgboost_classifier.pkl",
86
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
87
+ "vectorizer":"isc_vectorizer.pkl",
88
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
89
+ "bigrams":"best_bigrams.csv",
90
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
91
+ "phrase_bigrams" : "phrase_bigrams.pkl"
92
+ }
93
+ },
94
+
95
+
96
+ "classes": ['ductal carcinoma in situ','high','intermediate','intracystic papillary carcinoma','intraductal papillary carcinoma','low','pagets','fna - malignant']
97
+
98
+ },
99
+
100
+ "hrl_config": {
101
+ "model_option": {
102
+ "single_tfidf": {
103
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
104
+ "model": "hrl_xgboost_classifier.pkl",
105
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
106
+ "vectorizer":"vectorizer_all_branches.pkl",
107
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
108
+ "bigrams":"best_bigrams.csv",
109
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
110
+ "phrase_bigrams" : "phrase_bigrams.pkl"
111
+ },
112
+
113
+ "branch_tfidf": {
114
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
115
+ "model": "hrl_xgboost_classifier.pkl",
116
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
117
+ "vectorizer":"hrl_vectorizer.pkl",
118
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
119
+ "bigrams":"best_bigrams.csv",
120
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
121
+ "phrase_bigrams" : "phrase_bigrams.pkl"
122
+ }
123
+ },
124
+
125
+
126
+ "classes": ['atypical ductal hyperplasia','atypical lobular hyperplasia','atypical papilloma','columnar cell change with atypia','flat epithelial atypia','hyperplasia with atypia','intraductal papilloma','lobular carcinoma in situ','microscopic papilloma','radial scar']
127
+ },
128
+
129
+ "bll_config": {
130
+ "model_option": {
131
+ "single_tfidf": {
132
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
133
+ "model": "bll_xgboost_classifier.pkl",
134
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
135
+ "vectorizer":"vectorizer_all_branches.pkl",
136
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
137
+ "bigrams":"best_bigrams.csv",
138
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
139
+ "phrase_bigrams" : "phrase_bigrams.pkl"
140
+ },
141
+
142
+ "branch_tfidf": {
143
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
144
+ "model": "bll_xgboost_classifier.pkl",
145
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
146
+ "vectorizer":"bll_vectorizer.pkl",
147
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
148
+ "bigrams":"best_bigrams.csv",
149
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
150
+ "phrase_bigrams" : "phrase_bigrams.pkl"
151
+ }
152
+ },
153
+
154
+
155
+ "classes": ['atypical phyllodes', 'granular cell tumor', 'mucocele']
156
+ },
157
+
158
+ "benign_config": {
159
+ "model_option": {
160
+ "single_tfidf": {
161
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
162
+ "model": "benign_xgboost_classifier.pkl",
163
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
164
+ "vectorizer":"vectorizer_all_branches.pkl",
165
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
166
+ "bigrams":"best_bigrams.csv",
167
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
168
+ "phrase_bigrams" : "phrase_bigrams.pkl"
169
+ },
170
+
171
+ "branch_tfidf": {
172
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
173
+ "model": "benign_xgboost_classifier.pkl",
174
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
175
+ "vectorizer":"benign_vectorizer.pkl",
176
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
177
+ "bigrams":"best_bigrams.csv",
178
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
179
+ "phrase_bigrams" : "phrase_bigrams.pkl"
180
+ }
181
+ },
182
+
183
+
184
+ "classes": ['apocrine metaplasia','biopsy site changes','columnar cell change without atypia','cyst','excisional or post-surgical change','fat necrosis','fibroadenoma','fibroadenomatoid','fibrocystic disease','fibromatoses','fibrosis','hamartoma','hemangioma','lactational change','lymph node - benign','myofibroblastoma','myxoma','phyllodes','pseudoangiomatous stromal hyperplasia','sclerosing adenosis','usual ductal hyperplasia','fna - benign','seroma']
185
+ },
186
+
187
+ "nbc_config": {
188
+ "model_option": {
189
+ "single_tfidf": {
190
+ "path_model":"models/all_labels_hierarchy/single_tfidf/classifiers",
191
+ "model": "nbc_xgboost_classifier.pkl",
192
+ "path_vectorizer":"models/all_labels_hierarchy/single_tfidf/vectorizers",
193
+ "vectorizer":"vectorizer_all_branches.pkl",
194
+ "path_bigrmas":"models/all_labels_hierarchy/single_tfidf/vectorizers",
195
+ "bigrams":"best_bigrams.csv",
196
+ "path_phrase_bigrams":"models/all_labels_hierarchy/single_tfidf/vectorizers",
197
+ "phrase_bigrams" : "phrase_bigrams.pkl"
198
+ },
199
+
200
+ "branch_tfidf": {
201
+ "path_model":"models/all_labels_hierarchy/branch_tfidf/classifiers",
202
+ "model": "nbc_xgboost_classifier.pkl",
203
+ "path_vectorizer":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
204
+ "vectorizer":"nbc_vectorizer.pkl",
205
+ "path_bigrmas":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
206
+ "bigrams":"best_bigrams.csv",
207
+ "path_phrase_bigrams":"models/all_labels_hierarchy/branch_tfidf/vectorizers",
208
+ "phrase_bigrams" : "phrase_bigrams.pkl"
209
+ }
210
+ },
211
+
212
+
213
+ "classes": ['lymphoma', 'malignant(sarcomas)', 'non-breast metastasis']
214
+ },
215
+ }
216
+
217
+ return config
218
+
219
+ if __name__ == '__main__':
220
+ pass
221
+
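A small sketch of how this config is consumed (pipeline.py imports `config_file` directly; the keys below are the ones defined above):

    from config import config_file

    cfg = config_file()
    bert_cfg = cfg["BERT_config"]
    print(bert_cfg["classes"])        # the seven severity labels
    print(bert_cfg["model_option"]["PathologyEmoryPubMedBERT"]["model_folder"])
    print(cfg["ibc_config"]["model_option"]["single_tfidf"]["model"])   # per-branch classifier file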
download_models.py ADDED
@@ -0,0 +1,187 @@
1
+
2
+ """ Download pre-trained models from Google drive. """
3
+ import os
4
+ import argparse
5
+ import zipfile
6
+ import logging
7
+ import requests
8
+ from tqdm import tqdm
9
+ import fire
10
+ import re
11
+
12
+ logging.basicConfig(
13
+ format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
14
+ datefmt="%d/%m/%Y %H:%M:%S",
15
+ level=logging.INFO)
16
+
17
+
18
+ "", "", "", "","",""
19
+
20
+
21
+ MODEL_TO_URL = {
22
+
23
+ 'PathologyEmoryPubMedBERT': 'https://drive.google.com/open?id=1l_el_mYXoTIQvGwKN2NZbp97E4svH4Fh',
24
+ 'PathologyEmoryBERT': 'https://drive.google.com/open?id=11vzo6fJBw1RcdHVBAh6nnn8yua-4kj2IX',
25
+ 'ClinicalBERT': 'https://drive.google.com/open?id=1UK9HqSspVneK8zGg7B93vIdTGKK9MI_v',
26
+ 'BlueBERT': 'https://drive.google.com/open?id=1o-tcItErOiiwqZ-YRa3sMM3hGB4d3WkP',
27
+ 'BioBERT': 'https://drive.google.com/open?id=1m7EkWkFBIBuGbfwg7j0R_WINNnYk3oS9',
28
+ 'BERT': 'https://drive.google.com/open?id=1SB_AQAAsHkF79iSAaB3kumYT1rwcOJru',
29
+
30
+ 'single_tfidf': 'https://drive.google.com/open?id=1-hxf7sKRtFGMOenlafdkeAr8_9pOz6Ym',
31
+ 'branch_tfidf': 'https://drive.google.com/open?id=1pDSnwLFn3YzPRac9rKFV_FN9kdzj2Lb0'
32
+ }
33
+
34
+ """
35
+ For large files, Google Drive requires a virus-scan confirmation.
36
+ This function is responsible for extracting the real download link from the confirmation button.
37
+ """
38
+ def get_url_from_gdrive_confirmation(contents):
39
+ url = ""
40
+ for line in contents.splitlines():
41
+ m = re.search(r'href="(\/uc\?export=download[^"]+)', line)
42
+ if m:
43
+ url = "https://docs.google.com" + m.groups()[0]
44
+ url = url.replace("&amp;", "&")
45
+ break
46
+ m = re.search('id="downloadForm" action="(.+?)"', line)
47
+ if m:
48
+ url = m.groups()[0]
49
+ url = url.replace("&amp;", "&")
50
+ break
51
+ m = re.search('"downloadUrl":"([^"]+)', line)
52
+ if m:
53
+ url = m.groups()[0]
54
+ url = url.replace("\\u003d", "=")
55
+ url = url.replace("\\u0026", "&")
56
+ break
57
+ m = re.search('<p class="uc-error-subcaption">(.*)</p>', line)
58
+ if m:
59
+ error = m.groups()[0]
60
+ raise RuntimeError(error)
61
+ if not url:
62
+ return None
63
+ return url
64
+
65
+ def download_file_from_google_drive(id, destination):
66
+ URL = "https://docs.google.com/uc?export=download"
67
+
68
+ session = requests.Session()
69
+
70
+
71
+ response = session.get(URL, params={ 'id' : id }, stream=True)
72
+ URL_new = get_url_from_gdrive_confirmation(response.text)
73
+
74
+ if URL_new != None:
75
+ URL = URL_new
76
+ response = session.get(URL, params={ 'id' : id }, stream=True)
77
+
78
+ token = get_confirm_token(response)
79
+
80
+ if token:
81
+ params = { 'id' : id, 'confirm' : token }
82
+ response = session.get(URL, params=params, stream=True)
83
+
84
+ save_response_content(response, destination)
85
+
86
+ def get_confirm_token(response):
87
+ for key, value in response.cookies.items():
88
+ if key.startswith('download_warning'):
89
+ return value
90
+
91
+ return None
92
+
93
+ def save_response_content(response, destination):
94
+ CHUNK_SIZE = 32768
95
+
96
+ with open(destination, "wb") as f:
97
+ for chunk in tqdm(response.iter_content(CHUNK_SIZE)):
98
+ if chunk: # filter out keep-alive new chunks
99
+ f.write(chunk)
100
+
101
+ def check_if_exist(model:str = "single_tfidf"):
102
+
103
+ if model =="single_vectorizer":
104
+ model = "single_tfidf"
105
+ if model =="branch_vectorizer":
106
+ model = "branch_tfidf"
107
+
108
+ project_dir = os.path.dirname(os.path.abspath(__file__))
109
+ if model != None:
110
+ if model in ['single_tfidf', 'branch_tfidf' ]:
111
+ path='models/all_labels_hierarchy/'
112
+ path_model = os.path.join(project_dir, path, model,'classifiers')
113
+ path_vectorizer = os.path.join(project_dir, path, model,'vectorizers')
114
+ if os.path.exists(path_model) and os.path.exists(path_vectorizer):
115
+ if len(os.listdir(path_model)) >0 and len(os.listdir(path_vectorizer)) >0:
116
+ return True
117
+ else:
118
+ path='models/higher_order_hierarchy/'
119
+ path_folder = os.path.join(project_dir, path, model)
120
+ if os.path.exists(path_folder):
121
+ if len(os.listdir(path_folder + "/" )) >1:
122
+ return True
123
+ return False
124
+
125
+ def download_model(all_labels='single_tfidf', higher_order='PathologyEmoryPubMedBERT'):
126
+ project_dir = os.path.dirname(os.path.abspath(__file__))
127
+
128
+ path_all_labels='models/all_labels_hierarchy/'
129
+ path_higher_order='models/higher_order_hierarchy/'
130
+
131
+ def extract_model(path_file, name):
132
+
133
+ os.makedirs(os.path.join(project_dir, path_file), exist_ok=True)
134
+
135
+ file_destination = os.path.join(project_dir, path_file, name + '.zip')
136
+
137
+ file_id = MODEL_TO_URL[name].split('id=')[-1]
138
+
139
+ logging.info(f'Downloading {name} model (~1000MB zip archive)')
140
+ download_file_from_google_drive(file_id, file_destination)
141
+
142
+ logging.info('Extracting model from archive (~1300MB folder) and saving to ' + str(file_destination))
143
+ with zipfile.ZipFile(file_destination, 'r') as zip_ref:
144
+ zip_ref.extractall(path=os.path.dirname(file_destination))
145
+
146
+ logging.info('Removing archive')
147
+ os.remove(file_destination)
148
+ logging.info('Done.')
149
+
150
+
151
+ if higher_order != None:
152
+ if not check_if_exist(higher_order):
153
+ extract_model(path_higher_order, higher_order)
154
+ else:
155
+ logging.info('Model ' + str(higher_order) + ' already exists')
156
+
157
+ if all_labels!= None:
158
+ if not check_if_exist(all_labels):
159
+ extract_model(path_all_labels, all_labels)
160
+ else:
161
+ logging.info('Model ' + str(all_labels) + ' already exists')
162
+
163
+
164
+
165
+
166
+ def download(all_labels:str = "single_tfidf", higher_order:str = "PathologyEmoryPubMedBERT"):
167
+ """
168
+ Input Options:
169
+ all_labels : single_tfidf, branch_tfidf
170
+ higher_order : PathologyEmoryPubMedBERT, PathologyEmoryBERT, ClinicalBERT, BlueBERT, BioBERT, BERT
171
+ """
172
+ all_labels_options = [ "single_tfidf", "branch_tfidf"]
173
+ higher_order_option = [ "PathologyEmoryPubMedBERT", "PathologyEmoryBERT", "ClinicalBERT", "BlueBERT","BioBERT","BERT" ]
174
+
175
+ if all_labels not in all_labels_options or higher_order not in higher_order_option:
176
+ print("\n\tPlease provide a valid model for downloading")
177
+ print("\n\t\tall_labels: " + " ".join(x for x in all_labels_options))
178
+ print("\n\t\thigher_order: " + " ".join(x for x in higher_order_option))
179
+ exit()
180
+
181
+ download_model(all_labels,higher_order)
182
+
183
+ if __name__ == "__main__":
184
+ fire.Fire(download)
185
+
186
+
187
+
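Because the module hands `download` to `fire.Fire`, it doubles as a command-line tool. A hedged usage sketch (the model names are the options validated in `download()` above):

    # fetch the default pair of models (single_tfidf + PathologyEmoryPubMedBERT)
    python download_models.py
    # or pick a specific combination
    python download_models.py --all_labels branch_tfidf --higher_order ClinicalBERT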
imgs/.DS_Store ADDED
Binary file (6.15 kB).
 
imgs/doctor.png ADDED
imgs/emory_1.png ADDED
imgs/hybrid_system.png ADDED
imgs/icon.png ADDED
imgs/icons8-github-240.png ADDED
imgs/medical-checkup.png ADDED
imgs/pipeline.png ADDED
models/.DS_Store ADDED
Binary file (6.15 kB).
 
models/all_labels_hierarchy/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ # Ignore everything in this directory
2
+ *
3
+ # Except this file
4
+ !.gitignore
models/higher_order_hierarchy/.gitignore ADDED
@@ -0,0 +1,4 @@
1
+ # Ignore everything in this directory
2
+ *
3
+ # Except this file
4
+ !.gitignore
pipeline.py ADDED
@@ -0,0 +1,668 @@
1
+ import os
2
+ import sys
3
+
4
+ import text_cleaning_transforerms as tc
5
+ import text_cleaning
6
+
7
+ import logging
8
+ import torch
9
+
10
+ import matplotlib.pyplot as plt
11
+ import numpy as np
12
+ import pandas as pd
13
+ import itertools
14
+ import json
15
+ import joblib
16
+ from gensim.models import phrases
17
+
18
+ import math
19
+
20
+ import xgboost
21
+ import re
22
+ import nltk
23
+ nltk.download('stopwords')
24
+ nltk.download('wordnet')
25
+ import html
26
+
27
+ from config import config_file
28
+
29
+
30
+ from lime import lime_text
31
+ from lime.lime_text import LimeTextExplainer
32
+
33
+
34
+ from transformers import AutoModelForSequenceClassification,AutoTokenizer
35
+
36
+ from nltk.tokenize import word_tokenize
37
+
38
+
39
+ """
40
+ Cancer Severity Class.
41
+
42
+ Paths can be overridden through environment variables, e.g. export model_folder="/path/to/model"
43
+ """
44
+ class BERT_Model(object):
45
+ def __init__(self, config, bert_option:str="ClinicalBERT"):
46
+
47
+ try:
48
+ self.config = config
49
+ self.project_dir = os.path.dirname(os.path.abspath(__file__))
50
+ self.bert_option = bert_option
51
+ # check if a path was already added to the OS environment
52
+
53
+ if "model_folder" in os.environ:
54
+ self.config['model_folder'] = os.environ['model_folder']
55
+ else:
56
+ self.config['model_folder'] = os.path.join(self.project_dir, self.config['model_option'][self.bert_option]['model_folder'])
57
+
58
+ self.initialize()
59
+ except Exception as e:
60
+ logging.exception("Error occurred while Initializing BERT Model, please double check you have a config file " +" Info: " + str(e))
61
+ exit()
62
+
63
+ def initialize(self):
64
+ # Set up logging
65
+ logging.basicConfig(
66
+ format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",
67
+ datefmt="%d/%m/%Y %H:%M:%S",
68
+ level=logging.INFO)
69
+
70
+ # Check for GPUs
71
+ if torch.cuda.is_available():
72
+ self.config["use_cuda"] = True
73
+ self.config["cuda_device"] = torch.cuda.current_device()
74
+ logging.info("Using GPU (`%s`)", torch.cuda.get_device_name())
75
+ else:
76
+ self.config["use_cuda"] = False
77
+ self.config["cuda_device"] = "cpu"
78
+ logging.info("Using CPU")
79
+
80
+
81
+ self.model = AutoModelForSequenceClassification.from_pretrained(self.config["model_folder"], num_labels=len(self.config['classes']),output_hidden_states=True).to(self.config["cuda_device"])
82
+ self.tokenizer = AutoTokenizer.from_pretrained(self.config["model_folder"])
83
+
84
+
85
+ def clean_data(self,text:str):
86
+ return tc.pre_process(text,max_size=int(self.config["max_seq_length"]),remove_punctuation=True )
87
+
88
+ def sigmoid(self,x):
89
+ return 1 / (1 + math.exp(-x))
90
+
91
+ """
92
+ Convert raw multi-class outputs to probabilities between 0 and 1
93
+ """
94
+ def raw_to_probs(self,vector):
95
+ return [self.sigmoid(x) for x in vector]
96
+
97
+
98
+ """
99
+ Given a threshold, convert a vector of probabilities into predictions (0 or 1)
100
+ """
101
+ def _threshold(self, vector:list, threshold:float=0.5) -> list:
102
+ logit_vector = [1 if x >=threshold else 0 for x in vector]
103
+ return logit_vector
104
+
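A quick worked example of the two helpers above (0.5 is the `threshold_prediction` default from config.py; the `self.` receiver is dropped for brevity):

    raw_to_probs([2.0, -1.0])   # -> approximately [0.881, 0.269]
    _threshold([0.881, 0.269])  # -> [1, 0], the multi-label prediction vector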
105
+ """
106
+ Pre-Process the data according to the same strategy used during training
107
+ """
108
+ def pre_process(self,texts:list)-> list:
109
+ transformer_clean_data,transformer_clean_data_chunks = [],[]
110
+ for index,t in enumerate(texts):
111
+ clean_data, clean_data_chunks = self.clean_data(t)
112
+ transformer_clean_data.append(clean_data)
113
+ transformer_clean_data_chunks.append(clean_data_chunks)
114
+
115
+ return transformer_clean_data,transformer_clean_data_chunks
116
+
117
+
118
+ """
119
+ Given a list of texts, return the sentence embeddings (CLS token from the last BERT layer)
120
+ """
121
+ def get_embeddings(self,texts:list)-> list:
122
+
123
+ transformer_clean_data,_ = self.pre_process(texts)
124
+
125
+ inputs = self.tokenizer(transformer_clean_data, return_tensors="pt", padding=True).to(self.config["cuda_device"])
126
+ outputs = self.model(**inputs,output_hidden_states=True)
127
+ last_hidden_states = outputs[1][-1].detach().cpu().numpy()
128
+ embeddings_output = np.asarray(last_hidden_states[:, 0])
129
+
130
+ return embeddings_output
131
+
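The embeddings returned above are what app.py later projects down for its scatter plots; a minimal sketch of that reduction step (PCA is one of the three options offered in the sidebar, alongside t-SNE and UMAP):

    from sklearn.decomposition import PCA
    low_dim = PCA(n_components=3).fit_transform(embeddings)   # shape (n_texts, 3)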
132
+ """
133
+ Given a list of texts, run BERT prediction for each sample
134
+ If use_chunks is set to True (default), it splits the data into chunks of max_size (set in config.py)
135
+ The final prediction for that sample is the merge of the predictions from every chunk
136
+
137
+ Returns:
138
+ * Predictions
139
+ * Probabilities
140
+ * Sentence Embedding (CLS token from last BERT layer)
141
+ * Pre-Processed data used for Prediction
142
+ """
143
+ def predict(self,texts:list, use_chunks=True)-> list:
144
+
145
+ transformer_clean_data,transformer_clean_data_chunks = self.pre_process(texts)
146
+ ids_chunks = []
147
+ # Flatten all chunks (2D list) into a 1D list (each chunk is fed separately to prediction)
148
+ if use_chunks:
149
+
150
+ flatten_chunks = [j for sub in transformer_clean_data_chunks for j in sub]
151
+ ids = [[x]*len(transformer_clean_data_chunks[x]) for x in range(len(transformer_clean_data_chunks))]
152
+ ids_chunks = [j for sub in ids for j in sub]
153
+ data = flatten_chunks.copy()
154
+ else:
155
+ data = transformer_clean_data.copy()
156
+
157
+ inputs = self.tokenizer(data, return_tensors="pt", padding=True).to(self.config["cuda_device"])
158
+ outputs = self.model(**inputs,output_hidden_states=True)
159
+
160
+ # Post-process output if using chunks --> merge chunk predictions into one
161
+ if use_chunks:
162
+ raw_probs_chunks = outputs[0].detach().cpu().numpy()
163
+ probs_chunks = [self.raw_to_probs(x) for x in raw_probs_chunks]
164
+ probs = np.asarray([[0 for x in range(len(probs_chunks[0]))] for x in range(len(texts))],dtype=float)
165
+ for index, prob in enumerate(probs_chunks):
166
+ id_ = ids_chunks[index]
167
+
168
+ # if there are no predictions for this index yet, add them (base case - avoids all-zero preds)
169
+ if np.sum(probs[id_])<=0:
170
+ probs[id_] = prob
171
+ else: # update to merge predictions
172
+ pred = np.asarray(self._threshold(vector=prob,threshold=self.config["threshold_prediction"]))
173
+ pos_pred_index = np.where(pred>0)[0]
174
+ if len(pos_pred_index)>0:
175
+ for pos in pos_pred_index:
176
+ probs[id_][pos] = prob[pos]
177
+
178
+ else:
179
+ raw_probs = outputs[0].detach().cpu().numpy()
180
+ probs = [self.raw_to_probs(x) for x in raw_probs]
181
+
182
+ predictions = [self._threshold(vector=pred,threshold=self.config["threshold_prediction"]) for pred in probs]
183
+
184
+
185
+
186
+ last_hidden_states = outputs[1][-1].detach().cpu().numpy()
187
+ embeddings_output = np.asarray(last_hidden_states[:, 0])
188
+
189
+ return predictions, probs, embeddings_output, transformer_clean_data
190
+
191
+
192
+
193
+ """
194
+ Given a list of texts, execute the branch prediction
195
+ This function calls BERT predict(), post-processes its predictions, and returns the branch prediction
196
+ Returns:
197
+ * Branch Prediction
198
+ * Sentence Embedding (CLS token from last BERT layer)
199
+ """
200
+ def branch_prediction(self,texts:list)-> list:
201
+ out_pred = []
202
+
203
+ predictions, probs, embeddings_output, transformer_clean_data = self.predict(texts,use_chunks=True)
204
+
205
+ try:
206
+ for index, preds in enumerate(probs):
207
+ preds = np.asarray(preds)
208
+ pos = np.where(preds > 0.5)[0]
209
+ pred = []
210
+ if len(pos) >0:
211
+ for ind in pos:
212
+ pred.append({self.config['classes'][ind]: {"probability":preds[ind], "data":texts[index], "transformer_data": transformer_clean_data[index] }})
213
+ else:
214
+ pred.append({"No Prediction": {"probability":0, "data":texts[index], "transformer_data": transformer_clean_data[index]}})
215
+
216
+ out_pred.append(pred)
217
+ except Exception as e:
218
+ logging.exception("Error occurred on BERT model prediction" +" Info: " + str(e))
219
+ exit()
220
+
221
+ return out_pred,embeddings_output
222
+
223
+
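A hedged sketch of driving the class above on its own (it assumes the PathologyEmoryPubMedBERT weights are already downloaded and that the `BERT_config` sub-dict from config.py is what gets passed in, which is what the attribute accesses above suggest):

    from config import config_file
    from pipeline import BERT_Model

    bert_cfg = config_file()["BERT_config"]
    model = BERT_Model(bert_cfg, bert_option="PathologyEmoryPubMedBERT")
    preds, embeddings = model.branch_prediction(["invasive ductal carcinoma, grade ii"])
    # preds: one list per input text; each entry maps a severity label to its
    #        probability plus the raw and pre-processed text
    # embeddings: CLS-token sentence embeddings from the last BERT layer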
224
+ """
225
+ Cancer Diagnosis Prediction Class.
226
+ This class is used to load each individual branch classifier
227
+ """
228
+ class Branch_Classifier(object):
229
+ def __init__(self, config, branch_option:str="single_tfidf"):
230
+ self.config = config
231
+ self.branch_option = branch_option
232
+ self.project_dir = os.path.dirname(os.path.abspath(__file__))
233
+
234
+ try:
235
+ if "path_model" in os.environ:
236
+ self.config['path_model'] = os.environ['path_model']
237
+ else:
238
+ self.config['path_model'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_model'])
239
+
240
+ if "path_vectorizer" in os.environ:
241
+ self.config['path_vectorizer'] = os.environ['path_vectorizer']
242
+ else:
243
+ self.config['path_vectorizer'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_vectorizer'])
244
+
245
+ if "path_bigrmas" in os.environ:
246
+ self.config['path_bigrmas'] = os.environ['path_bigrmas']
247
+ else:
248
+ self.config['path_bigrmas'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_bigrmas'])
249
+
250
+ if "path_phrase_bigrams" in os.environ:
251
+ self.config['path_phrase_bigrams'] = os.environ['path_phrase_bigrams']
252
+ else:
253
+ self.config['path_phrase_bigrams'] = os.path.join(self.project_dir, self.config['model_option'][self.branch_option]['path_phrase_bigrams'])
254
+
255
+ except Exception as e:
256
+ logging.exception("Error occurred while reading config file. Please read config instructions" +" Info: " + str(e))
257
+ exit()
258
+
259
+ self.initialize()
260
+
261
+
262
+ def initialize(self):
263
+
264
+ try:
265
+ self.model = joblib.load(os.path.join(self.config['path_model'],self.config['model_option'][self.branch_option]['model']))
266
+ self.vectorizer = joblib.load(os.path.join(self.config['path_vectorizer'],self.config['model_option'][self.branch_option]['vectorizer']))
267
+ self.good_bigrams = pd.read_csv(os.path.join(self.config["path_bigrmas"],self.config['model_option'][self.branch_option]['bigrams']))['bigram'].to_list()
268
+ self.phrase_bigrams = phrases.Phrases.load(os.path.join(self.config["path_phrase_bigrams"],self.config['model_option'][self.branch_option]['phrase_bigrams']))
269
+
270
+ except Exception as e:
271
+ logging.exception("Error occurred while initializing models and vectorizer" +" Info: " + str(e))
272
+ exit()
273
+
274
+ """
275
+ Only add specific Bi-grams (Pre-calculated during Training)
276
+ """
277
+ def clean_bigram(self,data:list)-> list:
278
+
279
+ data_clean = []
280
+
281
+ for word in data:
282
+ if re.search("_",word) == None:
283
+ data_clean.append(word)
284
+ else: # the token is a bigram candidate (contains an underscore)
285
+ if word in self.good_bigrams:
286
+ data_clean.append(word)
287
+ else:
288
+ data_clean.append(word.split("_")[0])
289
+ data_clean.append(word.split("_")[1])
290
+
291
+ return np.asarray(data_clean)
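A self-contained illustration of the bigram filtering performed by clean_bigram(); the good_bigrams list here is a stand-in for the pre-computed one:

import re
import numpy as np

good_bigrams = ["grade_two"]                # assumption: replaces the trained bigram list

def keep_good_bigrams(tokens, good_bigrams):
    out = []
    for word in tokens:
        if re.search("_", word) is None or word in good_bigrams:
            out.append(word)                # plain word or approved bigram
        else:
            out.extend(word.split("_"))     # split unknown bigrams back into their parts
    return np.asarray(out)

print(keep_good_bigrams(["invasive_carcinoma", "grade_two", "margin"], good_bigrams))
# -> ['invasive' 'carcinoma' 'grade_two' 'margin']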
292
+
293
+ """
294
+ Given a list of texts, pre-process and format the data.
295
+ """
296
+ def format_data(self,data:list)-> list:
297
+ try:
298
+ X = text_cleaning.text_cleaning(data, steam=False, lemma=True,single_input=True)[0]
299
+
300
+ ### Add bigrams and keep only the good (pre-selected) ones
301
+ X_bigrmas = self.phrase_bigrams[X]
302
+ data_clean = self.clean_bigram(X_bigrmas)
303
+ X_bigrams_clean = ' '.join(map(str, data_clean))
304
+ pre_processed = self.vectorizer.transform([X_bigrams_clean]).toarray(),X_bigrams_clean
305
+
306
+ except Exception as e:
307
+ logging.exception("Error occurred while formatting and cleaning data" +" Info: " + str(e))
308
+ exit()
309
+
310
+ return pre_processed
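The clean -> phrase-bigram -> TF-IDF chain used by format_data() can be exercised roughly as below; the toy corpus, the Phrases thresholds, and the freshly fitted TfidfVectorizer are assumptions standing in for the saved artefacts:

from gensim.models.phrases import Phrases
from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [["invasive", "ductal", "carcinoma", "grade", "two"],
          ["benign", "fibroadenoma", "grade", "two"]]          # toy tokenised reports

phrase_bigrams = Phrases(corpus, min_count=1, threshold=1)      # stands in for the loaded model
tokens_with_bigrams = phrase_bigrams[corpus[0]]                 # frequent pairs get joined, e.g. 'grade_two'

vectorizer = TfidfVectorizer()                                  # stands in for the loaded vectorizer
vectorizer.fit([" ".join(phrase_bigrams[doc]) for doc in corpus])
features = vectorizer.transform([" ".join(tokens_with_bigrams)]).toarray()
print(features.shape)

In practice a saved Phrases model would typically be frozen before repeated application; the toy version above skips that for brevity.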
311
+
312
+
313
+ def html_escape(self,text):
314
+ return html.escape(text)
315
+
316
+ def predict(self, texts:list)-> list:
317
+ """
318
+ Steps:
319
+ 1) Run the predictions from higher-order
320
+ 2) Based on that prediction, decide which branch(es) to run for the final prediction (cancer characteristics)
321
+ 3) For final prediction, create a word importance HTML for each input
322
+ """
323
+ out_pred = {'predictions': {}, 'word_analysis':{},}
324
+
325
+ color = "234, 131, 4" # orange
326
+ try:
327
+ for t in texts:
328
+ text_tfidf,clean_data = self.format_data(t)
329
+ probs = self.model.predict_proba(text_tfidf).toarray()
330
+ predictions = self.model.predict(text_tfidf).toarray()
331
+ for index,preds in enumerate(predictions):
332
+ pos = np.where(preds > 0.5)[0]
333
+ pred = []
334
+ if len(pos) >0:
335
+ for ind in pos:
336
+ highlighted_html_text = []
337
+ weigts = self.model.classifiers_[ind].feature_importances_
338
+ word_weights = {}
339
+ words = clean_data.split()
340
+ min_new = 0
341
+ max_new = 100
342
+ min_old = np.min(weigts)
343
+ max_old = np.max(weigts)
344
+ for w in words:
345
+ found = False
346
+ for word, key in self.vectorizer.vocabulary_.items():
347
+ if w == word:
348
+ found = True
349
+ # rescale weights
350
+ weight = ( (max_new - min_new) / (max_old - min_old) * (weigts[key] - max_old) + max_new)
351
+ if weight <0.5:
352
+ weight = 0
353
+
354
+
355
+ if "_" in w: # add for each word
356
+ w1,w2 = w.split("_")
357
+ word_weights[w1] = weight
358
+ word_weights[w2] = weight
359
+ if w2 =="one":
360
+ word_weights["1"] = weight
361
+ word_weights["i"] = weight
362
+ if w2 =="two":
363
+ word_weights["2"] = weight
364
+ word_weights["ii"] = weight
365
+ if w2 =="three":
366
+ word_weights["3"] = weight
367
+ word_weights["iii"] = weight
368
+ else:
369
+ word_weights[w] = weight
370
+ if not found: # some words are not present in the model's vocabulary
371
+ word_weights[w] = 0
372
+
373
+ words = word_tokenize(t.lower().replace("-", " - ").replace("_", " ").replace(".", " . ").replace(",", " , ").replace("(", " ( ").replace(")", " ) "))
374
+ for i,w in enumerate(words):
375
+ if w not in word_weights or w=='-' or w==',' or w=='.' or w=="(" or w==")":
376
+ word_weights[w] = 0
377
+ highlighted_html_text.append(w)
378
+ else:
379
+ weight = 0 if word_weights[w] <1 else word_weights[w]
380
+ highlighted_html_text.append('<span font-size:40px; ; style="background-color:rgba(' + color + ',' + str(weight) + ');">' + self.html_escape(w) + '</span>')
381
+
382
+
383
+
384
+ highlighted_html_text = ' '.join(highlighted_html_text)
385
+ #pred.append({ "predictions": {self.config['classes'][ind]: {"probability":probs[index][ind]}},"word_analysis": {"discriminator_data": clean_data,"word_importance": word_weights, "highlighted_html_text":highlighted_html_text}})
386
+ out_pred["predictions"][self.config['classes'][ind]] = {"probability":probs[index][ind]}
387
+ out_pred["word_analysis"] = {"discriminator_data": clean_data,"word_importance": word_weights, "highlighted_html_text":highlighted_html_text}
388
+
389
+ else:
390
+ out_pred["predictions"] = {"Unkown": {"probability":0.5}}
391
+ out_pred["word_analysis"] = {"discriminator_data": clean_data,"word_importance": {x:0 for x in t.split()}, "highlighted_html_text": " ".join(x for x in t.split())}
392
+
393
+ #pred.append({"predictions": {"Unkown": {"probability":0.5}}, "word_analysis": {"discriminator_data": clean_data,"word_importance": {x:0 for x in t.split()}, "highlighted_html_text": " ".join(x for x in t.split())}})
394
+
395
+ #out_pred.append(pred)
396
+
397
+ except Exception as e:
398
+ logging.exception("Error occurred on model prediction" +" Info: " + str(e))
399
+ exit()
400
+
401
+ return out_pred
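The word-weight rescaling inside predict() is a plain min-max mapping onto [0, 100]; the same formula in standalone form:

import numpy as np

def rescale(weights, new_min=0.0, new_max=100.0):
    # Map raw feature importances onto [new_min, new_max] (same formula as above).
    old_min, old_max = np.min(weights), np.max(weights)
    return (new_max - new_min) / (old_max - old_min) * (weights - old_max) + new_max

print(rescale(np.array([0.0, 0.02, 0.05])))    # -> [0., 40., 100.]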
402
+
403
+
404
+ class LIME_Interpretability(object):
405
+
406
+ """
407
+ Class for LIME Analysis
408
+
409
+ """
410
+
411
+ def __init__(self, label_colors = { "positive": "234, 131, 4", # orange
412
+ "negative":'65, 137, 225', # blue
413
+ }):
414
+
415
+ self.color_classes = label_colors
416
+
417
+ # function to normalize, if applicable
418
+ def __normalize_MinMax(self,arr, t_min=0, t_max=1):
419
+ norm_arr = []
420
+ diff = t_max - t_min
421
+ diff_arr = max(arr) - min(arr)
422
+ for i in arr:
423
+ temp = (((i - min(arr)) * diff) / diff_arr) + t_min
424
+ norm_arr.append(temp)
425
+ return norm_arr
426
+
427
+
428
+ def __html_escape(self,text):
429
+ return html.escape(text)
430
+
431
+
432
+ def __add_bigrams(self,txt):
433
+ fixed_bigrams = [ [' gradeone ', 'grade 1', 'grade i', 'grade I', 'grade one',],
434
+ [' gradetwo ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
435
+ [' gradethree ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
436
+ for b in fixed_bigrams:
437
+ sub = ""
438
+ not_first = False
439
+ for x in b[1:]:
440
+ if not_first:
441
+ sub += "|"
442
+ not_first = True
443
+
444
+ sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
445
+ txt = re.sub(sub, b[0], txt)
446
+ # Removing multiple spaces
447
+ txt = re.sub(r'\s+', ' ', txt)
448
+ txt = re.sub(' +', ' ', txt)
449
+ return txt
450
+
451
+ def __highlight_full_data(self,lime_weights, data, exp_labels,class_names):
452
+ words_p = [x[0] for x in lime_weights if x[1]>0]
453
+ weights_p = np.asarray([x[1] for x in lime_weights if x[1] >0])
454
+ if len(weights_p) >1:
455
+ weights_p = self.__normalize_MinMax(weights_p, t_min=min(weights_p), t_max=1)
456
+ else:
457
+ weights_p = [1]
458
+ words_n = [x[0] for x in lime_weights if x[1]<0]
459
+ weights_n = np.asarray([x[1] for x in lime_weights if x[1] <0])
460
+ # weights_n = self.__normalize_MinMax(weights_n, t_min=max(weights_p), t_max=-0.8)
461
+
462
+ labels = exp_labels
463
+ pred = class_names[labels[0]]
464
+ corr_pred = class_names[labels[1]] # negative lime weights
465
+
466
+ # positive values
467
+ df_coeff = pd.DataFrame(
468
+ {'word': words_p,
469
+ 'num_code': weights_p
470
+ })
471
+ word_to_coeff_mapping_p = {}
472
+ for row in df_coeff.iterrows():
473
+ row = row[1]
474
+ word_to_coeff_mapping_p[row[0]] = row[1]
475
+
476
+ # negative values
477
+ df_coeff = pd.DataFrame(
478
+ {'word': words_n,
479
+ 'num_code': weights_n
480
+ })
481
+
482
+ word_to_coeff_mapping_n = {}
483
+ for row in df_coeff.iterrows():
484
+ row = row[1]
485
+ word_to_coeff_mapping_n[row[0]] = row[1]
486
+
487
+ max_alpha = 1
488
+ highlighted_text = []
489
+ data = re.sub("-"," ", data)
490
+ data = re.sub("/","", data)
491
+ for word in word_tokenize(self.__add_bigrams(data)):
492
+ if word.lower() in word_to_coeff_mapping_p or word.lower() in word_to_coeff_mapping_n:
493
+ if word.lower() in word_to_coeff_mapping_p:
494
+ weight = word_to_coeff_mapping_p[word.lower()]
495
+ else:
496
+ weight = word_to_coeff_mapping_n[word.lower()]
497
+
498
+ if weight >0:
499
+ color = self.color_classes["positive"]
500
+ else:
501
+ color = self.color_classes["negative"]
502
+ weight *= -1
503
+ weight *=10
504
+
505
+ highlighted_text.append('<span font-size:40px; ; style="background-color:rgba(' + color + ',' + str(weight) + ');">' + self.__html_escape(word) + '</span>')
506
+
507
+ else:
508
+ highlighted_text.append(word)
509
+
510
+ highlighted_text = ' '.join(highlighted_text)
511
+
512
+ return highlighted_text
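Each highlighted token ends up as an inline-styled <span>; a small sketch of that rendering step (the colour string and weight are arbitrary):

import html

def highlight(word, weight, color="234, 131, 4"):
    return ('<span style="background-color:rgba(' + color + ',' + str(weight) + ');">'
            + html.escape(word) + '</span>')

print(highlight("carcinoma", 0.8))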
513
+
514
+
515
+ def lime_analysis(self,model,data_original, data_clean, num_features=30, num_samples=50, top_labels=2,
516
+ class_names=['ibc', 'nbc', 'isc', 'bll', 'hrl', 'benign', 'negative']):
517
+
518
+ # LIME Predictor Function
519
+ def predict(texts):
520
+ results = []
521
+ for text in texts:
522
+ predictions, probs, embeddings_output, transformer_clean_data = model.predict([text],use_chunks=False)
523
+ results.append(probs[0])
524
+
525
+ return np.array(results)
526
+
527
+ explainer = LimeTextExplainer(class_names=class_names)
528
+ exp = explainer.explain_instance(data_clean, predict, num_features=num_features,
529
+ num_samples=num_samples, top_labels=top_labels)
530
+ l = exp.available_labels()
531
+ run_info = exp.as_list(l[0])
532
+ return self.__highlight_full_data(run_info, data_original, l,class_names)
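The LIME call above works against any classifier that maps a list of texts to a probability matrix; in this sketch a tiny scikit-learn pipeline stands in for the BERT model, and the training texts and class names are invented:

from lime.lime_text import LimeTextExplainer
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

texts  = ["invasive ductal carcinoma grade two", "benign fibroadenoma no suspicious finding"] * 10
labels = [1, 0] * 10
clf = make_pipeline(TfidfVectorizer(), LogisticRegression()).fit(texts, labels)

explainer = LimeTextExplainer(class_names=["benign", "malignant"])
exp = explainer.explain_instance(texts[0], clf.predict_proba,
                                 num_features=5, num_samples=100, top_labels=1)
print(exp.as_list(exp.available_labels()[0]))    # [(word, weight), ...]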
533
+
534
+
535
+ """
536
+ The pipeline is responsible for consolidating the output of all models (higher-order and all-labels hierarchy).
537
+ It takes a string as input and returns a JSON object with the higher-order (Severity) and all-labels (Diagnosis) predictions and their probability scores.
538
+ """
539
+ class Pipeline(object):
540
+
541
+ def __init__(self, bert_option:str="clinicalBERT", branch_option:str="single_tfidf"):
542
+ logging.basicConfig(format="%(asctime)s - %(levelname)s - %(filename)s - %(message)s",datefmt="%d/%m/%Y %H:%M:%S",level=logging.INFO)
543
+
544
+ if branch_option =="single_vectorizer":
545
+ self.branch_option = "single_tfidf"
546
+ elif branch_option =="branch_vectorizer":
547
+ self.branch_option = "branch_tfidf"
548
+ else:
549
+ self.branch_option=branch_option
550
+
551
+ self.bert_option=bert_option
552
+
553
+ try:
554
+ self.config = config_file()
555
+ self.BERT_config = self.config['BERT_config']
556
+ self.ibc_config = self.config['ibc_config']
557
+ self.isc_config = self.config['isc_config']
558
+ self.hrl_config = self.config['hrl_config']
559
+ self.bll_config = self.config['bll_config']
560
+ self.benign_config = self.config['benign_config']
561
+ self.nbc_config = self.config['nbc_config']
562
+
563
+ except Exception as e:
564
+ logging.exception("Error occurred while initializing models and vectorizer" +" Info: " + str(e))
565
+ exit()
566
+
567
+ self.lime_interpretability = LIME_Interpretability()
568
+
569
+ self.initialize()
570
+
571
+
572
+ def initialize(self):
573
+ try:
574
+ self.bert_model = BERT_Model(self.BERT_config, self.bert_option)
575
+ try:
576
+ self.ibc_branch = Branch_Classifier(self.ibc_config,branch_option=self.branch_option)
577
+ except Exception as e:
578
+ logging.exception("Error occurred while Initializing IBC branch Model, please double check you have a config file " +" Info: " + str(e))
579
+ exit()
580
+
581
+ try:
582
+ self.isc_branch = Branch_Classifier(self.isc_config,branch_option=self.branch_option)
583
+ except Exception as e:
584
+ logging.exception("Error occurred while Initializing isc branch Model, please double check you have a config file " +" Info: " + str(e))
585
+ exit()
586
+
587
+ try:
588
+ self.hrl_branch = Branch_Classifier(self.hrl_config,branch_option=self.branch_option)
589
+ except Exception as e:
590
+ logging.exception("Error occurred while Initializing hrl branch Model, please double check you have a config file " +" Info: " + str(e))
591
+ exit()
592
+
593
+ try:
594
+ self.bll_branch = Branch_Classifier(self.bll_config,branch_option=self.branch_option)
595
+ except Exception as e:
596
+ logging.exception("Error occurred while Initializing bll branch Model, please double check you have a config file " +" Info: " + str(e))
597
+ exit()
598
+
599
+ try:
600
+ self.benign_branch = Branch_Classifier(self.benign_config,branch_option=self.branch_option)
601
+ except Exception as e:
602
+ logging.exception("Error occurred while Initializing benign branch Model, please double check you have a config file " +" Info: " + str(e))
603
+ exit()
604
+
605
+ try:
606
+ self.nbc_branch = Branch_Classifier(self.nbc_config,branch_option=self.branch_option)
607
+ except Exception as e:
608
+ logging.exception("Error occurred while Initializing nbc branch Model, please double check you have a config file " +" Info: " + str(e))
609
+ exit()
610
+
611
+ self.all_label_models = [self.ibc_branch,self.nbc_branch,self.isc_branch,self.bll_branch,self.hrl_branch,self.benign_branch]
612
+
613
+
614
+ except Exception as e:
615
+ logging.exception("Error occurred while Initializing Pipeline, please double check you have a config file " +" Info: " + str(e))
616
+ exit()
617
+
618
+
619
+ """
620
+ Run the entire pipeline
621
+ Steps:
622
+ 1) First, we run the Severity Prediction (BERT)
623
+ 2) Given each prediction for each sample, we then:
624
+ 2.1) Run the corresponding Diagnosis Branch Prediction
625
+ 2.2) Merge every branch prediction
626
+ 3) Merge Every Severity and Branch Prediction
627
+
628
+ Inputs:
629
+ * Text
630
+
631
+ Output:
632
+ * Predictions (predictions + probabilities)
633
+ * Sentence Embedding
634
+ """
635
+ def run(self,input_text:str):
636
+
637
+ """
638
+ First, get the severity prediction (higher order branch)
639
+ """
640
+ predictions,embeddings_output = self.bert_model.branch_prediction([input_text])
641
+ predictions = predictions[0]
642
+ for pred in predictions:
643
+ for higher_order, sub_arr in pred.items():
644
+ # Check which branch it belongs to
645
+ if higher_order in ["Negative","No Prediction"]:
646
+ pred[higher_order]['labels'] = {higher_order: {"probability":sub_arr['probability']}}
647
+ pred[higher_order]["word_analysis"] = {"discriminator_data": "Not Used", "word_importance": {x:0 for x in input_text.split()}, "highlighted_html_text": " ".join(x for x in input_text.split())}
648
+
649
+ # For each Severity, run the corresponding Branch Prediction
650
+ else:
651
+ model = self.all_label_models[self.bert_model.config['classes'].index(higher_order)]
652
+ out_pred = model.predict([input_text])
653
+
654
+ pred[higher_order]['labels'] = out_pred['predictions']
655
+ pred[higher_order]['word_analysis'] = out_pred['word_analysis']
656
+
657
+ return predictions,embeddings_output
658
+
659
+ def bert_interpretability(self, input_text:str):
660
+ return self.lime_interpretability.lime_analysis(self.bert_model,input_text, self.bert_model.clean_data(input_text), class_names=self.bert_model.config['classes'])
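Assuming the model artefacts referenced by config_file() are available on disk, the whole pipeline would be driven roughly like this (the report text is invented):

pipeline = Pipeline(bert_option="clinicalBERT", branch_option="single_tfidf")

report = "Invasive ductal carcinoma, grade 2, identified in the left breast core biopsy."
predictions, embeddings = pipeline.run(report)

for pred in predictions:
    for severity, info in pred.items():
        print(severity, info["probability"], list(info["labels"].keys()))

highlighted_html = pipeline.bert_interpretability(report)    # LIME word-importance HTML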
661
+
662
+
663
+ if __name__ == '__main__':
664
+ exit()
665
+
666
+
667
+
668
+
text_cleaning.py ADDED
@@ -0,0 +1,250 @@
1
+ from gensim.parsing import preprocessing
2
+ from gensim.parsing.preprocessing import strip_tags, strip_punctuation,strip_numeric,remove_stopwords
3
+ import re
4
+ from nltk.stem import PorterStemmer
5
+ import nltk
6
+ from nltk.corpus import stopwords
7
+ import pandas as pd
8
+
9
+ def remove_noise_text(txt):
10
+
11
+ txt = txt.lower()
12
+ txt = re.sub('right|left', '', txt) # remove laterality words (right/left)
13
+ txt = re.sub("primary site:", '', txt)
14
+
15
+ #txt = re.sub('post-surgical changes', ' ', txt.lower())
16
+
17
+ # Remove any mentions to " Findings were discussed with...."
18
+ txt = txt.split("findings were discussed with")[0]
19
+
20
+ # Remove any other occurrence of the PI's information
21
+ txt = txt.split("this study has been reviewed and interpreted")[0]
22
+ txt = txt.split("this finding was communicated to")[0]
23
+ txt = txt.split("important findings were identified")[0]
24
+ txt = txt.split("these findings")[0]
25
+ txt = txt.split("findings above were")[0]
26
+ txt = txt.split("findings regarding")[0]
27
+ txt = txt.split("were discussed")[0]
28
+ txt = txt.split("these images were")[0]
29
+ txt = txt.split("important finding")[0]
30
+
31
+ # remove any section headers
32
+ txt = re.sub("post-surgical changes:", '', txt)
33
+ txt = re.sub("post surgical changes:", '', txt)
34
+ txt = re.sub("primary site:", '', txt)
35
+ txt = re.sub("primary site", '', txt)
36
+ txt = re.sub("neck:", '', txt)
37
+ txt = re.sub("post-treatment changes:", '', txt)
38
+ txt = re.sub("post treatment changes:", '', txt)
39
+ txt = re.sub("brain, orbits, spine and lungs:", '', txt)
40
+ txt = re.sub("primary :", '', txt)
41
+ txt = re.sub("neck:", '', txt)
42
+ txt = re.sub("aerodigestive tract:", '', txt)
43
+ txt = re.sub("calvarium, skull base, and spine:", '', txt)
44
+ txt = re.sub("other:", '', txt)
45
+ txt = re.sub("upper neck:", '', txt)
46
+ txt = re.sub("perineural disease:", '', txt)
47
+ txt = re.sub("technique:", '', txt)
48
+ txt = re.sub("comparison:", '', txt)
49
+ txt = re.sub("paranasal sinuses:", '', txt)
50
+ txt = re.sub("included orbits:", '', txt)
51
+ txt = re.sub("nasopharynx:", '', txt)
52
+ txt = re.sub("tympanomastoid cavities:", '', txt)
53
+ txt = re.sub("skull base and calvarium:", '', txt)
54
+ txt = re.sub("included intracranial structures:", '', txt)
55
+ txt = re.sub("abnormal enhancement:", '', txt)
56
+ txt = re.sub("lymph nodes:", '', txt)
57
+ txt = re.sub("impression:", '', txt)
58
+ txt = re.sub("nodes:", '', txt)
59
+ txt = re.sub("mri orbits:", '', txt)
60
+ txt = re.sub("mri brain:", '', txt)
61
+ txt = re.sub("brain:", '', txt)
62
+ txt = re.sub("ct face w/:", '', txt)
63
+ txt = re.sub("transspatial extension:", '', txt)
64
+ txt = re.sub("thyroid bed:", '', txt)
65
+ txt = re.sub("additional findings:", '', txt)
66
+ txt = re.sub("series_image", '', txt)
67
+ txt = re.sub("series image", '', txt)
68
+ txt = re.sub("image series", '', txt)
69
+ txt = re.sub("series", '', txt)
70
+
71
+ txt = re.sub(" mm | mm|mm ", " ", txt)
72
+ txt = re.sub(" series | series|series ", "", txt)
73
+ txt = re.sub(" cm | cm|cm ", " ", txt)
74
+ txt = re.sub(" cc | cc|cc ", " ", txt)
75
+ txt = re.sub(" ct | ct|ct ", " ", txt)
76
+ txt = re.sub(" mri | mri|mri ", " ", txt)
77
+ txt = re.sub(" see | see|see ", " ", txt)
78
+ txt = re.sub(" iia | iia|iia ", " ", txt)
79
+ txt = re.sub("comment", "", txt)
80
+
81
+
82
+ txt = re.sub("post treatment", '', txt)
83
+ txt = re.sub("post_treatment", '', txt)
84
+ txt = re.sub("post-treatment", '', txt)
85
+ txt = re.sub("findings suggest", '', txt)
86
+ txt = re.sub("findings", '', txt)
87
+ txt = re.sub("suggest", '', txt)
88
+ txt = re.sub("study reviewed", '', txt)
89
+ txt = re.sub("study", '', txt)
90
+ txt = re.sub("reviewed", '', txt)
91
+ txt = re.sub("please see", '', txt)
92
+ txt = re.sub("please", '', txt)
93
+
94
+ txt = re.sub("skull base", '', txt)
95
+ txt = re.sub("fdg avid", '', txt)
96
+ txt = re.sub("fdg aivity", '', txt)
97
+ txt = re.sub("please see chest ct for further evaluation of known lung mass", '', txt)
98
+
99
+ txt = re.sub("status_post", '', txt)
100
+ txt = re.sub("status post|clock|/|'/'", '', txt)
101
+ txt = re.sub("statuspost|:", '', txt)
102
+ txt = re.sub(" cm | cm|cm ", " centimeters ", txt)
103
+ txt = re.sub(" cc | cc|cc ", " cubic centimeters ", txt)
104
+ txt = re.sub(" ct | ct|ct ", " carat metric ", txt)
105
+ txt = re.sub(" mm | mm|mm ", " millimeters ", txt)
106
+ #txt = re.sub("(\\d*\\.\\d+)|(\\d+\\.[0-9 ]+)","",txt)
107
+
108
+ # in the worst case, just replace the PI's name with an empty string
109
+ txt = re.sub("dr\\.\\s[^\\s]+", '', txt)
110
+
111
+
112
+ txt = re.sub('\\;', ' .', txt)
113
+ txt = re.sub('\\.', ' .', txt)
114
+
115
+ # Removing multiple spaces
116
+ txt = re.sub(r'\s+', ' ', txt)
117
+
118
+
119
+ return txt
120
+
121
+ def add_bigrams(txt, fixed_bigrams):
122
+
123
+ for b in fixed_bigrams:
124
+ sub = ""
125
+ not_first = False
126
+ for x in b[1:]:
127
+ if not_first:
128
+ sub += "|"
129
+ not_first = True
130
+
131
+ sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
132
+ txt = re.sub(sub, b[0], txt)
133
+
134
+ return txt
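A quick check of what the alternation built above does for the grade-two entry; the sample sentence is invented, and the extra whitespace is collapsed afterwards as the callers do:

import re

sample = "invasive carcinoma grade ii with clear margins"
fixed_bigrams = [[' grade_two ', 'grade 2', 'grade ii', 'grade II', 'grade two']]
merged = add_bigrams(sample, fixed_bigrams)
print(re.sub(r'\s+', ' ', merged))
# -> 'invasive carcinoma grade_two with clear margins'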
135
+
136
+
137
+ def clean_text(txt_orig,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma , clean, min_lenght, eightify=False):
138
+ txt = remove_noise_text(txt_orig)
139
+
140
+ #print("\n\t\tOriginal\n", txt)
141
+ txt = add_bigrams(txt, fixed_bigrams)
142
+ #print("\n\t\tCleaned\n", txt)
143
+ words = preprocessing.preprocess_string(txt, filters)
144
+ words = add_bigrams(" ".join(w for w in words), fixed_bigrams).split()
145
+
146
+ txt = " ".join(w for w in words)
147
+
148
+ # eightify
149
+ #
150
+ if eightify:
151
+ replaces = [ ["her2|her 2|her two", " hertwo "], ["0", "8"], ["1", "8"], ["2", "8"], ["3", "8"],["4", "8"],
152
+ ["5", "8"],["6", "8"] ,["7", "8"] ,["8", "8"] ,["9", "8"] ,
153
+ ["\\>", " greather "], ["\\<", " less "]]
154
+
155
+ else:
156
+ replaces = [ ["her2|her 2|her two", " hertwo "], ["0", "zero "], ["1", "one "], ["2", "two "], ["3", "three "],["4", "four "],
157
+ ["5", "five "],["6", "six "] ,["7", "seven "] ,["8", "eight "] ,["9", "nine " ] ,
158
+ ["\\>", " greather "], ["\\<", " less "]]
159
+
160
+
161
+ for sub in replaces:
162
+ txt = re.sub(sub[0], sub[1], txt)
163
+
164
+ # Removing multiple spaces
165
+ txt = re.sub(r'\s+', ' ', txt)
166
+
167
+ words = txt.split()
168
+
169
+ if clean:
170
+ words = [w for w in words if (not w in stop_words and re.search("[a-z-A-Z]+\\w+",w) != None and (len(w) >min_lenght or w in non_stop_words) or w=='.') ]
171
+ else:
172
+ words = [w for w in words if (re.search("[a-z-A-Z]+\\w+",w) != None and (len(w) >min_lenght or w in non_stop_words) or w=='.')]
173
+
174
+ c_words = words.copy()
175
+
176
+ if steam:
177
+ porter = PorterStemmer()
178
+ c_words = [porter.stem(word) for word in c_words if not porter.stem(word) in freq_words and (len(porter.stem(word)) >min_lenght or word in non_stop_words or word=='.')]
179
+
180
+ if lemma:
181
+ lem = nltk.stem.wordnet.WordNetLemmatizer()
182
+ c_words = [lem.lemmatize(word) for word in c_words if not lem.lemmatize(word) in freq_words and (len(lem.lemmatize(word)) >min_lenght or word in non_stop_words or word=='.')]
183
+
184
+ return c_words
185
+
186
+
187
+ def text_cleaning(data, steam=False, lemma = True, clean=True, min_lenght=2, remove_punctuation=True,
188
+ freq_words_analysis=False, single_input=False,eightify=True):
189
+
190
+ clean_txt = []
191
+
192
+
193
+ freq_words = ["breast","biopsy","margin","dual","tissue","excision","change","core","identified",
194
+ "mastectomy","site","report","lesion","superior","anterior","inferior","medial",
195
+ "lateral","synoptic","evidence","slide", "brbx"]
196
+
197
+ # position 0 is the bigram output token; positions 1:end are the forms it may take in the text
198
+ fixed_bigrams = [ [' grade_one ', 'grade 1', 'grade i', 'grade I', 'grade one',],
199
+ [' grade_two ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
200
+ [' grade_three ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
201
+
202
+
203
+ if remove_punctuation:
204
+ filters = [lambda x: x.lower(), strip_tags, strip_punctuation]
205
+ else:
206
+ filters = [lambda x: x.lower(), strip_tags]
207
+
208
+ stop_words = set(stopwords.words('english'))
209
+ non_stop_words = ['no', 'than', 'not']
210
+ for x in non_stop_words:
211
+ stop_words.remove(x)
212
+
213
+ if single_input:
214
+ c_words = clean_text(data,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma, clean, min_lenght,eightify=eightify)
215
+ if len(c_words)>0:
216
+ if c_words[0] =='.':
217
+ c_words = c_words[1:]
218
+ clean_txt.append(c_words)
219
+
220
+ else:
221
+ for i in range(data.shape[0]):
222
+ txt_orig = data.iloc[i].lower()
223
+ c_words = clean_text(txt_orig,filters,stop_words,non_stop_words,freq_words,fixed_bigrams,steam, lemma, clean, min_lenght,eightify=eightify)
224
+ if len(c_words)>0:
225
+ if c_words[0] =='.':
226
+ c_words = c_words[1:]
227
+ clean_txt.append(c_words)
228
+
229
+
230
+ if freq_words_analysis:
231
+ flatten_corpus = [j for sub in clean_txt for j in sub]
232
+ clean_txt = []
233
+ unique = list(set(flatten_corpus))
234
+ wordfreq = [flatten_corpus.count(p) for p in unique]
235
+ wordfreq = dict(list(zip(unique,wordfreq)))
236
+
237
+ freqdict = [(wordfreq[key], key) for key in wordfreq]
238
+ freqdict.sort()
239
+ freqdict.reverse()
240
+
241
+ df = pd.DataFrame(freqdict,columns = ['Frequency','Word'])
242
+
243
+
244
+ df.to_excel('../mammo_word_count.xls')
245
+
246
+ return clean_txt
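A typical single-report call, mirroring how the branch classifiers use this function; the report text is invented and the NLTK stopword/WordNet corpora are assumed to be downloaded:

tokens = text_cleaning("Invasive ductal carcinoma, grade 2, no suspicious lymph nodes.",
                       steam=False, lemma=True, single_input=True)[0]
print(tokens)    # lower-cased, lemmatised tokens with fixed bigrams such as 'grade_two' merged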
247
+
248
+ if __name__ == '__main__':
249
+ exit()
250
+
text_cleaning_transforerms.py ADDED
@@ -0,0 +1,229 @@
1
+ import pandas as pd
2
+ from os import walk
3
+ from os import listdir
4
+ from os.path import isfile, join
5
+ import numpy as np
6
+ import re
7
+
8
+ from gensim.parsing import preprocessing
9
+ from gensim.parsing.preprocessing import strip_tags, strip_punctuation
10
+ from nltk.tokenize import word_tokenize, sent_tokenize
11
+ import math
12
+ from tqdm import tqdm
13
+
14
+ def remove_noise_text(txt):
15
+
16
+ txt = txt.lower()
17
+ txt = re.sub("primary site:", ' ', txt)
18
+
19
+ #txt = re.sub('post-surgical changes', ' ', txt.lower())
20
+
21
+ # Remove any mentions to " Findings were discussed with...."
22
+ txt = txt.split("findings were discussed with")[0]
23
+
24
+ # Remove any other occurrence of the PI's information
25
+ txt = txt.split("this study has been reviewed and interpreted")[0]
26
+ txt = txt.split("this finding was communicated to")[0]
27
+ txt = txt.split("important findings were identified")[0]
28
+ txt = txt.split("these findings")[0]
29
+ txt = txt.split("findings above were")[0]
30
+ txt = txt.split("findings regarding")[0]
31
+ txt = txt.split("were discussed")[0]
32
+ txt = txt.split("these images were")[0]
33
+ txt = txt.split("important finding")[0]
34
+
35
+ # remove any section headers
36
+ txt = re.sub("post-surgical changes:", ' ', txt)
37
+ txt = re.sub("post surgical changes:", ' ', txt)
38
+ txt = re.sub("primary site:", ' ', txt)
39
+ txt = re.sub("primary site", ' ', txt)
40
+ txt = re.sub("neck:", ' ', txt)
41
+ txt = re.sub("post-treatment changes:", ' ', txt)
42
+ txt = re.sub("post treatment changes:", ' ', txt)
43
+ txt = re.sub("brain, orbits, spine and lungs:", ' ', txt)
44
+ txt = re.sub("primary :", ' ', txt)
45
+ txt = re.sub("neck:", ' ', txt)
46
+ txt = re.sub("aerodigestive tract:", ' ', txt)
47
+ txt = re.sub("calvarium, skull base, and spine:", ' ', txt)
48
+ txt = re.sub("other:", ' ', txt)
49
+ txt = re.sub("upper neck:", ' ', txt)
50
+ txt = re.sub("perineural disease:", ' ', txt)
51
+ txt = re.sub("technique:", ' ', txt)
52
+ txt = re.sub("comparison:", ' ', txt)
53
+ txt = re.sub("paranasal sinuses:", ' ', txt)
54
+ txt = re.sub("included orbits:", ' ', txt)
55
+ txt = re.sub("nasopharynx:", ' ', txt)
56
+ txt = re.sub("tympanomastoid cavities:", ' ', txt)
57
+ txt = re.sub("skull base and calvarium:", ' ', txt)
58
+ txt = re.sub("included intracranial structures:", ' ', txt)
59
+ txt = re.sub("impression:", ' ', txt)
60
+ txt = re.sub("nodes:", ' ', txt)
61
+ txt = re.sub("mri orbits:", ' ', txt)
62
+ txt = re.sub("mri brain:", ' ', txt)
63
+ txt = re.sub("brain:", ' ', txt)
64
+ txt = re.sub("ct face w/:", ' ', txt)
65
+ txt = re.sub("transspatial extension:", ' ', txt)
66
+ txt = re.sub("thyroid bed:", ' ', txt)
67
+ txt = re.sub("additional findings:", ' ', txt)
68
+ txt = re.sub("series_image", ' ', txt)
69
+ txt = re.sub("series image", ' ', txt)
70
+ txt = re.sub("image series", ' ', txt)
71
+ txt = re.sub("see synoptic report", ' ', txt)
72
+ txt = re.sub("see report", ' ', txt)
73
+
74
+ txt = re.sub("brstwo|brstmarun|brstwln|brlump|lnbx", ' ', txt)
75
+
76
+ txt = re.sub("post_treatment", 'post treatment', txt)
77
+ txt = re.sub("post-treatment", 'post treatment', txt)
78
+
79
+ txt = re.sub("nonmasslike", 'non mass like', txt)
80
+ txt = re.sub("non_mass_like", 'non mass like', txt)
81
+ txt = re.sub("non-mass-like", 'non mass like', txt)
82
+ txt = re.sub("statuspost", 'status post', txt)
83
+
84
+
85
+ # in the worst case, just replace the PI's name with an empty string
86
+ txt = re.sub("dr\\.\\s[^\\s]+", ' ', txt)
87
+
88
+ txt = re.sub(" series | series|series ", "", txt)
89
+ txt = re.sub(" cm | cm|cm ", " centimeters ", txt)
90
+ txt = re.sub(" cc | cc|cc ", " cubic centimeters ", txt)
91
+ txt = re.sub(" ct | ct|ct ", " carat metric ", txt)
92
+ txt = re.sub(" mm | mm|mm ", " millimeters ", txt)
93
+
94
+ txt = re.sub("status_post|o\'", '', txt)
95
+ txt = re.sub("status post|clock|/|'/'", '', txt)
96
+ txt = re.sub("statuspost", '', txt)
97
+ txt = re.sub("brstwo|brlump|brstmarun|brwire|brstcap|", '', txt)
98
+
99
+ txt = re.sub("\\(|\\)", ',', txt)
100
+ txt = re.sub(",,", ',', txt)
101
+ txt = re.sub(",\\.", '.', txt)
102
+ txt = re.sub(", \\.", '.', txt)
103
+
104
+ txt = re.sub(" ,", ', ', txt)
105
+ txt = re.sub("a\\.", ' ', txt[0:5]) + txt[5:]
106
+ txt = re.sub("b\\.", ' ', txt[0:5]) + txt[5:]
107
+ txt = re.sub("c\\.", ' ', txt[0:5]) + txt[5:]
108
+ txt = re.sub("d\\.", ' ', txt[0:5]) + txt[5:]
109
+ txt = re.sub("e\\.", ' ', txt[0:5]) + txt[5:]
110
+ txt = re.sub("f\\.", ' ', txt[0:5]) + txt[5:]
111
+
112
+
113
+ # in the worst case, just replace the PI's name with an empty string
114
+ txt = re.sub("dr\\.\\s[^\\s]+", '', txt)
115
+
116
+ # Removing multiple spaces
117
+ txt = re.sub(r'\s+', ' ', txt)
118
+ txt = re.sub(' +', ' ', txt)
119
+
120
+ txt = txt.rstrip().lstrip()
121
+
122
+ return txt
123
+
124
+
125
+ def add_bigrams(txt, fixed_bigrams):
126
+
127
+ for b in fixed_bigrams:
128
+ sub = ""
129
+ not_first = False
130
+ for x in b[1:]:
131
+ if not_first:
132
+ sub += "|"
133
+ not_first = True
134
+
135
+ sub += str(x) + "|" + str(x) + " " + "|" + " " + str(x) + "|" + " " + str(x)
136
+ txt = re.sub(sub, b[0], txt)
137
+
138
+
139
+ return txt
140
+
141
+ def extra_clean_text(clean_t,fixed_bigrams):
142
+
143
+ txt = add_bigrams(clean_t, fixed_bigrams)
144
+ replaces = [ ["her2|her 2|her two", " hertwo "],
145
+ # ["0", "zero "], ["1", "one "], ["2", "two "], ["3", "three "],["4", "four "],
146
+ # ["5", "five "],["6", "six "] ,["7", "seven "] ,["8", "eight "] ,["9", "nine " ] ,
147
+ ["\\>", " greather "], ["\\<", " less "]]
148
+
149
+ for sub in replaces:
150
+ txt = re.sub(sub[0], sub[1], txt)
151
+
152
+ return txt
153
+
154
+
155
+ def text_cleaning(data,min_lenght=2,extra_clean=True, remove_punctuation=False):
156
+
157
+ # position 0 is the bigram output token; positions 1:end are the forms it may take in the text
158
+ fixed_bigrams = [ [' gradeone ', 'grade 1', 'grade i', 'grade I', 'grade one',],
159
+ [' gradetwo ', 'grade 2', 'grade ii', 'grade II', 'grade two', ],
160
+ [' gradethree ', 'grade 3' , 'grade iii', 'grade III', 'grade three']]
161
+
162
+ clean_txt = []
163
+
164
+ clean_t = remove_noise_text(data)
165
+ if extra_clean:
166
+ clean_t = extra_clean_text(clean_t,fixed_bigrams)
167
+ if remove_punctuation:
168
+ filters = [lambda x: x.lower(), strip_tags, strip_punctuation]
169
+ else:
170
+ filters = [lambda x: x.lower(), strip_tags]
171
+
172
+ clean_t = " ".join(x for x in preprocessing.preprocess_string(clean_t, filters) if len(x) >=min_lenght)
173
+
174
+
175
+ # Removing multiple spaces
176
+ clean_t = re.sub(r'\s+', ' ', clean_t)
177
+
178
+ return clean_t
179
+
180
+ # set only_data = True if there is no need to get scores or if the dataset doesn't have a score
181
+ def pre_process(data,min_lenght=2,max_size=64, extra_clean=True, remove_punctuation=False):
182
+
183
+
184
+ data_pre_processed = text_cleaning(data,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
185
+
186
+ """
187
+ Partition the data into max_size-word chunks
188
+ """
189
+ sentences = sent_tokenize(data)
190
+ data_pre_processed_chunks,sample = [],""
191
+
192
+ # The report could be split into sentences
193
+ if len(sentences)>1:
194
+ for index,sentence in enumerate(sentences):
195
+ if len(sentence.split()) + len(sample.split()) <= max_size:
196
+ sample += sentence
197
+ else:
198
+ data_pre_processed_chunks.append(text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation))
199
+ sample = sentence if index < len(sentences)-1 else ""
200
+
201
+ if len(sample) ==0:
202
+ clean_data = text_cleaning(sentences[-1],min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
203
+ else:
204
+ clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
205
+
206
+ #if len(clean_data.split()) >3:
207
+ data_pre_processed_chunks.append(clean_data)
208
+
209
+ # Otherwise, split into chunks of at most max_size words
210
+ else:
211
+ words = word_tokenize(data)
212
+ lower_b, upper_b = 0, max_size
213
+ for x in range(math.ceil(len(words)/max_size)):
214
+ sample = " ".join(x for x in words[lower_b:upper_b])
215
+ lower_b, upper_b = upper_b, upper_b+max_size
216
+ clean_data = text_cleaning(sample,min_lenght=min_lenght,extra_clean=extra_clean, remove_punctuation=remove_punctuation)
217
+ #if len(clean_data.split()) >3:
218
+ data_pre_processed_chunks.append(clean_data)
219
+
220
+ # return the pre-processed whole text and its chunks
221
+ return data_pre_processed,data_pre_processed_chunks
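The chunking above packs whole sentences into windows of at most max_size words; a stripped-down sketch of the same idea (NLTK's sentence tokenizer is assumed available, and the sample text is invented):

from nltk.tokenize import sent_tokenize

def chunk_by_sentences(text, max_size=64):
    chunks, current = [], ""
    for sentence in sent_tokenize(text):
        if len(sentence.split()) + len(current.split()) <= max_size:
            current += " " + sentence
        else:
            chunks.append(current.strip())
            current = sentence
    if current.strip():
        chunks.append(current.strip())
    return chunks

print(chunk_by_sentences("First finding here. Second finding here. Third finding here.", max_size=6))
# -> ['First finding here. Second finding here.', 'Third finding here.']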
222
+
223
+ if __name__ == '__main__':
224
+ exit(1)
225
+
226
+
227
+
228
+
229
+