Spaces:

adaszymorek
/

StrokePredictionRF

Runtime error

App Files Files Community

StrokePredictionRF / app.py

adaszymorek

Update app.py

1cb0989 verified 9 months ago

raw

history blame contribute delete

No virus

11.1 kB

	import gradio as gr
	import pandas as pd
	import numpy as np
	from joblib import dump, load
	import os
	import matplotlib as mpl
	import matplotlib.pyplot as plt
	import seaborn as sns
	from io import StringIO
	from sklearn.preprocessing import OneHotEncoder
	from sklearn.tree import DecisionTreeClassifier
	from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, RocCurveDisplay
	from sklearn.metrics import roc_curve,ConfusionMatrixDisplay, classification_report
	from sklearn.metrics import roc_auc_score, precision_score, recall_score
	from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve

	# Load the serialized classifier used by both prediction paths and by the
	# feature-importance plot.
	# NOTE(review): joblib.load unpickles the file, so the artifact must come
	# from a trusted source.
	clf = load("RandomForestClassifier()20.joblib")
	# Display name of the model: appears in the plot title and selects the
	# column read from the stored metric tables.
	modelname = "Random Forest"


	def encode(data, employement):
	    """One-hot encode the ``work_type`` of a single-patient frame in place.

	    The UI labels are first normalised to the column names used by the
	    model ('Goverment job' -> 'Govt_job', 'Never worked' -> 'Never_worked').
	    Then one 0/1 column per job category is added, based on the value in
	    the first row.

	    Args:
	        data: DataFrame with a ``work_type`` column; modified in place.
	        employement: Unused; kept so existing callers keep working.

	    Returns:
	        The same ``data`` object with the four job columns added.
	    """
	    data['work_type'] = data['work_type'].replace({
	        'Goverment job': 'Govt_job',
	        "Never worked": "Never_worked",
	    })
	    # Positional access: the frame's index is not guaranteed to contain the
	    # label 0, and a label lookup would raise KeyError in that case.
	    selected_job = data['work_type'].iloc[0]
	    for job in ('Govt_job', 'Never_worked', 'Private', 'Self-employed'):
	        data[job] = 1 if selected_job == job else 0
	    return data
	def none_argument(y):
	    """Return -999 when *y* is a list, otherwise return *y* unchanged.

	    -999 is the numeric code this file uses for an unknown value.
	    NOTE(review): presumably the UI hands over a list for an unanswered
	    field -- confirm against the Gradio version in use.
	    """
	    # isinstance also covers list subclasses, unlike an exact type comparison.
	    return -999 if isinstance(y, list) else y
	def replace_with_numeric_one_patient(data):
	    """Translate the form answers of a single patient into numeric codes.

	    Each listed column is rewritten in place. Columns flagged below are
	    first passed through ``none_argument``; then the text labels are
	    replaced by their numeric codes. Values without a mapping entry are
	    left as they are.

	    Args:
	        data: DataFrame holding the raw form values; modified in place.

	    Returns:
	        The same ``data`` object.
	    """
	    # (column, run none_argument first?, label -> code)
	    recoding_steps = (
	        ('ever_married', True, {'Yes': 1, 'No': 0}),
	        ('residence_type', False, {'Urban': 1, 'Rural': 0, '': -999}),
	        ('smoking_status', True, {'Never smoked': 0,
	                                  'Formerly smoked': 1,
	                                  'Smokes': 2}),
	        ('gender', False, {'Male': -1, 'Female': 1, 'Other': 1, '': -999}),
	        ('avg_glucose_level', True, {"Normal (<100 mg/dL)": 0,
	                                     "Prediabetes (<100, 125> mg/dL)": 1,
	                                     "Diabetes (>125 mg/dL)": 2}),
	        ('bmi', True, {"Underweight (<18.4)": 0,
	                       "Normal (<18.5, 24.9>)": 1,
	                       "Overweight (<25, 29.9>)": 2,
	                       "Obese (>29.9)": 3}),
	    )
	    for column, needs_none_check, codes in recoding_steps:
	        if needs_none_check:
	            data[column] = data[column].apply(none_argument)
	        data[column] = data[column].replace(codes)
	    return data
	def change_dtype(data):
	    """Cast ``age`` to int32 and the job indicator columns to sparse int32.

	    ``Series.astype`` returns a new Series, so each result is assigned back
	    to its column; without the assignment the conversions have no effect.

	    Args:
	        data: DataFrame with ``age`` and the four job indicator columns;
	            modified in place.

	    Returns:
	        The same ``data`` object with the converted columns.
	    """
	    data['age'] = data['age'].astype('int32')
	    for job_column in ('Govt_job', 'Never_worked', 'Private', 'Self-employed'):
	        data[job_column] = data[job_column].astype(pd.SparseDtype('int32', 0))
	    # Prints the resulting column dtypes to stdout.
	    data.info()
	    return data

	def predict_stroke_from_one_patient(
	        gender, age, hypertension, heartDisease,
	        everMarried, residenceType, averageGlucoseLevel,
	        bmi, smokingStatus, employementType):
	    """Predict stroke for one patient described by the form values.

	    Builds a one-row DataFrame from the arguments, one-hot encodes the
	    work type, converts the remaining labels to numeric codes and asks the
	    loaded classifier for a prediction.

	    Args:
	        gender, residenceType, averageGlucoseLevel, bmi, smokingStatus,
	        employementType: Label values as offered by the form.
	        age: Age in years.
	        hypertension, heartDisease, everMarried: Checkbox values.

	    Returns:
	        'stroke' when the classifier predicts class 1, else 'no stroke'.
	    """
	    # A list value for bmi is treated as "not answered".
	    bmi_missing = isinstance(bmi, list)
	    # NOTE(review): the key order defines the column order handed to the
	    # classifier -- presumably it has to match the training data; confirm.
	    d = {
	        'gender': [gender],
	        'age': [age],
	        'hypertension': [hypertension],
	        'heart_disease': [heartDisease],
	        'ever_married': [everMarried],
	        'residence_type': [residenceType],
	        'avg_glucose_level': [averageGlucoseLevel],
	        'bmi': [bmi],
	        'smoking_status': [smokingStatus],
	        'bmi_was_missing': [bmi_missing],
	        'work_type': [employementType],
	    }
	    data = pd.DataFrame(data=d)
	    # encode() adds the job indicator columns to `data` in place.
	    encode(data, employementType)
	    data = data.drop("work_type", axis=1)
	    data = replace_with_numeric_one_patient(data)
	    y_predicted = clf.predict(data)
	    # Exactly one row was submitted, so read that row's label explicitly.
	    return 'stroke' if y_predicted[0] == 1 else 'no stroke'

	# Single-patient form. The input components are listed in the same order
	# as the parameters of predict_stroke_from_one_patient.
	demo2 = gr.Interface(
	    fn=predict_stroke_from_one_patient,
	    inputs=[
	        gr.Radio(["Male", "Female", "Other"]),
	        gr.Slider(40, 90, value=40, step=1),
	        gr.Checkbox(label="Hypertension"),
	        gr.Checkbox(label="Heart Disease"),
	        gr.Checkbox(label="Is/Was Married?"),
	        gr.Radio(["Urban", "Rural"]),
	        gr.Dropdown([
	            "Normal (<100 mg/dL)",
	            "Prediabetes (<100, 125> mg/dL)",
	            "Diabetes (>125 mg/dL)",
	        ]),
	        gr.Dropdown([
	            "Underweight (<18.4)",
	            "Normal (<18.5, 24.9>)",
	            "Overweight (<25, 29.9>)",
	            "Obese (>29.9)",
	        ]),
	        gr.Dropdown(["Never smoked", "Formerly smoked", "Smokes"]),
	        gr.Dropdown([
	            "Goverment job",
	            "Never worked",
	            "Private",
	            "Self-employed",
	        ]),
	    ],
	    outputs="label",
	)

	def bmi(col):
	    """Map a numeric BMI value to its category code.

	    Returns:
	        0 underweight (< 18.5), 1 normal (< 25), 2 overweight (< 30),
	        3 obese (everything else, including NaN).
	    """
	    # Contiguous half-open bins: values that fall between the one-decimal
	    # boundaries (e.g. 18.45 or 24.95) must not drop through to "obese".
	    if col < 18.5:  # underweight
	        return 0
	    if col < 25.0:  # normal
	        return 1
	    if col < 30.0:  # overweight (pre-obese)
	        return 2
	    return 3  # obese
	def glucose(col):
	    """Map an average glucose level to its category code.

	    Returns:
	        0 for values below 100 (normal), 1 for 100 to 125 inclusive
	        (prediabetes), 2 for everything else (diabetes).
	    """
	    if col < 100:
	        return 0
	    if col <= 125:
	        return 1
	    return 2
	def smoking_status(col):
	    """Map a lower-case smoking label to its numeric code.

	    Returns:
	        0, 1 or 2 for 'never smoked', 'formerly smoked' and 'smokes';
	        -999 for any other value.
	    """
	    known_labels = (
	        ('never smoked', 0),
	        ('formerly smoked', 1),
	        ('smokes', 2),
	    )
	    for label, code in known_labels:
	        if col == label:
	            return code
	    return -999

	def fill_with_median(data):
	    """Fill missing ``bmi`` values with the column median and flag them.

	    Args:
	        data: DataFrame containing a ``bmi`` column.

	    Returns:
	        A new DataFrame in which missing ``bmi`` entries hold the median of
	        that column and a boolean ``bmi_was_missing`` column is appended.
	    """
	    bmi_frame = data[["bmi"]]
	    # Remember which rows lacked a value before anything is filled in.
	    missing_flag = bmi_frame.isnull()
	    missing_flag.columns = ['bmi_was_missing']
	    # The median Series is keyed by 'bmi', so fillna only touches that column.
	    median_per_column = bmi_frame.median()
	    return data.fillna(median_per_column).join(missing_flag)
	def encode_1H(data):
	    """Replace ``work_type`` by one 0/1 indicator column per category.

	    The indicator columns are always the same five, in the same order,
	    whatever categories happen to occur in ``data``. Fitting an encoder on
	    the uploaded rows and then naming five columns fails as soon as one
	    category is absent from the file.

	    Args:
	        data: DataFrame containing a ``work_type`` column; not modified.

	    Returns:
	        A new DataFrame without ``work_type`` and with the indicator
	        columns appended. Rows whose work type is missing or not one of
	        the five categories get 0 in every indicator column.
	    """
	    categories = ('Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children')
	    work_type = data["work_type"]
	    encoded = data.drop("work_type", axis=1)
	    for category in categories:
	        # The comparison is aligned on data's own index, so any index works.
	        encoded[category] = (work_type == category).astype(int)
	    return encoded
	def replace_with_numeric(data):
	    """Convert the columns of an uploaded patient table to numeric codes.

	    ``age`` is cast to int, the yes/no, residence and gender labels are
	    replaced by numbers, and smoking status, glucose level and BMI are
	    binned with the helper functions of the same names.

	    Args:
	        data: DataFrame with the raw patient columns; modified in place.

	    Returns:
	        The same ``data`` object.
	    """
	    married_codes = {'Yes': 1, 'No': 0}
	    residence_codes = {'Urban': 1, 'Rural': 0}
	    gender_codes = {'Male': -1, 'Female': 1, 'Other': 1}

	    data['age'] = data['age'].astype(int)
	    data['ever_married'] = data['ever_married'].replace(married_codes)
	    data['residence_type'] = data['residence_type'].replace(residence_codes)
	    data['smoking_status'] = data['smoking_status'].apply(smoking_status)
	    data['gender'] = data['gender'].replace(gender_codes)
	    data['avg_glucose_level'] = data['avg_glucose_level'].apply(glucose)
	    data['bmi'] = data['bmi'].apply(bmi)
	    return data

	def rf_feat_importance(df):
	    """Pair the columns of *df* with the classifier's feature importances.

	    Returns:
	        DataFrame with 'Feature' and 'Importance' columns, sorted by
	        importance from highest to lowest.
	    """
	    ranking = pd.DataFrame({
	        'Feature': df.columns,
	        'Importance': clf.feature_importances_,
	    })
	    return ranking.sort_values('Importance', ascending=False)
	def plot_importance(df):
	    """Draw a horizontal bar chart of the classifier's feature importances.

	    Args:
	        df: DataFrame whose columns are the features given to the model.

	    Returns:
	        The matplotlib Figure containing the chart.
	    """
	    import matplotlib.lines as lines

	    # Use the argument: `model_data` is a local variable of the CSV
	    # prediction function and does not exist in this scope.
	    fi = rf_feat_importance(df)

	    fig, ax = plt.subplots(1, 1, figsize=(10, 8))

	    sns.barplot(data=fi, x='Importance', y='Feature', ax=ax)
	    for s in ['top', 'left', 'right']:
	        ax.spines[s].set_visible(False)

	    fig.text(0.12, 0.92, "Feature Importance: " + modelname + " Stroke Prediction",
	             fontsize=18, fontweight='bold', fontfamily='serif')

	    # Blank axis labels replace the default column-name labels.
	    plt.xlabel(" ", fontsize=12, fontweight='light', fontfamily='serif', loc='left', y=-1.5)
	    plt.ylabel(" ", fontsize=12, fontweight='light', fontfamily='serif')

	    # Thin vertical rule near the right edge of the figure.
	    l1 = lines.Line2D([0.98, 0.98], [0, 1], transform=fig.transFigure, figure=fig,
	                      color='black', lw=0.2)
	    fig.lines.append(l1)
	    return fig

	def predict_stroke_from_csv(file):
	    """Predict stroke for every patient in an uploaded CSV file.

	    Args:
	        file: One of: an object with a ``name`` attribute holding the path
	            of the uploaded file, a string path to an existing file, or a
	            string containing the CSV text itself.

	    Returns:
	        A tuple ``(prob, plot)``. ``prob`` is a DataFrame with the 'ID' and
	        'Probability of stroke' of each patient predicted as class 1;
	        ``plot`` is the feature-importance figure.

	    Raises:
	        ValueError: If no file was provided.
	    """
	    if file is None:
	        raise ValueError("No CSV file was provided")
	    if isinstance(file, str):
	        # A string is either a path on disk or the CSV content itself.
	        source = file if os.path.isfile(file) else StringIO(file)
	    else:
	        source = file.name
	    data = pd.read_csv(source)
	    data.columns = data.columns.str.lower()

	    if data.isna().any().any():
	        print('Missing values detected. Filling with median of feature values')
	    # Always applied: besides filling gaps it adds the 'bmi_was_missing'
	    # column, which the single-patient path also always passes to the
	    # same classifier.
	    data = fill_with_median(data)
	    data = encode_1H(data)

	    if (data['age'] < 40).any():
	        print("Patients younger than 40 years old detected. "
	              "Diagnose of younger than 40 years old can be false")
	    data = data.drop(['children'], axis=1)
	    data = replace_with_numeric(data)
	    # The outcome column is not an input feature; new patient files may
	    # not contain it at all.
	    data = data.drop(['stroke'], axis=1, errors='ignore')
	    model_data = data.drop(['id'], axis=1)

	    y_predicted = clf.predict(model_data)
	    y_predicted_proba = clf.predict_proba(model_data)

	    # Positional mask over the rows predicted as class 1; independent of
	    # the frame's index labels.
	    at_risk = np.asarray(y_predicted) == 1
	    prob = pd.DataFrame({
	        'ID': data.loc[at_risk, 'id'].tolist(),
	        'Probability of stroke': y_predicted_proba[at_risk, 1].round(2).tolist(),
	    })
	    plot = plot_importance(df=model_data)
	    return prob, plot

	# CSV tab: upload a file, press the button, get the table of at-risk
	# patients and the feature-importance plot.
	with gr.Blocks() as demo3:
	    gr.Markdown(
	        """
	        # Predict stroke for multiple patients
	        Upload a CSV file with data about your patients to get IDs of patients at risk of stroke with probability > 50%
	        """)
	    filename = gr.File(file_types=['.csv'])
	    button = gr.Button("Diagnose")
	    plot = gr.Plot(label="Plot")
	    # Headers match the two columns returned by predict_stroke_from_csv.
	    outputs = gr.Dataframe(row_count=(1, "dynamic"),
	                           col_count=(2, "dynamic"), label="Predictions",
	                           headers=["ID", "Probability of stroke"])
	    button.click(fn=predict_stroke_from_csv, inputs=filename, outputs=[outputs, plot])

	# Stored evaluation results. From each loaded table the column named after
	# `modelname` is selected.
	# NOTE(review): the `.loc[:, modelname]` access implies these are pandas
	# DataFrames with one column per model -- confirm against the artifacts.
	recallScore = load("recall.joblib")
	recallDT = recallScore.loc[:, modelname]
	precisionScore = load("precision.joblib")
	precisionDT = precisionScore.loc[:, modelname]
	accuracy = load("accuracy.joblib")
	accuracyDT = accuracy.loc[:, modelname]
	# Combine the three metrics into one table; reset_index turns the shared
	# index into a regular column so it is part of the displayed data.
	score = {'Recall' : recallDT, 'Precision' : precisionDT, 'Accuracy' : accuracyDT}
	df = pd.DataFrame(score)
	df.reset_index(inplace=True)

	# Stored objects that are handed to gr.Plot in the overview tab.
	# NOTE(review): presumably pre-rendered figures -- confirm.
	learning = load("learning.joblib")
	importance = load("importance.joblib")
	matrixes = load("matrixes.joblib")
	recCurve = load("recCurve.joblib")
	roc = load("roc.joblib")

	# Model overview tab: shows the stored plots and the metrics table.
	# NOTE(review): the original nesting of the Row/Column blocks is not
	# recoverable from this flattened text -- restore the indentation from the
	# real file rather than guessing.
	with gr.Blocks() as demo4:
	# NOTE(review): the trailing comma after the Markdown call turns the
	# statement into a one-element tuple; it has no effect.
	gr.Markdown(
	"""
	# Random Forest
	"""),
	with gr.Row():
	with gr.Column():
	plot4 = gr.Plot(recCurve, show_label = False)
	with gr.Column():
	plot1 = gr.Plot(importance, show_label = False)
	with gr.Row():
	plot2 = gr.Plot(learning, show_label = False)
	# NOTE(review): `plot4` is bound a second time here; the earlier component
	# is still created, only the name is rebound.
	plot4 = gr.Plot(roc, show_label = False)
	with gr.Row():
	plot3 = gr.Plot(matrixes, show_label = False)
	# `df` is the metrics table assembled from the stored recall, precision and
	# accuracy values.
	scores = gr.Dataframe(df, label="Metrics scores")
	# Top-level app: one tab per sub-interface, rendered in this order, then
	# the server is started.
	with gr.Blocks() as demo:
	    for _tab_title, _tab_app in (
	        ("Model Overview", demo4),
	        ("Predict Stroke", demo2),
	        ("Predict Stroke CSV", demo3),
	    ):
	        with gr.Tab(_tab_title):
	            _tab_app.render()
	demo.launch()