Spaces:
Runtime error
Runtime error
import gradio as gr | |
import pandas as pd | |
import numpy as np | |
from joblib import dump, load | |
import os | |
import matplotlib as mpl | |
import matplotlib.pyplot as plt | |
import seaborn as sns | |
from io import StringIO | |
from sklearn.preprocessing import OneHotEncoder | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, RocCurveDisplay | |
from sklearn.metrics import roc_curve,ConfusionMatrixDisplay, classification_report | |
from sklearn.metrics import roc_auc_score, precision_score, recall_score | |
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve | |
# Display name of the model; it is also the column label used further down to
# pick this model's scores out of the saved metric tables.
modelname = "Random Forest"

# Fitted classifier shared by the single-patient and CSV prediction paths.
# The path is relative, so the file must sit in the working directory.
clf = load("RandomForestClassifier()20.joblib")
def encode(data, employement):
    """One-hot encode ``work_type`` of a single-patient frame, in place.

    The human-readable form labels are first normalised to the column names
    used for the indicator columns ('Goverment job' -> 'Govt_job',
    'Never worked' -> 'Never_worked'); then one 0/1 column is added for each
    of 'Govt_job', 'Never_worked', 'Private' and 'Self-employed'.

    Args:
        data: DataFrame with a ``work_type`` column. Only the first row is
            inspected, and its result is broadcast to every row.
        employement: Unused; kept so existing callers keep working.

    Returns:
        The same ``data`` object, mutated.
    """
    # The misspelled key 'Goverment job' is deliberate: it has to match the
    # label offered by the form's dropdown.
    data['work_type'] = data['work_type'].replace({'Goverment job': 'Govt_job',
                                                   'Never worked': 'Never_worked'})
    # Positional access: label-based [0] raises KeyError whenever the frame's
    # index does not contain the label 0.
    selected_job = data['work_type'].iloc[0]
    for job in ('Govt_job', 'Never_worked', 'Private', 'Self-employed'):
        data[job] = 1 if selected_job == job else 0
    return data
def none_argument(y):
    """Return the sentinel -999 when *y* is a list, otherwise *y* unchanged.

    NOTE(review): presumably a list is what the form delivers for a dropdown
    that was left empty -- confirm against the Gradio version in use.
    """
    return -999 if isinstance(y, list) else y
def _missing_to_sentinel(value):
    """Map "nothing selected" form values (list, None, NaN, '') to -999."""
    if value is None or isinstance(value, list):
        return -999
    if isinstance(value, float) and value != value:  # NaN is the only float != itself
        return -999
    if isinstance(value, str) and value == '':
        return -999
    return value


def replace_with_numeric_one_patient(data):
    """Translate the form's categorical answers into numeric codes, in place.

    Every categorical column goes through the same two steps: missing answers
    become the sentinel -999, then known labels are replaced by their code.
    Values that are neither missing nor a known label are left untouched
    (e.g. the boolean delivered by a checkbox for ``ever_married``).

    Previously only some columns were checked for a missing answer, and only
    for the list form, so a ``None`` (unselected choice) passed through
    unchanged.

    Args:
        data: DataFrame with the columns ``ever_married``, ``residence_type``,
            ``smoking_status``, ``gender``, ``avg_glucose_level`` and ``bmi``.

    Returns:
        The same ``data`` object, mutated.
    """
    codes_by_column = {
        'ever_married': {'Yes': 1, 'No': 0},
        'residence_type': {'Urban': 1, 'Rural': 0},
        'smoking_status': {'Never smoked': 0, 'Formerly smoked': 1, 'Smokes': 2},
        # 'Other' shares the code of 'Female', as in the original mapping.
        'gender': {'Male': -1, 'Female': 1, 'Other': 1},
        'avg_glucose_level': {"Normal (<100 mg/dL)": 0,
                              "Prediabetes (<100, 125> mg/dL)": 1,
                              "Diabetes (>125 mg/dL)": 2},
        'bmi': {"Underweight (<18.4)": 0, "Normal (<18.5, 24.9>)": 1,
                "Overweight (<25, 29.9>)": 2, "Obese (>29.9)": 3},
    }
    for column, codes in codes_by_column.items():
        data[column] = data[column].apply(_missing_to_sentinel).replace(codes)
    return data
def change_dtype(data):
    """Cast ``age`` to int32 and the work-type indicator columns to sparse int32.

    ``Series.astype`` returns a new Series, so each result is assigned back;
    without the assignment the conversions are silently thrown away.

    Args:
        data: DataFrame with the columns ``age``, ``Govt_job``,
            ``Never_worked``, ``Private`` and ``Self-employed``.

    Returns:
        The same ``data`` object, mutated. A summary of the frame is printed
        via ``DataFrame.info()`` as a side effect.
    """
    data['age'] = data['age'].astype('int32')
    sparse_int = pd.SparseDtype('int32', 0)
    for column in ('Govt_job', 'Never_worked', 'Private', 'Self-employed'):
        data[column] = data[column].astype(sparse_int)
    data.info()
    return data
def predict_stroke_from_one_patient(
        gender, age, hypertension, heartDisease,
        everMarried, residenceType, averageGlucoseLevel,
        bmi, smokingStatus, employementType):
    """Classify one patient, described by the form fields, as stroke / no stroke.

    The answers are assembled into a one-row DataFrame, the work type is
    one-hot encoded, the remaining categorical answers are turned into numeric
    codes, and the row is passed to the module-level classifier ``clf``.

    Args:
        gender, residenceType, averageGlucoseLevel, bmi, smokingStatus,
        employementType: Categorical answers (the labels offered by the form).
        age: Patient age.
        hypertension, heartDisease, everMarried: Checkbox answers.

    Returns:
        ``'stroke'`` when the classifier predicts class 1, else ``'no stroke'``.
    """
    # Column order matters: after dropping ``work_type`` the frame is handed to
    # the classifier as-is, so keep this order in sync with the model's features.
    data = pd.DataFrame(data={
        'gender': [gender], 'age': [age], 'hypertension': [hypertension],
        'heart_disease': [heartDisease], 'ever_married': [everMarried],
        'residence_type': [residenceType], 'avg_glucose_level': [averageGlucoseLevel],
        'bmi': [bmi], 'smoking_status': [smokingStatus],
        # A list-valued answer is how an empty BMI selection is recognised.
        'bmi_was_missing': isinstance(bmi, list),
        'work_type': [employementType]})
    encode(data, employementType)  # adds the indicator columns in place
    data = data.drop("work_type", axis=1)
    data = replace_with_numeric_one_patient(data)
    y_predicted = clf.predict(data)
    # One input row, so the prediction has exactly one entry.
    return 'stroke' if y_predicted[0] == 1 else 'no stroke'
# Single-patient form. The inputs are listed in the same order as the
# positional parameters of predict_stroke_from_one_patient:
# gender, age, hypertension, heart disease, ever married, residence type,
# average glucose level, BMI, smoking status, employment type.
demo2 = gr.Interface(
    fn=predict_stroke_from_one_patient,
    inputs=[
        gr.Radio(["Male", "Female", "Other"]),
        gr.Slider(40, 90, value=40, step=1),
        gr.Checkbox(label="Hypertension"),
        gr.Checkbox(label="Heart Disease"),
        gr.Checkbox(label="Is/Was Married?"),
        gr.Radio(["Urban", "Rural"]),
        gr.Dropdown(["Normal (<100 mg/dL)",
                     "Prediabetes (<100, 125> mg/dL)",
                     "Diabetes (>125 mg/dL)"]),
        gr.Dropdown(["Underweight (<18.4)",
                     "Normal (<18.5, 24.9>)",
                     "Overweight (<25, 29.9>)",
                     "Obese (>29.9)"]),
        gr.Dropdown(["Never smoked", "Formerly smoked", "Smokes"]),
        gr.Dropdown(["Goverment job", "Never worked", "Private",
                     "Self-employed"]),
    ],
    outputs="label",
)
def bmi(col):
    """Bucket a numeric BMI value into a category code.

    Returns:
        0 underweight (< 18.5), 1 normal (18.5 to < 25),
        2 overweight (25 to < 30), 3 obese (>= 30).

    The bands are contiguous. The earlier closed ranges (<= 18.4, 18.5-24.9,
    25.0-29.9) left gaps, so a value such as 18.45 or 24.95 fell through to
    the final ``else`` and was coded as obese. Values with one decimal place
    are classified exactly as before.

    Note: NaN fails every comparison and therefore yields 3; fill missing
    values before calling this.
    """
    if col < 18.5:
        return 0
    if col < 25.0:
        return 1
    if col < 30.0:
        return 2
    return 3
def glucose(col):
    """Bucket an average glucose level into a category code.

    Returns:
        0 for normal (< 100), 1 for prediabetes (100 to 125 inclusive),
        2 for diabetes (anything else, i.e. > 125).
    """
    if col < 100:
        return 0  # normal
    if col <= 125:
        return 1  # prediabetes
    return 2  # diabetes
def smoking_status(col):
    """Translate a lowercase smoking-status label into its numeric code.

    Returns:
        0 for 'never smoked', 1 for 'formerly smoked', 2 for 'smokes',
        and the sentinel -999 for any other value.
    """
    known_codes = (
        ('never smoked', 0),
        ('formerly smoked', 1),
        ('smokes', 2),
    )
    # Compare with == rather than a dict lookup so unhashable inputs behave
    # the same as in a plain if/elif chain.
    for label, code in known_codes:
        if col == label:
            return code
    return -999
def fill_with_median(data):
    """Fill missing BMI values with the column median and flag the filled rows.

    Args:
        data: DataFrame with a ``bmi`` column.

    Returns:
        A new DataFrame in which missing ``bmi`` values are replaced by the
        median of ``bmi``, with an extra boolean column ``bmi_was_missing``
        appended (True where ``bmi`` was originally missing). Because the
        fill value is a Series indexed by 'bmi', other columns are not filled.
    """
    bmi_frame = data[["bmi"]]
    # Record which rows lacked a BMI before anything is filled in.
    was_missing = bmi_frame.isnull()
    was_missing.columns = ['bmi_was_missing']
    filled = data.fillna(bmi_frame.median())
    return filled.join(was_missing)
def encode_1H(data):
    """Replace ``work_type`` with five fixed 0/1 indicator columns.

    The indicator columns are always 'Govt_job', 'Never_worked', 'Private',
    'Self-employed' and 'children', in that order, regardless of which work
    types actually occur in ``data``. Fitting an encoder on the uploaded data
    instead yields one column per *observed* category, which breaks the fixed
    five-name rename whenever a file lacks one of the categories.

    Args:
        data: DataFrame with a ``work_type`` column holding only the five
            category names above.

    Returns:
        A new DataFrame without ``work_type`` and with the five integer
        indicator columns appended. ``data`` itself is not modified.

    Raises:
        ValueError: If ``work_type`` contains a missing or unrecognised value.
    """
    categories = ['Govt_job', 'Never_worked', 'Private', 'Self-employed', 'children']
    work_type = data["work_type"]
    unexpected = work_type[~work_type.isin(categories)]
    if not unexpected.empty:
        raise ValueError(
            "Unexpected work_type values: "
            + ", ".join(sorted({str(value) for value in unexpected}))
        )
    encoded = data.drop("work_type", axis=1)
    for category in categories:
        # Built from the frame's own column, so rows line up on any index.
        encoded[category] = (work_type == category).astype(int)
    return encoded
def replace_with_numeric(data):
    """Convert the raw CSV columns into the numeric codes the model consumes.

    ``age`` is cast to int, the yes/no, residence and gender labels are
    replaced by fixed codes, and smoking status, glucose level and BMI are
    bucketed by the module-level helpers of the same purpose.

    Args:
        data: DataFrame with the columns ``age``, ``ever_married``,
            ``residence_type``, ``gender``, ``smoking_status``,
            ``avg_glucose_level`` and ``bmi``.

    Returns:
        The same ``data`` object, mutated.
    """
    data['age'] = data['age'].astype(int)

    # Plain label -> code substitutions; unknown labels are left as they are.
    label_codes = (
        ('ever_married', {'Yes': 1, 'No': 0}),
        ('residence_type', {'Urban': 1, 'Rural': 0}),
        ('gender', {'Male': -1, 'Female': 1, 'Other': 1}),
    )
    for column, codes in label_codes:
        data[column] = data[column].replace(codes)

    # Columns that need a per-value bucketing function.
    bucketers = (
        ('smoking_status', smoking_status),
        ('avg_glucose_level', glucose),
        ('bmi', bmi),
    )
    for column, to_code in bucketers:
        data[column] = data[column].apply(to_code)
    return data
def rf_feat_importance(df):
    """Pair each column of *df* with the classifier's feature importance.

    Args:
        df: DataFrame whose columns are the model features, in model order.

    Returns:
        DataFrame with the columns 'Feature' and 'Importance', sorted by
        importance from highest to lowest.
    """
    importance_table = pd.DataFrame({
        'Feature': df.columns,
        'Importance': clf.feature_importances_,
    })
    return importance_table.sort_values('Importance', ascending=False)
def plot_importance(df):
    """Draw a horizontal bar chart of the classifier's feature importances.

    Args:
        df: DataFrame whose columns are the model features; only the column
            names are used, to label the bars.

    Returns:
        The matplotlib Figure containing the chart.
    """
    import matplotlib.lines as lines

    # Use the frame passed in. The name ``model_data`` only exists as a local
    # of the CSV prediction function, so reading it here raises NameError.
    fi = rf_feat_importance(df)
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    sns.barplot(data=fi, x='Importance', y='Feature', ax=ax)
    for s in ['top', 'left', 'right']:
        ax.spines[s].set_visible(False)
    fig.text(0.12, 0.92, "Feature Importance: " + modelname + " Stroke Prediction",
             fontsize=18, fontweight='bold', fontfamily='serif')
    # Blank axis labels, set on this figure's own axes rather than through
    # pyplot's "current axes" state.
    ax.set_xlabel(" ", fontsize=12, fontweight='light', fontfamily='serif', loc='left', y=-1.5)
    ax.set_ylabel(" ", fontsize=12, fontweight='light', fontfamily='serif')
    # Thin vertical rule near the right edge of the figure.
    l1 = lines.Line2D([0.98, 0.98], [0, 1], transform=fig.transFigure, figure=fig,
                      color='black', lw=0.2)
    fig.lines.extend([l1])
    return fig
def predict_stroke_from_csv(file):
    """Predict stroke risk for every patient in an uploaded CSV file.

    Args:
        file: Either the CSV content as a string, or an object whose ``name``
            attribute is the path of the CSV file. Column names are
            lower-cased after loading. The file must provide ``id`` plus the
            columns read by the preprocessing helpers (``age``, ``bmi``,
            ``work_type``, ``ever_married``, ``residence_type``, ``gender``,
            ``smoking_status``, ``avg_glucose_level``). A ``stroke`` label
            column is dropped when present and is not required.
            NOTE(review): the model presumably also needs ``hypertension``
            and ``heart_disease`` -- confirm against the training features.

    Returns:
        A tuple ``(prob, plot)``: ``prob`` is a DataFrame with the columns
        'ID' and 'Probability of stroke' for the patients predicted as class
        1; ``plot`` is the feature-importance figure.
    """
    if isinstance(file, str):
        data = pd.read_csv(StringIO(file))
    else:
        data = pd.read_csv(file.name)
    print(data)
    data.columns = data.columns.str.lower()

    if data.isna().any().any():
        print('Missing values detected. Filling with median of feature values')
    # Always run the fill step: besides filling BMI it adds the
    # ``bmi_was_missing`` column, which the single-patient path always feeds
    # to the same classifier. Running it only when NaNs are present gives the
    # model a different set of features for complete files.
    data = fill_with_median(data)
    data = encode_1H(data)

    # Compare directly; ``where(...).any()`` depends on the truthiness of the
    # matching ages and therefore misses an age of 0.
    if (data['age'] < 40).any():
        print("Patients younger than 40 years old detected. " +
              "Diagnose of younger than 40 years old can be false")

    data = data.drop(['children'], axis=1)
    data = replace_with_numeric(data)
    # The label column is not needed for prediction; tolerate its absence.
    data = data.drop(['stroke'], axis=1, errors='ignore')
    model_data = data.drop(['id'], axis=1)

    y_predicted = clf.predict(model_data)
    y_predicted_proba = clf.predict_proba(model_data)

    # Walk ids, labels and probabilities together by position, so the result
    # does not depend on the frame's index labels.
    predictions = []
    proba_predictions = []
    for patient_id, label, proba in zip(data['id'], y_predicted, y_predicted_proba):
        if label == 1:
            predictions.append(patient_id)
            proba_predictions.append(proba[1].round(2))

    prob = pd.DataFrame({'ID': predictions, 'Probability of stroke': proba_predictions})
    plot = plot_importance(df=model_data)
    return prob, plot
# CSV tab: upload a file, press the button, get the at-risk patients and the
# feature-importance plot.
with gr.Blocks() as demo3:
    gr.Markdown(
        """
        # Predict stroke for multiple patients
        Upload a CSV file with data about your patients to get IDs of patients at risk of stroke with probability > 50%
        """)
    filename = gr.File(file_types=['.csv'])
    button = gr.Button("Diagnose")
    plot = gr.Plot(label="Plot")
    outputs = gr.Dataframe(row_count=(1, "dynamic"),
                           col_count=(1, "dynamic"), label="Predictions",
                           headers=["ID"])
    # The callback returns (table, figure), matching the order of ``outputs``.
    button.click(fn=predict_stroke_from_csv, inputs=filename, outputs=[outputs, plot])
# Saved metric tables; from each one keep only the column of the model shown
# in this app (selected by its display name).
recallScore = load("recall.joblib")
precisionScore = load("precision.joblib")
accuracy = load("accuracy.joblib")

recallDT = recallScore.loc[:, modelname]
precisionDT = precisionScore.loc[:, modelname]
accuracyDT = accuracy.loc[:, modelname]

# One row per index entry of the metric tables, one column per metric; the
# index is turned into an ordinary column for display.
score = {'Recall': recallDT, 'Precision': precisionDT, 'Accuracy': accuracyDT}
df = pd.DataFrame(score).reset_index()

# Saved objects displayed in the plot widgets of the overview tab.
learning = load("learning.joblib")
importance = load("importance.joblib")
matrixes = load("matrixes.joblib")
recCurve = load("recCurve.joblib")
roc = load("roc.joblib")
# Model overview tab: saved evaluation plots and the metric table.
# NOTE(review): the row/column nesting below is reconstructed from the
# statement order; confirm it matches the intended layout.
with gr.Blocks() as demo4:
    gr.Markdown(
        """
        # Random Forest
        """)
    # First row: two columns side by side.
    with gr.Row():
        with gr.Column():
            plot4 = gr.Plot(value=recCurve, show_label=False)
        with gr.Column():
            plot1 = gr.Plot(value=importance, show_label=False)
    # Second row. ``plot4`` is rebound here, as in the original layout code.
    with gr.Row():
        plot2 = gr.Plot(value=learning, show_label=False)
        plot4 = gr.Plot(value=roc, show_label=False)
    # Third row: the remaining plot next to the metric scores.
    with gr.Row():
        plot3 = gr.Plot(value=matrixes, show_label=False)
        scores = gr.Dataframe(value=df, label="Metrics scores")
# Top-level app: each previously built sub-app is rendered inside its own tab,
# in display order.
with gr.Blocks() as demo:
    for _tab_title, _tab_app in (
        ("Model Overview", demo4),
        ("Predict Stroke", demo2),
        ("Predict Stroke CSV", demo3),
    ):
        with gr.Tab(_tab_title):
            _tab_app.render()

demo.launch()