"""Gradio front-end for a pre-trained stroke-prediction classifier.

The module builds three tabs: an overview of stored evaluation artifacts, a
form for predicting a single patient, and a CSV upload that lists the IDs of
patients predicted to have a stroke.

NOTE(review): this file was reconstructed from a whitespace-collapsed source;
block nesting (especially the Gradio layout) should be confirmed.
"""
import os
from io import StringIO

import gradio as gr
import matplotlib as mpl
import matplotlib.lines as lines
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from joblib import dump, load
from sklearn.preprocessing import OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix, RocCurveDisplay
from sklearn.metrics import roc_curve, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, precision_score, recall_score
from sklearn.metrics import PrecisionRecallDisplay, precision_recall_curve

# NOTE: joblib artifacts are pickles; only load files from a trusted source.
clf = load("RandomForestClassifier()20.joblib")
modelname = "Random Forest"

# One-hot work-type columns fed to the model (the 'children' category is
# produced by encode_1H but dropped before prediction).
WORK_TYPE_COLUMNS = ['Govt_job', 'Never_worked', 'Private', 'Self-employed']


def encode(data, employement):
    """One-hot encode ``work_type`` of a single-row frame, in place.

    Args:
        data: DataFrame with a ``work_type`` column; only row label 0 is read.
        employement: Unused; kept so existing callers keep working.

    Returns:
        The same DataFrame with one 0/1 column per entry of
        ``WORK_TYPE_COLUMNS`` added.
    """
    # Translate the UI labels into the column names used for the model.
    data['work_type'] = data['work_type'].replace(
        {'Goverment job': 'Govt_job', "Never worked": "Never_worked"})
    selected = data['work_type'][0]
    for job in WORK_TYPE_COLUMNS:
        data[job] = 1 if selected == job else 0
    return data


def none_argument(y):
    """Return -999 when *y* is a list, otherwise return *y* unchanged.

    NOTE(review): presumably Gradio passes a list for an unselected dropdown;
    confirm against the Gradio version in use.
    """
    return -999 if isinstance(y, list) else y


def replace_with_numeric_one_patient(data):
    """Map the form's categorical labels to the numeric codes, in place.

    Args:
        data: Single-patient DataFrame built by
            ``predict_stroke_from_one_patient``.

    Returns:
        The same DataFrame with the categorical columns replaced by numbers;
        list-valued cells become -999 (see ``none_argument``).
    """
    data['ever_married'] = data['ever_married'].apply(none_argument)
    data['ever_married'] = data['ever_married'].replace({'Yes': 1, 'No': 0})
    data['residence_type'] = data['residence_type'].replace(
        {'Urban': 1, 'Rural': 0, '': -999})
    data['smoking_status'] = data['smoking_status'].apply(none_argument)
    data['smoking_status'] = data['smoking_status'].replace(
        {'Never smoked': 0, 'Formerly smoked': 1, 'Smokes': 2})
    data['gender'] = data['gender'].replace(
        {'Male': -1, 'Female': 1, 'Other': 1, '': -999})
    data['avg_glucose_level'] = data['avg_glucose_level'].apply(none_argument)
    data['avg_glucose_level'] = data['avg_glucose_level'].replace(
        {"Normal (<100 mg/dL)": 0,
         "Prediabetes (<100, 125> mg/dL)": 1,
         "Diabetes (>125 mg/dL)": 2})
    data['bmi'] = data['bmi'].apply(none_argument)
    data['bmi'] = data['bmi'].replace(
        {"Underweight (<18.4)": 0,
         "Normal (<18.5, 24.9>)": 1,
         "Overweight (<25, 29.9>)": 2,
         "Obese (>29.9)": 3})
    return data


def change_dtype(data):
    """Cast ``age`` to int32 and the work-type columns to sparse int32.

    The casts are assigned back to the frame (``astype`` returns a new
    Series), and ``DataFrame.info()`` is printed afterwards.

    Returns:
        The same DataFrame with the converted columns.
    """
    data['age'] = data['age'].astype('int32')
    for job in WORK_TYPE_COLUMNS:
        data[job] = data[job].astype(pd.SparseDtype('int32', 0))
    data.info()
    return data


def predict_stroke_from_one_patient(
        gender, age, hypertension, heartDisease, everMarried, residenceType,
        averageGlucoseLevel, bmi, smokingStatus, employementType):
    """Predict stroke for one patient described by the form inputs.

    Args:
        gender, age, hypertension, heartDisease, everMarried, residenceType,
        averageGlucoseLevel, bmi, smokingStatus, employementType: Values of
            the ``demo2`` input widgets, in order.

    Returns:
        ``'stroke'`` if the classifier predicts class 1, else ``'no stroke'``.
    """
    d = {
        'gender': [gender],
        'age': [age],
        'hypertension': [hypertension],
        'heart_disease': [heartDisease],
        'ever_married': [everMarried],
        'residence_type': [residenceType],
        'avg_glucose_level': [averageGlucoseLevel],
        'bmi': [bmi],
        'smoking_status': [smokingStatus],
        # A list-valued bmi is treated as "no value supplied".
        'bmi_was_missing': isinstance(bmi, list),
        'work_type': [employementType],
    }
    data = pd.DataFrame(data=d)
    encode(data, employementType)
    data = data.drop("work_type", axis=1)
    data = replace_with_numeric_one_patient(data)
    y_predicted = clf.predict(data)
    return 'stroke' if y_predicted[0] == 1 else 'no stroke'


demo2 = gr.Interface(
    predict_stroke_from_one_patient,
    [
        gr.Radio(["Male", "Female", "Other"]),
        gr.Slider(40, 90, value=40, step=1),
        gr.Checkbox(label="Hypertension"),
        gr.Checkbox(label="Heart Disease"),
        gr.Checkbox(label="Is/Was Married?"),
        gr.Radio(["Urban", "Rural"]),
        gr.Dropdown(["Normal (<100 mg/dL)",
                     "Prediabetes (<100, 125> mg/dL)",
                     "Diabetes (>125 mg/dL)"]),
        gr.Dropdown(["Underweight (<18.4)",
                     "Normal (<18.5, 24.9>)",
                     "Overweight (<25, 29.9>)",
                     "Obese (>29.9)"]),
        gr.Dropdown(["Never smoked", "Formerly smoked", "Smokes"]),
        gr.Dropdown(["Goverment job", "Never worked", "Private",
                     "Self-employed"]),
    ],
    outputs="label")


def bmi(col):
    """Bin a numeric BMI value into 0-3.

    0 = underweight (< 18.5), 1 = normal (< 25), 2 = overweight (< 30),
    3 = obese. The thresholds are contiguous so values such as 18.45 or 24.95
    no longer fall through to the "obese" bin. NaN compares false everywhere
    and therefore still yields 3.
    """
    if col < 18.5:
        return 0
    if col < 25.0:
        return 1
    if col < 30.0:
        return 2
    return 3


def glucose(col):
    """Bin an average glucose level: 0 below 100, 1 for 100-125, else 2."""
    if col < 100:
        return 0
    if 100 <= col <= 125:
        return 1
    return 2


def smoking_status(col):
    """Encode a lowercase smoking label as 0/1/2; anything else is -999."""
    if col == 'never smoked':
        return 0
    if col == 'formerly smoked':
        return 1
    if col == 'smokes':
        return 2
    return -999


def fill_with_median(data):
    """Fill missing ``bmi`` values with the median and flag the filled rows.

    Args:
        data: DataFrame with a ``bmi`` column.

    Returns:
        A new DataFrame where NaNs in ``bmi`` are replaced by the column
        median and a boolean ``bmi_was_missing`` column is appended.
    """
    # A Series indexed by 'bmi', so fillna below only touches that column.
    bmi_median = data[["bmi"]].median()
    was_missing = data["bmi"].isnull().rename("bmi_was_missing")
    data = data.fillna(bmi_median)
    data = data.join(was_missing)
    return data


def encode_1H(data):
    """Replace ``work_type`` with fixed 0/1 indicator columns.

    The columns ``Govt_job``, ``Never_worked``, ``Private``, ``Self-employed``
    and ``children`` are always created, in that order, even when a category
    does not occur in the data, so the output schema does not depend on the
    uploaded rows.

    Returns:
        A new DataFrame without ``work_type`` and with the indicator columns
        appended.
    """
    data = data.copy()
    for category in WORK_TYPE_COLUMNS + ['children']:
        data[category] = (data['work_type'] == category).astype(int)
    data = data.drop("work_type", axis=1)
    return data


def replace_with_numeric(data):
    """Convert the CSV's categorical and continuous columns to model codes.

    ``age`` is cast to int; ``ever_married``, ``residence_type`` and
    ``gender`` are label-mapped; ``smoking_status``, ``avg_glucose_level`` and
    ``bmi`` are binned by the helpers of the same purpose.

    Returns:
        The same DataFrame, modified in place.
    """
    data['age'] = data['age'].astype(int)
    data['ever_married'] = data['ever_married'].replace({'Yes': 1, 'No': 0})
    data['residence_type'] = data['residence_type'].replace(
        {'Urban': 1, 'Rural': 0})
    data.smoking_status = data.smoking_status.apply(smoking_status)
    data['gender'] = data['gender'].replace(
        {'Male': -1, 'Female': 1, 'Other': 1})
    data.avg_glucose_level = data.avg_glucose_level.apply(glucose)
    data.bmi = data.bmi.apply(bmi)
    return data


def rf_feat_importance(df):
    """Pair *df*'s column names with ``clf.feature_importances_``.

    Returns:
        DataFrame with ``Feature`` and ``Importance`` columns, sorted by
        descending importance.
    """
    return pd.DataFrame(
        {'Feature': df.columns, 'Importance': clf.feature_importances_}
    ).sort_values('Importance', ascending=False)


def plot_importance(df):
    """Draw a horizontal bar chart of feature importances for *df*'s columns.

    Args:
        df: The feature frame that was passed to the model.

    Returns:
        The matplotlib Figure.
    """
    # Use the argument; the previous code read an undefined global here.
    fi = rf_feat_importance(df)
    fig, ax = plt.subplots(1, 1, figsize=(10, 8))
    sns.barplot(data=fi, x='Importance', y='Feature', ax=ax)
    for s in ['top', 'left', 'right']:
        ax.spines[s].set_visible(False)
    fig.text(0.12, 0.92,
             "Feature Importance: " + modelname + " Stroke Prediction",
             fontsize=18, fontweight='bold', fontfamily='serif')
    plt.xlabel(" ", fontsize=12, fontweight='light', fontfamily='serif',
               loc='left', y=-1.5)
    plt.ylabel(" ", fontsize=12, fontweight='light', fontfamily='serif')
    # Thin vertical rule near the right edge of the figure.
    l1 = lines.Line2D([0.98, 0.98], [0, 1], transform=fig.transFigure,
                      figure=fig, color='black', lw=0.2)
    fig.lines.extend([l1])
    return fig


def predict_stroke_from_csv(file):
    """Predict stroke for every row of an uploaded CSV.

    Args:
        file: Either a string holding CSV text or an object whose ``name``
            attribute is the path of the uploaded file.

    Returns:
        A tuple ``(prob, plot)``: a DataFrame with the ``ID`` and rounded
        ``Probability of stroke`` of each row predicted as class 1, and the
        feature-importance Figure.
    """
    if isinstance(file, str):
        data = pd.read_csv(StringIO(file))
    else:
        data = pd.read_csv(file.name)
    data.columns = data.columns.str.lower()

    if data.isna().any().any():
        print('Missing values detected. Filling with median of feature values')
    # Always run this step: it also adds the 'bmi_was_missing' column, which
    # the single-patient path always supplies to the same classifier.
    data = fill_with_median(data)
    data = encode_1H(data)

    if (data['age'] < 40).any():
        print("Patients younger than 40 years old detected. "
              "Diagnose of younger than 40 years old can be false")

    data = data.drop(['children'], axis=1)
    data = replace_with_numeric(data)
    # Uploaded data may not carry the 'stroke' label column.
    data = data.drop(columns=['stroke'], errors='ignore')
    model_data = data.drop(['id'], axis=1)

    y_predicted = clf.predict(model_data)
    y_predicted_proba = clf.predict_proba(model_data)

    # Keep only the rows the classifier labels as stroke (class 1).
    at_risk = np.asarray(y_predicted) == 1
    prob = pd.DataFrame({
        'ID': data['id'].to_numpy()[at_risk],
        'Probability of stroke': y_predicted_proba[at_risk, 1].round(2),
    })
    plot = plot_importance(df=model_data)
    return prob, plot


with gr.Blocks() as demo3:
    gr.Markdown(
        """
        # Predict stroke fo multiple patients
        Upload a CSV file with data about your patients to get IDs of patients at risk of stroke with probability > 50%
        """)
    filename = gr.File(file_types=['.csv'])
    button = gr.Button("Diagnose")
    plot = gr.Plot(label="Plot")
    outputs = gr.Dataframe(row_count=(1, "dynamic"), col_count=(1, "dynamic"),
                           label="Predictions", headers=["ID"])
    button.click(fn=predict_stroke_from_csv, inputs=filename,
                 outputs=[outputs, plot])

# Stored evaluation artifacts; each score table is indexed by model name.
recallScore = load("recall.joblib")
recallDT = recallScore.loc[:, modelname]
precisionScore = load("precision.joblib")
precisionDT = precisionScore.loc[:, modelname]
accuracy = load("accuracy.joblib")
accuracyDT = accuracy.loc[:, modelname]
score = {'Recall': recallDT, 'Precision': precisionDT, 'Accuracy': accuracyDT}
df = pd.DataFrame(score)
df.reset_index(inplace=True)

learning = load("learning.joblib")
importance = load("importance.joblib")
matrixes = load("matrixes.joblib")
recCurve = load("recCurve.joblib")
roc = load("roc.joblib")

# NOTE(review): row/column nesting below is reconstructed from a
# whitespace-collapsed source - confirm against the intended layout.
with gr.Blocks() as demo4:
    gr.Markdown(
        """
        # Random Forest
        """)
    with gr.Row():
        with gr.Column():
            plot_rec_curve = gr.Plot(recCurve, show_label=False)
        with gr.Column():
            plot1 = gr.Plot(importance, show_label=False)
    with gr.Row():
        plot2 = gr.Plot(learning, show_label=False)
        plot4 = gr.Plot(roc, show_label=False)
    with gr.Row():
        plot3 = gr.Plot(matrixes, show_label=False)
        scores = gr.Dataframe(df, label="Metrics scores")

with gr.Blocks() as demo:
    with gr.Tab("Model Overview"):
        demo4.render()
    with gr.Tab("Predict Stroke"):
        demo2.render()
    with gr.Tab("Predict Stroke CSV"):
        demo3.render()

demo.launch()