Diabetes_Model / app.py
cnasa's picture
Update app.py
c38b397
raw history blame
No virus
8.26 kB
import streamlit as st
import pickle
import pandas as pd
import seaborn as sns
import sklearn
from sklearn.ensemble import RandomForestClassifier
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report
from imblearn.over_sampling import RandomOverSampler
import numpy as np
from sklearn.preprocessing import StandardScaler,OneHotEncoder
from sklearn.compose import make_column_transformer
import warnings
warnings.filterwarnings("ignore")
# Load Data
@st.cache(allow_output_mutation=True)
def loading_data():
    """Load the diabetes CSV and rebuild the train/test matrices used by the app.

    Pipeline, in order:
      1. read the CSV, cast every column to integer, drop duplicated rows;
      2. random-oversample the classes (``sampling_strategy='all'``);
      3. stratified 80/20 train/test split;
      4. add the ``Age2`` (= ``Age`` squared) feature;
      5. standard-scale the numerical columns, one-hot encode the
         categorical ones (both transformers are fitted on the train split).

    Returns:
        dict with the keys ``"Xtrain"`` (raw train features), ``"Ytrain"``,
        ``"Xtest_encoded"`` (fully transformed test features), ``"Ytest"``,
        ``"num_col_trans"`` and ``"cat_col_trans"`` (the fitted transformers).
    """
    df = pd.read_csv("diabetes_012__health_indicators_BRFSS2015.csv")
    # Cast all columns to integer in one call, then drop duplicated rows.
    df = df.astype("int").drop_duplicates()

    X, y = df.drop(['Diabetes_012'], axis=1), df['Diabetes_012'].values

    # Oversampling data
    # NOTE(review): oversampling happens BEFORE the train/test split, so
    # duplicated rows can land in both splits and inflate the validation
    # scores. Left as is because it defines the test set shown in the app.
    randomSampler = RandomOverSampler(sampling_strategy='all', random_state=24)
    X_new, y_new = randomSampler.fit_resample(X, y)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X_new, y_new, test_size=0.2, random_state=24, stratify=y_new
    )

    # Some feature engineering
    Xtrain_transf = Xtrain.copy()
    Xtrain_transf["Age2"] = Xtrain_transf["Age"] ** 2

    ## Numerical column transformation
    col_num = ["BMI", "MentHlth", "PhysHlth", "Age", "Age2"]
    num_col_trans = make_column_transformer(
        (StandardScaler(), col_num), remainder="passthrough"
    )
    Xtrain_transf_std = num_col_trans.fit_transform(Xtrain_transf, Ytrain)
    # NOTE(review): ColumnTransformer outputs the scaled columns first and the
    # passthrough columns after them, but the ORIGINAL column order is
    # re-attached here, so the names no longer match the data (the columns
    # later selected as "GenHlth"/"Education"/"Income" hold other features).
    # Presumably the pickled model was trained with this exact labelling, so
    # it is kept unchanged; fixing it requires retraining the model.
    Xtrain_transf_std = pd.DataFrame(
        Xtrain_transf_std, columns=list(Xtrain_transf.columns)
    )

    # Test data: same feature engineering, transformers are only applied.
    Xtest_transf = Xtest.copy()
    Xtest_transf["Age2"] = Xtest_transf["Age"] ** 2
    Xtest_transf_std = num_col_trans.transform(Xtest_transf)
    Xtest_transf_std = pd.DataFrame(
        Xtest_transf_std, columns=list(Xtest_transf.columns)
    )

    ## Categorical columns transformation
    col_cat = ["GenHlth", "Education", "Income"]
    cat_col_trans = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), col_cat), remainder="passthrough"
    )
    # Fitting is required here even though the encoded train matrix itself
    # is not returned.
    cat_col_trans.fit_transform(Xtrain_transf_std, Ytrain)
    # On the test data:
    Xtest_encoded = cat_col_trans.transform(Xtest_transf_std)

    results = {"Xtrain": Xtrain,
               "Ytrain": Ytrain,
               "Xtest_encoded": Xtest_encoded,
               "Ytest": Ytest,
               "num_col_trans": num_col_trans,
               "cat_col_trans": cat_col_trans}
    return results
# Function for plotting the feature importance
@st.cache(allow_output_mutation=True)
def Plot_feature_importance(my_model,X_train):
    """Build the feature-importance table used for the bar plot.

    The values of ``my_model.feature_importances_`` are paired positionally
    with the column names of ``X_train`` (pairing stops at the shorter of
    the two sequences).

    Args:
        my_model: fitted estimator exposing ``feature_importances_``.
        X_train: DataFrame whose column names label the importances.

    Returns:
        tuple ``(ranking, ordered_names)`` where ``ranking`` is a DataFrame
        with columns ``"Features"`` (categorical) and ``"Importance"``,
        sorted by decreasing importance, and ``ordered_names`` is the list of
        feature names in that same order.
    """
    names = list(X_train.columns)
    scores = my_model.feature_importances_
    importance_by_name = dict(zip(names, scores))

    # One-row frame (one column per feature), flipped into one row per feature.
    ranking = pd.DataFrame(importance_by_name, index=[0]).T.reset_index()
    ranking.columns = ["Features", "Importance"]
    ranking = ranking.sort_values("Importance", ascending=False)

    # Capture the display order before the column is turned into a category.
    ordered_names = list(ranking["Features"].values)
    ranking["Features"] = ranking["Features"].astype("category")
    return ranking, ordered_names
# Function used to get the user input data
def user_input():
    """Collect one patient record from the Streamlit sidebar widgets.

    Widgets are created in a fixed order (Sex, the binary indicators, then
    the sliders). The returned frame uses the column order of the training
    data, which differs from the widget order.

    Returns:
        pandas.DataFrame with a single row (index 0) and 21 integer columns.
    """
    sidebar = st.sidebar
    values = {}

    # Fix: the dataset's column description codes Sex as 0 = female,
    # 1 = male, so "homme" (male) must map to 1. It was previously mapped to 0.
    sex_label = sidebar.selectbox('Sex', ("homme", "femme"))
    values['Sex'] = 1 if sex_label == "homme" else 0

    # (label, index of the option selected by default) for each 0/1 indicator.
    binary_fields = (
        ('HighBP', 0),
        ('HighChol', 0),
        ('CholCheck', 0),
        ('Smoker', 0),
        ('Stroke', 1),
        ('HeartDiseaseorAttack', 0),
        ('PhysActivity', 0),
        ('Fruits', 1),
        ('Veggies', 0),
        ('HvyAlcoholConsump', 1),
        ('AnyHealthcare', 0),
        ('NoDocbcCost', 0),
        ('DiffWalk', 0),
    )
    for label, default_index in binary_fields:
        values[label] = int(sidebar.selectbox(label, (0, 1), index=default_index))

    # (label, minimum, maximum, default value) for each integer slider.
    slider_fields = (
        ('GenHlth', 1, 5, 3),
        ('MentHlth', 0, 30, 10),
        ('PhysHlth', 0, 30, 8),
        ('BMI', 12, 98, 70),
        ('Age', 1, 13, 12),
        ('Education', 1, 6, 5),
        ('Income', 1, 8, 2),
    )
    for label, lowest, highest, default in slider_fields:
        values[label] = int(
            sidebar.slider(label, min_value=lowest, max_value=highest,
                           value=default, step=1)
        )

    # Column order expected by the fitted transformers.
    column_order = [
        'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
        'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
        'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
        'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
        'Income',
    ]
    user_data = {name: values[name] for name in column_order}
    user_df = pd.DataFrame(user_data, index=[0])
    return user_df
# Prepare the user input data before passing it to the model
def prepare_user_df(user_data,num_trans,col_trans):
    """Apply the training-time feature pipeline to the user's record.

    Adds the ``Age2`` (= ``Age`` squared) feature, then applies the numerical
    transformer followed by the categorical one.

    Args:
        user_data: DataFrame holding the raw user features; it is NOT
            modified (the work is done on a copy).
        num_trans: fitted transformer applied first (``.transform``).
        col_trans: fitted transformer applied to the re-labelled output of
            ``num_trans``.

    Returns:
        Whatever ``col_trans.transform`` returns: the encoded feature matrix.
    """
    # Work on a copy so the caller's frame does not silently gain "Age2".
    features = user_data.copy()
    features["Age2"] = features["Age"] ** 2

    scaled = num_trans.transform(features)
    # NOTE(review): the input column order is re-attached to the transformed
    # array. If num_trans reorders columns (a ColumnTransformer with
    # passthrough does), names and data no longer line up. Kept as is because
    # it presumably mirrors how the transformers and model were fitted.
    scaled = pd.DataFrame(scaled, columns=list(features.columns))

    return col_trans.transform(scaled)
# Create Web App
# Page header and static, pre-rendered figures.
st.title("Machine Learning Demo :")
st.header("Prédiction de la santé d'un patient concernant le diabète")
st.subheader("Matrice de corrélation sur les données dédoublonnées")
img_path = "corr_matrix.PNG"
st.image(img_path)
st.subheader("La courbe ROC associée au model de RandomForest choisi")
img_path_roc = "ROC_Model_RF.PNG"
st.image(img_path_roc)

# Sidebar widgets: collect the record to classify and echo it on the page.
st.sidebar.header("Features")
st.subheader("Table à prédire :")
user_df = user_input()
st.dataframe(user_df)
target_names = ['0: no diabetes', '1: prediabetes', '2: diabetes']

# LOADING THE DATA
# Only the data splits are taken from loading_data(); the transformers used
# for prediction come from the pickled package below.
results = loading_data()
Xtrain = results["Xtrain"]
Ytrain = results["Ytrain"]
Xtest_encoded = results["Xtest_encoded"]
Ytest = results["Ytest"]

## Load Model from pickle file
# SECURITY: pickle.load executes arbitrary code from the file; only load a
# Model_package.pkl that ships with this app and comes from a trusted source.
with open("Model_package.pkl", "rb") as f:
    Model_package = pickle.load(f)
model = Model_package['my_classif']
num_col_trans = Model_package['num_col_trans']
cat_col_trans = Model_package['cat_col_trans']

# Plot the features Importance
# NOTE(review): the importances are labelled with the RAW Xtrain column
# names, while the model is evaluated on the encoded matrix; confirm that the
# names line up with the model's actual input features.
st.subheader("Importance des variables :")
df_importance, sort_col_desc = Plot_feature_importance(model, Xtrain)
fig2, ax2 = plt.subplots(figsize=(12, 8))
sns.barplot(x="Importance", y="Features", data=df_importance, orient="h",
            ax=ax2, order=sort_col_desc)
ax2.grid(False)
st.pyplot(fig2)
# Release the figure: the script reruns on every widget interaction and
# pyplot would otherwise keep every figure alive.
plt.close(fig2)

# PREDICTION
# Validation error rate
Ypred_test = model.predict(Xtest_encoded)
st.subheader("Performance du model sur les données de validation : RandomForestClassifier ")
classif_report = pd.DataFrame(
    classification_report(Ytest, Ypred_test, output_dict=True,
                          target_names=target_names)
).T
classif_report["support"] = classif_report["support"].astype(int)
st.write(classif_report)

# Prepare the user input before passing it to the model
user_df_prepared = prepare_user_df(user_df, num_col_trans, cat_col_trans)
pred = model.predict(user_df_prepared)
# predict() returns a 1-element array; index it explicitly, since calling
# int() on a non-0-d array is deprecated in recent NumPy versions.
pred_class = str(target_names[int(pred[0])])
st.subheader("Prédiction associée à l'échantillon :")
st.write(f"La classe prédite est : ----->> {pred_class}")
pred_proba = model.predict_proba(user_df_prepared)
df_pred_proba = pd.DataFrame(pred_proba, columns=target_names)
st.subheader("Probabilités de prédire chaque classe selon l'échantillon :")
st.write(df_pred_proba)