# Spaces: Runtime error
import streamlit as st | |
import pickle | |
import pandas as pd | |
import seaborn as sns | |
import sklearn | |
from sklearn.ensemble import RandomForestClassifier | |
import matplotlib.pyplot as plt | |
from sklearn.model_selection import train_test_split | |
from sklearn.metrics import confusion_matrix,plot_confusion_matrix,classification_report | |
from imblearn.over_sampling import RandomOverSampler | |
import numpy as np | |
from sklearn.preprocessing import StandardScaler,OneHotEncoder | |
from sklearn.compose import make_column_transformer | |
import warnings | |
warnings.filterwarnings("ignore") | |
# Load Data | |
def loading_data():
    """Load the diabetes CSV and build the train/test artefacts used by the app.

    Steps, in order: read the CSV, cast every column to int, drop duplicated
    rows, oversample with ``RandomOverSampler``, make a stratified 80/20
    split, add the squared-age feature ``Age2``, standardise the numerical
    columns and one-hot encode the categorical ones. Both transformers are
    fitted on the training split and then applied to the test split.

    Returns:
        dict with the keys ``"Xtrain"`` and ``"Ytrain"`` (training split
        before any transformation), ``"Xtest_encoded"`` (fully transformed
        test features), ``"Ytest"``, ``"num_col_trans"`` and
        ``"cat_col_trans"`` (the two fitted column transformers).
    """
    df = pd.read_csv("diabetes_012__health_indicators_BRFSS2015.csv")
    # Cast every column to int in one call, then drop duplicated rows.
    df = df.astype("int").drop_duplicates()

    X = df.drop(['Diabetes_012'], axis=1)
    y = df['Diabetes_012'].values

    # Oversampling data
    randomSampler = RandomOverSampler(sampling_strategy='all', random_state=24)
    X_new, y_new = randomSampler.fit_resample(X, y)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X_new, y_new, test_size=0.2, random_state=24, stratify=y_new
    )

    # Some feature engineering
    Xtrain_transf = Xtrain.copy()
    Xtrain_transf["Age2"] = Xtrain_transf["Age"] ** 2

    # Numerical column transformation
    col_num = ["BMI", "MentHlth", "PhysHlth", "Age", "Age2"]
    num_col_trans = make_column_transformer(
        (StandardScaler(), col_num), remainder="passthrough"
    )
    Xtrain_transf_std = num_col_trans.fit_transform(Xtrain_transf, Ytrain)
    # NOTE(review): the transformer output is re-labelled with the *input*
    # column order. With remainder="passthrough" the scaled columns are
    # presumably emitted first, so these labels may not match the data they
    # sit on. Left unchanged because the pickled model used by the app may
    # have been trained on exactly this layout - confirm before fixing.
    Xtrain_transf_std = pd.DataFrame(
        Xtrain_transf_std, columns=list(Xtrain_transf.columns)
    )

    # Test data: same feature engineering, transformer already fitted
    Xtest_transf = Xtest.copy()
    Xtest_transf["Age2"] = Xtest_transf["Age"] ** 2
    Xtest_transf_std = num_col_trans.transform(Xtest_transf)
    Xtest_transf_std = pd.DataFrame(
        Xtest_transf_std, columns=list(Xtest_transf.columns)
    )

    # Categorical columns transformation
    col_cat = ["GenHlth", "Education", "Income"]
    cat_col_trans = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), col_cat), remainder="passthrough"
    )
    # Fitted for its side effect only: the encoded training matrix itself is
    # not part of the returned dict.
    cat_col_trans.fit_transform(Xtrain_transf_std, Ytrain)

    # On the test data
    Xtest_encoded = cat_col_trans.transform(Xtest_transf_std)

    results = {
        "Xtrain": Xtrain,
        "Ytrain": Ytrain,
        "Xtest_encoded": Xtest_encoded,
        "Ytest": Ytest,
        "num_col_trans": num_col_trans,
        "cat_col_trans": cat_col_trans,
    }
    return results
# Function for plotting the feature importance | |
def Plot_feature_importance(my_model, X_train):
    """Rank the model's feature importances for plotting.

    Args:
        my_model: object exposing ``feature_importances_``.
        X_train: DataFrame whose column names are paired, in order, with the
            importance values.

    Returns:
        A tuple ``(df_importance, sort_col_desc)``: a DataFrame with the
        columns ``"Features"`` (categorical) and ``"Importance"``, sorted by
        decreasing importance, and the list of feature names in that order.
    """
    # NOTE(review): zip stops at the shorter input, so a model with more (or
    # fewer) features than X_train has columns is truncated silently -
    # confirm the two always line up at the call site.
    importance_by_feature = dict(zip(X_train.columns, my_model.feature_importances_))

    ranked = (
        pd.Series(importance_by_feature)
        .rename_axis("Features")
        .reset_index(name="Importance")
        .sort_values("Importance", ascending=False)
    )

    # Capture the ordered names before the column becomes categorical.
    sort_col_desc = list(ranked["Features"].values)
    ranked["Features"] = ranked["Features"].astype("category")
    return ranked, sort_col_desc
# Function use to get the user input data | |
def user_input():
    """Collect one sample's feature values from the Streamlit sidebar.

    One widget is created per feature, in a fixed order: the ``Sex``
    select box, thirteen 0/1 select boxes, then seven integer sliders.

    Returns:
        A one-row DataFrame (index ``0``) holding the chosen values.
    """
    sidebar = st.sidebar
    answers = {}

    # Sex is picked by label and stored as 0 for "homme", 1 otherwise.
    sex_label = sidebar.selectbox('Sex', ("homme", "femme"))
    answers['Sex'] = 0 if sex_label == "homme" else 1

    # 0/1 select boxes: (label, index of the preselected option).
    binary_fields = (
        ('HighBP', 0),
        ('HighChol', 0),
        ('CholCheck', 0),
        ('Smoker', 0),
        ('Stroke', 1),
        ('HeartDiseaseorAttack', 0),
        ('PhysActivity', 0),
        ('Fruits', 1),
        ('Veggies', 0),
        ('HvyAlcoholConsump', 1),
        ('AnyHealthcare', 0),
        ('NoDocbcCost', 0),
        ('DiffWalk', 0),
    )
    for label, preselected in binary_fields:
        answers[label] = int(sidebar.selectbox(label, (0, 1), preselected))

    # Sliders: (label, minimum, maximum, initial value, step).
    # A step of None means no explicit step is requested for that slider.
    slider_fields = (
        ('GenHlth', 1, 5, 3, 1),
        ('MentHlth', 0, 30, 10, 1),
        ('PhysHlth', 0, 30, 8, 1),
        ('BMI', 12, 98, 70, None),
        ('Age', 1, 13, 12, 1),
        ('Education', 1, 6, 5, 1),
        ('Income', 1, 8, 2, 1),
    )
    for label, low, high, initial, step in slider_fields:
        answers[label] = int(sidebar.slider(label, low, high, value=initial, step=step))

    # The column order of the returned frame differs from the widget order,
    # so it is spelled out explicitly.
    column_order = (
        'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker',
        'Stroke', 'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
        'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
        'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education',
        'Income',
    )
    user_data = {name: answers[name] for name in column_order}
    return pd.DataFrame(user_data, index=[0])
# Prepare user input data before fitting it to the model | |
def prepare_user_df(user_data, num_trans, col_trans):
    """Apply the fitted transformers to the user's input row.

    Adds the squared-age column ``Age2``, runs ``num_trans.transform``,
    re-wraps the result in a DataFrame carrying the input column names, then
    runs ``col_trans.transform`` on it.

    Args:
        user_data: DataFrame with an ``"Age"`` column. It is not modified.
        num_trans: fitted object exposing ``transform`` (numerical step).
        col_trans: fitted object exposing ``transform`` (categorical step).

    Returns:
        Whatever ``col_trans.transform`` returns for the prepared row(s).
    """
    # Work on a copy so the caller's frame does not silently gain "Age2".
    features = user_data.copy()
    features["Age2"] = features["Age"] ** 2

    scaled = num_trans.transform(features)
    scaled_df = pd.DataFrame(scaled, columns=list(features.columns))
    return col_trans.transform(scaled_df)
# Create Web App | |
# Page header and static figures
st.title("Machine Learning Demo :")
st.header("Prédiction de la santé d'un patient concernant le diabète")
st.subheader("Matrice de corrélation sur les données dédoublonnées")
img_path = "corr_matrix.PNG"
st.image(img_path)
st.subheader("La courbe ROC associée au model de RandomForest choisi")
img_path_roc = "ROC_Model_RF.PNG"
st.image(img_path_roc)

# Sidebar inputs and echo of the sample to predict
st.sidebar.header("Features")
st.subheader("Table à prédire :")
user_df = user_input()
st.dataframe(user_df)
target_names = ['0: no diabetes', '1: prediabetes', '2: diabetes']

# Loading the data
results = loading_data()
Xtrain = results["Xtrain"]
Ytrain = results["Ytrain"]
Xtest_encoded = results["Xtest_encoded"]
Ytest = results["Ytest"]

# Load the model and its fitted transformers from the pickle file.
# The transformers returned by loading_data() are not used below: the
# pickled ones are the ones applied to the user input.
# NOTE: pickle.load executes arbitrary code from the file; this is only
# acceptable as long as Model_package.pkl is a trusted artefact.
with open("Model_package.pkl", "rb") as f:
    Model_package = pickle.load(f)
model = Model_package['my_classif']
num_col_trans = Model_package['num_col_trans']
cat_col_trans = Model_package['cat_col_trans']

# Plot the feature importance
st.subheader("Importance des variables :")
df_importance, sort_col_desc = Plot_feature_importance(model, Xtrain)
fig2, ax2 = plt.subplots(figsize=(12, 8))
sns.barplot(x="Importance", y="Features", data=df_importance, orient="h", ax=ax2, order=sort_col_desc)
plt.grid(visible=False)
st.pyplot(fig2)

# Prediction
# Performance on the held-out split
Ypred_test = model.predict(Xtest_encoded)
st.subheader("Performance du model sur les données de validation : RandomForestClassifier ")
classif_report = pd.DataFrame(
    classification_report(Ytest, Ypred_test, output_dict=True, target_names=target_names)
).T
classif_report["support"] = classif_report["support"].apply(lambda x: int(x))
st.write(classif_report)

# Prepare the user input before passing it to the model
user_df_prepared = prepare_user_df(user_df, num_col_trans, cat_col_trans)
pred = model.predict(user_df_prepared)
# predict returns one label per row; take the single element explicitly
# rather than converting the whole array to an int.
pred_class = str(target_names[int(pred[0])])
st.subheader("Prédiction associée à l'échantillon :")
st.write(f"La classe prédite est : ----->> {pred_class}")

pred_proba = model.predict_proba(user_df_prepared)
df_pred_proba = pd.DataFrame(pred_proba, columns=target_names)
st.subheader("Probabilités de prédire chaque classe selon l'échantillon :")
st.write(df_pred_proba)