"""Streamlit demo: predict a patient's diabetes class with a pre-trained model.

The script rebuilds the preprocessing transformers from the BRFSS 2015 CSV,
loads a pickled classifier, shows its validation report and feature
importances, and predicts the class of one sample that is either entered in
the sidebar or generated at random.
"""
import pickle
import random
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import streamlit as st
from imblearn.over_sampling import RandomOverSampler
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestClassifier
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2; this
# import fails on newer versions. It is unused below, but is kept so that the
# file's import set is unchanged - confirm the pinned scikit-learn version.
from sklearn.metrics import confusion_matrix, plot_confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

warnings.filterwarnings("ignore")

# Binary (0/1) features as (name, default selectbox index). The order is both
# the sidebar widget order and the random-draw order.
_BINARY_FEATURES = (
    ("HighBP", 0),
    ("HighChol", 0),
    ("CholCheck", 0),
    ("Smoker", 0),
    ("Stroke", 1),
    ("HeartDiseaseorAttack", 0),
    ("PhysActivity", 0),
    ("Fruits", 1),
    ("Veggies", 0),
    ("HvyAlcoholConsump", 1),
    ("AnyHealthcare", 0),
    ("NoDocbcCost", 0),
    ("DiffWalk", 0),
)

# Integer-range features as (name, minimum, maximum, default slider value).
_RANGED_FEATURES = (
    ("GenHlth", 1, 5, 3),
    ("MentHlth", 0, 30, 10),
    ("PhysHlth", 0, 30, 8),
    ("BMI", 12, 98, 70),
    ("Age", 1, 13, 12),
    ("Education", 1, 6, 5),
    ("Income", 1, 8, 2),
)

# Column order of the one-row sample frame. The fitted transformers are applied
# to this frame, so the order must stay in sync with the training columns.
_FEATURE_ORDER = (
    'HighBP', 'HighChol', 'CholCheck', 'BMI', 'Smoker', 'Stroke',
    'HeartDiseaseorAttack', 'PhysActivity', 'Fruits', 'Veggies',
    'HvyAlcoholConsump', 'AnyHealthcare', 'NoDocbcCost', 'GenHlth',
    'MentHlth', 'PhysHlth', 'DiffWalk', 'Sex', 'Age', 'Education', 'Income',
)


def _build_user_df(values):
    """Return a one-row DataFrame of ``values`` with columns in _FEATURE_ORDER."""
    return pd.DataFrame({name: values[name] for name in _FEATURE_ORDER}, index=[0])


# Load Data
@st.cache(allow_output_mutation=True)
def loading_data():
    """Read the dataset, oversample it, split it and fit the preprocessing steps.

    Returns:
        dict with keys "Xtrain", "Ytrain", "Xtest_encoded", "Ytest",
        "num_col_trans" and "cat_col_trans". "Xtrain" is the raw
        (untransformed) training frame; "Xtest_encoded" is the fully
        transformed test matrix; the two transformers are already fitted on
        the training data.
    """
    df = pd.read_csv("diabetes_012__health_indicators_BRFSS2015.csv")
    # Convert all columns to integer, then drop duplicated rows.
    df = df.astype("int")
    df = df.drop_duplicates()

    X, y = df.drop(['Diabetes_012'], axis=1), df['Diabetes_012'].values

    # Oversampling data
    randomSampler = RandomOverSampler(sampling_strategy='all', random_state=24)
    X_new, y_new = randomSampler.fit_resample(X, y)

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(
        X_new, y_new, test_size=0.2, random_state=24, stratify=y_new
    )

    # Some feature engineering
    Xtrain_transf = Xtrain.copy()
    Xtrain_transf["Age2"] = Xtrain_transf["Age"]**2

    # Numerical column transformation
    col_num = ["BMI", "MentHlth", "PhysHlth", "Age", "Age2"]
    num_col_trans = make_column_transformer((StandardScaler(), col_num), remainder="passthrough")
    Xtrain_transf_std = num_col_trans.fit_transform(Xtrain_transf, Ytrain)
    # NOTE(review): ColumnTransformer emits the transformed columns first and
    # the passthrough columns after them, but the frame is relabelled with the
    # ORIGINAL column order, so the names no longer match the data (and the
    # one-hot step below encodes whatever sits under these labels). Left as is
    # because the pickled model presumably expects this exact layout - fix
    # only together with retraining.
    Xtrain_transf_std = pd.DataFrame(Xtrain_transf_std, columns=list(Xtrain_transf.columns))

    # Test data: same feature engineering, transform only (no refit).
    Xtest_transf = Xtest.copy()
    Xtest_transf["Age2"] = Xtest_transf["Age"]**2
    Xtest_transf_std = num_col_trans.transform(Xtest_transf)
    Xtest_transf_std = pd.DataFrame(Xtest_transf_std, columns=list(Xtest_transf.columns))

    # Categorical columns transformation
    col_cat = ["GenHlth", "Education", "Income"]
    cat_col_trans = make_column_transformer(
        (OneHotEncoder(handle_unknown='ignore'), col_cat), remainder="passthrough"
    )
    # Fitting is required here even though the encoded training matrix itself
    # is not returned.
    cat_col_trans.fit_transform(Xtrain_transf_std, Ytrain)

    # On the test data:
    Xtest_encoded = cat_col_trans.transform(Xtest_transf_std)

    results = {"Xtrain": Xtrain,
               "Ytrain": Ytrain,
               "Xtest_encoded": Xtest_encoded,
               "Ytest": Ytest,
               "num_col_trans": num_col_trans,
               "cat_col_trans": cat_col_trans}
    return results


# Function for plotting the feature importance
@st.cache(allow_output_mutation=True)
def Plot_feature_importance(my_model, X_train):
    """Build the feature-importance table of ``my_model``.

    Args:
        my_model: fitted estimator exposing ``feature_importances_``.
        X_train: DataFrame whose column names label the importances.

    Returns:
        (df_importance, sort_col_desc): a frame with columns "Features" and
        "Importance" sorted by decreasing importance, and the feature names in
        that same order.
    """
    # NOTE(review): importances are paired positionally with X_train's column
    # names and zip() stops at the shorter sequence. If the model was fitted on
    # the encoded matrix rather than on X_train's columns, names and values do
    # not line up - confirm against the training code.
    pairs = list(zip(X_train.columns, my_model.feature_importances_))
    df_importance = pd.DataFrame(pairs, columns=["Features", "Importance"])
    df_importance = df_importance.sort_values("Importance", ascending=False)
    sort_col_desc = list(df_importance["Features"].values)
    df_importance["Features"] = df_importance["Features"].astype("category")
    return df_importance, sort_col_desc


# Function use to get the user input data
def user_input():
    """Render the sidebar widgets and return the chosen sample as a one-row frame."""
    values = {}

    Sex_txt = st.sidebar.selectbox('Sex', ("homme", "femme"))
    values['Sex'] = 0 if Sex_txt == "homme" else 1

    for name, default_index in _BINARY_FEATURES:
        values[name] = int(st.sidebar.selectbox(name, (0, 1), index=default_index))

    for name, low, high, default in _RANGED_FEATURES:
        values[name] = int(st.sidebar.slider(name, low, high, value=default, step=1))

    return _build_user_df(values)


def generate_user_input():
    """Return a one-row frame of uniformly random feature values."""
    values = {}

    # Draw order (Sex, binary features, ranged features) is kept stable so a
    # seeded ``random`` yields the same sample as before.
    Sex_txt = random.choice(["homme", "femme"])
    values['Sex'] = 0 if Sex_txt == "homme" else 1

    for name, _ in _BINARY_FEATURES:
        values[name] = random.randint(0, 1)

    for name, low, high, _ in _RANGED_FEATURES:
        values[name] = random.randint(low, high)

    return _build_user_df(values)


# Prepare user input data before fitting it to the model
def prepare_user_df(user_data, num_trans, col_trans):
    """Apply the fitted preprocessing steps to a user sample.

    Args:
        user_data: DataFrame holding the raw feature columns (must include
            "Age"). It is not modified.
        num_trans: fitted numerical column transformer.
        col_trans: fitted categorical column transformer.

    Returns:
        The encoded sample, ready for ``model.predict``.
    """
    # Work on a copy so the caller's frame does not silently gain "Age2".
    prepared = user_data.copy()
    prepared["Age2"] = prepared["Age"]**2
    user_data_std = num_trans.transform(prepared)
    # Same relabelling as in loading_data(), so that col_trans selects the same
    # columns it was fitted on.
    user_data_std = pd.DataFrame(user_data_std, columns=list(prepared.columns))
    user_data_encoded = col_trans.transform(user_data_std)
    return user_data_encoded


def plot_proba(dframe):
    """Draw a horizontal bar chart of class probabilities in the Streamlit page.

    Args:
        dframe: one-row DataFrame whose columns are the class names and whose
            values are probabilities.
    """
    dframe = dframe.T.reset_index()
    dframe.columns = ["Classes", "Pourcentage"]
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.barplot(x='Pourcentage', y='Classes', data=dframe, ax=ax)

    # Write the percentage just to the right of each bar.
    for p in ax.patches:
        percentage = f"{100 * p.get_width():.1f}%"
        x = p.get_x() + p.get_width() + 0.009
        y = p.get_y() + p.get_height() / 2
        ax.annotate(percentage, (x, y))
    st.pyplot(fig)
    # pyplot keeps every figure alive until it is closed; the script reruns on
    # each interaction, so close it to avoid accumulating figures.
    plt.close(fig)


# Create Web App
st.title("Machine Learning Demo :")
st.header("Prédiction de la santé d'un patient concernant le diabète")

col1, col2 = st.columns(2)
with col1:
    st.subheader("Matrice de corrélation sur les données dédoublonnées")
    img_path = "corr_matrix.PNG"
    st.image(img_path)

with col2:
    st.subheader("La courbe ROC associée au model de RandomForest choisi")
    img_path_roc = "ROC_Model_RF.PNG"
    st.image(img_path_roc)

st.sidebar.header("Features")

# Load user input
st.subheader("Table à prédire :")
button = st.button("Générer des données")
if button:
    user_df = generate_user_input()
else:
    user_df = user_input()

st.dataframe(user_df)

target_names = ['0: no diabetes', '1: prediabetes', '2: diabetes']

# LOADING THE DATA
results = loading_data()
Xtrain = results["Xtrain"]
Ytrain = results["Ytrain"]
Xtest_encoded = results["Xtest_encoded"]
Ytest = results["Ytest"]
num_col_trans = results['num_col_trans']
cat_col_trans = results['cat_col_trans']

# Load Model from pickle file
# SECURITY: pickle.load executes arbitrary code from the file; only ever load
# a model file that ships with the app and comes from a trusted source.
with open("Model_package.pkl", "rb") as f:
    Model_package = pickle.load(f)

model = Model_package['my_classif']

# Plot the features Importance
st.subheader("Importance des variables :")
df_importance, sort_col_desc = Plot_feature_importance(model, Xtrain)
fig2, ax2 = plt.subplots(figsize=(12, 8))
sns.barplot(x="Importance", y="Features", data=df_importance, orient="h", ax=ax2, order=sort_col_desc)
ax2.grid(visible=False)
st.pyplot(fig2)
plt.close(fig2)  # release the figure once rendered (see plot_proba)

# PREDICTION
# Validation error rate
Ypred_test = model.predict(Xtest_encoded)
st.subheader("Performance du model sur les données de validation : RandomForestClassifier ")
classif_report = pd.DataFrame(
    classification_report(Ytest, Ypred_test, output_dict=True, target_names=target_names)
).T
classif_report["support"] = classif_report["support"].apply(int)
st.write(classif_report)

# Prepare the user input before feeding it to the model
user_df_prepared = prepare_user_df(user_df, num_col_trans, cat_col_trans)
pred = model.predict(user_df_prepared)
# predict() returns a 1-element array; index it explicitly because converting
# an ndim>0 array with int() is deprecated since NumPy 1.25.
pred_class = str(target_names[int(pred[0])])
st.subheader("Prédiction associée à l'échantillon :")
st.write(f"La classe prédite est : ----->> {pred_class}")

pred_proba = model.predict_proba(user_df_prepared)
df_pred_proba = pd.DataFrame(pred_proba, columns=target_names)
st.subheader("Probabilités de prédire chaque classe selon l'échantillon :")
st.write(df_pred_proba)

plot_proba(df_pred_proba)