# Scraped page header ("Spaces: Running") -- not part of the program.
import os | |
import re | |
import time | |
import streamlit as st | |
import matplotlib.pyplot as plt | |
import pandas as pd | |
import numpy as np | |
import altair as alt | |
import plotly.express as px | |
from st_pages import add_indentation | |
from utils import load_data_csv | |
from sklearn.datasets import fetch_california_housing | |
from sklearn.compose import make_column_selector as selector | |
from sklearn.model_selection import train_test_split | |
from sklearn.pipeline import Pipeline | |
from sklearn.preprocessing import MinMaxScaler, StandardScaler, OneHotEncoder | |
from sklearn.neighbors import KNeighborsClassifier | |
from sklearn.tree import DecisionTreeClassifier | |
from sklearn.ensemble import RandomForestClassifier | |
from sklearn.compose import ColumnTransformer | |
from sklearn.metrics import confusion_matrix | |
# Configure the Streamlit page with the "wide" layout.
st.set_page_config(layout="wide")
####################################################################################################### | |
# FUNCTIONS | |
####################################################################################################### | |
def _source_column(transformed_name, columns):
    """Map a preprocessor output feature name back to the input column it came from."""
    # ColumnTransformer output names look like "<transformer name>__<feature>".
    name = transformed_name.split("__", 1)[-1]
    if name in columns:
        return name
    # Encoded features look like "<column>_<category>": keep the longest input
    # column that prefixes the name, so columns that themselves contain "_"
    # are not truncated.
    candidates = [col for col in columns if name.startswith(f"{col}_")]
    if candidates:
        return max(candidates, key=len)
    # No known column matches: fall back to the text before the first "_".
    return name.split("_")[0]


def model_training(X, y, model_dict, _num_transformer=None,
                   _cat_transformer=None):
    """Train the selected classifier on a stratified split and score it per class.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature table. Columns with the ``category`` dtype are sent to
        ``_cat_transformer``; every other column is sent to ``_num_transformer``.
    y : array-like
        Target labels; also used to stratify the train/test split.
    model_dict : dict
        ``{"model": <label>, "param": <value>}``. The label must start with
        "K-nearest-neighbor", "Decision Tree" or "Random Forest". ``param`` is
        passed as ``n_neighbors`` (KNN) or ``max_depth`` (tree-based models).
    _num_transformer, _cat_transformer : transformer, optional
        Preprocessing steps. When omitted, a fresh ``MinMaxScaler()`` and
        ``OneHotEncoder()`` are created for this call.

    Returns
    -------
    tuple
        ``(scores, feature_imp, feature_names, labels)`` where ``scores`` is the
        diagonal of the confusion matrix computed with ``normalize='pred'`` (one
        value per entry of ``labels``), ``feature_imp`` is the classifier's
        ``feature_importances_`` for the tree-based models and ``None`` for KNN,
        ``feature_names`` holds one source-column name per preprocessed feature,
        and ``labels`` is ``classes_`` of the fitted classifier.

    Raises
    ------
    ValueError
        If ``model_dict["model"]`` does not name a supported model.
    """
    model = model_dict["model"]
    param = model_dict["param"]
    feature_imp = None

    # Dispatch on the model name at the start of the label so that the
    # decorative emoji suffix does not take part in the comparison.
    if model.startswith("K-nearest-neighbor"):
        model_sklearn = KNeighborsClassifier(n_neighbors=param)
        explainability = False
    elif model.startswith("Decision Tree"):
        model_sklearn = DecisionTreeClassifier(max_depth=param)
        explainability = True
    elif model.startswith("Random Forest"):
        model_sklearn = RandomForestClassifier(max_depth=param)
        explainability = True
    else:
        raise ValueError(f"Unknown model: {model!r}")

    # Build the default transformers per call instead of sharing one instance
    # created when the function was defined.
    if _num_transformer is None:
        _num_transformer = MinMaxScaler()
    if _cat_transformer is None:
        _cat_transformer = OneHotEncoder()

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33)

    preprocessor = ColumnTransformer(
        transformers=[
            ("numerical", _num_transformer, selector(dtype_exclude="category")),
            ("categorical", _cat_transformer, selector(dtype_include="category")),
        ])
    pipe = Pipeline(
        steps=[("preprocessor", preprocessor), ("classifier", model_sklearn)])
    pipe.fit(X_train, y_train)

    # One entry per preprocessed feature, named after the column it came from.
    input_columns = [str(col) for col in getattr(X, "columns", ())]
    feature_names = [
        _source_column(name, input_columns)
        for name in pipe[:-1].get_feature_names_out()
    ]

    y_pred = pipe.predict(X_test)
    clf = pipe[-1]
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_, normalize='pred')

    if explainability:
        feature_imp = clf.feature_importances_

    labels = clf.classes_
    return np.diag(cm), feature_imp, feature_names, labels
def see_code(model):
    """Display the scikit-learn training snippet for the selected model.

    Parameters
    ----------
    model : str
        Model label; must start with "K-nearest-neighbor", "Decision Tree"
        or "Random Forest".

    Raises
    ------
    ValueError
        If ``model`` does not name a supported model.
    """
    # Constructor text shown in the snippet, keyed by the model name that
    # starts the label (the emoji suffix is ignored).
    constructors = (
        ("K-nearest-neighbor", "KNeighborsClassifier(n_neighbors=6)"),
        ("Decision Tree", "DecisionTreeClassifier()"),
        ("Random Forest", "RandomForestClassifier()"),
    )
    model_sklearn = next(
        (text for prefix, text in constructors if model.startswith(prefix)),
        None,
    )
    if model_sklearn is None:
        raise ValueError(f"Unknown model: {model!r}")

    code = f'''# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.33)
# Build data preprocessing step to numerical and categorical/text variables
preprocessor = ColumnTransformer(
transformers=[
("numerical", num_transformer, selector(dtype_exclude="category")),
("categorical", cat_transformer, selector(dtype_include="category")),
])
# Train the model with the preprocessing step
pipe = Pipeline(
steps=[("preprocessor", preprocessor), ("classifier", {model_sklearn})])
pipe.fit(X_train, y_train)
# Predict values for the test set
y_pred = pipe.predict(X_test)
# Compute confusion matrix to get the accuracy for each label
clf = pipe[-1]
cm = confusion_matrix(y_test, y_pred, labels=clf.classes_, normalize='pred')
scores = np.diag(cm)
'''
    st.warning("""**Note**: The following code uses functions from popular Python Data Science libraries `numpy` and `scikit-learn`.""")
    st.code(code, language='python')
############################################################################################## | |
# START OF THE PAGE | |
############################################################################################## | |
# Page banner, title and introduction.
st.image("images/ML_header.jpg")
st.markdown("# Go further π")
st.markdown("""This page allows you to test and compare the results of different AI models, and gain a deeper understanding of how they function. <br>
It includes three different types of **classification models** with Python code illustrations, as well as four datasets to choose from.
""", unsafe_allow_html=True)

st.warning("""**Note**: Different types of models exists for most Machine Learning tasks.
Models tend to vary in complexity and picking which one to train for a specific use case isn't always straightforward.
Complex model might output better results but take longer to make predictions.
The model selection step requires a good amount of testing by practitioners.""")

st.markdown("""All of the classification models used in this page come from `scikit-learn`, which is a popular Data Science library in Python.""")

try:
    st.link_button("Go to the scikit-learn website", "https://scikit-learn.org/stable/index.html")
except Exception:
    # Best-effort fallback when the button cannot be rendered. `Exception`
    # rather than a bare `except`, so BaseException subclasses such as
    # KeyboardInterrupt/SystemExit are not swallowed here.
    st.markdown("You need internet connexion to access the link.")

st.markdown(" ")
st.divider()

# Folder holding the CSV files loaded in the dataset section below.
path_data = r'data/other_data'

st.markdown("# Classification ")
st.markdown("""**Reminder**: Classification models are AI models that are trained to predict a finite number of values/categories.
Examples can be found in the *Supervised vs Unsupervised* page with the credit score classification and customer churn prediction use cases.""")
st.markdown(" ")
st.markdown(" ")
########################## SELECT A DATASET ###############################
# Labels offered in the dataset select box; the branches below compare
# against these same constants.
WINE_DATASET = "Wine quality π·"
TITANIC_DATASET = "Titanic π’"
CAR_DATASET = "Car evaluation π"
DIABETES_DATASET = "Diabetes π©ββοΈ"


def _object_columns_to_category(frame):
    """Cast every object-dtype column of *frame* to ``category`` and return it.

    ``model_training`` routes ``category`` columns to the categorical
    transformer and all other columns to the numerical one.
    """
    text_columns = frame.select_dtypes(include="object").columns
    frame[text_columns] = frame[text_columns].astype("category")
    return frame


st.markdown("### Select a dataset π")
st.markdown("""To perform the classification task, you can choose between four different datasets: **Wine quality**, **Titanic**, **Car evaluation** and **Diabetes**. <br>
Each dataset will be shown in its original format and will go through pre-processing steps to insure its quality and usability for the chosen model.
""", unsafe_allow_html=True)

st.warning("""**Note:** The performance of a Machine Learning model is sensitive to the data being used to train it.
Data cleaning and pre-processing are usually as important as training the AI model. These steps can include removing missing values, identifying outliers and transforming columns from text to numbers.""")

select_data = st.selectbox(
    "Choose an option",
    [WINE_DATASET, TITANIC_DATASET, CAR_DATASET, DIABETES_DATASET],
)
st.markdown(" ")

# Exactly one dataset is selected, so the branches are mutually exclusive.
# Each branch defines `data`, the feature table `X` and the target `y`.
if select_data == WINE_DATASET:
    # Load data and clean it: keep rows below the three thresholds, then
    # drop the "free sulfur dioxide" column.
    data = load_data_csv(path_data, "winequality.csv")
    below_thresholds = (
        (data["residual sugar"] < 40)
        & (data["free sulfur dioxide"] < 200)
        & (data["total sulfur dioxide"] < 400)
    )
    data = data.loc[below_thresholds].drop(columns=["free sulfur dioxide"])

    X = data.drop(columns=["quality"])
    y = data["quality"]

    # Information on the data
    st.info("""**About the data**: The goal of the wine quality dataset is to **predict the quality** of different wines using their formulation.
The target in this use case is the `quality` variable which has two possible values (Good and Mediocre).""")

    # View data
    view_data = st.checkbox("View the data", key="wine")
    if view_data:
        st.dataframe(data)

elif select_data == TITANIC_DATASET:
    # Load data and clean it
    data = load_data_csv(path_data, "titanic.csv")
    data = data.drop(columns=["Name", "Cabin", "Ticket", "PassengerId"]).dropna()
    data["Survived"] = data["Survived"].map({0: "Died", 1: "Survived"})
    data = data.rename(columns={"Sex": "Gender"})
    data["Age"] = data["Age"].astype(int)
    data["Fare"] = data["Fare"].round(2)
    data = _object_columns_to_category(data)

    X = data.drop(columns=["Survived"])
    y = data["Survived"]

    # Information on the data
    st.info("""**About the data**: The goal of the titanic dataset is to **predict whether a passenger on the ship survived**.
The target in this use case is the `Survived` variable which has two possible values (Died or Survived).
""")

    # View data
    view_data = st.checkbox("View the data", key="titanic")
    if view_data:
        st.dataframe(data)

    # About the variables
    about_var = st.checkbox("Information on the variables", key="titanic-var")
    if about_var:
        st.markdown("""
- **Survived**: Survival (Died or Survived)
- **Pclass**: Ticket class of the passenger (1=First, 2=Second, 3=Third)
- **Gender**: Gender
- **Age**: Age in years
- **SibSp**: Number of siblings aboard the Titanic
- **Parch**: Number of parents/children aboard the Titanic
- **Fare**: Passenger fare
- **Embarked**: Port of Embarkation (C=Cherbourg, Q=Queenstown, S=Southampton)""")

elif select_data == CAR_DATASET:
    # Load data and clean it
    data = load_data_csv(path_data, "car.csv")
    data = data.rename(columns={"Price": "Buying"})
    data = _object_columns_to_category(data)

    X = data.drop(columns="Evaluation")
    y = data["Evaluation"]

    # Information on the data
    st.info("""**About the data**: The goal of the car evaluation dataset is to predict the evaluation made about a car before being sold.
The target in this use case is the `Evaluation` variable, which has two possible values (Not acceptable or acceptable)""")

    # View data
    view_data = st.checkbox("View the data", key="car")
    if view_data:
        st.dataframe(data)

    # About the variables
    about_var = st.checkbox("Information on the variables", key="car-var")
    if about_var:
        st.markdown("""
- **Buying**: Buying price of the vehicule (Very high, high, medium, low)
- **Maintenance**: Price for maintenance (Very high, high, medium, low)
- **Doors**: Number of doors in the vehicule (2, 3, 4, 5 or more)
- **Persons**: Capacity in terms of persons to carry (2, 4, more)
- **Luggage boot**: Size of luggage boot
- **Safety**: Estimated safety of the car (low, medium, high)
- **Evaluation**: Evaluation level (unacceptable, acceptable)""")

elif select_data == DIABETES_DATASET:
    # Load data and clean it
    data = load_data_csv(path_data, "diabetes.csv")
    data["Outcome"] = data["Outcome"].map({1: "Yes", 0: "No"})
    data = _object_columns_to_category(data)

    X = data.drop(columns="Outcome")
    y = data["Outcome"]

    # Information on the data
    st.info("""**About the data**: The goal of the diabetes dataset is to predict whether a patient has diabetes.
The target in this use case is the `Outcome` variable, which has two possible values (Yes or No)""")

    # View data
    view_data = st.checkbox("View the data", key="diabetes")
    if view_data:
        st.dataframe(data)

    # About the variables (own widget key; it previously reused the car
    # dataset's "car-var" key)
    about_var = st.checkbox("Information on the variables", key="diabetes-var")
    if about_var:
        st.markdown("""
- **Pregnancies**: Number of pregnancies had
- **Glucose**: The level of glucose in the patient's blood
- **BloodPressure**: Blood pressure measurement
- **SkinThickness**: Thickness of the skin
- **Insulin**: Level of insulin in the blood
- **BMI**: Body mass index
- **DiabetesPedigreeFunction**: Likelihood of diabetes depending on the patient's age and diabetic family history
- **Age**: Age of the patient
- **Outcome**: Whether the patient has diabetes (Yes or No)""")

st.markdown(" ")
st.markdown(" ")
########################## SELECT A MODEL ###############################
# Labels offered in the model select box; the branches below compare
# against these same constants.
KNN_MODEL = "K-nearest-neighbor ποΈ"
TREE_MODEL = "Decision Tree π³"
FOREST_MODEL = "Random Forest ποΈ"

st.markdown("### Select a model π")
st.markdown("""You can choose between three types of classification models: **K nearest neighbors (KNN)**, **Decision Trees** and **Random Forests**. <br>
For each model, you will be given a short explanation as to how they function.
""", unsafe_allow_html=True)

select_model = st.selectbox("**Choose an option**", [KNN_MODEL, TREE_MODEL, FOREST_MODEL])
st.markdown(" ")

# Exactly one model is selected, so the branches are mutually exclusive.
# Each branch sets `select_param`, the value later passed to the classifier.
if select_model == KNN_MODEL:
    st.info("""**About the model**: K-nearest-neighbor (or KNN) is a type of classification model that uses neighboring points to classify new data.
When trying to predict a class to new data points, the algorithm will look at points in close proximity (or in its neighborhood) to make a decision.
The most common class among its neighborhood will then be assigned to the data point.""")

    select_param = 6  # number of neighbors

    learn_model = st.checkbox("Learn more", key="knn")
    if learn_model:
        st.markdown("""An important parameter in KNN algorithms is the number of points to choose as neighboors. <br>
The image below shows two cases where the number of neighboors (k) are equal to 3 and 6.
- When k is equal to 3, the most common class is **Classe B**. The red point will then be predicted as Classe B.
- When k is equal to 6, the the most common class is **Classe A**. The red point will then be predicted as Classe A.""",
                    unsafe_allow_html=True)
        st.image("images/knn.png", width=600)
        st.markdown("""K-nearest-neighbor algorithm are popular for their simplicity. <br>
This can be a drawback for use cases/dataset that require a more complex approach to make accurate predictions.""", unsafe_allow_html=True)

    see_code_box = st.checkbox("See the code", key='knn_code')
    if see_code_box:
        see_code(select_model)

elif select_model == TREE_MODEL:
    st.info("""**About the model**: Decision trees are classification model that split the prediction task into a succession of decisions, each with only two possible outcomes.
These decisions can be visualized as a tree, with data points arriving from the top of the tree and landing at final "prediction regions".""")

    select_param = None  # max_depth left unset

    learn_model = st.checkbox("Learn more", key="tree")
    if learn_model:
        st.markdown("""The following image showcases a decision tree that was built to predict whether a **bank should give out a loan** to a client. <br>
The data used to train the model has each client's **age**, **salary** and **number of children**.""", unsafe_allow_html=True)
        st.markdown("""To predict whether a client gets a loan, the client's data goes through each 'question' in the tree and **gets assigned the class of the region it fell into**. <br>
For example, a client that is under 30 years old and has a lower salary than 2500$ will not be awarded a loan by the model.""", unsafe_allow_html=True)
        st.image("images/decisiontree.png", width=800)
        st.markdown("""Decision tree models are popular as they are easy to interpret. <br>
The higher the variable is on the tree, the more important it is in the decision process.""", unsafe_allow_html=True)

    see_code_box = st.checkbox("See the code", key='tree_code')
    if see_code_box:
        see_code(select_model)

elif select_model == FOREST_MODEL:
    st.info("""**About the model:** Random Forest models generate multiple decision tree models to make predictions.
The main drawback of decision trees is that their predictions can be unstable, meaning that their output often changes.
Random Forest models aggregate the predictions of multiple decision trees to reduce this unstability and improve robustness.""")

    select_param = None  # max_depth left unset

    # Own widget key (it previously reused the decision tree's "tree" key),
    # in line with the 'forest_code' key below.
    learn_model = st.checkbox("Learn more", key="forest")
    if learn_model:
        st.markdown("""Random Forests classifiers aggregate results by apply **majority voting**, which means selecting the class that was most often predicted by trees as the final prediction.
In the following image, the random forest model built four decision trees, who each have made their own final prediction. <br>"""
                    , unsafe_allow_html=True)
        st.markdown("""Class C was predicted twice, whereas Class B et D where only predicted once. <br>
The final prediction of the random forest model is thus Class C.""", unsafe_allow_html=True)
        st.image("images/randomforest.png", width=800)

    see_code_box = st.checkbox("See the code", key='forest_code')
    if see_code_box:
        see_code(select_model)

# Selection handed to `model_training` when the user runs the model.
model_dict = {"model": select_model, "param": select_param}

st.markdown(" ")
st.markdown(" ")
########################## RUN THE MODEL ###############################
def _render_accuracy(class_labels, class_scores, diabetes_notice):
    """Render the per-class score bar chart and the hyperparameter note.

    `diabetes_notice` is shown first, as an error box, when the Diabetes
    dataset is the one currently selected.
    """
    if select_data == "Diabetes π©ββοΈ":
        st.error(diabetes_notice)

    # Scores are displayed as rounded percentages, one bar per class label.
    accuracy_table = pd.DataFrame({"label":class_labels, "accuracy":np.round(class_scores*100)})
    accuracy_chart = px.bar(accuracy_table, x="label", y="accuracy", color="label", title="Accuracy results", text_auto=True)
    st.plotly_chart(accuracy_chart, use_container_width=True)

    st.warning("""**Note**: To improve the results of a model, practionners often conduct *hyperparameter tuning*.
It consists of trying different combination of the model's parameters to maximise the accuracy score.
Hyperparameter tuning wasn't conduct here in order to insure the app doesn't lag.""")


st.markdown("### Train the model βοΈ")
st.markdown("""Now, you can build the chosen classification model and use the selected dataset to train it. <br>
You will get the model's accuracy in predicting each category, as well as the importance of each variable in the final predictions.""", unsafe_allow_html=True)

st.warning("""**Note**: Most machine learning models have an element of randomness in their predictions.
This explains why a model's accuracy might change even if you run it with the same dataset.""")

st.markdown(f"""You've selected the **{select_data}** dataset and the **{select_model}** model.""")

run_model = st.button("Run model", type="primary")

if run_model:
    score, feature_imp, feature_names, labels = model_training(X, y, model_dict, _num_transformer=StandardScaler())

    # Models for which an "Explainability" tab is shown.
    tree_based_models = ("Decision Tree π³", "Random Forest ποΈ")

    if select_model not in tree_based_models:
        # KNN: results only, no explainability tab.
        st.markdown("#### Results")
        st.markdown("""The K-nearest-neighbor algorithm doesn't have a built-in solution to compute model explainability with `scikit-learn`.
You can use other python packages such as `SHAP` to compute explainability, which we didn't use here since they usually take a long time to output results.""")
        _render_accuracy(labels, score, """**Important**: Note that Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
This small number of patient data explains why the model's performance isn't optimal.
Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")
    else:
        # Decision tree / random forest: accuracy plus feature importances.
        tab1, tab2 = st.tabs(["Accuracy", "Explainability"])

        with tab1:
            _render_accuracy(labels, score, """**Important**: The Diabetes dataset only contains information on 768 patients. 500 patients don't have diabetes and 268 do have the disease.
This small number of patient data explains why the model's performance isn't optimal.
Additional data collection should be conducted to improve results, as well as hyperparameter tuning (see explanation after graph).""")

        with tab2:
            # Several preprocessed features can share one source variable;
            # their importances are averaged per variable.
            # NOTE(review): averaging (rather than summing) is kept as in the
            # original -- confirm this is the intended aggregation.
            importance_table = (
                pd.DataFrame({"variable":feature_names, "importance":feature_imp})
                .groupby("variable")
                .mean()
                .reset_index()
            )
            importance_table["importance"] = importance_table["importance"].round(2)
            importance_table = importance_table.sort_values(by=["importance"], ascending=False)
            importance_chart = px.bar(importance_table, x="importance", y="variable", color="importance")
            st.plotly_chart(importance_chart, use_container_width=True)