{ "cells": [ { "cell_type": "markdown", "metadata": {}, "source": [ "# 1. Introduction\n", "\n", "This notebook trains models on the Porto Alegre traffic accidents data after the initial cleaning, processing, and transformation step, which was done in a notebook in the `data` folder. In fact, we will build 3 models:\n", "\n", "1. Predict the probability of injured people.\n", "\n", "2. Predict the probability of seriously injured people.\n", "\n", "3. Predict the probability of people dying in the accident or after it.\n", "\n", "The training procedure is the same for all three models: we only need to filter the data according to the chosen target and analyze the results accordingly." ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 2. Data Loading" ] }, { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
012
latitude-30.009614-30.0403-30.069
longitude-51.185581-51.1958-51.1437
feridosTrueTrueTrue
feridos_grFalseFalseFalse
fataisFalseFalseFalse
caminhaoFalseFalseFalse
motoTrueTrueFalse
carsTrueTrueTrue
transportFalseFalseFalse
othersFalseFalseFalse
holidayFalseTrueTrue
day_1000
day_2000
day_3000
day_4000
day_5100
day_6011
hour_1000
hour_2000
hour_3000
hour_4000
hour_5000
hour_6000
hour_7000
hour_8000
hour_9000
hour_10010
hour_11000
hour_12000
hour_13000
hour_14000
hour_15000
hour_16000
hour_17000
hour_18000
hour_19101
hour_20000
hour_21000
hour_22000
hour_23000
type_ATROPELAMENTO001
type_CHOQUE000
type_COLISÃO000
type_OUTROS000
\n", "
" ], "text/plain": [ " 0 1 2\n", "latitude -30.009614 -30.0403 -30.069\n", "longitude -51.185581 -51.1958 -51.1437\n", "feridos True True True\n", "feridos_gr False False False\n", "fatais False False False\n", "caminhao False False False\n", "moto True True False\n", "cars True True True\n", "transport False False False\n", "others False False False\n", "holiday False True True\n", "day_1 0 0 0\n", "day_2 0 0 0\n", "day_3 0 0 0\n", "day_4 0 0 0\n", "day_5 1 0 0\n", "day_6 0 1 1\n", "hour_1 0 0 0\n", "hour_2 0 0 0\n", "hour_3 0 0 0\n", "hour_4 0 0 0\n", "hour_5 0 0 0\n", "hour_6 0 0 0\n", "hour_7 0 0 0\n", "hour_8 0 0 0\n", "hour_9 0 0 0\n", "hour_10 0 1 0\n", "hour_11 0 0 0\n", "hour_12 0 0 0\n", "hour_13 0 0 0\n", "hour_14 0 0 0\n", "hour_15 0 0 0\n", "hour_16 0 0 0\n", "hour_17 0 0 0\n", "hour_18 0 0 0\n", "hour_19 1 0 1\n", "hour_20 0 0 0\n", "hour_21 0 0 0\n", "hour_22 0 0 0\n", "hour_23 0 0 0\n", "type_ATROPELAMENTO 0 0 1\n", "type_CHOQUE 0 0 0\n", "type_COLISÃO 0 0 0\n", "type_OUTROS 0 0 0" ] }, "execution_count": 1, "metadata": {}, "output_type": "execute_result" } ], "source": [ "import os.path as path\n", "from pandas import read_csv\n", "\n", "# Absolute path of ../data/accidents_trans.csv, resolved from the current working directory\n", "file_csv = path.join(path.abspath(\"../\"), \"data\", \"accidents_trans.csv\")\n", "\n", "accidents_trans = read_csv(file_csv)\n", "\n", "# Transposed preview of the first 3 rows: one line per column of the file\n", "accidents_trans.head(3).T" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 3. 
Data Preparation" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [], "source": [ "import joblib as jb # Use to save the model to deploy\n", "from sklearn.preprocessing import StandardScaler\n", "from sklearn.model_selection import train_test_split" ] }, { "cell_type": "code", "execution_count": 3, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Our model to predict the probability of feridos will be create with 68218 rows and 41 features.\n" ] } ], "source": [ "outputs = [\"feridos\", \"feridos_gr\", \"fatais\"]\n", "inputs = [col for col in accidents_trans.columns if col not in outputs]\n", "\n", "X = accidents_trans[inputs].copy()\n", "Y = accidents_trans[outputs].copy()\n", "\n", "# Target to model; it is also used to name the saved scaler and model files\n", "output = \"feridos\"\n", "\n", "# Fail early on a typo instead of silently falling back to the \"feridos\" model\n", "if output not in outputs:\n", "    raise ValueError(f\"output must be one of {outputs}, got {output!r}\")\n", "\n", "# Filtering data considering the output: each severity level is modeled\n", "# only on the rows where the previous level is True\n", "if output == \"feridos_gr\":\n", "    X = X[Y[\"feridos\"]]\n", "    Y = Y.loc[Y[\"feridos\"], \"feridos_gr\"]\n", "elif output == \"fatais\":\n", "    X = X[Y[\"feridos_gr\"]]\n", "    Y = Y.loc[Y[\"feridos_gr\"], \"fatais\"]\n", "else:\n", "    Y = Y[\"feridos\"]\n", "\n", "print(f\"Our model to predict the probability of \" \\\n", "      f\"{output} will be create with {X.shape[0]} \" \\\n", "      f\"rows and {X.shape[1]} features.\")" ] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, "outputs": [], "source": [ "import csv\n", "\n", "# Save the feature names, in training order, as a single CSV row.\n", "# newline='' is required by the csv module; without it every row\n", "# ends with an extra \\r on Windows.\n", "with open(\"model_features.csv\", 'w', newline='') as f:\n", "    writer = csv.writer(f)\n", "    writer.writerow(X.columns)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "Since some of the models we will use are sensitive to feature scaling, we need to scale our data first. Besides this, we need to save our scaler for future use." 
] }, { "cell_type": "code", "execution_count": 5, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['c:\\\\Users\\\\grego\\\\OneDrive\\\\Documentos\\\\Documentos Pessoais\\\\00_DataCamp\\\\09_VSC\\\\poa_car_accidents\\\\poa_car_accidents\\\\model\\\\scaler_feridos.pkl']" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Random state used by the split below (my lucky number :-)\n", "lucky_num = 7\n", "\n", "# Hold out 30% of the rows as the test set; the rest trains the model\n", "X_train, X_test, y_train, y_test = train_test_split(\n", "    X,\n", "    Y,\n", "    test_size=0.30,\n", "    random_state=lucky_num,\n", "    shuffle=True, # Used because our data is sorted by date\n", "    stratify=Y) # Used because our data is unbalanced\n", "\n", "# Fit the scaler on the training rows only, then apply it to both sets\n", "scaler = StandardScaler().fit(X_train)\n", "X_train = scaler.transform(X_train)\n", "X_test = scaler.transform(X_test)\n", "\n", "# Save the fitted scaler in the current working directory\n", "file_name = f\"scaler_{output}.pkl\"\n", "jb.dump(scaler, path.join(path.abspath(\"./\"), file_name))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "# 4. 
Data Modeling\n", "\n", "We will create the following models and use cross-validation to evaluate them:\n", "\n", "- Logistic Regression;\n", "\n", "- Gaussian Naive Bayes;\n", "\n", "- K Neighbors;\n", "\n", "- Random Forest;\n", "\n", "- Gradient Boosting; and,\n", "\n", "- XGBoost.\n", "\n", "We will use two scores to select and evaluate our models:\n", "\n", "- F1 score: the harmonic mean of precision (the share of predicted positives that are really positive) and recall (the share of real positives that the model identifies); and,\n", "\n", "- Brier score: the mean squared difference between the predicted probability and the actual outcome.\n", "\n", "However, we will look at other metrics to support our decision:\n", "\n", "- Accuracy;\n", "\n", "- ROC AUC; and,\n", "\n", "- Log loss (another way to quantify the quality of probability predictions).\n", "\n", "Finally, we will check for each model whether there is a hyperparameter to deal with the unbalanced output." ] }, { "cell_type": "code", "execution_count": 6, "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import xgboost as xgb\n", "from sklearn.naive_bayes import GaussianNB\n", "from sklearn.neighbors import KNeighborsClassifier\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.model_selection import cross_validate \n", "from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier\n", "from sklearn.metrics import accuracy_score, recall_score, precision_score, roc_auc_score, f1_score, brier_score_loss, log_loss\n", "\n", "scores = [\"accuracy\", \"f1\", \"precision\", \"recall\", \"roc_auc\", \"neg_brier_score\",\"neg_log_loss\"]" ] }, 
{ "cell_type": "code", "execution_count": 7, "metadata": {}, "outputs": [], "source": [ "def eval_model(cls, X=None, y=None) -> tuple:\n", "    \"\"\"Score a fitted binary classifier on a hold-out set.\n", "\n", "    Parameters\n", "    ----------\n", "    cls : fitted classifier exposing `predict` and `predict_proba`.\n", "    X, y : features and true labels to score on. When omitted, the\n", "        notebook-level `X_test` and `y_test` are used.\n", "\n", "    Returns\n", "    -------\n", "    tuple\n", "        (accuracy, f1, precision, recall, roc_auc, brier_score, log_loss).\n", "    \"\"\"\n", "    if X is None:\n", "        X = X_test\n", "    if y is None:\n", "        y = y_test\n", "\n", "    # Hard labels feed the label-based metrics; the probability of the\n", "    # positive class (column 1) feeds ROC AUC, Brier score and log loss\n", "    y_pred = cls.predict(X)\n", "    y_prob = cls.predict_proba(X)[:, 1]\n", "\n", "    # Calculating scores\n", "    accuracy = accuracy_score(y, y_pred)\n", "    f1 = f1_score(y, y_pred)\n", "    recall = recall_score(y, y_pred)\n", "    precision = precision_score(y, y_pred)\n", "    roc_auc = roc_auc_score(y, y_prob) # https://datascience.stackexchange.com/questions/114394/does-roc-auc-different-between-crossval-and-test-set-indicate-overfitting-or-oth\n", "    brier_score = brier_score_loss(y, y_prob)\n", "    log_loss_value = log_loss(y, y_prob)\n", "\n", "    return accuracy, f1, precision, recall, roc_auc, brier_score, log_loss_value\n", "\n", "def create_model(name: str, cls, cv=5) -> list:\n", "    \"\"\"Fit a classifier, cross-validate it and score it on the test set.\n", "\n", "    The classifier is fitted on the notebook-level `X_train` / `y_train`,\n", "    cross-validated on the same training data with the metrics listed in\n", "    `scores`, and scored on the test set through `eval_model`. A table\n", "    with one column per fold plus a `TestSet` column is displayed.\n", "\n", "    Parameters\n", "    ----------\n", "    name : label used in the caption of the displayed table.\n", "    cls : classifier to evaluate; it is fitted in place.\n", "    cv : forwarded to `cross_validate` (5 by default).\n", "\n", "    Returns\n", "    -------\n", "    list\n", "        [accuracy, f1, precision, recall, roc_auc, brier_score, log_loss]\n", "        measured on the test set.\n", "    \"\"\"\n", "    # Fitting the model that will be scored on the test set\n", "    cls.fit(X_train, y_train)\n", "\n", "    # Using cross-validation on the training data to evaluate the model\n", "    cls_cross = cross_validate(\n", "        estimator=cls,\n", "        X=X_train,\n", "        y=y_train,\n", "        cv=cv,\n", "        scoring=scores)\n", "\n", "    # One column per fold actually returned, so the labels always follow `cv`\n", "    n_folds = len(cls_cross[\"fit_time\"])\n", "    df_cv = pd.DataFrame.from_dict(\n", "        cls_cross,\n", "        orient='index',\n", "        columns=[f\"CV{i}\" for i in range(1, n_folds + 1)])\n", "\n", "    # Calculating the scores on the test set\n", "    test_scores = eval_model(cls)\n", "    accuracy, f1, precision, recall, roc_auc, brier_score, log_loss_value = test_scores\n", "\n", "    # Filling a dataframe for better presentation. Brier score and log loss\n", "    # are negated to match the `neg_*` scorers shown in the fold columns.\n", "    test_column = {\n", "        \"test_accuracy\": accuracy,\n", "        \"test_f1\": f1,\n", "        \"test_recall\": recall,\n", "        \"test_precision\": precision,\n", "        \"test_roc_auc\": roc_auc,\n", "        \"test_neg_brier_score\": -brier_score,\n", "        \"test_neg_log_loss\": -log_loss_value}\n", "    for row, value in test_column.items():\n", "        df_cv.at[row, \"TestSet\"] = value\n", "\n", "    caption = f\"{name} Validation Scores\"\n", "\n", "    display(df_cv.style.set_caption(caption))\n", "\n", "    return list(test_scores)" ] }, { "cell_type": "code", "execution_count": 8, "metadata": {}, 
"outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
LR Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time0.0823540.0802570.0893290.0947200.087742nan
score_time0.0160660.0176350.0201000.0182600.018356nan
test_accuracy0.8692280.8683910.8723560.8690050.8675390.865924
test_f10.8175840.8181160.8239200.8196110.8170110.814469
test_precision0.8541350.8461540.8505820.8443260.8444980.843439
test_recall0.7840340.7918770.7988800.7963010.7912580.787423
test_roc_auc0.9034180.9049700.9063770.9024050.9069390.904458
test_neg_brier_score-0.109808-0.109221-0.106382-0.110939-0.109709-0.110435
test_neg_log_loss-0.370200-0.366684-0.360534-0.372374-0.367532-0.370350
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
NB Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time0.0354100.0300150.0326390.0297520.030653nan
score_time0.0378260.0409930.0323760.0307670.028092nan
test_accuracy0.7684010.7633760.7651310.7715180.7722510.766637
test_f10.6670680.6542230.6609220.6758760.6688990.664795
test_precision0.7208850.7208360.7178980.7192540.7323330.717684
test_recall0.6207280.5988800.6123250.6374330.6155790.619166
test_roc_auc0.8522900.8471840.8437330.8518730.8560470.848834
test_neg_brier_score-0.206596-0.210362-0.211968-0.204214-0.202682-0.208278
test_neg_log_loss-1.668014-1.788896-1.917438-1.662381-1.670358-1.761326
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
KNN Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time0.0100020.0113120.0116210.0138430.011473nan
score_time1.6602691.3605701.6512961.7341291.823339nan
test_accuracy0.8423200.8487070.8473300.8427230.8477490.843692
test_f10.7764920.7872180.7835510.7790530.7863650.779698
test_precision0.8257580.8298670.8335440.8200680.8266910.823778
test_recall0.7327730.7487390.7392160.7419450.7497900.740097
test_roc_auc0.8673300.8699240.8729510.8668680.8722770.872155
test_neg_brier_score-0.130989-0.127425-0.126777-0.130655-0.127401-0.128215
test_neg_log_loss-2.083997-1.959589-1.815403-2.007178-1.929602-1.877810
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
RF Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time4.0996654.0612004.0901164.0557054.050387nan
score_time0.3903650.3892440.3921080.3873580.400155nan
test_accuracy0.8561410.8592820.8615710.8535080.8553930.856152
test_f10.8003490.8052170.8076810.7986760.7994770.800623
test_precision0.8315220.8342340.8401940.8210060.8297170.830547
test_recall0.7714290.7781510.7775910.7775290.7713650.772781
test_roc_auc0.8901220.8905610.8973210.8873960.8910780.893466
test_neg_brier_score-0.116884-0.114867-0.111343-0.117719-0.116295-0.115285
test_neg_log_loss-0.607395-0.579640-0.536542-0.614554-0.631888-0.562042
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
GBC Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time4.5914374.4372134.1210674.1421804.113901nan
score_time0.0559930.0481130.0494920.0501630.055706nan
test_accuracy0.8711130.8732070.8786390.8700520.8701570.871054
test_f10.8171690.8208310.8277090.8170430.8171090.817560
test_precision0.8697440.8698650.8818500.8621660.8626600.867518
test_recall0.7705880.7770310.7798320.7764080.7761280.773042
test_roc_auc0.9070410.9080410.9119300.9062830.9093480.908648
test_neg_brier_score-0.105054-0.103463-0.099338-0.104658-0.104459-0.104280
test_neg_log_loss-0.352792-0.348499-0.338605-0.351285-0.350193-0.350152
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" }, { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
XGB Validation Scores
 CV1CV2CV3CV4CV5TestSet
fit_time3.8020293.0367642.9796472.1772322.287098nan
score_time0.0690130.0718190.0494020.0572790.050020nan
test_accuracy0.8602240.8518480.8541360.8532980.8560210.854344
test_f10.8141450.8047470.8082590.8073700.8103710.808283
test_precision0.8093000.7930380.7945870.7926570.7979360.795443
test_recall0.8190480.8168070.8224090.8226390.8232000.821545
test_roc_auc0.9084070.9063790.9108330.9075070.9089590.908681
test_neg_brier_score-0.116893-0.119319-0.116034-0.119313-0.118294-0.118266
test_neg_log_loss-0.393473-0.395306-0.384403-0.397352-0.394224-0.392001
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# XGB hyperparameter that deals with unbalanced\n", "scale_pos_weight = Y.mean()**-1\n", "\n", "# Creating the model objects\n", "cls_lr = LogisticRegression(\n", " class_weight=\"balanced\", # Hyperparameter to deal with unbalanced output\n", " random_state=lucky_num)\n", "# cls_svm = SVC(random_state=lucky_num) # Remove due its resource consumption and worst results\n", "cls_NB = GaussianNB()\n", "cls_knn = KNeighborsClassifier()\n", "cls_rf = RandomForestClassifier(\n", " random_state=lucky_num,\n", " class_weight=\"balanced_subsample\") # Hyperparameter to deal with unbalanced output\n", "cls_gbc = GradientBoostingClassifier(random_state=lucky_num)\n", "cls_xgb = xgb.XGBClassifier(\n", " objective=\"binary:logistic\",\n", " verbose=None,\n", " random_state=lucky_num,\n", " scale_pos_weight = scale_pos_weight)\n", "\n", "# Lists to iterate on our modeling function\n", "cls_name = [\"LR\", \"NB\", \"KNN\", \"RF\", \"GBC\", \"XGB\"]\n", "cls_list = [cls_lr, cls_NB, cls_knn, cls_rf, cls_gbc, cls_xgb]\n", "\n", "mdl_summaries = []\n", "for name, inst in zip(cls_name, cls_list):\n", " mdl_list = create_model(name, inst)\n", " mdl_list = [name] + mdl_list\n", " mdl_summaries.append(mdl_list)\n", "\n", "df_mdl = pd.DataFrame(\n", " mdl_summaries,\n", " columns=[\n", " \"model\",\n", " \"test_accuracy\",\n", " \"test_f1\",\n", " \"test_precision\",\n", " \"test_recall\",\n", " \"test_roc_auc\",\n", " \"test_brier\",\n", " \"test_log_loss\"])" ] }, { "cell_type": "code", "execution_count": 9, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Test set validation scores
 modeltest_accuracytest_f1test_precisiontest_recalltest_roc_auctest_briertest_log_loss
0GBC0.8710540.8175600.8675180.7730420.9086480.1042800.350152
1LR0.8659240.8144690.8434390.7874230.9044580.1104350.370350
2XGB0.8543440.8082830.7954430.8215450.9086810.1182660.392001
3RF0.8561520.8006230.8305470.7727810.8934660.1152850.562042
4KNN0.8436920.7796980.8237780.7400970.8721550.1282151.877810
5NB0.7666370.6647950.7176840.6191660.8488340.2082781.761326
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "df_mdl.sort_values(\n", " \"test_f1\",\n", " ascending=False,\n", " inplace=True,\n", " ignore_index=True)\n", "\n", "display(df_mdl.style.set_caption(\"Test set validation scores\"))" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "GBC, LR, XGB and RF preset great results! We have two ways here: hyperparameters tunning or creating a composite model. Let's begin with the composite model.\n" ] }, { "cell_type": "code", "execution_count": 10, "metadata": {}, "outputs": [ { "data": { "text/html": [ "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Test set validation scores for Composite Model
 CV1CV2CV3CV4CV5TestSet
fit_time10.10961311.76601111.45081811.73763412.702598nan
score_time0.4905180.5326950.5294590.5490510.586749nan
test_accuracy0.8707990.8715320.8754970.8699480.8692150.869002
test_f10.8186890.8207970.8262970.8193190.8175310.817283
test_precision0.8609390.8574920.8635110.8520420.8540900.853645
test_recall0.7803920.7871150.7921570.7890170.7839730.783893
test_roc_auc0.9090220.9088900.9124180.9073150.9103400.910500
test_neg_brier_score-0.105818-0.105354-0.101743-0.106567-0.105957-0.105743
test_neg_log_loss-0.356051-0.353269-0.344184-0.357062-0.355010-0.353621
\n" ], "text/plain": [ "" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Selecting the models\n", "cls_name = [\"GBC\", \"XGB\", \"LR\", \"RF\"]\n", "cls_list = [cls_gbc, cls_xgb, cls_lr, cls_rf]\n", "\n", "# Composite model: soft voting over the predicted probabilities of the selected models\n", "cls_vot = VotingClassifier(list(zip(cls_name, cls_list)), voting=\"soft\")\n", "\n", "# Fit, cross-validate and score on the test set with the same routine used\n", "# for the individual models, instead of repeating its code here\n", "vot_scores = create_model(\"Composite Model\", cls_vot)" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "The composite model does not present any evidence of overfitting. For now, we will use it on our app." 
] }, { "cell_type": "code", "execution_count": 11, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "['c:\\\\Users\\\\grego\\\\OneDrive\\\\Documentos\\\\Documentos Pessoais\\\\00_DataCamp\\\\09_VSC\\\\poa_car_accidents\\\\poa_car_accidents\\\\model\\\\model_feridos.pkl']" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Save the fitted composite model in the current working directory,\n", "# named after the target it predicts\n", "file_name = f\"model_{output}.pkl\"\n", "jb.dump(cls_vot, path.join(path.abspath(\"./\"), file_name))" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3.10.6 64-bit", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.6" }, "orig_nbformat": 4, "vscode": { "interpreter": { "hash": "1372d04dbd71fdc5436c5d6e671c1b9287e750e86143c81b5a7ba0acaf653c5e" } } }, "nbformat": 4, "nbformat_minor": 2 }