diff --git "a/src/classifier.ipynb" "b/src/classifier.ipynb" --- "a/src/classifier.ipynb" +++ "b/src/classifier.ipynb" @@ -12,16 +12,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/mehdi/miniconda3/envs/adc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import pickle\n", "\n", @@ -29,15 +20,12 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import RandomizedSearchCV\n", - "from sklearn.preprocessing import LabelEncoder\n", "import torch\n", "from transformers import AutoModel, AutoTokenizer\n", "import xgboost as xgb\n", "\n", - "from utils import evaluate_predictions" + "from utils import evaluate_predictions, plot_confusion_matrix" ] }, { @@ -599,206 +587,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can try different models. \n", - "\n", - "For the ensemble models, we can do a randomized or grid search to tune the hyperparameters. We will use a 5-fold cross validation strategy, and optimize for the macro averaged f1 score (because we want to give an equal importance to each class, regardless of how many observations each one has)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.3.1 Logistic Regression " - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
LogisticRegression(class_weight='balanced', max_iter=1000,\n", - " multi_class='multinomial', random_state=2024)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
LogisticRegression(class_weight='balanced', max_iter=1000,\n", - " multi_class='multinomial', random_state=2024)
RandomizedSearchCV(cv=5,\n", - " estimator=RandomForestClassifier(class_weight='balanced',\n", - " random_state=2024),\n", - " n_iter=20,\n", - " param_distributions={'max_depth': [3, 4, 5, 6, 7, 8],\n", - " 'n_estimators': [100, 150, 200, 250,\n", - " 300, 400, 500]},\n", - " scoring='f1_macro')In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomizedSearchCV(cv=5,\n", - " estimator=RandomForestClassifier(class_weight='balanced',\n", - " random_state=2024),\n", - " n_iter=20,\n", - " param_distributions={'max_depth': [3, 4, 5, 6, 7, 8],\n", - " 'n_estimators': [100, 150, 200, 250,\n", - " 300, 400, 500]},\n", - " scoring='f1_macro')
RandomForestClassifier(class_weight='balanced', random_state=2024)
RandomForestClassifier(class_weight='balanced', random_state=2024)
RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=400,\n", - " random_state=2024)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=400,\n", - " random_state=2024)
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", - " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=None, device='cuda', early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=7, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=450, n_jobs=None,\n", - " num_parallel_tree=None, objective='multi:softprob', ...)In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
XGBClassifier(base_score=None, booster=None, callbacks=None,\n", - " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=None, device='cuda', early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=7, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=450, n_jobs=None,\n", - " num_parallel_tree=None, objective='multi:softprob', ...)