diff --git "a/src/classifier.ipynb" "b/src/classifier.ipynb" --- "a/src/classifier.ipynb" +++ "b/src/classifier.ipynb" @@ -12,16 +12,7 @@ "cell_type": "code", "execution_count": 1, "metadata": {}, - "outputs": [ - { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/mehdi/miniconda3/envs/adc/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" - ] - } - ], + "outputs": [], "source": [ "import pickle\n", "\n", @@ -29,15 +20,12 @@ "import matplotlib.pyplot as plt\n", "import numpy as np\n", "import pandas as pd\n", - "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.linear_model import LogisticRegression\n", - "from sklearn.model_selection import RandomizedSearchCV\n", - "from sklearn.preprocessing import LabelEncoder\n", "import torch\n", "from transformers import AutoModel, AutoTokenizer\n", "import xgboost as xgb\n", "\n", - "from utils import evaluate_predictions" + "from utils import evaluate_predictions, plot_confusion_matrix" ] }, { @@ -599,206 +587,13 @@ "cell_type": "markdown", "metadata": {}, "source": [ - "We can try different models. \n", - "\n", - "For the ensemble models, we can do a randomized or grid search to tune the hyperparameters. We will use a 5-fold cross validation strategy, and optimize for the macro averaged f1 score (because we want to give an equal importance to each class, regardless of how many observations each one has)." - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.3.1 Logistic Regression " - ] - }, - { - "cell_type": "code", - "execution_count": 32, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
LogisticRegression(class_weight='balanced', max_iter=1000,\n",
-       "                   multi_class='multinomial', random_state=2024)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "LogisticRegression(class_weight='balanced', max_iter=1000,\n", - " multi_class='multinomial', random_state=2024)" - ] - }, - "execution_count": 32, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "lr_model = LogisticRegression(multi_class='multinomial', \n", - " class_weight=\"balanced\", \n", - " max_iter=1000, \n", - " random_state=2024)\n", - "lr_model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.3.2 Random Forest" - ] - }, - { - "cell_type": "code", - "execution_count": 39, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RandomizedSearchCV(cv=5,\n",
-       "                   estimator=RandomForestClassifier(class_weight='balanced',\n",
-       "                                                    random_state=2024),\n",
-       "                   n_iter=20,\n",
-       "                   param_distributions={'max_depth': [3, 4, 5, 6, 7, 8],\n",
-       "                                        'n_estimators': [100, 150, 200, 250,\n",
-       "                                                         300, 400, 500]},\n",
-       "                   scoring='f1_macro')
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "RandomizedSearchCV(cv=5,\n", - " estimator=RandomForestClassifier(class_weight='balanced',\n", - " random_state=2024),\n", - " n_iter=20,\n", - " param_distributions={'max_depth': [3, 4, 5, 6, 7, 8],\n", - " 'n_estimators': [100, 150, 200, 250,\n", - " 300, 400, 500]},\n", - " scoring='f1_macro')" - ] - }, - "execution_count": 39, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rf_model = RandomForestClassifier(class_weight=\"balanced\", random_state=2024)\n", - "parameters = {\n", - " \"n_estimators\": [100, 150, 200, 250, 300, 400, 500],\n", - " \"max_depth\": [3, 4, 5, 6, 7, 8]\n", - "}\n", - "rf_search = RandomizedSearchCV(estimator=rf_model, param_distributions=parameters, \n", - " scoring=\"f1_macro\", cv=5, n_iter=20)\n", - "rf_search.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 40, - "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Best Parameters: {'n_estimators': 400, 'max_depth': 8}\n", - "Best Score: 0.15591886021384346\n" - ] - } - ], - "source": [ - "print(\"Best Parameters:\", rf_search.best_params_)\n", - "print(\"Best Score:\", rf_search.best_score_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "#### 2.3.3 XGBoost" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "For XGBoost, we first need to encode the target variable." + "We will start simple, and use a logistic regression to see what we can achieve with those embeddings." 
] }, { "cell_type": "code", "execution_count": 4, "metadata": {}, - "outputs": [], - "source": [ - "label_encoder = LabelEncoder()\n", - "y_train_encoded = label_encoder.fit_transform(y_train)\n", - "y_test_encoded = label_encoder.transform(y_test)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "xgb_model = xgb.XGBClassifier(device=\"cuda\", seed=2024)\n", - "parameters = {\n", - " \"n_estimators\" : [100, 150, 200, 300, 400, 450, 500],\n", - " \"max_depth\" : [3, 4, 5, 6, 7, 8],\n", - " \"learning_rate\": [0.1, 0.05, 0.01, 0.005, 0.001]\n", - "}\n", - "xgb_search = RandomizedSearchCV(estimator=xgb_model, param_distributions=parameters,\n", - " scoring=\"f1_macro\", cv=5, n_iter=20)\n", - "xgb_search.fit(X_train, y_train_encoded)" - ] - }, - { - "cell_type": "code", - "execution_count": null, - "metadata": {}, - "outputs": [], - "source": [ - "print(\"Best Parameters:\", xgb_search.best_params_)\n", - "print(\"Best Score (Macro Average F1):\", xgb_search.best_score_)" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "The best parameters and best score obtained are the following: \n", - "```\n", - "Best Parameters: {'n_estimators': 450, 'max_depth': 7, 'learning_rate': 0.1} \n", - "Best Score (Macro Average F1): 0.17356889596239114\n", - "```" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "## 3. Evaluating the Performance" - ] - }, - { - "cell_type": "markdown", - "metadata": {}, - "source": [ - "First, let's retrain the models with the best parameters we obtained." 
- ] - }, - { - "cell_type": "code", - "execution_count": 5, - "metadata": {}, "outputs": [ { "data": { @@ -812,7 +607,7 @@ " multi_class='multinomial', random_state=2024)" ] }, - "execution_count": 5, + "execution_count": 4, "metadata": {}, "output_type": "execute_result" } @@ -826,91 +621,10 @@ ] }, { - "cell_type": "code", - "execution_count": 6, - "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=400,\n",
-       "                       random_state=2024)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "RandomForestClassifier(class_weight='balanced', max_depth=8, n_estimators=400,\n", - " random_state=2024)" - ] - }, - "execution_count": 6, - "metadata": {}, - "output_type": "execute_result" - } - ], - "source": [ - "rf_model = RandomForestClassifier(class_weight=\"balanced\", \n", - " random_state=2024,\n", - " n_estimators=400,\n", - " max_depth=8)\n", - "rf_model.fit(X_train, y_train)" - ] - }, - { - "cell_type": "code", - "execution_count": 7, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "data": { - "text/html": [ - "
XGBClassifier(base_score=None, booster=None, callbacks=None,\n",
-       "              colsample_bylevel=None, colsample_bynode=None,\n",
-       "              colsample_bytree=None, device='cuda', early_stopping_rounds=None,\n",
-       "              enable_categorical=False, eval_metric=None, feature_types=None,\n",
-       "              gamma=None, grow_policy=None, importance_type=None,\n",
-       "              interaction_constraints=None, learning_rate=0.1, max_bin=None,\n",
-       "              max_cat_threshold=None, max_cat_to_onehot=None,\n",
-       "              max_delta_step=None, max_depth=7, max_leaves=None,\n",
-       "              min_child_weight=None, missing=nan, monotone_constraints=None,\n",
-       "              multi_strategy=None, n_estimators=450, n_jobs=None,\n",
-       "              num_parallel_tree=None, objective='multi:softprob', ...)
In a Jupyter environment, please rerun this cell to show the HTML representation or trust the notebook.
On GitHub, the HTML representation is unable to render, please try loading this page with nbviewer.org.
" - ], - "text/plain": [ - "XGBClassifier(base_score=None, booster=None, callbacks=None,\n", - " colsample_bylevel=None, colsample_bynode=None,\n", - " colsample_bytree=None, device='cuda', early_stopping_rounds=None,\n", - " enable_categorical=False, eval_metric=None, feature_types=None,\n", - " gamma=None, grow_policy=None, importance_type=None,\n", - " interaction_constraints=None, learning_rate=0.1, max_bin=None,\n", - " max_cat_threshold=None, max_cat_to_onehot=None,\n", - " max_delta_step=None, max_depth=7, max_leaves=None,\n", - " min_child_weight=None, missing=nan, monotone_constraints=None,\n", - " multi_strategy=None, n_estimators=450, n_jobs=None,\n", - " num_parallel_tree=None, objective='multi:softprob', ...)" - ] - }, - "execution_count": 7, - "metadata": {}, - "output_type": "execute_result" - } - ], "source": [ - "xgb_model = xgb.XGBClassifier(device=\"cuda\", \n", - " seed=2024,\n", - " n_estimators=450,\n", - " max_depth=7,\n", - " learning_rate=0.1)\n", - "xgb_model.fit(X_train, y_train_encoded)" + "## 3. 
Evaluating the Performance" ] }, { @@ -922,7 +636,7 @@ }, { "cell_type": "code", - "execution_count": 14, + "execution_count": 5, "metadata": {}, "outputs": [ { @@ -953,81 +667,37 @@ ] }, { - "cell_type": "code", - "execution_count": 15, + "cell_type": "markdown", "metadata": {}, - "outputs": [ - { - "name": "stdout", - "output_type": "stream", - "text": [ - "Random Forest\n", - "\n", - "Train set:\n", - "Accuracy: 0.5817619047619048\n", - "F1 macro average: 0.6302920544396868\n", - "F1 weighted average: 0.5817320656440126\n", - "--------------------------------------------------\n", - "Test set:\n", - "Accuracy: 0.2188\n", - "F1 macro average: 0.11020842424166737\n", - "F1 weighted average: 0.2054551695522176\n" - ] - } - ], "source": [ - "rf_train_preds = rf_model.predict(X_train)\n", - "rf_test_preds = rf_model.predict(X_test)\n", - "\n", - "evaluate_predictions(model=\"Random Forest\", \n", - " train_preds= rf_train_preds, y_train=y_train,\n", - " test_preds=rf_test_preds, y_test=y_test)" + "We see that the model is struggling to correctly classify the different dialects (which makes sense because everything is in Arabic at the end of the day). Let's have a look at the confusion matrix." ] }, { "cell_type": "code", - "execution_count": 16, + "execution_count": 6, "metadata": {}, "outputs": [ { - "name": "stderr", - "output_type": "stream", - "text": [ - "/home/mehdi/miniconda3/envs/adc/lib/python3.10/site-packages/xgboost/core.py:160: UserWarning: [12:38:31] WARNING: /home/conda/feedstock_root/build_artifacts/xgboost-split_1705650282415/work/src/common/error_msg.cc:58: Falling back to prediction using DMatrix due to mismatched devices. This might lead to higher memory usage and slower performance. 
XGBoost is running on: cuda:0, while the input data is on: cpu.\n", - "Potential solutions:\n", - "- Use a data structure that matches the device ordinal in the booster.\n", - "- Set the device for booster before call to inplace_predict.\n", - "\n", - "This warning will only be shown once.\n", - "\n", - " warnings.warn(smsg, UserWarning)\n" - ] - }, - { - "name": "stdout", - "output_type": "stream", - "text": [ - "XgBoost\n", - "\n", - "Train set:\n", - "Accuracy: 0.9998571428571429\n", - "F1 macro average: 0.9998485323510583\n", - "F1 weighted average: 0.9998571499183782\n", - "--------------------------------------------------\n", - "Test set:\n", - "Accuracy: 0.3552\n", - "F1 macro average: 0.13665190979200587\n", - "F1 weighted average: 0.288613804297705\n" - ] + "data": { + "image/png": "", + "text/plain": [ + "
" + ] + }, + "metadata": {}, + "output_type": "display_data" } ], "source": [ - "xgb_train_preds = xgb_model.predict(X_train)\n", - "xgb_test_preds = xgb_model.predict(X_test)\n", - "\n", - "evaluate_predictions(model=\"XgBoost\", \n", - " train_preds= xgb_train_preds, y_train=y_train_encoded,\n", - " test_preds=xgb_test_preds, y_test=y_test_encoded)" + "plot_confusion_matrix(y_test, lr_test_preds)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "From the confusion matrix, we see that the model is only really able to detect Egyptian Arabic, and, to a lesser extent, Iraqi and Algerian." ] } ],