{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "id": "a751d479-1500-41e2-8c01-252e849dad05",
   "metadata": {},
   "outputs": [],
   "source": [
    "import warnings\n",
    "warnings.filterwarnings(\"ignore\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c",
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import matplotlib.pyplot as plt \n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.pipeline import make_pipeline\n",
    "from sklearn.linear_model import LogisticRegression\n",
    "from sklearn.naive_bayes import MultinomialNB\n",
    "from sklearn.svm import SVC\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.metrics import classification_report,accuracy_score\n",
    "import numpy as np\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.preprocessing import OneHotEncoder\n",
    "from sklearn.compose import ColumnTransformer\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "from sklearn.model_selection import train_test_split\n",
    "from sklearn.metrics import classification_report, accuracy_score\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "import pickle"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20",
   "metadata": {},
   "source": [
    "## Sub Products"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578",
   "metadata": {},
   "source": [
    "### Model"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a",
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.pipeline import Pipeline\n",
    "from sklearn.feature_extraction.text import TfidfVectorizer\n",
    "from sklearn.metrics import accuracy_score, classification_report\n",
    "from sklearn.utils.class_weight import compute_class_weight\n",
    "\n",
    "def train_model(training_df, validation_df, subproduct_to_predict, classifier_model, subproducts_to_drop=None, random_state=None):\n",
    "    # Drop specified subproducts from training and validation dataframes\n",
    "    if subproducts_to_drop:\n",
    "        training_df = training_df[~training_df['Sub-product'].isin(subproducts_to_drop)]\n",
    "        validation_df = validation_df[~validation_df['Sub-product'].isin(subproducts_to_drop)]\n",
    "    \n",
    "    # Compute class weights\n",
    "    class_weights = compute_class_weight('balanced', classes=np.unique(training_df['Sub-product']), y=training_df['Sub-product'])\n",
    "    \n",
    "    # Convert class weights to dictionary format\n",
    "    class_weight = {label: weight for label, weight in zip(np.unique(training_df['Sub-product']), class_weights)}\n",
    "    \n",
    "    # Define a default class weight for missing classes\n",
    "    default_class_weight = 0.5\n",
    "    \n",
    "    # Assign default class weight for missing classes\n",
    "    for label in np.unique(training_df['Sub-product']):\n",
    "        if label not in class_weight:\n",
    "            class_weight[label] = default_class_weight\n",
    "    \n",
    "    # Define the pipeline\n",
    "    pipeline = Pipeline([\n",
    "        ('tfidf', TfidfVectorizer()),\n",
    "        ('classifier', classifier_model)\n",
    "    ])\n",
    "    \n",
    "    # Train the pipeline\n",
    "    pipeline.fit(training_df['Consumer complaint narrative'], training_df['Sub-product'])\n",
    "    \n",
    "    # Make predictions on the validation set\n",
    "    y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])\n",
    "    \n",
    "    # Evaluate the pipeline\n",
    "    accuracy = accuracy_score(validation_df['Sub-product'], y_pred)\n",
    "    print(\"Accuracy:\", accuracy)\n",
    "    print(\"\\nClassification Report:\")\n",
    "    print(classification_report(validation_df['Sub-product'], y_pred))\n",
    "    \n",
    "    return pipeline\n"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3",
   "metadata": {},
   "source": [
    "#### Debt Collection"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "id": "6a2e4857-31c7-4b57-a25c-e9e36473c033",
   "metadata": {},
   "outputs": [],
   "source": [
    "debt_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_train_data.csv')\n",
    "debt_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_val_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "id": "7fb6be2b-244f-4232-972c-9772128890ca",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>Consumer complaint narrative</th>\n",
       "      <th>Product</th>\n",
       "      <th>Sub-product</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>{$37.00} on XXXX XXXX XXXX I paid for gas thro...</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>Other debt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>Debt from XXXX XXXX is result of identity thef...</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>Credit card debt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>My son attended XXXX XXXX XXXX XXXX for severa...</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>Medical debt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>XXXX is claiming I owe a debt for utilities ba...</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>Other debt</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>This debt collector engaged in abusive, decept...</td>\n",
       "      <td>Debt collection</td>\n",
       "      <td>I do not know</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "</div>"
      ],
      "text/plain": [
       "                        Consumer complaint narrative          Product  \\\n",
       "0  {$37.00} on XXXX XXXX XXXX I paid for gas thro...  Debt collection   \n",
       "1  Debt from XXXX XXXX is result of identity thef...  Debt collection   \n",
       "2  My son attended XXXX XXXX XXXX XXXX for severa...  Debt collection   \n",
       "3  XXXX is claiming I owe a debt for utilities ba...  Debt collection   \n",
       "4  This debt collector engaged in abusive, decept...  Debt collection   \n",
       "\n",
       "        Sub-product  \n",
       "0        Other debt  \n",
       "1  Credit card debt  \n",
       "2      Medical debt  \n",
       "3        Other debt  \n",
       "4     I do not know  "
      ]
     },
     "execution_count": 5,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "debt_training_df.head()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "a14dbafd-6f1b-49cb-9712-434055da84f1",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sub-product\n",
       "Other debt                 2056\n",
       "I do not know              1530\n",
       "Credit card debt           1139\n",
       "Medical debt                726\n",
       "Auto debt                   397\n",
       "Telecommunications debt     267\n",
       "Rental debt                 122\n",
       "Mortgage debt                94\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 6,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "debt_training_df['Sub-product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "b78398b7-d027-403f-acf4-fa580d113b02",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.6633986928104575\n",
      "\n",
      "Classification Report:\n",
      "                         precision    recall  f1-score   support\n",
      "\n",
      "              Auto debt       0.95      0.48      0.64        44\n",
      "       Credit card debt       0.59      0.96      0.73       127\n",
      "           Medical debt       0.77      0.62      0.68        81\n",
      "          Mortgage debt       1.00      0.40      0.57        10\n",
      "            Rental debt       0.67      0.14      0.24        14\n",
      "Telecommunications debt       1.00      0.13      0.24        30\n",
      "\n",
      "               accuracy                           0.66       306\n",
      "              macro avg       0.83      0.46      0.52       306\n",
      "           weighted avg       0.75      0.66      0.63       306\n",
      "\n"
     ]
    }
   ],
   "source": [
    "\n",
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
    "trained_model_d = train_model(debt_training_df, debt_val_df, 'Sub-product', rf_classifier, subproducts_to_drop=['Other debt', 'I do not know'], random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "85bbc3fe-50b0-4578-8e67-151861f839da",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('models/Debt_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_d, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "5c529ed8-3735-4494-9f90-6c005dfea6df",
   "metadata": {},
   "source": [
    "#### Loan/Mortgages"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 10,
   "id": "f33b26e9-4c5b-4498-ab23-a88aca5eb07f",
   "metadata": {},
   "outputs": [],
   "source": [
    "loans_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_train_data.csv')\n",
    "loans_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_val_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 11,
   "id": "c8dcc18b-f7bb-4edd-965a-8c58500a0ea6",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sub-product\n",
       "Loan                              1464\n",
       "Federal student loan servicing     914\n",
       "Conventional home mortgage         236\n",
       "Lease                              186\n",
       "FHA mortgage                        94\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 11,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "loans_training_df['Sub-product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 12,
   "id": "b0da7a52-e00a-413a-80be-2e8221851275",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.8757763975155279\n",
      "\n",
      "Classification Report:\n",
      "                                precision    recall  f1-score   support\n",
      "\n",
      "    Conventional home mortgage       0.81      0.50      0.62        26\n",
      "                  FHA mortgage       1.00      0.20      0.33        10\n",
      "Federal student loan servicing       1.00      0.96      0.98       102\n",
      "                         Lease       1.00      0.29      0.44        21\n",
      "                          Loan       0.81      1.00      0.90       163\n",
      "\n",
      "                      accuracy                           0.88       322\n",
      "                     macro avg       0.93      0.59      0.65       322\n",
      "                  weighted avg       0.89      0.88      0.85       322\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
    "trained_model_l = train_model(loans_training_df, loans_val_df, 'Sub-product', rf_classifier, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 13,
   "id": "a668b946-da36-410f-b474-f8a311952c5d",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('models/loan_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_l, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "74796ebf-9934-46d2-a1b7-d6672dea727c",
   "metadata": {},
   "source": [
    "#### Checking or savings account"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 14,
   "id": "1cc65f08-96c8-4458-8703-b84b7554a04c",
   "metadata": {},
   "outputs": [],
   "source": [
    "cs_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_train_data.csv')\n",
    "cs_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_val_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 15,
   "id": "240b2bcd-3839-4584-8a63-952fa17f9715",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sub-product\n",
       "Checking account                    13500\n",
       "Savings account                      1391\n",
       "Other banking product or service     1158\n",
       "CD (Certificate of Deposit)           176\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 15,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cs_training_df['Sub-product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 16,
   "id": "3170c0c8-0dac-4755-aebf-dca9aa7f4dee",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.940099833610649\n",
      "\n",
      "Classification Report:\n",
      "                                  precision    recall  f1-score   support\n",
      "\n",
      "     CD (Certificate of Deposit)       0.95      0.95      0.95        19\n",
      "                Checking account       0.93      1.00      0.97      1500\n",
      "Other banking product or service       1.00      0.60      0.75       129\n",
      "                 Savings account       0.99      0.65      0.79       155\n",
      "\n",
      "                        accuracy                           0.94      1803\n",
      "                       macro avg       0.97      0.80      0.86      1803\n",
      "                    weighted avg       0.94      0.94      0.93      1803\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
    "trained_model_cs = train_model(cs_training_df, cs_val_df, 'Sub-product', rf_classifier, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 17,
   "id": "59c87ff1-d7de-41a9-9e0a-33630bff1c18",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('models/Checking_saving_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cs, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "fe443859-4be6-4b87-be79-22487aaf5b3b",
   "metadata": {},
   "source": [
    "#### 'Credit/Prepaid Card'"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "id": "31a70db8-06cb-4fb0-8d45-a7451aa81b0e",
   "metadata": {},
   "outputs": [],
   "source": [
    "cp_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_train_data.csv')\n",
    "cp_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_val_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 27,
   "id": "0e70a22d-01f9-4f59-a903-286a05eb5179",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sub-product\n",
       "General-purpose credit card or charge card    13320\n",
       "Store credit card                              2232\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 27,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cp_training_df['Sub-product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "id": "ef3b03f6-8207-4292-8ce2-e6ca5695c606",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9427414690572585\n",
      "\n",
      "Classification Report:\n",
      "                                            precision    recall  f1-score   support\n",
      "\n",
      "General-purpose credit card or charge card       0.94      1.00      0.97      1481\n",
      "                         Store credit card       1.00      0.60      0.75       248\n",
      "\n",
      "                                  accuracy                           0.94      1729\n",
      "                                 macro avg       0.97      0.80      0.86      1729\n",
      "                              weighted avg       0.95      0.94      0.94      1729\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
    "trained_model_cp = train_model(cp_training_df, cp_val_df, 'Sub-product', rf_classifier, random_state=42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 21,
   "id": "ac3f39d0-8cb8-457e-9db7-510cc5a99830",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cp, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "0787d4eb-9673-417b-91d1-cc98becd037e",
   "metadata": {},
   "source": [
    "#### Credit_reporting_df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 22,
   "id": "8e074864-16f6-4fd5-8bfe-b054aeb0fc2a",
   "metadata": {},
   "outputs": [],
   "source": [
    "cr_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_train_data.csv')\n",
    "cr_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_val_data.csv')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "id": "57257613-7dde-4561-942c-f559d2159744",
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "Sub-product\n",
       "Credit reporting                  13500\n",
       "Other personal consumer report      661\n",
       "Name: count, dtype: int64"
      ]
     },
     "execution_count": 23,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "cr_training_df['Sub-product'].value_counts()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "id": "cca27513-501f-4257-a4b1-0e13a3604250",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Accuracy: 0.9841168996188056\n",
      "\n",
      "Classification Report:\n",
      "                                precision    recall  f1-score   support\n",
      "\n",
      "              Credit reporting       0.99      1.00      0.99      1500\n",
      "Other personal consumer report       0.93      0.72      0.81        74\n",
      "\n",
      "                      accuracy                           0.98      1574\n",
      "                     macro avg       0.96      0.86      0.90      1574\n",
      "                  weighted avg       0.98      0.98      0.98      1574\n",
      "\n"
     ]
    }
   ],
   "source": [
    "from sklearn.ensemble import RandomForestClassifier\n",
    "\n",
    "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
    "trained_model_cr = train_model(cr_training_df, cr_val_df, 'Sub-product', rf_classifier, random_state=42)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "id": "3cbb9aa5-6c0c-4b59-a181-7431e8fc60fc",
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cr, f)"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "9aea8fdd-ec86-40bc-b417-ba9169edabd9",
   "metadata": {},
   "source": [
    "with open('models/Debt_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_d, f)\n",
    "\n",
    "with open('models/loan_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_l, f)\n",
    "\n",
    "with open('models/Checking_saving_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cs, f)\n",
    "\n",
    "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cp, f)\n",
    "\n",
    "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n",
    "    pickle.dump(trained_model_cr, f)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.19"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}