{ "cells": [ { "cell_type": "code", "execution_count": 1, "id": "a751d479-1500-41e2-8c01-252e849dad05", "metadata": {}, "outputs": [], "source": [ "import warnings\n", "warnings.filterwarnings(\"ignore\")" ] }, { "cell_type": "code", "execution_count": 2, "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c", "metadata": {}, "outputs": [], "source": [ "import pandas as pd\n", "import matplotlib.pyplot as plt \n", "from sklearn.model_selection import train_test_split\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.pipeline import make_pipeline\n", "from sklearn.linear_model import LogisticRegression\n", "from sklearn.naive_bayes import MultinomialNB\n", "from sklearn.svm import SVC\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.metrics import classification_report,accuracy_score\n", "import numpy as np\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.preprocessing import OneHotEncoder\n", "from sklearn.compose import ColumnTransformer\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.ensemble import RandomForestClassifier\n", "from sklearn.model_selection import train_test_split\n", "from sklearn.metrics import classification_report, accuracy_score\n", "from sklearn.utils.class_weight import compute_class_weight\n", "import pickle" ] }, { "cell_type": "markdown", "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20", "metadata": {}, "source": [ "## Sub Products" ] }, { "cell_type": "markdown", "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578", "metadata": {}, "source": [ "### Model" ] }, { "cell_type": "code", "execution_count": 3, "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a", "metadata": {}, "outputs": [], "source": [ "from sklearn.pipeline import Pipeline\n", "from sklearn.feature_extraction.text import TfidfVectorizer\n", "from sklearn.metrics import accuracy_score, classification_report\n", "from sklearn.utils.class_weight import compute_class_weight\n", "\n", "def train_model(training_df, validation_df, subproduct_to_predict, classifier_model, subproducts_to_drop=None, random_state=None):\n", " # Drop specified subproducts from training and validation dataframes\n", " if subproducts_to_drop:\n", " training_df = training_df[~training_df['Sub-product'].isin(subproducts_to_drop)]\n", " validation_df = validation_df[~validation_df['Sub-product'].isin(subproducts_to_drop)]\n", " \n", " # Compute class weights\n", " class_weights = compute_class_weight('balanced', classes=np.unique(training_df['Sub-product']), y=training_df['Sub-product'])\n", " \n", " # Convert class weights to dictionary format\n", " class_weight = {label: weight for label, weight in zip(np.unique(training_df['Sub-product']), class_weights)}\n", " \n", " # Define a default class weight for missing classes\n", " default_class_weight = 0.5\n", " \n", " # Assign default class weight for missing classes\n", " for label in np.unique(training_df['Sub-product']):\n", " if label not in class_weight:\n", " class_weight[label] = default_class_weight\n", " \n", " # Define the pipeline\n", " pipeline = Pipeline([\n", " ('tfidf', TfidfVectorizer()),\n", " ('classifier', classifier_model)\n", " ])\n", " \n", " # Train the pipeline\n", " pipeline.fit(training_df['Consumer complaint narrative'], training_df['Sub-product'])\n", " \n", " # Make predictions on the validation set\n", " y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])\n", " \n", " # Evaluate the pipeline\n", " accuracy = accuracy_score(validation_df['Sub-product'], y_pred)\n", " print(\"Accuracy:\", accuracy)\n", " print(\"\\nClassification Report:\")\n", " print(classification_report(validation_df['Sub-product'], y_pred))\n", " \n", " return pipeline\n" ] }, { "cell_type": "markdown", "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3", "metadata": {}, "source": [ "#### Debt Collection" ] }, { "cell_type": "code", "execution_count": 4, "id": "6a2e4857-31c7-4b57-a25c-e9e36473c033", "metadata": {}, "outputs": [], "source": [ "debt_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_train_data.csv')\n", "debt_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_val_data.csv')" ] }, { "cell_type": "code", "execution_count": 5, "id": "7fb6be2b-244f-4232-972c-9772128890ca", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
Consumer complaint narrativeProductSub-product
0{$37.00} on XXXX XXXX XXXX I paid for gas thro...Debt collectionOther debt
1Debt from XXXX XXXX is result of identity thef...Debt collectionCredit card debt
2My son attended XXXX XXXX XXXX XXXX for severa...Debt collectionMedical debt
3XXXX is claiming I owe a debt for utilities ba...Debt collectionOther debt
4This debt collector engaged in abusive, decept...Debt collectionI do not know
\n", "
" ], "text/plain": [ " Consumer complaint narrative Product \\\n", "0 {$37.00} on XXXX XXXX XXXX I paid for gas thro... Debt collection \n", "1 Debt from XXXX XXXX is result of identity thef... Debt collection \n", "2 My son attended XXXX XXXX XXXX XXXX for severa... Debt collection \n", "3 XXXX is claiming I owe a debt for utilities ba... Debt collection \n", "4 This debt collector engaged in abusive, decept... Debt collection \n", "\n", " Sub-product \n", "0 Other debt \n", "1 Credit card debt \n", "2 Medical debt \n", "3 Other debt \n", "4 I do not know " ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "debt_training_df.head()" ] }, { "cell_type": "code", "execution_count": 6, "id": "a14dbafd-6f1b-49cb-9712-434055da84f1", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sub-product\n", "Other debt 2056\n", "I do not know 1530\n", "Credit card debt 1139\n", "Medical debt 726\n", "Auto debt 397\n", "Telecommunications debt 267\n", "Rental debt 122\n", "Mortgage debt 94\n", "Name: count, dtype: int64" ] }, "execution_count": 6, "metadata": {}, "output_type": "execute_result" } ], "source": [ "debt_training_df['Sub-product'].value_counts()" ] }, { "cell_type": "code", "execution_count": 7, "id": "b78398b7-d027-403f-acf4-fa580d113b02", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.6633986928104575\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " Auto debt 0.95 0.48 0.64 44\n", " Credit card debt 0.59 0.96 0.73 127\n", " Medical debt 0.77 0.62 0.68 81\n", " Mortgage debt 1.00 0.40 0.57 10\n", " Rental debt 0.67 0.14 0.24 14\n", "Telecommunications debt 1.00 0.13 0.24 30\n", "\n", " accuracy 0.66 306\n", " macro avg 0.83 0.46 0.52 306\n", " weighted avg 0.75 0.66 0.63 306\n", "\n" ] } ], "source": [ "\n", "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n", "trained_model_d = train_model(debt_training_df, debt_val_df, 'Sub-product', rf_classifier, subproducts_to_drop=['Other debt', 'I do not know'], random_state=42)\n" ] }, { "cell_type": "code", "execution_count": 9, "id": "85bbc3fe-50b0-4578-8e67-151861f839da", "metadata": {}, "outputs": [], "source": [ "with open('models/Debt_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_d, f)" ] }, { "cell_type": "markdown", "id": "5c529ed8-3735-4494-9f90-6c005dfea6df", "metadata": {}, "source": [ "#### Loan/Mortgages" ] }, { "cell_type": "code", "execution_count": 10, "id": "f33b26e9-4c5b-4498-ab23-a88aca5eb07f", "metadata": {}, "outputs": [], "source": [ "loans_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_train_data.csv')\n", "loans_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_val_data.csv')" ] }, { "cell_type": "code", "execution_count": 11, "id": "c8dcc18b-f7bb-4edd-965a-8c58500a0ea6", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sub-product\n", "Loan 1464\n", "Federal student loan servicing 914\n", "Conventional home mortgage 236\n", "Lease 186\n", "FHA mortgage 94\n", "Name: count, dtype: int64" ] }, "execution_count": 11, "metadata": {}, "output_type": "execute_result" } ], "source": [ "loans_training_df['Sub-product'].value_counts()" ] }, { "cell_type": "code", "execution_count": 12, "id": "b0da7a52-e00a-413a-80be-2e8221851275", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.8757763975155279\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " Conventional home mortgage 0.81 0.50 0.62 26\n", " FHA mortgage 1.00 0.20 0.33 10\n", "Federal student loan servicing 1.00 0.96 0.98 102\n", " Lease 1.00 0.29 0.44 21\n", " Loan 0.81 1.00 0.90 163\n", "\n", " accuracy 0.88 322\n", " macro avg 0.93 0.59 0.65 322\n", " weighted avg 0.89 0.88 0.85 322\n", "\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n", "trained_model_l = train_model(loans_training_df, loans_val_df, 'Sub-product', rf_classifier, random_state=42)" ] }, { "cell_type": "code", "execution_count": 13, "id": "a668b946-da36-410f-b474-f8a311952c5d", "metadata": {}, "outputs": [], "source": [ "with open('models/loan_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_l, f)" ] }, { "cell_type": "markdown", "id": "74796ebf-9934-46d2-a1b7-d6672dea727c", "metadata": {}, "source": [ "#### Checking or savings account" ] }, { "cell_type": "code", "execution_count": 14, "id": "1cc65f08-96c8-4458-8703-b84b7554a04c", "metadata": {}, "outputs": [], "source": [ "cs_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_train_data.csv')\n", "cs_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_val_data.csv')" ] }, { "cell_type": "code", "execution_count": 15, "id": "240b2bcd-3839-4584-8a63-952fa17f9715", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sub-product\n", "Checking account 13500\n", "Savings account 1391\n", "Other banking product or service 1158\n", "CD (Certificate of Deposit) 176\n", "Name: count, dtype: int64" ] }, "execution_count": 15, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cs_training_df['Sub-product'].value_counts()" ] }, { "cell_type": "code", "execution_count": 16, "id": "3170c0c8-0dac-4755-aebf-dca9aa7f4dee", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.940099833610649\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " CD (Certificate of Deposit) 0.95 0.95 0.95 19\n", " Checking account 0.93 1.00 0.97 1500\n", "Other banking product or service 1.00 0.60 0.75 129\n", " Savings account 0.99 0.65 0.79 155\n", "\n", " accuracy 0.94 1803\n", " macro avg 0.97 0.80 0.86 1803\n", " weighted avg 0.94 0.94 0.93 1803\n", "\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n", "trained_model_cs = train_model(cs_training_df, cs_val_df, 'Sub-product', rf_classifier, random_state=42)" ] }, { "cell_type": "code", "execution_count": 17, "id": "59c87ff1-d7de-41a9-9e0a-33630bff1c18", "metadata": {}, "outputs": [], "source": [ "with open('models/Checking_saving_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cs, f)" ] }, { "cell_type": "markdown", "id": "fe443859-4be6-4b87-be79-22487aaf5b3b", "metadata": {}, "source": [ "#### 'Credit/Prepaid Card'" ] }, { "cell_type": "code", "execution_count": 26, "id": "31a70db8-06cb-4fb0-8d45-a7451aa81b0e", "metadata": {}, "outputs": [], "source": [ "cp_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_train_data.csv')\n", "cp_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_val_data.csv')" ] }, { "cell_type": "code", "execution_count": 27, "id": "0e70a22d-01f9-4f59-a903-286a05eb5179", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sub-product\n", "General-purpose credit card or charge card 13320\n", "Store credit card 2232\n", "Name: count, dtype: int64" ] }, "execution_count": 27, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cp_training_df['Sub-product'].value_counts()" ] }, { "cell_type": "code", "execution_count": 28, "id": "ef3b03f6-8207-4292-8ce2-e6ca5695c606", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9427414690572585\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", "General-purpose credit card or charge card 0.94 1.00 0.97 1481\n", " Store credit card 1.00 0.60 0.75 248\n", "\n", " accuracy 0.94 1729\n", " macro avg 0.97 0.80 0.86 1729\n", " weighted avg 0.95 0.94 0.94 1729\n", "\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n", "trained_model_cp = train_model(cp_training_df, cp_val_df, 'Sub-product', rf_classifier, random_state=42)" ] }, { "cell_type": "code", "execution_count": 21, "id": "ac3f39d0-8cb8-457e-9db7-510cc5a99830", "metadata": {}, "outputs": [], "source": [ "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cp, f)" ] }, { "cell_type": "markdown", "id": "0787d4eb-9673-417b-91d1-cc98becd037e", "metadata": {}, "source": [ "#### Credit_reporting_df" ] }, { "cell_type": "code", "execution_count": 22, "id": "8e074864-16f6-4fd5-8bfe-b054aeb0fc2a", "metadata": {}, "outputs": [], "source": [ "cr_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_train_data.csv')\n", "cr_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_val_data.csv')" ] }, { "cell_type": "code", "execution_count": 23, "id": "57257613-7dde-4561-942c-f559d2159744", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "Sub-product\n", "Credit reporting 13500\n", "Other personal consumer report 661\n", "Name: count, dtype: int64" ] }, "execution_count": 23, "metadata": {}, "output_type": "execute_result" } ], "source": [ "cr_training_df['Sub-product'].value_counts()" ] }, { "cell_type": "code", "execution_count": 24, "id": "cca27513-501f-4257-a4b1-0e13a3604250", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Accuracy: 0.9841168996188056\n", "\n", "Classification Report:\n", " precision recall f1-score support\n", "\n", " Credit reporting 0.99 1.00 0.99 1500\n", "Other personal consumer report 0.93 0.72 0.81 74\n", "\n", " accuracy 0.98 1574\n", " macro avg 0.96 0.86 0.90 1574\n", " weighted avg 0.98 0.98 0.98 1574\n", "\n" ] } ], "source": [ "from sklearn.ensemble import RandomForestClassifier\n", "\n", "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n", "trained_model_cr = train_model(cr_training_df, cr_val_df, 'Sub-product', rf_classifier, random_state=42)\n" ] }, { "cell_type": "code", "execution_count": 25, "id": "3cbb9aa5-6c0c-4b59-a181-7431e8fc60fc", "metadata": {}, "outputs": [], "source": [ "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cr, f)" ] }, { "cell_type": "markdown", "id": "9aea8fdd-ec86-40bc-b417-ba9169edabd9", "metadata": {}, "source": [ "with open('models/Debt_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_d, f)\n", "\n", "with open('models/loan_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_l, f)\n", "\n", "with open('models/Checking_saving_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cs, f)\n", "\n", "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cp, f)\n", "\n", "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n", " pickle.dump(trained_model_cr, f)" ] } ], "metadata": { "kernelspec": { "display_name": "Python 3 (ipykernel)", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.9.19" } }, "nbformat": 4, "nbformat_minor": 5 }