In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import pickle

## Sub Products

### Model

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

def train_model(training_df, validation_df, subproduct_to_predict, classifier_model, subproducts_to_drop=None, random_state=None):
    # Drop specified subproducts from training and validation dataframes
    if subproducts_to_drop:
        training_df = training_df[~training_df['Sub-product'].isin(subproducts_to_drop)]
        validation_df = validation_df[~validation_df['Sub-product'].isin(subproducts_to_drop)]
    
    # Compute class weights
    class_weights = compute_class_weight('balanced', classes=np.unique(training_df['Sub-product']), y=training_df['Sub-product'])
    
    # Convert class weights to dictionary format
    class_weight = {label: weight for label, weight in zip(np.unique(training_df['Sub-product']), class_weights)}
    
    # Define a default class weight for missing classes
    default_class_weight = 0.5
    
    # Assign default class weight for missing classes
    for label in np.unique(training_df['Sub-product']):
        if label not in class_weight:
            class_weight[label] = default_class_weight
    
    # Define the pipeline
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', classifier_model)
    ])
    
    # Train the pipeline
    pipeline.fit(training_df['Consumer complaint narrative'], training_df['Sub-product'])
    
    # Make predictions on the validation set
    y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])
    
    # Evaluate the pipeline
    accuracy = accuracy_score(validation_df['Sub-product'], y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(validation_df['Sub-product'], y_pred))
    
    return pipeline


#### Debt Collection

In [4]:
debt_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_train_data.csv')
debt_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_val_data.csv')

In [5]:
debt_training_df.head()

Unnamed: 0,Consumer complaint narrative,Product,Sub-product
0,{$37.00} on XXXX XXXX XXXX I paid for gas thro...,Debt collection,Other debt
1,Debt from XXXX XXXX is result of identity thef...,Debt collection,Credit card debt
2,My son attended XXXX XXXX XXXX XXXX for severa...,Debt collection,Medical debt
3,XXXX is claiming I owe a debt for utilities ba...,Debt collection,Other debt
4,"This debt collector engaged in abusive, decept...",Debt collection,I do not know


In [6]:
debt_training_df['Sub-product'].value_counts()

Sub-product
Other debt                 2056
I do not know              1530
Credit card debt           1139
Medical debt                726
Auto debt                   397
Telecommunications debt     267
Rental debt                 122
Mortgage debt                94
Name: count, dtype: int64

In [7]:

from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
trained_model_d = train_model(debt_training_df, debt_val_df, 'Sub-product', rf_classifier, subproducts_to_drop=['Other debt', 'I do not know'], random_state=42)


Accuracy: 0.6633986928104575

Classification Report:
                         precision    recall  f1-score   support

              Auto debt       0.95      0.48      0.64        44
       Credit card debt       0.59      0.96      0.73       127
           Medical debt       0.77      0.62      0.68        81
          Mortgage debt       1.00      0.40      0.57        10
            Rental debt       0.67      0.14      0.24        14
Telecommunications debt       1.00      0.13      0.24        30

               accuracy                           0.66       306
              macro avg       0.83      0.46      0.52       306
           weighted avg       0.75      0.66      0.63       306



In [9]:
with open('models/Debt_model.pkl', 'wb') as f:
    pickle.dump(trained_model_d, f)

#### Loan/Mortgages

In [10]:
loans_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_train_data.csv')
loans_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_val_data.csv')

In [11]:
loans_training_df['Sub-product'].value_counts()

Sub-product
Loan                              1464
Federal student loan servicing     914
Conventional home mortgage         236
Lease                              186
FHA mortgage                        94
Name: count, dtype: int64

In [12]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
trained_model_l = train_model(loans_training_df, loans_val_df, 'Sub-product', rf_classifier, random_state=42)

Accuracy: 0.8757763975155279

Classification Report:
                                precision    recall  f1-score   support

    Conventional home mortgage       0.81      0.50      0.62        26
                  FHA mortgage       1.00      0.20      0.33        10
Federal student loan servicing       1.00      0.96      0.98       102
                         Lease       1.00      0.29      0.44        21
                          Loan       0.81      1.00      0.90       163

                      accuracy                           0.88       322
                     macro avg       0.93      0.59      0.65       322
                  weighted avg       0.89      0.88      0.85       322



In [13]:
with open('models/loan_model.pkl', 'wb') as f:
    pickle.dump(trained_model_l, f)

#### Checking or savings account

In [14]:
cs_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_train_data.csv')
cs_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_val_data.csv')

In [15]:
cs_training_df['Sub-product'].value_counts()

Sub-product
Checking account                    13500
Savings account                      1391
Other banking product or service     1158
CD (Certificate of Deposit)           176
Name: count, dtype: int64

In [16]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
trained_model_cs = train_model(cs_training_df, cs_val_df, 'Sub-product', rf_classifier, random_state=42)

Accuracy: 0.940099833610649

Classification Report:
                                  precision    recall  f1-score   support

     CD (Certificate of Deposit)       0.95      0.95      0.95        19
                Checking account       0.93      1.00      0.97      1500
Other banking product or service       1.00      0.60      0.75       129
                 Savings account       0.99      0.65      0.79       155

                        accuracy                           0.94      1803
                       macro avg       0.97      0.80      0.86      1803
                    weighted avg       0.94      0.94      0.93      1803



In [17]:
with open('models/Checking_saving_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cs, f)

#### 'Credit/Prepaid Card'

In [26]:
cp_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_train_data.csv')
cp_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_val_data.csv')

In [27]:
cp_training_df['Sub-product'].value_counts()

Sub-product
General-purpose credit card or charge card    13320
Store credit card                              2232
Name: count, dtype: int64

In [28]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
trained_model_cp = train_model(cp_training_df, cp_val_df, 'Sub-product', rf_classifier, random_state=42)

Accuracy: 0.9427414690572585

Classification Report:
                                            precision    recall  f1-score   support

General-purpose credit card or charge card       0.94      1.00      0.97      1481
                         Store credit card       1.00      0.60      0.75       248

                                  accuracy                           0.94      1729
                                 macro avg       0.97      0.80      0.86      1729
                              weighted avg       0.95      0.94      0.94      1729



In [21]:
with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cp, f)

#### Credit_reporting_df

In [22]:
cr_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_train_data.csv')
cr_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_val_data.csv')

In [23]:
cr_training_df['Sub-product'].value_counts()

Sub-product
Credit reporting                  13500
Other personal consumer report      661
Name: count, dtype: int64

In [24]:
from sklearn.ensemble import RandomForestClassifier

rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
trained_model_cr = train_model(cr_training_df, cr_val_df, 'Sub-product', rf_classifier, random_state=42)


Accuracy: 0.9841168996188056

Classification Report:
                                precision    recall  f1-score   support

              Credit reporting       0.99      1.00      0.99      1500
Other personal consumer report       0.93      0.72      0.81        74

                      accuracy                           0.98      1574
                     macro avg       0.96      0.86      0.90      1574
                  weighted avg       0.98      0.98      0.98      1574



In [25]:
with open('models/Credit_Reporting_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cr, f)

with open('models/Debt_model.pkl', 'wb') as f:
    pickle.dump(trained_model_d, f)

with open('models/loan_model.pkl', 'wb') as f:
    pickle.dump(trained_model_l, f)

with open('models/Checking_saving_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cs, f)

with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cp, f)

with open('models/Credit_Reporting_model.pkl', 'wb') as f:
    pickle.dump(trained_model_cr, f)