### Loading data (2023 year) 

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

In [2]:
df = pd.read_csv('./complaints.csv')
df['Date received'] = pd.to_datetime(df['Date received'])

cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',
                    'State', 'ZIP code', 'Date received']
df_new = df[cols_to_consider]

df_new = df_new.dropna()

  df = pd.read_csv('./complaints.csv')


In [3]:
df_2023 = df_new[df_new['Date received'].dt.year.isin([2023])].reset_index(drop=True)

product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',
               'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',
               'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',
               'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',
               'Student loan' : 'Loans / Mortgage',
               'Vehicle loan or lease' : 'Loans / Mortgage',
               'Debt collection' : 'Debt collection',
               'Credit card or prepaid card' : 'Credit/Prepaid Card',
               'Credit card' : 'Credit/Prepaid Card',
               'Prepaid card' : 'Credit/Prepaid Card',
               'Mortgage' : 'Loans / Mortgage',
               'Checking or savings account' : 'Checking or savings account'  
              }

df_2023.loc[:,'Product'] = df_2023['Product'].map(product_map)

In [4]:
df_2023.head()

Unnamed: 0,Product,Sub-product,Issue,Sub-issue,Consumer complaint narrative,Company public response,Company,State,ZIP code,Date received
0,Checking or savings account,Other banking product or service,Opening an account,Account opened without my consent or knowledge,Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX...,Company has responded to the consumer and the ...,WELLS FARGO & COMPANY,NC,27513,2023-12-29
1,Credit Reporting,Credit reporting,Problem with a company's investigation into an...,Investigation took more than 30 days,I have previously disputed this item with you ...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,MN,55124,2023-12-29
2,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,I kindly request that you update my credit rep...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,IL,60621,2023-12-28
3,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft,I implore you to conduct a comprehensive inves...,Company has responded to the consumer and the ...,Experian Information Solutions Inc.,NJ,8723,2023-12-28
4,Credit Reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else,In accordance with the Fair Credit Reporting A...,Company has responded to the consumer and the ...,"TRANSUNION INTERMEDIATE HOLDINGS, INC.",TX,77377,2023-11-27


In [5]:
df_2023.columns

Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',
       'Consumer complaint narrative', 'Company public response', 'Company',
       'State', 'ZIP code', 'Date received'],
      dtype='object')

### Complaint pre-processing

In [6]:
df_2023['complaint length'] = df_2023['Consumer complaint narrative'].apply(lambda x : len(x))

In [7]:
df_2023 = df_2023[df_2023['complaint length'] > 20]

complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',
'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',
'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', 
'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']

df_2023 = df_2023[~df_2023['Consumer complaint narrative'].isin(complaints_to_exclude)]

### Categories consideration

In [8]:
df_2023_subset = df_2023[['Consumer complaint narrative','Product','Sub-product','Issue','Sub-issue']]
df_2023_subset.shape

(264968, 5)

In [9]:
df_2023_subset['Product'].value_counts()

Product
Credit Reporting               213403
Credit/Prepaid Card             16319
Checking or savings account     15143
Debt collection                 11767
Loans / Mortgage                 8336
Name: count, dtype: int64

In [10]:
sub_issues_to_consider = df_2023_subset['Sub-issue'].value_counts()[df_2023_subset['Sub-issue'].value_counts() > 500].index

In [11]:
reduced_subissues = df_2023_subset[df_2023_subset['Sub-issue'].isin(sub_issues_to_consider)]

In [12]:
reduced_subissues.shape

(248065, 5)

In [13]:
reduced_subissues['Sub-issue'].value_counts()

Sub-issue
Information belongs to someone else                                                      57877
Reporting company used your report improperly                                            48781
Their investigation did not fix an error on your report                                  45407
Credit inquiries on your report that you don't recognize                                 13150
Account status incorrect                                                                 10271
Account information incorrect                                                             9307
Was not notified of investigation status or results                                       9201
Investigation took more than 30 days                                                      8937
Personal information incorrect                                                            5900
Debt is not yours                                                                         2821
Deposits and withdrawals                

In [14]:
sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index
final_df_2023 = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]

In [15]:
final_df_2023.shape

(247517, 5)

### Issue categories grouping

In [16]:
issues_to_subissues = {}
for issue in final_df_2023['Issue'].value_counts().index:
    issues_to_subissues[issue] = list(final_df_2023[final_df_2023['Issue'] == issue]['Sub-issue'].value_counts().to_dict().keys())

one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) == 1}
more_than_one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) > 1}

existing_issue_mapping = {issue : issue for issue in more_than_one_subissue}

issue_renaming = {
    'Problem with a lender or other company charging your account': 'Account Operations and Unauthorized Transaction Issues',
    'Opening an account': 'Account Operations and Unauthorized Transaction Issues',
    'Getting a credit card': 'Account Operations and Unauthorized Transaction Issues',

    'Unable to get your credit report or credit score': 'Credit Report and Monitoring Issues',
    'Credit monitoring or identity theft protection services': 'Credit Report and Monitoring Issues',
    'Identity theft protection or other monitoring services': 'Credit Report and Monitoring Issues',
    
    'Problem caused by your funds being low': 'Payment and Funds Management',
    'Problem when making payments': 'Payment and Funds Management',
    'Managing the loan or lease': 'Payment and Funds Management',

    'False statements or representation': 'Disputes and Misrepresentations',
    'Fees or interest': 'Disputes and Misrepresentations',
    'Other features, terms, or problems': 'Disputes and Misrepresentations',

    'Took or threatened to take negative or legal action': 'Legal and Threat Actions'
}

issues_mapping = {**issue_renaming, **existing_issue_mapping}

final_df_2023.loc[:,'Issue'] = final_df_2023['Issue'].apply(lambda x : issues_mapping[x])

### Value counts

In [17]:
final_df_2023['Product'].value_counts()

Product
Credit Reporting               211695
Checking or savings account     12285
Credit/Prepaid Card             11975
Debt collection                  9380
Loans / Mortgage                 2182
Name: count, dtype: int64

In [18]:
final_df_2023['Sub-product'].value_counts()

Sub-product
Credit reporting                              210735
General-purpose credit card or charge card     10668
Checking account                               10409
Other debt                                      3041
I do not know                                   2316
Credit card debt                                1652
Federal student loan servicing                  1344
Store credit card                               1307
Medical debt                                    1053
Savings account                                  989
Other personal consumer report                   960
Loan                                             732
Other banking product or service                 725
Auto debt                                        581
Telecommunications debt                          419
Rental debt                                      179
CD (Certificate of Deposit)                      162
Mortgage debt                                    139
Conventional home mortgage        

In [19]:
final_df_2023['Issue'].value_counts()

Issue
Incorrect information on your report                                                87200
Improper use of your report                                                         61868
Problem with a credit reporting company's investigation into an existing problem    45371
Problem with a company's investigation into an existing problem                     20985
Managing an account                                                                  7367
Attempts to collect debt not owed                                                    5453
Problem with a purchase shown on your statement                                      3253
Account Operations and Unauthorized Transaction Issues                               2450
Written notification about debt                                                      2404
Disputes and Misrepresentations                                                      2311
Payment and Funds Management                                                         2259
Clos

In [20]:
final_df_2023['Sub-issue'].value_counts()

Sub-issue
Information belongs to someone else                                                      57850
Reporting company used your report improperly                                            48732
Their investigation did not fix an error on your report                                  45395
Credit inquiries on your report that you don't recognize                                 13136
Account status incorrect                                                                 10208
Account information incorrect                                                             9267
Was not notified of investigation status or results                                       9200
Investigation took more than 30 days                                                      8928
Personal information incorrect                                                            5900
Debt is not yours                                                                         2785
Deposits and withdrawals                

### Unique categories

In [21]:
print(f"Unique Product offerings: {final_df_2023['Product'].nunique()}")
print(f"Unique Sub-product offerings: {final_df_2023['Sub-product'].nunique()}")
print(f"Unique Issue offerings: {final_df_2023['Issue'].nunique()}")
print(f"Unique Sub-issue offerings: {final_df_2023['Sub-issue'].nunique()}")

Unique Product offerings: 5
Unique Sub-product offerings: 19
Unique Issue offerings: 17
Unique Sub-issue offerings: 44


### Preparing the train and test splits

In [22]:
from sklearn.model_selection import train_test_split

In [23]:
final_df_2023.head()

Unnamed: 0,Consumer complaint narrative,Product,Sub-product,Issue,Sub-issue
1,I have previously disputed this item with you ...,Credit Reporting,Credit reporting,Problem with a company's investigation into an...,Investigation took more than 30 days
2,I kindly request that you update my credit rep...,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft
3,I implore you to conduct a comprehensive inves...,Debt collection,Other debt,Attempts to collect debt not owed,Debt was result of identity theft
4,In accordance with the Fair Credit Reporting A...,Credit Reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else
5,In accordance with Fair c=Credit Reporting Act...,Credit Reporting,Credit reporting,Improper use of your report,Reporting company used your report improperly


In [24]:
X = final_df_2023['Consumer complaint narrative']
y = final_df_2023[['Product','Sub-product','Issue','Sub-issue']]

X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y['Product'],test_size=0.25,random_state=42)

In [25]:
train_df = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)
test_df = pd.concat([X_test,y_test],axis = 1).reset_index(drop = True)

In [26]:
train_df.head()

Unnamed: 0,Consumer complaint narrative,Product,Sub-product,Issue,Sub-issue
0,The credit bureaus keep disrespecting the laws...,Credit Reporting,Credit reporting,Problem with a company's investigation into an...,Their investigation did not fix an error on yo...
1,I sent in a complaint in XXXX of 2021 about so...,Credit Reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else
2,I ordered a copy of my report and I found out ...,Credit Reporting,Credit reporting,Problem with a credit reporting company's inve...,Their investigation did not fix an error on yo...
3,It appears that my credit file has been compro...,Credit Reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else
4,"I have never authorized, consented to nor bene...",Credit Reporting,Credit reporting,Incorrect information on your report,Information belongs to someone else


In [27]:
train_df.shape

(185637, 5)

In [28]:
test_df.shape

(61880, 5)

In [29]:
import os

directory_to_save = './data_splits/'

if not os.path.exists(directory_to_save):
    os.makedirs(directory_to_save)

train_df.to_csv(directory_to_save + 'train-data-split.csv',index = False)
test_df.to_csv(directory_to_save + 'test-data-split.csv',index = False)