In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import matplotlib.pyplot as plt 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,accuracy_score
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.utils.class_weight import compute_class_weight
import pickle

## Sub Issues

### Model

In [29]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight

def train_model(training_df, validation_df, target_column, classifier_model, subissues_to_drop=None, random_state=42):
 # Drop specified subproducts from training and validation dataframes
 if subissues_to_drop:
 training_df = training_df[~training_df[target_column].isin(subissues_to_drop)]
 validation_df = validation_df[~validation_df[target_column].isin(subissues_to_drop)]
 
 # Compute class weights
 class_weights = compute_class_weight('balanced', classes=np.unique(training_df[target_column]), y=training_df[target_column])
 
 # Convert class weights to dictionary format
 class_weight = {label: weight for label, weight in zip(np.unique(training_df[target_column]), class_weights)}
 
 # Define a default class weight for missing classes
 default_class_weight = 0.5
 
 # Assign default class weight for missing classes
 for label in np.unique(training_df[target_column]):
 if label not in class_weight:
 class_weight[label] = default_class_weight
 
 # Define the pipeline
 pipeline = Pipeline([
 ('tfidf', TfidfVectorizer()),
 ('classifier', classifier_model)
 ])
 
 # Train the pipeline
 pipeline.fit(training_df['Consumer complaint narrative'], training_df[target_column])
 
 # Make predictions on the validation set
 y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])
 
 # Evaluate the pipeline
 accuracy = accuracy_score(validation_df[target_column], y_pred)
 print("\nClassification Report:")
 print(classification_report(validation_df[target_column], y_pred))
 print("Accuracy:", accuracy)
 
 return pipeline

#### Reading the Issue DataFrame

In [30]:
import os
import pandas as pd

def read_subissue_data(issue_name, data_dir='../data_preprocessing_scripts/issue_data_splits'):
 # Convert issue name to lower case and replace '/' and spaces with underscores
 issue_name = issue_name.replace('/', '_').replace(' ', '_').lower()
 
 # Construct file paths
 train_file = os.path.join(data_dir, f"{issue_name}_train_data.csv")
 val_file = os.path.join(data_dir, f"{issue_name}_val_data.csv")
 
 # Read the CSV files
 train_df = pd.read_csv(train_file)
 val_df = pd.read_csv(val_file )
 
 return train_df, val_df

In [31]:
df = pd.read_csv("../data_splits/train-data-split.csv")
issue_categories = list(df_train['Issue'].unique())

def classify_sub_issue(issue):
 issue_name = issue.replace('/', '_').replace(' ', '_').lower()
 train_df,val_df= read_subissue_data(issue)
 rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)
 trained_model = train_model(train_df, val_df, 'Sub-issue', rf_classifier, random_state=42)

 # Saving the model
 with open(f"issue_models/{issue_name}.pkl", 'wb') as f:
 pickle.dump(trained_model, f)

### Sub-issues classification

#### 1. Problem with a company's investigation into an existing problem

In [32]:
issue_name = issue_categories[0]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Problem with a company's investigation into an existing problem


Classification Report:
 precision recall f1-score support

Difficulty submitting a dispute or getting information about a dispute over the phone 0.88 0.37 0.52 41
 Investigation took more than 30 days 0.95 0.73 0.83 162
 Problem with personal statement of dispute 0.90 0.53 0.67 53
 Their investigation did not fix an error on your report 0.91 1.00 0.95 1122
 Was not notified of investigation status or results 0.98 0.87 0.92 209

 accuracy 0.92 1587
 macro avg 0.93 0.70 0.78 1587
 weighted avg 0.92 0.92 0.91 1587

Accuracy: 0.9199747952110902


#### 2. Incorrect information on your report

In [34]:
issue_name = issue_categories[1]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Incorrect information on your report


Classification Report:
 precision recall f1-score support

 Account information incorrect 0.74 0.68 0.71 699
 Account status incorrect 0.87 0.73 0.79 771
 Information belongs to someone else 0.90 0.99 0.94 4337
Information is missing that should be on the report 0.95 0.31 0.47 65
 Old information reappears or never goes away 0.93 0.40 0.56 126
 Personal information incorrect 0.95 0.78 0.86 440
 Public record information inaccurate 0.98 0.47 0.64 102

 accuracy 0.88 6540
 macro avg 0.90 0.62 0.71 6540
 weighted avg 0.88 0.88 0.88 6540

Accuracy: 0.8831804281345565


#### 3. Problem with a credit reporting company's investigation into an existing problem

In [35]:
issue_name = issue_categories[2]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Problem with a credit reporting company's investigation into an existing problem


Classification Report:
 precision recall f1-score support

Difficulty submitting a dispute or getting information about a dispute over the phone 0.83 0.36 0.50 83
 Investigation took more than 30 days 0.97 0.84 0.90 505
 Problem with personal statement of dispute 1.00 0.38 0.55 47
 Their investigation did not fix an error on your report 0.92 0.99 0.95 2277
 Was not notified of investigation status or results 0.96 0.88 0.92 473

 accuracy 0.93 3385
 macro avg 0.94 0.69 0.77 3385
 weighted avg 0.93 0.93 0.92 3385

Accuracy: 0.9288035450516987


#### 4. Problem with a purchase shown on your statement

In [36]:
issue_name = issue_categories[3]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Problem with a purchase shown on your statement


Classification Report:
 precision recall f1-score support

 Card was charged for something you did not purchase with the card 0.81 0.19 0.30 70
Credit card company isn't resolving a dispute about a purchase on your statement 0.75 0.98 0.85 172

 accuracy 0.75 242
 macro avg 0.78 0.58 0.58 242
 weighted avg 0.77 0.75 0.69 242

Accuracy: 0.7520661157024794


#### 5. Improper use of your report

In [37]:
issue_name = issue_categories[4]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Improper use of your report


Classification Report:
 precision recall f1-score support

Credit inquiries on your report that you don't recognize 0.93 0.84 0.88 990
 Reporting company used your report improperly 0.96 0.98 0.97 3654

 accuracy 0.95 4644
 macro avg 0.95 0.91 0.93 4644
 weighted avg 0.95 0.95 0.95 4644

Accuracy: 0.9528423772609819


#### 6. Account Operations and Unauthorized Transaction Issues

In [38]:
issue_name = issue_categories[5]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Account Operations and Unauthorized Transaction Issues


Classification Report:
 precision recall f1-score support

 Account opened as a result of fraud 0.83 0.67 0.74 43
Card opened as result of identity theft or fraud 0.88 0.77 0.82 39
 Transaction was not authorized 0.86 0.97 0.91 102

 accuracy 0.86 184
 macro avg 0.86 0.80 0.83 184
 weighted avg 0.86 0.86 0.85 184

Accuracy: 0.8586956521739131


#### 7. Payment and Funds Management

In [39]:
issue_name = issue_categories[6]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Payment and Funds Management


Classification Report:
 precision recall f1-score support

 Billing problem 1.00 0.65 0.79 34
 Overdrafts and overdraft fees 0.89 0.92 0.91 74
Problem during payment process 0.81 0.94 0.87 65

 accuracy 0.87 173
 macro avg 0.90 0.83 0.85 173
 weighted avg 0.88 0.87 0.87 173

Accuracy: 0.8728323699421965


#### 8. Managing an account

In [40]:
issue_name = issue_categories[7]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Managing an account


Classification Report:
 precision recall f1-score support

 Banking errors 0.50 0.10 0.16 73
 Deposits and withdrawals 0.46 0.90 0.61 201
 Fee problem 0.55 0.57 0.56 56
Funds not handled or disbursed as instructed 0.00 0.00 0.00 72
 Problem accessing account 0.00 0.00 0.00 40
 Problem using a debit or ATM card 0.71 0.58 0.64 113

 accuracy 0.52 555
 macro avg 0.37 0.36 0.33 555
 weighted avg 0.43 0.52 0.43 555

Accuracy: 0.5153153153153153


#### 9. Attempts to collect debt not owed

In [41]:
issue_name = issue_categories[8]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Attempts to collect debt not owed


Classification Report:
 precision recall f1-score support

 Debt is not yours 0.64 0.93 0.76 207
 Debt was paid 0.96 0.31 0.46 72
Debt was result of identity theft 0.84 0.56 0.67 129

 accuracy 0.70 408
 macro avg 0.81 0.60 0.63 408
 weighted avg 0.76 0.70 0.68 408

Accuracy: 0.7009803921568627


-----

#### 10. Written notification about debt

In [42]:
issue_name = issue_categories[9]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Written notification about debt


Classification Report:
 precision recall f1-score support

Didn't receive enough information to verify debt 0.77 0.99 0.87 135
 Didn't receive notice of right to dispute 0.90 0.19 0.31 48

 accuracy 0.78 183
 macro avg 0.84 0.59 0.59 183
 weighted avg 0.81 0.78 0.72 183

Accuracy: 0.7814207650273224


----

#### 11. Dealing with your lender or servicer

In [43]:
issue_name = issue_categories[10]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Dealing with your lender or servicer


Classification Report:
 precision recall f1-score support

 Received bad information about your loan 0.74 0.70 0.72 50
Trouble with how payments are being handled 0.71 0.75 0.73 48

 accuracy 0.72 98
 macro avg 0.73 0.72 0.72 98
 weighted avg 0.73 0.72 0.72 98

Accuracy: 0.7244897959183674


----

#### 12. Disputes and Misrepresentations

In [44]:
issue_name = issue_categories[11]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Disputes and Misrepresentations


Classification Report:
 precision recall f1-score support

Attempted to collect wrong amount 0.85 0.92 0.88 66
 Other problem 0.85 0.65 0.74 54
 Problem with fees 0.83 0.93 0.88 57

 accuracy 0.84 177
 macro avg 0.84 0.83 0.83 177
 weighted avg 0.84 0.84 0.84 177

Accuracy: 0.8418079096045198


----

#### 13. Problem with a company's investigation into an existing issue

In [45]:
issue_name = issue_categories[12]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Problem with a company's investigation into an existing issue


Classification Report:
 precision recall f1-score support

Difficulty submitting a dispute or getting information about a dispute over the phone 0.00 0.00 0.00 3
 Investigation took more than 30 days 1.00 1.00 1.00 3
 Problem with personal statement of dispute 0.00 0.00 0.00 2
 Their investigation did not fix an error on your report 0.50 1.00 0.67 7
 Was not notified of investigation status or results 0.00 0.00 0.00 2

 accuracy 0.59 17
 macro avg 0.30 0.40 0.33 17
 weighted avg 0.38 0.59 0.45 17

Accuracy: 0.5882352941176471


----

#### 14. Closing your account

In [46]:
issue_name = issue_categories[13]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Closing your account


Classification Report:
 precision recall f1-score support

 Can't close your account 1.00 0.24 0.38 17
Company closed your account 0.78 1.00 0.88 46

 accuracy 0.79 63
 macro avg 0.89 0.62 0.63 63
 weighted avg 0.84 0.79 0.74 63

Accuracy: 0.7936507936507936


----

#### 15. Credit Report and Monitoring Issues

In [47]:
issue_name = issue_categories[14]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Credit Report and Monitoring Issues


Classification Report:
 precision recall f1-score support

 Other problem getting your report or credit score 0.89 0.99 0.94 82
Problem canceling credit monitoring or identify theft protection service 0.97 0.75 0.85 40

 accuracy 0.91 122
 macro avg 0.93 0.87 0.89 122
 weighted avg 0.92 0.91 0.91 122

Accuracy: 0.9098360655737705


----

#### 16. Closing an account

In [48]:
issue_name = issue_categories[15]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Closing an account


Classification Report:
 precision recall f1-score support

 Can't close your account 1.00 0.04 0.07 27
 Company closed your account 0.57 0.83 0.67 69
Funds not received from closed account 0.56 0.50 0.53 50

 accuracy 0.57 146
 macro avg 0.71 0.45 0.42 146
 weighted avg 0.64 0.57 0.51 146

Accuracy: 0.5684931506849316


#### 17. Legal and Threat Actions

In [49]:
issue_name = issue_categories[16]
print(f"Issue : {issue_name}\n")

classify_sub_issue(issue_name)

Issue : Legal and Threat Actions


Classification Report:
 precision recall f1-score support

Threatened or suggested your credit would be damaged 1.00 1.00 1.00 48

 accuracy 1.00 48
 macro avg 1.00 1.00 1.00 48
 weighted avg 1.00 1.00 1.00 48

Accuracy: 1.0
