Mahesh Babu committed on
Commit
210b96e
1 Parent(s): 19ca65a

added the UI

Browse files
app.py ADDED
@@ -0,0 +1,358 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ #Importing the necessary libraries
2
+ import pandas as pd
3
+ import torch
4
+ from streamlit_option_menu import option_menu
5
+ from plotting_helpers import (plot_top_5_products, plot_top_5_issues, plot_top_5_issues_in_product, plot_top_10_companies_complaints,
6
+ plot_top_10_states_most_complaints, plot_top_10_states_least_complaints, complaints_by_year,
7
+ complaints_across_states)
8
+ from transformers import pipeline
9
+ import streamlit as st
10
+ import pickle
11
+ import warnings
12
+ warnings.filterwarnings("ignore")
13
+
14
# Page-level settings: tab title, tab icon, wide layout, sidebar open on load
st.set_page_config(
    page_title='CFPB Consumer Complaint Insights',
    page_icon='📋',
    layout="wide",
    initial_sidebar_state='expanded',
)
17
+
18
@st.cache_data(show_spinner=False)
def load_process_data():
    """Load ``complaints.csv`` and return the cleaned complaints frame.

    Steps, in order:
      1. read only the columns the app uses and parse ``Date received``;
      2. drop rows that have a missing value in any of those columns;
      3. collapse raw product names into the app's product groups;
      4. drop narratives of 20 characters or fewer and known placeholder texts.

    Returns:
        pandas.DataFrame holding the selected columns plus ``complaint length``
        (character count of the narrative).
    """
    cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',
                        'State', 'ZIP code', 'Date received']

    # usecols stops pandas from materialising columns that would be discarded anyway
    df = pd.read_csv('complaints.csv', usecols=cols_to_consider)
    df['Date received'] = pd.to_datetime(df['Date received'])

    # Re-select to pin the column order (usecols keeps the file's order),
    # then drop every row with a gap in any selected column.
    df_new = df[cols_to_consider].dropna()

    product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',
                   'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',
                   'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',
                   'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',
                   'Student loan' : 'Loans / Mortgage',
                   'Vehicle loan or lease' : 'Loans / Mortgage',
                   'Debt collection' : 'Debt collection',
                   'Credit card or prepaid card' : 'Credit/Prepaid Card',
                   'Credit card' : 'Credit/Prepaid Card',
                   'Prepaid card' : 'Credit/Prepaid Card',
                   'Mortgage' : 'Loans / Mortgage',
                   'Checking or savings account' : 'Checking or savings account'
                   }

    # NOTE(review): Series.map yields NaN for product names missing from
    # product_map, and this runs after dropna(), so such rows stay in the
    # frame with a NaN product. Confirm that is intended.
    df_new.loc[:,'Product'] = df_new['Product'].map(product_map)

    # Very short narratives carry no usable signal for the classifiers
    df_new['complaint length'] = df_new['Consumer complaint narrative'].map(len)
    df_new = df_new[df_new['complaint length'] > 20]

    # Boilerplate narratives that point at attachments or are fully redacted
    complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',
                             'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',
                             'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS',
                             'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']

    df_new = df_new[~df_new['Consumer complaint narrative'].isin(complaints_to_exclude)]

    return df_new
57
+
58
# Load the processed data
df = load_process_data()

# Use the "mps" backend when PyTorch reports it as available, otherwise CPU
device = "mps" if torch.backends.mps.is_available() else "cpu"


@st.cache_resource(show_spinner=False)
def _load_product_classifier(target_device):
    """Build the product text-classification pipeline and cache it.

    Streamlit re-executes the whole script on every interaction; caching the
    pipeline as a resource avoids re-creating the model on each rerun.
    """
    return pipeline("text-classification", model="Mahesh9/distil-bert-fintuned-product-cfpb-complaints",
                    max_length=512, truncation=True, device=target_device)


@st.cache_resource(show_spinner=False)
def _load_subproduct_model(file_path):
    """Unpickle one sub-product model; cached so each file is read once.

    NOTE: pickle.load can execute arbitrary code. Only point this at model
    files that ship with the app, never at user-supplied paths.
    """
    with open(file_path, 'rb') as model_file:
        return pickle.load(model_file)


# Initialize the pipeline for classifying product
product_classifier = _load_product_classifier(device)

# Load sub-product classifier models (one per product group)
trained_model_cr = _load_subproduct_model('subproduct_prediction/models/Credit_Reporting_model.pkl')
trained_model_cp = _load_subproduct_model('subproduct_prediction/models/Credit_Prepaid_Card_model.pkl')
trained_model_cs = _load_subproduct_model('subproduct_prediction/models/Checking_saving_model.pkl')
trained_model_l = _load_subproduct_model('subproduct_prediction/models/loan_model.pkl')
trained_model_d = _load_subproduct_model('subproduct_prediction/models/Debt_model.pkl')
78
+
79
@st.cache_resource(show_spinner=False)
def select_subproduct_model(predicted_product):
    """Return the sub-product model that belongs to ``predicted_product``.

    Args:
        predicted_product: product label to look up.

    Returns:
        The module-level ``trained_model_*`` object paired with that label.

    Raises:
        ValueError: if the label matches none of the five product groups.
    """
    models_by_product = (
        ('Credit Reporting', trained_model_cr),
        ('Credit/Prepaid Card', trained_model_cp),
        ('Checking or savings account', trained_model_cs),
        ('Loans / Mortgage', trained_model_l),
        ('Debt collection', trained_model_d),
    )
    for product_label, model in models_by_product:
        if predicted_product == product_label:
            return model
    raise ValueError("Invalid predicted product category")
94
+
95
# Loading the issue classifier model
@st.cache_resource(show_spinner=False)
def _load_issue_classifier(target_device):
    """Build the issue text-classification pipeline and cache it.

    Streamlit re-executes the whole script on every interaction; caching the
    pipeline as a resource avoids re-creating the model on each rerun.
    """
    return pipeline("text-classification", model="Mahesh9/distil-bert-fintuned-issues-cfpb-complaints",
                    max_length=512, truncation=True, device=target_device)


issue_classifier = _load_issue_classifier(device)

# Path to the models and their corresponding names
issue_model_files = {
    'trained_model_account_operations': 'subproduct_prediction/issue_models/account_operations_and_unauthorized_transaction_issues.pkl',
    'trained_model_collect_debt': 'subproduct_prediction/issue_models/attempts_to_collect_debt_not_owed.pkl',
    'trained_model_closing_account': 'subproduct_prediction/issue_models/closing_an_account.pkl',
    'trained_model_closing_your_account': 'subproduct_prediction/issue_models/closing_your_account.pkl',
    'trained_model_credit_report': 'subproduct_prediction/issue_models/credit_report_and_monitoring_issues.pkl',
    'trained_model_lender': 'subproduct_prediction/issue_models/dealing_with_your_lender_or_servicer.pkl',
    'trained_model_disputes': 'subproduct_prediction/issue_models/disputes_and_misrepresentations.pkl',
    'trained_model_improper_use_report': 'subproduct_prediction/issue_models/improper_use_of_your_report.pkl',
    'trained_model_incorrect_info': 'subproduct_prediction/issue_models/incorrect_information_on_your_report.pkl',
    'trained_model_legal_and_threat': 'subproduct_prediction/issue_models/legal_and_threat_actions.pkl',
    'trained_model_managing_account': 'subproduct_prediction/issue_models/managing_an_account.pkl',
    'trained_model_payment_funds': 'subproduct_prediction/issue_models/payment_and_funds_management.pkl',
    'trained_model_investigation_wrt_issue': 'subproduct_prediction/issue_models/problem_with_a_company\'s_investigation_into_an_existing_issue.pkl',
    'trained_model_investigation_wrt_problem': 'subproduct_prediction/issue_models/problem_with_a_company\'s_investigation_into_an_existing_problem.pkl',
    'trained_model_credit_investigation_wrt_problem': 'subproduct_prediction/issue_models/problem_with_a_credit_reporting_company\'s_investigation_into_an_existing_problem.pkl',
    'trained_model_purchase_shown': 'subproduct_prediction/issue_models/problem_with_a_purchase_shown_on_your_statement.pkl',
    'trained_model_notification_about_debt': 'subproduct_prediction/issue_models/written_notification_about_debt.pkl',
}


@st.cache_resource(show_spinner=False)
def _load_issue_model(file_path):
    """Unpickle one sub-issue model; cached so each file is read once.

    NOTE: pickle.load can execute arbitrary code. Only point this at model
    files that ship with the app, never at user-supplied paths.
    """
    with open(file_path, 'rb') as model_file:
        return pickle.load(model_file)


# model name -> loaded sub-issue model, same keys as issue_model_files
issue_models = {
    model_name: _load_issue_model(file_path)
    for model_name, file_path in issue_model_files.items()
}
125
+
126
+ # Define a function to select the appropriate subissue prediction model based on the predicted issue
127
def select_subissue_model(predicted_issue):
    """Return the sub-issue model registered for ``predicted_issue``.

    Args:
        predicted_issue: issue label to look up (exact, case-sensitive match).

    Returns:
        The entry of the module-level ``issue_models`` dict paired with the label.

    Raises:
        ValueError: if the label matches none of the known issue categories.
    """
    # (issue label, key into issue_models)
    model_key_by_issue = (
        ("Problem with a company's investigation into an existing problem", 'trained_model_investigation_wrt_problem'),
        ("Problem with a credit reporting company's investigation into an existing problem", 'trained_model_credit_investigation_wrt_problem'),
        ("Problem with a company's investigation into an existing issue", 'trained_model_investigation_wrt_issue'),
        ("Problem with a purchase shown on your statement", 'trained_model_purchase_shown'),
        ("Incorrect information on your report", 'trained_model_incorrect_info'),
        ("Improper use of your report", 'trained_model_improper_use_report'),
        ("Account Operations and Unauthorized Transaction Issues", 'trained_model_account_operations'),
        ("Payment and Funds Management", 'trained_model_payment_funds'),
        ("Managing an account", 'trained_model_managing_account'),
        ("Attempts to collect debt not owed", 'trained_model_collect_debt'),
        ("Written notification about debt", 'trained_model_notification_about_debt'),
        ("Dealing with your lender or servicer", 'trained_model_lender'),
        ("Disputes and Misrepresentations", 'trained_model_disputes'),
        ("Closing your account", 'trained_model_closing_your_account'),
        ("Closing an account", 'trained_model_closing_account'),
        ("Credit Report and Monitoring Issues", 'trained_model_credit_report'),
        ("Legal and Threat Actions", 'trained_model_legal_and_threat'),
    )
    for issue_label, model_key in model_key_by_issue:
        if predicted_issue == issue_label:
            return issue_models[model_key]
    raise ValueError("Invalid predicted issue category")
181
+
182
+ # Custom Headers for enhancing UI Text elements
183
def custom_header(text, level=1):
    """Render ``text`` as a centred, coloured HTML heading via ``st.markdown``.

    Args:
        text: heading text; interpolated into the HTML without escaping.
        level: style selector. 1 = green <h1> next to the CFPB logo,
            2 = blue <h2>, 3 = blue <h3>, 4 = blue <h5>, 5 = pink <h5>.
            Any other value renders nothing.
    """
    if level == 1:
        icon_url = "https://cfpb.github.io/design-system/images/uploads/logo_vertical_071720.png"
        # Adjust the img style as needed (e.g., height, vertical alignment, margin)
        st.markdown(f"""
            <h1 style="text-align: center;">
                <img src="{icon_url}" alt="Icon" style="vertical-align: middle; height: 112px; margin-right: -160px;">
                <span style="color: #008000; font-family: 'Sans Serif';">{text}</span>
            </h1>
            """, unsafe_allow_html=True)
        return

    # (level, html tag, text colour) for the plain text-only headings
    plain_heading_styles = (
        (2, 'h2', '#00749C'),
        (3, 'h3', '#00749C'),
        (4, 'h5', '#00749C'),
        (5, 'h5', '#f63366'),
    )
    for heading_level, tag, colour in plain_heading_styles:
        if level == heading_level:
            st.markdown(f"<{tag} style='text-align: center; color: {colour}; font-family: sans serif;'>{text}</{tag}>", unsafe_allow_html=True)
            return
202
+
203
+ # Helper function for classifying the complaint
204
def classify_complaint(narrative):
    """Classify one complaint narrative on four levels.

    The product and issue labels come from the two text-classification
    pipelines; each label then picks the model used to predict the
    corresponding sub-product / sub-issue from the same narrative.

    Args:
        narrative: complaint text.

    Returns:
        dict with the keys ``"Product"``, ``"Sub-product"``, ``"Issue"`` and
        ``"Sub-issue"``.

    Raises:
        ValueError: propagated from the model selectors when a predicted
            label has no matching model.
    """
    # Product first, then the sub-product model tied to that product
    product_label = product_classifier(narrative)[0]['label']
    subproduct_label = select_subproduct_model(product_label).predict([narrative])[0]

    # Issue next, then the sub-issue model tied to that issue
    issue_label = issue_classifier(narrative)[0]['label']
    subissue_label = select_subissue_model(issue_label).predict([narrative])[0]

    return {
        "Product" : product_label,
        "Sub-product" : subproduct_label,
        "Issue" : issue_label,
        "Sub-issue" : subissue_label,
    }
227
+
228
+ # Helper function to display key insights
229
def plot_eda_charts(level):
    """Build the Key Insights chart selected by ``level``.

    Args:
        level: chart number, 1 through 8, in the order listed below.

    Returns:
        Whatever the matching plotting helper returns for the module-level
        ``df``, or ``None`` when ``level`` matches no chart.
    """
    # Position in this tuple (1-based) is the chart number. The lambdas keep
    # each helper from running until its chart is actually requested.
    chart_builders = (
        lambda: complaints_by_year(df),
        lambda: complaints_across_states(df),
        lambda: plot_top_5_products(df),
        lambda: plot_top_5_issues(df),
        lambda: plot_top_5_issues_in_product(df),
        lambda: plot_top_10_companies_complaints(df),
        lambda: plot_top_10_states_most_complaints(df),
        lambda: plot_top_10_states_least_complaints(df),
    )
    for chart_no, build_chart in enumerate(chart_builders, start=1):
        if level == chart_no:
            return build_chart()
    return None
261
+
262
# Navigation setup
with st.sidebar:
    selected = option_menu(menu_title="Navigate",
                           options=["Home", "Key Insights", "Complaint Classifier"],
                           default_index=0)

# Home Page
if selected == "Home":
    custom_header('CFPB Consumer Complaint Insights', level=1)
    # Introduction
    st.markdown("""
        <div style='text-align: center; color: #333; font-size: 20px;'>
        <p><strong>Uncover Consumer Trends and Automate Complaint Categorization with CFPB Insights</strong></p>
        </div>
        """, unsafe_allow_html=True)

    st.write("\n")

    # Project Motivation
    st.markdown("""
        ### :orange[Motivation]
        Consumers can face challenges with financial products and services, leading to complaints that may not always be resolved directly with financial institutions. The **Consumer Financial Protection Bureau (CFPB)** acts as a mediator in these scenarios. However, consumers often struggle to categorize their complaints accurately, leading to inefficiencies in the resolution process. Our project aims to **facilitate faster resolution** by automatically categorizing complaints based on narrative descriptions, enhancing the efficiency of complaint management.
        """, unsafe_allow_html=True)

    # Impact
    st.markdown("""
        ### :green[Impact]
        The implementation of our project has two primary impacts:
        - **Ease for Consumers:** Automates the tagging of complaints into appropriate categories, reducing the need for consumers to understand complex financial product categories.
        - **Industry Adoption:** Offers a streamlined approach to complaint handling that can be adopted by financial institutions beyond the CFPB, promoting consistency across the industry.
        """, unsafe_allow_html=True)
    # Complaint Classifier
    st.markdown("""
        #### :blue[Complaint Classifier]
        Our dashboard features an innovative :rainbow[**Complaint Classifier**] that utilizes the narrative descriptions provided by consumers to categorize complaints into the correct product, issue, and sub-issue categories. This tool simplifies the submission process for consumers and enhances the efficiency of complaint resolution.
        """, unsafe_allow_html=True)

# Key Insights Page
elif selected == "Key Insights":

    # Position in this list + 1 is the chart number passed to plot_eda_charts
    headers = ["Evolution of complaints across years", "Complaints across US states",
               "Top 5 Common Product Categories", "Top 5 Common Issue Categories",
               "Top 5 Issues in Each Product Category", "Top 10 Companies with Most Complaints in 2023",
               "Top 10 states with Most Complaints", "Top 10 states with Least Complaints"]

    custom_header("Key Insights", level=1)
    st.write("\n")
    st.write("\n")
    st.write("\n")

    # Lay the charts out two per row
    for i in range(0, len(headers), 2):
        cols = st.columns(2)  # Create two columns

        with cols[0]:
            custom_header(headers[i], level=4)
            fig = plot_eda_charts(level=i+1)
            st.plotly_chart(fig, use_container_width=True)

        # Guard the right-hand cell in case the number of charts is odd
        if (i+1) < len(headers):
            with cols[1]:
                custom_header(headers[i+1], level=4)
                fig = plot_eda_charts(level=i+2)
                st.plotly_chart(fig, use_container_width=True)

# Complaints Classifier Page
elif selected == "Complaint Classifier":
    custom_header("Complaint Classifier", level=2)
    st.write("\n")

    # Using a key for the text_area widget to reference its current value
    query = st.text_area("Enter your complaint:", placeholder="It is absurd that I have consistently made timely payments for this account and have never been overdue. I kindly request that you promptly update my account to reflect this accurately.", key="input_text")
    if st.button("Classify Complaint"):
        if query.strip():  # Check if the input is not empty
            with st.spinner("Classifying Complaint..."):
                try:
                    result = classify_complaint(query)
                except ValueError:
                    # The model selectors raise ValueError when a predicted
                    # label has no matching model. Show the failure message
                    # below instead of a raw traceback.
                    result = None

            if result:
                st.success("Complaint Classification Results:")

                # Using HTML for better control over formatting
                st.markdown(f"""
                    **Product:** :blue[{result.get("Product")}]<br>

                    **Sub-product:** :green[{result.get("Sub-product")}]<br>

                    **Issue:** :red[{result.get("Issue")}]<br>

                    **Sub-issue:** :orange[{result.get("Sub-issue")}]<br>

                    """, unsafe_allow_html=True)
                st.write("\n\n")
                st.header("", divider= 'rainbow')
                st.balloons()  # Celebratory balloons on successful classification
            else:
                st.error("Failed to classify the complaint. Please try again.")
        else:
            st.info("Please enter a complaint to classify.")
notebooks/.DS_Store ADDED
Binary file (6.15 kB). View file
 
notebooks/.ipynb_checkpoints/Complaints preprocessing-Copy1-checkpoint.ipynb ADDED
@@ -0,0 +1,1061 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cd6a338a-9a00-45f4-ac13-9ed131c9049e",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Loading data (2023 year) "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "2e8de3f1-6812-4c0d-bd56-32459911000e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "import plotly.express as px"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "ad45c437-7720-445e-8fa1-27d2b14b7bb5",
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "/tmp/ipykernel_42602/219708379.py:1: DtypeWarning: Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.\n",
36
+ " df = pd.read_csv('./complaints.csv')\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "df = pd.read_csv('./complaints.csv')\n",
42
+ "df['Date received'] = pd.to_datetime(df['Date received'])\n",
43
+ "\n",
44
+ "cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',\n",
45
+ " 'State', 'ZIP code', 'Date received']\n",
46
+ "df_new = df[cols_to_consider]\n",
47
+ "\n",
48
+ "df_new = df_new.dropna()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 29,
54
+ "id": "6df32835-7186-4c57-bffa-536f779636fe",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "df_2023 = df_new[df_new['Date received'].dt.year.isin([2023])].reset_index(drop=True)\n",
59
+ "\n",
60
+ "product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',\n",
61
+ " 'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',\n",
62
+ " 'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',\n",
63
+ " 'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',\n",
64
+ " 'Student loan' : 'Loans / Mortgage',\n",
65
+ " 'Vehicle loan or lease' : 'Loans / Mortgage',\n",
66
+ " 'Debt collection' : 'Debt collection',\n",
67
+ " 'Credit card or prepaid card' : 'Credit/Prepaid Card',\n",
68
+ " 'Credit card' : 'Credit/Prepaid Card',\n",
69
+ " 'Prepaid card' : 'Credit/Prepaid Card',\n",
70
+ " 'Mortgage' : 'Loans / Mortgage',\n",
71
+ " 'Checking or savings account' : 'Checking or savings account' \n",
72
+ " }\n",
73
+ "\n",
74
+ "df_2023.loc[:,'Product'] = df_2023['Product'].map(product_map)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 30,
80
+ "id": "679ffbe3-a6ba-4f4d-bf65-0690794fb4e1",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/html": [
86
+ "<div>\n",
87
+ "<style scoped>\n",
88
+ " .dataframe tbody tr th:only-of-type {\n",
89
+ " vertical-align: middle;\n",
90
+ " }\n",
91
+ "\n",
92
+ " .dataframe tbody tr th {\n",
93
+ " vertical-align: top;\n",
94
+ " }\n",
95
+ "\n",
96
+ " .dataframe thead th {\n",
97
+ " text-align: right;\n",
98
+ " }\n",
99
+ "</style>\n",
100
+ "<table border=\"1\" class=\"dataframe\">\n",
101
+ " <thead>\n",
102
+ " <tr style=\"text-align: right;\">\n",
103
+ " <th></th>\n",
104
+ " <th>Product</th>\n",
105
+ " <th>Sub-product</th>\n",
106
+ " <th>Issue</th>\n",
107
+ " <th>Sub-issue</th>\n",
108
+ " <th>Consumer complaint narrative</th>\n",
109
+ " <th>Company public response</th>\n",
110
+ " <th>Company</th>\n",
111
+ " <th>State</th>\n",
112
+ " <th>ZIP code</th>\n",
113
+ " <th>Date received</th>\n",
114
+ " </tr>\n",
115
+ " </thead>\n",
116
+ " <tbody>\n",
117
+ " <tr>\n",
118
+ " <th>0</th>\n",
119
+ " <td>Checking or savings account</td>\n",
120
+ " <td>Other banking product or service</td>\n",
121
+ " <td>Opening an account</td>\n",
122
+ " <td>Account opened without my consent or knowledge</td>\n",
123
+ " <td>Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX...</td>\n",
124
+ " <td>Company has responded to the consumer and the ...</td>\n",
125
+ " <td>WELLS FARGO &amp; COMPANY</td>\n",
126
+ " <td>NC</td>\n",
127
+ " <td>27513</td>\n",
128
+ " <td>2023-12-29</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>1</th>\n",
132
+ " <td>Credit Reporting</td>\n",
133
+ " <td>Credit reporting</td>\n",
134
+ " <td>Problem with a company's investigation into an...</td>\n",
135
+ " <td>Investigation took more than 30 days</td>\n",
136
+ " <td>I have previously disputed this item with you ...</td>\n",
137
+ " <td>Company has responded to the consumer and the ...</td>\n",
138
+ " <td>Experian Information Solutions Inc.</td>\n",
139
+ " <td>MN</td>\n",
140
+ " <td>55124</td>\n",
141
+ " <td>2023-12-29</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>2</th>\n",
145
+ " <td>Debt collection</td>\n",
146
+ " <td>Other debt</td>\n",
147
+ " <td>Attempts to collect debt not owed</td>\n",
148
+ " <td>Debt was result of identity theft</td>\n",
149
+ " <td>I kindly request that you update my credit rep...</td>\n",
150
+ " <td>Company has responded to the consumer and the ...</td>\n",
151
+ " <td>Experian Information Solutions Inc.</td>\n",
152
+ " <td>IL</td>\n",
153
+ " <td>60621</td>\n",
154
+ " <td>2023-12-28</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>3</th>\n",
158
+ " <td>Debt collection</td>\n",
159
+ " <td>Other debt</td>\n",
160
+ " <td>Attempts to collect debt not owed</td>\n",
161
+ " <td>Debt was result of identity theft</td>\n",
162
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
163
+ " <td>Company has responded to the consumer and the ...</td>\n",
164
+ " <td>Experian Information Solutions Inc.</td>\n",
165
+ " <td>NJ</td>\n",
166
+ " <td>08723</td>\n",
167
+ " <td>2023-12-28</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>4</th>\n",
171
+ " <td>Credit Reporting</td>\n",
172
+ " <td>Credit reporting</td>\n",
173
+ " <td>Incorrect information on your report</td>\n",
174
+ " <td>Information belongs to someone else</td>\n",
175
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
176
+ " <td>Company has responded to the consumer and the ...</td>\n",
177
+ " <td>TRANSUNION INTERMEDIATE HOLDINGS, INC.</td>\n",
178
+ " <td>TX</td>\n",
179
+ " <td>77377</td>\n",
180
+ " <td>2023-11-27</td>\n",
181
+ " </tr>\n",
182
+ " </tbody>\n",
183
+ "</table>\n",
184
+ "</div>"
185
+ ],
186
+ "text/plain": [
187
+ " Product Sub-product \\\n",
188
+ "0 Checking or savings account Other banking product or service \n",
189
+ "1 Credit Reporting Credit reporting \n",
190
+ "2 Debt collection Other debt \n",
191
+ "3 Debt collection Other debt \n",
192
+ "4 Credit Reporting Credit reporting \n",
193
+ "\n",
194
+ " Issue \\\n",
195
+ "0 Opening an account \n",
196
+ "1 Problem with a company's investigation into an... \n",
197
+ "2 Attempts to collect debt not owed \n",
198
+ "3 Attempts to collect debt not owed \n",
199
+ "4 Incorrect information on your report \n",
200
+ "\n",
201
+ " Sub-issue \\\n",
202
+ "0 Account opened without my consent or knowledge \n",
203
+ "1 Investigation took more than 30 days \n",
204
+ "2 Debt was result of identity theft \n",
205
+ "3 Debt was result of identity theft \n",
206
+ "4 Information belongs to someone else \n",
207
+ "\n",
208
+ " Consumer complaint narrative \\\n",
209
+ "0 Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX... \n",
210
+ "1 I have previously disputed this item with you ... \n",
211
+ "2 I kindly request that you update my credit rep... \n",
212
+ "3 I implore you to conduct a comprehensive inves... \n",
213
+ "4 In accordance with the Fair Credit Reporting A... \n",
214
+ "\n",
215
+ " Company public response \\\n",
216
+ "0 Company has responded to the consumer and the ... \n",
217
+ "1 Company has responded to the consumer and the ... \n",
218
+ "2 Company has responded to the consumer and the ... \n",
219
+ "3 Company has responded to the consumer and the ... \n",
220
+ "4 Company has responded to the consumer and the ... \n",
221
+ "\n",
222
+ " Company State ZIP code Date received \n",
223
+ "0 WELLS FARGO & COMPANY NC 27513 2023-12-29 \n",
224
+ "1 Experian Information Solutions Inc. MN 55124 2023-12-29 \n",
225
+ "2 Experian Information Solutions Inc. IL 60621 2023-12-28 \n",
226
+ "3 Experian Information Solutions Inc. NJ 08723 2023-12-28 \n",
227
+ "4 TRANSUNION INTERMEDIATE HOLDINGS, INC. TX 77377 2023-11-27 "
228
+ ]
229
+ },
230
+ "execution_count": 30,
231
+ "metadata": {},
232
+ "output_type": "execute_result"
233
+ }
234
+ ],
235
+ "source": [
236
+ "df_2023.head()"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 31,
242
+ "id": "a85ec9b1-5de7-47f9-b204-4d42c8880bbb",
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',\n",
249
+ " 'Consumer complaint narrative', 'Company public response', 'Company',\n",
250
+ " 'State', 'ZIP code', 'Date received'],\n",
251
+ " dtype='object')"
252
+ ]
253
+ },
254
+ "execution_count": 31,
255
+ "metadata": {},
256
+ "output_type": "execute_result"
257
+ }
258
+ ],
259
+ "source": [
260
+ "df_2023.columns"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "id": "0487636d-9663-4fdb-b219-f9e6be257b51",
266
+ "metadata": {},
267
+ "source": [
268
+ "### Complaint pre-processing"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 32,
274
+ "id": "e35208c6-020a-4fb9-8c9f-13fdeee44935",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "df_2023['complaint length'] = df_2023['Consumer complaint narrative'].apply(lambda x : len(x))"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 33,
284
+ "id": "63deb9bb-d48a-460b-8edb-f66575ec1eaf",
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "df_2023 = df_2023[df_2023['complaint length'] > 20]\n",
289
+ "\n",
290
+ "complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
291
+ "'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
292
+ "'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
293
+ "'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
294
+ "\n",
295
+ "df_2023 = df_2023[~df_2023['Consumer complaint narrative'].isin(complaints_to_exclude)]"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "markdown",
300
+ "id": "492f8261-3e01-41d5-8f24-82bd289ee229",
301
+ "metadata": {},
302
+ "source": [
303
+ "### Categories consideration"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 56,
309
+ "id": "0be9e1f3-61aa-494a-bd0c-a6afeab5aacd",
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "data": {
314
+ "text/plain": [
315
+ "(264968, 5)"
316
+ ]
317
+ },
318
+ "execution_count": 56,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "df_2023_subset = df_2023[['Consumer complaint narrative','Product','Sub-product','Issue','Sub-issue']]\n",
325
+ "df_2023_subset.shape"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 57,
331
+ "id": "33e4e7e3-6661-48aa-aec2-b706fa64338d",
332
+ "metadata": {},
333
+ "outputs": [
334
+ {
335
+ "data": {
336
+ "text/plain": [
337
+ "Product\n",
338
+ "Credit Reporting 213403\n",
339
+ "Credit/Prepaid Card 16319\n",
340
+ "Checking or savings account 15143\n",
341
+ "Debt collection 11767\n",
342
+ "Loans / Mortgage 8336\n",
343
+ "Name: count, dtype: int64"
344
+ ]
345
+ },
346
+ "execution_count": 57,
347
+ "metadata": {},
348
+ "output_type": "execute_result"
349
+ }
350
+ ],
351
+ "source": [
352
+ "df_2023_subset['Product'].value_counts()"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 58,
358
+ "id": "dbc49ba8-f15a-4a4b-b018-d9d2273620ba",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "sub_issues_to_consider = df_2023_subset['Sub-issue'].value_counts()[df_2023_subset['Sub-issue'].value_counts() > 500].index"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 59,
368
+ "id": "746db565-e6ff-4ab2-bf92-d56088c0f2da",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "reduced_subissues = df_2023_subset[df_2023_subset['Sub-issue'].isin(sub_issues_to_consider)]"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 60,
378
+ "id": "0f786b1d-b139-40b5-ad53-639d8687d3b4",
379
+ "metadata": {},
380
+ "outputs": [
381
+ {
382
+ "data": {
383
+ "text/plain": [
384
+ "(248065, 5)"
385
+ ]
386
+ },
387
+ "execution_count": 60,
388
+ "metadata": {},
389
+ "output_type": "execute_result"
390
+ }
391
+ ],
392
+ "source": [
393
+ "reduced_subissues.shape"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 61,
399
+ "id": "f64515e8-ac65-4041-a201-a8576a86d7ad",
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "Sub-issue\n",
406
+ "Information belongs to someone else 57877\n",
407
+ "Reporting company used your report improperly 48781\n",
408
+ "Their investigation did not fix an error on your report 45407\n",
409
+ "Credit inquiries on your report that you don't recognize 13150\n",
410
+ "Account status incorrect 10271\n",
411
+ "Account information incorrect 9307\n",
412
+ "Was not notified of investigation status or results 9201\n",
413
+ "Investigation took more than 30 days 8937\n",
414
+ "Personal information incorrect 5900\n",
415
+ "Debt is not yours 2821\n",
416
+ "Deposits and withdrawals 2626\n",
417
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
418
+ "Didn't receive enough information to verify debt 1816\n",
419
+ "Debt was result of identity theft 1761\n",
420
+ "Old information reappears or never goes away 1716\n",
421
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1709\n",
422
+ "Company closed your account 1517\n",
423
+ "Problem using a debit or ATM card 1503\n",
424
+ "Public record information inaccurate 1389\n",
425
+ "Transaction was not authorized 1378\n",
426
+ "Problem with personal statement of dispute 1361\n",
427
+ "Other problem getting your report or credit score 1112\n",
428
+ "Debt was paid 969\n",
429
+ "Card was charged for something you did not purchase with the card 964\n",
430
+ "Banking errors 958\n",
431
+ "Funds not handled or disbursed as instructed 955\n",
432
+ "Overdrafts and overdraft fees 951\n",
433
+ "Attempted to collect wrong amount 885\n",
434
+ "Information is missing that should be on the report 881\n",
435
+ "Problem during payment process 840\n",
436
+ "Fee problem 764\n",
437
+ "Problem with fees 749\n",
438
+ "Received bad information about your loan 710\n",
439
+ "Other problem 701\n",
440
+ "Threatened or suggested your credit would be damaged 687\n",
441
+ "Funds not received from closed account 673\n",
442
+ "Trouble with how payments are being handled 650\n",
443
+ "Didn't receive notice of right to dispute 644\n",
444
+ "Can't close your account 598\n",
445
+ "Problem accessing account 561\n",
446
+ "Account opened as a result of fraud 561\n",
447
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
448
+ "Card opened as result of identity theft or fraud 511\n",
449
+ "Billing problem 503\n",
450
+ "Name: count, dtype: int64"
451
+ ]
452
+ },
453
+ "execution_count": 61,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "reduced_subissues['Sub-issue'].value_counts()"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 62,
465
+ "id": "6204eb53-1a5b-457f-ab67-957d73f568af",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
470
+ "final_df_2023 = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 63,
476
+ "id": "781850e8-cd50-4d08-87aa-8d86715cc2ef",
477
+ "metadata": {},
478
+ "outputs": [
479
+ {
480
+ "data": {
481
+ "text/plain": [
482
+ "(247517, 5)"
483
+ ]
484
+ },
485
+ "execution_count": 63,
486
+ "metadata": {},
487
+ "output_type": "execute_result"
488
+ }
489
+ ],
490
+ "source": [
491
+ "final_df_2023.shape"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "id": "0ab4f91f-c938-4093-a299-b895ea13121a",
497
+ "metadata": {},
498
+ "source": [
499
+ "### Value counts"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": 64,
505
+ "id": "17ddf55c-f824-4b2c-8059-07d02597a1cb",
506
+ "metadata": {},
507
+ "outputs": [
508
+ {
509
+ "data": {
510
+ "text/plain": [
511
+ "Product\n",
512
+ "Credit Reporting 211695\n",
513
+ "Checking or savings account 12285\n",
514
+ "Credit/Prepaid Card 11975\n",
515
+ "Debt collection 9380\n",
516
+ "Loans / Mortgage 2182\n",
517
+ "Name: count, dtype: int64"
518
+ ]
519
+ },
520
+ "execution_count": 64,
521
+ "metadata": {},
522
+ "output_type": "execute_result"
523
+ }
524
+ ],
525
+ "source": [
526
+ "final_df_2023['Product'].value_counts()"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "code",
531
+ "execution_count": 65,
532
+ "id": "eae2d688-f706-4c31-9228-1ae7eadbf228",
533
+ "metadata": {},
534
+ "outputs": [
535
+ {
536
+ "data": {
537
+ "text/plain": [
538
+ "Sub-product\n",
539
+ "Credit reporting 210735\n",
540
+ "General-purpose credit card or charge card 10668\n",
541
+ "Checking account 10409\n",
542
+ "Other debt 3041\n",
543
+ "I do not know 2316\n",
544
+ "Credit card debt 1652\n",
545
+ "Federal student loan servicing 1344\n",
546
+ "Store credit card 1307\n",
547
+ "Medical debt 1053\n",
548
+ "Savings account 989\n",
549
+ "Other personal consumer report 960\n",
550
+ "Loan 732\n",
551
+ "Other banking product or service 725\n",
552
+ "Auto debt 581\n",
553
+ "Telecommunications debt 419\n",
554
+ "Rental debt 179\n",
555
+ "CD (Certificate of Deposit) 162\n",
556
+ "Mortgage debt 139\n",
557
+ "Conventional home mortgage 106\n",
558
+ "Name: count, dtype: int64"
559
+ ]
560
+ },
561
+ "execution_count": 65,
562
+ "metadata": {},
563
+ "output_type": "execute_result"
564
+ }
565
+ ],
566
+ "source": [
567
+ "final_df_2023['Sub-product'].value_counts()"
568
+ ]
569
+ },
570
+ {
571
+ "cell_type": "code",
572
+ "execution_count": 66,
573
+ "id": "61179ec3-f49f-4d2a-adde-738a0ff89371",
574
+ "metadata": {},
575
+ "outputs": [
576
+ {
577
+ "data": {
578
+ "text/plain": [
579
+ "Issue\n",
580
+ "Incorrect information on your report 87200\n",
581
+ "Improper use of your report 61868\n",
582
+ "Problem with a credit reporting company's investigation into an existing problem 45371\n",
583
+ "Problem with a company's investigation into an existing problem 20985\n",
584
+ "Managing an account 7367\n",
585
+ "Attempts to collect debt not owed 5453\n",
586
+ "Problem with a purchase shown on your statement 3253\n",
587
+ "Written notification about debt 2404\n",
588
+ "Closing an account 1975\n",
589
+ "Problem with a lender or other company charging your account 1378\n",
590
+ "Dealing with your lender or servicer 1293\n",
591
+ "Unable to get your credit report or credit score 1109\n",
592
+ "Problem caused by your funds being low 951\n",
593
+ "False statements or representation 861\n",
594
+ "Problem when making payments 840\n",
595
+ "Closing your account 813\n",
596
+ "Fees or interest 749\n",
597
+ "Other features, terms, or problems 701\n",
598
+ "Took or threatened to take negative or legal action 662\n",
599
+ "Opening an account 561\n",
600
+ "Getting a credit card 511\n",
601
+ "Credit monitoring or identity theft protection services 495\n",
602
+ "Managing the loan or lease 468\n",
603
+ "Problem with a company's investigation into an existing issue 223\n",
604
+ "Identity theft protection or other monitoring services 26\n",
605
+ "Name: count, dtype: int64"
606
+ ]
607
+ },
608
+ "execution_count": 66,
609
+ "metadata": {},
610
+ "output_type": "execute_result"
611
+ }
612
+ ],
613
+ "source": [
614
+ "final_df_2023['Issue'].value_counts()"
615
+ ]
616
+ },
617
+ {
618
+ "cell_type": "code",
619
+ "execution_count": 67,
620
+ "id": "928750bb-7324-480f-aaa1-a4438841399c",
621
+ "metadata": {},
622
+ "outputs": [
623
+ {
624
+ "data": {
625
+ "text/plain": [
626
+ "Sub-issue\n",
627
+ "Information belongs to someone else 57850\n",
628
+ "Reporting company used your report improperly 48732\n",
629
+ "Their investigation did not fix an error on your report 45395\n",
630
+ "Credit inquiries on your report that you don't recognize 13136\n",
631
+ "Account status incorrect 10208\n",
632
+ "Account information incorrect 9267\n",
633
+ "Was not notified of investigation status or results 9200\n",
634
+ "Investigation took more than 30 days 8928\n",
635
+ "Personal information incorrect 5900\n",
636
+ "Debt is not yours 2785\n",
637
+ "Deposits and withdrawals 2626\n",
638
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
639
+ "Didn't receive enough information to verify debt 1777\n",
640
+ "Debt was result of identity theft 1727\n",
641
+ "Old information reappears or never goes away 1714\n",
642
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
643
+ "Company closed your account 1517\n",
644
+ "Problem using a debit or ATM card 1503\n",
645
+ "Public record information inaccurate 1384\n",
646
+ "Transaction was not authorized 1378\n",
647
+ "Problem with personal statement of dispute 1352\n",
648
+ "Other problem getting your report or credit score 1109\n",
649
+ "Card was charged for something you did not purchase with the card 964\n",
650
+ "Banking errors 958\n",
651
+ "Funds not handled or disbursed as instructed 955\n",
652
+ "Overdrafts and overdraft fees 951\n",
653
+ "Debt was paid 941\n",
654
+ "Information is missing that should be on the report 877\n",
655
+ "Attempted to collect wrong amount 861\n",
656
+ "Problem during payment process 840\n",
657
+ "Fee problem 764\n",
658
+ "Problem with fees 749\n",
659
+ "Other problem 701\n",
660
+ "Received bad information about your loan 677\n",
661
+ "Funds not received from closed account 673\n",
662
+ "Threatened or suggested your credit would be damaged 662\n",
663
+ "Didn't receive notice of right to dispute 627\n",
664
+ "Trouble with how payments are being handled 616\n",
665
+ "Can't close your account 598\n",
666
+ "Problem accessing account 561\n",
667
+ "Account opened as a result of fraud 561\n",
668
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
669
+ "Card opened as result of identity theft or fraud 511\n",
670
+ "Billing problem 468\n",
671
+ "Name: count, dtype: int64"
672
+ ]
673
+ },
674
+ "execution_count": 67,
675
+ "metadata": {},
676
+ "output_type": "execute_result"
677
+ }
678
+ ],
679
+ "source": [
680
+ "final_df_2023['Sub-issue'].value_counts()"
681
+ ]
682
+ },
683
+ {
684
+ "cell_type": "markdown",
685
+ "id": "fd91e57e-766c-4c4b-92c1-4b61469be9b4",
686
+ "metadata": {},
687
+ "source": [
688
+ "### Unique categories"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "code",
693
+ "execution_count": 68,
694
+ "id": "028803cd-86c0-4c8a-9fab-8f05ba6793a1",
695
+ "metadata": {},
696
+ "outputs": [
697
+ {
698
+ "name": "stdout",
699
+ "output_type": "stream",
700
+ "text": [
701
+ "Unique Product offerings: 5\n",
702
+ "Unique Sub-product offerings: 19\n",
703
+ "Unique Issue offerings: 25\n",
704
+ "Unique Sub-issue offerings: 44\n"
705
+ ]
706
+ }
707
+ ],
708
+ "source": [
709
+ "print(f\"Unique Product offerings: {final_df_2023['Product'].nunique()}\")\n",
710
+ "print(f\"Unique Sub-product offerings: {final_df_2023['Sub-product'].nunique()}\")\n",
711
+ "print(f\"Unique Issue offerings: {final_df_2023['Issue'].nunique()}\")\n",
712
+ "print(f\"Unique Sub-issue offerings: {final_df_2023['Sub-issue'].nunique()}\")"
713
+ ]
714
+ },
715
+ {
716
+ "cell_type": "markdown",
717
+ "id": "06ea0454-ed84-450a-90f7-e7552ffc181f",
718
+ "metadata": {},
719
+ "source": [
720
+ "### Preparing the train and test splits"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "code",
725
+ "execution_count": 69,
726
+ "id": "267b771c-f944-443a-8048-c2f0097f4f29",
727
+ "metadata": {},
728
+ "outputs": [],
729
+ "source": [
730
+ "from sklearn.model_selection import train_test_split"
731
+ ]
732
+ },
733
+ {
734
+ "cell_type": "code",
735
+ "execution_count": 70,
736
+ "id": "eebed808-66b4-4fa8-a0ce-872b70d18106",
737
+ "metadata": {},
738
+ "outputs": [
739
+ {
740
+ "data": {
741
+ "text/html": [
742
+ "<div>\n",
743
+ "<style scoped>\n",
744
+ " .dataframe tbody tr th:only-of-type {\n",
745
+ " vertical-align: middle;\n",
746
+ " }\n",
747
+ "\n",
748
+ " .dataframe tbody tr th {\n",
749
+ " vertical-align: top;\n",
750
+ " }\n",
751
+ "\n",
752
+ " .dataframe thead th {\n",
753
+ " text-align: right;\n",
754
+ " }\n",
755
+ "</style>\n",
756
+ "<table border=\"1\" class=\"dataframe\">\n",
757
+ " <thead>\n",
758
+ " <tr style=\"text-align: right;\">\n",
759
+ " <th></th>\n",
760
+ " <th>Consumer complaint narrative</th>\n",
761
+ " <th>Product</th>\n",
762
+ " <th>Sub-product</th>\n",
763
+ " <th>Issue</th>\n",
764
+ " <th>Sub-issue</th>\n",
765
+ " </tr>\n",
766
+ " </thead>\n",
767
+ " <tbody>\n",
768
+ " <tr>\n",
769
+ " <th>1</th>\n",
770
+ " <td>I have previously disputed this item with you ...</td>\n",
771
+ " <td>Credit Reporting</td>\n",
772
+ " <td>Credit reporting</td>\n",
773
+ " <td>Problem with a company's investigation into an...</td>\n",
774
+ " <td>Investigation took more than 30 days</td>\n",
775
+ " </tr>\n",
776
+ " <tr>\n",
777
+ " <th>2</th>\n",
778
+ " <td>I kindly request that you update my credit rep...</td>\n",
779
+ " <td>Debt collection</td>\n",
780
+ " <td>Other debt</td>\n",
781
+ " <td>Attempts to collect debt not owed</td>\n",
782
+ " <td>Debt was result of identity theft</td>\n",
783
+ " </tr>\n",
784
+ " <tr>\n",
785
+ " <th>3</th>\n",
786
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
787
+ " <td>Debt collection</td>\n",
788
+ " <td>Other debt</td>\n",
789
+ " <td>Attempts to collect debt not owed</td>\n",
790
+ " <td>Debt was result of identity theft</td>\n",
791
+ " </tr>\n",
792
+ " <tr>\n",
793
+ " <th>4</th>\n",
794
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
795
+ " <td>Credit Reporting</td>\n",
796
+ " <td>Credit reporting</td>\n",
797
+ " <td>Incorrect information on your report</td>\n",
798
+ " <td>Information belongs to someone else</td>\n",
799
+ " </tr>\n",
800
+ " <tr>\n",
801
+ " <th>5</th>\n",
802
+ " <td>In accordance with Fair c=Credit Reporting Act...</td>\n",
803
+ " <td>Credit Reporting</td>\n",
804
+ " <td>Credit reporting</td>\n",
805
+ " <td>Improper use of your report</td>\n",
806
+ " <td>Reporting company used your report improperly</td>\n",
807
+ " </tr>\n",
808
+ " </tbody>\n",
809
+ "</table>\n",
810
+ "</div>"
811
+ ],
812
+ "text/plain": [
813
+ " Consumer complaint narrative Product \\\n",
814
+ "1 I have previously disputed this item with you ... Credit Reporting \n",
815
+ "2 I kindly request that you update my credit rep... Debt collection \n",
816
+ "3 I implore you to conduct a comprehensive inves... Debt collection \n",
817
+ "4 In accordance with the Fair Credit Reporting A... Credit Reporting \n",
818
+ "5 In accordance with Fair c=Credit Reporting Act... Credit Reporting \n",
819
+ "\n",
820
+ " Sub-product Issue \\\n",
821
+ "1 Credit reporting Problem with a company's investigation into an... \n",
822
+ "2 Other debt Attempts to collect debt not owed \n",
823
+ "3 Other debt Attempts to collect debt not owed \n",
824
+ "4 Credit reporting Incorrect information on your report \n",
825
+ "5 Credit reporting Improper use of your report \n",
826
+ "\n",
827
+ " Sub-issue \n",
828
+ "1 Investigation took more than 30 days \n",
829
+ "2 Debt was result of identity theft \n",
830
+ "3 Debt was result of identity theft \n",
831
+ "4 Information belongs to someone else \n",
832
+ "5 Reporting company used your report improperly "
833
+ ]
834
+ },
835
+ "execution_count": 70,
836
+ "metadata": {},
837
+ "output_type": "execute_result"
838
+ }
839
+ ],
840
+ "source": [
841
+ "final_df_2023.head()"
842
+ ]
843
+ },
844
+ {
845
+ "cell_type": "code",
846
+ "execution_count": 86,
847
+ "id": "da025cda-f04e-4822-b100-855e981d632a",
848
+ "metadata": {},
849
+ "outputs": [],
850
+ "source": [
851
+ "X = final_df_2023['Consumer complaint narrative']\n",
852
+ "y = final_df_2023[['Product','Sub-product','Issue','Sub-issue']]\n",
853
+ "\n",
854
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y['Product'],test_size=0.25,random_state=42)"
855
+ ]
856
+ },
857
+ {
858
+ "cell_type": "code",
859
+ "execution_count": 91,
860
+ "id": "d291102d-7136-4512-84c2-ba970b169cbf",
861
+ "metadata": {},
862
+ "outputs": [],
863
+ "source": [
864
+ "train_df = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)\n",
865
+ "test_df = pd.concat([X_test,y_test],axis = 1).reset_index(drop = True)"
866
+ ]
867
+ },
868
+ {
869
+ "cell_type": "code",
870
+ "execution_count": 92,
871
+ "id": "0006636f-24cf-41dd-98cd-dc3a2b65432f",
872
+ "metadata": {},
873
+ "outputs": [
874
+ {
875
+ "data": {
876
+ "text/html": [
877
+ "<div>\n",
878
+ "<style scoped>\n",
879
+ " .dataframe tbody tr th:only-of-type {\n",
880
+ " vertical-align: middle;\n",
881
+ " }\n",
882
+ "\n",
883
+ " .dataframe tbody tr th {\n",
884
+ " vertical-align: top;\n",
885
+ " }\n",
886
+ "\n",
887
+ " .dataframe thead th {\n",
888
+ " text-align: right;\n",
889
+ " }\n",
890
+ "</style>\n",
891
+ "<table border=\"1\" class=\"dataframe\">\n",
892
+ " <thead>\n",
893
+ " <tr style=\"text-align: right;\">\n",
894
+ " <th></th>\n",
895
+ " <th>Consumer complaint narrative</th>\n",
896
+ " <th>Product</th>\n",
897
+ " <th>Sub-product</th>\n",
898
+ " <th>Issue</th>\n",
899
+ " <th>Sub-issue</th>\n",
900
+ " </tr>\n",
901
+ " </thead>\n",
902
+ " <tbody>\n",
903
+ " <tr>\n",
904
+ " <th>0</th>\n",
905
+ " <td>The credit bureaus keep disrespecting the laws...</td>\n",
906
+ " <td>Credit Reporting</td>\n",
907
+ " <td>Credit reporting</td>\n",
908
+ " <td>Problem with a company's investigation into an...</td>\n",
909
+ " <td>Their investigation did not fix an error on yo...</td>\n",
910
+ " </tr>\n",
911
+ " <tr>\n",
912
+ " <th>1</th>\n",
913
+ " <td>I sent in a complaint in XXXX of 2021 about so...</td>\n",
914
+ " <td>Credit Reporting</td>\n",
915
+ " <td>Credit reporting</td>\n",
916
+ " <td>Incorrect information on your report</td>\n",
917
+ " <td>Information belongs to someone else</td>\n",
918
+ " </tr>\n",
919
+ " <tr>\n",
920
+ " <th>2</th>\n",
921
+ " <td>I ordered a copy of my report and I found out ...</td>\n",
922
+ " <td>Credit Reporting</td>\n",
923
+ " <td>Credit reporting</td>\n",
924
+ " <td>Problem with a credit reporting company's inve...</td>\n",
925
+ " <td>Their investigation did not fix an error on yo...</td>\n",
926
+ " </tr>\n",
927
+ " <tr>\n",
928
+ " <th>3</th>\n",
929
+ " <td>It appears that my credit file has been compro...</td>\n",
930
+ " <td>Credit Reporting</td>\n",
931
+ " <td>Credit reporting</td>\n",
932
+ " <td>Incorrect information on your report</td>\n",
933
+ " <td>Information belongs to someone else</td>\n",
934
+ " </tr>\n",
935
+ " <tr>\n",
936
+ " <th>4</th>\n",
937
+ " <td>I have never authorized, consented to nor bene...</td>\n",
938
+ " <td>Credit Reporting</td>\n",
939
+ " <td>Credit reporting</td>\n",
940
+ " <td>Incorrect information on your report</td>\n",
941
+ " <td>Information belongs to someone else</td>\n",
942
+ " </tr>\n",
943
+ " </tbody>\n",
944
+ "</table>\n",
945
+ "</div>"
946
+ ],
947
+ "text/plain": [
948
+ " Consumer complaint narrative Product \\\n",
949
+ "0 The credit bureaus keep disrespecting the laws... Credit Reporting \n",
950
+ "1 I sent in a complaint in XXXX of 2021 about so... Credit Reporting \n",
951
+ "2 I ordered a copy of my report and I found out ... Credit Reporting \n",
952
+ "3 It appears that my credit file has been compro... Credit Reporting \n",
953
+ "4 I have never authorized, consented to nor bene... Credit Reporting \n",
954
+ "\n",
955
+ " Sub-product Issue \\\n",
956
+ "0 Credit reporting Problem with a company's investigation into an... \n",
957
+ "1 Credit reporting Incorrect information on your report \n",
958
+ "2 Credit reporting Problem with a credit reporting company's inve... \n",
959
+ "3 Credit reporting Incorrect information on your report \n",
960
+ "4 Credit reporting Incorrect information on your report \n",
961
+ "\n",
962
+ " Sub-issue \n",
963
+ "0 Their investigation did not fix an error on yo... \n",
964
+ "1 Information belongs to someone else \n",
965
+ "2 Their investigation did not fix an error on yo... \n",
966
+ "3 Information belongs to someone else \n",
967
+ "4 Information belongs to someone else "
968
+ ]
969
+ },
970
+ "execution_count": 92,
971
+ "metadata": {},
972
+ "output_type": "execute_result"
973
+ }
974
+ ],
975
+ "source": [
976
+ "train_df.head()"
977
+ ]
978
+ },
979
+ {
980
+ "cell_type": "code",
981
+ "execution_count": 94,
982
+ "id": "724b3508-7e79-4526-a20f-3797250f9cf9",
983
+ "metadata": {},
984
+ "outputs": [
985
+ {
986
+ "data": {
987
+ "text/plain": [
988
+ "(185637, 5)"
989
+ ]
990
+ },
991
+ "execution_count": 94,
992
+ "metadata": {},
993
+ "output_type": "execute_result"
994
+ }
995
+ ],
996
+ "source": [
997
+ "train_df.shape"
998
+ ]
999
+ },
1000
+ {
1001
+ "cell_type": "code",
1002
+ "execution_count": 95,
1003
+ "id": "06972769-eddd-4ee7-9ebc-e6f587ad5366",
1004
+ "metadata": {},
1005
+ "outputs": [
1006
+ {
1007
+ "data": {
1008
+ "text/plain": [
1009
+ "(61880, 5)"
1010
+ ]
1011
+ },
1012
+ "execution_count": 95,
1013
+ "metadata": {},
1014
+ "output_type": "execute_result"
1015
+ }
1016
+ ],
1017
+ "source": [
1018
+ "test_df.shape"
1019
+ ]
1020
+ },
1021
+ {
1022
+ "cell_type": "code",
1023
+ "execution_count": 99,
1024
+ "id": "de358d80-fd59-4f9c-83ee-2264659f4b0f",
1025
+ "metadata": {},
1026
+ "outputs": [],
1027
+ "source": [
1028
+ "import os\n",
1029
+ "\n",
1030
+ "directory_to_save = './data_splits/'\n",
1031
+ "\n",
1032
+ "if not os.path.exists(directory_to_save):\n",
1033
+ " os.makedirs(directory_to_save)\n",
1034
+ "\n",
1035
+ "train_df.to_csv(directory_to_save + 'train-data-split.csv',index = False)\n",
1036
+ "test_df.to_csv(directory_to_save + 'test-data-split.csv',index = False)"
1037
+ ]
1038
+ }
1039
+ ],
1040
+ "metadata": {
1041
+ "kernelspec": {
1042
+ "display_name": "Python 3 (ipykernel)",
1043
+ "language": "python",
1044
+ "name": "python3"
1045
+ },
1046
+ "language_info": {
1047
+ "codemirror_mode": {
1048
+ "name": "ipython",
1049
+ "version": 3
1050
+ },
1051
+ "file_extension": ".py",
1052
+ "mimetype": "text/x-python",
1053
+ "name": "python",
1054
+ "nbconvert_exporter": "python",
1055
+ "pygments_lexer": "ipython3",
1056
+ "version": "3.9.19"
1057
+ }
1058
+ },
1059
+ "nbformat": 4,
1060
+ "nbformat_minor": 5
1061
+ }
notebooks/.ipynb_checkpoints/Complaints preprocessing_new-checkpoint.ipynb ADDED
@@ -0,0 +1,1102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cd6a338a-9a00-45f4-ac13-9ed131c9049e",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Loading data (year 2023)"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "2e8de3f1-6812-4c0d-bd56-32459911000e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "import plotly.express as px"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "ad45c437-7720-445e-8fa1-27d2b14b7bb5",
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "/tmp/ipykernel_9929/219708379.py:1: DtypeWarning: Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.\n",
36
+ " df = pd.read_csv('./complaints.csv')\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "df = pd.read_csv('./complaints.csv')\n",
42
+ "df['Date received'] = pd.to_datetime(df['Date received'])\n",
43
+ "\n",
44
+ "cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',\n",
45
+ " 'State', 'ZIP code', 'Date received']\n",
46
+ "df_new = df[cols_to_consider]\n",
47
+ "\n",
48
+ "df_new = df_new.dropna()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 3,
54
+ "id": "6df32835-7186-4c57-bffa-536f779636fe",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "df_2023 = df_new[df_new['Date received'].dt.year.isin([2023])].reset_index(drop=True)\n",
59
+ "\n",
60
+ "product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',\n",
61
+ " 'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',\n",
62
+ " 'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',\n",
63
+ " 'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',\n",
64
+ " 'Student loan' : 'Loans / Mortgage',\n",
65
+ " 'Vehicle loan or lease' : 'Loans / Mortgage',\n",
66
+ " 'Debt collection' : 'Debt collection',\n",
67
+ " 'Credit card or prepaid card' : 'Credit/Prepaid Card',\n",
68
+ " 'Credit card' : 'Credit/Prepaid Card',\n",
69
+ " 'Prepaid card' : 'Credit/Prepaid Card',\n",
70
+ " 'Mortgage' : 'Loans / Mortgage',\n",
71
+ " 'Checking or savings account' : 'Checking or savings account' \n",
72
+ " }\n",
73
+ "\n",
74
+ "df_2023.loc[:,'Product'] = df_2023['Product'].map(product_map)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 4,
80
+ "id": "679ffbe3-a6ba-4f4d-bf65-0690794fb4e1",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/html": [
86
+ "<div>\n",
87
+ "<style scoped>\n",
88
+ " .dataframe tbody tr th:only-of-type {\n",
89
+ " vertical-align: middle;\n",
90
+ " }\n",
91
+ "\n",
92
+ " .dataframe tbody tr th {\n",
93
+ " vertical-align: top;\n",
94
+ " }\n",
95
+ "\n",
96
+ " .dataframe thead th {\n",
97
+ " text-align: right;\n",
98
+ " }\n",
99
+ "</style>\n",
100
+ "<table border=\"1\" class=\"dataframe\">\n",
101
+ " <thead>\n",
102
+ " <tr style=\"text-align: right;\">\n",
103
+ " <th></th>\n",
104
+ " <th>Product</th>\n",
105
+ " <th>Sub-product</th>\n",
106
+ " <th>Issue</th>\n",
107
+ " <th>Sub-issue</th>\n",
108
+ " <th>Consumer complaint narrative</th>\n",
109
+ " <th>Company public response</th>\n",
110
+ " <th>Company</th>\n",
111
+ " <th>State</th>\n",
112
+ " <th>ZIP code</th>\n",
113
+ " <th>Date received</th>\n",
114
+ " </tr>\n",
115
+ " </thead>\n",
116
+ " <tbody>\n",
117
+ " <tr>\n",
118
+ " <th>0</th>\n",
119
+ " <td>Checking or savings account</td>\n",
120
+ " <td>Other banking product or service</td>\n",
121
+ " <td>Opening an account</td>\n",
122
+ " <td>Account opened without my consent or knowledge</td>\n",
123
+ " <td>Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX...</td>\n",
124
+ " <td>Company has responded to the consumer and the ...</td>\n",
125
+ " <td>WELLS FARGO &amp; COMPANY</td>\n",
126
+ " <td>NC</td>\n",
127
+ " <td>27513</td>\n",
128
+ " <td>2023-12-29</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>1</th>\n",
132
+ " <td>Credit Reporting</td>\n",
133
+ " <td>Credit reporting</td>\n",
134
+ " <td>Problem with a company's investigation into an...</td>\n",
135
+ " <td>Investigation took more than 30 days</td>\n",
136
+ " <td>I have previously disputed this item with you ...</td>\n",
137
+ " <td>Company has responded to the consumer and the ...</td>\n",
138
+ " <td>Experian Information Solutions Inc.</td>\n",
139
+ " <td>MN</td>\n",
140
+ " <td>55124</td>\n",
141
+ " <td>2023-12-29</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>2</th>\n",
145
+ " <td>Debt collection</td>\n",
146
+ " <td>Other debt</td>\n",
147
+ " <td>Attempts to collect debt not owed</td>\n",
148
+ " <td>Debt was result of identity theft</td>\n",
149
+ " <td>I kindly request that you update my credit rep...</td>\n",
150
+ " <td>Company has responded to the consumer and the ...</td>\n",
151
+ " <td>Experian Information Solutions Inc.</td>\n",
152
+ " <td>IL</td>\n",
153
+ " <td>60621</td>\n",
154
+ " <td>2023-12-28</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>3</th>\n",
158
+ " <td>Debt collection</td>\n",
159
+ " <td>Other debt</td>\n",
160
+ " <td>Attempts to collect debt not owed</td>\n",
161
+ " <td>Debt was result of identity theft</td>\n",
162
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
163
+ " <td>Company has responded to the consumer and the ...</td>\n",
164
+ " <td>Experian Information Solutions Inc.</td>\n",
165
+ " <td>NJ</td>\n",
166
+ " <td>08723</td>\n",
167
+ " <td>2023-12-28</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>4</th>\n",
171
+ " <td>Credit Reporting</td>\n",
172
+ " <td>Credit reporting</td>\n",
173
+ " <td>Incorrect information on your report</td>\n",
174
+ " <td>Information belongs to someone else</td>\n",
175
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
176
+ " <td>Company has responded to the consumer and the ...</td>\n",
177
+ " <td>TRANSUNION INTERMEDIATE HOLDINGS, INC.</td>\n",
178
+ " <td>TX</td>\n",
179
+ " <td>77377</td>\n",
180
+ " <td>2023-11-27</td>\n",
181
+ " </tr>\n",
182
+ " </tbody>\n",
183
+ "</table>\n",
184
+ "</div>"
185
+ ],
186
+ "text/plain": [
187
+ " Product Sub-product \\\n",
188
+ "0 Checking or savings account Other banking product or service \n",
189
+ "1 Credit Reporting Credit reporting \n",
190
+ "2 Debt collection Other debt \n",
191
+ "3 Debt collection Other debt \n",
192
+ "4 Credit Reporting Credit reporting \n",
193
+ "\n",
194
+ " Issue \\\n",
195
+ "0 Opening an account \n",
196
+ "1 Problem with a company's investigation into an... \n",
197
+ "2 Attempts to collect debt not owed \n",
198
+ "3 Attempts to collect debt not owed \n",
199
+ "4 Incorrect information on your report \n",
200
+ "\n",
201
+ " Sub-issue \\\n",
202
+ "0 Account opened without my consent or knowledge \n",
203
+ "1 Investigation took more than 30 days \n",
204
+ "2 Debt was result of identity theft \n",
205
+ "3 Debt was result of identity theft \n",
206
+ "4 Information belongs to someone else \n",
207
+ "\n",
208
+ " Consumer complaint narrative \\\n",
209
+ "0 Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX... \n",
210
+ "1 I have previously disputed this item with you ... \n",
211
+ "2 I kindly request that you update my credit rep... \n",
212
+ "3 I implore you to conduct a comprehensive inves... \n",
213
+ "4 In accordance with the Fair Credit Reporting A... \n",
214
+ "\n",
215
+ " Company public response \\\n",
216
+ "0 Company has responded to the consumer and the ... \n",
217
+ "1 Company has responded to the consumer and the ... \n",
218
+ "2 Company has responded to the consumer and the ... \n",
219
+ "3 Company has responded to the consumer and the ... \n",
220
+ "4 Company has responded to the consumer and the ... \n",
221
+ "\n",
222
+ " Company State ZIP code Date received \n",
223
+ "0 WELLS FARGO & COMPANY NC 27513 2023-12-29 \n",
224
+ "1 Experian Information Solutions Inc. MN 55124 2023-12-29 \n",
225
+ "2 Experian Information Solutions Inc. IL 60621 2023-12-28 \n",
226
+ "3 Experian Information Solutions Inc. NJ 08723 2023-12-28 \n",
227
+ "4 TRANSUNION INTERMEDIATE HOLDINGS, INC. TX 77377 2023-11-27 "
228
+ ]
229
+ },
230
+ "execution_count": 4,
231
+ "metadata": {},
232
+ "output_type": "execute_result"
233
+ }
234
+ ],
235
+ "source": [
236
+ "df_2023.head()"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 5,
242
+ "id": "a85ec9b1-5de7-47f9-b204-4d42c8880bbb",
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',\n",
249
+ " 'Consumer complaint narrative', 'Company public response', 'Company',\n",
250
+ " 'State', 'ZIP code', 'Date received'],\n",
251
+ " dtype='object')"
252
+ ]
253
+ },
254
+ "execution_count": 5,
255
+ "metadata": {},
256
+ "output_type": "execute_result"
257
+ }
258
+ ],
259
+ "source": [
260
+ "df_2023.columns"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "id": "0487636d-9663-4fdb-b219-f9e6be257b51",
266
+ "metadata": {},
267
+ "source": [
268
+ "### Complaint pre-processing"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 6,
274
+ "id": "e35208c6-020a-4fb9-8c9f-13fdeee44935",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "df_2023['complaint length'] = df_2023['Consumer complaint narrative'].apply(lambda x : len(x))"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 7,
284
+ "id": "63deb9bb-d48a-460b-8edb-f66575ec1eaf",
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "df_2023 = df_2023[df_2023['complaint length'] > 20]\n",
289
+ "\n",
290
+ "complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
291
+ "'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
292
+ "'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
293
+ "'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
294
+ "\n",
295
+ "df_2023 = df_2023[~df_2023['Consumer complaint narrative'].isin(complaints_to_exclude)]"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "markdown",
300
+ "id": "492f8261-3e01-41d5-8f24-82bd289ee229",
301
+ "metadata": {},
302
+ "source": [
303
+ "### Categories consideration"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 8,
309
+ "id": "0be9e1f3-61aa-494a-bd0c-a6afeab5aacd",
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "data": {
314
+ "text/plain": [
315
+ "(264968, 5)"
316
+ ]
317
+ },
318
+ "execution_count": 8,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "df_2023_subset = df_2023[['Consumer complaint narrative','Product','Sub-product','Issue','Sub-issue']]\n",
325
+ "df_2023_subset.shape"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 9,
331
+ "id": "33e4e7e3-6661-48aa-aec2-b706fa64338d",
332
+ "metadata": {},
333
+ "outputs": [
334
+ {
335
+ "data": {
336
+ "text/plain": [
337
+ "Product\n",
338
+ "Credit Reporting 213403\n",
339
+ "Credit/Prepaid Card 16319\n",
340
+ "Checking or savings account 15143\n",
341
+ "Debt collection 11767\n",
342
+ "Loans / Mortgage 8336\n",
343
+ "Name: count, dtype: int64"
344
+ ]
345
+ },
346
+ "execution_count": 9,
347
+ "metadata": {},
348
+ "output_type": "execute_result"
349
+ }
350
+ ],
351
+ "source": [
352
+ "df_2023_subset['Product'].value_counts()"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 10,
358
+ "id": "dbc49ba8-f15a-4a4b-b018-d9d2273620ba",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "sub_issues_to_consider = df_2023_subset['Sub-issue'].value_counts()[df_2023_subset['Sub-issue'].value_counts() > 500].index"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 11,
368
+ "id": "746db565-e6ff-4ab2-bf92-d56088c0f2da",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "reduced_subissues = df_2023_subset[df_2023_subset['Sub-issue'].isin(sub_issues_to_consider)]"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 12,
378
+ "id": "0f786b1d-b139-40b5-ad53-639d8687d3b4",
379
+ "metadata": {},
380
+ "outputs": [
381
+ {
382
+ "data": {
383
+ "text/plain": [
384
+ "(248065, 5)"
385
+ ]
386
+ },
387
+ "execution_count": 12,
388
+ "metadata": {},
389
+ "output_type": "execute_result"
390
+ }
391
+ ],
392
+ "source": [
393
+ "reduced_subissues.shape"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 13,
399
+ "id": "f64515e8-ac65-4041-a201-a8576a86d7ad",
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "Sub-issue\n",
406
+ "Information belongs to someone else 57877\n",
407
+ "Reporting company used your report improperly 48781\n",
408
+ "Their investigation did not fix an error on your report 45407\n",
409
+ "Credit inquiries on your report that you don't recognize 13150\n",
410
+ "Account status incorrect 10271\n",
411
+ "Account information incorrect 9307\n",
412
+ "Was not notified of investigation status or results 9201\n",
413
+ "Investigation took more than 30 days 8937\n",
414
+ "Personal information incorrect 5900\n",
415
+ "Debt is not yours 2821\n",
416
+ "Deposits and withdrawals 2626\n",
417
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
418
+ "Didn't receive enough information to verify debt 1816\n",
419
+ "Debt was result of identity theft 1761\n",
420
+ "Old information reappears or never goes away 1716\n",
421
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1709\n",
422
+ "Company closed your account 1517\n",
423
+ "Problem using a debit or ATM card 1503\n",
424
+ "Public record information inaccurate 1389\n",
425
+ "Transaction was not authorized 1378\n",
426
+ "Problem with personal statement of dispute 1361\n",
427
+ "Other problem getting your report or credit score 1112\n",
428
+ "Debt was paid 969\n",
429
+ "Card was charged for something you did not purchase with the card 964\n",
430
+ "Banking errors 958\n",
431
+ "Funds not handled or disbursed as instructed 955\n",
432
+ "Overdrafts and overdraft fees 951\n",
433
+ "Attempted to collect wrong amount 885\n",
434
+ "Information is missing that should be on the report 881\n",
435
+ "Problem during payment process 840\n",
436
+ "Fee problem 764\n",
437
+ "Problem with fees 749\n",
438
+ "Received bad information about your loan 710\n",
439
+ "Other problem 701\n",
440
+ "Threatened or suggested your credit would be damaged 687\n",
441
+ "Funds not received from closed account 673\n",
442
+ "Trouble with how payments are being handled 650\n",
443
+ "Didn't receive notice of right to dispute 644\n",
444
+ "Can't close your account 598\n",
445
+ "Problem accessing account 561\n",
446
+ "Account opened as a result of fraud 561\n",
447
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
448
+ "Card opened as result of identity theft or fraud 511\n",
449
+ "Billing problem 503\n",
450
+ "Name: count, dtype: int64"
451
+ ]
452
+ },
453
+ "execution_count": 13,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "reduced_subissues['Sub-issue'].value_counts()"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 14,
465
+ "id": "6204eb53-1a5b-457f-ab67-957d73f568af",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
470
+ "final_df_2023 = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 15,
476
+ "id": "781850e8-cd50-4d08-87aa-8d86715cc2ef",
477
+ "metadata": {},
478
+ "outputs": [
479
+ {
480
+ "data": {
481
+ "text/plain": [
482
+ "(247517, 5)"
483
+ ]
484
+ },
485
+ "execution_count": 15,
486
+ "metadata": {},
487
+ "output_type": "execute_result"
488
+ }
489
+ ],
490
+ "source": [
491
+ "final_df_2023.shape"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "id": "563955e5-8b1b-4d67-a552-5d1b69ff8891",
497
+ "metadata": {},
498
+ "source": [
499
+ "### Issue categories grouping"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": 16,
505
+ "id": "8cb41375-d72e-4f90-bde1-6ff13af37082",
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "issues_to_subissues = {}\n",
510
+ "for issue in final_df_2023['Issue'].value_counts().index:\n",
511
+ " issues_to_subissues[issue] = list(final_df_2023[final_df_2023['Issue'] == issue]['Sub-issue'].value_counts().to_dict().keys())\n",
512
+ "\n",
513
+ "one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) == 1}\n",
514
+ "more_than_one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) > 1}\n",
515
+ "\n",
516
+ "existing_issue_mapping = {issue : issue for issue in more_than_one_subissue}\n",
517
+ "\n",
518
+ "issue_renaming = {\n",
519
+ " 'Problem with a lender or other company charging your account': 'Account Operations and Unauthorized Transaction Issues',\n",
520
+ " 'Opening an account': 'Account Operations and Unauthorized Transaction Issues',\n",
521
+ " 'Getting a credit card': 'Account Operations and Unauthorized Transaction Issues',\n",
522
+ "\n",
523
+ " 'Unable to get your credit report or credit score': 'Credit Report and Monitoring Issues',\n",
524
+ " 'Credit monitoring or identity theft protection services': 'Credit Report and Monitoring Issues',\n",
525
+ " 'Identity theft protection or other monitoring services': 'Credit Report and Monitoring Issues',\n",
526
+ " \n",
527
+ " 'Problem caused by your funds being low': 'Payment and Funds Management',\n",
528
+ " 'Problem when making payments': 'Payment and Funds Management',\n",
529
+ " 'Managing the loan or lease': 'Payment and Funds Management',\n",
530
+ "\n",
531
+ " 'False statements or representation': 'Disputes and Misrepresentations',\n",
532
+ " 'Fees or interest': 'Disputes and Misrepresentations',\n",
533
+ " 'Other features, terms, or problems': 'Disputes and Misrepresentations',\n",
534
+ "\n",
535
+ " 'Took or threatened to take negative or legal action': 'Legal and Threat Actions'\n",
536
+ "}\n",
537
+ "\n",
538
+ "issues_mapping = {**issue_renaming, **existing_issue_mapping}\n",
539
+ "\n",
540
+ "final_df_2023.loc[:,'Issue'] = final_df_2023['Issue'].apply(lambda x : issues_mapping[x])"
541
+ ]
542
+ },
543
+ {
544
+ "cell_type": "markdown",
545
+ "id": "0ab4f91f-c938-4093-a299-b895ea13121a",
546
+ "metadata": {},
547
+ "source": [
548
+ "### Value counts"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": 17,
554
+ "id": "17ddf55c-f824-4b2c-8059-07d02597a1cb",
555
+ "metadata": {},
556
+ "outputs": [
557
+ {
558
+ "data": {
559
+ "text/plain": [
560
+ "Product\n",
561
+ "Credit Reporting 211695\n",
562
+ "Checking or savings account 12285\n",
563
+ "Credit/Prepaid Card 11975\n",
564
+ "Debt collection 9380\n",
565
+ "Loans / Mortgage 2182\n",
566
+ "Name: count, dtype: int64"
567
+ ]
568
+ },
569
+ "execution_count": 17,
570
+ "metadata": {},
571
+ "output_type": "execute_result"
572
+ }
573
+ ],
574
+ "source": [
575
+ "final_df_2023['Product'].value_counts()"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 18,
581
+ "id": "eae2d688-f706-4c31-9228-1ae7eadbf228",
582
+ "metadata": {},
583
+ "outputs": [
584
+ {
585
+ "data": {
586
+ "text/plain": [
587
+ "Sub-product\n",
588
+ "Credit reporting 210735\n",
589
+ "General-purpose credit card or charge card 10668\n",
590
+ "Checking account 10409\n",
591
+ "Other debt 3041\n",
592
+ "I do not know 2316\n",
593
+ "Credit card debt 1652\n",
594
+ "Federal student loan servicing 1344\n",
595
+ "Store credit card 1307\n",
596
+ "Medical debt 1053\n",
597
+ "Savings account 989\n",
598
+ "Other personal consumer report 960\n",
599
+ "Loan 732\n",
600
+ "Other banking product or service 725\n",
601
+ "Auto debt 581\n",
602
+ "Telecommunications debt 419\n",
603
+ "Rental debt 179\n",
604
+ "CD (Certificate of Deposit) 162\n",
605
+ "Mortgage debt 139\n",
606
+ "Conventional home mortgage 106\n",
607
+ "Name: count, dtype: int64"
608
+ ]
609
+ },
610
+ "execution_count": 18,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "final_df_2023['Sub-product'].value_counts()"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "61179ec3-f49f-4d2a-adde-738a0ff89371",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "data": {
627
+ "text/plain": [
628
+ "Issue\n",
629
+ "Incorrect information on your report 87200\n",
630
+ "Improper use of your report 61868\n",
631
+ "Problem with a credit reporting company's investigation into an existing problem 45371\n",
632
+ "Problem with a company's investigation into an existing problem 20985\n",
633
+ "Managing an account 7367\n",
634
+ "Attempts to collect debt not owed 5453\n",
635
+ "Problem with a purchase shown on your statement 3253\n",
636
+ "Account Operations and Unauthorized Transaction Issues 2450\n",
637
+ "Written notification about debt 2404\n",
638
+ "Disputes and Misrepresentations 2311\n",
639
+ "Payment and Funds Management 2259\n",
640
+ "Closing an account 1975\n",
641
+ "Credit Report and Monitoring Issues 1630\n",
642
+ "Dealing with your lender or servicer 1293\n",
643
+ "Closing your account 813\n",
644
+ "Legal and Threat Actions 662\n",
645
+ "Problem with a company's investigation into an existing issue 223\n",
646
+ "Name: count, dtype: int64"
647
+ ]
648
+ },
649
+ "execution_count": 19,
650
+ "metadata": {},
651
+ "output_type": "execute_result"
652
+ }
653
+ ],
654
+ "source": [
655
+ "final_df_2023['Issue'].value_counts()"
656
+ ]
657
+ },
658
+ {
659
+ "cell_type": "code",
660
+ "execution_count": 20,
661
+ "id": "928750bb-7324-480f-aaa1-a4438841399c",
662
+ "metadata": {},
663
+ "outputs": [
664
+ {
665
+ "data": {
666
+ "text/plain": [
667
+ "Sub-issue\n",
668
+ "Information belongs to someone else 57850\n",
669
+ "Reporting company used your report improperly 48732\n",
670
+ "Their investigation did not fix an error on your report 45395\n",
671
+ "Credit inquiries on your report that you don't recognize 13136\n",
672
+ "Account status incorrect 10208\n",
673
+ "Account information incorrect 9267\n",
674
+ "Was not notified of investigation status or results 9200\n",
675
+ "Investigation took more than 30 days 8928\n",
676
+ "Personal information incorrect 5900\n",
677
+ "Debt is not yours 2785\n",
678
+ "Deposits and withdrawals 2626\n",
679
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
680
+ "Didn't receive enough information to verify debt 1777\n",
681
+ "Debt was result of identity theft 1727\n",
682
+ "Old information reappears or never goes away 1714\n",
683
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
684
+ "Company closed your account 1517\n",
685
+ "Problem using a debit or ATM card 1503\n",
686
+ "Public record information inaccurate 1384\n",
687
+ "Transaction was not authorized 1378\n",
688
+ "Problem with personal statement of dispute 1352\n",
689
+ "Other problem getting your report or credit score 1109\n",
690
+ "Card was charged for something you did not purchase with the card 964\n",
691
+ "Banking errors 958\n",
692
+ "Funds not handled or disbursed as instructed 955\n",
693
+ "Overdrafts and overdraft fees 951\n",
694
+ "Debt was paid 941\n",
695
+ "Information is missing that should be on the report 877\n",
696
+ "Attempted to collect wrong amount 861\n",
697
+ "Problem during payment process 840\n",
698
+ "Fee problem 764\n",
699
+ "Problem with fees 749\n",
700
+ "Other problem 701\n",
701
+ "Received bad information about your loan 677\n",
702
+ "Funds not received from closed account 673\n",
703
+ "Threatened or suggested your credit would be damaged 662\n",
704
+ "Didn't receive notice of right to dispute 627\n",
705
+ "Trouble with how payments are being handled 616\n",
706
+ "Can't close your account 598\n",
707
+ "Problem accessing account 561\n",
708
+ "Account opened as a result of fraud 561\n",
709
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
710
+ "Card opened as result of identity theft or fraud 511\n",
711
+ "Billing problem 468\n",
712
+ "Name: count, dtype: int64"
713
+ ]
714
+ },
715
+ "execution_count": 20,
716
+ "metadata": {},
717
+ "output_type": "execute_result"
718
+ }
719
+ ],
720
+ "source": [
721
+ "final_df_2023['Sub-issue'].value_counts()"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "markdown",
726
+ "id": "fd91e57e-766c-4c4b-92c1-4b61469be9b4",
727
+ "metadata": {},
728
+ "source": [
729
+ "### Unique categories"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": 21,
735
+ "id": "028803cd-86c0-4c8a-9fab-8f05ba6793a1",
736
+ "metadata": {},
737
+ "outputs": [
738
+ {
739
+ "name": "stdout",
740
+ "output_type": "stream",
741
+ "text": [
742
+ "Unique Product offerings: 5\n",
743
+ "Unique Sub-product offerings: 19\n",
744
+ "Unique Issue offerings: 17\n",
745
+ "Unique Sub-issue offerings: 44\n"
746
+ ]
747
+ }
748
+ ],
749
+ "source": [
750
+ "print(f\"Unique Product offerings: {final_df_2023['Product'].nunique()}\")\n",
751
+ "print(f\"Unique Sub-product offerings: {final_df_2023['Sub-product'].nunique()}\")\n",
752
+ "print(f\"Unique Issue offerings: {final_df_2023['Issue'].nunique()}\")\n",
753
+ "print(f\"Unique Sub-issue offerings: {final_df_2023['Sub-issue'].nunique()}\")"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "markdown",
758
+ "id": "06ea0454-ed84-450a-90f7-e7552ffc181f",
759
+ "metadata": {},
760
+ "source": [
761
+ "### Preparing the train and test splits"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": 22,
767
+ "id": "267b771c-f944-443a-8048-c2f0097f4f29",
768
+ "metadata": {},
769
+ "outputs": [],
770
+ "source": [
771
+ "from sklearn.model_selection import train_test_split"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 23,
777
+ "id": "eebed808-66b4-4fa8-a0ce-872b70d18106",
778
+ "metadata": {},
779
+ "outputs": [
780
+ {
781
+ "data": {
782
+ "text/html": [
783
+ "<div>\n",
784
+ "<style scoped>\n",
785
+ " .dataframe tbody tr th:only-of-type {\n",
786
+ " vertical-align: middle;\n",
787
+ " }\n",
788
+ "\n",
789
+ " .dataframe tbody tr th {\n",
790
+ " vertical-align: top;\n",
791
+ " }\n",
792
+ "\n",
793
+ " .dataframe thead th {\n",
794
+ " text-align: right;\n",
795
+ " }\n",
796
+ "</style>\n",
797
+ "<table border=\"1\" class=\"dataframe\">\n",
798
+ " <thead>\n",
799
+ " <tr style=\"text-align: right;\">\n",
800
+ " <th></th>\n",
801
+ " <th>Consumer complaint narrative</th>\n",
802
+ " <th>Product</th>\n",
803
+ " <th>Sub-product</th>\n",
804
+ " <th>Issue</th>\n",
805
+ " <th>Sub-issue</th>\n",
806
+ " </tr>\n",
807
+ " </thead>\n",
808
+ " <tbody>\n",
809
+ " <tr>\n",
810
+ " <th>1</th>\n",
811
+ " <td>I have previously disputed this item with you ...</td>\n",
812
+ " <td>Credit Reporting</td>\n",
813
+ " <td>Credit reporting</td>\n",
814
+ " <td>Problem with a company's investigation into an...</td>\n",
815
+ " <td>Investigation took more than 30 days</td>\n",
816
+ " </tr>\n",
817
+ " <tr>\n",
818
+ " <th>2</th>\n",
819
+ " <td>I kindly request that you update my credit rep...</td>\n",
820
+ " <td>Debt collection</td>\n",
821
+ " <td>Other debt</td>\n",
822
+ " <td>Attempts to collect debt not owed</td>\n",
823
+ " <td>Debt was result of identity theft</td>\n",
824
+ " </tr>\n",
825
+ " <tr>\n",
826
+ " <th>3</th>\n",
827
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
828
+ " <td>Debt collection</td>\n",
829
+ " <td>Other debt</td>\n",
830
+ " <td>Attempts to collect debt not owed</td>\n",
831
+ " <td>Debt was result of identity theft</td>\n",
832
+ " </tr>\n",
833
+ " <tr>\n",
834
+ " <th>4</th>\n",
835
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
836
+ " <td>Credit Reporting</td>\n",
837
+ " <td>Credit reporting</td>\n",
838
+ " <td>Incorrect information on your report</td>\n",
839
+ " <td>Information belongs to someone else</td>\n",
840
+ " </tr>\n",
841
+ " <tr>\n",
842
+ " <th>5</th>\n",
843
+ " <td>In accordance with Fair c=Credit Reporting Act...</td>\n",
844
+ " <td>Credit Reporting</td>\n",
845
+ " <td>Credit reporting</td>\n",
846
+ " <td>Improper use of your report</td>\n",
847
+ " <td>Reporting company used your report improperly</td>\n",
848
+ " </tr>\n",
849
+ " </tbody>\n",
850
+ "</table>\n",
851
+ "</div>"
852
+ ],
853
+ "text/plain": [
854
+ " Consumer complaint narrative Product \\\n",
855
+ "1 I have previously disputed this item with you ... Credit Reporting \n",
856
+ "2 I kindly request that you update my credit rep... Debt collection \n",
857
+ "3 I implore you to conduct a comprehensive inves... Debt collection \n",
858
+ "4 In accordance with the Fair Credit Reporting A... Credit Reporting \n",
859
+ "5 In accordance with Fair c=Credit Reporting Act... Credit Reporting \n",
860
+ "\n",
861
+ " Sub-product Issue \\\n",
862
+ "1 Credit reporting Problem with a company's investigation into an... \n",
863
+ "2 Other debt Attempts to collect debt not owed \n",
864
+ "3 Other debt Attempts to collect debt not owed \n",
865
+ "4 Credit reporting Incorrect information on your report \n",
866
+ "5 Credit reporting Improper use of your report \n",
867
+ "\n",
868
+ " Sub-issue \n",
869
+ "1 Investigation took more than 30 days \n",
870
+ "2 Debt was result of identity theft \n",
871
+ "3 Debt was result of identity theft \n",
872
+ "4 Information belongs to someone else \n",
873
+ "5 Reporting company used your report improperly "
874
+ ]
875
+ },
876
+ "execution_count": 23,
877
+ "metadata": {},
878
+ "output_type": "execute_result"
879
+ }
880
+ ],
881
+ "source": [
882
+ "final_df_2023.head()"
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": 24,
888
+ "id": "da025cda-f04e-4822-b100-855e981d632a",
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "X = final_df_2023['Consumer complaint narrative']\n",
893
+ "y = final_df_2023[['Product','Sub-product','Issue','Sub-issue']]\n",
894
+ "\n",
895
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y['Product'],test_size=0.25,random_state=42)"
896
+ ]
897
+ },
898
+ {
899
+ "cell_type": "code",
900
+ "execution_count": 25,
901
+ "id": "d291102d-7136-4512-84c2-ba970b169cbf",
902
+ "metadata": {},
903
+ "outputs": [],
904
+ "source": [
905
+ "train_df = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)\n",
906
+ "test_df = pd.concat([X_test,y_test],axis = 1).reset_index(drop = True)"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": 26,
912
+ "id": "0006636f-24cf-41dd-98cd-dc3a2b65432f",
913
+ "metadata": {},
914
+ "outputs": [
915
+ {
916
+ "data": {
917
+ "text/html": [
918
+ "<div>\n",
919
+ "<style scoped>\n",
920
+ " .dataframe tbody tr th:only-of-type {\n",
921
+ " vertical-align: middle;\n",
922
+ " }\n",
923
+ "\n",
924
+ " .dataframe tbody tr th {\n",
925
+ " vertical-align: top;\n",
926
+ " }\n",
927
+ "\n",
928
+ " .dataframe thead th {\n",
929
+ " text-align: right;\n",
930
+ " }\n",
931
+ "</style>\n",
932
+ "<table border=\"1\" class=\"dataframe\">\n",
933
+ " <thead>\n",
934
+ " <tr style=\"text-align: right;\">\n",
935
+ " <th></th>\n",
936
+ " <th>Consumer complaint narrative</th>\n",
937
+ " <th>Product</th>\n",
938
+ " <th>Sub-product</th>\n",
939
+ " <th>Issue</th>\n",
940
+ " <th>Sub-issue</th>\n",
941
+ " </tr>\n",
942
+ " </thead>\n",
943
+ " <tbody>\n",
944
+ " <tr>\n",
945
+ " <th>0</th>\n",
946
+ " <td>The credit bureaus keep disrespecting the laws...</td>\n",
947
+ " <td>Credit Reporting</td>\n",
948
+ " <td>Credit reporting</td>\n",
949
+ " <td>Problem with a company's investigation into an...</td>\n",
950
+ " <td>Their investigation did not fix an error on yo...</td>\n",
951
+ " </tr>\n",
952
+ " <tr>\n",
953
+ " <th>1</th>\n",
954
+ " <td>I sent in a complaint in XXXX of 2021 about so...</td>\n",
955
+ " <td>Credit Reporting</td>\n",
956
+ " <td>Credit reporting</td>\n",
957
+ " <td>Incorrect information on your report</td>\n",
958
+ " <td>Information belongs to someone else</td>\n",
959
+ " </tr>\n",
960
+ " <tr>\n",
961
+ " <th>2</th>\n",
962
+ " <td>I ordered a copy of my report and I found out ...</td>\n",
963
+ " <td>Credit Reporting</td>\n",
964
+ " <td>Credit reporting</td>\n",
965
+ " <td>Problem with a credit reporting company's inve...</td>\n",
966
+ " <td>Their investigation did not fix an error on yo...</td>\n",
967
+ " </tr>\n",
968
+ " <tr>\n",
969
+ " <th>3</th>\n",
970
+ " <td>It appears that my credit file has been compro...</td>\n",
971
+ " <td>Credit Reporting</td>\n",
972
+ " <td>Credit reporting</td>\n",
973
+ " <td>Incorrect information on your report</td>\n",
974
+ " <td>Information belongs to someone else</td>\n",
975
+ " </tr>\n",
976
+ " <tr>\n",
977
+ " <th>4</th>\n",
978
+ " <td>I have never authorized, consented to nor bene...</td>\n",
979
+ " <td>Credit Reporting</td>\n",
980
+ " <td>Credit reporting</td>\n",
981
+ " <td>Incorrect information on your report</td>\n",
982
+ " <td>Information belongs to someone else</td>\n",
983
+ " </tr>\n",
984
+ " </tbody>\n",
985
+ "</table>\n",
986
+ "</div>"
987
+ ],
988
+ "text/plain": [
989
+ " Consumer complaint narrative Product \\\n",
990
+ "0 The credit bureaus keep disrespecting the laws... Credit Reporting \n",
991
+ "1 I sent in a complaint in XXXX of 2021 about so... Credit Reporting \n",
992
+ "2 I ordered a copy of my report and I found out ... Credit Reporting \n",
993
+ "3 It appears that my credit file has been compro... Credit Reporting \n",
994
+ "4 I have never authorized, consented to nor bene... Credit Reporting \n",
995
+ "\n",
996
+ " Sub-product Issue \\\n",
997
+ "0 Credit reporting Problem with a company's investigation into an... \n",
998
+ "1 Credit reporting Incorrect information on your report \n",
999
+ "2 Credit reporting Problem with a credit reporting company's inve... \n",
1000
+ "3 Credit reporting Incorrect information on your report \n",
1001
+ "4 Credit reporting Incorrect information on your report \n",
1002
+ "\n",
1003
+ " Sub-issue \n",
1004
+ "0 Their investigation did not fix an error on yo... \n",
1005
+ "1 Information belongs to someone else \n",
1006
+ "2 Their investigation did not fix an error on yo... \n",
1007
+ "3 Information belongs to someone else \n",
1008
+ "4 Information belongs to someone else "
1009
+ ]
1010
+ },
1011
+ "execution_count": 26,
1012
+ "metadata": {},
1013
+ "output_type": "execute_result"
1014
+ }
1015
+ ],
1016
+ "source": [
1017
+ "train_df.head()"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": 27,
1023
+ "id": "724b3508-7e79-4526-a20f-3797250f9cf9",
1024
+ "metadata": {},
1025
+ "outputs": [
1026
+ {
1027
+ "data": {
1028
+ "text/plain": [
1029
+ "(185637, 5)"
1030
+ ]
1031
+ },
1032
+ "execution_count": 27,
1033
+ "metadata": {},
1034
+ "output_type": "execute_result"
1035
+ }
1036
+ ],
1037
+ "source": [
1038
+ "train_df.shape"
1039
+ ]
1040
+ },
1041
+ {
1042
+ "cell_type": "code",
1043
+ "execution_count": 28,
1044
+ "id": "06972769-eddd-4ee7-9ebc-e6f587ad5366",
1045
+ "metadata": {},
1046
+ "outputs": [
1047
+ {
1048
+ "data": {
1049
+ "text/plain": [
1050
+ "(61880, 5)"
1051
+ ]
1052
+ },
1053
+ "execution_count": 28,
1054
+ "metadata": {},
1055
+ "output_type": "execute_result"
1056
+ }
1057
+ ],
1058
+ "source": [
1059
+ "test_df.shape"
1060
+ ]
1061
+ },
1062
+ {
1063
+ "cell_type": "code",
1064
+ "execution_count": 29,
1065
+ "id": "de358d80-fd59-4f9c-83ee-2264659f4b0f",
1066
+ "metadata": {},
1067
+ "outputs": [],
1068
+ "source": [
1069
+ "import os\n",
1070
+ "\n",
1071
+ "directory_to_save = './data_splits/'\n",
1072
+ "\n",
1073
+ "if not os.path.exists(directory_to_save):\n",
1074
+ " os.makedirs(directory_to_save)\n",
1075
+ "\n",
1076
+ "train_df.to_csv(directory_to_save + 'train-data-split.csv',index = False)\n",
1077
+ "test_df.to_csv(directory_to_save + 'test-data-split.csv',index = False)"
1078
+ ]
1079
+ }
1080
+ ],
1081
+ "metadata": {
1082
+ "kernelspec": {
1083
+ "display_name": "Python 3 (ipykernel)",
1084
+ "language": "python",
1085
+ "name": "python3"
1086
+ },
1087
+ "language_info": {
1088
+ "codemirror_mode": {
1089
+ "name": "ipython",
1090
+ "version": 3
1091
+ },
1092
+ "file_extension": ".py",
1093
+ "mimetype": "text/x-python",
1094
+ "name": "python",
1095
+ "nbconvert_exporter": "python",
1096
+ "pygments_lexer": "ipython3",
1097
+ "version": "3.10.13"
1098
+ }
1099
+ },
1100
+ "nbformat": 4,
1101
+ "nbformat_minor": 5
1102
+ }
notebooks/.ipynb_checkpoints/Data Exploration-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/.ipynb_checkpoints/Data preprocessing-checkpoint.ipynb ADDED
@@ -0,0 +1,1069 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cd6a338a-9a00-45f4-ac13-9ed131c9049e",
6
+ "metadata": {
7
+ "jp-MarkdownHeadingCollapsed": true
8
+ },
9
+ "source": [
10
+ "### Loading data (year 2023)"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "2e8de3f1-6812-4c0d-bd56-32459911000e",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import numpy as np\n",
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt\n",
23
+ "import seaborn as sns\n",
24
+ "import plotly.express as px"
25
+ ]
26
+ },
27
+ {
28
+ "cell_type": "code",
29
+ "execution_count": 2,
30
+ "id": "ad45c437-7720-445e-8fa1-27d2b14b7bb5",
31
+ "metadata": {},
32
+ "outputs": [
33
+ {
34
+ "name": "stderr",
35
+ "output_type": "stream",
36
+ "text": [
37
+ "/tmp/ipykernel_42602/219708379.py:1: DtypeWarning: Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.\n",
38
+ " df = pd.read_csv('./complaints.csv')\n"
39
+ ]
40
+ }
41
+ ],
42
+ "source": [
43
+ "df = pd.read_csv('./complaints.csv')\n",
44
+ "df['Date received'] = pd.to_datetime(df['Date received'])\n",
45
+ "\n",
46
+ "cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',\n",
47
+ " 'State', 'ZIP code', 'Date received']\n",
48
+ "df_new = df[cols_to_consider]\n",
49
+ "\n",
50
+ "df_new = df_new.dropna()"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "code",
55
+ "execution_count": 29,
56
+ "id": "6df32835-7186-4c57-bffa-536f779636fe",
57
+ "metadata": {},
58
+ "outputs": [],
59
+ "source": [
60
+ "df_2023 = df_new[df_new['Date received'].dt.year.isin([2023])].reset_index(drop=True)\n",
61
+ "\n",
62
+ "product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',\n",
63
+ " 'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',\n",
64
+ " 'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',\n",
65
+ " 'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',\n",
66
+ " 'Student loan' : 'Loans / Mortgage',\n",
67
+ " 'Vehicle loan or lease' : 'Loans / Mortgage',\n",
68
+ " 'Debt collection' : 'Debt collection',\n",
69
+ " 'Credit card or prepaid card' : 'Credit/Prepaid Card',\n",
70
+ " 'Credit card' : 'Credit/Prepaid Card',\n",
71
+ " 'Prepaid card' : 'Credit/Prepaid Card',\n",
72
+ " 'Mortgage' : 'Loans / Mortgage',\n",
73
+ " 'Checking or savings account' : 'Checking or savings account' \n",
74
+ " }\n",
75
+ "\n",
76
+ "df_2023.loc[:,'Product'] = df_2023['Product'].map(product_map)"
77
+ ]
78
+ },
79
+ {
80
+ "cell_type": "code",
81
+ "execution_count": 30,
82
+ "id": "679ffbe3-a6ba-4f4d-bf65-0690794fb4e1",
83
+ "metadata": {},
84
+ "outputs": [
85
+ {
86
+ "data": {
87
+ "text/html": [
88
+ "<div>\n",
89
+ "<style scoped>\n",
90
+ " .dataframe tbody tr th:only-of-type {\n",
91
+ " vertical-align: middle;\n",
92
+ " }\n",
93
+ "\n",
94
+ " .dataframe tbody tr th {\n",
95
+ " vertical-align: top;\n",
96
+ " }\n",
97
+ "\n",
98
+ " .dataframe thead th {\n",
99
+ " text-align: right;\n",
100
+ " }\n",
101
+ "</style>\n",
102
+ "<table border=\"1\" class=\"dataframe\">\n",
103
+ " <thead>\n",
104
+ " <tr style=\"text-align: right;\">\n",
105
+ " <th></th>\n",
106
+ " <th>Product</th>\n",
107
+ " <th>Sub-product</th>\n",
108
+ " <th>Issue</th>\n",
109
+ " <th>Sub-issue</th>\n",
110
+ " <th>Consumer complaint narrative</th>\n",
111
+ " <th>Company public response</th>\n",
112
+ " <th>Company</th>\n",
113
+ " <th>State</th>\n",
114
+ " <th>ZIP code</th>\n",
115
+ " <th>Date received</th>\n",
116
+ " </tr>\n",
117
+ " </thead>\n",
118
+ " <tbody>\n",
119
+ " <tr>\n",
120
+ " <th>0</th>\n",
121
+ " <td>Checking or savings account</td>\n",
122
+ " <td>Other banking product or service</td>\n",
123
+ " <td>Opening an account</td>\n",
124
+ " <td>Account opened without my consent or knowledge</td>\n",
125
+ " <td>Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX...</td>\n",
126
+ " <td>Company has responded to the consumer and the ...</td>\n",
127
+ " <td>WELLS FARGO &amp; COMPANY</td>\n",
128
+ " <td>NC</td>\n",
129
+ " <td>27513</td>\n",
130
+ " <td>2023-12-29</td>\n",
131
+ " </tr>\n",
132
+ " <tr>\n",
133
+ " <th>1</th>\n",
134
+ " <td>Credit Reporting</td>\n",
135
+ " <td>Credit reporting</td>\n",
136
+ " <td>Problem with a company's investigation into an...</td>\n",
137
+ " <td>Investigation took more than 30 days</td>\n",
138
+ " <td>I have previously disputed this item with you ...</td>\n",
139
+ " <td>Company has responded to the consumer and the ...</td>\n",
140
+ " <td>Experian Information Solutions Inc.</td>\n",
141
+ " <td>MN</td>\n",
142
+ " <td>55124</td>\n",
143
+ " <td>2023-12-29</td>\n",
144
+ " </tr>\n",
145
+ " <tr>\n",
146
+ " <th>2</th>\n",
147
+ " <td>Debt collection</td>\n",
148
+ " <td>Other debt</td>\n",
149
+ " <td>Attempts to collect debt not owed</td>\n",
150
+ " <td>Debt was result of identity theft</td>\n",
151
+ " <td>I kindly request that you update my credit rep...</td>\n",
152
+ " <td>Company has responded to the consumer and the ...</td>\n",
153
+ " <td>Experian Information Solutions Inc.</td>\n",
154
+ " <td>IL</td>\n",
155
+ " <td>60621</td>\n",
156
+ " <td>2023-12-28</td>\n",
157
+ " </tr>\n",
158
+ " <tr>\n",
159
+ " <th>3</th>\n",
160
+ " <td>Debt collection</td>\n",
161
+ " <td>Other debt</td>\n",
162
+ " <td>Attempts to collect debt not owed</td>\n",
163
+ " <td>Debt was result of identity theft</td>\n",
164
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
165
+ " <td>Company has responded to the consumer and the ...</td>\n",
166
+ " <td>Experian Information Solutions Inc.</td>\n",
167
+ " <td>NJ</td>\n",
168
+ " <td>08723</td>\n",
169
+ " <td>2023-12-28</td>\n",
170
+ " </tr>\n",
171
+ " <tr>\n",
172
+ " <th>4</th>\n",
173
+ " <td>Credit Reporting</td>\n",
174
+ " <td>Credit reporting</td>\n",
175
+ " <td>Incorrect information on your report</td>\n",
176
+ " <td>Information belongs to someone else</td>\n",
177
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
178
+ " <td>Company has responded to the consumer and the ...</td>\n",
179
+ " <td>TRANSUNION INTERMEDIATE HOLDINGS, INC.</td>\n",
180
+ " <td>TX</td>\n",
181
+ " <td>77377</td>\n",
182
+ " <td>2023-11-27</td>\n",
183
+ " </tr>\n",
184
+ " </tbody>\n",
185
+ "</table>\n",
186
+ "</div>"
187
+ ],
188
+ "text/plain": [
189
+ " Product Sub-product \\\n",
190
+ "0 Checking or savings account Other banking product or service \n",
191
+ "1 Credit Reporting Credit reporting \n",
192
+ "2 Debt collection Other debt \n",
193
+ "3 Debt collection Other debt \n",
194
+ "4 Credit Reporting Credit reporting \n",
195
+ "\n",
196
+ " Issue \\\n",
197
+ "0 Opening an account \n",
198
+ "1 Problem with a company's investigation into an... \n",
199
+ "2 Attempts to collect debt not owed \n",
200
+ "3 Attempts to collect debt not owed \n",
201
+ "4 Incorrect information on your report \n",
202
+ "\n",
203
+ " Sub-issue \\\n",
204
+ "0 Account opened without my consent or knowledge \n",
205
+ "1 Investigation took more than 30 days \n",
206
+ "2 Debt was result of identity theft \n",
207
+ "3 Debt was result of identity theft \n",
208
+ "4 Information belongs to someone else \n",
209
+ "\n",
210
+ " Consumer complaint narrative \\\n",
211
+ "0 Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX... \n",
212
+ "1 I have previously disputed this item with you ... \n",
213
+ "2 I kindly request that you update my credit rep... \n",
214
+ "3 I implore you to conduct a comprehensive inves... \n",
215
+ "4 In accordance with the Fair Credit Reporting A... \n",
216
+ "\n",
217
+ " Company public response \\\n",
218
+ "0 Company has responded to the consumer and the ... \n",
219
+ "1 Company has responded to the consumer and the ... \n",
220
+ "2 Company has responded to the consumer and the ... \n",
221
+ "3 Company has responded to the consumer and the ... \n",
222
+ "4 Company has responded to the consumer and the ... \n",
223
+ "\n",
224
+ " Company State ZIP code Date received \n",
225
+ "0 WELLS FARGO & COMPANY NC 27513 2023-12-29 \n",
226
+ "1 Experian Information Solutions Inc. MN 55124 2023-12-29 \n",
227
+ "2 Experian Information Solutions Inc. IL 60621 2023-12-28 \n",
228
+ "3 Experian Information Solutions Inc. NJ 08723 2023-12-28 \n",
229
+ "4 TRANSUNION INTERMEDIATE HOLDINGS, INC. TX 77377 2023-11-27 "
230
+ ]
231
+ },
232
+ "execution_count": 30,
233
+ "metadata": {},
234
+ "output_type": "execute_result"
235
+ }
236
+ ],
237
+ "source": [
238
+ "df_2023.head()"
239
+ ]
240
+ },
241
+ {
242
+ "cell_type": "code",
243
+ "execution_count": 31,
244
+ "id": "a85ec9b1-5de7-47f9-b204-4d42c8880bbb",
245
+ "metadata": {},
246
+ "outputs": [
247
+ {
248
+ "data": {
249
+ "text/plain": [
250
+ "Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',\n",
251
+ " 'Consumer complaint narrative', 'Company public response', 'Company',\n",
252
+ " 'State', 'ZIP code', 'Date received'],\n",
253
+ " dtype='object')"
254
+ ]
255
+ },
256
+ "execution_count": 31,
257
+ "metadata": {},
258
+ "output_type": "execute_result"
259
+ }
260
+ ],
261
+ "source": [
262
+ "df_2023.columns"
263
+ ]
264
+ },
265
+ {
266
+ "cell_type": "markdown",
267
+ "id": "0487636d-9663-4fdb-b219-f9e6be257b51",
268
+ "metadata": {
269
+ "jp-MarkdownHeadingCollapsed": true
270
+ },
271
+ "source": [
272
+ "### Complaint pre-processing"
273
+ ]
274
+ },
275
+ {
276
+ "cell_type": "code",
277
+ "execution_count": 32,
278
+ "id": "e35208c6-020a-4fb9-8c9f-13fdeee44935",
279
+ "metadata": {},
280
+ "outputs": [],
281
+ "source": [
282
+ "df_2023['complaint length'] = df_2023['Consumer complaint narrative'].apply(lambda x : len(x))"
283
+ ]
284
+ },
285
+ {
286
+ "cell_type": "code",
287
+ "execution_count": 33,
288
+ "id": "63deb9bb-d48a-460b-8edb-f66575ec1eaf",
289
+ "metadata": {},
290
+ "outputs": [],
291
+ "source": [
292
+ "df_2023 = df_2023[df_2023['complaint length'] > 20]\n",
293
+ "\n",
294
+ "complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
295
+ "'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
296
+ "'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
297
+ "'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
298
+ "\n",
299
+ "df_2023 = df_2023[~df_2023['Consumer complaint narrative'].isin(complaints_to_exclude)]"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "id": "492f8261-3e01-41d5-8f24-82bd289ee229",
305
+ "metadata": {
306
+ "jp-MarkdownHeadingCollapsed": true
307
+ },
308
+ "source": [
309
+ "### Categories consideration"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 56,
315
+ "id": "0be9e1f3-61aa-494a-bd0c-a6afeab5aacd",
316
+ "metadata": {},
317
+ "outputs": [
318
+ {
319
+ "data": {
320
+ "text/plain": [
321
+ "(264968, 5)"
322
+ ]
323
+ },
324
+ "execution_count": 56,
325
+ "metadata": {},
326
+ "output_type": "execute_result"
327
+ }
328
+ ],
329
+ "source": [
330
+ "df_2023_subset = df_2023[['Consumer complaint narrative','Product','Sub-product','Issue','Sub-issue']]\n",
331
+ "df_2023_subset.shape"
332
+ ]
333
+ },
334
+ {
335
+ "cell_type": "code",
336
+ "execution_count": 57,
337
+ "id": "33e4e7e3-6661-48aa-aec2-b706fa64338d",
338
+ "metadata": {},
339
+ "outputs": [
340
+ {
341
+ "data": {
342
+ "text/plain": [
343
+ "Product\n",
344
+ "Credit Reporting 213403\n",
345
+ "Credit/Prepaid Card 16319\n",
346
+ "Checking or savings account 15143\n",
347
+ "Debt collection 11767\n",
348
+ "Loans / Mortgage 8336\n",
349
+ "Name: count, dtype: int64"
350
+ ]
351
+ },
352
+ "execution_count": 57,
353
+ "metadata": {},
354
+ "output_type": "execute_result"
355
+ }
356
+ ],
357
+ "source": [
358
+ "df_2023_subset['Product'].value_counts()"
359
+ ]
360
+ },
361
+ {
362
+ "cell_type": "code",
363
+ "execution_count": 58,
364
+ "id": "dbc49ba8-f15a-4a4b-b018-d9d2273620ba",
365
+ "metadata": {},
366
+ "outputs": [],
367
+ "source": [
368
+ "sub_issues_to_consider = df_2023_subset['Sub-issue'].value_counts()[df_2023_subset['Sub-issue'].value_counts() > 500].index"
369
+ ]
370
+ },
371
+ {
372
+ "cell_type": "code",
373
+ "execution_count": 59,
374
+ "id": "746db565-e6ff-4ab2-bf92-d56088c0f2da",
375
+ "metadata": {},
376
+ "outputs": [],
377
+ "source": [
378
+ "reduced_subissues = df_2023_subset[df_2023_subset['Sub-issue'].isin(sub_issues_to_consider)]"
379
+ ]
380
+ },
381
+ {
382
+ "cell_type": "code",
383
+ "execution_count": 60,
384
+ "id": "0f786b1d-b139-40b5-ad53-639d8687d3b4",
385
+ "metadata": {},
386
+ "outputs": [
387
+ {
388
+ "data": {
389
+ "text/plain": [
390
+ "(248065, 5)"
391
+ ]
392
+ },
393
+ "execution_count": 60,
394
+ "metadata": {},
395
+ "output_type": "execute_result"
396
+ }
397
+ ],
398
+ "source": [
399
+ "reduced_subissues.shape"
400
+ ]
401
+ },
402
+ {
403
+ "cell_type": "code",
404
+ "execution_count": 61,
405
+ "id": "f64515e8-ac65-4041-a201-a8576a86d7ad",
406
+ "metadata": {},
407
+ "outputs": [
408
+ {
409
+ "data": {
410
+ "text/plain": [
411
+ "Sub-issue\n",
412
+ "Information belongs to someone else 57877\n",
413
+ "Reporting company used your report improperly 48781\n",
414
+ "Their investigation did not fix an error on your report 45407\n",
415
+ "Credit inquiries on your report that you don't recognize 13150\n",
416
+ "Account status incorrect 10271\n",
417
+ "Account information incorrect 9307\n",
418
+ "Was not notified of investigation status or results 9201\n",
419
+ "Investigation took more than 30 days 8937\n",
420
+ "Personal information incorrect 5900\n",
421
+ "Debt is not yours 2821\n",
422
+ "Deposits and withdrawals 2626\n",
423
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
424
+ "Didn't receive enough information to verify debt 1816\n",
425
+ "Debt was result of identity theft 1761\n",
426
+ "Old information reappears or never goes away 1716\n",
427
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1709\n",
428
+ "Company closed your account 1517\n",
429
+ "Problem using a debit or ATM card 1503\n",
430
+ "Public record information inaccurate 1389\n",
431
+ "Transaction was not authorized 1378\n",
432
+ "Problem with personal statement of dispute 1361\n",
433
+ "Other problem getting your report or credit score 1112\n",
434
+ "Debt was paid 969\n",
435
+ "Card was charged for something you did not purchase with the card 964\n",
436
+ "Banking errors 958\n",
437
+ "Funds not handled or disbursed as instructed 955\n",
438
+ "Overdrafts and overdraft fees 951\n",
439
+ "Attempted to collect wrong amount 885\n",
440
+ "Information is missing that should be on the report 881\n",
441
+ "Problem during payment process 840\n",
442
+ "Fee problem 764\n",
443
+ "Problem with fees 749\n",
444
+ "Received bad information about your loan 710\n",
445
+ "Other problem 701\n",
446
+ "Threatened or suggested your credit would be damaged 687\n",
447
+ "Funds not received from closed account 673\n",
448
+ "Trouble with how payments are being handled 650\n",
449
+ "Didn't receive notice of right to dispute 644\n",
450
+ "Can't close your account 598\n",
451
+ "Problem accessing account 561\n",
452
+ "Account opened as a result of fraud 561\n",
453
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
454
+ "Card opened as result of identity theft or fraud 511\n",
455
+ "Billing problem 503\n",
456
+ "Name: count, dtype: int64"
457
+ ]
458
+ },
459
+ "execution_count": 61,
460
+ "metadata": {},
461
+ "output_type": "execute_result"
462
+ }
463
+ ],
464
+ "source": [
465
+ "reduced_subissues['Sub-issue'].value_counts()"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 62,
471
+ "id": "6204eb53-1a5b-457f-ab67-957d73f568af",
472
+ "metadata": {},
473
+ "outputs": [],
474
+ "source": [
475
+ "sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
476
+ "final_df_2023 = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "code",
481
+ "execution_count": 63,
482
+ "id": "781850e8-cd50-4d08-87aa-8d86715cc2ef",
483
+ "metadata": {},
484
+ "outputs": [
485
+ {
486
+ "data": {
487
+ "text/plain": [
488
+ "(247517, 5)"
489
+ ]
490
+ },
491
+ "execution_count": 63,
492
+ "metadata": {},
493
+ "output_type": "execute_result"
494
+ }
495
+ ],
496
+ "source": [
497
+ "final_df_2023.shape"
498
+ ]
499
+ },
500
+ {
501
+ "cell_type": "markdown",
502
+ "id": "0ab4f91f-c938-4093-a299-b895ea13121a",
503
+ "metadata": {
504
+ "jp-MarkdownHeadingCollapsed": true
505
+ },
506
+ "source": [
507
+ "### Value counts"
508
+ ]
509
+ },
510
+ {
511
+ "cell_type": "code",
512
+ "execution_count": 64,
513
+ "id": "17ddf55c-f824-4b2c-8059-07d02597a1cb",
514
+ "metadata": {},
515
+ "outputs": [
516
+ {
517
+ "data": {
518
+ "text/plain": [
519
+ "Product\n",
520
+ "Credit Reporting 211695\n",
521
+ "Checking or savings account 12285\n",
522
+ "Credit/Prepaid Card 11975\n",
523
+ "Debt collection 9380\n",
524
+ "Loans / Mortgage 2182\n",
525
+ "Name: count, dtype: int64"
526
+ ]
527
+ },
528
+ "execution_count": 64,
529
+ "metadata": {},
530
+ "output_type": "execute_result"
531
+ }
532
+ ],
533
+ "source": [
534
+ "final_df_2023['Product'].value_counts()"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": 65,
540
+ "id": "eae2d688-f706-4c31-9228-1ae7eadbf228",
541
+ "metadata": {},
542
+ "outputs": [
543
+ {
544
+ "data": {
545
+ "text/plain": [
546
+ "Sub-product\n",
547
+ "Credit reporting 210735\n",
548
+ "General-purpose credit card or charge card 10668\n",
549
+ "Checking account 10409\n",
550
+ "Other debt 3041\n",
551
+ "I do not know 2316\n",
552
+ "Credit card debt 1652\n",
553
+ "Federal student loan servicing 1344\n",
554
+ "Store credit card 1307\n",
555
+ "Medical debt 1053\n",
556
+ "Savings account 989\n",
557
+ "Other personal consumer report 960\n",
558
+ "Loan 732\n",
559
+ "Other banking product or service 725\n",
560
+ "Auto debt 581\n",
561
+ "Telecommunications debt 419\n",
562
+ "Rental debt 179\n",
563
+ "CD (Certificate of Deposit) 162\n",
564
+ "Mortgage debt 139\n",
565
+ "Conventional home mortgage 106\n",
566
+ "Name: count, dtype: int64"
567
+ ]
568
+ },
569
+ "execution_count": 65,
570
+ "metadata": {},
571
+ "output_type": "execute_result"
572
+ }
573
+ ],
574
+ "source": [
575
+ "final_df_2023['Sub-product'].value_counts()"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 66,
581
+ "id": "61179ec3-f49f-4d2a-adde-738a0ff89371",
582
+ "metadata": {},
583
+ "outputs": [
584
+ {
585
+ "data": {
586
+ "text/plain": [
587
+ "Issue\n",
588
+ "Incorrect information on your report 87200\n",
589
+ "Improper use of your report 61868\n",
590
+ "Problem with a credit reporting company's investigation into an existing problem 45371\n",
591
+ "Problem with a company's investigation into an existing problem 20985\n",
592
+ "Managing an account 7367\n",
593
+ "Attempts to collect debt not owed 5453\n",
594
+ "Problem with a purchase shown on your statement 3253\n",
595
+ "Written notification about debt 2404\n",
596
+ "Closing an account 1975\n",
597
+ "Problem with a lender or other company charging your account 1378\n",
598
+ "Dealing with your lender or servicer 1293\n",
599
+ "Unable to get your credit report or credit score 1109\n",
600
+ "Problem caused by your funds being low 951\n",
601
+ "False statements or representation 861\n",
602
+ "Problem when making payments 840\n",
603
+ "Closing your account 813\n",
604
+ "Fees or interest 749\n",
605
+ "Other features, terms, or problems 701\n",
606
+ "Took or threatened to take negative or legal action 662\n",
607
+ "Opening an account 561\n",
608
+ "Getting a credit card 511\n",
609
+ "Credit monitoring or identity theft protection services 495\n",
610
+ "Managing the loan or lease 468\n",
611
+ "Problem with a company's investigation into an existing issue 223\n",
612
+ "Identity theft protection or other monitoring services 26\n",
613
+ "Name: count, dtype: int64"
614
+ ]
615
+ },
616
+ "execution_count": 66,
617
+ "metadata": {},
618
+ "output_type": "execute_result"
619
+ }
620
+ ],
621
+ "source": [
622
+ "final_df_2023['Issue'].value_counts()"
623
+ ]
624
+ },
625
+ {
626
+ "cell_type": "code",
627
+ "execution_count": 67,
628
+ "id": "928750bb-7324-480f-aaa1-a4438841399c",
629
+ "metadata": {},
630
+ "outputs": [
631
+ {
632
+ "data": {
633
+ "text/plain": [
634
+ "Sub-issue\n",
635
+ "Information belongs to someone else 57850\n",
636
+ "Reporting company used your report improperly 48732\n",
637
+ "Their investigation did not fix an error on your report 45395\n",
638
+ "Credit inquiries on your report that you don't recognize 13136\n",
639
+ "Account status incorrect 10208\n",
640
+ "Account information incorrect 9267\n",
641
+ "Was not notified of investigation status or results 9200\n",
642
+ "Investigation took more than 30 days 8928\n",
643
+ "Personal information incorrect 5900\n",
644
+ "Debt is not yours 2785\n",
645
+ "Deposits and withdrawals 2626\n",
646
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
647
+ "Didn't receive enough information to verify debt 1777\n",
648
+ "Debt was result of identity theft 1727\n",
649
+ "Old information reappears or never goes away 1714\n",
650
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
651
+ "Company closed your account 1517\n",
652
+ "Problem using a debit or ATM card 1503\n",
653
+ "Public record information inaccurate 1384\n",
654
+ "Transaction was not authorized 1378\n",
655
+ "Problem with personal statement of dispute 1352\n",
656
+ "Other problem getting your report or credit score 1109\n",
657
+ "Card was charged for something you did not purchase with the card 964\n",
658
+ "Banking errors 958\n",
659
+ "Funds not handled or disbursed as instructed 955\n",
660
+ "Overdrafts and overdraft fees 951\n",
661
+ "Debt was paid 941\n",
662
+ "Information is missing that should be on the report 877\n",
663
+ "Attempted to collect wrong amount 861\n",
664
+ "Problem during payment process 840\n",
665
+ "Fee problem 764\n",
666
+ "Problem with fees 749\n",
667
+ "Other problem 701\n",
668
+ "Received bad information about your loan 677\n",
669
+ "Funds not received from closed account 673\n",
670
+ "Threatened or suggested your credit would be damaged 662\n",
671
+ "Didn't receive notice of right to dispute 627\n",
672
+ "Trouble with how payments are being handled 616\n",
673
+ "Can't close your account 598\n",
674
+ "Problem accessing account 561\n",
675
+ "Account opened as a result of fraud 561\n",
676
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
677
+ "Card opened as result of identity theft or fraud 511\n",
678
+ "Billing problem 468\n",
679
+ "Name: count, dtype: int64"
680
+ ]
681
+ },
682
+ "execution_count": 67,
683
+ "metadata": {},
684
+ "output_type": "execute_result"
685
+ }
686
+ ],
687
+ "source": [
688
+ "final_df_2023['Sub-issue'].value_counts()"
689
+ ]
690
+ },
691
+ {
692
+ "cell_type": "markdown",
693
+ "id": "fd91e57e-766c-4c4b-92c1-4b61469be9b4",
694
+ "metadata": {},
695
+ "source": [
696
+ "### Unique categories"
697
+ ]
698
+ },
699
+ {
700
+ "cell_type": "code",
701
+ "execution_count": 68,
702
+ "id": "028803cd-86c0-4c8a-9fab-8f05ba6793a1",
703
+ "metadata": {},
704
+ "outputs": [
705
+ {
706
+ "name": "stdout",
707
+ "output_type": "stream",
708
+ "text": [
709
+ "Unique Product offerings: 5\n",
710
+ "Unique Sub-product offerings: 19\n",
711
+ "Unique Issue offerings: 25\n",
712
+ "Unique Sub-issue offerings: 44\n"
713
+ ]
714
+ }
715
+ ],
716
+ "source": [
717
+ "print(f\"Unique Product offerings: {final_df_2023['Product'].nunique()}\")\n",
718
+ "print(f\"Unique Sub-product offerings: {final_df_2023['Sub-product'].nunique()}\")\n",
719
+ "print(f\"Unique Issue offerings: {final_df_2023['Issue'].nunique()}\")\n",
720
+ "print(f\"Unique Sub-issue offerings: {final_df_2023['Sub-issue'].nunique()}\")"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "markdown",
725
+ "id": "06ea0454-ed84-450a-90f7-e7552ffc181f",
726
+ "metadata": {},
727
+ "source": [
728
+ "### Preparing the train and test splits"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "code",
733
+ "execution_count": 69,
734
+ "id": "267b771c-f944-443a-8048-c2f0097f4f29",
735
+ "metadata": {},
736
+ "outputs": [],
737
+ "source": [
738
+ "from sklearn.model_selection import train_test_split"
739
+ ]
740
+ },
741
+ {
742
+ "cell_type": "code",
743
+ "execution_count": 70,
744
+ "id": "eebed808-66b4-4fa8-a0ce-872b70d18106",
745
+ "metadata": {},
746
+ "outputs": [
747
+ {
748
+ "data": {
749
+ "text/html": [
750
+ "<div>\n",
751
+ "<style scoped>\n",
752
+ " .dataframe tbody tr th:only-of-type {\n",
753
+ " vertical-align: middle;\n",
754
+ " }\n",
755
+ "\n",
756
+ " .dataframe tbody tr th {\n",
757
+ " vertical-align: top;\n",
758
+ " }\n",
759
+ "\n",
760
+ " .dataframe thead th {\n",
761
+ " text-align: right;\n",
762
+ " }\n",
763
+ "</style>\n",
764
+ "<table border=\"1\" class=\"dataframe\">\n",
765
+ " <thead>\n",
766
+ " <tr style=\"text-align: right;\">\n",
767
+ " <th></th>\n",
768
+ " <th>Consumer complaint narrative</th>\n",
769
+ " <th>Product</th>\n",
770
+ " <th>Sub-product</th>\n",
771
+ " <th>Issue</th>\n",
772
+ " <th>Sub-issue</th>\n",
773
+ " </tr>\n",
774
+ " </thead>\n",
775
+ " <tbody>\n",
776
+ " <tr>\n",
777
+ " <th>1</th>\n",
778
+ " <td>I have previously disputed this item with you ...</td>\n",
779
+ " <td>Credit Reporting</td>\n",
780
+ " <td>Credit reporting</td>\n",
781
+ " <td>Problem with a company's investigation into an...</td>\n",
782
+ " <td>Investigation took more than 30 days</td>\n",
783
+ " </tr>\n",
784
+ " <tr>\n",
785
+ " <th>2</th>\n",
786
+ " <td>I kindly request that you update my credit rep...</td>\n",
787
+ " <td>Debt collection</td>\n",
788
+ " <td>Other debt</td>\n",
789
+ " <td>Attempts to collect debt not owed</td>\n",
790
+ " <td>Debt was result of identity theft</td>\n",
791
+ " </tr>\n",
792
+ " <tr>\n",
793
+ " <th>3</th>\n",
794
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
795
+ " <td>Debt collection</td>\n",
796
+ " <td>Other debt</td>\n",
797
+ " <td>Attempts to collect debt not owed</td>\n",
798
+ " <td>Debt was result of identity theft</td>\n",
799
+ " </tr>\n",
800
+ " <tr>\n",
801
+ " <th>4</th>\n",
802
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
803
+ " <td>Credit Reporting</td>\n",
804
+ " <td>Credit reporting</td>\n",
805
+ " <td>Incorrect information on your report</td>\n",
806
+ " <td>Information belongs to someone else</td>\n",
807
+ " </tr>\n",
808
+ " <tr>\n",
809
+ " <th>5</th>\n",
810
+ " <td>In accordance with Fair c=Credit Reporting Act...</td>\n",
811
+ " <td>Credit Reporting</td>\n",
812
+ " <td>Credit reporting</td>\n",
813
+ " <td>Improper use of your report</td>\n",
814
+ " <td>Reporting company used your report improperly</td>\n",
815
+ " </tr>\n",
816
+ " </tbody>\n",
817
+ "</table>\n",
818
+ "</div>"
819
+ ],
820
+ "text/plain": [
821
+ " Consumer complaint narrative Product \\\n",
822
+ "1 I have previously disputed this item with you ... Credit Reporting \n",
823
+ "2 I kindly request that you update my credit rep... Debt collection \n",
824
+ "3 I implore you to conduct a comprehensive inves... Debt collection \n",
825
+ "4 In accordance with the Fair Credit Reporting A... Credit Reporting \n",
826
+ "5 In accordance with Fair c=Credit Reporting Act... Credit Reporting \n",
827
+ "\n",
828
+ " Sub-product Issue \\\n",
829
+ "1 Credit reporting Problem with a company's investigation into an... \n",
830
+ "2 Other debt Attempts to collect debt not owed \n",
831
+ "3 Other debt Attempts to collect debt not owed \n",
832
+ "4 Credit reporting Incorrect information on your report \n",
833
+ "5 Credit reporting Improper use of your report \n",
834
+ "\n",
835
+ " Sub-issue \n",
836
+ "1 Investigation took more than 30 days \n",
837
+ "2 Debt was result of identity theft \n",
838
+ "3 Debt was result of identity theft \n",
839
+ "4 Information belongs to someone else \n",
840
+ "5 Reporting company used your report improperly "
841
+ ]
842
+ },
843
+ "execution_count": 70,
844
+ "metadata": {},
845
+ "output_type": "execute_result"
846
+ }
847
+ ],
848
+ "source": [
849
+ "final_df_2023.head()"
850
+ ]
851
+ },
852
+ {
853
+ "cell_type": "code",
854
+ "execution_count": 86,
855
+ "id": "da025cda-f04e-4822-b100-855e981d632a",
856
+ "metadata": {},
857
+ "outputs": [],
858
+ "source": [
859
+ "X = final_df_2023['Consumer complaint narrative']\n",
860
+ "y = final_df_2023[['Product','Sub-product','Issue','Sub-issue']]\n",
861
+ "\n",
862
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y['Product'],test_size=0.25,random_state=42)"
863
+ ]
864
+ },
865
+ {
866
+ "cell_type": "code",
867
+ "execution_count": 91,
868
+ "id": "d291102d-7136-4512-84c2-ba970b169cbf",
869
+ "metadata": {},
870
+ "outputs": [],
871
+ "source": [
872
+ "train_df = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)\n",
873
+ "test_df = pd.concat([X_test,y_test],axis = 1).reset_index(drop = True)"
874
+ ]
875
+ },
876
+ {
877
+ "cell_type": "code",
878
+ "execution_count": 92,
879
+ "id": "0006636f-24cf-41dd-98cd-dc3a2b65432f",
880
+ "metadata": {},
881
+ "outputs": [
882
+ {
883
+ "data": {
884
+ "text/html": [
885
+ "<div>\n",
886
+ "<style scoped>\n",
887
+ " .dataframe tbody tr th:only-of-type {\n",
888
+ " vertical-align: middle;\n",
889
+ " }\n",
890
+ "\n",
891
+ " .dataframe tbody tr th {\n",
892
+ " vertical-align: top;\n",
893
+ " }\n",
894
+ "\n",
895
+ " .dataframe thead th {\n",
896
+ " text-align: right;\n",
897
+ " }\n",
898
+ "</style>\n",
899
+ "<table border=\"1\" class=\"dataframe\">\n",
900
+ " <thead>\n",
901
+ " <tr style=\"text-align: right;\">\n",
902
+ " <th></th>\n",
903
+ " <th>Consumer complaint narrative</th>\n",
904
+ " <th>Product</th>\n",
905
+ " <th>Sub-product</th>\n",
906
+ " <th>Issue</th>\n",
907
+ " <th>Sub-issue</th>\n",
908
+ " </tr>\n",
909
+ " </thead>\n",
910
+ " <tbody>\n",
911
+ " <tr>\n",
912
+ " <th>0</th>\n",
913
+ " <td>The credit bureaus keep disrespecting the laws...</td>\n",
914
+ " <td>Credit Reporting</td>\n",
915
+ " <td>Credit reporting</td>\n",
916
+ " <td>Problem with a company's investigation into an...</td>\n",
917
+ " <td>Their investigation did not fix an error on yo...</td>\n",
918
+ " </tr>\n",
919
+ " <tr>\n",
920
+ " <th>1</th>\n",
921
+ " <td>I sent in a complaint in XXXX of 2021 about so...</td>\n",
922
+ " <td>Credit Reporting</td>\n",
923
+ " <td>Credit reporting</td>\n",
924
+ " <td>Incorrect information on your report</td>\n",
925
+ " <td>Information belongs to someone else</td>\n",
926
+ " </tr>\n",
927
+ " <tr>\n",
928
+ " <th>2</th>\n",
929
+ " <td>I ordered a copy of my report and I found out ...</td>\n",
930
+ " <td>Credit Reporting</td>\n",
931
+ " <td>Credit reporting</td>\n",
932
+ " <td>Problem with a credit reporting company's inve...</td>\n",
933
+ " <td>Their investigation did not fix an error on yo...</td>\n",
934
+ " </tr>\n",
935
+ " <tr>\n",
936
+ " <th>3</th>\n",
937
+ " <td>It appears that my credit file has been compro...</td>\n",
938
+ " <td>Credit Reporting</td>\n",
939
+ " <td>Credit reporting</td>\n",
940
+ " <td>Incorrect information on your report</td>\n",
941
+ " <td>Information belongs to someone else</td>\n",
942
+ " </tr>\n",
943
+ " <tr>\n",
944
+ " <th>4</th>\n",
945
+ " <td>I have never authorized, consented to nor bene...</td>\n",
946
+ " <td>Credit Reporting</td>\n",
947
+ " <td>Credit reporting</td>\n",
948
+ " <td>Incorrect information on your report</td>\n",
949
+ " <td>Information belongs to someone else</td>\n",
950
+ " </tr>\n",
951
+ " </tbody>\n",
952
+ "</table>\n",
953
+ "</div>"
954
+ ],
955
+ "text/plain": [
956
+ " Consumer complaint narrative Product \\\n",
957
+ "0 The credit bureaus keep disrespecting the laws... Credit Reporting \n",
958
+ "1 I sent in a complaint in XXXX of 2021 about so... Credit Reporting \n",
959
+ "2 I ordered a copy of my report and I found out ... Credit Reporting \n",
960
+ "3 It appears that my credit file has been compro... Credit Reporting \n",
961
+ "4 I have never authorized, consented to nor bene... Credit Reporting \n",
962
+ "\n",
963
+ " Sub-product Issue \\\n",
964
+ "0 Credit reporting Problem with a company's investigation into an... \n",
965
+ "1 Credit reporting Incorrect information on your report \n",
966
+ "2 Credit reporting Problem with a credit reporting company's inve... \n",
967
+ "3 Credit reporting Incorrect information on your report \n",
968
+ "4 Credit reporting Incorrect information on your report \n",
969
+ "\n",
970
+ " Sub-issue \n",
971
+ "0 Their investigation did not fix an error on yo... \n",
972
+ "1 Information belongs to someone else \n",
973
+ "2 Their investigation did not fix an error on yo... \n",
974
+ "3 Information belongs to someone else \n",
975
+ "4 Information belongs to someone else "
976
+ ]
977
+ },
978
+ "execution_count": 92,
979
+ "metadata": {},
980
+ "output_type": "execute_result"
981
+ }
982
+ ],
983
+ "source": [
984
+ "train_df.head()"
985
+ ]
986
+ },
987
+ {
988
+ "cell_type": "code",
989
+ "execution_count": 94,
990
+ "id": "724b3508-7e79-4526-a20f-3797250f9cf9",
991
+ "metadata": {},
992
+ "outputs": [
993
+ {
994
+ "data": {
995
+ "text/plain": [
996
+ "(185637, 5)"
997
+ ]
998
+ },
999
+ "execution_count": 94,
1000
+ "metadata": {},
1001
+ "output_type": "execute_result"
1002
+ }
1003
+ ],
1004
+ "source": [
1005
+ "train_df.shape"
1006
+ ]
1007
+ },
1008
+ {
1009
+ "cell_type": "code",
1010
+ "execution_count": 95,
1011
+ "id": "06972769-eddd-4ee7-9ebc-e6f587ad5366",
1012
+ "metadata": {},
1013
+ "outputs": [
1014
+ {
1015
+ "data": {
1016
+ "text/plain": [
1017
+ "(61880, 5)"
1018
+ ]
1019
+ },
1020
+ "execution_count": 95,
1021
+ "metadata": {},
1022
+ "output_type": "execute_result"
1023
+ }
1024
+ ],
1025
+ "source": [
1026
+ "test_df.shape"
1027
+ ]
1028
+ },
1029
+ {
1030
+ "cell_type": "code",
1031
+ "execution_count": 99,
1032
+ "id": "de358d80-fd59-4f9c-83ee-2264659f4b0f",
1033
+ "metadata": {},
1034
+ "outputs": [],
1035
+ "source": [
1036
+ "import os\n",
1037
+ "\n",
1038
+ "directory_to_save = './data_splits/'\n",
1039
+ "\n",
1040
+ "if not os.path.exists(directory_to_save):\n",
1041
+ " os.makedirs(directory_to_save)\n",
1042
+ "\n",
1043
+ "train_df.to_csv(directory_to_save + 'train-data-split.csv',index = False)\n",
1044
+ "test_df.to_csv(directory_to_save + 'test-data-split.csv',index = False)"
1045
+ ]
1046
+ }
1047
+ ],
1048
+ "metadata": {
1049
+ "kernelspec": {
1050
+ "display_name": "Python 3 (ipykernel)",
1051
+ "language": "python",
1052
+ "name": "python3"
1053
+ },
1054
+ "language_info": {
1055
+ "codemirror_mode": {
1056
+ "name": "ipython",
1057
+ "version": 3
1058
+ },
1059
+ "file_extension": ".py",
1060
+ "mimetype": "text/x-python",
1061
+ "name": "python",
1062
+ "nbconvert_exporter": "python",
1063
+ "pygments_lexer": "ipython3",
1064
+ "version": "3.10.13"
1065
+ }
1066
+ },
1067
+ "nbformat": 4,
1068
+ "nbformat_minor": 5
1069
+ }
notebooks/.ipynb_checkpoints/Data split-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
notebooks/.ipynb_checkpoints/Issues Preprocessing-checkpoint.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
notebooks/.ipynb_checkpoints/Untitled-checkpoint.ipynb ADDED
@@ -0,0 +1,6 @@
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [],
3
+ "metadata": {},
4
+ "nbformat": 4,
5
+ "nbformat_minor": 5
6
+ }
notebooks/Data preprocessing.ipynb ADDED
@@ -0,0 +1,1102 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "cd6a338a-9a00-45f4-ac13-9ed131c9049e",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Loading data (2023 year) "
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "code",
13
+ "execution_count": 1,
14
+ "id": "2e8de3f1-6812-4c0d-bd56-32459911000e",
15
+ "metadata": {},
16
+ "outputs": [],
17
+ "source": [
18
+ "import numpy as np\n",
19
+ "import pandas as pd\n",
20
+ "import matplotlib.pyplot as plt\n",
21
+ "import seaborn as sns\n",
22
+ "import plotly.express as px"
23
+ ]
24
+ },
25
+ {
26
+ "cell_type": "code",
27
+ "execution_count": 2,
28
+ "id": "ad45c437-7720-445e-8fa1-27d2b14b7bb5",
29
+ "metadata": {},
30
+ "outputs": [
31
+ {
32
+ "name": "stderr",
33
+ "output_type": "stream",
34
+ "text": [
35
+ "/tmp/ipykernel_9929/219708379.py:1: DtypeWarning: Columns (16) have mixed types. Specify dtype option on import or set low_memory=False.\n",
36
+ " df = pd.read_csv('./complaints.csv')\n"
37
+ ]
38
+ }
39
+ ],
40
+ "source": [
41
+ "df = pd.read_csv('./complaints.csv')\n",
42
+ "df['Date received'] = pd.to_datetime(df['Date received'])\n",
43
+ "\n",
44
+ "cols_to_consider = ['Product','Sub-product','Issue','Sub-issue','Consumer complaint narrative','Company public response','Company',\n",
45
+ " 'State', 'ZIP code', 'Date received']\n",
46
+ "df_new = df[cols_to_consider]\n",
47
+ "\n",
48
+ "df_new = df_new.dropna()"
49
+ ]
50
+ },
51
+ {
52
+ "cell_type": "code",
53
+ "execution_count": 3,
54
+ "id": "6df32835-7186-4c57-bffa-536f779636fe",
55
+ "metadata": {},
56
+ "outputs": [],
57
+ "source": [
58
+ "df_2023 = df_new[df_new['Date received'].dt.year.isin([2023])].reset_index(drop=True)\n",
59
+ "\n",
60
+ "product_map = {'Credit reporting or other personal consumer reports' : 'Credit Reporting',\n",
61
+ " 'Credit reporting, credit repair services, or other personal consumer reports' : 'Credit Reporting',\n",
62
+ " 'Payday loan, title loan, personal loan, or advance loan' : 'Loans / Mortgage',\n",
63
+ " 'Payday loan, title loan, or personal loan' : 'Loans / Mortgage',\n",
64
+ " 'Student loan' : 'Loans / Mortgage',\n",
65
+ " 'Vehicle loan or lease' : 'Loans / Mortgage',\n",
66
+ " 'Debt collection' : 'Debt collection',\n",
67
+ " 'Credit card or prepaid card' : 'Credit/Prepaid Card',\n",
68
+ " 'Credit card' : 'Credit/Prepaid Card',\n",
69
+ " 'Prepaid card' : 'Credit/Prepaid Card',\n",
70
+ " 'Mortgage' : 'Loans / Mortgage',\n",
71
+ " 'Checking or savings account' : 'Checking or savings account' \n",
72
+ " }\n",
73
+ "\n",
74
+ "df_2023.loc[:,'Product'] = df_2023['Product'].map(product_map)"
75
+ ]
76
+ },
77
+ {
78
+ "cell_type": "code",
79
+ "execution_count": 4,
80
+ "id": "679ffbe3-a6ba-4f4d-bf65-0690794fb4e1",
81
+ "metadata": {},
82
+ "outputs": [
83
+ {
84
+ "data": {
85
+ "text/html": [
86
+ "<div>\n",
87
+ "<style scoped>\n",
88
+ " .dataframe tbody tr th:only-of-type {\n",
89
+ " vertical-align: middle;\n",
90
+ " }\n",
91
+ "\n",
92
+ " .dataframe tbody tr th {\n",
93
+ " vertical-align: top;\n",
94
+ " }\n",
95
+ "\n",
96
+ " .dataframe thead th {\n",
97
+ " text-align: right;\n",
98
+ " }\n",
99
+ "</style>\n",
100
+ "<table border=\"1\" class=\"dataframe\">\n",
101
+ " <thead>\n",
102
+ " <tr style=\"text-align: right;\">\n",
103
+ " <th></th>\n",
104
+ " <th>Product</th>\n",
105
+ " <th>Sub-product</th>\n",
106
+ " <th>Issue</th>\n",
107
+ " <th>Sub-issue</th>\n",
108
+ " <th>Consumer complaint narrative</th>\n",
109
+ " <th>Company public response</th>\n",
110
+ " <th>Company</th>\n",
111
+ " <th>State</th>\n",
112
+ " <th>ZIP code</th>\n",
113
+ " <th>Date received</th>\n",
114
+ " </tr>\n",
115
+ " </thead>\n",
116
+ " <tbody>\n",
117
+ " <tr>\n",
118
+ " <th>0</th>\n",
119
+ " <td>Checking or savings account</td>\n",
120
+ " <td>Other banking product or service</td>\n",
121
+ " <td>Opening an account</td>\n",
122
+ " <td>Account opened without my consent or knowledge</td>\n",
123
+ " <td>Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX...</td>\n",
124
+ " <td>Company has responded to the consumer and the ...</td>\n",
125
+ " <td>WELLS FARGO &amp; COMPANY</td>\n",
126
+ " <td>NC</td>\n",
127
+ " <td>27513</td>\n",
128
+ " <td>2023-12-29</td>\n",
129
+ " </tr>\n",
130
+ " <tr>\n",
131
+ " <th>1</th>\n",
132
+ " <td>Credit Reporting</td>\n",
133
+ " <td>Credit reporting</td>\n",
134
+ " <td>Problem with a company's investigation into an...</td>\n",
135
+ " <td>Investigation took more than 30 days</td>\n",
136
+ " <td>I have previously disputed this item with you ...</td>\n",
137
+ " <td>Company has responded to the consumer and the ...</td>\n",
138
+ " <td>Experian Information Solutions Inc.</td>\n",
139
+ " <td>MN</td>\n",
140
+ " <td>55124</td>\n",
141
+ " <td>2023-12-29</td>\n",
142
+ " </tr>\n",
143
+ " <tr>\n",
144
+ " <th>2</th>\n",
145
+ " <td>Debt collection</td>\n",
146
+ " <td>Other debt</td>\n",
147
+ " <td>Attempts to collect debt not owed</td>\n",
148
+ " <td>Debt was result of identity theft</td>\n",
149
+ " <td>I kindly request that you update my credit rep...</td>\n",
150
+ " <td>Company has responded to the consumer and the ...</td>\n",
151
+ " <td>Experian Information Solutions Inc.</td>\n",
152
+ " <td>IL</td>\n",
153
+ " <td>60621</td>\n",
154
+ " <td>2023-12-28</td>\n",
155
+ " </tr>\n",
156
+ " <tr>\n",
157
+ " <th>3</th>\n",
158
+ " <td>Debt collection</td>\n",
159
+ " <td>Other debt</td>\n",
160
+ " <td>Attempts to collect debt not owed</td>\n",
161
+ " <td>Debt was result of identity theft</td>\n",
162
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
163
+ " <td>Company has responded to the consumer and the ...</td>\n",
164
+ " <td>Experian Information Solutions Inc.</td>\n",
165
+ " <td>NJ</td>\n",
166
+ " <td>08723</td>\n",
167
+ " <td>2023-12-28</td>\n",
168
+ " </tr>\n",
169
+ " <tr>\n",
170
+ " <th>4</th>\n",
171
+ " <td>Credit Reporting</td>\n",
172
+ " <td>Credit reporting</td>\n",
173
+ " <td>Incorrect information on your report</td>\n",
174
+ " <td>Information belongs to someone else</td>\n",
175
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
176
+ " <td>Company has responded to the consumer and the ...</td>\n",
177
+ " <td>TRANSUNION INTERMEDIATE HOLDINGS, INC.</td>\n",
178
+ " <td>TX</td>\n",
179
+ " <td>77377</td>\n",
180
+ " <td>2023-11-27</td>\n",
181
+ " </tr>\n",
182
+ " </tbody>\n",
183
+ "</table>\n",
184
+ "</div>"
185
+ ],
186
+ "text/plain": [
187
+ " Product Sub-product \\\n",
188
+ "0 Checking or savings account Other banking product or service \n",
189
+ "1 Credit Reporting Credit reporting \n",
190
+ "2 Debt collection Other debt \n",
191
+ "3 Debt collection Other debt \n",
192
+ "4 Credit Reporting Credit reporting \n",
193
+ "\n",
194
+ " Issue \\\n",
195
+ "0 Opening an account \n",
196
+ "1 Problem with a company's investigation into an... \n",
197
+ "2 Attempts to collect debt not owed \n",
198
+ "3 Attempts to collect debt not owed \n",
199
+ "4 Incorrect information on your report \n",
200
+ "\n",
201
+ " Sub-issue \\\n",
202
+ "0 Account opened without my consent or knowledge \n",
203
+ "1 Investigation took more than 30 days \n",
204
+ "2 Debt was result of identity theft \n",
205
+ "3 Debt was result of identity theft \n",
206
+ "4 Information belongs to someone else \n",
207
+ "\n",
208
+ " Consumer complaint narrative \\\n",
209
+ "0 Date : XXXX XXXXo : XXXX XXXX XXXX / XXXX XXXX... \n",
210
+ "1 I have previously disputed this item with you ... \n",
211
+ "2 I kindly request that you update my credit rep... \n",
212
+ "3 I implore you to conduct a comprehensive inves... \n",
213
+ "4 In accordance with the Fair Credit Reporting A... \n",
214
+ "\n",
215
+ " Company public response \\\n",
216
+ "0 Company has responded to the consumer and the ... \n",
217
+ "1 Company has responded to the consumer and the ... \n",
218
+ "2 Company has responded to the consumer and the ... \n",
219
+ "3 Company has responded to the consumer and the ... \n",
220
+ "4 Company has responded to the consumer and the ... \n",
221
+ "\n",
222
+ " Company State ZIP code Date received \n",
223
+ "0 WELLS FARGO & COMPANY NC 27513 2023-12-29 \n",
224
+ "1 Experian Information Solutions Inc. MN 55124 2023-12-29 \n",
225
+ "2 Experian Information Solutions Inc. IL 60621 2023-12-28 \n",
226
+ "3 Experian Information Solutions Inc. NJ 08723 2023-12-28 \n",
227
+ "4 TRANSUNION INTERMEDIATE HOLDINGS, INC. TX 77377 2023-11-27 "
228
+ ]
229
+ },
230
+ "execution_count": 4,
231
+ "metadata": {},
232
+ "output_type": "execute_result"
233
+ }
234
+ ],
235
+ "source": [
236
+ "df_2023.head()"
237
+ ]
238
+ },
239
+ {
240
+ "cell_type": "code",
241
+ "execution_count": 5,
242
+ "id": "a85ec9b1-5de7-47f9-b204-4d42c8880bbb",
243
+ "metadata": {},
244
+ "outputs": [
245
+ {
246
+ "data": {
247
+ "text/plain": [
248
+ "Index(['Product', 'Sub-product', 'Issue', 'Sub-issue',\n",
249
+ " 'Consumer complaint narrative', 'Company public response', 'Company',\n",
250
+ " 'State', 'ZIP code', 'Date received'],\n",
251
+ " dtype='object')"
252
+ ]
253
+ },
254
+ "execution_count": 5,
255
+ "metadata": {},
256
+ "output_type": "execute_result"
257
+ }
258
+ ],
259
+ "source": [
260
+ "df_2023.columns"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "markdown",
265
+ "id": "0487636d-9663-4fdb-b219-f9e6be257b51",
266
+ "metadata": {},
267
+ "source": [
268
+ "### Complaint pre-processing"
269
+ ]
270
+ },
271
+ {
272
+ "cell_type": "code",
273
+ "execution_count": 6,
274
+ "id": "e35208c6-020a-4fb9-8c9f-13fdeee44935",
275
+ "metadata": {},
276
+ "outputs": [],
277
+ "source": [
278
+ "df_2023['complaint length'] = df_2023['Consumer complaint narrative'].apply(lambda x : len(x))"
279
+ ]
280
+ },
281
+ {
282
+ "cell_type": "code",
283
+ "execution_count": 7,
284
+ "id": "63deb9bb-d48a-460b-8edb-f66575ec1eaf",
285
+ "metadata": {},
286
+ "outputs": [],
287
+ "source": [
288
+ "df_2023 = df_2023[df_2023['complaint length'] > 20]\n",
289
+ "\n",
290
+ "complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
291
+ "'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
292
+ "'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
293
+ "'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
294
+ "\n",
295
+ "df_2023 = df_2023[~df_2023['Consumer complaint narrative'].isin(complaints_to_exclude)]"
296
+ ]
297
+ },
298
+ {
299
+ "cell_type": "markdown",
300
+ "id": "492f8261-3e01-41d5-8f24-82bd289ee229",
301
+ "metadata": {},
302
+ "source": [
303
+ "### Categories consideration"
304
+ ]
305
+ },
306
+ {
307
+ "cell_type": "code",
308
+ "execution_count": 8,
309
+ "id": "0be9e1f3-61aa-494a-bd0c-a6afeab5aacd",
310
+ "metadata": {},
311
+ "outputs": [
312
+ {
313
+ "data": {
314
+ "text/plain": [
315
+ "(264968, 5)"
316
+ ]
317
+ },
318
+ "execution_count": 8,
319
+ "metadata": {},
320
+ "output_type": "execute_result"
321
+ }
322
+ ],
323
+ "source": [
324
+ "df_2023_subset = df_2023[['Consumer complaint narrative','Product','Sub-product','Issue','Sub-issue']]\n",
325
+ "df_2023_subset.shape"
326
+ ]
327
+ },
328
+ {
329
+ "cell_type": "code",
330
+ "execution_count": 9,
331
+ "id": "33e4e7e3-6661-48aa-aec2-b706fa64338d",
332
+ "metadata": {},
333
+ "outputs": [
334
+ {
335
+ "data": {
336
+ "text/plain": [
337
+ "Product\n",
338
+ "Credit Reporting 213403\n",
339
+ "Credit/Prepaid Card 16319\n",
340
+ "Checking or savings account 15143\n",
341
+ "Debt collection 11767\n",
342
+ "Loans / Mortgage 8336\n",
343
+ "Name: count, dtype: int64"
344
+ ]
345
+ },
346
+ "execution_count": 9,
347
+ "metadata": {},
348
+ "output_type": "execute_result"
349
+ }
350
+ ],
351
+ "source": [
352
+ "df_2023_subset['Product'].value_counts()"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "code",
357
+ "execution_count": 10,
358
+ "id": "dbc49ba8-f15a-4a4b-b018-d9d2273620ba",
359
+ "metadata": {},
360
+ "outputs": [],
361
+ "source": [
362
+ "sub_issues_to_consider = df_2023_subset['Sub-issue'].value_counts()[df_2023_subset['Sub-issue'].value_counts() > 500].index"
363
+ ]
364
+ },
365
+ {
366
+ "cell_type": "code",
367
+ "execution_count": 11,
368
+ "id": "746db565-e6ff-4ab2-bf92-d56088c0f2da",
369
+ "metadata": {},
370
+ "outputs": [],
371
+ "source": [
372
+ "reduced_subissues = df_2023_subset[df_2023_subset['Sub-issue'].isin(sub_issues_to_consider)]"
373
+ ]
374
+ },
375
+ {
376
+ "cell_type": "code",
377
+ "execution_count": 12,
378
+ "id": "0f786b1d-b139-40b5-ad53-639d8687d3b4",
379
+ "metadata": {},
380
+ "outputs": [
381
+ {
382
+ "data": {
383
+ "text/plain": [
384
+ "(248065, 5)"
385
+ ]
386
+ },
387
+ "execution_count": 12,
388
+ "metadata": {},
389
+ "output_type": "execute_result"
390
+ }
391
+ ],
392
+ "source": [
393
+ "reduced_subissues.shape"
394
+ ]
395
+ },
396
+ {
397
+ "cell_type": "code",
398
+ "execution_count": 13,
399
+ "id": "f64515e8-ac65-4041-a201-a8576a86d7ad",
400
+ "metadata": {},
401
+ "outputs": [
402
+ {
403
+ "data": {
404
+ "text/plain": [
405
+ "Sub-issue\n",
406
+ "Information belongs to someone else 57877\n",
407
+ "Reporting company used your report improperly 48781\n",
408
+ "Their investigation did not fix an error on your report 45407\n",
409
+ "Credit inquiries on your report that you don't recognize 13150\n",
410
+ "Account status incorrect 10271\n",
411
+ "Account information incorrect 9307\n",
412
+ "Was not notified of investigation status or results 9201\n",
413
+ "Investigation took more than 30 days 8937\n",
414
+ "Personal information incorrect 5900\n",
415
+ "Debt is not yours 2821\n",
416
+ "Deposits and withdrawals 2626\n",
417
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
418
+ "Didn't receive enough information to verify debt 1816\n",
419
+ "Debt was result of identity theft 1761\n",
420
+ "Old information reappears or never goes away 1716\n",
421
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1709\n",
422
+ "Company closed your account 1517\n",
423
+ "Problem using a debit or ATM card 1503\n",
424
+ "Public record information inaccurate 1389\n",
425
+ "Transaction was not authorized 1378\n",
426
+ "Problem with personal statement of dispute 1361\n",
427
+ "Other problem getting your report or credit score 1112\n",
428
+ "Debt was paid 969\n",
429
+ "Card was charged for something you did not purchase with the card 964\n",
430
+ "Banking errors 958\n",
431
+ "Funds not handled or disbursed as instructed 955\n",
432
+ "Overdrafts and overdraft fees 951\n",
433
+ "Attempted to collect wrong amount 885\n",
434
+ "Information is missing that should be on the report 881\n",
435
+ "Problem during payment process 840\n",
436
+ "Fee problem 764\n",
437
+ "Problem with fees 749\n",
438
+ "Received bad information about your loan 710\n",
439
+ "Other problem 701\n",
440
+ "Threatened or suggested your credit would be damaged 687\n",
441
+ "Funds not received from closed account 673\n",
442
+ "Trouble with how payments are being handled 650\n",
443
+ "Didn't receive notice of right to dispute 644\n",
444
+ "Can't close your account 598\n",
445
+ "Problem accessing account 561\n",
446
+ "Account opened as a result of fraud 561\n",
447
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
448
+ "Card opened as result of identity theft or fraud 511\n",
449
+ "Billing problem 503\n",
450
+ "Name: count, dtype: int64"
451
+ ]
452
+ },
453
+ "execution_count": 13,
454
+ "metadata": {},
455
+ "output_type": "execute_result"
456
+ }
457
+ ],
458
+ "source": [
459
+ "reduced_subissues['Sub-issue'].value_counts()"
460
+ ]
461
+ },
462
+ {
463
+ "cell_type": "code",
464
+ "execution_count": 14,
465
+ "id": "6204eb53-1a5b-457f-ab67-957d73f568af",
466
+ "metadata": {},
467
+ "outputs": [],
468
+ "source": [
469
+ "sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
470
+ "final_df_2023 = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]"
471
+ ]
472
+ },
473
+ {
474
+ "cell_type": "code",
475
+ "execution_count": 15,
476
+ "id": "781850e8-cd50-4d08-87aa-8d86715cc2ef",
477
+ "metadata": {},
478
+ "outputs": [
479
+ {
480
+ "data": {
481
+ "text/plain": [
482
+ "(247517, 5)"
483
+ ]
484
+ },
485
+ "execution_count": 15,
486
+ "metadata": {},
487
+ "output_type": "execute_result"
488
+ }
489
+ ],
490
+ "source": [
491
+ "final_df_2023.shape"
492
+ ]
493
+ },
494
+ {
495
+ "cell_type": "markdown",
496
+ "id": "563955e5-8b1b-4d67-a552-5d1b69ff8891",
497
+ "metadata": {},
498
+ "source": [
499
+ "### Issue categories grouping"
500
+ ]
501
+ },
502
+ {
503
+ "cell_type": "code",
504
+ "execution_count": 16,
505
+ "id": "8cb41375-d72e-4f90-bde1-6ff13af37082",
506
+ "metadata": {},
507
+ "outputs": [],
508
+ "source": [
509
+ "issues_to_subissues = {}\n",
510
+ "for issue in final_df_2023['Issue'].value_counts().index:\n",
511
+ " issues_to_subissues[issue] = list(final_df_2023[final_df_2023['Issue'] == issue]['Sub-issue'].value_counts().to_dict().keys())\n",
512
+ "\n",
513
+ "one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) == 1}\n",
514
+ "more_than_one_subissue = {key : value for key,value in issues_to_subissues.items() if len(issues_to_subissues[key]) > 1}\n",
515
+ "\n",
516
+ "existing_issue_mapping = {issue : issue for issue in more_than_one_subissue}\n",
517
+ "\n",
518
+ "issue_renaming = {\n",
519
+ " 'Problem with a lender or other company charging your account': 'Account Operations and Unauthorized Transaction Issues',\n",
520
+ " 'Opening an account': 'Account Operations and Unauthorized Transaction Issues',\n",
521
+ " 'Getting a credit card': 'Account Operations and Unauthorized Transaction Issues',\n",
522
+ "\n",
523
+ " 'Unable to get your credit report or credit score': 'Credit Report and Monitoring Issues',\n",
524
+ " 'Credit monitoring or identity theft protection services': 'Credit Report and Monitoring Issues',\n",
525
+ " 'Identity theft protection or other monitoring services': 'Credit Report and Monitoring Issues',\n",
526
+ " \n",
527
+ " 'Problem caused by your funds being low': 'Payment and Funds Management',\n",
528
+ " 'Problem when making payments': 'Payment and Funds Management',\n",
529
+ " 'Managing the loan or lease': 'Payment and Funds Management',\n",
530
+ "\n",
531
+ " 'False statements or representation': 'Disputes and Misrepresentations',\n",
532
+ " 'Fees or interest': 'Disputes and Misrepresentations',\n",
533
+ " 'Other features, terms, or problems': 'Disputes and Misrepresentations',\n",
534
+ "\n",
535
+ " 'Took or threatened to take negative or legal action': 'Legal and Threat Actions'\n",
536
+ "}\n",
537
+ "\n",
538
+ "issues_mapping = {**issue_renaming, **existing_issue_mapping}\n",
539
+ "\n",
540
+ "final_df_2023.loc[:,'Issue'] = final_df_2023['Issue'].apply(lambda x : issues_mapping[x])"
541
+ ]
542
+ },
543
+ {
544
+ "cell_type": "markdown",
545
+ "id": "0ab4f91f-c938-4093-a299-b895ea13121a",
546
+ "metadata": {},
547
+ "source": [
548
+ "### Value counts"
549
+ ]
550
+ },
551
+ {
552
+ "cell_type": "code",
553
+ "execution_count": 17,
554
+ "id": "17ddf55c-f824-4b2c-8059-07d02597a1cb",
555
+ "metadata": {},
556
+ "outputs": [
557
+ {
558
+ "data": {
559
+ "text/plain": [
560
+ "Product\n",
561
+ "Credit Reporting 211695\n",
562
+ "Checking or savings account 12285\n",
563
+ "Credit/Prepaid Card 11975\n",
564
+ "Debt collection 9380\n",
565
+ "Loans / Mortgage 2182\n",
566
+ "Name: count, dtype: int64"
567
+ ]
568
+ },
569
+ "execution_count": 17,
570
+ "metadata": {},
571
+ "output_type": "execute_result"
572
+ }
573
+ ],
574
+ "source": [
575
+ "final_df_2023['Product'].value_counts()"
576
+ ]
577
+ },
578
+ {
579
+ "cell_type": "code",
580
+ "execution_count": 18,
581
+ "id": "eae2d688-f706-4c31-9228-1ae7eadbf228",
582
+ "metadata": {},
583
+ "outputs": [
584
+ {
585
+ "data": {
586
+ "text/plain": [
587
+ "Sub-product\n",
588
+ "Credit reporting 210735\n",
589
+ "General-purpose credit card or charge card 10668\n",
590
+ "Checking account 10409\n",
591
+ "Other debt 3041\n",
592
+ "I do not know 2316\n",
593
+ "Credit card debt 1652\n",
594
+ "Federal student loan servicing 1344\n",
595
+ "Store credit card 1307\n",
596
+ "Medical debt 1053\n",
597
+ "Savings account 989\n",
598
+ "Other personal consumer report 960\n",
599
+ "Loan 732\n",
600
+ "Other banking product or service 725\n",
601
+ "Auto debt 581\n",
602
+ "Telecommunications debt 419\n",
603
+ "Rental debt 179\n",
604
+ "CD (Certificate of Deposit) 162\n",
605
+ "Mortgage debt 139\n",
606
+ "Conventional home mortgage 106\n",
607
+ "Name: count, dtype: int64"
608
+ ]
609
+ },
610
+ "execution_count": 18,
611
+ "metadata": {},
612
+ "output_type": "execute_result"
613
+ }
614
+ ],
615
+ "source": [
616
+ "final_df_2023['Sub-product'].value_counts()"
617
+ ]
618
+ },
619
+ {
620
+ "cell_type": "code",
621
+ "execution_count": 19,
622
+ "id": "61179ec3-f49f-4d2a-adde-738a0ff89371",
623
+ "metadata": {},
624
+ "outputs": [
625
+ {
626
+ "data": {
627
+ "text/plain": [
628
+ "Issue\n",
629
+ "Incorrect information on your report 87200\n",
630
+ "Improper use of your report 61868\n",
631
+ "Problem with a credit reporting company's investigation into an existing problem 45371\n",
632
+ "Problem with a company's investigation into an existing problem 20985\n",
633
+ "Managing an account 7367\n",
634
+ "Attempts to collect debt not owed 5453\n",
635
+ "Problem with a purchase shown on your statement 3253\n",
636
+ "Account Operations and Unauthorized Transaction Issues 2450\n",
637
+ "Written notification about debt 2404\n",
638
+ "Disputes and Misrepresentations 2311\n",
639
+ "Payment and Funds Management 2259\n",
640
+ "Closing an account 1975\n",
641
+ "Credit Report and Monitoring Issues 1630\n",
642
+ "Dealing with your lender or servicer 1293\n",
643
+ "Closing your account 813\n",
644
+ "Legal and Threat Actions 662\n",
645
+ "Problem with a company's investigation into an existing issue 223\n",
646
+ "Name: count, dtype: int64"
647
+ ]
648
+ },
649
+ "execution_count": 19,
650
+ "metadata": {},
651
+ "output_type": "execute_result"
652
+ }
653
+ ],
654
+ "source": [
655
+ "final_df_2023['Issue'].value_counts()"
656
+ ]
657
+ },
658
+ {
659
+ "cell_type": "code",
660
+ "execution_count": 20,
661
+ "id": "928750bb-7324-480f-aaa1-a4438841399c",
662
+ "metadata": {},
663
+ "outputs": [
664
+ {
665
+ "data": {
666
+ "text/plain": [
667
+ "Sub-issue\n",
668
+ "Information belongs to someone else 57850\n",
669
+ "Reporting company used your report improperly 48732\n",
670
+ "Their investigation did not fix an error on your report 45395\n",
671
+ "Credit inquiries on your report that you don't recognize 13136\n",
672
+ "Account status incorrect 10208\n",
673
+ "Account information incorrect 9267\n",
674
+ "Was not notified of investigation status or results 9200\n",
675
+ "Investigation took more than 30 days 8928\n",
676
+ "Personal information incorrect 5900\n",
677
+ "Debt is not yours 2785\n",
678
+ "Deposits and withdrawals 2626\n",
679
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
680
+ "Didn't receive enough information to verify debt 1777\n",
681
+ "Debt was result of identity theft 1727\n",
682
+ "Old information reappears or never goes away 1714\n",
683
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
684
+ "Company closed your account 1517\n",
685
+ "Problem using a debit or ATM card 1503\n",
686
+ "Public record information inaccurate 1384\n",
687
+ "Transaction was not authorized 1378\n",
688
+ "Problem with personal statement of dispute 1352\n",
689
+ "Other problem getting your report or credit score 1109\n",
690
+ "Card was charged for something you did not purchase with the card 964\n",
691
+ "Banking errors 958\n",
692
+ "Funds not handled or disbursed as instructed 955\n",
693
+ "Overdrafts and overdraft fees 951\n",
694
+ "Debt was paid 941\n",
695
+ "Information is missing that should be on the report 877\n",
696
+ "Attempted to collect wrong amount 861\n",
697
+ "Problem during payment process 840\n",
698
+ "Fee problem 764\n",
699
+ "Problem with fees 749\n",
700
+ "Other problem 701\n",
701
+ "Received bad information about your loan 677\n",
702
+ "Funds not received from closed account 673\n",
703
+ "Threatened or suggested your credit would be damaged 662\n",
704
+ "Didn't receive notice of right to dispute 627\n",
705
+ "Trouble with how payments are being handled 616\n",
706
+ "Can't close your account 598\n",
707
+ "Problem accessing account 561\n",
708
+ "Account opened as a result of fraud 561\n",
709
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
710
+ "Card opened as result of identity theft or fraud 511\n",
711
+ "Billing problem 468\n",
712
+ "Name: count, dtype: int64"
713
+ ]
714
+ },
715
+ "execution_count": 20,
716
+ "metadata": {},
717
+ "output_type": "execute_result"
718
+ }
719
+ ],
720
+ "source": [
721
+ "final_df_2023['Sub-issue'].value_counts()"
722
+ ]
723
+ },
724
+ {
725
+ "cell_type": "markdown",
726
+ "id": "fd91e57e-766c-4c4b-92c1-4b61469be9b4",
727
+ "metadata": {},
728
+ "source": [
729
+ "### Unique categories"
730
+ ]
731
+ },
732
+ {
733
+ "cell_type": "code",
734
+ "execution_count": 21,
735
+ "id": "028803cd-86c0-4c8a-9fab-8f05ba6793a1",
736
+ "metadata": {},
737
+ "outputs": [
738
+ {
739
+ "name": "stdout",
740
+ "output_type": "stream",
741
+ "text": [
742
+ "Unique Product offerings: 5\n",
743
+ "Unique Sub-product offerings: 19\n",
744
+ "Unique Issue offerings: 17\n",
745
+ "Unique Sub-issue offerings: 44\n"
746
+ ]
747
+ }
748
+ ],
749
+ "source": [
750
+ "print(f\"Unique Product offerings: {final_df_2023['Product'].nunique()}\")\n",
751
+ "print(f\"Unique Sub-product offerings: {final_df_2023['Sub-product'].nunique()}\")\n",
752
+ "print(f\"Unique Issue offerings: {final_df_2023['Issue'].nunique()}\")\n",
753
+ "print(f\"Unique Sub-issue offerings: {final_df_2023['Sub-issue'].nunique()}\")"
754
+ ]
755
+ },
756
+ {
757
+ "cell_type": "markdown",
758
+ "id": "06ea0454-ed84-450a-90f7-e7552ffc181f",
759
+ "metadata": {},
760
+ "source": [
761
+ "### Preparing the train and test splits"
762
+ ]
763
+ },
764
+ {
765
+ "cell_type": "code",
766
+ "execution_count": 22,
767
+ "id": "267b771c-f944-443a-8048-c2f0097f4f29",
768
+ "metadata": {},
769
+ "outputs": [],
770
+ "source": [
771
+ "from sklearn.model_selection import train_test_split"
772
+ ]
773
+ },
774
+ {
775
+ "cell_type": "code",
776
+ "execution_count": 23,
777
+ "id": "eebed808-66b4-4fa8-a0ce-872b70d18106",
778
+ "metadata": {},
779
+ "outputs": [
780
+ {
781
+ "data": {
782
+ "text/html": [
783
+ "<div>\n",
784
+ "<style scoped>\n",
785
+ " .dataframe tbody tr th:only-of-type {\n",
786
+ " vertical-align: middle;\n",
787
+ " }\n",
788
+ "\n",
789
+ " .dataframe tbody tr th {\n",
790
+ " vertical-align: top;\n",
791
+ " }\n",
792
+ "\n",
793
+ " .dataframe thead th {\n",
794
+ " text-align: right;\n",
795
+ " }\n",
796
+ "</style>\n",
797
+ "<table border=\"1\" class=\"dataframe\">\n",
798
+ " <thead>\n",
799
+ " <tr style=\"text-align: right;\">\n",
800
+ " <th></th>\n",
801
+ " <th>Consumer complaint narrative</th>\n",
802
+ " <th>Product</th>\n",
803
+ " <th>Sub-product</th>\n",
804
+ " <th>Issue</th>\n",
805
+ " <th>Sub-issue</th>\n",
806
+ " </tr>\n",
807
+ " </thead>\n",
808
+ " <tbody>\n",
809
+ " <tr>\n",
810
+ " <th>1</th>\n",
811
+ " <td>I have previously disputed this item with you ...</td>\n",
812
+ " <td>Credit Reporting</td>\n",
813
+ " <td>Credit reporting</td>\n",
814
+ " <td>Problem with a company's investigation into an...</td>\n",
815
+ " <td>Investigation took more than 30 days</td>\n",
816
+ " </tr>\n",
817
+ " <tr>\n",
818
+ " <th>2</th>\n",
819
+ " <td>I kindly request that you update my credit rep...</td>\n",
820
+ " <td>Debt collection</td>\n",
821
+ " <td>Other debt</td>\n",
822
+ " <td>Attempts to collect debt not owed</td>\n",
823
+ " <td>Debt was result of identity theft</td>\n",
824
+ " </tr>\n",
825
+ " <tr>\n",
826
+ " <th>3</th>\n",
827
+ " <td>I implore you to conduct a comprehensive inves...</td>\n",
828
+ " <td>Debt collection</td>\n",
829
+ " <td>Other debt</td>\n",
830
+ " <td>Attempts to collect debt not owed</td>\n",
831
+ " <td>Debt was result of identity theft</td>\n",
832
+ " </tr>\n",
833
+ " <tr>\n",
834
+ " <th>4</th>\n",
835
+ " <td>In accordance with the Fair Credit Reporting A...</td>\n",
836
+ " <td>Credit Reporting</td>\n",
837
+ " <td>Credit reporting</td>\n",
838
+ " <td>Incorrect information on your report</td>\n",
839
+ " <td>Information belongs to someone else</td>\n",
840
+ " </tr>\n",
841
+ " <tr>\n",
842
+ " <th>5</th>\n",
843
+ " <td>In accordance with Fair c=Credit Reporting Act...</td>\n",
844
+ " <td>Credit Reporting</td>\n",
845
+ " <td>Credit reporting</td>\n",
846
+ " <td>Improper use of your report</td>\n",
847
+ " <td>Reporting company used your report improperly</td>\n",
848
+ " </tr>\n",
849
+ " </tbody>\n",
850
+ "</table>\n",
851
+ "</div>"
852
+ ],
853
+ "text/plain": [
854
+ " Consumer complaint narrative Product \\\n",
855
+ "1 I have previously disputed this item with you ... Credit Reporting \n",
856
+ "2 I kindly request that you update my credit rep... Debt collection \n",
857
+ "3 I implore you to conduct a comprehensive inves... Debt collection \n",
858
+ "4 In accordance with the Fair Credit Reporting A... Credit Reporting \n",
859
+ "5 In accordance with Fair c=Credit Reporting Act... Credit Reporting \n",
860
+ "\n",
861
+ " Sub-product Issue \\\n",
862
+ "1 Credit reporting Problem with a company's investigation into an... \n",
863
+ "2 Other debt Attempts to collect debt not owed \n",
864
+ "3 Other debt Attempts to collect debt not owed \n",
865
+ "4 Credit reporting Incorrect information on your report \n",
866
+ "5 Credit reporting Improper use of your report \n",
867
+ "\n",
868
+ " Sub-issue \n",
869
+ "1 Investigation took more than 30 days \n",
870
+ "2 Debt was result of identity theft \n",
871
+ "3 Debt was result of identity theft \n",
872
+ "4 Information belongs to someone else \n",
873
+ "5 Reporting company used your report improperly "
874
+ ]
875
+ },
876
+ "execution_count": 23,
877
+ "metadata": {},
878
+ "output_type": "execute_result"
879
+ }
880
+ ],
881
+ "source": [
882
+ "final_df_2023.head()"
883
+ ]
884
+ },
885
+ {
886
+ "cell_type": "code",
887
+ "execution_count": 24,
888
+ "id": "da025cda-f04e-4822-b100-855e981d632a",
889
+ "metadata": {},
890
+ "outputs": [],
891
+ "source": [
892
+ "X = final_df_2023['Consumer complaint narrative']\n",
893
+ "y = final_df_2023[['Product','Sub-product','Issue','Sub-issue']]\n",
894
+ "\n",
895
+ "X_train, X_test, y_train, y_test = train_test_split(X,y,stratify=y['Product'],test_size=0.25,random_state=42)"
896
+ ]
897
+ },
898
+ {
899
+ "cell_type": "code",
900
+ "execution_count": 25,
901
+ "id": "d291102d-7136-4512-84c2-ba970b169cbf",
902
+ "metadata": {},
903
+ "outputs": [],
904
+ "source": [
905
+ "train_df = pd.concat([X_train,y_train],axis = 1).reset_index(drop = True)\n",
906
+ "test_df = pd.concat([X_test,y_test],axis = 1).reset_index(drop = True)"
907
+ ]
908
+ },
909
+ {
910
+ "cell_type": "code",
911
+ "execution_count": 26,
912
+ "id": "0006636f-24cf-41dd-98cd-dc3a2b65432f",
913
+ "metadata": {},
914
+ "outputs": [
915
+ {
916
+ "data": {
917
+ "text/html": [
918
+ "<div>\n",
919
+ "<style scoped>\n",
920
+ " .dataframe tbody tr th:only-of-type {\n",
921
+ " vertical-align: middle;\n",
922
+ " }\n",
923
+ "\n",
924
+ " .dataframe tbody tr th {\n",
925
+ " vertical-align: top;\n",
926
+ " }\n",
927
+ "\n",
928
+ " .dataframe thead th {\n",
929
+ " text-align: right;\n",
930
+ " }\n",
931
+ "</style>\n",
932
+ "<table border=\"1\" class=\"dataframe\">\n",
933
+ " <thead>\n",
934
+ " <tr style=\"text-align: right;\">\n",
935
+ " <th></th>\n",
936
+ " <th>Consumer complaint narrative</th>\n",
937
+ " <th>Product</th>\n",
938
+ " <th>Sub-product</th>\n",
939
+ " <th>Issue</th>\n",
940
+ " <th>Sub-issue</th>\n",
941
+ " </tr>\n",
942
+ " </thead>\n",
943
+ " <tbody>\n",
944
+ " <tr>\n",
945
+ " <th>0</th>\n",
946
+ " <td>The credit bureaus keep disrespecting the laws...</td>\n",
947
+ " <td>Credit Reporting</td>\n",
948
+ " <td>Credit reporting</td>\n",
949
+ " <td>Problem with a company's investigation into an...</td>\n",
950
+ " <td>Their investigation did not fix an error on yo...</td>\n",
951
+ " </tr>\n",
952
+ " <tr>\n",
953
+ " <th>1</th>\n",
954
+ " <td>I sent in a complaint in XXXX of 2021 about so...</td>\n",
955
+ " <td>Credit Reporting</td>\n",
956
+ " <td>Credit reporting</td>\n",
957
+ " <td>Incorrect information on your report</td>\n",
958
+ " <td>Information belongs to someone else</td>\n",
959
+ " </tr>\n",
960
+ " <tr>\n",
961
+ " <th>2</th>\n",
962
+ " <td>I ordered a copy of my report and I found out ...</td>\n",
963
+ " <td>Credit Reporting</td>\n",
964
+ " <td>Credit reporting</td>\n",
965
+ " <td>Problem with a credit reporting company's inve...</td>\n",
966
+ " <td>Their investigation did not fix an error on yo...</td>\n",
967
+ " </tr>\n",
968
+ " <tr>\n",
969
+ " <th>3</th>\n",
970
+ " <td>It appears that my credit file has been compro...</td>\n",
971
+ " <td>Credit Reporting</td>\n",
972
+ " <td>Credit reporting</td>\n",
973
+ " <td>Incorrect information on your report</td>\n",
974
+ " <td>Information belongs to someone else</td>\n",
975
+ " </tr>\n",
976
+ " <tr>\n",
977
+ " <th>4</th>\n",
978
+ " <td>I have never authorized, consented to nor bene...</td>\n",
979
+ " <td>Credit Reporting</td>\n",
980
+ " <td>Credit reporting</td>\n",
981
+ " <td>Incorrect information on your report</td>\n",
982
+ " <td>Information belongs to someone else</td>\n",
983
+ " </tr>\n",
984
+ " </tbody>\n",
985
+ "</table>\n",
986
+ "</div>"
987
+ ],
988
+ "text/plain": [
989
+ " Consumer complaint narrative Product \\\n",
990
+ "0 The credit bureaus keep disrespecting the laws... Credit Reporting \n",
991
+ "1 I sent in a complaint in XXXX of 2021 about so... Credit Reporting \n",
992
+ "2 I ordered a copy of my report and I found out ... Credit Reporting \n",
993
+ "3 It appears that my credit file has been compro... Credit Reporting \n",
994
+ "4 I have never authorized, consented to nor bene... Credit Reporting \n",
995
+ "\n",
996
+ " Sub-product Issue \\\n",
997
+ "0 Credit reporting Problem with a company's investigation into an... \n",
998
+ "1 Credit reporting Incorrect information on your report \n",
999
+ "2 Credit reporting Problem with a credit reporting company's inve... \n",
1000
+ "3 Credit reporting Incorrect information on your report \n",
1001
+ "4 Credit reporting Incorrect information on your report \n",
1002
+ "\n",
1003
+ " Sub-issue \n",
1004
+ "0 Their investigation did not fix an error on yo... \n",
1005
+ "1 Information belongs to someone else \n",
1006
+ "2 Their investigation did not fix an error on yo... \n",
1007
+ "3 Information belongs to someone else \n",
1008
+ "4 Information belongs to someone else "
1009
+ ]
1010
+ },
1011
+ "execution_count": 26,
1012
+ "metadata": {},
1013
+ "output_type": "execute_result"
1014
+ }
1015
+ ],
1016
+ "source": [
1017
+ "train_df.head()"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": 27,
1023
+ "id": "724b3508-7e79-4526-a20f-3797250f9cf9",
1024
+ "metadata": {},
1025
+ "outputs": [
1026
+ {
1027
+ "data": {
1028
+ "text/plain": [
1029
+ "(185637, 5)"
1030
+ ]
1031
+ },
1032
+ "execution_count": 27,
1033
+ "metadata": {},
1034
+ "output_type": "execute_result"
1035
+ }
1036
+ ],
1037
+ "source": [
1038
+ "train_df.shape"
1039
+ ]
1040
+ },
1041
+ {
1042
+ "cell_type": "code",
1043
+ "execution_count": 28,
1044
+ "id": "06972769-eddd-4ee7-9ebc-e6f587ad5366",
1045
+ "metadata": {},
1046
+ "outputs": [
1047
+ {
1048
+ "data": {
1049
+ "text/plain": [
1050
+ "(61880, 5)"
1051
+ ]
1052
+ },
1053
+ "execution_count": 28,
1054
+ "metadata": {},
1055
+ "output_type": "execute_result"
1056
+ }
1057
+ ],
1058
+ "source": [
1059
+ "test_df.shape"
1060
+ ]
1061
+ },
1062
+ {
1063
+ "cell_type": "code",
1064
+ "execution_count": 29,
1065
+ "id": "de358d80-fd59-4f9c-83ee-2264659f4b0f",
1066
+ "metadata": {},
1067
+ "outputs": [],
1068
+ "source": [
1069
+ "import os\n",
1070
+ "\n",
1071
+ "directory_to_save = './data_splits/'\n",
1072
+ "\n",
1073
+ "if not os.path.exists(directory_to_save):\n",
1074
+ " os.makedirs(directory_to_save)\n",
1075
+ "\n",
1076
+ "train_df.to_csv(directory_to_save + 'train-data-split.csv',index = False)\n",
1077
+ "test_df.to_csv(directory_to_save + 'test-data-split.csv',index = False)"
1078
+ ]
1079
+ }
1080
+ ],
1081
+ "metadata": {
1082
+ "kernelspec": {
1083
+ "display_name": "Python 3 (ipykernel)",
1084
+ "language": "python",
1085
+ "name": "python3"
1086
+ },
1087
+ "language_info": {
1088
+ "codemirror_mode": {
1089
+ "name": "ipython",
1090
+ "version": 3
1091
+ },
1092
+ "file_extension": ".py",
1093
+ "mimetype": "text/x-python",
1094
+ "name": "python",
1095
+ "nbconvert_exporter": "python",
1096
+ "pygments_lexer": "ipython3",
1097
+ "version": "3.9.19"
1098
+ }
1099
+ },
1100
+ "nbformat": 4,
1101
+ "nbformat_minor": 5
1102
+ }
notebooks/Plotting.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
plotting_helpers.py ADDED
@@ -0,0 +1,254 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import pandas as pd
2
+ import plotly.express as px
3
+ import plotly.graph_objects as go
4
+ import seaborn as sns
5
+ import warnings
6
+ warnings.filterwarnings('ignore')
7
+
8
+ # State abbreviation to full name mapping
9
# Two-letter postal code -> full name for the 50 US states plus DC.
# US territories and minor outlying islands are intentionally left out.
state_mapping = {
    'FL': 'Florida',
    'CA': 'California',
    'TX': 'Texas',
    'GA': 'Georgia',
    'NY': 'New York',
    'IL': 'Illinois',
    'PA': 'Pennsylvania',
    'NC': 'North Carolina',
    'NJ': 'New Jersey',
    'MD': 'Maryland',
    'VA': 'Virginia',
    'OH': 'Ohio',
    'MI': 'Michigan',
    'SC': 'South Carolina',
    'AZ': 'Arizona',
    'TN': 'Tennessee',
    'NV': 'Nevada',
    'LA': 'Louisiana',
    'AL': 'Alabama',
    'MO': 'Missouri',
    'MA': 'Massachusetts',
    'IN': 'Indiana',
    'AR': 'Arkansas',
    'WA': 'Washington',
    'CO': 'Colorado',
    'MS': 'Mississippi',
    'CT': 'Connecticut',
    'MN': 'Minnesota',
    'WI': 'Wisconsin',
    'KY': 'Kentucky',
    'UT': 'Utah',
    'DE': 'Delaware',
    'OR': 'Oregon',
    'OK': 'Oklahoma',
    'DC': 'District of Columbia',
    'KS': 'Kansas',
    'IA': 'Iowa',
    'NM': 'New Mexico',
    'NE': 'Nebraska',
    'HI': 'Hawaii',
    'RI': 'Rhode Island',
    'ID': 'Idaho',
    'WV': 'West Virginia',
    'NH': 'New Hampshire',
    'ME': 'Maine',
    'MT': 'Montana',
    'ND': 'North Dakota',
    'AK': 'Alaska',
    'SD': 'South Dakota',
    'WY': 'Wyoming',
    'VT': 'Vermont',
}
25
+
26
+ # Function to plot top n most common categories
27
def plot_top_n(df, column, title, n=5, palette_name=None):
    """Build a horizontal bar chart of the ``n`` most frequent values of a column.

    Args:
        df: DataFrame containing ``column``.
        column: Name of the column whose values are counted.
        title: Accepted from callers but not applied to the figure here.
        n: How many of the most frequent values to keep.
        palette_name: Optional seaborn palette name used to colour the bars;
            when falsy, Plotly's default discrete colours are used.

    Returns:
        The Plotly Express bar figure, with its legend hidden.
    """
    # Plotly needs hex strings, so convert the seaborn palette when one is requested.
    if palette_name:
        bar_colors = sns.color_palette(palette_name, n_colors=n).as_hex()
    else:
        bar_colors = None

    # value_counts() orders by frequency (descending), so the first n rows are the top n.
    frequency_table = df[column].value_counts().reset_index()
    frequency_table.columns = [column, 'Count']
    most_common = frequency_table.head(n)

    # One colour per category; the legend would only repeat the axis labels.
    chart = px.bar(
        most_common,
        x='Count',
        y=column,
        orientation='h',
        color=column,
        color_discrete_sequence=bar_colors,
    )
    chart.update_layout(showlegend=False)
    return chart
41
+
42
+ # 1. Plotting top 5 most common products
43
def plot_top_5_products(df_new):
    """Return the bar chart of the five most frequent ``Product`` values.

    Delegates to ``plot_top_n`` with its default ``n`` and default colours.
    """
    return plot_top_n(df_new, 'Product', 'Top 5 Most Common Products')
47
+
48
+ # 2. Plotting Top 5 common issues
49
def plot_top_5_issues(df_new):
    """Return the bar chart of the five most frequent ``Issue`` values.

    Delegates to ``plot_top_n`` with its default ``n``, coloured with the
    seaborn ``plasma`` palette.
    """
    return plot_top_n(
        df_new,
        'Issue',
        'Top 5 Most Common Issues',
        palette_name='plasma',
    )
53
+
54
+ # 3. Plotting top 5 issues in each product category
55
def plot_top_5_issues_in_product(df_new):
    """Plot, for every product, its five most frequent issues as a bar chart.

    Args:
        df_new: DataFrame with at least the ``Product`` and ``Issue`` columns.

    Returns:
        A Plotly Express bar figure with one bar segment per (product, issue)
        pair, products ordered by their total complaint count (descending),
        legend hidden, sized 900x600.
    """
    # Number of complaints for every (Product, Issue) combination.
    issue_counts = (
        df_new.groupby(['Product', 'Issue'])
        .size()
        .reset_index(name='Count')
    )

    # Per-product totals, largest first; this drives the x-axis order.
    product_totals = (
        issue_counts.groupby('Product')['Count']
        .sum()
        .reset_index(name='TotalIssues')
        .sort_values('TotalIssues', ascending=False)
    )

    def _five_largest(group):
        # Keep only the five most frequent issues of a single product.
        return group.nlargest(5, 'Count')

    # Top-5 issues per product, enriched with the product total so the rows
    # can be ordered by product size first and issue size second.
    top_issues = (
        issue_counts.groupby('Product', as_index=False)
        .apply(_five_largest)
        .reset_index(drop=True)
        .merge(product_totals, on='Product')
        .sort_values(by=['TotalIssues', 'Count'], ascending=[False, False])
    )

    # Pin the product order explicitly so it follows the totals.
    product_order = product_totals['Product'].tolist()
    chart = px.bar(
        top_issues,
        x='Product',
        y='Count',
        color='Issue',
        labels={'Count': 'Number of Complaints'},
        category_orders={'Product': product_order},
    )

    # Hide the legend and fix the figure dimensions.
    chart.update_layout(showlegend=False, width=900, height=600)
    return chart
84
+
85
+ # 4. Companies with the Most Complaints in 2023
86
def plot_top_10_companies_complaints(df_new):
    """Plot the ten companies with the most complaints received in 2023.

    Args:
        df_new: DataFrame with a datetime ``Date received`` column and a
            ``Company`` column.

    Returns:
        A horizontal Plotly Express bar figure (800x500) whose bars are
        coloured by complaint count on a green -> yellow -> red scale, with
        the colour bar hidden.
    """
    # Restrict to complaints received during 2023.
    complaints_2023 = df_new[df_new['Date received'].dt.year == 2023]

    # value_counts() is already descending, so head() gives the ten largest;
    # they are then re-sorted ascending, the order in which the bars are drawn.
    complaints_per_company = complaints_2023['Company'].value_counts()
    how_many = 10
    busiest = complaints_per_company.head(how_many).sort_values(ascending=True)

    # Colour stops: green at the minimum, yellow just above it, red at the maximum.
    severity_scale = [
        (0.0, "green"),
        (0.05, "yellow"),
        (1.0, "red"),
    ]

    chart = px.bar(
        x=busiest.values,
        y=busiest.index,
        orientation='h',
        color=busiest.values,
        color_continuous_scale=severity_scale,
        labels={'x': 'Number of Complaints', 'y': 'Company'},
    )

    chart.update_layout(
        xaxis=dict(title='Number of Complaints'),
        yaxis=dict(tickfont=dict(size=10)),
        height=500,
        width=800,
        # The colour bar is hidden; bar length already conveys the value.
        coloraxis_showscale=False,
    )
    return chart
123
+
124
+ # 5. Top 10 States with the Most Complaints
125
def plot_top_10_states_most_complaints(df_new):
    """Plot the ten states with the highest number of complaints.

    Args:
        df_new: DataFrame with a ``State`` column holding two-letter state
            abbreviations. The frame is only read, never modified.

    Returns:
        A horizontal Plotly Express bar figure (900x500) coloured on the
        ``Turbo`` scale, with the colour bar hidden.
    """
    # Translate abbreviations to full names in a local Series instead of
    # writing a new column into the caller's DataFrame. Codes missing from
    # state_mapping become NaN, which value_counts() leaves out by default.
    state_names = df_new['State'].map(state_mapping).rename('State Name')

    # value_counts() sorts descending, so the first ten rows are the top ten.
    top_n = 10
    top_states = state_names.value_counts().head(top_n)

    fig = px.bar(
        x=top_states.values,
        y=top_states.index,
        orientation='h',
        color=top_states.values,  # Colour each bar by its complaint count
        color_continuous_scale='Turbo',
        labels={'x': 'Number of Complaints', 'y': 'State'},
        # Pin the category order of the state axis to the ranking order.
        category_orders={'y': top_states.index.tolist()}
    )

    fig.update_layout(
        yaxis=dict(
            tickfont=dict(size=10),
        ),
        xaxis=dict(
            tickangle=0,
        ),
        height=500,
        width=900,
    )

    # Hide the colour bar; bar length already conveys the value.
    fig.update_layout(coloraxis_showscale=False)
    return fig
162
+
163
+ # 6. Top 10 States with the Least Complaints
164
def plot_top_10_states_least_complaints(df_new):
    """Plot the ten states with the lowest number of complaints.

    Args:
        df_new: DataFrame with a ``State`` column holding two-letter state
            abbreviations. The frame is only read, never modified.

    Returns:
        A horizontal Plotly Express bar figure (900x500) coloured on the
        ``Temps`` scale, with the colour bar hidden.
    """
    # Translate abbreviations to full names in a local Series instead of
    # writing a new column into the caller's DataFrame. Codes missing from
    # state_mapping become NaN, which value_counts() leaves out by default.
    state_names = df_new['State'].map(state_mapping).rename('State Name')

    # value_counts() sorts descending, so the last ten rows are the ten
    # states with the fewest complaints.
    top_n = 10
    bottom_states = state_names.value_counts().tail(top_n)

    fig = px.bar(
        x=bottom_states.values,
        y=bottom_states.index,
        orientation='h',
        color=bottom_states.values,  # Colour each bar by its complaint count
        color_continuous_scale='Temps',
        labels={'x': 'Number of Complaints', 'y': 'State'},
        # NOTE(review): this is keyed on 'x' (the numeric counts) while the
        # most-complaints chart keys on 'y' (the state names). Kept as-is to
        # preserve the current rendering; confirm which ordering is intended.
        category_orders={'x': bottom_states.index.tolist()}
    )

    fig.update_layout(
        yaxis=dict(
            tickfont=dict(size=10),
        ),
        xaxis=dict(
            tickangle=0,
        ),
        height=500,
        width=900,
    )

    # Hide the colour bar; bar length already conveys the value.
    fig.update_layout(coloraxis_showscale=False)

    return fig
201
+
202
+ # 7. Number of Complaints by Year
203
def complaints_by_year(df_new):
    """Plot the number of complaints per calendar month as a line chart.

    Complaints dated 2024 are excluded (presumably because that year was
    incomplete when this was written; confirm).

    Args:
        df_new: DataFrame with a datetime ``Date received`` column. It is
            not modified.

    Returns:
        A Plotly Express line figure (900x400) with one point per
        ``YYYY-MM`` period.
    """
    # Keep every complaint that was not received in 2024.
    outside_2024 = df_new['Date received'].dt.year != 2024

    # Reduce each remaining date to its 'YYYY-MM' label.
    month_labels = (
        df_new.loc[outside_2024, 'Date received']
        .dt.to_period('M')
        .astype(str)
    )

    # Count complaints per month label.
    per_month = (
        month_labels.to_frame(name='MonthYear')
        .groupby('MonthYear')
        .size()
        .reset_index(name="NumComplaints")
    )

    axis_labels = {'MonthYear': 'Year', 'NumComplaints': 'Number of Complaints'}
    chart = px.line(per_month, x='MonthYear', y='NumComplaints', labels=axis_labels)
    chart.update_layout(width=900, height=400)
    return chart
219
+
220
+ # 8. Number of Complaints by State
221
def complaints_across_states(df_new):
    """Draw a US choropleth of the number of complaints received in 2023.

    Args:
        df_new: DataFrame with a datetime ``Date received`` column and a
            ``State`` column of two-letter abbreviations.

    Returns:
        A Plotly choropleth figure (1000x400) shaded by complaint count on
        the ``Inferno`` scale, with each state's abbreviation drawn on top.
    """
    # Complaints received in 2023, counted per state abbreviation.
    complaints_2023 = df_new[df_new['Date received'].dt.year == 2023]
    per_state = complaints_2023.groupby('State').size().reset_index(name='Num_complaints')

    # Full name for the hover label; unknown codes fall back to the code itself.
    per_state['Full_state_name'] = per_state['State'].apply(
        lambda code: state_mapping.get(code, code)
    )

    chart = px.choropleth(
        per_state,
        locations='State',
        locationmode='USA-states',
        color='Num_complaints',
        color_continuous_scale='Inferno',
        scope="usa",
        hover_name='Full_state_name',
    )

    # Text-only overlay that prints the abbreviation on each state; hover is
    # skipped so the choropleth's own hover label is used.
    chart.add_scattergeo(
        locations=per_state['State'],
        locationmode='USA-states',
        text=per_state['State'],
        mode='text',
        hoverinfo='skip',
        textfont=dict(size=8.5, color='white'),
    )

    # Light grey land on a white background, with room at the top only.
    map_style = dict(
        landcolor='rgb(217, 217, 217)',
        lakecolor='rgb(255, 255, 255)',
        bgcolor='rgb(255, 255, 255)',
    )
    chart.update_layout(
        autosize=True,
        geo=map_style,
        paper_bgcolor='rgb(255, 255, 255)',
        margin={"r": 0, "t": 50, "l": 0, "b": 0},
        width=1000,
        height=400,
    )
    return chart
requirements.txt ADDED
@@ -0,0 +1,10 @@
 
 
 
 
 
 
 
 
 
 
 
1
+ matplotlib==3.8.3
2
+ numpy==1.26.4
3
+ pandas==2.2.2
4
+ plotly==5.20.0
5
+ scikit_learn==1.4.1.post1
6
+ seaborn==0.13.2
7
+ streamlit==1.33.0
8
+ streamlit_option_menu==0.3.12
9
+ torch==2.2.2
10
+ transformers==4.39.3