Mahesh Babu committed on
Commit
26c2106
1 Parent(s): 7467c1e

added modeling files

Browse files
Files changed (31) hide show
  1. subproduct_prediction/.DS_Store +0 -0
  2. subproduct_prediction/.ipynb_checkpoints/Pipeline Modified-checkpoint.ipynb +421 -0
  3. subproduct_prediction/.ipynb_checkpoints/Pipeline-checkpoint.ipynb +192 -0
  4. subproduct_prediction/.ipynb_checkpoints/Sub_Issue-checkpoint.ipynb +1900 -0
  5. subproduct_prediction/.ipynb_checkpoints/Sub_Issues-modified-checkpoint.ipynb +990 -0
  6. subproduct_prediction/Pipeline.ipynb +0 -0
  7. subproduct_prediction/Sub_Issue.ipynb +990 -0
  8. subproduct_prediction/Sub_Product.ipynb +700 -0
  9. subproduct_prediction/issue_models/account_operations_and_unauthorized_transaction_issues.pkl +3 -0
  10. subproduct_prediction/issue_models/attempts_to_collect_debt_not_owed.pkl +3 -0
  11. subproduct_prediction/issue_models/closing_an_account.pkl +3 -0
  12. subproduct_prediction/issue_models/closing_your_account.pkl +3 -0
  13. subproduct_prediction/issue_models/credit_report_and_monitoring_issues.pkl +3 -0
  14. subproduct_prediction/issue_models/dealing_with_your_lender_or_servicer.pkl +3 -0
  15. subproduct_prediction/issue_models/disputes_and_misrepresentations.pkl +3 -0
  16. subproduct_prediction/issue_models/improper_use_of_your_report.pkl +3 -0
  17. subproduct_prediction/issue_models/incorrect_information_on_your_report.pkl +3 -0
  18. subproduct_prediction/issue_models/legal_and_threat_actions.pkl +3 -0
  19. subproduct_prediction/issue_models/managing_an_account.pkl +3 -0
  20. subproduct_prediction/issue_models/payment_and_funds_management.pkl +3 -0
  21. subproduct_prediction/issue_models/problem_with_a_company's_investigation_into_an_existing_issue.pkl +3 -0
  22. subproduct_prediction/issue_models/problem_with_a_company's_investigation_into_an_existing_problem.pkl +3 -0
  23. subproduct_prediction/issue_models/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem.pkl +3 -0
  24. subproduct_prediction/issue_models/problem_with_a_purchase_shown_on_your_statement.pkl +3 -0
  25. subproduct_prediction/issue_models/written_notification_about_debt.pkl +3 -0
  26. subproduct_prediction/models/Checking_saving_model.pkl +3 -0
  27. subproduct_prediction/models/Credit_Prepaid_Card_model.pkl +3 -0
  28. subproduct_prediction/models/Credit_Reporting_model.pkl +3 -0
  29. subproduct_prediction/models/Debt_model.pkl +3 -0
  30. subproduct_prediction/models/Product_model.pkl +3 -0
  31. subproduct_prediction/models/loan_model.pkl +3 -0
subproduct_prediction/.DS_Store ADDED
Binary file (6.15 kB). View file
 
subproduct_prediction/.ipynb_checkpoints/Pipeline Modified-checkpoint.ipynb ADDED
@@ -0,0 +1,421 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "299ffd7f-502b-4183-9536-4e47654baae8",
6
+ "metadata": {
7
+ "jp-MarkdownHeadingCollapsed": true
8
+ },
9
+ "source": [
10
+ "#### Importing the necessary libraries"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 1,
16
+ "id": "e27f22a3-f39e-4007-a048-56ccc9af915e",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "import torch\n",
21
+ "import pickle\n",
22
+ "import pandas as pd\n",
23
+ "from tqdm import tqdm\n",
24
+ "from sklearn.pipeline import Pipeline\n",
25
+ "from transformers import pipeline\n",
26
+ "from sklearn.metrics import accuracy_score, precision_score"
27
+ ]
28
+ },
29
+ {
30
+ "cell_type": "markdown",
31
+ "id": "7b6553e4-339a-4003-b6f9-4aa52d2818c0",
32
+ "metadata": {
33
+ "jp-MarkdownHeadingCollapsed": true
34
+ },
35
+ "source": [
36
+ "#### Loading 5 product models"
37
+ ]
38
+ },
39
+ {
40
+ "cell_type": "code",
41
+ "execution_count": 2,
42
+ "id": "bd40a9e0-faab-4999-9ad5-f74e7ae8b272",
43
+ "metadata": {},
44
+ "outputs": [],
45
+ "source": [
46
+ "with open('models/Credit_Reporting_model.pkl', 'rb') as f:\n",
47
+ " trained_model_cr= pickle.load(f)\n",
48
+ "\n",
49
+ "with open('models/Credit_Prepaid_Card_model.pkl', 'rb') as f:\n",
50
+ " trained_model_cp= pickle.load(f)\n",
51
+ "\n",
52
+ "with open('models/Checking_saving_model.pkl', 'rb') as f:\n",
53
+ " trained_model_cs=pickle.load(f)\n",
54
+ "\n",
55
+ "with open('models/loan_model.pkl', 'rb') as f:\n",
56
+ " trained_model_l= pickle.load(f)\n",
57
+ "\n",
58
+ "with open('models/Debt_model.pkl', 'rb') as f:\n",
59
+ " trained_model_d= pickle.load(f)"
60
+ ]
61
+ },
62
+ {
63
+ "cell_type": "markdown",
64
+ "id": "8dd19c5a-5e4f-457c-88b7-5efa18964a8b",
65
+ "metadata": {
66
+ "jp-MarkdownHeadingCollapsed": true
67
+ },
68
+ "source": [
69
+ "#### Loading 17 issue models"
70
+ ]
71
+ },
72
+ {
73
+ "cell_type": "code",
74
+ "execution_count": 3,
75
+ "id": "3dae2131-cfa4-4887-a30a-00d6caf547e8",
76
+ "metadata": {},
77
+ "outputs": [],
78
+ "source": [
79
+ "# Path to the models and their corresponding names\n",
80
+ "issue_model_files = {\n",
81
+ " 'trained_model_account_operations': 'issue_models/account_operations_and_unauthorized_transaction_issues.pkl',\n",
82
+ " 'trained_model_collect_debt': 'issue_models/attempts_to_collect_debt_not_owed.pkl',\n",
83
+ " 'trained_model_closing_account': 'issue_models/closing_an_account.pkl',\n",
84
+ " 'trained_model_closing_your_account': 'issue_models/closing_your_account.pkl',\n",
85
+ " 'trained_model_credit_report': 'issue_models/credit_report_and_monitoring_issues.pkl',\n",
86
+ " 'trained_model_lender': 'issue_models/dealing_with_your_lender_or_servicer.pkl',\n",
87
+ " 'trained_model_disputes': 'issue_models/disputes_and_misrepresentations.pkl',\n",
88
+ " 'trained_model_improper_use_report': 'issue_models/improper_use_of_your_report.pkl',\n",
89
+ " 'trained_model_incorrect_info': 'issue_models/incorrect_information_on_your_report.pkl',\n",
90
+ " 'trained_model_legal_and_threat': 'issue_models/legal_and_threat_actions.pkl',\n",
91
+ " 'trained_model_managing_account': 'issue_models/managing_an_account.pkl',\n",
92
+ " 'trained_model_payment_funds': 'issue_models/payment_and_funds_management.pkl',\n",
93
+ " 'trained_model_investigation_wrt_issue': 'issue_models/problem_with_a_company\\'s_investigation_into_an_existing_issue.pkl',\n",
94
+ " 'trained_model_investigation_wrt_problem': 'issue_models/problem_with_a_company\\'s_investigation_into_an_existing_problem.pkl',\n",
95
+ " 'trained_model_credit_investigation_wrt_problem': 'issue_models/problem_with_a_credit_reporting_company\\'s_investigation_into_an_existing_problem.pkl',\n",
96
+ " 'trained_model_purchase_shown': 'issue_models/problem_with_a_purchase_shown_on_your_statement.pkl',\n",
97
+ " 'trained_model_notification_about_debt': 'issue_models/written_notification_about_debt.pkl',\n",
98
+ "}\n",
99
+ "\n",
100
+ "issue_models = {}\n",
101
+ "\n",
102
+ "for model_name, file_path in issue_model_files.items():\n",
103
+ " with open(file_path, 'rb') as f:\n",
104
+ " issue_models[model_name] = pickle.load(f)"
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "markdown",
109
+ "id": "bf41b143-2ff3-4a79-83a9-afcc0d352dd0",
110
+ "metadata": {
111
+ "jp-MarkdownHeadingCollapsed": true
112
+ },
113
+ "source": [
114
+ "#### LLM to classify the product based on the narrative"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 4,
120
+ "id": "b946427b-b259-4eb2-a40b-ed7b7e476354",
121
+ "metadata": {},
122
+ "outputs": [],
123
+ "source": [
124
+ "device = \"mps\" if torch.backends.mps.is_available() else \"cpu\"\n",
125
+ "\n",
126
+ "# Define the pipeline for classifying product\n",
127
+ "product_classifier = pipeline(\"text-classification\", model=\"Mahesh9/distil-bert-fintuned-product-cfpb-complaints\",\n",
128
+ " max_length = 512, truncation = True, device = device)"
129
+ ]
130
+ },
131
+ {
132
+ "cell_type": "markdown",
133
+ "id": "0f0c40cd-f23e-4e0a-8c03-34b517a4c727",
134
+ "metadata": {
135
+ "jp-MarkdownHeadingCollapsed": true
136
+ },
137
+ "source": [
138
+ "#### Function to choose the appropriate product model to classify the sub-product"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 5,
144
+ "id": "619d9c58-1a83-4279-b452-63f3cb69998f",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "# Define a function to select the appropriate subproduct prediction model based on the predicted product\n",
149
+ "def select_subproduct_model(predicted_product):\n",
150
+ " if predicted_product == 'Credit Reporting' :\n",
151
+ " return trained_model_cr\n",
152
+ " elif predicted_product == 'Credit/Prepaid Card':\n",
153
+ " return trained_model_cp\n",
154
+ " elif predicted_product == 'Checking or savings account':\n",
155
+ " return trained_model_cs\n",
156
+ " elif predicted_product == 'Loans / Mortgage':\n",
157
+ " return trained_model_l\n",
158
+ " elif predicted_product == 'Debt collection':\n",
159
+ " return trained_model_d\n",
160
+ " else:\n",
161
+ " raise ValueError(\"Invalid predicted product category\")"
162
+ ]
163
+ },
164
+ {
165
+ "cell_type": "markdown",
166
+ "id": "2f361468-ab6d-4d9a-a665-2c9dbce42e93",
167
+ "metadata": {
168
+ "jp-MarkdownHeadingCollapsed": true
169
+ },
170
+ "source": [
171
+ "#### LLM to classify the issue based on the narrative"
172
+ ]
173
+ },
174
+ {
175
+ "cell_type": "code",
176
+ "execution_count": 6,
177
+ "id": "0a8da273-8dfb-43b8-abf9-cf06871f2763",
178
+ "metadata": {},
179
+ "outputs": [],
180
+ "source": [
181
+ "# Define the pipeline for classifying issue\n",
182
+ "issue_classifier = pipeline(\"text-classification\", model=\"Mahesh9/distil-bert-fintuned-issues-cfpb-complaints\",\n",
183
+ " max_length = 512, truncation = True, device = device)"
184
+ ]
185
+ },
186
+ {
187
+ "cell_type": "markdown",
188
+ "id": "df05c0c0-c4cc-4287-b129-75f60dd88348",
189
+ "metadata": {
190
+ "jp-MarkdownHeadingCollapsed": true
191
+ },
192
+ "source": [
193
+ "#### Function to choose the appropriate issue model to classify the sub-issue"
194
+ ]
195
+ },
196
+ {
197
+ "cell_type": "code",
198
+ "execution_count": 7,
199
+ "id": "f55a787b-ce6a-49dd-96dd-1cbfda8a68a5",
200
+ "metadata": {},
201
+ "outputs": [],
202
+ "source": [
203
+ "# Define a function to select the appropriate subissue prediction model based on the predicted issue\n",
204
+ "def select_subissue_model(predicted_issue):\n",
205
+ " if predicted_issue == \"Problem with a company's investigation into an existing problem\":\n",
206
+ " return issue_models['trained_model_investigation_wrt_problem']\n",
207
+ " \n",
208
+ " elif predicted_issue == \"Problem with a credit reporting company's investigation into an existing problem\":\n",
209
+ " return issue_models['trained_model_credit_investigation_wrt_problem']\n",
210
+ "\n",
211
+ " elif predicted_issue == \"Problem with a company's investigation into an existing issue\":\n",
212
+ " return issue_models['trained_model_investigation_wrt_issue']\n",
213
+ "\n",
214
+ " elif predicted_issue == \"Problem with a purchase shown on your statement\":\n",
215
+ " return issue_models['trained_model_purchase_shown']\n",
216
+ "\n",
217
+ " elif predicted_issue == \"Incorrect information on your report\":\n",
218
+ " return issue_models['trained_model_incorrect_info']\n",
219
+ " \n",
220
+ " elif predicted_issue == \"Improper use of your report\":\n",
221
+ " return issue_models['trained_model_improper_use_report']\n",
222
+ "\n",
223
+ " elif predicted_issue == \"Account Operations and Unauthorized Transaction Issues\":\n",
224
+ " return issue_models['trained_model_account_operations']\n",
225
+ " \n",
226
+ " elif predicted_issue == \"Payment and Funds Management\":\n",
227
+ " return issue_models['trained_model_payment_funds']\n",
228
+ "\n",
229
+ " elif predicted_issue == \"Managing an account\":\n",
230
+ " return issue_models['trained_model_managing_account']\n",
231
+ " \n",
232
+ " elif predicted_issue == \"Attempts to collect debt not owed\":\n",
233
+ " return issue_models['trained_model_collect_debt']\n",
234
+ "\n",
235
+ " elif predicted_issue == \"Written notification about debt\":\n",
236
+ " return issue_models['trained_model_notification_about_debt']\n",
237
+ " \n",
238
+ " elif predicted_issue == \"Dealing with your lender or servicer\":\n",
239
+ " return issue_models['trained_model_lender']\n",
240
+ "\n",
241
+ " elif predicted_issue == \"Disputes and Misrepresentations\":\n",
242
+ " return issue_models['trained_model_disputes']\n",
243
+ " \n",
244
+ " elif predicted_issue == \"Closing your account\":\n",
245
+ " return issue_models['trained_model_closing_your_account']\n",
246
+ "\n",
247
+ " elif predicted_issue == \"Closing an account\":\n",
248
+ " return issue_models['trained_model_closing_account']\n",
249
+ " \n",
250
+ " elif predicted_issue == \"Credit Report and Monitoring Issues\":\n",
251
+ " return issue_models['trained_model_credit_report']\n",
252
+ "\n",
253
+ " elif predicted_issue == \"Legal and Threat Actions\":\n",
254
+ " return issue_models['trained_model_legal_and_threat']\n",
255
+ " \n",
256
+ " else:\n",
257
+ " raise ValueError(\"Invalid predicted issue category\")"
258
+ ]
259
+ },
260
+ {
261
+ "cell_type": "markdown",
262
+ "id": "d87974e1-1bf8-44ea-bfee-75de8e2960b4",
263
+ "metadata": {
264
+ "jp-MarkdownHeadingCollapsed": true
265
+ },
266
+ "source": [
267
+ "#### Driver code to classify the complaint into various categories"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 8,
273
+ "id": "dc785511-d68f-4341-a080-23f8f27eefc4",
274
+ "metadata": {},
275
+ "outputs": [],
276
+ "source": [
277
+ "def classify_complaint(narrative):\n",
278
+ " # Predict product category\n",
279
+ " predicted_product = product_classifier(narrative)[0]['label']\n",
280
+ " \n",
281
+ " # Load the appropriate subproduct prediction model\n",
282
+ " subproduct_model = select_subproduct_model(predicted_product)\n",
283
+ " # Predict subproduct category using the selected model\n",
284
+ " predicted_subproduct = subproduct_model.predict([narrative])[0]\n",
285
+ "\n",
286
+ "\n",
287
+ " \n",
288
+ " # Predict the appropriate issue category using the narrative\n",
289
+ " predicted_issue = issue_classifier(narrative)[0]['label']\n",
290
+ " \n",
291
+ " # Load the appropriate subissue prediction model\n",
292
+ " subissue_model = select_subissue_model(predicted_issue)\n",
293
+ " # Predict subissue category using the selected model\n",
294
+ " predicted_subissue = subissue_model.predict([narrative])[0]\n",
295
+ " \n",
296
+ " return {\n",
297
+ " \"Product\" : predicted_product,\n",
298
+ " \"Sub-product\" : predicted_subproduct,\n",
299
+ " \"Issue\" : predicted_issue,\n",
300
+ " \"Sub-issue\" : predicted_subissue\n",
301
+ " }"
302
+ ]
303
+ },
304
+ {
305
+ "cell_type": "code",
306
+ "execution_count": 9,
307
+ "id": "982521ea-364e-4521-889e-fe586c186701",
308
+ "metadata": {},
309
+ "outputs": [
310
+ {
311
+ "data": {
312
+ "text/plain": [
313
+ "{'Product': 'Credit/Prepaid Card',\n",
314
+ " 'Sub-product': 'General-purpose credit card or charge card',\n",
315
+ " 'Issue': \"Problem with a company's investigation into an existing problem\",\n",
316
+ " 'Sub-issue': 'Was not notified of investigation status or results'}"
317
+ ]
318
+ },
319
+ "execution_count": 9,
320
+ "metadata": {},
321
+ "output_type": "execute_result"
322
+ }
323
+ ],
324
+ "source": [
325
+ "narrative = \"\"\"It is absurd that I have consistently made timely payments for this account and have never been\n",
326
+ " overdue. I kindly request that you promptly update my account to reflect this accurately.\"\"\"\n",
327
+ "\n",
328
+ "classify_complaint(narrative)"
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "markdown",
333
+ "id": "6a68ebbc-de80-4176-ac38-bfe5fd84b86c",
334
+ "metadata": {
335
+ "jp-MarkdownHeadingCollapsed": true
336
+ },
337
+ "source": [
338
+ "#### Evaluation on external test set"
339
+ ]
340
+ },
341
+ {
342
+ "cell_type": "code",
343
+ "execution_count": null,
344
+ "id": "88529ef1-6ed2-41b9-a266-e550a50b831f",
345
+ "metadata": {},
346
+ "outputs": [],
347
+ "source": [
348
+ "# Load the test dataset\n",
349
+ "test_data = pd.read_csv('../data_splits/test-data-split.csv') \n",
350
+ "\n",
351
+ "# Initialize lists to store predicted and actual labels\n",
352
+ "predicted_products = []\n",
353
+ "predicted_subproducts = []\n",
354
+ "predicted_issues = []\n",
355
+ "predicted_subissues = []\n",
356
+ "\n",
357
+ "actual_products = test_data['Product']\n",
358
+ "actual_subproducts = test_data['Sub-product']\n",
359
+ "actual_issues = test_data['Issue']\n",
360
+ "actual_subissues = test_data['Sub-issue']\n",
361
+ "\n",
362
+ "# Iterate over each complaint narrative in the test set\n",
363
+ "for narrative in tqdm(test_data['Consumer complaint narrative']):\n",
364
+ " # Predict product, sub-product, issue and sub-issue using classify_complaint\n",
365
+ " prediction = classify_complaint(narrative)\n",
366
+ " \n",
367
+ " # Append predicted labels to lists\n",
368
+ " predicted_products.append(prediction['Product'])\n",
369
+ " predicted_subproducts.append(prediction['Sub-product'])\n",
370
+ " predicted_issues.append(prediction['Issue'])\n",
371
+ " predicted_subissues.append(prediction['Sub-issue'])\n",
372
+ " \n",
373
+ "# Calculate accuracy and precision\n",
374
+ "accuracy_product = accuracy_score(actual_products, predicted_products)\n",
375
+ "precision_product = precision_score(actual_products, predicted_products, average='macro',zero_division=1)\n",
376
+ "accuracy_subproduct = accuracy_score(actual_subproducts, predicted_subproducts)\n",
377
+ "precision_subproduct = precision_score(actual_subproducts, predicted_subproducts, average='macro',zero_division=1)\n",
378
+ "\n",
379
+ "accuracy_issue = accuracy_score(actual_issues, predicted_issues)  # own names: keep the product metrics above intact\n",
380
+ "precision_issue = precision_score(actual_issues, predicted_issues, average='macro',zero_division=1)\n",
381
+ "accuracy_subissue = accuracy_score(actual_subissues, predicted_subissues)\n",
382
+ "precision_subissue = precision_score(actual_subissues, predicted_subissues, average='macro',zero_division=1)\n",
383
+ "\n",
384
+ "\n",
385
+ "# Print the results\n",
386
+ "print(\"Product Prediction Accuracy:\", accuracy_product)\n",
387
+ "print(\"Product Prediction Precision:\", precision_product)\n",
388
+ "\n",
389
+ "print(\"Subproduct Prediction Accuracy:\", accuracy_subproduct)\n",
390
+ "print(\"Subproduct Prediction Precision:\", precision_subproduct)\n",
391
+ "\n",
392
+ "print(\"Issue Prediction Accuracy:\", accuracy_issue)\n",
393
+ "print(\"Issue Prediction Precision:\", precision_issue)\n",
394
+ "\n",
395
+ "print(\"Sub-issue Prediction Accuracy:\", accuracy_subissue)\n",
396
+ "print(\"Sub-issue Prediction Precision:\", precision_subissue)"
397
+ ]
398
+ }
399
+ ],
400
+ "metadata": {
401
+ "kernelspec": {
402
+ "display_name": "Python 3 (ipykernel)",
403
+ "language": "python",
404
+ "name": "python3"
405
+ },
406
+ "language_info": {
407
+ "codemirror_mode": {
408
+ "name": "ipython",
409
+ "version": 3
410
+ },
411
+ "file_extension": ".py",
412
+ "mimetype": "text/x-python",
413
+ "name": "python",
414
+ "nbconvert_exporter": "python",
415
+ "pygments_lexer": "ipython3",
416
+ "version": "3.9.19"
417
+ }
418
+ },
419
+ "nbformat": 4,
420
+ "nbformat_minor": 5
421
+ }
subproduct_prediction/.ipynb_checkpoints/Pipeline-checkpoint.ipynb ADDED
@@ -0,0 +1,192 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "e27f22a3-f39e-4007-a048-56ccc9af915e",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import pickle"
11
+ ]
12
+ },
13
+ {
14
+ "cell_type": "code",
15
+ "execution_count": 2,
16
+ "id": "bd40a9e0-faab-4999-9ad5-f74e7ae8b272",
17
+ "metadata": {},
18
+ "outputs": [],
19
+ "source": [
20
+ "with open('models/Credit_Reporting_model.pkl', 'rb') as f:\n",
21
+ " trained_model_cr= pickle.load(f)\n",
22
+ "\n",
23
+ "with open('models/Credit_Prepaid_Card_model.pkl', 'rb') as f:\n",
24
+ " trained_model_cp= pickle.load(f)\n",
25
+ "\n",
26
+ "with open('models/Checking_saving_model.pkl', 'rb') as f:\n",
27
+ " trained_model_cs=pickle.load(f)\n",
28
+ "\n",
29
+ "with open('models/loan_model.pkl', 'rb') as f:\n",
30
+ " trained_model_l= pickle.load(f)\n",
31
+ "\n",
32
+ "with open('models/Debt_model.pkl', 'rb') as f:\n",
33
+ " trained_model_d= pickle.load(f)"
34
+ ]
35
+ },
36
+ {
37
+ "cell_type": "code",
38
+ "execution_count": 3,
39
+ "id": "d1ad5fb9-36bf-4637-a137-17fca19224f6",
40
+ "metadata": {},
41
+ "outputs": [],
42
+ "source": [
43
+ "with open('models/Product_model.pkl', 'rb') as f:\n",
44
+ " product_model= pickle.load(f)"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": 9,
50
+ "id": "b946427b-b259-4eb2-a40b-ed7b7e476354",
51
+ "metadata": {},
52
+ "outputs": [],
53
+ "source": [
54
+ "from sklearn.pipeline import Pipeline\n",
55
+ "\n",
56
+ "# Define the pipeline steps\n",
57
+ "trained_product_model=product_model\n",
58
+ "\n",
59
+ "\n",
60
+ "# Define a function to select the appropriate subproduct prediction model based on the predicted product\n",
61
+ "def select_subproduct_model(predicted_product):\n",
62
+ " if predicted_product == 'Credit Reporting' :\n",
63
+ " return trained_model_cr\n",
64
+ " elif predicted_product == 'Credit/Prepaid Card':\n",
65
+ " return trained_model_cp\n",
66
+ " elif predicted_product == 'Checking or savings account':\n",
67
+ " return trained_model_cs\n",
68
+ " elif predicted_product == 'Loans / Mortgage':\n",
69
+ " return trained_model_l\n",
70
+ " elif predicted_product == 'Debt collection':\n",
71
+ " return trained_model_d\n",
72
+ " else:\n",
73
+ " raise ValueError(\"Invalid predicted product category\")\n",
74
+ "\n",
75
+ "def custom_predict(narrative):\n",
76
+ " # Predict product category\n",
77
+ " predicted_product = product_model.predict([narrative])[0]\n",
78
+ " \n",
79
+ " # Load the appropriate subproduct prediction model\n",
80
+ " subproduct_model = select_subproduct_model(predicted_product)\n",
81
+ " \n",
82
+ " # Predict subproduct category using the selected model\n",
83
+ " predicted_subproduct = subproduct_model.predict([narrative])\n",
84
+ " return predicted_product, predicted_subproduct"
85
+ ]
86
+ },
87
+ {
88
+ "cell_type": "code",
89
+ "execution_count": 11,
90
+ "id": "982521ea-364e-4521-889e-fe586c186701",
91
+ "metadata": {},
92
+ "outputs": [
93
+ {
94
+ "name": "stdout",
95
+ "output_type": "stream",
96
+ "text": [
97
+ "Predicted product: Credit/Prepaid Card\n",
98
+ "Predicted subproduct: ['Checking account']\n"
99
+ ]
100
+ }
101
+ ],
102
+ "source": [
103
+ "narrative = \"I have a problem with my credit card bill.\"\n",
104
+ "#narrative = \"it is absurd that i have consistently made timely payments for this account and have never been overdue. i kindly request that you promptly update my account to reflect this accurately.\"\n",
105
+ "predicted_product, predicted_subproduct = custom_predict(narrative)\n",
106
+ "print(\"Predicted product:\", predicted_product)\n",
107
+ "print(\"Predicted subproduct:\", predicted_subproduct)"
108
+ ]
109
+ },
110
+ {
111
+ "cell_type": "code",
112
+ "execution_count": 7,
113
+ "id": "88529ef1-6ed2-41b9-a266-e550a50b831f",
114
+ "metadata": {},
115
+ "outputs": [
116
+ {
117
+ "name": "stdout",
118
+ "output_type": "stream",
119
+ "text": [
120
+ "Product Prediction Accuracy: 0.9110859728506787\n",
121
+ "Product Prediction Precision: 0.6634108079865927\n",
122
+ "Subproduct Prediction Accuracy: 0.8377989657401422\n",
123
+ "Subproduct Prediction Precision: 0.5058767033148038\n"
124
+ ]
125
+ }
126
+ ],
127
+ "source": [
128
+ "from sklearn.metrics import accuracy_score, precision_score\n",
129
+ "import pandas as pd\n",
130
+ "\n",
131
+ "# Load the test dataset\n",
132
+ "test_data = pd.read_csv('../data_splits/test-data-split.csv') \n",
133
+ "\n",
134
+ "# Initialize lists to store predicted and actual labels\n",
135
+ "predicted_products = []\n",
136
+ "predicted_subproducts = []\n",
137
+ "actual_products = test_data['Product']\n",
138
+ "actual_subproducts = test_data['Sub-product']\n",
139
+ "\n",
140
+ "# Iterate over each complaint narrative in the test set\n",
141
+ "for narrative in test_data['Consumer complaint narrative']:\n",
142
+ " # Predict product and subproduct using the custom_predict function\n",
143
+ " predicted_product, predicted_subproduct = custom_predict(narrative)\n",
144
+ " \n",
145
+ " # Append predicted labels to lists\n",
146
+ " predicted_products.append(predicted_product)\n",
147
+ " predicted_subproducts.append(predicted_subproduct)\n",
148
+ "\n",
149
+ "# Calculate accuracy and precision\n",
150
+ "accuracy_product = accuracy_score(actual_products, predicted_products)\n",
151
+ "precision_product = precision_score(actual_products, predicted_products, average='macro',zero_division=1)\n",
152
+ "accuracy_subproduct = accuracy_score(actual_subproducts, predicted_subproducts)\n",
153
+ "precision_subproduct = precision_score(actual_subproducts, predicted_subproducts, average='macro',zero_division=1)\n",
154
+ "\n",
155
+ "# Print the results\n",
156
+ "print(\"Product Prediction Accuracy:\", accuracy_product)\n",
157
+ "print(\"Product Prediction Precision:\", precision_product)\n",
158
+ "print(\"Subproduct Prediction Accuracy:\", accuracy_subproduct)\n",
159
+ "print(\"Subproduct Prediction Precision:\", precision_subproduct)\n"
160
+ ]
161
+ },
162
+ {
163
+ "cell_type": "code",
164
+ "execution_count": null,
165
+ "id": "ce982e0a",
166
+ "metadata": {},
167
+ "outputs": [],
168
+ "source": []
169
+ }
170
+ ],
171
+ "metadata": {
172
+ "kernelspec": {
173
+ "display_name": "Python 3 (ipykernel)",
174
+ "language": "python",
175
+ "name": "python3"
176
+ },
177
+ "language_info": {
178
+ "codemirror_mode": {
179
+ "name": "ipython",
180
+ "version": 3
181
+ },
182
+ "file_extension": ".py",
183
+ "mimetype": "text/x-python",
184
+ "name": "python",
185
+ "nbconvert_exporter": "python",
186
+ "pygments_lexer": "ipython3",
187
+ "version": "3.9.19"
188
+ }
189
+ },
190
+ "nbformat": 4,
191
+ "nbformat_minor": 5
192
+ }
subproduct_prediction/.ipynb_checkpoints/Sub_Issue-checkpoint.ipynb ADDED
@@ -0,0 +1,1900 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a751d479-1500-41e2-8c01-252e849dad05",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import warnings\n",
11
+ "warnings.filterwarnings(\"ignore\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt \n",
23
+ "from sklearn.model_selection import train_test_split\n",
24
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
25
+ "from sklearn.pipeline import make_pipeline\n",
26
+ "from sklearn.linear_model import LogisticRegression\n",
27
+ "from sklearn.naive_bayes import MultinomialNB\n",
28
+ "from sklearn.svm import SVC\n",
29
+ "from sklearn.ensemble import RandomForestClassifier\n",
30
+ "from sklearn.metrics import classification_report,accuracy_score\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.ensemble import RandomForestClassifier\n",
33
+ "from sklearn.preprocessing import OneHotEncoder\n",
34
+ "from sklearn.compose import ColumnTransformer\n",
35
+ "from sklearn.pipeline import Pipeline\n",
36
+ "from sklearn.pipeline import Pipeline\n",
37
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
38
+ "from sklearn.ensemble import RandomForestClassifier\n",
39
+ "from sklearn.model_selection import train_test_split\n",
40
+ "from sklearn.metrics import classification_report, accuracy_score\n",
41
+ "from sklearn.utils.class_weight import compute_class_weight\n",
42
+ "import pickle"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20",
48
+ "metadata": {},
49
+ "source": [
50
+ "## Sub Issues"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Model"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from sklearn.pipeline import Pipeline\n",
69
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
70
+ "from sklearn.metrics import accuracy_score, classification_report\n",
71
+ "from sklearn.utils.class_weight import compute_class_weight\n",
72
+ "\n",
73
+ "def train_model(training_df, validation_df, target_column, classifier_model, subissues_to_drop=None, random_state=42):\n",
74
+ "    \"\"\"Fit a TF-IDF + classifier pipeline on complaint narratives and print validation metrics.\n",
75
+ "\n",
76
+ "    Rows whose `target_column` value is listed in `subissues_to_drop` are removed from both frames.\n",
77
+ "    Balanced class weights are computed from the training labels and set on `classifier_model`\n",
78
+ "    (in place) when it exposes a `class_weight` parameter that is still None. `random_state` is\n",
79
+ "    kept for call compatibility and is not read here. Returns the fitted Pipeline.\n",
80
+ "    \"\"\"\n",
81
+ "    # Drop specified sub-issues from training and validation dataframes\n",
82
+ "    if subissues_to_drop:\n",
83
+ "        training_df = training_df[~training_df[target_column].isin(subissues_to_drop)]\n",
84
+ "        validation_df = validation_df[~validation_df[target_column].isin(subissues_to_drop)]\n",
85
+ "\n",
86
+ "    # Balanced weights: inversely proportional to each label's frequency in the training split\n",
87
+ "    labels = np.unique(training_df[target_column])\n",
88
+ "    class_weights = compute_class_weight('balanced', classes=labels, y=training_df[target_column])\n",
89
+ "    class_weight = dict(zip(labels, class_weights))\n",
90
+ "\n",
91
+ "    # The weights used to be computed and then dropped; pass them to the classifier, but only\n",
92
+ "    # when it has a class_weight parameter the caller left unset (estimators without one are skipped)\n",
93
+ "    if hasattr(classifier_model, 'get_params') and classifier_model.get_params().get('class_weight', False) is None:\n",
94
+ "        classifier_model.set_params(class_weight=class_weight)\n",
95
+ "\n",
96
+ "    # Define the pipeline\n",
97
+ "    pipeline = Pipeline([\n",
98
+ "        ('tfidf', TfidfVectorizer()),\n",
99
+ "        ('classifier', classifier_model)\n",
100
+ "    ])\n",
101
+ "\n",
102
+ "    # Train on the narratives, then predict the validation set\n",
103
+ "    pipeline.fit(training_df['Consumer complaint narrative'], training_df[target_column])\n",
104
+ "    y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])\n",
105
+ "\n",
106
+ "    # Evaluate; zero_division=0 reports 0.0 for never-predicted labels without the warning\n",
107
+ "    accuracy = accuracy_score(validation_df[target_column], y_pred)\n",
108
+ "    print(\"Accuracy:\", accuracy)\n",
109
+ "    print(\"\\nClassification Report:\")\n",
110
+ "    print(classification_report(validation_df[target_column], y_pred, zero_division=0))\n",
111
+ "    return pipeline"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3",
117
+ "metadata": {
118
+ "jp-MarkdownHeadingCollapsed": true
119
+ },
120
+ "source": [
121
+ "#### Reading the Issue DataFrame"
122
+ ]
123
+ },
124
+ {
125
+ "cell_type": "code",
126
+ "execution_count": 6,
127
+ "id": "c1ea3fbc-4062-483b-a5c6-65d644983ce5",
128
+ "metadata": {},
129
+ "outputs": [],
130
+ "source": [
131
+ "import os\n",
132
+ "import pandas as pd\n",
133
+ "\n",
134
+ "def read_subissue_data(issue_name, data_dir='../data_preprocessing_scripts/issue_data_splits'):\n",
135
+ "    \"\"\"Load the train and validation CSV splits for one issue; returns (train_df, val_df).\"\"\"\n",
136
+ "    # File stem: '/' and ' ' become '_', then everything is lower-cased\n",
137
+ "    stem = issue_name.translate(str.maketrans('/ ', '__')).lower()\n",
138
+ "\n",
139
+ "    # Both splits live in data_dir as '<stem>_train_data.csv' and '<stem>_val_data.csv'\n",
140
+ "    train_path = os.path.join(data_dir, f\"{stem}_train_data.csv\")\n",
141
+ "    val_path = os.path.join(data_dir, f\"{stem}_val_data.csv\")\n",
142
+ "\n",
143
+ "    # Training split is read first, so a missing training file is the first error raised\n",
144
+ "    train_df = pd.read_csv(train_path)\n",
145
+ "    val_df = pd.read_csv(val_path)\n",
146
+ "    return train_df, val_df"
147
+ ]
148
+ },
149
+ {
150
+ "cell_type": "markdown",
151
+ "id": "7a53f046-c7f8-48de-a8f3-9a66ffad5f55",
152
+ "metadata": {
153
+ "jp-MarkdownHeadingCollapsed": true
154
+ },
155
+ "source": [
156
+ "#### Incorrect Information on your report"
157
+ ]
158
+ },
159
+ {
160
+ "cell_type": "code",
161
+ "execution_count": 7,
162
+ "id": "665d036b-dd86-4cf5-a2ff-23358fb148c7",
163
+ "metadata": {},
164
+ "outputs": [],
165
+ "source": [
166
+ "incorrect_information_train_df,incorrect_information_val_df= read_subissue_data('Incorrect information on your report')"
167
+ ]
168
+ },
169
+ {
170
+ "cell_type": "code",
171
+ "execution_count": 9,
172
+ "id": "be2f7669-496b-4f5d-a4ab-1dd8137ec988",
173
+ "metadata": {},
174
+ "outputs": [
175
+ {
176
+ "data": {
177
+ "text/html": [
178
+ "<div>\n",
179
+ "<style scoped>\n",
180
+ " .dataframe tbody tr th:only-of-type {\n",
181
+ " vertical-align: middle;\n",
182
+ " }\n",
183
+ "\n",
184
+ " .dataframe tbody tr th {\n",
185
+ " vertical-align: top;\n",
186
+ " }\n",
187
+ "\n",
188
+ " .dataframe thead th {\n",
189
+ " text-align: right;\n",
190
+ " }\n",
191
+ "</style>\n",
192
+ "<table border=\"1\" class=\"dataframe\">\n",
193
+ " <thead>\n",
194
+ " <tr style=\"text-align: right;\">\n",
195
+ " <th></th>\n",
196
+ " <th>Unnamed: 0</th>\n",
197
+ " <th>Consumer complaint narrative</th>\n",
198
+ " <th>Issue</th>\n",
199
+ " <th>Sub-issue</th>\n",
200
+ " </tr>\n",
201
+ " </thead>\n",
202
+ " <tbody>\n",
203
+ " <tr>\n",
204
+ " <th>0</th>\n",
205
+ " <td>153339</td>\n",
206
+ " <td>XX/XX/XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX,...</td>\n",
207
+ " <td>Incorrect information on your report</td>\n",
208
+ " <td>Old information reappears or never goes away</td>\n",
209
+ " </tr>\n",
210
+ " <tr>\n",
211
+ " <th>1</th>\n",
212
+ " <td>160945</td>\n",
213
+ " <td>This is my Follow-up request that I have been ...</td>\n",
214
+ " <td>Incorrect information on your report</td>\n",
215
+ " <td>Information belongs to someone else</td>\n",
216
+ " </tr>\n",
217
+ " <tr>\n",
218
+ " <th>2</th>\n",
219
+ " <td>41469</td>\n",
220
+ " <td>This inquiry does not belong to me ; I have no...</td>\n",
221
+ " <td>Incorrect information on your report</td>\n",
222
+ " <td>Information belongs to someone else</td>\n",
223
+ " </tr>\n",
224
+ " <tr>\n",
225
+ " <th>3</th>\n",
226
+ " <td>34315</td>\n",
227
+ " <td>I have items passed on that should be taken ou...</td>\n",
228
+ " <td>Incorrect information on your report</td>\n",
229
+ " <td>Information belongs to someone else</td>\n",
230
+ " </tr>\n",
231
+ " <tr>\n",
232
+ " <th>4</th>\n",
233
+ " <td>48970</td>\n",
234
+ " <td>Im submitting a complaint to you today to info...</td>\n",
235
+ " <td>Incorrect information on your report</td>\n",
236
+ " <td>Information belongs to someone else</td>\n",
237
+ " </tr>\n",
238
+ " </tbody>\n",
239
+ "</table>\n",
240
+ "</div>"
241
+ ],
242
+ "text/plain": [
243
+ " Unnamed: 0 Consumer complaint narrative \\\n",
244
+ "0 153339 XX/XX/XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX,... \n",
245
+ "1 160945 This is my Follow-up request that I have been ... \n",
246
+ "2 41469 This inquiry does not belong to me ; I have no... \n",
247
+ "3 34315 I have items passed on that should be taken ou... \n",
248
+ "4 48970 Im submitting a complaint to you today to info... \n",
249
+ "\n",
250
+ " Issue \\\n",
251
+ "0 Incorrect information on your report \n",
252
+ "1 Incorrect information on your report \n",
253
+ "2 Incorrect information on your report \n",
254
+ "3 Incorrect information on your report \n",
255
+ "4 Incorrect information on your report \n",
256
+ "\n",
257
+ " Sub-issue \n",
258
+ "0 Old information reappears or never goes away \n",
259
+ "1 Information belongs to someone else \n",
260
+ "2 Information belongs to someone else \n",
261
+ "3 Information belongs to someone else \n",
262
+ "4 Information belongs to someone else "
263
+ ]
264
+ },
265
+ "execution_count": 9,
266
+ "metadata": {},
267
+ "output_type": "execute_result"
268
+ }
269
+ ],
270
+ "source": [
271
+ "incorrect_information_train_df.head()"
272
+ ]
273
+ },
274
+ {
275
+ "cell_type": "code",
276
+ "execution_count": 10,
277
+ "id": "b78398b7-d027-403f-acf4-fa580d113b02",
278
+ "metadata": {},
279
+ "outputs": [
280
+ {
281
+ "name": "stdout",
282
+ "output_type": "stream",
283
+ "text": [
284
+ "Accuracy: 0.8831804281345565\n",
285
+ "\n",
286
+ "Classification Report:\n",
287
+ " precision recall f1-score support\n",
288
+ "\n",
289
+ " Account information incorrect 0.74 0.68 0.71 699\n",
290
+ " Account status incorrect 0.87 0.73 0.79 771\n",
291
+ " Information belongs to someone else 0.90 0.99 0.94 4337\n",
292
+ "Information is missing that should be on the report 0.95 0.31 0.47 65\n",
293
+ " Old information reappears or never goes away 0.93 0.40 0.56 126\n",
294
+ " Personal information incorrect 0.95 0.78 0.86 440\n",
295
+ " Public record information inaccurate 0.98 0.47 0.64 102\n",
296
+ "\n",
297
+ " accuracy 0.88 6540\n",
298
+ " macro avg 0.90 0.62 0.71 6540\n",
299
+ " weighted avg 0.88 0.88 0.88 6540\n",
300
+ "\n"
301
+ ]
302
+ }
303
+ ],
304
+ "source": [
305
+ "from sklearn.ensemble import RandomForestClassifier\n",
306
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
307
+ "trained_model_ii = train_model(incorrect_information_train_df, incorrect_information_val_df, 'Sub-issue', rf_classifier, random_state=42)"
308
+ ]
309
+ },
310
+ {
311
+ "cell_type": "code",
312
+ "execution_count": 11,
313
+ "id": "85bbc3fe-50b0-4578-8e67-151861f839da",
314
+ "metadata": {},
315
+ "outputs": [],
316
+ "source": [
317
+ "with open('issue_models/incorrect_information_on_your_report.pkl', 'wb') as f:\n",
318
+ " pickle.dump(trained_model_ii, f)"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "markdown",
323
+ "id": "5c529ed8-3735-4494-9f90-6c005dfea6df",
324
+ "metadata": {
325
+ "jp-MarkdownHeadingCollapsed": true
326
+ },
327
+ "source": [
328
+ "#### Improper use of your report "
329
+ ]
330
+ },
331
+ {
332
+ "cell_type": "code",
333
+ "execution_count": 12,
334
+ "id": "f33b26e9-4c5b-4498-ab23-a88aca5eb07f",
335
+ "metadata": {},
336
+ "outputs": [
337
+ {
338
+ "data": {
339
+ "text/html": [
340
+ "<div>\n",
341
+ "<style scoped>\n",
342
+ " .dataframe tbody tr th:only-of-type {\n",
343
+ " vertical-align: middle;\n",
344
+ " }\n",
345
+ "\n",
346
+ " .dataframe tbody tr th {\n",
347
+ " vertical-align: top;\n",
348
+ " }\n",
349
+ "\n",
350
+ " .dataframe thead th {\n",
351
+ " text-align: right;\n",
352
+ " }\n",
353
+ "</style>\n",
354
+ "<table border=\"1\" class=\"dataframe\">\n",
355
+ " <thead>\n",
356
+ " <tr style=\"text-align: right;\">\n",
357
+ " <th></th>\n",
358
+ " <th>Unnamed: 0</th>\n",
359
+ " <th>Consumer complaint narrative</th>\n",
360
+ " <th>Issue</th>\n",
361
+ " <th>Sub-issue</th>\n",
362
+ " </tr>\n",
363
+ " </thead>\n",
364
+ " <tbody>\n",
365
+ " <tr>\n",
366
+ " <th>0</th>\n",
367
+ " <td>75713</td>\n",
368
+ " <td>I found inaccurate and incorrect data on my cr...</td>\n",
369
+ " <td>Improper use of your report</td>\n",
370
+ " <td>Credit inquiries on your report that you don't...</td>\n",
371
+ " </tr>\n",
372
+ " <tr>\n",
373
+ " <th>1</th>\n",
374
+ " <td>72157</td>\n",
375
+ " <td>HI I AM SUBMITTING THIS WITHOUT ANY INFLUENCE ...</td>\n",
376
+ " <td>Improper use of your report</td>\n",
377
+ " <td>Credit inquiries on your report that you don't...</td>\n",
378
+ " </tr>\n",
379
+ " <tr>\n",
380
+ " <th>2</th>\n",
381
+ " <td>174012</td>\n",
382
+ " <td>I checked my credit report and seen that there...</td>\n",
383
+ " <td>Improper use of your report</td>\n",
384
+ " <td>Credit inquiries on your report that you don't...</td>\n",
385
+ " </tr>\n",
386
+ " <tr>\n",
387
+ " <th>3</th>\n",
388
+ " <td>131412</td>\n",
389
+ " <td>XXXX XXXX XXXX XXXX has started to report inco...</td>\n",
390
+ " <td>Improper use of your report</td>\n",
391
+ " <td>Reporting company used your report improperly</td>\n",
392
+ " </tr>\n",
393
+ " <tr>\n",
394
+ " <th>4</th>\n",
395
+ " <td>157599</td>\n",
396
+ " <td>My name is XXXX XXXX this complaint is not mad...</td>\n",
397
+ " <td>Improper use of your report</td>\n",
398
+ " <td>Reporting company used your report improperly</td>\n",
399
+ " </tr>\n",
400
+ " </tbody>\n",
401
+ "</table>\n",
402
+ "</div>"
403
+ ],
404
+ "text/plain": [
405
+ " Unnamed: 0 Consumer complaint narrative \\\n",
406
+ "0 75713 I found inaccurate and incorrect data on my cr... \n",
407
+ "1 72157 HI I AM SUBMITTING THIS WITHOUT ANY INFLUENCE ... \n",
408
+ "2 174012 I checked my credit report and seen that there... \n",
409
+ "3 131412 XXXX XXXX XXXX XXXX has started to report inco... \n",
410
+ "4 157599 My name is XXXX XXXX this complaint is not mad... \n",
411
+ "\n",
412
+ " Issue \\\n",
413
+ "0 Improper use of your report \n",
414
+ "1 Improper use of your report \n",
415
+ "2 Improper use of your report \n",
416
+ "3 Improper use of your report \n",
417
+ "4 Improper use of your report \n",
418
+ "\n",
419
+ " Sub-issue \n",
420
+ "0 Credit inquiries on your report that you don't... \n",
421
+ "1 Credit inquiries on your report that you don't... \n",
422
+ "2 Credit inquiries on your report that you don't... \n",
423
+ "3 Reporting company used your report improperly \n",
424
+ "4 Reporting company used your report improperly "
425
+ ]
426
+ },
427
+ "execution_count": 12,
428
+ "metadata": {},
429
+ "output_type": "execute_result"
430
+ }
431
+ ],
432
+ "source": [
433
+ "improper_use_report_train_df,improper_use_report_val_df= read_subissue_data('Improper use of your report')\n",
434
+ "improper_use_report_train_df.head()"
435
+ ]
436
+ },
437
+ {
438
+ "cell_type": "code",
439
+ "execution_count": 13,
440
+ "id": "c8dcc18b-f7bb-4edd-965a-8c58500a0ea6",
441
+ "metadata": {},
442
+ "outputs": [
443
+ {
444
+ "name": "stdout",
445
+ "output_type": "stream",
446
+ "text": [
447
+ "Accuracy: 0.9528423772609819\n",
448
+ "\n",
449
+ "Classification Report:\n",
450
+ " precision recall f1-score support\n",
451
+ "\n",
452
+ "Credit inquiries on your report that you don't recognize 0.93 0.84 0.88 990\n",
453
+ " Reporting company used your report improperly 0.96 0.98 0.97 3654\n",
454
+ "\n",
455
+ " accuracy 0.95 4644\n",
456
+ " macro avg 0.95 0.91 0.93 4644\n",
457
+ " weighted avg 0.95 0.95 0.95 4644\n",
458
+ "\n"
459
+ ]
460
+ }
461
+ ],
462
+ "source": [
463
+ "from sklearn.ensemble import RandomForestClassifier\n",
464
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
465
+ "trained_model_iu = train_model(improper_use_report_train_df, improper_use_report_val_df, 'Sub-issue', rf_classifier, random_state=42)"
466
+ ]
467
+ },
468
+ {
469
+ "cell_type": "code",
470
+ "execution_count": 60,
471
+ "id": "a668b946-da36-410f-b474-f8a311952c5d",
472
+ "metadata": {},
473
+ "outputs": [],
474
+ "source": [
475
+ "with open('issue_models/improper_use_of_your_report.pkl', 'wb') as f:  # same naming as issue_models/incorrect_information_on_your_report.pkl\n",
476
+ "    pickle.dump(trained_model_iu, f)"
477
+ ]
478
+ },
479
+ {
480
+ "cell_type": "markdown",
481
+ "id": "74796ebf-9934-46d2-a1b7-d6672dea727c",
482
+ "metadata": {
483
+ "jp-MarkdownHeadingCollapsed": true
484
+ },
485
+ "source": [
486
+ "#### Problem with a credit reporting company's investigation into an existing problem"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 25,
492
+ "id": "7cde4eda-37a1-4643-b62b-41e7be8f865f",
493
+ "metadata": {},
494
+ "outputs": [
495
+ {
496
+ "data": {
497
+ "text/html": [
498
+ "<div>\n",
499
+ "<style scoped>\n",
500
+ " .dataframe tbody tr th:only-of-type {\n",
501
+ " vertical-align: middle;\n",
502
+ " }\n",
503
+ "\n",
504
+ " .dataframe tbody tr th {\n",
505
+ " vertical-align: top;\n",
506
+ " }\n",
507
+ "\n",
508
+ " .dataframe thead th {\n",
509
+ " text-align: right;\n",
510
+ " }\n",
511
+ "</style>\n",
512
+ "<table border=\"1\" class=\"dataframe\">\n",
513
+ " <thead>\n",
514
+ " <tr style=\"text-align: right;\">\n",
515
+ " <th></th>\n",
516
+ " <th>Unnamed: 0</th>\n",
517
+ " <th>Consumer complaint narrative</th>\n",
518
+ " <th>Issue</th>\n",
519
+ " <th>Sub-issue</th>\n",
520
+ " </tr>\n",
521
+ " </thead>\n",
522
+ " <tbody>\n",
523
+ " <tr>\n",
524
+ " <th>0</th>\n",
525
+ " <td>117380</td>\n",
526
+ " <td>On XX/XX/2023 I sent a letter to XXXX, Experia...</td>\n",
527
+ " <td>Problem with a credit reporting company's inve...</td>\n",
528
+ " <td>Investigation took more than 30 days</td>\n",
529
+ " </tr>\n",
530
+ " <tr>\n",
531
+ " <th>1</th>\n",
532
+ " <td>172530</td>\n",
533
+ " <td>XXXX XXXX XXXX XXXX XXXX, PA XXXX Please be ad...</td>\n",
534
+ " <td>Problem with a credit reporting company's inve...</td>\n",
535
+ " <td>Their investigation did not fix an error on yo...</td>\n",
536
+ " </tr>\n",
537
+ " <tr>\n",
538
+ " <th>2</th>\n",
539
+ " <td>5336</td>\n",
540
+ " <td>This creditor engaged in abusive, deceptive, a...</td>\n",
541
+ " <td>Problem with a credit reporting company's inve...</td>\n",
542
+ " <td>Was not notified of investigation status or re...</td>\n",
543
+ " </tr>\n",
544
+ " <tr>\n",
545
+ " <th>3</th>\n",
546
+ " <td>63755</td>\n",
547
+ " <td>Despite multiple written requests, the unverif...</td>\n",
548
+ " <td>Problem with a credit reporting company's inve...</td>\n",
549
+ " <td>Their investigation did not fix an error on yo...</td>\n",
550
+ " </tr>\n",
551
+ " <tr>\n",
552
+ " <th>4</th>\n",
553
+ " <td>124437</td>\n",
554
+ " <td>I have a loan with DEPT OF EDUCATION / XXXX. I...</td>\n",
555
+ " <td>Problem with a credit reporting company's inve...</td>\n",
556
+ " <td>Their investigation did not fix an error on yo...</td>\n",
557
+ " </tr>\n",
558
+ " </tbody>\n",
559
+ "</table>\n",
560
+ "</div>"
561
+ ],
562
+ "text/plain": [
563
+ " Unnamed: 0 Consumer complaint narrative \\\n",
564
+ "0 117380 On XX/XX/2023 I sent a letter to XXXX, Experia... \n",
565
+ "1 172530 XXXX XXXX XXXX XXXX XXXX, PA XXXX Please be ad... \n",
566
+ "2 5336 This creditor engaged in abusive, deceptive, a... \n",
567
+ "3 63755 Despite multiple written requests, the unverif... \n",
568
+ "4 124437 I have a loan with DEPT OF EDUCATION / XXXX. I... \n",
569
+ "\n",
570
+ " Issue \\\n",
571
+ "0 Problem with a credit reporting company's inve... \n",
572
+ "1 Problem with a credit reporting company's inve... \n",
573
+ "2 Problem with a credit reporting company's inve... \n",
574
+ "3 Problem with a credit reporting company's inve... \n",
575
+ "4 Problem with a credit reporting company's inve... \n",
576
+ "\n",
577
+ " Sub-issue \n",
578
+ "0 Investigation took more than 30 days \n",
579
+ "1 Their investigation did not fix an error on yo... \n",
580
+ "2 Was not notified of investigation status or re... \n",
581
+ "3 Their investigation did not fix an error on yo... \n",
582
+ "4 Their investigation did not fix an error on yo... "
583
+ ]
584
+ },
585
+ "execution_count": 25,
586
+ "metadata": {},
587
+ "output_type": "execute_result"
588
+ }
589
+ ],
590
+ "source": [
591
+ "problem_credit_reporting_train_df, problem_credit_reporting_val_df = read_subissue_data(\"Problem with a credit reporting company's investigation into an existing problem\")\n",
592
+ "\n",
593
+ "# Displaying the first few rows of the training data\n",
594
+ "problem_credit_reporting_train_df.head()\n"
595
+ ]
596
+ },
597
+ {
598
+ "cell_type": "code",
599
+ "execution_count": 26,
600
+ "id": "1cc65f08-96c8-4458-8703-b84b7554a04c",
601
+ "metadata": {},
602
+ "outputs": [
603
+ {
604
+ "name": "stdout",
605
+ "output_type": "stream",
606
+ "text": [
607
+ "Accuracy: 0.9288035450516987\n",
608
+ "\n",
609
+ "Classification Report:\n",
610
+ " precision recall f1-score support\n",
611
+ "\n",
612
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.83 0.36 0.50 83\n",
613
+ " Investigation took more than 30 days 0.97 0.84 0.90 505\n",
614
+ " Problem with personal statement of dispute 1.00 0.38 0.55 47\n",
615
+ " Their investigation did not fix an error on your report 0.92 0.99 0.95 2277\n",
616
+ " Was not notified of investigation status or results 0.96 0.88 0.92 473\n",
617
+ "\n",
618
+ " accuracy 0.93 3385\n",
619
+ " macro avg 0.94 0.69 0.77 3385\n",
620
+ " weighted avg 0.93 0.93 0.92 3385\n",
621
+ "\n"
622
+ ]
623
+ }
624
+ ],
625
+ "source": [
626
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
627
+ "trained_model_problem_credit_reporting = train_model(problem_credit_reporting_train_df, problem_credit_reporting_val_df, 'Sub-issue', rf_classifier, random_state=42)"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "code",
632
+ "execution_count": 69,
633
+ "id": "59c87ff1-d7de-41a9-9e0a-33630bff1c18",
634
+ "metadata": {},
635
+ "outputs": [],
636
+ "source": [
637
+ "with open(\"issue_models/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem.pkl\", 'wb') as f:  # file named after the issue\n",
638
+ "    pickle.dump(trained_model_problem_credit_reporting, f)"
639
+ ]
640
+ },
641
+ {
642
+ "cell_type": "markdown",
643
+ "id": "fe443859-4be6-4b87-be79-22487aaf5b3b",
644
+ "metadata": {
645
+ "jp-MarkdownHeadingCollapsed": true
646
+ },
647
+ "source": [
648
+ "#### Problem with a company's investigation into an existing problem"
649
+ ]
650
+ },
651
+ {
652
+ "cell_type": "code",
653
+ "execution_count": 28,
654
+ "id": "31a70db8-06cb-4fb0-8d45-a7451aa81b0e",
655
+ "metadata": {},
656
+ "outputs": [
657
+ {
658
+ "data": {
659
+ "text/html": [
660
+ "<div>\n",
661
+ "<style scoped>\n",
662
+ " .dataframe tbody tr th:only-of-type {\n",
663
+ " vertical-align: middle;\n",
664
+ " }\n",
665
+ "\n",
666
+ " .dataframe tbody tr th {\n",
667
+ " vertical-align: top;\n",
668
+ " }\n",
669
+ "\n",
670
+ " .dataframe thead th {\n",
671
+ " text-align: right;\n",
672
+ " }\n",
673
+ "</style>\n",
674
+ "<table border=\"1\" class=\"dataframe\">\n",
675
+ " <thead>\n",
676
+ " <tr style=\"text-align: right;\">\n",
677
+ " <th></th>\n",
678
+ " <th>Unnamed: 0</th>\n",
679
+ " <th>Consumer complaint narrative</th>\n",
680
+ " <th>Issue</th>\n",
681
+ " <th>Sub-issue</th>\n",
682
+ " </tr>\n",
683
+ " </thead>\n",
684
+ " <tbody>\n",
685
+ " <tr>\n",
686
+ " <th>0</th>\n",
687
+ " <td>30922</td>\n",
688
+ " <td>I have filed numerous FTC reports and disputes...</td>\n",
689
+ " <td>Problem with a company's investigation into an...</td>\n",
690
+ " <td>Investigation took more than 30 days</td>\n",
691
+ " </tr>\n",
692
+ " <tr>\n",
693
+ " <th>1</th>\n",
694
+ " <td>6933</td>\n",
695
+ " <td>I filed a dispute for incorrect information on...</td>\n",
696
+ " <td>Problem with a company's investigation into an...</td>\n",
697
+ " <td>Their investigation did not fix an error on yo...</td>\n",
698
+ " </tr>\n",
699
+ " <tr>\n",
700
+ " <th>2</th>\n",
701
+ " <td>34620</td>\n",
702
+ " <td>When I reviewed my credit report, I discovered...</td>\n",
703
+ " <td>Problem with a company's investigation into an...</td>\n",
704
+ " <td>Their investigation did not fix an error on yo...</td>\n",
705
+ " </tr>\n",
706
+ " <tr>\n",
707
+ " <th>3</th>\n",
708
+ " <td>56460</td>\n",
709
+ " <td>I am writing to convey my ongoing concern rega...</td>\n",
710
+ " <td>Problem with a company's investigation into an...</td>\n",
711
+ " <td>Their investigation did not fix an error on yo...</td>\n",
712
+ " </tr>\n",
713
+ " <tr>\n",
714
+ " <th>4</th>\n",
715
+ " <td>128600</td>\n",
716
+ " <td>When I reviewed my credit report, I discovered...</td>\n",
717
+ " <td>Problem with a company's investigation into an...</td>\n",
718
+ " <td>Their investigation did not fix an error on yo...</td>\n",
719
+ " </tr>\n",
720
+ " </tbody>\n",
721
+ "</table>\n",
722
+ "</div>"
723
+ ],
724
+ "text/plain": [
725
+ " Unnamed: 0 Consumer complaint narrative \\\n",
726
+ "0 30922 I have filed numerous FTC reports and disputes... \n",
727
+ "1 6933 I filed a dispute for incorrect information on... \n",
728
+ "2 34620 When I reviewed my credit report, I discovered... \n",
729
+ "3 56460 I am writing to convey my ongoing concern rega... \n",
730
+ "4 128600 When I reviewed my credit report, I discovered... \n",
731
+ "\n",
732
+ " Issue \\\n",
733
+ "0 Problem with a company's investigation into an... \n",
734
+ "1 Problem with a company's investigation into an... \n",
735
+ "2 Problem with a company's investigation into an... \n",
736
+ "3 Problem with a company's investigation into an... \n",
737
+ "4 Problem with a company's investigation into an... \n",
738
+ "\n",
739
+ " Sub-issue \n",
740
+ "0 Investigation took more than 30 days \n",
741
+ "1 Their investigation did not fix an error on yo... \n",
742
+ "2 Their investigation did not fix an error on yo... \n",
743
+ "3 Their investigation did not fix an error on yo... \n",
744
+ "4 Their investigation did not fix an error on yo... "
745
+ ]
746
+ },
747
+ "execution_count": 28,
748
+ "metadata": {},
749
+ "output_type": "execute_result"
750
+ }
751
+ ],
752
+ "source": [
753
+ "# Reading the data\n",
754
+ "problem_company_investigation_train_df, problem_company_investigation_val_df = read_subissue_data(\"Problem with a company's investigation into an existing problem\")\n",
755
+ "\n",
756
+ "# Displaying the first few rows of the training data\n",
757
+ "problem_company_investigation_train_df.head()"
758
+ ]
759
+ },
760
+ {
761
+ "cell_type": "code",
762
+ "execution_count": 29,
763
+ "id": "0e70a22d-01f9-4f59-a903-286a05eb5179",
764
+ "metadata": {},
765
+ "outputs": [
766
+ {
767
+ "name": "stdout",
768
+ "output_type": "stream",
769
+ "text": [
770
+ "Accuracy: 0.9199747952110902\n",
771
+ "\n",
772
+ "Classification Report:\n",
773
+ " precision recall f1-score support\n",
774
+ "\n",
775
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.88 0.37 0.52 41\n",
776
+ " Investigation took more than 30 days 0.95 0.73 0.83 162\n",
777
+ " Problem with personal statement of dispute 0.90 0.53 0.67 53\n",
778
+ " Their investigation did not fix an error on your report 0.91 1.00 0.95 1122\n",
779
+ " Was not notified of investigation status or results 0.98 0.87 0.92 209\n",
780
+ "\n",
781
+ " accuracy 0.92 1587\n",
782
+ " macro avg 0.93 0.70 0.78 1587\n",
783
+ " weighted avg 0.92 0.92 0.91 1587\n",
784
+ "\n"
785
+ ]
786
+ }
787
+ ],
788
+ "source": [
789
+ "from sklearn.ensemble import RandomForestClassifier\n",
790
+ "\n",
791
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
792
+ "trained_model_problem_company_investigation = train_model(problem_company_investigation_train_df, problem_company_investigation_val_df, 'Sub-issue', rf_classifier, random_state=42)\n"
793
+ ]
794
+ },
795
+ {
796
+ "cell_type": "code",
797
+ "execution_count": 68,
798
+ "id": "ac3f39d0-8cb8-457e-9db7-510cc5a99830",
799
+ "metadata": {},
800
+ "outputs": [],
801
+ "source": [
802
+ "with open(\"issue_models/problem_with_a_company's_investigation_into_an_existing_problem.pkl\", 'wb') as f:  # file named after the issue\n",
803
+ "    pickle.dump(trained_model_problem_company_investigation, f)"
804
+ ]
805
+ },
806
+ {
807
+ "cell_type": "markdown",
808
+ "id": "0787d4eb-9673-417b-91d1-cc98becd037e",
809
+ "metadata": {
810
+ "jp-MarkdownHeadingCollapsed": true
811
+ },
812
+ "source": [
813
+ "#### Managing an account"
814
+ ]
815
+ },
816
+ {
817
+ "cell_type": "code",
818
+ "execution_count": 30,
819
+ "id": "8e074864-16f6-4fd5-8bfe-b054aeb0fc2a",
820
+ "metadata": {},
821
+ "outputs": [
822
+ {
823
+ "data": {
824
+ "text/html": [
825
+ "<div>\n",
826
+ "<style scoped>\n",
827
+ " .dataframe tbody tr th:only-of-type {\n",
828
+ " vertical-align: middle;\n",
829
+ " }\n",
830
+ "\n",
831
+ " .dataframe tbody tr th {\n",
832
+ " vertical-align: top;\n",
833
+ " }\n",
834
+ "\n",
835
+ " .dataframe thead th {\n",
836
+ " text-align: right;\n",
837
+ " }\n",
838
+ "</style>\n",
839
+ "<table border=\"1\" class=\"dataframe\">\n",
840
+ " <thead>\n",
841
+ " <tr style=\"text-align: right;\">\n",
842
+ " <th></th>\n",
843
+ " <th>Unnamed: 0</th>\n",
844
+ " <th>Consumer complaint narrative</th>\n",
845
+ " <th>Issue</th>\n",
846
+ " <th>Sub-issue</th>\n",
847
+ " </tr>\n",
848
+ " </thead>\n",
849
+ " <tbody>\n",
850
+ " <tr>\n",
851
+ " <th>0</th>\n",
852
+ " <td>37312</td>\n",
853
+ " <td>On XX/XX/2023 I had XXXX in my savings account...</td>\n",
854
+ " <td>Managing an account</td>\n",
855
+ " <td>Fee problem</td>\n",
856
+ " </tr>\n",
857
+ " <tr>\n",
858
+ " <th>1</th>\n",
859
+ " <td>92449</td>\n",
860
+ " <td>I recently opened a new account with this bank...</td>\n",
861
+ " <td>Managing an account</td>\n",
862
+ " <td>Deposits and withdrawals</td>\n",
863
+ " </tr>\n",
864
+ " <tr>\n",
865
+ " <th>2</th>\n",
866
+ " <td>169426</td>\n",
867
+ " <td>Wells Fargo bank has leaked my account details...</td>\n",
868
+ " <td>Managing an account</td>\n",
869
+ " <td>Deposits and withdrawals</td>\n",
870
+ " </tr>\n",
871
+ " <tr>\n",
872
+ " <th>3</th>\n",
873
+ " <td>60751</td>\n",
874
+ " <td>I disputed two transactions on my Wells Fargo ...</td>\n",
875
+ " <td>Managing an account</td>\n",
876
+ " <td>Problem using a debit or ATM card</td>\n",
877
+ " </tr>\n",
878
+ " <tr>\n",
879
+ " <th>4</th>\n",
880
+ " <td>169432</td>\n",
881
+ " <td>On XX/XX/23 someone hacked my XXXX app and ord...</td>\n",
882
+ " <td>Managing an account</td>\n",
883
+ " <td>Funds not handled or disbursed as instructed</td>\n",
884
+ " </tr>\n",
885
+ " </tbody>\n",
886
+ "</table>\n",
887
+ "</div>"
888
+ ],
889
+ "text/plain": [
890
+ " Unnamed: 0 Consumer complaint narrative \\\n",
891
+ "0 37312 On XX/XX/2023 I had XXXX in my savings account... \n",
892
+ "1 92449 I recently opened a new account with this bank... \n",
893
+ "2 169426 Wells Fargo bank has leaked my account details... \n",
894
+ "3 60751 I disputed two transactions on my Wells Fargo ... \n",
895
+ "4 169432 On XX/XX/23 someone hacked my XXXX app and ord... \n",
896
+ "\n",
897
+ " Issue Sub-issue \n",
898
+ "0 Managing an account Fee problem \n",
899
+ "1 Managing an account Deposits and withdrawals \n",
900
+ "2 Managing an account Deposits and withdrawals \n",
901
+ "3 Managing an account Problem using a debit or ATM card \n",
902
+ "4 Managing an account Funds not handled or disbursed as instructed "
903
+ ]
904
+ },
905
+ "execution_count": 30,
906
+ "metadata": {},
907
+ "output_type": "execute_result"
908
+ }
909
+ ],
910
+ "source": [
911
+ "# Update the issue name in the function call to read_subissue_data\n",
912
+ "managing_account_train_df, managing_account_val_df = read_subissue_data(\"Managing an account\")\n",
913
+ "\n",
914
+ "# Displaying the first few rows of the training data\n",
915
+ "managing_account_train_df.head()\n"
916
+ ]
917
+ },
918
+ {
919
+ "cell_type": "code",
920
+ "execution_count": 31,
921
+ "id": "57257613-7dde-4561-942c-f559d2159744",
922
+ "metadata": {},
923
+ "outputs": [
924
+ {
925
+ "name": "stdout",
926
+ "output_type": "stream",
927
+ "text": [
928
+ "Accuracy: 0.5171171171171172\n",
929
+ "\n",
930
+ "Classification Report:\n",
931
+ " precision recall f1-score support\n",
932
+ "\n",
933
+ " Banking errors 0.50 0.10 0.16 73\n",
934
+ " Deposits and withdrawals 0.47 0.90 0.61 201\n",
935
+ " Fee problem 0.56 0.59 0.57 56\n",
936
+ "Funds not handled or disbursed as instructed 0.00 0.00 0.00 72\n",
937
+ " Problem accessing account 0.00 0.00 0.00 40\n",
938
+ " Problem using a debit or ATM card 0.70 0.58 0.64 113\n",
939
+ "\n",
940
+ " accuracy 0.52 555\n",
941
+ " macro avg 0.37 0.36 0.33 555\n",
942
+ " weighted avg 0.43 0.52 0.43 555\n",
943
+ "\n"
944
+ ]
945
+ },
946
+ {
947
+ "name": "stderr",
948
+ "output_type": "stream",
949
+ "text": [
950
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
951
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
952
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
953
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
954
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
955
+ " _warn_prf(average, modifier, msg_start, len(result))\n"
956
+ ]
957
+ }
958
+ ],
959
+ "source": [
960
+ "# Initialize the RandomForestClassifier\n",
961
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
962
+ "\n",
963
+ "# Train the model using the updated training and validation datasets\n",
964
+ "trained_model_managing_account = train_model(managing_account_train_df, managing_account_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
965
+ "\n"
966
+ ]
967
+ },
968
+ {
969
+ "cell_type": "code",
970
+ "execution_count": 75,
971
+ "id": "cca27513-501f-4257-a4b1-0e13a3604250",
972
+ "metadata": {},
973
+ "outputs": [
974
+ {
975
+ "name": "stdout",
976
+ "output_type": "stream",
977
+ "text": [
978
+ "Accuracy: 0.9841168996188056\n",
979
+ "\n",
980
+ "Classification Report:\n",
981
+ " precision recall f1-score support\n",
982
+ "\n",
983
+ " Credit reporting 0.99 1.00 0.99 1500\n",
984
+ "Other personal consumer report 0.93 0.72 0.81 74\n",
985
+ "\n",
986
+ " accuracy 0.98 1574\n",
987
+ " macro avg 0.96 0.86 0.90 1574\n",
988
+ " weighted avg 0.98 0.98 0.98 1574\n",
989
+ "\n"
990
+ ]
991
+ }
992
+ ],
993
+ "source": [
994
+ "# Save the trained model to a file\n",
995
+ "with open('models/Managing_account_model.pkl', 'wb') as f:\n",
996
+ " pickle.dump(trained_model_managing_account, f)"
997
+ ]
998
+ },
999
+ {
1000
+ "cell_type": "code",
1001
+ "execution_count": 76,
1002
+ "id": "3cbb9aa5-6c0c-4b59-a181-7431e8fc60fc",
1003
+ "metadata": {},
1004
+ "outputs": [],
1005
+ "source": [
1006
+ "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n",
1007
+ " pickle.dump(trained_model_cr, f)"
1008
+ ]
1009
+ },
1010
+ {
1011
+ "cell_type": "markdown",
1012
+ "id": "93af6b14-f33a-479b-8b6a-79d6621309ed",
1013
+ "metadata": {
1014
+ "jp-MarkdownHeadingCollapsed": true
1015
+ },
1016
+ "source": [
1017
+ "#### Attempts to collect debt not owed"
1018
+ ]
1019
+ },
1020
+ {
1021
+ "cell_type": "code",
1022
+ "execution_count": 32,
1023
+ "id": "384692cb-09ee-453e-910e-5179f3a33b9d",
1024
+ "metadata": {},
1025
+ "outputs": [
1026
+ {
1027
+ "data": {
1028
+ "text/html": [
1029
+ "<div>\n",
1030
+ "<style scoped>\n",
1031
+ " .dataframe tbody tr th:only-of-type {\n",
1032
+ " vertical-align: middle;\n",
1033
+ " }\n",
1034
+ "\n",
1035
+ " .dataframe tbody tr th {\n",
1036
+ " vertical-align: top;\n",
1037
+ " }\n",
1038
+ "\n",
1039
+ " .dataframe thead th {\n",
1040
+ " text-align: right;\n",
1041
+ " }\n",
1042
+ "</style>\n",
1043
+ "<table border=\"1\" class=\"dataframe\">\n",
1044
+ " <thead>\n",
1045
+ " <tr style=\"text-align: right;\">\n",
1046
+ " <th></th>\n",
1047
+ " <th>Unnamed: 0</th>\n",
1048
+ " <th>Consumer complaint narrative</th>\n",
1049
+ " <th>Issue</th>\n",
1050
+ " <th>Sub-issue</th>\n",
1051
+ " </tr>\n",
1052
+ " </thead>\n",
1053
+ " <tbody>\n",
1054
+ " <tr>\n",
1055
+ " <th>0</th>\n",
1056
+ " <td>74601</td>\n",
1057
+ " <td>I had a mobile number with XXXX XXXX for sever...</td>\n",
1058
+ " <td>Attempts to collect debt not owed</td>\n",
1059
+ " <td>Debt is not yours</td>\n",
1060
+ " </tr>\n",
1061
+ " <tr>\n",
1062
+ " <th>1</th>\n",
1063
+ " <td>126394</td>\n",
1064
+ " <td>When running my credit report I notice a few c...</td>\n",
1065
+ " <td>Attempts to collect debt not owed</td>\n",
1066
+ " <td>Debt was result of identity theft</td>\n",
1067
+ " </tr>\n",
1068
+ " <tr>\n",
1069
+ " <th>2</th>\n",
1070
+ " <td>145518</td>\n",
1071
+ " <td>In early XXXX, XXXX I received notice via the ...</td>\n",
1072
+ " <td>Attempts to collect debt not owed</td>\n",
1073
+ " <td>Debt is not yours</td>\n",
1074
+ " </tr>\n",
1075
+ " <tr>\n",
1076
+ " <th>3</th>\n",
1077
+ " <td>59439</td>\n",
1078
+ " <td>I was sent via U.S. mail a debt collection not...</td>\n",
1079
+ " <td>Attempts to collect debt not owed</td>\n",
1080
+ " <td>Debt was result of identity theft</td>\n",
1081
+ " </tr>\n",
1082
+ " <tr>\n",
1083
+ " <th>4</th>\n",
1084
+ " <td>116810</td>\n",
1085
+ " <td>This debt collector engaged in abusive, decept...</td>\n",
1086
+ " <td>Attempts to collect debt not owed</td>\n",
1087
+ " <td>Debt is not yours</td>\n",
1088
+ " </tr>\n",
1089
+ " </tbody>\n",
1090
+ "</table>\n",
1091
+ "</div>"
1092
+ ],
1093
+ "text/plain": [
1094
+ " Unnamed: 0 Consumer complaint narrative \\\n",
1095
+ "0 74601 I had a mobile number with XXXX XXXX for sever... \n",
1096
+ "1 126394 When running my credit report I notice a few c... \n",
1097
+ "2 145518 In early XXXX, XXXX I received notice via the ... \n",
1098
+ "3 59439 I was sent via U.S. mail a debt collection not... \n",
1099
+ "4 116810 This debt collector engaged in abusive, decept... \n",
1100
+ "\n",
1101
+ " Issue Sub-issue \n",
1102
+ "0 Attempts to collect debt not owed Debt is not yours \n",
1103
+ "1 Attempts to collect debt not owed Debt was result of identity theft \n",
1104
+ "2 Attempts to collect debt not owed Debt is not yours \n",
1105
+ "3 Attempts to collect debt not owed Debt was result of identity theft \n",
1106
+ "4 Attempts to collect debt not owed Debt is not yours "
1107
+ ]
1108
+ },
1109
+ "execution_count": 32,
1110
+ "metadata": {},
1111
+ "output_type": "execute_result"
1112
+ }
1113
+ ],
1114
+ "source": [
1115
+ "debt_collection_train_df, debt_collection_val_df = read_subissue_data(\"Attempts to collect debt not owed\")\n",
1116
+ "\n",
1117
+ "# Displaying the first few rows of the training data\n",
1118
+ "debt_collection_train_df.head()\n",
1119
+ "\n"
1120
+ ]
1121
+ },
1122
+ {
1123
+ "cell_type": "code",
1124
+ "execution_count": 33,
1125
+ "id": "1cd16300-096b-43f2-aa9c-9500fbcdd0bd",
1126
+ "metadata": {},
1127
+ "outputs": [
1128
+ {
1129
+ "name": "stdout",
1130
+ "output_type": "stream",
1131
+ "text": [
1132
+ "Accuracy: 0.7009803921568627\n",
1133
+ "\n",
1134
+ "Classification Report:\n",
1135
+ " precision recall f1-score support\n",
1136
+ "\n",
1137
+ " Debt is not yours 0.64 0.93 0.76 207\n",
1138
+ " Debt was paid 0.96 0.31 0.46 72\n",
1139
+ "Debt was result of identity theft 0.84 0.56 0.67 129\n",
1140
+ "\n",
1141
+ " accuracy 0.70 408\n",
1142
+ " macro avg 0.81 0.60 0.63 408\n",
1143
+ " weighted avg 0.76 0.70 0.68 408\n",
1144
+ "\n"
1145
+ ]
1146
+ }
1147
+ ],
1148
+ "source": [
1149
+ "# Initialize the RandomForestClassifier\n",
1150
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1151
+ "\n",
1152
+ "# Train the model using the updated training and validation datasets\n",
1153
+ "trained_model_debt_collection = train_model(debt_collection_train_df, debt_collection_val_df, 'Sub-issue', rf_classifier, random_state=42)\n"
1154
+ ]
1155
+ },
1156
+ {
1157
+ "cell_type": "code",
1158
+ "execution_count": null,
1159
+ "id": "9967357b-a3ec-44da-9dfb-a2034a673e8d",
1160
+ "metadata": {},
1161
+ "outputs": [],
1162
+ "source": [
1163
+ "with open('models/Debt_collection_model.pkl', 'wb') as f:\n",
1164
+ " pickle.dump(trained_model_debt_collection, f)"
1165
+ ]
1166
+ },
1167
+ {
1168
+ "cell_type": "markdown",
1169
+ "id": "00777cb3-8df2-4b27-8978-eeb008042f0f",
1170
+ "metadata": {
1171
+ "jp-MarkdownHeadingCollapsed": true
1172
+ },
1173
+ "source": [
1174
+ "#### Problem with a purchase shown on your statement"
1175
+ ]
1176
+ },
1177
+ {
1178
+ "cell_type": "code",
1179
+ "execution_count": 34,
1180
+ "id": "9400ec96-05e7-4458-bc19-0ef544709004",
1181
+ "metadata": {},
1182
+ "outputs": [
1183
+ {
1184
+ "name": "stdout",
1185
+ "output_type": "stream",
1186
+ "text": [
1187
+ "Accuracy: 0.7479338842975206\n",
1188
+ "\n",
1189
+ "Classification Report:\n",
1190
+ " precision recall f1-score support\n",
1191
+ "\n",
1192
+ " Card was charged for something you did not purchase with the card 0.76 0.19 0.30 70\n",
1193
+ "Credit card company isn't resolving a dispute about a purchase on your statement 0.75 0.98 0.85 172\n",
1194
+ "\n",
1195
+ " accuracy 0.75 242\n",
1196
+ " macro avg 0.76 0.58 0.57 242\n",
1197
+ " weighted avg 0.75 0.75 0.69 242\n",
1198
+ "\n"
1199
+ ]
1200
+ }
1201
+ ],
1202
+ "source": [
1203
+ "# Update the issue name in the function call to read_subissue_data\n",
1204
+ "purchase_problem_train_df, purchase_problem_val_df = read_subissue_data(\"Problem with a purchase shown on your statement\")\n",
1205
+ "\n",
1206
+ "# Displaying the first few rows of the training data\n",
1207
+ "purchase_problem_train_df.head()\n",
1208
+ "\n",
1209
+ "# Initialize the RandomForestClassifier\n",
1210
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1211
+ "\n",
1212
+ "# Train the model using the updated training and validation datasets\n",
1213
+ "trained_model_purchase_problem = train_model(purchase_problem_train_df, purchase_problem_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1214
+ "\n"
1215
+ ]
1216
+ },
1217
+ {
1218
+ "cell_type": "code",
1219
+ "execution_count": null,
1220
+ "id": "446b56ef-54c9-4975-a4ab-4982bf2585b8",
1221
+ "metadata": {},
1222
+ "outputs": [],
1223
+ "source": [
1224
+ "# Save the trained model to a file\n",
1225
+ "with open('models/Purchase_problem_model.pkl', 'wb') as f:\n",
1226
+ " pickle.dump(trained_model_purchase_problem, f)\n"
1227
+ ]
1228
+ },
1229
+ {
1230
+ "cell_type": "markdown",
1231
+ "id": "25526885-aabf-4257-b5c9-4e1c5133a96a",
1232
+ "metadata": {
1233
+ "jp-MarkdownHeadingCollapsed": true
1234
+ },
1235
+ "source": [
1236
+ "#### Account Operations and Unauthorized Transaction Issues"
1237
+ ]
1238
+ },
1239
+ {
1240
+ "cell_type": "code",
1241
+ "execution_count": 35,
1242
+ "id": "35916c05-e001-462c-91a2-aded09da6e6c",
1243
+ "metadata": {},
1244
+ "outputs": [
1245
+ {
1246
+ "name": "stdout",
1247
+ "output_type": "stream",
1248
+ "text": [
1249
+ "Accuracy: 0.8586956521739131\n",
1250
+ "\n",
1251
+ "Classification Report:\n",
1252
+ " precision recall f1-score support\n",
1253
+ "\n",
1254
+ " Account opened as a result of fraud 0.83 0.67 0.74 43\n",
1255
+ "Card opened as result of identity theft or fraud 0.88 0.77 0.82 39\n",
1256
+ " Transaction was not authorized 0.86 0.97 0.91 102\n",
1257
+ "\n",
1258
+ " accuracy 0.86 184\n",
1259
+ " macro avg 0.86 0.80 0.83 184\n",
1260
+ " weighted avg 0.86 0.86 0.85 184\n",
1261
+ "\n"
1262
+ ]
1263
+ }
1264
+ ],
1265
+ "source": [
1266
+ "# Update the issue name in the function call to read_subissue_data\n",
1267
+ "account_operations_train_df, account_operations_val_df = read_subissue_data(\"Account Operations and Unauthorized Transaction Issues\")\n",
1268
+ "\n",
1269
+ "# Displaying the first few rows of the training data\n",
1270
+ "account_operations_train_df.head()\n",
1271
+ "\n",
1272
+ "# Initialize the RandomForestClassifier\n",
1273
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1274
+ "\n",
1275
+ "# Train the model using the updated training and validation datasets\n",
1276
+ "trained_model_account_operations = train_model(account_operations_train_df, account_operations_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1277
+ "\n"
1278
+ ]
1279
+ },
1280
+ {
1281
+ "cell_type": "code",
1282
+ "execution_count": null,
1283
+ "id": "7b35fb47-ad1f-44e7-a952-c8e75118080f",
1284
+ "metadata": {},
1285
+ "outputs": [],
1286
+ "source": [
1287
+ "# Save the trained model to a file\n",
1288
+ "with open('models/Account_operations_model.pkl', 'wb') as f:\n",
1289
+ " pickle.dump(trained_model_account_operations, f)"
1290
+ ]
1291
+ },
1292
+ {
1293
+ "cell_type": "markdown",
1294
+ "id": "913129c1-9e06-407a-bc4b-1974f9f984bd",
1295
+ "metadata": {
1296
+ "jp-MarkdownHeadingCollapsed": true
1297
+ },
1298
+ "source": [
1299
+ "#### 'Payment and Funds Management'"
1300
+ ]
1301
+ },
1302
+ {
1303
+ "cell_type": "code",
1304
+ "execution_count": 36,
1305
+ "id": "e1575ee1-a8e8-4aa2-ab42-1bf88d2759de",
1306
+ "metadata": {},
1307
+ "outputs": [
1308
+ {
1309
+ "name": "stdout",
1310
+ "output_type": "stream",
1311
+ "text": [
1312
+ "Accuracy: 0.8728323699421965\n",
1313
+ "\n",
1314
+ "Classification Report:\n",
1315
+ " precision recall f1-score support\n",
1316
+ "\n",
1317
+ " Billing problem 1.00 0.65 0.79 34\n",
1318
+ " Overdrafts and overdraft fees 0.89 0.92 0.91 74\n",
1319
+ "Problem during payment process 0.81 0.94 0.87 65\n",
1320
+ "\n",
1321
+ " accuracy 0.87 173\n",
1322
+ " macro avg 0.90 0.83 0.85 173\n",
1323
+ " weighted avg 0.88 0.87 0.87 173\n",
1324
+ "\n"
1325
+ ]
1326
+ }
1327
+ ],
1328
+ "source": [
1329
+ "# Update the issue name in the function call to read_subissue_data\n",
1330
+ "payment_funds_train_df, payment_funds_val_df = read_subissue_data(\"Payment and Funds Management\")\n",
1331
+ "\n",
1332
+ "# Displaying the first few rows of the training data\n",
1333
+ "payment_funds_train_df.head()\n",
1334
+ "\n",
1335
+ "# Initialize the RandomForestClassifier\n",
1336
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1337
+ "\n",
1338
+ "# Train the model using the updated training and validation datasets\n",
1339
+ "trained_model_payment_funds = train_model(payment_funds_train_df, payment_funds_val_df, 'Sub-issue', rf_classifier, random_state=42)\n"
1340
+ ]
1341
+ },
1342
+ {
1343
+ "cell_type": "code",
1344
+ "execution_count": null,
1345
+ "id": "fd2b3201-b2d9-4943-af2c-b8813bb5379b",
1346
+ "metadata": {},
1347
+ "outputs": [],
1348
+ "source": [
1349
+ "# Save the trained model to a file\n",
1350
+ "with open('models/Payment_funds_model.pkl', 'wb') as f:\n",
1351
+ " pickle.dump(trained_model_payment_funds, f)\n"
1352
+ ]
1353
+ },
1354
+ {
1355
+ "cell_type": "markdown",
1356
+ "id": "621c0a53-5aca-4d17-bf86-e9b8b98f76e5",
1357
+ "metadata": {
1358
+ "jp-MarkdownHeadingCollapsed": true
1359
+ },
1360
+ "source": [
1361
+ "#### 'Written notification about debt'"
1362
+ ]
1363
+ },
1364
+ {
1365
+ "cell_type": "code",
1366
+ "execution_count": 37,
1367
+ "id": "ecdaaba3-1882-486e-82ee-ade1c0b83eb1",
1368
+ "metadata": {},
1369
+ "outputs": [
1370
+ {
1371
+ "name": "stdout",
1372
+ "output_type": "stream",
1373
+ "text": [
1374
+ "Accuracy: 0.7814207650273224\n",
1375
+ "\n",
1376
+ "Classification Report:\n",
1377
+ " precision recall f1-score support\n",
1378
+ "\n",
1379
+ "Didn't receive enough information to verify debt 0.77 0.99 0.87 135\n",
1380
+ " Didn't receive notice of right to dispute 0.90 0.19 0.31 48\n",
1381
+ "\n",
1382
+ " accuracy 0.78 183\n",
1383
+ " macro avg 0.84 0.59 0.59 183\n",
1384
+ " weighted avg 0.81 0.78 0.72 183\n",
1385
+ "\n"
1386
+ ]
1387
+ }
1388
+ ],
1389
+ "source": [
1390
+ "# Update the issue name in the function call to read_subissue_data\n",
1391
+ "notification_debt_train_df, notification_debt_val_df = read_subissue_data(\"Written notification about debt\")\n",
1392
+ "\n",
1393
+ "# Displaying the first few rows of the training data\n",
1394
+ "notification_debt_train_df.head()\n",
1395
+ "\n",
1396
+ "# Initialize the RandomForestClassifier\n",
1397
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1398
+ "\n",
1399
+ "# Train the model using the updated training and validation datasets\n",
1400
+ "trained_model_notification_debt = train_model(notification_debt_train_df, notification_debt_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1401
+ "\n"
1402
+ ]
1403
+ },
1404
+ {
1405
+ "cell_type": "code",
1406
+ "execution_count": null,
1407
+ "id": "68697a56-7bdf-4fbd-9d1d-e6c4dbcc7c74",
1408
+ "metadata": {},
1409
+ "outputs": [],
1410
+ "source": [
1411
+ "# Save the trained model to a file\n",
1412
+ "with open('models/Notification_debt_model.pkl', 'wb') as f:\n",
1413
+ " pickle.dump(trained_model_notification_debt, f)"
1414
+ ]
1415
+ },
1416
+ {
1417
+ "cell_type": "markdown",
1418
+ "id": "c31597eb-7601-4cfe-a779-f7de38e7e8cc",
1419
+ "metadata": {
1420
+ "jp-MarkdownHeadingCollapsed": true
1421
+ },
1422
+ "source": [
1423
+ "#### 'Dealing with your lender or servicer':"
1424
+ ]
1425
+ },
1426
+ {
1427
+ "cell_type": "code",
1428
+ "execution_count": 38,
1429
+ "id": "36511f84-069e-4d71-9089-a454f2707467",
1430
+ "metadata": {},
1431
+ "outputs": [
1432
+ {
1433
+ "name": "stdout",
1434
+ "output_type": "stream",
1435
+ "text": [
1436
+ "Accuracy: 0.7244897959183674\n",
1437
+ "\n",
1438
+ "Classification Report:\n",
1439
+ " precision recall f1-score support\n",
1440
+ "\n",
1441
+ " Received bad information about your loan 0.74 0.70 0.72 50\n",
1442
+ "Trouble with how payments are being handled 0.71 0.75 0.73 48\n",
1443
+ "\n",
1444
+ " accuracy 0.72 98\n",
1445
+ " macro avg 0.73 0.72 0.72 98\n",
1446
+ " weighted avg 0.73 0.72 0.72 98\n",
1447
+ "\n"
1448
+ ]
1449
+ }
1450
+ ],
1451
+ "source": [
1452
+ "# Update the issue name in the function call to read_subissue_data\n",
1453
+ "lender_servicer_train_df, lender_servicer_val_df = read_subissue_data(\"Dealing with your lender or servicer\")\n",
1454
+ "\n",
1455
+ "# Displaying the first few rows of the training data\n",
1456
+ "lender_servicer_train_df.head()\n",
1457
+ "\n",
1458
+ "# Initialize the RandomForestClassifier\n",
1459
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1460
+ "\n",
1461
+ "# Train the model using the updated training and validation datasets\n",
1462
+ "trained_model_lender_servicer = train_model(lender_servicer_train_df, lender_servicer_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1463
+ "\n"
1464
+ ]
1465
+ },
1466
+ {
1467
+ "cell_type": "code",
1468
+ "execution_count": null,
1469
+ "id": "9aee0547-5d02-4ff1-ba8d-858ddd6590a6",
1470
+ "metadata": {},
1471
+ "outputs": [],
1472
+ "source": [
1473
+ "\n",
1474
+ "# Save the trained model to a file\n",
1475
+ "with open('models/Lender_servicer_model.pkl', 'wb') as f:\n",
1476
+ " pickle.dump(trained_model_lender_servicer, f)"
1477
+ ]
1478
+ },
1479
+ {
1480
+ "cell_type": "markdown",
1481
+ "id": "ac9b6231-dff8-4490-a022-ac1519b77405",
1482
+ "metadata": {
1483
+ "jp-MarkdownHeadingCollapsed": true
1484
+ },
1485
+ "source": [
1486
+ "#### 'Disputes and Misrepresentations'"
1487
+ ]
1488
+ },
1489
+ {
1490
+ "cell_type": "code",
1491
+ "execution_count": 39,
1492
+ "id": "d60d9dd5-b1e7-44b9-9ad2-dd5ae5e4060f",
1493
+ "metadata": {},
1494
+ "outputs": [
1495
+ {
1496
+ "name": "stdout",
1497
+ "output_type": "stream",
1498
+ "text": [
1499
+ "Accuracy: 0.8418079096045198\n",
1500
+ "\n",
1501
+ "Classification Report:\n",
1502
+ " precision recall f1-score support\n",
1503
+ "\n",
1504
+ "Attempted to collect wrong amount 0.85 0.92 0.88 66\n",
1505
+ " Other problem 0.85 0.65 0.74 54\n",
1506
+ " Problem with fees 0.83 0.93 0.88 57\n",
1507
+ "\n",
1508
+ " accuracy 0.84 177\n",
1509
+ " macro avg 0.84 0.83 0.83 177\n",
1510
+ " weighted avg 0.84 0.84 0.84 177\n",
1511
+ "\n"
1512
+ ]
1513
+ }
1514
+ ],
1515
+ "source": [
1516
+ "# Update the issue name in the function call to read_subissue_data\n",
1517
+ "disputes_misrepresentations_train_df, disputes_misrepresentations_val_df = read_subissue_data(\"Disputes and Misrepresentations\")\n",
1518
+ "\n",
1519
+ "# Displaying the first few rows of the training data\n",
1520
+ "disputes_misrepresentations_train_df.head()\n",
1521
+ "\n",
1522
+ "# Initialize the RandomForestClassifier\n",
1523
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1524
+ "\n",
1525
+ "# Train the model using the updated training and validation datasets\n",
1526
+ "trained_model_disputes_misrepresentations = train_model(disputes_misrepresentations_train_df, disputes_misrepresentations_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1527
+ "\n"
1528
+ ]
1529
+ },
1530
+ {
1531
+ "cell_type": "code",
1532
+ "execution_count": null,
1533
+ "id": "8bc31a9a-2725-46cb-ad25-1e60721dc0b0",
1534
+ "metadata": {},
1535
+ "outputs": [],
1536
+ "source": [
1537
+ "\n",
1538
+ "# Save the trained model to a file\n",
1539
+ "with open('models/Disputes_misrepresentations_model.pkl', 'wb') as f:\n",
1540
+ " pickle.dump(trained_model_disputes_misrepresentations, f)"
1541
+ ]
1542
+ },
1543
+ {
1544
+ "cell_type": "markdown",
1545
+ "id": "83967347-b3ec-4aad-b87f-b06b8752e184",
1546
+ "metadata": {
1547
+ "jp-MarkdownHeadingCollapsed": true
1548
+ },
1549
+ "source": [
1550
+ "#### \"Problem with a company's investigation into an existing issue\""
1551
+ ]
1552
+ },
1553
+ {
1554
+ "cell_type": "code",
1555
+ "execution_count": 40,
1556
+ "id": "1fe01200-373a-444a-b684-06f6a36eb447",
1557
+ "metadata": {},
1558
+ "outputs": [
1559
+ {
1560
+ "name": "stdout",
1561
+ "output_type": "stream",
1562
+ "text": [
1563
+ "Accuracy: 0.5882352941176471\n",
1564
+ "\n",
1565
+ "Classification Report:\n",
1566
+ " precision recall f1-score support\n",
1567
+ "\n",
1568
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.00 0.00 0.00 3\n",
1569
+ " Investigation took more than 30 days 1.00 1.00 1.00 3\n",
1570
+ " Problem with personal statement of dispute 0.00 0.00 0.00 2\n",
1571
+ " Their investigation did not fix an error on your report 0.50 1.00 0.67 7\n",
1572
+ " Was not notified of investigation status or results 0.00 0.00 0.00 2\n",
1573
+ "\n",
1574
+ " accuracy 0.59 17\n",
1575
+ " macro avg 0.30 0.40 0.33 17\n",
1576
+ " weighted avg 0.38 0.59 0.45 17\n",
1577
+ "\n"
1578
+ ]
1579
+ },
1580
+ {
1581
+ "name": "stderr",
1582
+ "output_type": "stream",
1583
+ "text": [
1584
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
1585
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
1586
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
1587
+ " _warn_prf(average, modifier, msg_start, len(result))\n",
1588
+ "/Users/shivanimundle/opt/anaconda3/lib/python3.9/site-packages/sklearn/metrics/_classification.py:1344: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use `zero_division` parameter to control this behavior.\n",
1589
+ " _warn_prf(average, modifier, msg_start, len(result))\n"
1590
+ ]
1591
+ }
1592
+ ],
1593
+ "source": [
1594
+ "# Update the issue name in the function call to read_subissue_data\n",
1595
+ "investigation_issue_train_df, investigation_issue_val_df = read_subissue_data(\"Problem with a company's investigation into an existing issue\")\n",
1596
+ "\n",
1597
+ "# Displaying the first few rows of the training data\n",
1598
+ "investigation_issue_train_df.head()\n",
1599
+ "\n",
1600
+ "# Initialize the RandomForestClassifier\n",
1601
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1602
+ "\n",
1603
+ "# Train the model using the updated training and validation datasets\n",
1604
+ "trained_model_investigation_issue = train_model(investigation_issue_train_df, investigation_issue_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1605
+ "\n"
1606
+ ]
1607
+ },
1608
+ {
1609
+ "cell_type": "code",
1610
+ "execution_count": null,
1611
+ "id": "f7541d40-be19-4570-8863-11329cdcd6a2",
1612
+ "metadata": {},
1613
+ "outputs": [],
1614
+ "source": [
1615
+ "# Save the trained model to a file\n",
1616
+ "with open('models/Investigation_issue_model.pkl', 'wb') as f:\n",
1617
+ " pickle.dump(trained_model_investigation_issue, f)"
1618
+ ]
1619
+ },
1620
+ {
1621
+ "cell_type": "markdown",
1622
+ "id": "e4a5e9a1-1e04-4e6f-888b-3cb417d8a89f",
1623
+ "metadata": {
1624
+ "jp-MarkdownHeadingCollapsed": true
1625
+ },
1626
+ "source": [
1627
+ "#### 'Closing your account'"
1628
+ ]
1629
+ },
1630
+ {
1631
+ "cell_type": "code",
1632
+ "execution_count": 41,
1633
+ "id": "f1c81af1-7378-4d35-923b-1cdfb3e16b47",
1634
+ "metadata": {},
1635
+ "outputs": [
1636
+ {
1637
+ "name": "stdout",
1638
+ "output_type": "stream",
1639
+ "text": [
1640
+ "Accuracy: 0.7936507936507936\n",
1641
+ "\n",
1642
+ "Classification Report:\n",
1643
+ " precision recall f1-score support\n",
1644
+ "\n",
1645
+ " Can't close your account 1.00 0.24 0.38 17\n",
1646
+ "Company closed your account 0.78 1.00 0.88 46\n",
1647
+ "\n",
1648
+ " accuracy 0.79 63\n",
1649
+ " macro avg 0.89 0.62 0.63 63\n",
1650
+ " weighted avg 0.84 0.79 0.74 63\n",
1651
+ "\n"
1652
+ ]
1653
+ }
1654
+ ],
1655
+ "source": [
1656
+ "# Update the issue name in the function call to read_subissue_data\n",
1657
+ "closing_account_train_df, closing_account_val_df = read_subissue_data(\"Closing your account\")\n",
1658
+ "\n",
1659
+ "# Displaying the first few rows of the training data\n",
1660
+ "closing_account_train_df.head()\n",
1661
+ "\n",
1662
+ "# Initialize the RandomForestClassifier\n",
1663
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1664
+ "\n",
1665
+ "# Train the model using the updated training and validation datasets\n",
1666
+ "trained_model_closing_account = train_model(closing_account_train_df, closing_account_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1667
+ "\n"
1668
+ ]
1669
+ },
1670
+ {
1671
+ "cell_type": "code",
1672
+ "execution_count": null,
1673
+ "id": "da02d848-8a33-4694-a1e8-51cd16904374",
1674
+ "metadata": {},
1675
+ "outputs": [],
1676
+ "source": [
1677
+ "\n",
1678
+ "# Save the trained model to a file\n",
1679
+ "with open('models/Closing_account_model.pkl', 'wb') as f:\n",
1680
+ " pickle.dump(trained_model_closing_account, f)"
1681
+ ]
1682
+ },
1683
+ {
1684
+ "cell_type": "markdown",
1685
+ "id": "bf8e194c-18d3-4958-8a95-ace85b32bf0d",
1686
+ "metadata": {
1687
+ "jp-MarkdownHeadingCollapsed": true
1688
+ },
1689
+ "source": [
1690
+ "#### 'Credit Report and Monitoring Issues'"
1691
+ ]
1692
+ },
1693
+ {
1694
+ "cell_type": "code",
1695
+ "execution_count": 42,
1696
+ "id": "798c24ec-678c-48e5-a763-641f0f6b4da1",
1697
+ "metadata": {},
1698
+ "outputs": [
1699
+ {
1700
+ "name": "stdout",
1701
+ "output_type": "stream",
1702
+ "text": [
1703
+ "Accuracy: 0.9098360655737705\n",
1704
+ "\n",
1705
+ "Classification Report:\n",
1706
+ " precision recall f1-score support\n",
1707
+ "\n",
1708
+ " Other problem getting your report or credit score 0.89 0.99 0.94 82\n",
1709
+ "Problem canceling credit monitoring or identify theft protection service 0.97 0.75 0.85 40\n",
1710
+ "\n",
1711
+ " accuracy 0.91 122\n",
1712
+ " macro avg 0.93 0.87 0.89 122\n",
1713
+ " weighted avg 0.92 0.91 0.91 122\n",
1714
+ "\n"
1715
+ ]
1716
+ }
1717
+ ],
1718
+ "source": [
1719
+ "# Update the issue name in the function call to read_subissue_data\n",
1720
+ "credit_report_train_df, credit_report_val_df = read_subissue_data(\"Credit Report and Monitoring Issues\")\n",
1721
+ "\n",
1722
+ "# Displaying the first few rows of the training data\n",
1723
+ "credit_report_train_df.head()\n",
1724
+ "\n",
1725
+ "# Initialize the RandomForestClassifier\n",
1726
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1727
+ "\n",
1728
+ "# Train the model using the updated training and validation datasets\n",
1729
+ "trained_model_credit_report = train_model(credit_report_train_df, credit_report_val_df, 'Sub-issue', rf_classifier, random_state=42)\n"
1730
+ ]
1731
+ },
1732
+ {
1733
+ "cell_type": "code",
1734
+ "execution_count": null,
1735
+ "id": "2e49e772-1351-4c2c-905a-0f77b6169268",
1736
+ "metadata": {},
1737
+ "outputs": [],
1738
+ "source": [
1739
+ "\n",
1740
+ "# Save the trained model to a file\n",
1741
+ "with open('models/Credit_report_model.pkl', 'wb') as f:\n",
1742
+ " pickle.dump(trained_model_credit_report, f)\n"
1743
+ ]
1744
+ },
1745
+ {
1746
+ "cell_type": "markdown",
1747
+ "id": "d4384e07-0b29-4239-9404-cceaeece2a7c",
1748
+ "metadata": {
1749
+ "jp-MarkdownHeadingCollapsed": true
1750
+ },
1751
+ "source": [
1752
+ "#### 'Closing an account':"
1753
+ ]
1754
+ },
1755
+ {
1756
+ "cell_type": "code",
1757
+ "execution_count": 43,
1758
+ "id": "d7270a5a-4e07-4841-8f1a-600f01940f98",
1759
+ "metadata": {},
1760
+ "outputs": [
1761
+ {
1762
+ "name": "stdout",
1763
+ "output_type": "stream",
1764
+ "text": [
1765
+ "Accuracy: 0.5684931506849316\n",
1766
+ "\n",
1767
+ "Classification Report:\n",
1768
+ " precision recall f1-score support\n",
1769
+ "\n",
1770
+ " Can't close your account 1.00 0.04 0.07 27\n",
1771
+ " Company closed your account 0.57 0.83 0.67 69\n",
1772
+ "Funds not received from closed account 0.56 0.50 0.53 50\n",
1773
+ "\n",
1774
+ " accuracy 0.57 146\n",
1775
+ " macro avg 0.71 0.45 0.42 146\n",
1776
+ " weighted avg 0.64 0.57 0.51 146\n",
1777
+ "\n"
1778
+ ]
1779
+ }
1780
+ ],
1781
+ "source": [
1782
+ "# Update the issue name in the function call to read_subissue_data\n",
1783
+ "closing_account_train_df, closing_account_val_df = read_subissue_data(\"Closing an account\")\n",
1784
+ "\n",
1785
+ "# Displaying the first few rows of the training data\n",
1786
+ "closing_account_train_df.head()\n",
1787
+ "\n",
1788
+ "# Initialize the RandomForestClassifier\n",
1789
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1790
+ "\n",
1791
+ "# Train the model using the updated training and validation datasets\n",
1792
+ "trained_model_closing_account = train_model(closing_account_train_df, closing_account_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1793
+ "\n"
1794
+ ]
1795
+ },
1796
+ {
1797
+ "cell_type": "code",
1798
+ "execution_count": null,
1799
+ "id": "79c54f47-5fdd-4db4-a70d-ae7fe3068fdb",
1800
+ "metadata": {},
1801
+ "outputs": [],
1802
+ "source": [
1803
+ "\n",
1804
+ "# Save the trained model to a file\n",
1805
+ "with open('models/Closing_account_model.pkl', 'wb') as f:\n",
1806
+ " pickle.dump(trained_model_closing_account, f)"
1807
+ ]
1808
+ },
1809
+ {
1810
+ "cell_type": "markdown",
1811
+ "id": "157b5a71-5b58-4a2a-ae42-b5299660a422",
1812
+ "metadata": {
1813
+ "jp-MarkdownHeadingCollapsed": true
1814
+ },
1815
+ "source": [
1816
+ "#### 'Legal and Threat Actions':"
1817
+ ]
1818
+ },
1819
+ {
1820
+ "cell_type": "code",
1821
+ "execution_count": 44,
1822
+ "id": "8cf7f8ee-c4f1-4b71-901f-74e260e6c700",
1823
+ "metadata": {},
1824
+ "outputs": [
1825
+ {
1826
+ "name": "stdout",
1827
+ "output_type": "stream",
1828
+ "text": [
1829
+ "Accuracy: 1.0\n",
1830
+ "\n",
1831
+ "Classification Report:\n",
1832
+ " precision recall f1-score support\n",
1833
+ "\n",
1834
+ "Threatened or suggested your credit would be damaged 1.00 1.00 1.00 48\n",
1835
+ "\n",
1836
+ " accuracy 1.00 48\n",
1837
+ " macro avg 1.00 1.00 1.00 48\n",
1838
+ " weighted avg 1.00 1.00 1.00 48\n",
1839
+ "\n"
1840
+ ]
1841
+ }
1842
+ ],
1843
+ "source": [
1844
+ "# Update the issue name in the function call to read_subissue_data\n",
1845
+ "legal_threat_actions_train_df, legal_threat_actions_val_df = read_subissue_data(\"Legal and Threat Actions\")\n",
1846
+ "\n",
1847
+ "# Displaying the first few rows of the training data\n",
1848
+ "legal_threat_actions_train_df.head()\n",
1849
+ "\n",
1850
+ "# Initialize the RandomForestClassifier\n",
1851
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
1852
+ "\n",
1853
+ "# Train the model using the updated training and validation datasets\n",
1854
+ "trained_model_legal_threat_actions = train_model(legal_threat_actions_train_df, legal_threat_actions_val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
1855
+ "\n"
1856
+ ]
1857
+ },
1858
+ {
1859
+ "cell_type": "code",
1860
+ "execution_count": null,
1861
+ "id": "7e1bbe22-ced3-49f9-914e-b9ef713153cc",
1862
+ "metadata": {},
1863
+ "outputs": [],
1864
+ "source": [
1865
+ "# Save the trained model to a file\n",
1866
+ "with open('models/Legal_threat_actions_model.pkl', 'wb') as f:\n",
1867
+ " pickle.dump(trained_model_legal_threat_actions, f)\n"
1868
+ ]
1869
+ },
1870
+ {
1871
+ "cell_type": "code",
1872
+ "execution_count": null,
1873
+ "id": "0f7446a2-3e93-46fc-8710-cae1db734297",
1874
+ "metadata": {},
1875
+ "outputs": [],
1876
+ "source": []
1877
+ }
1878
+ ],
1879
+ "metadata": {
1880
+ "kernelspec": {
1881
+ "display_name": "Python 3 (ipykernel)",
1882
+ "language": "python",
1883
+ "name": "python3"
1884
+ },
1885
+ "language_info": {
1886
+ "codemirror_mode": {
1887
+ "name": "ipython",
1888
+ "version": 3
1889
+ },
1890
+ "file_extension": ".py",
1891
+ "mimetype": "text/x-python",
1892
+ "name": "python",
1893
+ "nbconvert_exporter": "python",
1894
+ "pygments_lexer": "ipython3",
1895
+ "version": "3.9.19"
1896
+ }
1897
+ },
1898
+ "nbformat": 4,
1899
+ "nbformat_minor": 5
1900
+ }
subproduct_prediction/.ipynb_checkpoints/Sub_Issues-modified-checkpoint.ipynb ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a751d479-1500-41e2-8c01-252e849dad05",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import warnings\n",
11
+ "warnings.filterwarnings(\"ignore\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt \n",
23
+ "from sklearn.model_selection import train_test_split\n",
24
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
25
+ "from sklearn.pipeline import make_pipeline\n",
26
+ "from sklearn.linear_model import LogisticRegression\n",
27
+ "from sklearn.naive_bayes import MultinomialNB\n",
28
+ "from sklearn.svm import SVC\n",
29
+ "from sklearn.ensemble import RandomForestClassifier\n",
30
+ "from sklearn.metrics import classification_report,accuracy_score\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.ensemble import RandomForestClassifier\n",
33
+ "from sklearn.preprocessing import OneHotEncoder\n",
34
+ "from sklearn.compose import ColumnTransformer\n",
35
+ "from sklearn.pipeline import Pipeline\n",
36
+ "from sklearn.pipeline import Pipeline\n",
37
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
38
+ "from sklearn.ensemble import RandomForestClassifier\n",
39
+ "from sklearn.model_selection import train_test_split\n",
40
+ "from sklearn.metrics import classification_report, accuracy_score\n",
41
+ "from sklearn.utils.class_weight import compute_class_weight\n",
42
+ "import pickle"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20",
48
+ "metadata": {},
49
+ "source": [
50
+ "## Sub Issues"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Model"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 29,
64
+ "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from sklearn.pipeline import Pipeline\n",
69
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
70
+ "from sklearn.metrics import accuracy_score, classification_report\n",
71
+ "from sklearn.utils.class_weight import compute_class_weight\n",
72
+ "\n",
73
+ "def train_model(training_df, validation_df, target_column, classifier_model, subissues_to_drop=None, random_state=42):\n",
74
+ "    \"\"\"Fit a TF-IDF + classifier pipeline on complaint narratives and print validation metrics.\n",
75
+ "\n",
76
+ "    Args:\n",
77
+ "        training_df, validation_df: frames with a 'Consumer complaint narrative' column and the target column.\n",
78
+ "        target_column: name of the label column (e.g. 'Sub-issue').\n",
79
+ "        classifier_model: estimator used as the final step of the pipeline.\n",
80
+ "        subissues_to_drop: optional labels whose rows are removed from both frames before fitting.\n",
81
+ "        random_state: kept for backward compatibility; unused here, so seed classifier_model instead.\n",
82
+ "\n",
83
+ "    Returns:\n",
84
+ "        The fitted Pipeline (TfidfVectorizer -> classifier_model).\n",
85
+ "\n",
86
+ "    Note: no class weights are applied. An earlier version computed 'balanced' weights but never\n",
87
+ "    passed them on; build classifier_model with class_weight='balanced' if re-weighting is wanted.\n",
88
+ "    \"\"\"\n",
89
+ "    text_column = 'Consumer complaint narrative'\n",
90
+ "\n",
91
+ "    # Drop the excluded sub-issues from both splits\n",
92
+ "    if subissues_to_drop:\n",
93
+ "        training_df = training_df[~training_df[target_column].isin(subissues_to_drop)]\n",
94
+ "        validation_df = validation_df[~validation_df[target_column].isin(subissues_to_drop)]\n",
95
+ "\n",
96
+ "    # Define the pipeline\n",
97
+ "    pipeline = Pipeline([\n",
98
+ "        ('tfidf', TfidfVectorizer()),\n",
99
+ "        ('classifier', classifier_model)\n",
100
+ "    ])\n",
101
+ "\n",
102
+ "    # Train on the narratives, then predict the validation narratives\n",
103
+ "    pipeline.fit(training_df[text_column], training_df[target_column])\n",
104
+ "    y_pred = pipeline.predict(validation_df[text_column])\n",
105
+ "\n",
106
+ "    # Evaluate the pipeline\n",
107
+ "    accuracy = accuracy_score(validation_df[target_column], y_pred)\n",
108
+ "    print(\"\\nClassification Report:\")\n",
109
+ "    print(classification_report(validation_df[target_column], y_pred))\n",
110
+ "    print(\"Accuracy:\", accuracy)\n",
111
+ "    return pipeline"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3",
117
+ "metadata": {},
118
+ "source": [
119
+ "#### Reading the Issue DataFrame"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 30,
125
+ "id": "c1ea3fbc-4062-483b-a5c6-65d644983ce5",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "import os\n",
130
+ "import pandas as pd\n",
131
+ "\n",
132
+ "def read_subissue_data(issue_name, data_dir='../data_preprocessing_scripts/issue_data_splits'):\n",
133
+ "    \"\"\"Return (train_df, val_df) read from the per-issue CSV splits stored in data_dir.\"\"\"\n",
134
+ "    # File stem: the issue name lower-cased, with '/' and spaces swapped for underscores\n",
135
+ "    slug = issue_name.lower().replace(' ', '_').replace('/', '_')\n",
136
+ "\n",
137
+ "    # Locate the two split files for this issue\n",
138
+ "    train_path = os.path.join(data_dir, f\"{slug}_train_data.csv\")\n",
139
+ "    val_path = os.path.join(data_dir, f\"{slug}_val_data.csv\")\n",
140
+ "\n",
141
+ "    # Load the training split first, then the validation split\n",
142
+ "    training_frame = pd.read_csv(train_path)\n",
143
+ "    validation_frame = pd.read_csv(val_path)\n",
144
+ "    return training_frame, validation_frame"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 31,
150
+ "id": "ae74f945-3fe9-4207-8fe0-fb4d8c5d2a27",
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "df = pd.read_csv(\"../data_splits/train-data-split.csv\")\n",
155
+ "issue_categories = list(df['Issue'].unique())  # labels in first-appearance order; later cells pick issues by index\n",
156
+ "\n",
157
+ "def classify_sub_issue(issue):\n",
158
+ "    \"\"\"Train the sub-issue classifier for `issue`, print its validation report and pickle it to issue_models/.\"\"\"\n",
159
+ "    issue_name = issue.replace('/', '_').replace(' ', '_').lower()  # same slug read_subissue_data derives\n",
160
+ "    train_df, val_df = read_subissue_data(issue)\n",
161
+ "    rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
162
+ "    trained_model = train_model(train_df, val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
163
+ "    # Saving the model (the issue_models/ folder must already exist)\n",
164
+ "    with open(f\"issue_models/{issue_name}.pkl\", 'wb') as f:\n",
165
+ "        pickle.dump(trained_model, f)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "markdown",
170
+ "id": "0540f68f-4e14-40c2-ba9e-1875138678a1",
171
+ "metadata": {},
172
+ "source": [
173
+ "### Sub-issues classification"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "7a53f046-c7f8-48de-a8f3-9a66ffad5f55",
179
+ "metadata": {},
180
+ "source": [
181
+ "#### 1. Problem with a company's investigation into an existing problem"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 32,
187
+ "id": "a33a3974-b3e9-466c-85a9-8d9b0255bbba",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "Issue : Problem with a company's investigation into an existing problem\n",
195
+ "\n",
196
+ "\n",
197
+ "Classification Report:\n",
198
+ " precision recall f1-score support\n",
199
+ "\n",
200
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.88 0.37 0.52 41\n",
201
+ " Investigation took more than 30 days 0.95 0.73 0.83 162\n",
202
+ " Problem with personal statement of dispute 0.90 0.53 0.67 53\n",
203
+ " Their investigation did not fix an error on your report 0.91 1.00 0.95 1122\n",
204
+ " Was not notified of investigation status or results 0.98 0.87 0.92 209\n",
205
+ "\n",
206
+ " accuracy 0.92 1587\n",
207
+ " macro avg 0.93 0.70 0.78 1587\n",
208
+ " weighted avg 0.92 0.92 0.91 1587\n",
209
+ "\n",
210
+ "Accuracy: 0.9199747952110902\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "issue_name = issue_categories[0]\n",
216
+ "print(f\"Issue : {issue_name}\\n\")\n",
217
+ "\n",
218
+ "classify_sub_issue(issue_name)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "markdown",
223
+ "id": "4ffa280b-614f-48b2-9870-70fb053b45b6",
224
+ "metadata": {},
225
+ "source": [
226
+ "#### 2. Incorrect information on your report"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 34,
232
+ "id": "3d431635-227e-4873-b017-8cb4180a6e2e",
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "Issue : Incorrect information on your report\n",
240
+ "\n",
241
+ "\n",
242
+ "Classification Report:\n",
243
+ " precision recall f1-score support\n",
244
+ "\n",
245
+ " Account information incorrect 0.74 0.68 0.71 699\n",
246
+ " Account status incorrect 0.87 0.73 0.79 771\n",
247
+ " Information belongs to someone else 0.90 0.99 0.94 4337\n",
248
+ "Information is missing that should be on the report 0.95 0.31 0.47 65\n",
249
+ " Old information reappears or never goes away 0.93 0.40 0.56 126\n",
250
+ " Personal information incorrect 0.95 0.78 0.86 440\n",
251
+ " Public record information inaccurate 0.98 0.47 0.64 102\n",
252
+ "\n",
253
+ " accuracy 0.88 6540\n",
254
+ " macro avg 0.90 0.62 0.71 6540\n",
255
+ " weighted avg 0.88 0.88 0.88 6540\n",
256
+ "\n",
257
+ "Accuracy: 0.8831804281345565\n"
258
+ ]
259
+ }
260
+ ],
261
+ "source": [
262
+ "issue_name = issue_categories[1]\n",
263
+ "print(f\"Issue : {issue_name}\\n\")\n",
264
+ "\n",
265
+ "classify_sub_issue(issue_name)"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "markdown",
270
+ "id": "f5cb1853-9bc1-4541-9dac-5cb208abcfc5",
271
+ "metadata": {},
272
+ "source": [
273
+ "#### 3. Problem with a credit reporting company's investigation into an existing problem"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 35,
279
+ "id": "86f04fd6-7625-4aba-9094-f7025078d1fc",
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ "Issue : Problem with a credit reporting company's investigation into an existing problem\n",
287
+ "\n",
288
+ "\n",
289
+ "Classification Report:\n",
290
+ " precision recall f1-score support\n",
291
+ "\n",
292
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.83 0.36 0.50 83\n",
293
+ " Investigation took more than 30 days 0.97 0.84 0.90 505\n",
294
+ " Problem with personal statement of dispute 1.00 0.38 0.55 47\n",
295
+ " Their investigation did not fix an error on your report 0.92 0.99 0.95 2277\n",
296
+ " Was not notified of investigation status or results 0.96 0.88 0.92 473\n",
297
+ "\n",
298
+ " accuracy 0.93 3385\n",
299
+ " macro avg 0.94 0.69 0.77 3385\n",
300
+ " weighted avg 0.93 0.93 0.92 3385\n",
301
+ "\n",
302
+ "Accuracy: 0.9288035450516987\n"
303
+ ]
304
+ }
305
+ ],
306
+ "source": [
307
+ "issue_name = issue_categories[2]\n",
308
+ "print(f\"Issue : {issue_name}\\n\")\n",
309
+ "\n",
310
+ "classify_sub_issue(issue_name)"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "markdown",
315
+ "id": "f00b115b-46c4-4d46-adae-a10a5e92a839",
316
+ "metadata": {},
317
+ "source": [
318
+ "#### 4. Problem with a purchase shown on your statement"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 36,
324
+ "id": "e6577c57-6caa-4221-a68b-e0b65e739511",
325
+ "metadata": {},
326
+ "outputs": [
327
+ {
328
+ "name": "stdout",
329
+ "output_type": "stream",
330
+ "text": [
331
+ "Issue : Problem with a purchase shown on your statement\n",
332
+ "\n",
333
+ "\n",
334
+ "Classification Report:\n",
335
+ " precision recall f1-score support\n",
336
+ "\n",
337
+ " Card was charged for something you did not purchase with the card 0.81 0.19 0.30 70\n",
338
+ "Credit card company isn't resolving a dispute about a purchase on your statement 0.75 0.98 0.85 172\n",
339
+ "\n",
340
+ " accuracy 0.75 242\n",
341
+ " macro avg 0.78 0.58 0.58 242\n",
342
+ " weighted avg 0.77 0.75 0.69 242\n",
343
+ "\n",
344
+ "Accuracy: 0.7520661157024794\n"
345
+ ]
346
+ }
347
+ ],
348
+ "source": [
349
+ "issue_name = issue_categories[3]\n",
350
+ "print(f\"Issue : {issue_name}\\n\")\n",
351
+ "\n",
352
+ "classify_sub_issue(issue_name)"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "markdown",
357
+ "id": "a8648f75-e62d-4b80-b4ed-ccf104137c74",
358
+ "metadata": {},
359
+ "source": [
360
+ "#### 5. Improper use of your report"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 37,
366
+ "id": "ea64cabb-1372-4a52-826f-8b1bf8f2cb32",
367
+ "metadata": {},
368
+ "outputs": [
369
+ {
370
+ "name": "stdout",
371
+ "output_type": "stream",
372
+ "text": [
373
+ "Issue : Improper use of your report\n",
374
+ "\n",
375
+ "\n",
376
+ "Classification Report:\n",
377
+ " precision recall f1-score support\n",
378
+ "\n",
379
+ "Credit inquiries on your report that you don't recognize 0.93 0.84 0.88 990\n",
380
+ " Reporting company used your report improperly 0.96 0.98 0.97 3654\n",
381
+ "\n",
382
+ " accuracy 0.95 4644\n",
383
+ " macro avg 0.95 0.91 0.93 4644\n",
384
+ " weighted avg 0.95 0.95 0.95 4644\n",
385
+ "\n",
386
+ "Accuracy: 0.9528423772609819\n"
387
+ ]
388
+ }
389
+ ],
390
+ "source": [
391
+ "issue_name = issue_categories[4]\n",
392
+ "print(f\"Issue : {issue_name}\\n\")\n",
393
+ "\n",
394
+ "classify_sub_issue(issue_name)"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "markdown",
399
+ "id": "f48f3308-d884-440c-8a24-8a81e7140ee0",
400
+ "metadata": {},
401
+ "source": [
402
+ "#### 6. Account Operations and Unauthorized Transaction Issues"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": 38,
408
+ "id": "08ec2d0e-950e-4f6d-9cdb-8328fed17384",
409
+ "metadata": {},
410
+ "outputs": [
411
+ {
412
+ "name": "stdout",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "Issue : Account Operations and Unauthorized Transaction Issues\n",
416
+ "\n",
417
+ "\n",
418
+ "Classification Report:\n",
419
+ " precision recall f1-score support\n",
420
+ "\n",
421
+ " Account opened as a result of fraud 0.83 0.67 0.74 43\n",
422
+ "Card opened as result of identity theft or fraud 0.88 0.77 0.82 39\n",
423
+ " Transaction was not authorized 0.86 0.97 0.91 102\n",
424
+ "\n",
425
+ " accuracy 0.86 184\n",
426
+ " macro avg 0.86 0.80 0.83 184\n",
427
+ " weighted avg 0.86 0.86 0.85 184\n",
428
+ "\n",
429
+ "Accuracy: 0.8586956521739131\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "issue_name = issue_categories[5]\n",
435
+ "print(f\"Issue : {issue_name}\\n\")\n",
436
+ "\n",
437
+ "classify_sub_issue(issue_name)"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "id": "7c7332c0-3cc9-42b6-9bbd-5b33719e676d",
443
+ "metadata": {},
444
+ "source": [
445
+ "#### 7. Payment and Funds Management"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 39,
451
+ "id": "bf0e0437-a85d-4dcd-8b93-982fbd33cee6",
452
+ "metadata": {},
453
+ "outputs": [
454
+ {
455
+ "name": "stdout",
456
+ "output_type": "stream",
457
+ "text": [
458
+ "Issue : Payment and Funds Management\n",
459
+ "\n",
460
+ "\n",
461
+ "Classification Report:\n",
462
+ " precision recall f1-score support\n",
463
+ "\n",
464
+ " Billing problem 1.00 0.65 0.79 34\n",
465
+ " Overdrafts and overdraft fees 0.89 0.92 0.91 74\n",
466
+ "Problem during payment process 0.81 0.94 0.87 65\n",
467
+ "\n",
468
+ " accuracy 0.87 173\n",
469
+ " macro avg 0.90 0.83 0.85 173\n",
470
+ " weighted avg 0.88 0.87 0.87 173\n",
471
+ "\n",
472
+ "Accuracy: 0.8728323699421965\n"
473
+ ]
474
+ }
475
+ ],
476
+ "source": [
477
+ "issue_name = issue_categories[6]\n",
478
+ "print(f\"Issue : {issue_name}\\n\")\n",
479
+ "\n",
480
+ "classify_sub_issue(issue_name)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "markdown",
485
+ "id": "b034a174-16e7-41b6-970c-ef23d9b9da29",
486
+ "metadata": {},
487
+ "source": [
488
+ "#### 8. Managing an account"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 40,
494
+ "id": "bc62e5f5-14ef-4d8a-8434-79b4e7da5a9a",
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "Issue : Managing an account\n",
502
+ "\n",
503
+ "\n",
504
+ "Classification Report:\n",
505
+ " precision recall f1-score support\n",
506
+ "\n",
507
+ " Banking errors 0.50 0.10 0.16 73\n",
508
+ " Deposits and withdrawals 0.46 0.90 0.61 201\n",
509
+ " Fee problem 0.55 0.57 0.56 56\n",
510
+ "Funds not handled or disbursed as instructed 0.00 0.00 0.00 72\n",
511
+ " Problem accessing account 0.00 0.00 0.00 40\n",
512
+ " Problem using a debit or ATM card 0.71 0.58 0.64 113\n",
513
+ "\n",
514
+ " accuracy 0.52 555\n",
515
+ " macro avg 0.37 0.36 0.33 555\n",
516
+ " weighted avg 0.43 0.52 0.43 555\n",
517
+ "\n",
518
+ "Accuracy: 0.5153153153153153\n"
519
+ ]
520
+ }
521
+ ],
522
+ "source": [
523
+ "issue_name = issue_categories[7]\n",
524
+ "print(f\"Issue : {issue_name}\\n\")\n",
525
+ "\n",
526
+ "classify_sub_issue(issue_name)"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "markdown",
531
+ "id": "6c2e3454-eaa2-4a71-a058-988ad7716eac",
532
+ "metadata": {},
533
+ "source": [
534
+ "#### 9. Attempts to collect debt not owed"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": 41,
540
+ "id": "85ad1ffc-97e5-436b-afea-abed93b67b75",
541
+ "metadata": {},
542
+ "outputs": [
543
+ {
544
+ "name": "stdout",
545
+ "output_type": "stream",
546
+ "text": [
547
+ "Issue : Attempts to collect debt not owed\n",
548
+ "\n",
549
+ "\n",
550
+ "Classification Report:\n",
551
+ " precision recall f1-score support\n",
552
+ "\n",
553
+ " Debt is not yours 0.64 0.93 0.76 207\n",
554
+ " Debt was paid 0.96 0.31 0.46 72\n",
555
+ "Debt was result of identity theft 0.84 0.56 0.67 129\n",
556
+ "\n",
557
+ " accuracy 0.70 408\n",
558
+ " macro avg 0.81 0.60 0.63 408\n",
559
+ " weighted avg 0.76 0.70 0.68 408\n",
560
+ "\n",
561
+ "Accuracy: 0.7009803921568627\n"
562
+ ]
563
+ }
564
+ ],
565
+ "source": [
566
+ "issue_name = issue_categories[8]\n",
567
+ "print(f\"Issue : {issue_name}\\n\")\n",
568
+ "\n",
569
+ "classify_sub_issue(issue_name)"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "markdown",
574
+ "id": "43b186f0-b626-43c2-9823-6818da478d48",
575
+ "metadata": {},
576
+ "source": [
577
+ "-----"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "markdown",
582
+ "id": "8d87e677-da08-4682-9823-72c8315e52a2",
583
+ "metadata": {},
584
+ "source": [
585
+ "#### 10. Written notification about debt"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 42,
591
+ "id": "214fc01d-7bf1-4b5a-b409-10b3c99076ae",
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "name": "stdout",
596
+ "output_type": "stream",
597
+ "text": [
598
+ "Issue : Written notification about debt\n",
599
+ "\n",
600
+ "\n",
601
+ "Classification Report:\n",
602
+ " precision recall f1-score support\n",
603
+ "\n",
604
+ "Didn't receive enough information to verify debt 0.77 0.99 0.87 135\n",
605
+ " Didn't receive notice of right to dispute 0.90 0.19 0.31 48\n",
606
+ "\n",
607
+ " accuracy 0.78 183\n",
608
+ " macro avg 0.84 0.59 0.59 183\n",
609
+ " weighted avg 0.81 0.78 0.72 183\n",
610
+ "\n",
611
+ "Accuracy: 0.7814207650273224\n"
612
+ ]
613
+ }
614
+ ],
615
+ "source": [
616
+ "issue_name = issue_categories[9]\n",
617
+ "print(f\"Issue : {issue_name}\\n\")\n",
618
+ "\n",
619
+ "classify_sub_issue(issue_name)"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "markdown",
624
+ "id": "7cca2ba7-f0e1-4e56-a6f0-2a3c92bcac56",
625
+ "metadata": {},
626
+ "source": [
627
+ "----"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "markdown",
632
+ "id": "401e87db-4759-437c-bcb1-382a7f8ed226",
633
+ "metadata": {},
634
+ "source": [
635
+ "#### 11. Dealing with your lender or servicer"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "code",
640
+ "execution_count": 43,
641
+ "id": "9c1485fc-1b14-44c9-b4c9-d92bea864800",
642
+ "metadata": {},
643
+ "outputs": [
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ "Issue : Dealing with your lender or servicer\n",
649
+ "\n",
650
+ "\n",
651
+ "Classification Report:\n",
652
+ " precision recall f1-score support\n",
653
+ "\n",
654
+ " Received bad information about your loan 0.74 0.70 0.72 50\n",
655
+ "Trouble with how payments are being handled 0.71 0.75 0.73 48\n",
656
+ "\n",
657
+ " accuracy 0.72 98\n",
658
+ " macro avg 0.73 0.72 0.72 98\n",
659
+ " weighted avg 0.73 0.72 0.72 98\n",
660
+ "\n",
661
+ "Accuracy: 0.7244897959183674\n"
662
+ ]
663
+ }
664
+ ],
665
+ "source": [
666
+ "issue_name = issue_categories[10]\n",
667
+ "print(f\"Issue : {issue_name}\\n\")\n",
668
+ "\n",
669
+ "classify_sub_issue(issue_name)"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "markdown",
674
+ "id": "8ca1aab7-158f-48bf-871c-1fa991fb1f9e",
675
+ "metadata": {},
676
+ "source": [
677
+ "----"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "markdown",
682
+ "id": "36ce1724-61e5-4d5b-bbaf-a79293af6506",
683
+ "metadata": {},
684
+ "source": [
685
+ "#### 12. Disputes and Misrepresentations"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 44,
691
+ "id": "380ee173-6c72-40b8-9eb2-a5af680c8ff7",
692
+ "metadata": {},
693
+ "outputs": [
694
+ {
695
+ "name": "stdout",
696
+ "output_type": "stream",
697
+ "text": [
698
+ "Issue : Disputes and Misrepresentations\n",
699
+ "\n",
700
+ "\n",
701
+ "Classification Report:\n",
702
+ " precision recall f1-score support\n",
703
+ "\n",
704
+ "Attempted to collect wrong amount 0.85 0.92 0.88 66\n",
705
+ " Other problem 0.85 0.65 0.74 54\n",
706
+ " Problem with fees 0.83 0.93 0.88 57\n",
707
+ "\n",
708
+ " accuracy 0.84 177\n",
709
+ " macro avg 0.84 0.83 0.83 177\n",
710
+ " weighted avg 0.84 0.84 0.84 177\n",
711
+ "\n",
712
+ "Accuracy: 0.8418079096045198\n"
713
+ ]
714
+ }
715
+ ],
716
+ "source": [
717
+ "issue_name = issue_categories[11]\n",
718
+ "print(f\"Issue : {issue_name}\\n\")\n",
719
+ "\n",
720
+ "classify_sub_issue(issue_name)"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "markdown",
725
+ "id": "e44501a4-2021-4d78-b3c2-c937d286cb22",
726
+ "metadata": {},
727
+ "source": [
728
+ "----"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "markdown",
733
+ "id": "451ccf3a-c97e-46e3-9c47-c225d6e3dd49",
734
+ "metadata": {},
735
+ "source": [
736
+ "#### 13. Problem with a company's investigation into an existing issue"
737
+ ]
738
+ },
739
+ {
740
+ "cell_type": "code",
741
+ "execution_count": 45,
742
+ "id": "20201d0c-b9da-4e2e-957b-23649f06e48e",
743
+ "metadata": {},
744
+ "outputs": [
745
+ {
746
+ "name": "stdout",
747
+ "output_type": "stream",
748
+ "text": [
749
+ "Issue : Problem with a company's investigation into an existing issue\n",
750
+ "\n",
751
+ "\n",
752
+ "Classification Report:\n",
753
+ " precision recall f1-score support\n",
754
+ "\n",
755
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.00 0.00 0.00 3\n",
756
+ " Investigation took more than 30 days 1.00 1.00 1.00 3\n",
757
+ " Problem with personal statement of dispute 0.00 0.00 0.00 2\n",
758
+ " Their investigation did not fix an error on your report 0.50 1.00 0.67 7\n",
759
+ " Was not notified of investigation status or results 0.00 0.00 0.00 2\n",
760
+ "\n",
761
+ " accuracy 0.59 17\n",
762
+ " macro avg 0.30 0.40 0.33 17\n",
763
+ " weighted avg 0.38 0.59 0.45 17\n",
764
+ "\n",
765
+ "Accuracy: 0.5882352941176471\n"
766
+ ]
767
+ }
768
+ ],
769
+ "source": [
770
+ "issue_name = issue_categories[12]\n",
771
+ "print(f\"Issue : {issue_name}\\n\")\n",
772
+ "\n",
773
+ "classify_sub_issue(issue_name)"
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "markdown",
778
+ "id": "c5d37ff8-2382-4c3b-aef0-5affd4d3083b",
779
+ "metadata": {},
780
+ "source": [
781
+ "----"
782
+ ]
783
+ },
784
+ {
785
+ "cell_type": "markdown",
786
+ "id": "c9876639-9e72-49ab-9dd4-3ef5ac38a8d8",
787
+ "metadata": {},
788
+ "source": [
789
+ "#### 14. Closing your account"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 46,
795
+ "id": "95eff365-09f8-4640-9f65-4a82fc321fa9",
796
+ "metadata": {},
797
+ "outputs": [
798
+ {
799
+ "name": "stdout",
800
+ "output_type": "stream",
801
+ "text": [
802
+ "Issue : Closing your account\n",
803
+ "\n",
804
+ "\n",
805
+ "Classification Report:\n",
806
+ " precision recall f1-score support\n",
807
+ "\n",
808
+ " Can't close your account 1.00 0.24 0.38 17\n",
809
+ "Company closed your account 0.78 1.00 0.88 46\n",
810
+ "\n",
811
+ " accuracy 0.79 63\n",
812
+ " macro avg 0.89 0.62 0.63 63\n",
813
+ " weighted avg 0.84 0.79 0.74 63\n",
814
+ "\n",
815
+ "Accuracy: 0.7936507936507936\n"
816
+ ]
817
+ }
818
+ ],
819
+ "source": [
820
+ "issue_name = issue_categories[13]\n",
821
+ "print(f\"Issue : {issue_name}\\n\")\n",
822
+ "\n",
823
+ "classify_sub_issue(issue_name)"
824
+ ]
825
+ },
826
+ {
827
+ "cell_type": "markdown",
828
+ "id": "c66b9044-32af-4aee-af08-b685480d9f53",
829
+ "metadata": {},
830
+ "source": [
831
+ "----"
832
+ ]
833
+ },
834
+ {
835
+ "cell_type": "markdown",
836
+ "id": "455f8d69-5531-42e0-a53c-66427ff68fcc",
837
+ "metadata": {},
838
+ "source": [
839
+ "#### 15. Credit Report and Monitoring Issues"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": 47,
845
+ "id": "a039cb86-3503-4757-a8ee-7e518eafb9a5",
846
+ "metadata": {},
847
+ "outputs": [
848
+ {
849
+ "name": "stdout",
850
+ "output_type": "stream",
851
+ "text": [
852
+ "Issue : Credit Report and Monitoring Issues\n",
853
+ "\n",
854
+ "\n",
855
+ "Classification Report:\n",
856
+ " precision recall f1-score support\n",
857
+ "\n",
858
+ " Other problem getting your report or credit score 0.89 0.99 0.94 82\n",
859
+ "Problem canceling credit monitoring or identify theft protection service 0.97 0.75 0.85 40\n",
860
+ "\n",
861
+ " accuracy 0.91 122\n",
862
+ " macro avg 0.93 0.87 0.89 122\n",
863
+ " weighted avg 0.92 0.91 0.91 122\n",
864
+ "\n",
865
+ "Accuracy: 0.9098360655737705\n"
866
+ ]
867
+ }
868
+ ],
869
+ "source": [
870
+ "issue_name = issue_categories[14]\n",
871
+ "print(f\"Issue : {issue_name}\\n\")\n",
872
+ "\n",
873
+ "classify_sub_issue(issue_name)"
874
+ ]
875
+ },
876
+ {
877
+ "cell_type": "markdown",
878
+ "id": "ee0dfc45-96b2-4cbb-b34d-a8e1441c0c82",
879
+ "metadata": {},
880
+ "source": [
881
+ "----"
882
+ ]
883
+ },
884
+ {
885
+ "cell_type": "markdown",
886
+ "id": "0dcf3701-d59f-43fa-9aa0-2c65c27a8fe0",
887
+ "metadata": {},
888
+ "source": [
889
+ "#### 16. Closing an account"
890
+ ]
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "execution_count": 48,
895
+ "id": "1ed7956b-3d41-46f8-a7e8-ad9f36e1694d",
896
+ "metadata": {},
897
+ "outputs": [
898
+ {
899
+ "name": "stdout",
900
+ "output_type": "stream",
901
+ "text": [
902
+ "Issue : Closing an account\n",
903
+ "\n",
904
+ "\n",
905
+ "Classification Report:\n",
906
+ " precision recall f1-score support\n",
907
+ "\n",
908
+ " Can't close your account 1.00 0.04 0.07 27\n",
909
+ " Company closed your account 0.57 0.83 0.67 69\n",
910
+ "Funds not received from closed account 0.56 0.50 0.53 50\n",
911
+ "\n",
912
+ " accuracy 0.57 146\n",
913
+ " macro avg 0.71 0.45 0.42 146\n",
914
+ " weighted avg 0.64 0.57 0.51 146\n",
915
+ "\n",
916
+ "Accuracy: 0.5684931506849316\n"
917
+ ]
918
+ }
919
+ ],
920
+ "source": [
921
+ "issue_name = issue_categories[15]\n",
922
+ "print(f\"Issue : {issue_name}\\n\")\n",
923
+ "\n",
924
+ "classify_sub_issue(issue_name)"
925
+ ]
926
+ },
927
+ {
928
+ "cell_type": "markdown",
929
+ "id": "3822541c-f13c-4a96-862f-4c23cf2d3895",
930
+ "metadata": {},
931
+ "source": [
932
+ "#### 17. Legal and Threat Actions"
933
+ ]
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "execution_count": 49,
938
+ "id": "8fa5fc40-6d4f-4321-8eb0-9608dc5b84e2",
939
+ "metadata": {},
940
+ "outputs": [
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "Issue : Legal and Threat Actions\n",
946
+ "\n",
947
+ "\n",
948
+ "Classification Report:\n",
949
+ " precision recall f1-score support\n",
950
+ "\n",
951
+ "Threatened or suggested your credit would be damaged 1.00 1.00 1.00 48\n",
952
+ "\n",
953
+ " accuracy 1.00 48\n",
954
+ " macro avg 1.00 1.00 1.00 48\n",
955
+ " weighted avg 1.00 1.00 1.00 48\n",
956
+ "\n",
957
+ "Accuracy: 1.0\n"
958
+ ]
959
+ }
960
+ ],
961
+ "source": [
962
+ "issue_name = issue_categories[16]\n",
963
+ "print(f\"Issue : {issue_name}\\n\")\n",
964
+ "\n",
965
+ "classify_sub_issue(issue_name)"
966
+ ]
967
+ }
968
+ ],
969
+ "metadata": {
970
+ "kernelspec": {
971
+ "display_name": "Python 3 (ipykernel)",
972
+ "language": "python",
973
+ "name": "python3"
974
+ },
975
+ "language_info": {
976
+ "codemirror_mode": {
977
+ "name": "ipython",
978
+ "version": 3
979
+ },
980
+ "file_extension": ".py",
981
+ "mimetype": "text/x-python",
982
+ "name": "python",
983
+ "nbconvert_exporter": "python",
984
+ "pygments_lexer": "ipython3",
985
+ "version": "3.9.19"
986
+ }
987
+ },
988
+ "nbformat": 4,
989
+ "nbformat_minor": 5
990
+ }
subproduct_prediction/Pipeline.ipynb ADDED
The diff for this file is too large to render. See raw diff
 
subproduct_prediction/Sub_Issue.ipynb ADDED
@@ -0,0 +1,990 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a751d479-1500-41e2-8c01-252e849dad05",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import warnings\n",
11
+ "warnings.filterwarnings(\"ignore\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt \n",
23
+ "from sklearn.model_selection import train_test_split\n",
24
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
25
+ "from sklearn.pipeline import make_pipeline\n",
26
+ "from sklearn.linear_model import LogisticRegression\n",
27
+ "from sklearn.naive_bayes import MultinomialNB\n",
28
+ "from sklearn.svm import SVC\n",
29
+ "from sklearn.ensemble import RandomForestClassifier\n",
30
+ "from sklearn.metrics import classification_report,accuracy_score\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.ensemble import RandomForestClassifier\n",
33
+ "from sklearn.preprocessing import OneHotEncoder\n",
34
+ "from sklearn.compose import ColumnTransformer\n",
35
+ "from sklearn.pipeline import Pipeline\n",
36
+ "from sklearn.pipeline import Pipeline\n",
37
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
38
+ "from sklearn.ensemble import RandomForestClassifier\n",
39
+ "from sklearn.model_selection import train_test_split\n",
40
+ "from sklearn.metrics import classification_report, accuracy_score\n",
41
+ "from sklearn.utils.class_weight import compute_class_weight\n",
42
+ "import pickle"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20",
48
+ "metadata": {},
49
+ "source": [
50
+ "## Sub Issues"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Model"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 29,
64
+ "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from sklearn.pipeline import Pipeline\n",
69
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
70
+ "from sklearn.metrics import accuracy_score, classification_report\n",
71
+ "from sklearn.utils.class_weight import compute_class_weight\n",
72
+ "\n",
73
+ "def train_model(training_df, validation_df, target_column, classifier_model, subissues_to_drop=None, random_state=42):\n",
74
+ " # Drop specified sub-issues from training and validation dataframes\n",
75
+ " if subissues_to_drop:\n",
76
+ " training_df = training_df[~training_df[target_column].isin(subissues_to_drop)]\n",
77
+ " validation_df = validation_df[~validation_df[target_column].isin(subissues_to_drop)]\n",
78
+ " \n",
79
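+ " # Compute class weights (NOTE: these are never applied to classifier_model in this function, so they do not affect training)\n",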
80
+ " class_weights = compute_class_weight('balanced', classes=np.unique(training_df[target_column]), y=training_df[target_column])\n",
81
+ " \n",
82
+ " # Convert class weights to dictionary format\n",
83
+ " class_weight = {label: weight for label, weight in zip(np.unique(training_df[target_column]), class_weights)}\n",
84
+ " \n",
85
+ " # Define a default class weight for missing classes\n",
86
+ " default_class_weight = 0.5\n",
87
+ " \n",
88
+ " # Assign default class weight for missing classes (NOTE: no-op as written, every label iterated here is already a key of class_weight)\n",
89
+ " for label in np.unique(training_df[target_column]):\n",
90
+ " if label not in class_weight:\n",
91
+ " class_weight[label] = default_class_weight\n",
92
+ " \n",
93
+ " # Define the pipeline\n",
94
+ " pipeline = Pipeline([\n",
95
+ " ('tfidf', TfidfVectorizer()),\n",
96
+ " ('classifier', classifier_model)\n",
97
+ " ])\n",
98
+ " \n",
99
+ " # Train the pipeline\n",
100
+ " pipeline.fit(training_df['Consumer complaint narrative'], training_df[target_column])\n",
101
+ " \n",
102
+ " # Make predictions on the validation set\n",
103
+ " y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])\n",
104
+ " \n",
105
+ " # Evaluate the pipeline\n",
106
+ " accuracy = accuracy_score(validation_df[target_column], y_pred)\n",
107
+ " print(\"\\nClassification Report:\")\n",
108
+ " print(classification_report(validation_df[target_column], y_pred))\n",
109
+ " print(\"Accuracy:\", accuracy)\n",
110
+ " \n",
111
+ " return pipeline"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3",
117
+ "metadata": {},
118
+ "source": [
119
+ "#### Reading the Issue DataFrame"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 30,
125
+ "id": "c1ea3fbc-4062-483b-a5c6-65d644983ce5",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "import os\n",
130
+ "import pandas as pd\n",
131
+ "\n",
132
+ "def read_subissue_data(issue_name, data_dir='../data_preprocessing_scripts/issue_data_splits'):\n",
133
+ " # Convert issue name to lower case and replace '/' and spaces with underscores\n",
134
+ " issue_name = issue_name.replace('/', '_').replace(' ', '_').lower()\n",
135
+ " \n",
136
+ " # Construct file paths\n",
137
+ " train_file = os.path.join(data_dir, f\"{issue_name}_train_data.csv\")\n",
138
+ " val_file = os.path.join(data_dir, f\"{issue_name}_val_data.csv\")\n",
139
+ " \n",
140
+ " # Read the CSV files\n",
141
+ " train_df = pd.read_csv(train_file)\n",
142
+ " val_df = pd.read_csv(val_file )\n",
143
+ " \n",
144
+ " return train_df, val_df"
145
+ ]
146
+ },
147
+ {
148
+ "cell_type": "code",
149
+ "execution_count": 31,
150
+ "id": "ae74f945-3fe9-4207-8fe0-fb4d8c5d2a27",
151
+ "metadata": {},
152
+ "outputs": [],
153
+ "source": [
154
+ "df = pd.read_csv(\"../data_splits/train-data-split.csv\")\n",
155
+ "issue_categories = list(df['Issue'].unique())\n",
156
+ "\n",
157
+ "def classify_sub_issue(issue):\n",
158
+ " issue_name = issue.replace('/', '_').replace(' ', '_').lower()\n",
159
+ " train_df,val_df= read_subissue_data(issue)\n",
160
+ " rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
161
+ " trained_model = train_model(train_df, val_df, 'Sub-issue', rf_classifier, random_state=42)\n",
162
+ "\n",
163
+ " # Saving the model\n",
164
+ " with open(f\"issue_models/{issue_name}.pkl\", 'wb') as f:\n",
165
+ " pickle.dump(trained_model, f)"
166
+ ]
167
+ },
168
+ {
169
+ "cell_type": "markdown",
170
+ "id": "0540f68f-4e14-40c2-ba9e-1875138678a1",
171
+ "metadata": {},
172
+ "source": [
173
+ "### Sub-issues classification"
174
+ ]
175
+ },
176
+ {
177
+ "cell_type": "markdown",
178
+ "id": "7a53f046-c7f8-48de-a8f3-9a66ffad5f55",
179
+ "metadata": {},
180
+ "source": [
181
+ "#### 1. Problem with a company's investigation into an existing problem"
182
+ ]
183
+ },
184
+ {
185
+ "cell_type": "code",
186
+ "execution_count": 32,
187
+ "id": "a33a3974-b3e9-466c-85a9-8d9b0255bbba",
188
+ "metadata": {},
189
+ "outputs": [
190
+ {
191
+ "name": "stdout",
192
+ "output_type": "stream",
193
+ "text": [
194
+ "Issue : Problem with a company's investigation into an existing problem\n",
195
+ "\n",
196
+ "\n",
197
+ "Classification Report:\n",
198
+ " precision recall f1-score support\n",
199
+ "\n",
200
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.88 0.37 0.52 41\n",
201
+ " Investigation took more than 30 days 0.95 0.73 0.83 162\n",
202
+ " Problem with personal statement of dispute 0.90 0.53 0.67 53\n",
203
+ " Their investigation did not fix an error on your report 0.91 1.00 0.95 1122\n",
204
+ " Was not notified of investigation status or results 0.98 0.87 0.92 209\n",
205
+ "\n",
206
+ " accuracy 0.92 1587\n",
207
+ " macro avg 0.93 0.70 0.78 1587\n",
208
+ " weighted avg 0.92 0.92 0.91 1587\n",
209
+ "\n",
210
+ "Accuracy: 0.9199747952110902\n"
211
+ ]
212
+ }
213
+ ],
214
+ "source": [
215
+ "issue_name = issue_categories[0]\n",
216
+ "print(f\"Issue : {issue_name}\\n\")\n",
217
+ "\n",
218
+ "classify_sub_issue(issue_name)"
219
+ ]
220
+ },
221
+ {
222
+ "cell_type": "markdown",
223
+ "id": "4ffa280b-614f-48b2-9870-70fb053b45b6",
224
+ "metadata": {},
225
+ "source": [
226
+ "#### 2. Incorrect information on your report"
227
+ ]
228
+ },
229
+ {
230
+ "cell_type": "code",
231
+ "execution_count": 34,
232
+ "id": "3d431635-227e-4873-b017-8cb4180a6e2e",
233
+ "metadata": {},
234
+ "outputs": [
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "Issue : Incorrect information on your report\n",
240
+ "\n",
241
+ "\n",
242
+ "Classification Report:\n",
243
+ " precision recall f1-score support\n",
244
+ "\n",
245
+ " Account information incorrect 0.74 0.68 0.71 699\n",
246
+ " Account status incorrect 0.87 0.73 0.79 771\n",
247
+ " Information belongs to someone else 0.90 0.99 0.94 4337\n",
248
+ "Information is missing that should be on the report 0.95 0.31 0.47 65\n",
249
+ " Old information reappears or never goes away 0.93 0.40 0.56 126\n",
250
+ " Personal information incorrect 0.95 0.78 0.86 440\n",
251
+ " Public record information inaccurate 0.98 0.47 0.64 102\n",
252
+ "\n",
253
+ " accuracy 0.88 6540\n",
254
+ " macro avg 0.90 0.62 0.71 6540\n",
255
+ " weighted avg 0.88 0.88 0.88 6540\n",
256
+ "\n",
257
+ "Accuracy: 0.8831804281345565\n"
258
+ ]
259
+ }
260
+ ],
261
+ "source": [
262
+ "issue_name = issue_categories[1]\n",
263
+ "print(f\"Issue : {issue_name}\\n\")\n",
264
+ "\n",
265
+ "classify_sub_issue(issue_name)"
266
+ ]
267
+ },
268
+ {
269
+ "cell_type": "markdown",
270
+ "id": "f5cb1853-9bc1-4541-9dac-5cb208abcfc5",
271
+ "metadata": {},
272
+ "source": [
273
+ "#### 3. Problem with a credit reporting company's investigation into an existing problem"
274
+ ]
275
+ },
276
+ {
277
+ "cell_type": "code",
278
+ "execution_count": 35,
279
+ "id": "86f04fd6-7625-4aba-9094-f7025078d1fc",
280
+ "metadata": {},
281
+ "outputs": [
282
+ {
283
+ "name": "stdout",
284
+ "output_type": "stream",
285
+ "text": [
286
+ "Issue : Problem with a credit reporting company's investigation into an existing problem\n",
287
+ "\n",
288
+ "\n",
289
+ "Classification Report:\n",
290
+ " precision recall f1-score support\n",
291
+ "\n",
292
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.83 0.36 0.50 83\n",
293
+ " Investigation took more than 30 days 0.97 0.84 0.90 505\n",
294
+ " Problem with personal statement of dispute 1.00 0.38 0.55 47\n",
295
+ " Their investigation did not fix an error on your report 0.92 0.99 0.95 2277\n",
296
+ " Was not notified of investigation status or results 0.96 0.88 0.92 473\n",
297
+ "\n",
298
+ " accuracy 0.93 3385\n",
299
+ " macro avg 0.94 0.69 0.77 3385\n",
300
+ " weighted avg 0.93 0.93 0.92 3385\n",
301
+ "\n",
302
+ "Accuracy: 0.9288035450516987\n"
303
+ ]
304
+ }
305
+ ],
306
+ "source": [
307
+ "issue_name = issue_categories[2]\n",
308
+ "print(f\"Issue : {issue_name}\\n\")\n",
309
+ "\n",
310
+ "classify_sub_issue(issue_name)"
311
+ ]
312
+ },
313
+ {
314
+ "cell_type": "markdown",
315
+ "id": "f00b115b-46c4-4d46-adae-a10a5e92a839",
316
+ "metadata": {},
317
+ "source": [
318
+ "#### 4. Problem with a purchase shown on your statement"
319
+ ]
320
+ },
321
+ {
322
+ "cell_type": "code",
323
+ "execution_count": 36,
324
+ "id": "e6577c57-6caa-4221-a68b-e0b65e739511",
325
+ "metadata": {},
326
+ "outputs": [
327
+ {
328
+ "name": "stdout",
329
+ "output_type": "stream",
330
+ "text": [
331
+ "Issue : Problem with a purchase shown on your statement\n",
332
+ "\n",
333
+ "\n",
334
+ "Classification Report:\n",
335
+ " precision recall f1-score support\n",
336
+ "\n",
337
+ " Card was charged for something you did not purchase with the card 0.81 0.19 0.30 70\n",
338
+ "Credit card company isn't resolving a dispute about a purchase on your statement 0.75 0.98 0.85 172\n",
339
+ "\n",
340
+ " accuracy 0.75 242\n",
341
+ " macro avg 0.78 0.58 0.58 242\n",
342
+ " weighted avg 0.77 0.75 0.69 242\n",
343
+ "\n",
344
+ "Accuracy: 0.7520661157024794\n"
345
+ ]
346
+ }
347
+ ],
348
+ "source": [
349
+ "issue_name = issue_categories[3]\n",
350
+ "print(f\"Issue : {issue_name}\\n\")\n",
351
+ "\n",
352
+ "classify_sub_issue(issue_name)"
353
+ ]
354
+ },
355
+ {
356
+ "cell_type": "markdown",
357
+ "id": "a8648f75-e62d-4b80-b4ed-ccf104137c74",
358
+ "metadata": {},
359
+ "source": [
360
+ "#### 5. Improper use of your report"
361
+ ]
362
+ },
363
+ {
364
+ "cell_type": "code",
365
+ "execution_count": 37,
366
+ "id": "ea64cabb-1372-4a52-826f-8b1bf8f2cb32",
367
+ "metadata": {},
368
+ "outputs": [
369
+ {
370
+ "name": "stdout",
371
+ "output_type": "stream",
372
+ "text": [
373
+ "Issue : Improper use of your report\n",
374
+ "\n",
375
+ "\n",
376
+ "Classification Report:\n",
377
+ " precision recall f1-score support\n",
378
+ "\n",
379
+ "Credit inquiries on your report that you don't recognize 0.93 0.84 0.88 990\n",
380
+ " Reporting company used your report improperly 0.96 0.98 0.97 3654\n",
381
+ "\n",
382
+ " accuracy 0.95 4644\n",
383
+ " macro avg 0.95 0.91 0.93 4644\n",
384
+ " weighted avg 0.95 0.95 0.95 4644\n",
385
+ "\n",
386
+ "Accuracy: 0.9528423772609819\n"
387
+ ]
388
+ }
389
+ ],
390
+ "source": [
391
+ "issue_name = issue_categories[4]\n",
392
+ "print(f\"Issue : {issue_name}\\n\")\n",
393
+ "\n",
394
+ "classify_sub_issue(issue_name)"
395
+ ]
396
+ },
397
+ {
398
+ "cell_type": "markdown",
399
+ "id": "f48f3308-d884-440c-8a24-8a81e7140ee0",
400
+ "metadata": {},
401
+ "source": [
402
+ "#### 6. Account Operations and Unauthorized Transaction Issues"
403
+ ]
404
+ },
405
+ {
406
+ "cell_type": "code",
407
+ "execution_count": 38,
408
+ "id": "08ec2d0e-950e-4f6d-9cdb-8328fed17384",
409
+ "metadata": {},
410
+ "outputs": [
411
+ {
412
+ "name": "stdout",
413
+ "output_type": "stream",
414
+ "text": [
415
+ "Issue : Account Operations and Unauthorized Transaction Issues\n",
416
+ "\n",
417
+ "\n",
418
+ "Classification Report:\n",
419
+ " precision recall f1-score support\n",
420
+ "\n",
421
+ " Account opened as a result of fraud 0.83 0.67 0.74 43\n",
422
+ "Card opened as result of identity theft or fraud 0.88 0.77 0.82 39\n",
423
+ " Transaction was not authorized 0.86 0.97 0.91 102\n",
424
+ "\n",
425
+ " accuracy 0.86 184\n",
426
+ " macro avg 0.86 0.80 0.83 184\n",
427
+ " weighted avg 0.86 0.86 0.85 184\n",
428
+ "\n",
429
+ "Accuracy: 0.8586956521739131\n"
430
+ ]
431
+ }
432
+ ],
433
+ "source": [
434
+ "issue_name = issue_categories[5]\n",
435
+ "print(f\"Issue : {issue_name}\\n\")\n",
436
+ "\n",
437
+ "classify_sub_issue(issue_name)"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "markdown",
442
+ "id": "7c7332c0-3cc9-42b6-9bbd-5b33719e676d",
443
+ "metadata": {},
444
+ "source": [
445
+ "#### 7. Payment and Funds Management"
446
+ ]
447
+ },
448
+ {
449
+ "cell_type": "code",
450
+ "execution_count": 39,
451
+ "id": "bf0e0437-a85d-4dcd-8b93-982fbd33cee6",
452
+ "metadata": {},
453
+ "outputs": [
454
+ {
455
+ "name": "stdout",
456
+ "output_type": "stream",
457
+ "text": [
458
+ "Issue : Payment and Funds Management\n",
459
+ "\n",
460
+ "\n",
461
+ "Classification Report:\n",
462
+ " precision recall f1-score support\n",
463
+ "\n",
464
+ " Billing problem 1.00 0.65 0.79 34\n",
465
+ " Overdrafts and overdraft fees 0.89 0.92 0.91 74\n",
466
+ "Problem during payment process 0.81 0.94 0.87 65\n",
467
+ "\n",
468
+ " accuracy 0.87 173\n",
469
+ " macro avg 0.90 0.83 0.85 173\n",
470
+ " weighted avg 0.88 0.87 0.87 173\n",
471
+ "\n",
472
+ "Accuracy: 0.8728323699421965\n"
473
+ ]
474
+ }
475
+ ],
476
+ "source": [
477
+ "issue_name = issue_categories[6]\n",
478
+ "print(f\"Issue : {issue_name}\\n\")\n",
479
+ "\n",
480
+ "classify_sub_issue(issue_name)"
481
+ ]
482
+ },
483
+ {
484
+ "cell_type": "markdown",
485
+ "id": "b034a174-16e7-41b6-970c-ef23d9b9da29",
486
+ "metadata": {},
487
+ "source": [
488
+ "#### 8. Managing an account"
489
+ ]
490
+ },
491
+ {
492
+ "cell_type": "code",
493
+ "execution_count": 40,
494
+ "id": "bc62e5f5-14ef-4d8a-8434-79b4e7da5a9a",
495
+ "metadata": {},
496
+ "outputs": [
497
+ {
498
+ "name": "stdout",
499
+ "output_type": "stream",
500
+ "text": [
501
+ "Issue : Managing an account\n",
502
+ "\n",
503
+ "\n",
504
+ "Classification Report:\n",
505
+ " precision recall f1-score support\n",
506
+ "\n",
507
+ " Banking errors 0.50 0.10 0.16 73\n",
508
+ " Deposits and withdrawals 0.46 0.90 0.61 201\n",
509
+ " Fee problem 0.55 0.57 0.56 56\n",
510
+ "Funds not handled or disbursed as instructed 0.00 0.00 0.00 72\n",
511
+ " Problem accessing account 0.00 0.00 0.00 40\n",
512
+ " Problem using a debit or ATM card 0.71 0.58 0.64 113\n",
513
+ "\n",
514
+ " accuracy 0.52 555\n",
515
+ " macro avg 0.37 0.36 0.33 555\n",
516
+ " weighted avg 0.43 0.52 0.43 555\n",
517
+ "\n",
518
+ "Accuracy: 0.5153153153153153\n"
519
+ ]
520
+ }
521
+ ],
522
+ "source": [
523
+ "issue_name = issue_categories[7]\n",
524
+ "print(f\"Issue : {issue_name}\\n\")\n",
525
+ "\n",
526
+ "classify_sub_issue(issue_name)"
527
+ ]
528
+ },
529
+ {
530
+ "cell_type": "markdown",
531
+ "id": "6c2e3454-eaa2-4a71-a058-988ad7716eac",
532
+ "metadata": {},
533
+ "source": [
534
+ "#### 9. Attempts to collect debt not owed"
535
+ ]
536
+ },
537
+ {
538
+ "cell_type": "code",
539
+ "execution_count": 41,
540
+ "id": "85ad1ffc-97e5-436b-afea-abed93b67b75",
541
+ "metadata": {},
542
+ "outputs": [
543
+ {
544
+ "name": "stdout",
545
+ "output_type": "stream",
546
+ "text": [
547
+ "Issue : Attempts to collect debt not owed\n",
548
+ "\n",
549
+ "\n",
550
+ "Classification Report:\n",
551
+ " precision recall f1-score support\n",
552
+ "\n",
553
+ " Debt is not yours 0.64 0.93 0.76 207\n",
554
+ " Debt was paid 0.96 0.31 0.46 72\n",
555
+ "Debt was result of identity theft 0.84 0.56 0.67 129\n",
556
+ "\n",
557
+ " accuracy 0.70 408\n",
558
+ " macro avg 0.81 0.60 0.63 408\n",
559
+ " weighted avg 0.76 0.70 0.68 408\n",
560
+ "\n",
561
+ "Accuracy: 0.7009803921568627\n"
562
+ ]
563
+ }
564
+ ],
565
+ "source": [
566
+ "issue_name = issue_categories[8]\n",
567
+ "print(f\"Issue : {issue_name}\\n\")\n",
568
+ "\n",
569
+ "classify_sub_issue(issue_name)"
570
+ ]
571
+ },
572
+ {
573
+ "cell_type": "markdown",
574
+ "id": "43b186f0-b626-43c2-9823-6818da478d48",
575
+ "metadata": {},
576
+ "source": [
577
+ "-----"
578
+ ]
579
+ },
580
+ {
581
+ "cell_type": "markdown",
582
+ "id": "8d87e677-da08-4682-9823-72c8315e52a2",
583
+ "metadata": {},
584
+ "source": [
585
+ "#### 10. Written notification about debt"
586
+ ]
587
+ },
588
+ {
589
+ "cell_type": "code",
590
+ "execution_count": 42,
591
+ "id": "214fc01d-7bf1-4b5a-b409-10b3c99076ae",
592
+ "metadata": {},
593
+ "outputs": [
594
+ {
595
+ "name": "stdout",
596
+ "output_type": "stream",
597
+ "text": [
598
+ "Issue : Written notification about debt\n",
599
+ "\n",
600
+ "\n",
601
+ "Classification Report:\n",
602
+ " precision recall f1-score support\n",
603
+ "\n",
604
+ "Didn't receive enough information to verify debt 0.77 0.99 0.87 135\n",
605
+ " Didn't receive notice of right to dispute 0.90 0.19 0.31 48\n",
606
+ "\n",
607
+ " accuracy 0.78 183\n",
608
+ " macro avg 0.84 0.59 0.59 183\n",
609
+ " weighted avg 0.81 0.78 0.72 183\n",
610
+ "\n",
611
+ "Accuracy: 0.7814207650273224\n"
612
+ ]
613
+ }
614
+ ],
615
+ "source": [
616
+ "issue_name = issue_categories[9]\n",
617
+ "print(f\"Issue : {issue_name}\\n\")\n",
618
+ "\n",
619
+ "classify_sub_issue(issue_name)"
620
+ ]
621
+ },
622
+ {
623
+ "cell_type": "markdown",
624
+ "id": "7cca2ba7-f0e1-4e56-a6f0-2a3c92bcac56",
625
+ "metadata": {},
626
+ "source": [
627
+ "----"
628
+ ]
629
+ },
630
+ {
631
+ "cell_type": "markdown",
632
+ "id": "401e87db-4759-437c-bcb1-382a7f8ed226",
633
+ "metadata": {},
634
+ "source": [
635
+ "#### 11. Dealing with your lender or servicer"
636
+ ]
637
+ },
638
+ {
639
+ "cell_type": "code",
640
+ "execution_count": 43,
641
+ "id": "9c1485fc-1b14-44c9-b4c9-d92bea864800",
642
+ "metadata": {},
643
+ "outputs": [
644
+ {
645
+ "name": "stdout",
646
+ "output_type": "stream",
647
+ "text": [
648
+ "Issue : Dealing with your lender or servicer\n",
649
+ "\n",
650
+ "\n",
651
+ "Classification Report:\n",
652
+ " precision recall f1-score support\n",
653
+ "\n",
654
+ " Received bad information about your loan 0.74 0.70 0.72 50\n",
655
+ "Trouble with how payments are being handled 0.71 0.75 0.73 48\n",
656
+ "\n",
657
+ " accuracy 0.72 98\n",
658
+ " macro avg 0.73 0.72 0.72 98\n",
659
+ " weighted avg 0.73 0.72 0.72 98\n",
660
+ "\n",
661
+ "Accuracy: 0.7244897959183674\n"
662
+ ]
663
+ }
664
+ ],
665
+ "source": [
666
+ "issue_name = issue_categories[10]\n",
667
+ "print(f\"Issue : {issue_name}\\n\")\n",
668
+ "\n",
669
+ "classify_sub_issue(issue_name)"
670
+ ]
671
+ },
672
+ {
673
+ "cell_type": "markdown",
674
+ "id": "8ca1aab7-158f-48bf-871c-1fa991fb1f9e",
675
+ "metadata": {},
676
+ "source": [
677
+ "----"
678
+ ]
679
+ },
680
+ {
681
+ "cell_type": "markdown",
682
+ "id": "36ce1724-61e5-4d5b-bbaf-a79293af6506",
683
+ "metadata": {},
684
+ "source": [
685
+ "#### 12. Disputes and Misrepresentations"
686
+ ]
687
+ },
688
+ {
689
+ "cell_type": "code",
690
+ "execution_count": 44,
691
+ "id": "380ee173-6c72-40b8-9eb2-a5af680c8ff7",
692
+ "metadata": {},
693
+ "outputs": [
694
+ {
695
+ "name": "stdout",
696
+ "output_type": "stream",
697
+ "text": [
698
+ "Issue : Disputes and Misrepresentations\n",
699
+ "\n",
700
+ "\n",
701
+ "Classification Report:\n",
702
+ " precision recall f1-score support\n",
703
+ "\n",
704
+ "Attempted to collect wrong amount 0.85 0.92 0.88 66\n",
705
+ " Other problem 0.85 0.65 0.74 54\n",
706
+ " Problem with fees 0.83 0.93 0.88 57\n",
707
+ "\n",
708
+ " accuracy 0.84 177\n",
709
+ " macro avg 0.84 0.83 0.83 177\n",
710
+ " weighted avg 0.84 0.84 0.84 177\n",
711
+ "\n",
712
+ "Accuracy: 0.8418079096045198\n"
713
+ ]
714
+ }
715
+ ],
716
+ "source": [
717
+ "issue_name = issue_categories[11]\n",
718
+ "print(f\"Issue : {issue_name}\\n\")\n",
719
+ "\n",
720
+ "classify_sub_issue(issue_name)"
721
+ ]
722
+ },
723
+ {
724
+ "cell_type": "markdown",
725
+ "id": "e44501a4-2021-4d78-b3c2-c937d286cb22",
726
+ "metadata": {},
727
+ "source": [
728
+ "----"
729
+ ]
730
+ },
731
+ {
732
+ "cell_type": "markdown",
733
+ "id": "451ccf3a-c97e-46e3-9c47-c225d6e3dd49",
734
+ "metadata": {},
735
+ "source": [
736
+ "#### 13. Problem with a company's investigation into an existing issue"
737
+ ]
738
+ },
739
+ {
740
+ "cell_type": "code",
741
+ "execution_count": 45,
742
+ "id": "20201d0c-b9da-4e2e-957b-23649f06e48e",
743
+ "metadata": {},
744
+ "outputs": [
745
+ {
746
+ "name": "stdout",
747
+ "output_type": "stream",
748
+ "text": [
749
+ "Issue : Problem with a company's investigation into an existing issue\n",
750
+ "\n",
751
+ "\n",
752
+ "Classification Report:\n",
753
+ " precision recall f1-score support\n",
754
+ "\n",
755
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 0.00 0.00 0.00 3\n",
756
+ " Investigation took more than 30 days 1.00 1.00 1.00 3\n",
757
+ " Problem with personal statement of dispute 0.00 0.00 0.00 2\n",
758
+ " Their investigation did not fix an error on your report 0.50 1.00 0.67 7\n",
759
+ " Was not notified of investigation status or results 0.00 0.00 0.00 2\n",
760
+ "\n",
761
+ " accuracy 0.59 17\n",
762
+ " macro avg 0.30 0.40 0.33 17\n",
763
+ " weighted avg 0.38 0.59 0.45 17\n",
764
+ "\n",
765
+ "Accuracy: 0.5882352941176471\n"
766
+ ]
767
+ }
768
+ ],
769
+ "source": [
770
+ "issue_name = issue_categories[12]\n",
771
+ "print(f\"Issue : {issue_name}\\n\")\n",
772
+ "\n",
773
+ "classify_sub_issue(issue_name)"
774
+ ]
775
+ },
776
+ {
777
+ "cell_type": "markdown",
778
+ "id": "c5d37ff8-2382-4c3b-aef0-5affd4d3083b",
779
+ "metadata": {},
780
+ "source": [
781
+ "----"
782
+ ]
783
+ },
784
+ {
785
+ "cell_type": "markdown",
786
+ "id": "c9876639-9e72-49ab-9dd4-3ef5ac38a8d8",
787
+ "metadata": {},
788
+ "source": [
789
+ "#### 14. Closing your account"
790
+ ]
791
+ },
792
+ {
793
+ "cell_type": "code",
794
+ "execution_count": 46,
795
+ "id": "95eff365-09f8-4640-9f65-4a82fc321fa9",
796
+ "metadata": {},
797
+ "outputs": [
798
+ {
799
+ "name": "stdout",
800
+ "output_type": "stream",
801
+ "text": [
802
+ "Issue : Closing your account\n",
803
+ "\n",
804
+ "\n",
805
+ "Classification Report:\n",
806
+ " precision recall f1-score support\n",
807
+ "\n",
808
+ " Can't close your account 1.00 0.24 0.38 17\n",
809
+ "Company closed your account 0.78 1.00 0.88 46\n",
810
+ "\n",
811
+ " accuracy 0.79 63\n",
812
+ " macro avg 0.89 0.62 0.63 63\n",
813
+ " weighted avg 0.84 0.79 0.74 63\n",
814
+ "\n",
815
+ "Accuracy: 0.7936507936507936\n"
816
+ ]
817
+ }
818
+ ],
819
+ "source": [
820
+ "issue_name = issue_categories[13]\n",
821
+ "print(f\"Issue : {issue_name}\\n\")\n",
822
+ "\n",
823
+ "classify_sub_issue(issue_name)"
824
+ ]
825
+ },
826
+ {
827
+ "cell_type": "markdown",
828
+ "id": "c66b9044-32af-4aee-af08-b685480d9f53",
829
+ "metadata": {},
830
+ "source": [
831
+ "----"
832
+ ]
833
+ },
834
+ {
835
+ "cell_type": "markdown",
836
+ "id": "455f8d69-5531-42e0-a53c-66427ff68fcc",
837
+ "metadata": {},
838
+ "source": [
839
+ "#### 15. Credit Report and Monitoring Issues"
840
+ ]
841
+ },
842
+ {
843
+ "cell_type": "code",
844
+ "execution_count": 47,
845
+ "id": "a039cb86-3503-4757-a8ee-7e518eafb9a5",
846
+ "metadata": {},
847
+ "outputs": [
848
+ {
849
+ "name": "stdout",
850
+ "output_type": "stream",
851
+ "text": [
852
+ "Issue : Credit Report and Monitoring Issues\n",
853
+ "\n",
854
+ "\n",
855
+ "Classification Report:\n",
856
+ " precision recall f1-score support\n",
857
+ "\n",
858
+ " Other problem getting your report or credit score 0.89 0.99 0.94 82\n",
859
+ "Problem canceling credit monitoring or identify theft protection service 0.97 0.75 0.85 40\n",
860
+ "\n",
861
+ " accuracy 0.91 122\n",
862
+ " macro avg 0.93 0.87 0.89 122\n",
863
+ " weighted avg 0.92 0.91 0.91 122\n",
864
+ "\n",
865
+ "Accuracy: 0.9098360655737705\n"
866
+ ]
867
+ }
868
+ ],
869
+ "source": [
870
+ "issue_name = issue_categories[14]\n",
871
+ "print(f\"Issue : {issue_name}\\n\")\n",
872
+ "\n",
873
+ "classify_sub_issue(issue_name)"
874
+ ]
875
+ },
876
+ {
877
+ "cell_type": "markdown",
878
+ "id": "ee0dfc45-96b2-4cbb-b34d-a8e1441c0c82",
879
+ "metadata": {},
880
+ "source": [
881
+ "----"
882
+ ]
883
+ },
884
+ {
885
+ "cell_type": "markdown",
886
+ "id": "0dcf3701-d59f-43fa-9aa0-2c65c27a8fe0",
887
+ "metadata": {},
888
+ "source": [
889
+ "#### 16. Closing an account"
890
+ ]
891
+ },
892
+ {
893
+ "cell_type": "code",
894
+ "execution_count": 48,
895
+ "id": "1ed7956b-3d41-46f8-a7e8-ad9f36e1694d",
896
+ "metadata": {},
897
+ "outputs": [
898
+ {
899
+ "name": "stdout",
900
+ "output_type": "stream",
901
+ "text": [
902
+ "Issue : Closing an account\n",
903
+ "\n",
904
+ "\n",
905
+ "Classification Report:\n",
906
+ " precision recall f1-score support\n",
907
+ "\n",
908
+ " Can't close your account 1.00 0.04 0.07 27\n",
909
+ " Company closed your account 0.57 0.83 0.67 69\n",
910
+ "Funds not received from closed account 0.56 0.50 0.53 50\n",
911
+ "\n",
912
+ " accuracy 0.57 146\n",
913
+ " macro avg 0.71 0.45 0.42 146\n",
914
+ " weighted avg 0.64 0.57 0.51 146\n",
915
+ "\n",
916
+ "Accuracy: 0.5684931506849316\n"
917
+ ]
918
+ }
919
+ ],
920
+ "source": [
921
+ "issue_name = issue_categories[15]\n",
922
+ "print(f\"Issue : {issue_name}\\n\")\n",
923
+ "\n",
924
+ "classify_sub_issue(issue_name)"
925
+ ]
926
+ },
927
+ {
928
+ "cell_type": "markdown",
929
+ "id": "3822541c-f13c-4a96-862f-4c23cf2d3895",
930
+ "metadata": {},
931
+ "source": [
932
+ "#### 17. Legal and Threat Actions"
933
+ ]
934
+ },
935
+ {
936
+ "cell_type": "code",
937
+ "execution_count": 49,
938
+ "id": "8fa5fc40-6d4f-4321-8eb0-9608dc5b84e2",
939
+ "metadata": {},
940
+ "outputs": [
941
+ {
942
+ "name": "stdout",
943
+ "output_type": "stream",
944
+ "text": [
945
+ "Issue : Legal and Threat Actions\n",
946
+ "\n",
947
+ "\n",
948
+ "Classification Report:\n",
949
+ " precision recall f1-score support\n",
950
+ "\n",
951
+ "Threatened or suggested your credit would be damaged 1.00 1.00 1.00 48\n",
952
+ "\n",
953
+ " accuracy 1.00 48\n",
954
+ " macro avg 1.00 1.00 1.00 48\n",
955
+ " weighted avg 1.00 1.00 1.00 48\n",
956
+ "\n",
957
+ "Accuracy: 1.0\n"
958
+ ]
959
+ }
960
+ ],
961
+ "source": [
962
+ "issue_name = issue_categories[16]\n",
963
+ "print(f\"Issue : {issue_name}\\n\")\n",
964
+ "\n",
965
+ "classify_sub_issue(issue_name)"
966
+ ]
967
+ }
968
+ ],
969
+ "metadata": {
970
+ "kernelspec": {
971
+ "display_name": "Python 3 (ipykernel)",
972
+ "language": "python",
973
+ "name": "python3"
974
+ },
975
+ "language_info": {
976
+ "codemirror_mode": {
977
+ "name": "ipython",
978
+ "version": 3
979
+ },
980
+ "file_extension": ".py",
981
+ "mimetype": "text/x-python",
982
+ "name": "python",
983
+ "nbconvert_exporter": "python",
984
+ "pygments_lexer": "ipython3",
985
+ "version": "3.9.19"
986
+ }
987
+ },
988
+ "nbformat": 4,
989
+ "nbformat_minor": 5
990
+ }
subproduct_prediction/Sub_Product.ipynb ADDED
@@ -0,0 +1,700 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": 1,
6
+ "id": "a751d479-1500-41e2-8c01-252e849dad05",
7
+ "metadata": {},
8
+ "outputs": [],
9
+ "source": [
10
+ "import warnings\n",
11
+ "warnings.filterwarnings(\"ignore\")"
12
+ ]
13
+ },
14
+ {
15
+ "cell_type": "code",
16
+ "execution_count": 2,
17
+ "id": "8158cb66-9f9a-4bb2-bc6e-6a51146be10c",
18
+ "metadata": {},
19
+ "outputs": [],
20
+ "source": [
21
+ "import pandas as pd\n",
22
+ "import matplotlib.pyplot as plt \n",
23
+ "from sklearn.model_selection import train_test_split\n",
24
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
25
+ "from sklearn.pipeline import make_pipeline\n",
26
+ "from sklearn.linear_model import LogisticRegression\n",
27
+ "from sklearn.naive_bayes import MultinomialNB\n",
28
+ "from sklearn.svm import SVC\n",
29
+ "from sklearn.ensemble import RandomForestClassifier\n",
30
+ "from sklearn.metrics import classification_report,accuracy_score\n",
31
+ "import numpy as np\n",
32
+ "from sklearn.ensemble import RandomForestClassifier\n",
33
+ "from sklearn.preprocessing import OneHotEncoder\n",
34
+ "from sklearn.compose import ColumnTransformer\n",
35
+ "from sklearn.pipeline import Pipeline\n",
36
+ "from sklearn.pipeline import Pipeline\n",
37
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
38
+ "from sklearn.ensemble import RandomForestClassifier\n",
39
+ "from sklearn.model_selection import train_test_split\n",
40
+ "from sklearn.metrics import classification_report, accuracy_score\n",
41
+ "from sklearn.utils.class_weight import compute_class_weight\n",
42
+ "import pickle"
43
+ ]
44
+ },
45
+ {
46
+ "cell_type": "markdown",
47
+ "id": "70ea935b-3b62-4cf9-8bef-06bf30904b20",
48
+ "metadata": {},
49
+ "source": [
50
+ "## Sub Products"
51
+ ]
52
+ },
53
+ {
54
+ "cell_type": "markdown",
55
+ "id": "f9ddaa89-dc8d-40f5-8098-7d108ab9d578",
56
+ "metadata": {},
57
+ "source": [
58
+ "### Model"
59
+ ]
60
+ },
61
+ {
62
+ "cell_type": "code",
63
+ "execution_count": 3,
64
+ "id": "c1f9fd85-f47e-4962-a693-7cb9efca763a",
65
+ "metadata": {},
66
+ "outputs": [],
67
+ "source": [
68
+ "from sklearn.pipeline import Pipeline\n",
69
+ "from sklearn.feature_extraction.text import TfidfVectorizer\n",
70
+ "from sklearn.metrics import accuracy_score, classification_report\n",
71
+ "from sklearn.utils.class_weight import compute_class_weight\n",
72
+ "\n",
73
+ "def train_model(training_df, validation_df, subproduct_to_predict, classifier_model, subproducts_to_drop=None, random_state=None):\n",
74
+ " # Drop specified subproducts from training and validation dataframes\n",
75
+ " if subproducts_to_drop:\n",
76
+ " training_df = training_df[~training_df['Sub-product'].isin(subproducts_to_drop)]\n",
77
+ " validation_df = validation_df[~validation_df['Sub-product'].isin(subproducts_to_drop)]\n",
78
+ " \n",
79
+ " # Compute balanced class weights (NOTE: never passed to classifier_model, so they currently have no effect)\n",
80
+ " class_weights = compute_class_weight('balanced', classes=np.unique(training_df['Sub-product']), y=training_df['Sub-product'])\n",
81
+ " \n",
82
+ " # Convert class weights to dictionary format\n",
83
+ " class_weight = {label: weight for label, weight in zip(np.unique(training_df['Sub-product']), class_weights)}\n",
84
+ " \n",
85
+ " # Define a default class weight for missing classes\n",
86
+ " default_class_weight = 0.5\n",
87
+ " \n",
88
+ " # Assign default class weight for missing classes (no-op: every training label already has a weight)\n",
89
+ " for label in np.unique(training_df['Sub-product']):\n",
90
+ " if label not in class_weight:\n",
91
+ " class_weight[label] = default_class_weight\n",
92
+ " \n",
93
+ " # Define the pipeline\n",
94
+ " pipeline = Pipeline([\n",
95
+ " ('tfidf', TfidfVectorizer()),\n",
96
+ " ('classifier', classifier_model)\n",
97
+ " ])\n",
98
+ " \n",
99
+ " # Train the pipeline\n",
100
+ " pipeline.fit(training_df['Consumer complaint narrative'], training_df['Sub-product'])\n",
101
+ " \n",
102
+ " # Make predictions on the validation set\n",
103
+ " y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])\n",
104
+ " \n",
105
+ " # Evaluate the pipeline\n",
106
+ " accuracy = accuracy_score(validation_df['Sub-product'], y_pred)\n",
107
+ " print(\"Accuracy:\", accuracy)\n",
108
+ " print(\"\\nClassification Report:\")\n",
109
+ " print(classification_report(validation_df['Sub-product'], y_pred))\n",
110
+ " \n",
111
+ " return pipeline\n"
112
+ ]
113
+ },
114
+ {
115
+ "cell_type": "markdown",
116
+ "id": "a7a0d277-75c1-4435-86e5-d0ee7d3dabf3",
117
+ "metadata": {},
118
+ "source": [
119
+ "#### Debt Collection"
120
+ ]
121
+ },
122
+ {
123
+ "cell_type": "code",
124
+ "execution_count": 4,
125
+ "id": "6a2e4857-31c7-4b57-a25c-e9e36473c033",
126
+ "metadata": {},
127
+ "outputs": [],
128
+ "source": [
129
+ "debt_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_train_data.csv')\n",
130
+ "debt_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_val_data.csv')"
131
+ ]
132
+ },
133
+ {
134
+ "cell_type": "code",
135
+ "execution_count": 5,
136
+ "id": "7fb6be2b-244f-4232-972c-9772128890ca",
137
+ "metadata": {},
138
+ "outputs": [
139
+ {
140
+ "data": {
141
+ "text/html": [
142
+ "<div>\n",
143
+ "<style scoped>\n",
144
+ " .dataframe tbody tr th:only-of-type {\n",
145
+ " vertical-align: middle;\n",
146
+ " }\n",
147
+ "\n",
148
+ " .dataframe tbody tr th {\n",
149
+ " vertical-align: top;\n",
150
+ " }\n",
151
+ "\n",
152
+ " .dataframe thead th {\n",
153
+ " text-align: right;\n",
154
+ " }\n",
155
+ "</style>\n",
156
+ "<table border=\"1\" class=\"dataframe\">\n",
157
+ " <thead>\n",
158
+ " <tr style=\"text-align: right;\">\n",
159
+ " <th></th>\n",
160
+ " <th>Consumer complaint narrative</th>\n",
161
+ " <th>Product</th>\n",
162
+ " <th>Sub-product</th>\n",
163
+ " </tr>\n",
164
+ " </thead>\n",
165
+ " <tbody>\n",
166
+ " <tr>\n",
167
+ " <th>0</th>\n",
168
+ " <td>{$37.00} on XXXX XXXX XXXX I paid for gas thro...</td>\n",
169
+ " <td>Debt collection</td>\n",
170
+ " <td>Other debt</td>\n",
171
+ " </tr>\n",
172
+ " <tr>\n",
173
+ " <th>1</th>\n",
174
+ " <td>Debt from XXXX XXXX is result of identity thef...</td>\n",
175
+ " <td>Debt collection</td>\n",
176
+ " <td>Credit card debt</td>\n",
177
+ " </tr>\n",
178
+ " <tr>\n",
179
+ " <th>2</th>\n",
180
+ " <td>My son attended XXXX XXXX XXXX XXXX for severa...</td>\n",
181
+ " <td>Debt collection</td>\n",
182
+ " <td>Medical debt</td>\n",
183
+ " </tr>\n",
184
+ " <tr>\n",
185
+ " <th>3</th>\n",
186
+ " <td>XXXX is claiming I owe a debt for utilities ba...</td>\n",
187
+ " <td>Debt collection</td>\n",
188
+ " <td>Other debt</td>\n",
189
+ " </tr>\n",
190
+ " <tr>\n",
191
+ " <th>4</th>\n",
192
+ " <td>This debt collector engaged in abusive, decept...</td>\n",
193
+ " <td>Debt collection</td>\n",
194
+ " <td>I do not know</td>\n",
195
+ " </tr>\n",
196
+ " </tbody>\n",
197
+ "</table>\n",
198
+ "</div>"
199
+ ],
200
+ "text/plain": [
201
+ " Consumer complaint narrative Product \\\n",
202
+ "0 {$37.00} on XXXX XXXX XXXX I paid for gas thro... Debt collection \n",
203
+ "1 Debt from XXXX XXXX is result of identity thef... Debt collection \n",
204
+ "2 My son attended XXXX XXXX XXXX XXXX for severa... Debt collection \n",
205
+ "3 XXXX is claiming I owe a debt for utilities ba... Debt collection \n",
206
+ "4 This debt collector engaged in abusive, decept... Debt collection \n",
207
+ "\n",
208
+ " Sub-product \n",
209
+ "0 Other debt \n",
210
+ "1 Credit card debt \n",
211
+ "2 Medical debt \n",
212
+ "3 Other debt \n",
213
+ "4 I do not know "
214
+ ]
215
+ },
216
+ "execution_count": 5,
217
+ "metadata": {},
218
+ "output_type": "execute_result"
219
+ }
220
+ ],
221
+ "source": [
222
+ "debt_training_df.head()"
223
+ ]
224
+ },
225
+ {
226
+ "cell_type": "code",
227
+ "execution_count": 6,
228
+ "id": "a14dbafd-6f1b-49cb-9712-434055da84f1",
229
+ "metadata": {},
230
+ "outputs": [
231
+ {
232
+ "data": {
233
+ "text/plain": [
234
+ "Sub-product\n",
235
+ "Other debt 2056\n",
236
+ "I do not know 1530\n",
237
+ "Credit card debt 1139\n",
238
+ "Medical debt 726\n",
239
+ "Auto debt 397\n",
240
+ "Telecommunications debt 267\n",
241
+ "Rental debt 122\n",
242
+ "Mortgage debt 94\n",
243
+ "Name: count, dtype: int64"
244
+ ]
245
+ },
246
+ "execution_count": 6,
247
+ "metadata": {},
248
+ "output_type": "execute_result"
249
+ }
250
+ ],
251
+ "source": [
252
+ "debt_training_df['Sub-product'].value_counts()"
253
+ ]
254
+ },
255
+ {
256
+ "cell_type": "code",
257
+ "execution_count": 7,
258
+ "id": "b78398b7-d027-403f-acf4-fa580d113b02",
259
+ "metadata": {},
260
+ "outputs": [
261
+ {
262
+ "name": "stdout",
263
+ "output_type": "stream",
264
+ "text": [
265
+ "Accuracy: 0.6633986928104575\n",
266
+ "\n",
267
+ "Classification Report:\n",
268
+ " precision recall f1-score support\n",
269
+ "\n",
270
+ " Auto debt 0.95 0.48 0.64 44\n",
271
+ " Credit card debt 0.59 0.96 0.73 127\n",
272
+ " Medical debt 0.77 0.62 0.68 81\n",
273
+ " Mortgage debt 1.00 0.40 0.57 10\n",
274
+ " Rental debt 0.67 0.14 0.24 14\n",
275
+ "Telecommunications debt 1.00 0.13 0.24 30\n",
276
+ "\n",
277
+ " accuracy 0.66 306\n",
278
+ " macro avg 0.83 0.46 0.52 306\n",
279
+ " weighted avg 0.75 0.66 0.63 306\n",
280
+ "\n"
281
+ ]
282
+ }
283
+ ],
284
+ "source": [
285
+ "\n",
286
+ "from sklearn.ensemble import RandomForestClassifier\n",
287
+ "\n",
288
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
289
+ "trained_model_d = train_model(debt_training_df, debt_val_df, 'Sub-product', rf_classifier, subproducts_to_drop=['Other debt', 'I do not know'], random_state=42)\n"
290
+ ]
291
+ },
292
+ {
293
+ "cell_type": "code",
294
+ "execution_count": 9,
295
+ "id": "85bbc3fe-50b0-4578-8e67-151861f839da",
296
+ "metadata": {},
297
+ "outputs": [],
298
+ "source": [
299
+ "with open('models/Debt_model.pkl', 'wb') as f:\n",
300
+ " pickle.dump(trained_model_d, f)"
301
+ ]
302
+ },
303
+ {
304
+ "cell_type": "markdown",
305
+ "id": "5c529ed8-3735-4494-9f90-6c005dfea6df",
306
+ "metadata": {},
307
+ "source": [
308
+ "#### Loan/Mortgages"
309
+ ]
310
+ },
311
+ {
312
+ "cell_type": "code",
313
+ "execution_count": 10,
314
+ "id": "f33b26e9-4c5b-4498-ab23-a88aca5eb07f",
315
+ "metadata": {},
316
+ "outputs": [],
317
+ "source": [
318
+ "loans_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_train_data.csv')\n",
319
+ "loans_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/loans___mortgage_val_data.csv')"
320
+ ]
321
+ },
322
+ {
323
+ "cell_type": "code",
324
+ "execution_count": 11,
325
+ "id": "c8dcc18b-f7bb-4edd-965a-8c58500a0ea6",
326
+ "metadata": {},
327
+ "outputs": [
328
+ {
329
+ "data": {
330
+ "text/plain": [
331
+ "Sub-product\n",
332
+ "Loan 1464\n",
333
+ "Federal student loan servicing 914\n",
334
+ "Conventional home mortgage 236\n",
335
+ "Lease 186\n",
336
+ "FHA mortgage 94\n",
337
+ "Name: count, dtype: int64"
338
+ ]
339
+ },
340
+ "execution_count": 11,
341
+ "metadata": {},
342
+ "output_type": "execute_result"
343
+ }
344
+ ],
345
+ "source": [
346
+ "loans_training_df['Sub-product'].value_counts()"
347
+ ]
348
+ },
349
+ {
350
+ "cell_type": "code",
351
+ "execution_count": 12,
352
+ "id": "b0da7a52-e00a-413a-80be-2e8221851275",
353
+ "metadata": {},
354
+ "outputs": [
355
+ {
356
+ "name": "stdout",
357
+ "output_type": "stream",
358
+ "text": [
359
+ "Accuracy: 0.8757763975155279\n",
360
+ "\n",
361
+ "Classification Report:\n",
362
+ " precision recall f1-score support\n",
363
+ "\n",
364
+ " Conventional home mortgage 0.81 0.50 0.62 26\n",
365
+ " FHA mortgage 1.00 0.20 0.33 10\n",
366
+ "Federal student loan servicing 1.00 0.96 0.98 102\n",
367
+ " Lease 1.00 0.29 0.44 21\n",
368
+ " Loan 0.81 1.00 0.90 163\n",
369
+ "\n",
370
+ " accuracy 0.88 322\n",
371
+ " macro avg 0.93 0.59 0.65 322\n",
372
+ " weighted avg 0.89 0.88 0.85 322\n",
373
+ "\n"
374
+ ]
375
+ }
376
+ ],
377
+ "source": [
378
+ "from sklearn.ensemble import RandomForestClassifier\n",
379
+ "\n",
380
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
381
+ "trained_model_l = train_model(loans_training_df, loans_val_df, 'Sub-product', rf_classifier, random_state=42)"
382
+ ]
383
+ },
384
+ {
385
+ "cell_type": "code",
386
+ "execution_count": 13,
387
+ "id": "a668b946-da36-410f-b474-f8a311952c5d",
388
+ "metadata": {},
389
+ "outputs": [],
390
+ "source": [
391
+ "with open('models/loan_model.pkl', 'wb') as f:\n",
392
+ " pickle.dump(trained_model_l, f)"
393
+ ]
394
+ },
395
+ {
396
+ "cell_type": "markdown",
397
+ "id": "74796ebf-9934-46d2-a1b7-d6672dea727c",
398
+ "metadata": {},
399
+ "source": [
400
+ "#### Checking or savings account"
401
+ ]
402
+ },
403
+ {
404
+ "cell_type": "code",
405
+ "execution_count": 14,
406
+ "id": "1cc65f08-96c8-4458-8703-b84b7554a04c",
407
+ "metadata": {},
408
+ "outputs": [],
409
+ "source": [
410
+ "cs_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_train_data.csv')\n",
411
+ "cs_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/checking_or_savings_account_val_data.csv')"
412
+ ]
413
+ },
414
+ {
415
+ "cell_type": "code",
416
+ "execution_count": 15,
417
+ "id": "240b2bcd-3839-4584-8a63-952fa17f9715",
418
+ "metadata": {},
419
+ "outputs": [
420
+ {
421
+ "data": {
422
+ "text/plain": [
423
+ "Sub-product\n",
424
+ "Checking account 13500\n",
425
+ "Savings account 1391\n",
426
+ "Other banking product or service 1158\n",
427
+ "CD (Certificate of Deposit) 176\n",
428
+ "Name: count, dtype: int64"
429
+ ]
430
+ },
431
+ "execution_count": 15,
432
+ "metadata": {},
433
+ "output_type": "execute_result"
434
+ }
435
+ ],
436
+ "source": [
437
+ "cs_training_df['Sub-product'].value_counts()"
438
+ ]
439
+ },
440
+ {
441
+ "cell_type": "code",
442
+ "execution_count": 16,
443
+ "id": "3170c0c8-0dac-4755-aebf-dca9aa7f4dee",
444
+ "metadata": {},
445
+ "outputs": [
446
+ {
447
+ "name": "stdout",
448
+ "output_type": "stream",
449
+ "text": [
450
+ "Accuracy: 0.940099833610649\n",
451
+ "\n",
452
+ "Classification Report:\n",
453
+ " precision recall f1-score support\n",
454
+ "\n",
455
+ " CD (Certificate of Deposit) 0.95 0.95 0.95 19\n",
456
+ " Checking account 0.93 1.00 0.97 1500\n",
457
+ "Other banking product or service 1.00 0.60 0.75 129\n",
458
+ " Savings account 0.99 0.65 0.79 155\n",
459
+ "\n",
460
+ " accuracy 0.94 1803\n",
461
+ " macro avg 0.97 0.80 0.86 1803\n",
462
+ " weighted avg 0.94 0.94 0.93 1803\n",
463
+ "\n"
464
+ ]
465
+ }
466
+ ],
467
+ "source": [
468
+ "from sklearn.ensemble import RandomForestClassifier\n",
469
+ "\n",
470
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
471
+ "trained_model_cs = train_model(cs_training_df, cs_val_df, 'Sub-product', rf_classifier, random_state=42)"
472
+ ]
473
+ },
474
+ {
475
+ "cell_type": "code",
476
+ "execution_count": 17,
477
+ "id": "59c87ff1-d7de-41a9-9e0a-33630bff1c18",
478
+ "metadata": {},
479
+ "outputs": [],
480
+ "source": [
481
+ "with open('models/Checking_saving_model.pkl', 'wb') as f:\n",
482
+ " pickle.dump(trained_model_cs, f)"
483
+ ]
484
+ },
485
+ {
486
+ "cell_type": "markdown",
487
+ "id": "fe443859-4be6-4b87-be79-22487aaf5b3b",
488
+ "metadata": {},
489
+ "source": [
490
+ "#### Credit/Prepaid Card"
491
+ ]
492
+ },
493
+ {
494
+ "cell_type": "code",
495
+ "execution_count": 26,
496
+ "id": "31a70db8-06cb-4fb0-8d45-a7451aa81b0e",
497
+ "metadata": {},
498
+ "outputs": [],
499
+ "source": [
500
+ "cp_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_train_data.csv')\n",
501
+ "cp_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_prepaid_card_val_data.csv')"
502
+ ]
503
+ },
504
+ {
505
+ "cell_type": "code",
506
+ "execution_count": 27,
507
+ "id": "0e70a22d-01f9-4f59-a903-286a05eb5179",
508
+ "metadata": {},
509
+ "outputs": [
510
+ {
511
+ "data": {
512
+ "text/plain": [
513
+ "Sub-product\n",
514
+ "General-purpose credit card or charge card 13320\n",
515
+ "Store credit card 2232\n",
516
+ "Name: count, dtype: int64"
517
+ ]
518
+ },
519
+ "execution_count": 27,
520
+ "metadata": {},
521
+ "output_type": "execute_result"
522
+ }
523
+ ],
524
+ "source": [
525
+ "cp_training_df['Sub-product'].value_counts()"
526
+ ]
527
+ },
528
+ {
529
+ "cell_type": "code",
530
+ "execution_count": 28,
531
+ "id": "ef3b03f6-8207-4292-8ce2-e6ca5695c606",
532
+ "metadata": {},
533
+ "outputs": [
534
+ {
535
+ "name": "stdout",
536
+ "output_type": "stream",
537
+ "text": [
538
+ "Accuracy: 0.9427414690572585\n",
539
+ "\n",
540
+ "Classification Report:\n",
541
+ " precision recall f1-score support\n",
542
+ "\n",
543
+ "General-purpose credit card or charge card 0.94 1.00 0.97 1481\n",
544
+ " Store credit card 1.00 0.60 0.75 248\n",
545
+ "\n",
546
+ " accuracy 0.94 1729\n",
547
+ " macro avg 0.97 0.80 0.86 1729\n",
548
+ " weighted avg 0.95 0.94 0.94 1729\n",
549
+ "\n"
550
+ ]
551
+ }
552
+ ],
553
+ "source": [
554
+ "from sklearn.ensemble import RandomForestClassifier\n",
555
+ "\n",
556
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
557
+ "trained_model_cp = train_model(cp_training_df, cp_val_df, 'Sub-product', rf_classifier, random_state=42)"
558
+ ]
559
+ },
560
+ {
561
+ "cell_type": "code",
562
+ "execution_count": 21,
563
+ "id": "ac3f39d0-8cb8-457e-9db7-510cc5a99830",
564
+ "metadata": {},
565
+ "outputs": [],
566
+ "source": [
567
+ "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n",
568
+ " pickle.dump(trained_model_cp, f)"
569
+ ]
570
+ },
571
+ {
572
+ "cell_type": "markdown",
573
+ "id": "0787d4eb-9673-417b-91d1-cc98becd037e",
574
+ "metadata": {},
575
+ "source": [
576
+ "#### Credit Reporting"
577
+ ]
578
+ },
579
+ {
580
+ "cell_type": "code",
581
+ "execution_count": 22,
582
+ "id": "8e074864-16f6-4fd5-8bfe-b054aeb0fc2a",
583
+ "metadata": {},
584
+ "outputs": [],
585
+ "source": [
586
+ "cr_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_train_data.csv')\n",
587
+ "cr_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/credit_reporting_val_data.csv')"
588
+ ]
589
+ },
590
+ {
591
+ "cell_type": "code",
592
+ "execution_count": 23,
593
+ "id": "57257613-7dde-4561-942c-f559d2159744",
594
+ "metadata": {},
595
+ "outputs": [
596
+ {
597
+ "data": {
598
+ "text/plain": [
599
+ "Sub-product\n",
600
+ "Credit reporting 13500\n",
601
+ "Other personal consumer report 661\n",
602
+ "Name: count, dtype: int64"
603
+ ]
604
+ },
605
+ "execution_count": 23,
606
+ "metadata": {},
607
+ "output_type": "execute_result"
608
+ }
609
+ ],
610
+ "source": [
611
+ "cr_training_df['Sub-product'].value_counts()"
612
+ ]
613
+ },
614
+ {
615
+ "cell_type": "code",
616
+ "execution_count": 24,
617
+ "id": "cca27513-501f-4257-a4b1-0e13a3604250",
618
+ "metadata": {},
619
+ "outputs": [
620
+ {
621
+ "name": "stdout",
622
+ "output_type": "stream",
623
+ "text": [
624
+ "Accuracy: 0.9841168996188056\n",
625
+ "\n",
626
+ "Classification Report:\n",
627
+ " precision recall f1-score support\n",
628
+ "\n",
629
+ " Credit reporting 0.99 1.00 0.99 1500\n",
630
+ "Other personal consumer report 0.93 0.72 0.81 74\n",
631
+ "\n",
632
+ " accuracy 0.98 1574\n",
633
+ " macro avg 0.96 0.86 0.90 1574\n",
634
+ " weighted avg 0.98 0.98 0.98 1574\n",
635
+ "\n"
636
+ ]
637
+ }
638
+ ],
639
+ "source": [
640
+ "from sklearn.ensemble import RandomForestClassifier\n",
641
+ "\n",
642
+ "rf_classifier = RandomForestClassifier(n_estimators=200, random_state=42)\n",
643
+ "trained_model_cr = train_model(cr_training_df, cr_val_df, 'Sub-product', rf_classifier, random_state=42)\n"
644
+ ]
645
+ },
646
+ {
647
+ "cell_type": "code",
648
+ "execution_count": 25,
649
+ "id": "3cbb9aa5-6c0c-4b59-a181-7431e8fc60fc",
650
+ "metadata": {},
651
+ "outputs": [],
652
+ "source": [
653
+ "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n",
654
+ " pickle.dump(trained_model_cr, f)"
655
+ ]
656
+ },
657
+ {
658
+ "cell_type": "markdown",
659
+ "id": "9aea8fdd-ec86-40bc-b417-ba9169edabd9",
660
+ "metadata": {},
661
+ "source": [
662
+ "```python\n",
+ "with open('models/Debt_model.pkl', 'wb') as f:\n",
663
+ " pickle.dump(trained_model_d, f)\n",
664
+ "\n",
665
+ "with open('models/loan_model.pkl', 'wb') as f:\n",
666
+ " pickle.dump(trained_model_l, f)\n",
667
+ "\n",
668
+ "with open('models/Checking_saving_model.pkl', 'wb') as f:\n",
669
+ " pickle.dump(trained_model_cs, f)\n",
670
+ "\n",
671
+ "with open('models/Credit_Prepaid_Card_model.pkl', 'wb') as f:\n",
672
+ " pickle.dump(trained_model_cp, f)\n",
673
+ "\n",
674
+ "with open('models/Credit_Reporting_model.pkl', 'wb') as f:\n",
675
+ " pickle.dump(trained_model_cr, f)\n",
+ "```"
676
+ ]
677
+ }
678
+ ],
679
+ "metadata": {
680
+ "kernelspec": {
681
+ "display_name": "Python 3 (ipykernel)",
682
+ "language": "python",
683
+ "name": "python3"
684
+ },
685
+ "language_info": {
686
+ "codemirror_mode": {
687
+ "name": "ipython",
688
+ "version": 3
689
+ },
690
+ "file_extension": ".py",
691
+ "mimetype": "text/x-python",
692
+ "name": "python",
693
+ "nbconvert_exporter": "python",
694
+ "pygments_lexer": "ipython3",
695
+ "version": "3.9.19"
696
+ }
697
+ },
698
+ "nbformat": 4,
699
+ "nbformat_minor": 5
700
+ }
subproduct_prediction/issue_models/account_operations_and_unauthorized_transaction_issues.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f5a38e0d8214e3f947f2425245fb0cabd6484cbf5416bc7cb967be933d550e48
3
+ size 13402084
subproduct_prediction/issue_models/attempts_to_collect_debt_not_owed.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5d2f89f554b692874926acc0622cc0da2b0d373adf6fe0ef991d396751a3e1fb
3
+ size 35287313
subproduct_prediction/issue_models/closing_an_account.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3d6e05991b41724502ec39bcc0b36f4a99bcdc53b3f25e5b43dded7e6bdb872b
3
+ size 13327249
subproduct_prediction/issue_models/closing_your_account.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:6c1c947503ffd02bb74c27ea5516e92542c12a1f7075a8dd1ca2b40a50924a47
3
+ size 3219384
subproduct_prediction/issue_models/credit_report_and_monitoring_issues.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cc0b2236e3428e157037f2a1be6b1895811df1f9c26552423278a796cc420700
3
+ size 4546265
subproduct_prediction/issue_models/dealing_with_your_lender_or_servicer.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:48357a57a2aa170a13b424d9a0ccffc4aae3b03c7d072dec125bef02e5c24e11
3
+ size 6053321
subproduct_prediction/issue_models/disputes_and_misrepresentations.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84658b48682d815eaf93db9ead50d7c34a92ed9e09a72ff5397b594197bf3d10
3
+ size 14356455
subproduct_prediction/issue_models/improper_use_of_your_report.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:64f73d84a7db394d0049116190ed19b7630c39ae4672f7c9840907d0e77ba544
3
+ size 122627308
subproduct_prediction/issue_models/incorrect_information_on_your_report.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f2d2319330ab30e110677a3137e9a12c55d913f3b0ec4f6fa5a4e00353612ec3
3
+ size 459390697
subproduct_prediction/issue_models/legal_and_threat_actions.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fedd05ae3cdb61f8015b09b487aa5740fbc92b97b480b0bde5d1c65d753fd54e
3
+ size 224561
subproduct_prediction/issue_models/managing_an_account.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2528894a0f0fb7f90626d899b8059486e2f9fc21ce4dbb54f27cc284483ebeb0
3
+ size 85679764
subproduct_prediction/issue_models/payment_and_funds_management.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d72801fc16ca98b4cbc703a3f8ad5f8103f087c3dcbe0e1c80a63401192f3f73
3
+ size 11929289
subproduct_prediction/issue_models/problem_with_a_company's_investigation_into_an_existing_issue.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d71f394eaf1623337c71638ff409e1374e9bba3c87851f5ec9421f343629892d
3
+ size 2050572
subproduct_prediction/issue_models/problem_with_a_company's_investigation_into_an_existing_problem.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:57ebfb56b53d745eeae4c732227f4af83f98242b1083c3a63963ec9aabcfbec1
3
+ size 49789793
subproduct_prediction/issue_models/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ff6c316c3100e632b1cfafee59d81f52f07d75d6e8af9323e72e5dcb9997ed5b
3
+ size 132836007
subproduct_prediction/issue_models/problem_with_a_purchase_shown_on_your_statement.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cca681b5711e049df5e344596be0aa0f4a06db512aede2bba31899192aff2db8
3
+ size 13227946
subproduct_prediction/issue_models/written_notification_about_debt.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:69e250f983a5dcaf1630d27fe5386402f3b99d9e22ebed40904f3d790245e1ef
3
+ size 9169604
subproduct_prediction/models/Checking_saving_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:aa166dafab04f7c1ec8431cf5b9ccdfe9486abf0b7ad505ed835142a615029dd
3
+ size 67244100
subproduct_prediction/models/Credit_Prepaid_Card_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8e0d8cee975c35ce85b98db9005a4517fe95f9a8f8b3fcb5b50e9fecd1c0a003
3
+ size 44123155
subproduct_prediction/models/Credit_Reporting_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8ede473b35d13d58aa40501d27c6126403c5711dcedfcf0639ef472f7228967d
3
+ size 18568054
subproduct_prediction/models/Debt_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d6be682dfe69330f5154309f4a00e0df313c3d7a75ca5d005ab8dd2394cc4ffb
3
+ size 39776752
subproduct_prediction/models/Product_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:879ef7eea5e9d6e5e03c596bec4ac9cb18b9276ace228d53a2c44cf3912d280c
3
+ size 288515807
subproduct_prediction/models/loan_model.pkl ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:63c50de262139d08a1a3c380cf2a4fa94114273a73daffd04ef2cc94859a9259
3
+ size 23675105