Spaces:
Sleeping
Sleeping
Mahesh Babu
committed on
Commit
•
ac79280
1
Parent(s):
e877961
added preprocessing files
Browse files. This view is limited to 50 files because it contains too many changes.
See raw diff
- data_preprocessing_scripts/.DS_Store +0 -0
- data_preprocessing_scripts/Data Preprocessing Script.ipynb +576 -0
- data_preprocessing_scripts/data_splits/test-data-split_2022.csv +3 -0
- data_preprocessing_scripts/data_splits/test-data-split_2023.csv +3 -0
- data_preprocessing_scripts/data_splits/train-data-balanced.csv +3 -0
- data_preprocessing_scripts/data_splits/train-data-split_2022.csv +3 -0
- data_preprocessing_scripts/data_splits/train-data-split_2023.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_an_account_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_an_account_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_an_account_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_your_account_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_your_account_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/closing_your_account_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/managing_an_account_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/managing_an_account_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/managing_an_account_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_train_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_val_data.csv +3 -0
- data_preprocessing_scripts/issue_data_splits/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem_data.csv +3 -0
data_preprocessing_scripts/.DS_Store
ADDED
Binary file (6.15 kB). View file
|
|
data_preprocessing_scripts/Data Preprocessing Script.ipynb
ADDED
@@ -0,0 +1,576 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
{
|
2 |
+
"cells": [
|
3 |
+
{
|
4 |
+
"cell_type": "markdown",
|
5 |
+
"id": "5619ac0c-7398-4eb5-bdc0-8d338bf4a41f",
|
6 |
+
"metadata": {},
|
7 |
+
"source": [
|
8 |
+
"### Data Preprocessing"
|
9 |
+
]
|
10 |
+
},
|
11 |
+
{
|
12 |
+
"cell_type": "markdown",
|
13 |
+
"id": "8774cfd1-91b0-4d2d-b0f1-f057a5940cea",
|
14 |
+
"metadata": {
|
15 |
+
"jp-MarkdownHeadingCollapsed": true
|
16 |
+
},
|
17 |
+
"source": [
|
18 |
+
"### Importing Libraries"
|
19 |
+
]
|
20 |
+
},
|
21 |
+
{
|
22 |
+
"cell_type": "code",
|
23 |
+
"execution_count": 1,
|
24 |
+
"id": "b0b6e81d-c547-41ae-8a2b-4f8864cbc8d4",
|
25 |
+
"metadata": {},
|
26 |
+
"outputs": [],
|
27 |
+
"source": [
|
28 |
+
"import warnings\n",
|
29 |
+
"warnings.filterwarnings(\"ignore\")"
|
30 |
+
]
|
31 |
+
},
|
32 |
+
{
|
33 |
+
"cell_type": "code",
|
34 |
+
"execution_count": 2,
|
35 |
+
"id": "27dd12eb-2975-4b6f-9010-845ae2d23c8f",
|
36 |
+
"metadata": {},
|
37 |
+
"outputs": [],
|
38 |
+
"source": [
|
39 |
+
"import pandas as pd\n",
|
40 |
+
"from sklearn.model_selection import train_test_split\n",
|
41 |
+
"import os"
|
42 |
+
]
|
43 |
+
},
|
44 |
+
{
|
45 |
+
"cell_type": "markdown",
|
46 |
+
"id": "c49948e7-3018-4cf8-b3bc-0bae7e6a051f",
|
47 |
+
"metadata": {},
|
48 |
+
"source": [
|
49 |
+
"### Data Preprocessing Function"
|
50 |
+
]
|
51 |
+
},
|
52 |
+
{
|
53 |
+
"cell_type": "code",
|
54 |
+
"execution_count": 3,
|
55 |
+
"id": "02ee5dbe-dc71-4839-a19b-34699133f2be",
|
56 |
+
"metadata": {},
|
57 |
+
"outputs": [],
|
58 |
+
"source": [
|
59 |
+
"def load_and_clean_data(file_path):\n",
|
60 |
+
" \"\"\"\n",
|
61 |
+
" Load and clean the data from the specified CSV file.\n",
|
62 |
+
"\n",
|
63 |
+
" Parameters:\n",
|
64 |
+
" - file_path (str): Path to the CSV file containing the data.\n",
|
65 |
+
"\n",
|
66 |
+
" Returns:\n",
|
67 |
+
" - DataFrame: Cleaned DataFrame containing only the selected columns, with rows containing NaN values dropped.\n",
|
68 |
+
" \"\"\"\n",
|
69 |
+
" df = pd.read_csv(file_path)\n",
|
70 |
+
" df['Date received'] = pd.to_datetime(df['Date received'])\n",
|
71 |
+
" \n",
|
72 |
+
" cols_to_consider = ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative',\n",
|
73 |
+
" 'Company public response', 'Company', 'State', 'ZIP code', 'Date received']\n",
|
74 |
+
" \n",
|
75 |
+
" df_new = df[cols_to_consider]\n",
|
76 |
+
" \n",
|
77 |
+
" df_new = df_new.dropna()\n",
|
78 |
+
" \n",
|
79 |
+
" return df_new"
|
80 |
+
]
|
81 |
+
},
|
82 |
+
{
|
83 |
+
"cell_type": "code",
|
84 |
+
"execution_count": 4,
|
85 |
+
"id": "413135b7-1fb8-4cef-876e-99bfd1f148ac",
|
86 |
+
"metadata": {},
|
87 |
+
"outputs": [],
|
88 |
+
"source": [
|
89 |
+
"def filter_by_years(df, years):\n",
|
90 |
+
" \"\"\"\n",
|
91 |
+
" Filter the DataFrame to include only the rows corresponding to specified years.\n",
|
92 |
+
"\n",
|
93 |
+
" Parameters:\n",
|
94 |
+
" - df (DataFrame): The DataFrame containing data to filter.\n",
|
95 |
+
" - years (list of int): List of years to filter by.\n",
|
96 |
+
"\n",
|
97 |
+
" Returns:\n",
|
98 |
+
" - DataFrame: Filtered DataFrame containing rows corresponding to specified years.\n",
|
99 |
+
" \"\"\"\n",
|
100 |
+
" filtered_df = df[df['Date received'].dt.year.isin(years)].reset_index(drop=True)\n",
|
101 |
+
" return filtered_df"
|
102 |
+
]
|
103 |
+
},
|
104 |
+
{
|
105 |
+
"cell_type": "code",
|
106 |
+
"execution_count": 5,
|
107 |
+
"id": "9e3f199f-0ad9-40a3-82f1-065af9efa9f5",
|
108 |
+
"metadata": {},
|
109 |
+
"outputs": [],
|
110 |
+
"source": [
|
111 |
+
"def map_product_column(df):\n",
|
112 |
+
" \"\"\"\n",
|
113 |
+
" Map values in the 'Product' column of the DataFrame to a standardized set of categories.\n",
|
114 |
+
"\n",
|
115 |
+
" Parameters:\n",
|
116 |
+
" - df (DataFrame): The DataFrame containing the 'Product' column to map.\n",
|
117 |
+
"\n",
|
118 |
+
" Returns:\n",
|
119 |
+
" - DataFrame: DataFrame with the 'Product' column values mapped to standardized categories.\n",
|
120 |
+
" \"\"\"\n",
|
121 |
+
"\n",
|
122 |
+
" product_map = {'Credit reporting or other personal consumer reports': 'Credit Reporting',\n",
|
123 |
+
" 'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Reporting',\n",
|
124 |
+
" 'Payday loan, title loan, personal loan, or advance loan': 'Loans / Mortgage',\n",
|
125 |
+
" 'Payday loan, title loan, or personal loan': 'Loans / Mortgage',\n",
|
126 |
+
" 'Student loan': 'Loans / Mortgage',\n",
|
127 |
+
" 'Vehicle loan or lease': 'Loans / Mortgage',\n",
|
128 |
+
" 'Debt collection': 'Debt collection',\n",
|
129 |
+
" 'Credit card or prepaid card': 'Credit/Prepaid Card',\n",
|
130 |
+
" 'Credit card': 'Credit/Prepaid Card',\n",
|
131 |
+
" 'Prepaid card': 'Credit/Prepaid Card',\n",
|
132 |
+
" 'Mortgage': 'Loans / Mortgage',\n",
|
133 |
+
" 'Checking or savings account': 'Checking or savings account' \n",
|
134 |
+
" }\n",
|
135 |
+
" # Map 'Product' column\n",
|
136 |
+
" df.loc[:,'Product'] = df['Product'].map(product_map)\n",
|
137 |
+
" \n",
|
138 |
+
" return df\n"
|
139 |
+
]
|
140 |
+
},
|
141 |
+
{
|
142 |
+
"cell_type": "code",
|
143 |
+
"execution_count": 6,
|
144 |
+
"id": "03a2e0a2-75ff-4b33-b081-58e3d5c791c9",
|
145 |
+
"metadata": {},
|
146 |
+
"outputs": [],
|
147 |
+
"source": [
|
148 |
+
"def clean_narrative(df):\n",
|
149 |
+
"\n",
|
150 |
+
" \"\"\"\n",
|
151 |
+
" Clean the consumer complaint narratives in the DataFrame by excluding short and irrelevant complaints.\n",
|
152 |
+
"\n",
|
153 |
+
" Parameters:\n",
|
154 |
+
" - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
|
155 |
+
"\n",
|
156 |
+
" Returns:\n",
|
157 |
+
" - DataFrame: DataFrame with cleaned consumer complaint narratives.\n",
|
158 |
+
" \"\"\"\n",
|
159 |
+
"# Compute complaint length\n",
|
160 |
+
" df['complaint length'] = df['Consumer complaint narrative'].apply(lambda x : len(x))\n",
|
161 |
+
"\n",
|
162 |
+
" df = df[df['complaint length'] > 20]\n",
|
163 |
+
" \n",
|
164 |
+
" complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
|
165 |
+
" 'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
|
166 |
+
" 'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
|
167 |
+
" 'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
|
168 |
+
" \n",
|
169 |
+
" df = df[~df['Consumer complaint narrative'].isin(complaints_to_exclude)]\n",
|
170 |
+
" return df"
|
171 |
+
]
|
172 |
+
},
|
173 |
+
{
|
174 |
+
"cell_type": "code",
|
175 |
+
"execution_count": 7,
|
176 |
+
"id": "0cb28135-12d8-41fc-94a8-2968e558473b",
|
177 |
+
"metadata": {},
|
178 |
+
"outputs": [],
|
179 |
+
"source": [
|
180 |
+
"def filter_by_frequency(df):\n",
|
181 |
+
" \"\"\"\n",
|
182 |
+
" Filter the DataFrame based on the frequency of sub-issues and sub-products.\n",
|
183 |
+
"\n",
|
184 |
+
" Parameters:\n",
|
185 |
+
" - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
|
186 |
+
"\n",
|
187 |
+
" Returns:\n",
|
188 |
+
" - DataFrame: DataFrame filtered based on the frequency of sub-issues and sub-products.\n",
|
189 |
+
" \"\"\"\n",
|
190 |
+
" # Select sub-issues with frequency > 500\n",
|
191 |
+
" sub_issues_to_consider = df['Sub-issue'].value_counts()[df['Sub-issue'].value_counts() > 500].index\n",
|
192 |
+
"\n",
|
193 |
+
" # Filter DataFrame based on selected sub-issues\n",
|
194 |
+
" reduced_subissues = df[df['Sub-issue'].isin(sub_issues_to_consider)]\n",
|
195 |
+
" # Select sub-products with frequency > 100\n",
|
196 |
+
" sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
|
197 |
+
"\n",
|
198 |
+
" # Filter DataFrame based on selected sub-products\n",
|
199 |
+
" final_df = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]\n",
|
200 |
+
"\n",
|
201 |
+
" return final_df"
|
202 |
+
]
|
203 |
+
},
|
204 |
+
{
|
205 |
+
"cell_type": "code",
|
206 |
+
"execution_count": 8,
|
207 |
+
"id": "e102c902-645e-453e-9a67-07781ba6fc55",
|
208 |
+
"metadata": {},
|
209 |
+
"outputs": [],
|
210 |
+
"source": [
|
211 |
+
"def map_issue(df):\n",
|
212 |
+
" \"\"\"\n",
|
213 |
+
" Map the issues to a smaller, better-defined set of issues in the DataFrame.\n",
|
214 |
+
"\n",
|
215 |
+
" Parameters:\n",
|
216 |
+
" - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
|
217 |
+
"\n",
|
218 |
+
" Returns:\n",
|
219 |
+
" - DataFrame: DataFrame with the 'Issue' column mapped to appropriate issues.\n",
|
220 |
+
" \"\"\"\n",
|
221 |
+
" # Create a dictionary mapping issues to sub-issues\n",
|
222 |
+
" issues_to_subissues = {}\n",
|
223 |
+
" for issue in df['Issue'].value_counts().index:\n",
|
224 |
+
" issues_to_subissues[issue] = list(df[df['Issue'] == issue]['Sub-issue'].value_counts().to_dict().keys())\n",
|
225 |
+
"\n",
|
226 |
+
" # Separate issues with only one sub-issue and more than one sub-issue\n",
|
227 |
+
" one_subissue = {key: value for key, value in issues_to_subissues.items() if len(issues_to_subissues[key]) == 1}\n",
|
228 |
+
" more_than_one_subissue = {key: value for key, value in issues_to_subissues.items() if len(issues_to_subissues[key]) > 1}\n",
|
229 |
+
"\n",
|
230 |
+
" # Existing issue mapping for issues with more than one sub-issue\n",
|
231 |
+
" existing_issue_mapping = {issue: issue for issue in more_than_one_subissue}\n",
|
232 |
+
"\n",
|
233 |
+
" # Issue renaming based on provided mapping\n",
|
234 |
+
" issue_renaming = {\n",
|
235 |
+
" 'Problem with a lender or other company charging your account': 'Account Operations and Unauthorized Transaction Issues',\n",
|
236 |
+
" 'Opening an account': 'Account Operations and Unauthorized Transaction Issues',\n",
|
237 |
+
" 'Getting a credit card': 'Account Operations and Unauthorized Transaction Issues',\n",
|
238 |
+
"\n",
|
239 |
+
" 'Unable to get your credit report or credit score': 'Credit Report and Monitoring Issues',\n",
|
240 |
+
" 'Credit monitoring or identity theft protection services': 'Credit Report and Monitoring Issues',\n",
|
241 |
+
" 'Identity theft protection or other monitoring services': 'Credit Report and Monitoring Issues',\n",
|
242 |
+
"\n",
|
243 |
+
" 'Problem caused by your funds being low': 'Payment and Funds Management',\n",
|
244 |
+
" 'Problem when making payments': 'Payment and Funds Management',\n",
|
245 |
+
" 'Managing the loan or lease': 'Payment and Funds Management',\n",
|
246 |
+
"\n",
|
247 |
+
" 'False statements or representation': 'Disputes and Misrepresentations',\n",
|
248 |
+
" 'Fees or interest': 'Disputes and Misrepresentations',\n",
|
249 |
+
" 'Other features, terms, or problems': 'Disputes and Misrepresentations',\n",
|
250 |
+
"\n",
|
251 |
+
" 'Took or threatened to take negative or legal action': 'Legal and Threat Actions'\n",
|
252 |
+
" }\n",
|
253 |
+
"\n",
|
254 |
+
" # Combine issue renaming and existing issue mapping\n",
|
255 |
+
" issues_mapping = {**issue_renaming, **existing_issue_mapping}\n",
|
256 |
+
"\n",
|
257 |
+
" # Map 'Issue' column using the defined mapping dictionary\n",
|
258 |
+
" df['Issue'] = df['Issue'].apply(lambda x: issues_mapping[x])\n",
|
259 |
+
"\n",
|
260 |
+
" return df"
|
261 |
+
]
|
262 |
+
},
|
263 |
+
{
|
264 |
+
"cell_type": "code",
|
265 |
+
"execution_count": 9,
|
266 |
+
"id": "b8a50ac3-1a5b-4c78-92bb-da14d38a679c",
|
267 |
+
"metadata": {},
|
268 |
+
"outputs": [],
|
269 |
+
"source": [
|
270 |
+
"def split_and_save_data(df,year, test_size=0.25, random_state=42, directory_to_save='./data_splits/'):\n",
|
271 |
+
" \"\"\"\n",
|
272 |
+
" Split the input DataFrame into train and test sets, and save them as CSV files with the specified year included in the file names.\n",
|
273 |
+
"\n",
|
274 |
+
" Parameters:\n",
|
275 |
+
" - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
|
276 |
+
" - year (int): The year associated with the data split.\n",
|
277 |
+
" - test_size (float, optional): The proportion of the dataset to include in the test split. Default is 0.25.\n",
|
278 |
+
" - random_state (int, optional): The seed used by the random number generator. Default is 42.\n",
|
279 |
+
" - directory_to_save (str, optional): The directory path to save the data splits. Default is './data_splits/'.\n",
|
280 |
+
"\n",
|
281 |
+
" Returns:\n",
|
282 |
+
" - None\n",
|
283 |
+
" \"\"\"\n",
|
284 |
+
" # Split the data into train and test sets\n",
|
285 |
+
" X = df['Consumer complaint narrative']\n",
|
286 |
+
" y = df[['Product', 'Sub-product', 'Issue', 'Sub-issue']]\n",
|
287 |
+
" X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y['Product'], test_size=test_size, random_state=random_state)\n",
|
288 |
+
"\n",
|
289 |
+
" # Concatenate X_train and y_train, and X_test and y_test respectively\n",
|
290 |
+
" train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)\n",
|
291 |
+
" test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)\n",
|
292 |
+
"\n",
|
293 |
+
" # Create directory if it doesn't exist\n",
|
294 |
+
" if not os.path.exists(directory_to_save):\n",
|
295 |
+
" os.makedirs(directory_to_save)\n",
|
296 |
+
" \n",
|
297 |
+
" # Save train and test data as CSV files with the year included in the file names\n",
|
298 |
+
" train_df.to_csv(os.path.join(directory_to_save, f'train-data-split_{year}.csv'), index=False)\n",
|
299 |
+
" test_df.to_csv(os.path.join(directory_to_save, f'test-data-split_{year}.csv'), index=False)"
|
300 |
+
]
|
301 |
+
},
|
302 |
+
{
|
303 |
+
"cell_type": "markdown",
|
304 |
+
"id": "a11fbb2c-548e-4bf8-bb41-abe22a4bd485",
|
305 |
+
"metadata": {
|
306 |
+
"jp-MarkdownHeadingCollapsed": true
|
307 |
+
},
|
308 |
+
"source": [
|
309 |
+
"### Main Cleaning Pipeline"
|
310 |
+
]
|
311 |
+
},
|
312 |
+
{
|
313 |
+
"cell_type": "code",
|
314 |
+
"execution_count": 13,
|
315 |
+
"id": "479d2872-0202-4eac-85d9-ce2b532881dd",
|
316 |
+
"metadata": {},
|
317 |
+
"outputs": [],
|
318 |
+
"source": [
|
319 |
+
"def main(file_path, year,year_name):\n",
|
320 |
+
" # Load and clean the data\n",
|
321 |
+
" df_cleaned = load_and_clean_data(file_path)\n",
|
322 |
+
" \n",
|
323 |
+
" # Filter the data by years\n",
|
324 |
+
" df_filtered = filter_by_years(df_cleaned, year)\n",
|
325 |
+
" \n",
|
326 |
+
" # Map the 'Product' column\n",
|
327 |
+
" df_mapped = map_product_column(df_filtered)\n",
|
328 |
+
" \n",
|
329 |
+
" # Clean the customer narratives in the data\n",
|
330 |
+
" df_clean_narrative = clean_narrative(df_mapped)\n",
|
331 |
+
"\n",
|
332 |
+
" # Clean the data by frequency\n",
|
333 |
+
" df_freq = filter_by_frequency(df_clean_narrative)\n",
|
334 |
+
"\n",
|
335 |
+
" #Mapping the Issues and filtering Sub Issues\n",
|
336 |
+
" df_final = map_issue(df_freq)\n",
|
337 |
+
" \n",
|
338 |
+
" # Split and save the data\n",
|
339 |
+
" split_and_save_data(df_final,year_name)\n",
|
340 |
+
" return df_final"
|
341 |
+
]
|
342 |
+
},
|
343 |
+
{
|
344 |
+
"cell_type": "markdown",
|
345 |
+
"id": "60c6f92a-de2b-4c8a-817e-fb49a68f87ba",
|
346 |
+
"metadata": {},
|
347 |
+
"source": [
|
348 |
+
"### Calling the data preprocessing script"
|
349 |
+
]
|
350 |
+
},
|
351 |
+
{
|
352 |
+
"cell_type": "code",
|
353 |
+
"execution_count": 14,
|
354 |
+
"id": "76932edd-4de8-47e1-a1a1-3cc4b07d6850",
|
355 |
+
"metadata": {},
|
356 |
+
"outputs": [],
|
357 |
+
"source": [
|
358 |
+
"file_path = 'complaints.csv'\n",
|
359 |
+
"years_to_include = [2023]\n",
|
360 |
+
"year_name=2023\n",
|
361 |
+
"df_final=main(file_path, years_to_include,year_name)"
|
362 |
+
]
|
363 |
+
},
|
364 |
+
{
|
365 |
+
"cell_type": "code",
|
366 |
+
"execution_count": 15,
|
367 |
+
"id": "b55f4b09-6c89-470b-a4f0-60c0148534ec",
|
368 |
+
"metadata": {},
|
369 |
+
"outputs": [
|
370 |
+
{
|
371 |
+
"data": {
|
372 |
+
"text/plain": [
|
373 |
+
"(247517, 11)"
|
374 |
+
]
|
375 |
+
},
|
376 |
+
"execution_count": 15,
|
377 |
+
"metadata": {},
|
378 |
+
"output_type": "execute_result"
|
379 |
+
}
|
380 |
+
],
|
381 |
+
"source": [
|
382 |
+
"df_final.shape"
|
383 |
+
]
|
384 |
+
},
|
385 |
+
{
|
386 |
+
"cell_type": "code",
|
387 |
+
"execution_count": 16,
|
388 |
+
"id": "ed38bd8a-2004-435a-87b2-2bcc43dbd565",
|
389 |
+
"metadata": {},
|
390 |
+
"outputs": [
|
391 |
+
{
|
392 |
+
"data": {
|
393 |
+
"text/plain": [
|
394 |
+
"Credit Reporting 211695\n",
|
395 |
+
"Checking or savings account 12285\n",
|
396 |
+
"Credit/Prepaid Card 11975\n",
|
397 |
+
"Debt collection 9380\n",
|
398 |
+
"Loans / Mortgage 2182\n",
|
399 |
+
"Name: Product, dtype: int64"
|
400 |
+
]
|
401 |
+
},
|
402 |
+
"execution_count": 16,
|
403 |
+
"metadata": {},
|
404 |
+
"output_type": "execute_result"
|
405 |
+
}
|
406 |
+
],
|
407 |
+
"source": [
|
408 |
+
"df_final.Product.value_counts()"
|
409 |
+
]
|
410 |
+
},
|
411 |
+
{
|
412 |
+
"cell_type": "code",
|
413 |
+
"execution_count": 17,
|
414 |
+
"id": "dab29296-8a5b-44c3-8c6e-7b60e062343f",
|
415 |
+
"metadata": {},
|
416 |
+
"outputs": [
|
417 |
+
{
|
418 |
+
"data": {
|
419 |
+
"text/plain": [
|
420 |
+
"Credit reporting 210735\n",
|
421 |
+
"General-purpose credit card or charge card 10668\n",
|
422 |
+
"Checking account 10409\n",
|
423 |
+
"Other debt 3041\n",
|
424 |
+
"I do not know 2316\n",
|
425 |
+
"Credit card debt 1652\n",
|
426 |
+
"Federal student loan servicing 1344\n",
|
427 |
+
"Store credit card 1307\n",
|
428 |
+
"Medical debt 1053\n",
|
429 |
+
"Savings account 989\n",
|
430 |
+
"Other personal consumer report 960\n",
|
431 |
+
"Loan 732\n",
|
432 |
+
"Other banking product or service 725\n",
|
433 |
+
"Auto debt 581\n",
|
434 |
+
"Telecommunications debt 419\n",
|
435 |
+
"Rental debt 179\n",
|
436 |
+
"CD (Certificate of Deposit) 162\n",
|
437 |
+
"Mortgage debt 139\n",
|
438 |
+
"Conventional home mortgage 106\n",
|
439 |
+
"Name: Sub-product, dtype: int64"
|
440 |
+
]
|
441 |
+
},
|
442 |
+
"execution_count": 17,
|
443 |
+
"metadata": {},
|
444 |
+
"output_type": "execute_result"
|
445 |
+
}
|
446 |
+
],
|
447 |
+
"source": [
|
448 |
+
"df_final['Sub-product'].value_counts()"
|
449 |
+
]
|
450 |
+
},
|
451 |
+
{
|
452 |
+
"cell_type": "code",
|
453 |
+
"execution_count": 19,
|
454 |
+
"id": "10127c9e-ea4a-49ce-a5f2-76991abde850",
|
455 |
+
"metadata": {},
|
456 |
+
"outputs": [
|
457 |
+
{
|
458 |
+
"data": {
|
459 |
+
"text/plain": [
|
460 |
+
"Incorrect information on your report 87200\n",
|
461 |
+
"Improper use of your report 61868\n",
|
462 |
+
"Problem with a credit reporting company's investigation into an existing problem 45371\n",
|
463 |
+
"Problem with a company's investigation into an existing problem 20985\n",
|
464 |
+
"Managing an account 7367\n",
|
465 |
+
"Attempts to collect debt not owed 5453\n",
|
466 |
+
"Problem with a purchase shown on your statement 3253\n",
|
467 |
+
"Account Operations and Unauthorized Transaction Issues 2450\n",
|
468 |
+
"Written notification about debt 2404\n",
|
469 |
+
"Disputes and Misrepresentations 2311\n",
|
470 |
+
"Payment and Funds Management 2259\n",
|
471 |
+
"Closing an account 1975\n",
|
472 |
+
"Credit Report and Monitoring Issues 1630\n",
|
473 |
+
"Dealing with your lender or servicer 1293\n",
|
474 |
+
"Closing your account 813\n",
|
475 |
+
"Legal and Threat Actions 662\n",
|
476 |
+
"Problem with a company's investigation into an existing issue 223\n",
|
477 |
+
"Name: Issue, dtype: int64"
|
478 |
+
]
|
479 |
+
},
|
480 |
+
"execution_count": 19,
|
481 |
+
"metadata": {},
|
482 |
+
"output_type": "execute_result"
|
483 |
+
}
|
484 |
+
],
|
485 |
+
"source": [
|
486 |
+
"df_final['Issue'].value_counts()"
|
487 |
+
]
|
488 |
+
},
|
489 |
+
{
|
490 |
+
"cell_type": "code",
|
491 |
+
"execution_count": 20,
|
492 |
+
"id": "e774f715-6d3d-4035-9f28-753801490d13",
|
493 |
+
"metadata": {},
|
494 |
+
"outputs": [
|
495 |
+
{
|
496 |
+
"data": {
|
497 |
+
"text/plain": [
|
498 |
+
"Information belongs to someone else 57850\n",
|
499 |
+
"Reporting company used your report improperly 48732\n",
|
500 |
+
"Their investigation did not fix an error on your report 45395\n",
|
501 |
+
"Credit inquiries on your report that you don't recognize 13136\n",
|
502 |
+
"Account status incorrect 10208\n",
|
503 |
+
"Account information incorrect 9267\n",
|
504 |
+
"Was not notified of investigation status or results 9200\n",
|
505 |
+
"Investigation took more than 30 days 8928\n",
|
506 |
+
"Personal information incorrect 5900\n",
|
507 |
+
"Debt is not yours 2785\n",
|
508 |
+
"Deposits and withdrawals 2626\n",
|
509 |
+
"Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
|
510 |
+
"Didn't receive enough information to verify debt 1777\n",
|
511 |
+
"Debt was result of identity theft 1727\n",
|
512 |
+
"Old information reappears or never goes away 1714\n",
|
513 |
+
"Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
|
514 |
+
"Company closed your account 1517\n",
|
515 |
+
"Problem using a debit or ATM card 1503\n",
|
516 |
+
"Public record information inaccurate 1384\n",
|
517 |
+
"Transaction was not authorized 1378\n",
|
518 |
+
"Problem with personal statement of dispute 1352\n",
|
519 |
+
"Other problem getting your report or credit score 1109\n",
|
520 |
+
"Card was charged for something you did not purchase with the card 964\n",
|
521 |
+
"Banking errors 958\n",
|
522 |
+
"Funds not handled or disbursed as instructed 955\n",
|
523 |
+
"Overdrafts and overdraft fees 951\n",
|
524 |
+
"Debt was paid 941\n",
|
525 |
+
"Information is missing that should be on the report 877\n",
|
526 |
+
"Attempted to collect wrong amount 861\n",
|
527 |
+
"Problem during payment process 840\n",
|
528 |
+
"Fee problem 764\n",
|
529 |
+
"Problem with fees 749\n",
|
530 |
+
"Other problem 701\n",
|
531 |
+
"Received bad information about your loan 677\n",
|
532 |
+
"Funds not received from closed account 673\n",
|
533 |
+
"Threatened or suggested your credit would be damaged 662\n",
|
534 |
+
"Didn't receive notice of right to dispute 627\n",
|
535 |
+
"Trouble with how payments are being handled 616\n",
|
536 |
+
"Can't close your account 598\n",
|
537 |
+
"Problem accessing account 561\n",
|
538 |
+
"Account opened as a result of fraud 561\n",
|
539 |
+
"Problem canceling credit monitoring or identify theft protection service 521\n",
|
540 |
+
"Card opened as result of identity theft or fraud 511\n",
|
541 |
+
"Billing problem 468\n",
|
542 |
+
"Name: Sub-issue, dtype: int64"
|
543 |
+
]
|
544 |
+
},
|
545 |
+
"execution_count": 20,
|
546 |
+
"metadata": {},
|
547 |
+
"output_type": "execute_result"
|
548 |
+
}
|
549 |
+
],
|
550 |
+
"source": [
|
551 |
+
"df_final['Sub-issue'].value_counts()"
|
552 |
+
]
|
553 |
+
}
|
554 |
+
],
|
555 |
+
"metadata": {
|
556 |
+
"kernelspec": {
|
557 |
+
"display_name": "Python 3 (ipykernel)",
|
558 |
+
"language": "python",
|
559 |
+
"name": "python3"
|
560 |
+
},
|
561 |
+
"language_info": {
|
562 |
+
"codemirror_mode": {
|
563 |
+
"name": "ipython",
|
564 |
+
"version": 3
|
565 |
+
},
|
566 |
+
"file_extension": ".py",
|
567 |
+
"mimetype": "text/x-python",
|
568 |
+
"name": "python",
|
569 |
+
"nbconvert_exporter": "python",
|
570 |
+
"pygments_lexer": "ipython3",
|
571 |
+
"version": "3.9.19"
|
572 |
+
}
|
573 |
+
},
|
574 |
+
"nbformat": 4,
|
575 |
+
"nbformat_minor": 5
|
576 |
+
}
|
data_preprocessing_scripts/data_splits/test-data-split_2022.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4c203087801339af843b99a42a2187585d4af3445aab36c59c7cd7653b87587e
|
3 |
+
size 62357439
|
data_preprocessing_scripts/data_splits/test-data-split_2023.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:9a253ea22f1b90530a803e61f1624d3b270ef62c88b89aece879cbe793e64eea
|
3 |
+
size 68758811
|
data_preprocessing_scripts/data_splits/train-data-balanced.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:acbfac231d968f341f65ce49dd5369f75fea7ab792e29c6116dfafe113589186
|
3 |
+
size 70018604
|
data_preprocessing_scripts/data_splits/train-data-split_2022.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53dac829454c1da6425cfa6d07ad668e2150d4854b6a470c413bb267f7f0f51d
|
3 |
+
size 185727392
|
data_preprocessing_scripts/data_splits/train-data-split_2023.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:fd0cab77284db3c21e6d186d5a2d46ba27557a3320de250c19784822c73fc443
|
3 |
+
size 205853579
|
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:204e335acf3df6981acc48e8aa845b8f2f1ca356d961f99f3e973aca273599bc
|
3 |
+
size 2090623
|
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:d1f00513b1d44dbc7190ff68aa135655fdd22fe4b38bbe042a7713e68fc85ed8
|
3 |
+
size 1883764
|
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3e46867c68fee83e0994f36aa6e39be17a2e5e67eb05f17378aedc7bc01350ce
|
3 |
+
size 206915
|
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:126f320e8e8df97c756c507ff4eb5c669f6058c92a3ec25f52c5f7b81599719b
|
3 |
+
size 4135658
|
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e96f4e5342aa61f35cdd4eb42bd81a092fc25bc168f70e1f7be3322b8ea64cc4
|
3 |
+
size 3718608
|
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:036e70ce151a32c8a6757c2abef7f2547542d4c11c6541c6b6d705bd0063fd85
|
3 |
+
size 417106
|
data_preprocessing_scripts/issue_data_splits/closing_an_account_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84dbff3ce91e5c3433d3a0ae1cf543671c4025c2a9c3b9bcb40656975e17852d
|
3 |
+
size 1799450
|
data_preprocessing_scripts/issue_data_splits/closing_an_account_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:216a3640f091ef01b069809e4895528912bc2665358971d0eb30eee103892a68
|
3 |
+
size 1587242
|
data_preprocessing_scripts/issue_data_splits/closing_an_account_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:704275d826033848072c0d747e2595745185cd385c4a65b4b34cb46b849eb388
|
3 |
+
size 212264
|
data_preprocessing_scripts/issue_data_splits/closing_your_account_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e8428ebcc053869342b7343c709c2eef18d1a88973601dd4cead6141034c0ff2
|
3 |
+
size 799995
|
data_preprocessing_scripts/issue_data_splits/closing_your_account_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2a4de5779cddd84b99a6695dca6e58d36ef93cd5d670fbee821a3b9ff2b39fc8
|
3 |
+
size 736684
|
data_preprocessing_scripts/issue_data_splits/closing_your_account_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f0e2e7589b709c008ff61ee8a2c5ac805462c4b0028074c5abb0e6c824437903
|
3 |
+
size 63367
|
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7029133836244969f80daa7f682a53951e7bfdd1fc304ccb65dcd4f126adc526
|
3 |
+
size 2353757
|
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:791d59058ebd88fec2f991f66b1e4fd557322179b4435215dcfcbc33f8984429
|
3 |
+
size 2115089
|
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:53fbdf8fca538f24db0dc091d8f39a155a1e2952fdf2b96a62b9317d7dab7368
|
3 |
+
size 238724
|
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:e65c17415c7deba2c51c98fe5a96e563aff73cb9d2354a3c318f9d04ddffe746
|
3 |
+
size 1500034
|
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:235e66bcf2e35fd99d858d301ffd7a2831b1f33c09098cd2fd503a6642a18048
|
3 |
+
size 1370056
|
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:5f8289ab48f58adcacb8a81078cb8c9e216d61dedb87b907ac5f850ea7244218
|
3 |
+
size 130034
|
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:cfa794e53b61d14225d6c03bc749fb83ad315ac5f485c4bcf946a6e0cc0fb34a
|
3 |
+
size 2207971
|
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:00081a0b4c6c429f141df193159b55c550c2f15007970a54c12b10e25166e970
|
3 |
+
size 1981586
|
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:66990f81784bae88083bb6cc6fa0582d94c0410050d70c261b5561547d18492d
|
3 |
+
size 226441
|
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:22ced2ef476e18679dee486e7b81ad35e6d66a6eb7ad936b92efeb85d30e65c9
|
3 |
+
size 51426802
|
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:b2dde33856c2cb76ef0674aa869cd6ccc6b96b548bb283d0c39c010aca0f722a
|
3 |
+
size 46289254
|
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:df20ab17ff8751b8251244684294af3ebc11979b2bc699d7e207abe2bc511942
|
3 |
+
size 5137604
|
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:c557f260383639a5248caae11d252de71cd72e88a23e20c7d32c1f65a42232bb
|
3 |
+
size 67002391
|
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:83a947015eb1b0d8a3958d70e6a86df6dd79436fe4bb42cf7dcae5238f5a1b95
|
3 |
+
size 60306312
|
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:db7603a2b5d168bf2a6ee56dfaa5ad909a06e35ad345c5772344d6f77765ce96
|
3 |
+
size 6696135
|
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:06f9c755e499c83d36f8c5cb20d813cb5ac0db73d188c32a334ea9a47b6efb52
|
3 |
+
size 612393
|
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3dd9fd5013473734be09aac6b9563f2f10fbea3156a72d548e42bbdfef5c891a
|
3 |
+
size 543877
|
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:7623b91317d2f8746f340422183fd089a3b0ace3647bdf3b71692928dbf5e3a9
|
3 |
+
size 68572
|
data_preprocessing_scripts/issue_data_splits/managing_an_account_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f46c2f03a742087ee42a05c22444648e56ad25bf551d19e28bd09b06a3425ef7
|
3 |
+
size 7169538
|
data_preprocessing_scripts/issue_data_splits/managing_an_account_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:09d0d485f89d83edb41c61262e2fe5963bfb9100f6f54ef6b16788dc967326ff
|
3 |
+
size 6420679
|
data_preprocessing_scripts/issue_data_splits/managing_an_account_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ac9882076a622b2c2bc1399fdb27b4ca396c6d6cd7623da4ac41af0456471261
|
3 |
+
size 748915
|
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:15719bc2484dc054a68e9c7d38bc110aedf6fa82962058088f8f2c1dc9d2384f
|
3 |
+
size 1854714
|
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:2d12880274bfdccc9e99f2b973c5f7bf7239e2e8fdda2e08833d1a68a013c3b0
|
3 |
+
size 1680205
|
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:ec358880a0bc9e79f74843b91e0607e5ed2930831715a539231a0b28d9158681
|
3 |
+
size 174565
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:51956f542a4924d8e6f5c363210406a4ccfa553f5703bdb61b9c252d3c74198b
|
3 |
+
size 320657
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:f1011d1017bebc41a4df1217ee9d99604c9e1aa1e11f4f386fb34896383e8dca
|
3 |
+
size 284289
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:84f7f02d80205cf532ebbd6307e849f71eaeab724e0c5c94cf7de9cca4397d32
|
3 |
+
size 36424
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:bbf2935aed3387bc5bcf55dc3afaf29794eb8f4b084ec843b092da96b2c7590c
|
3 |
+
size 15288595
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_train_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:81ae7dd9f926f0cce7be50f1a21327b4bd2d416edd2524b87a1a5617457447ef
|
3 |
+
size 13714937
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_val_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:4ff329c35b163e0ff2d3e868cc41a1046f045903ae0fede9cd341655f906b310
|
3 |
+
size 1573714
|
data_preprocessing_scripts/issue_data_splits/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem_data.csv
ADDED
@@ -0,0 +1,3 @@
|
|
|
|
|
|
|
|
|
1 |
+
version https://git-lfs.github.com/spec/v1
|
2 |
+
oid sha256:3324a8147b120f23f086f69a7a3ccea1b893ec6769ee470e3121dfe9bec059a0
|
3 |
+
size 35696420
|