Mahesh Babu committed on
Commit
ac79280
1 Parent(s): e877961

added preprocessing files

Browse files
This view is limited to 50 files because it contains too many changes.   See raw diff
Files changed (50) hide show
  1. data_preprocessing_scripts/.DS_Store +0 -0
  2. data_preprocessing_scripts/Data Preprocessing Script.ipynb +576 -0
  3. data_preprocessing_scripts/data_splits/test-data-split_2022.csv +3 -0
  4. data_preprocessing_scripts/data_splits/test-data-split_2023.csv +3 -0
  5. data_preprocessing_scripts/data_splits/train-data-balanced.csv +3 -0
  6. data_preprocessing_scripts/data_splits/train-data-split_2022.csv +3 -0
  7. data_preprocessing_scripts/data_splits/train-data-split_2023.csv +3 -0
  8. data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_data.csv +3 -0
  9. data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_train_data.csv +3 -0
  10. data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_val_data.csv +3 -0
  11. data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_data.csv +3 -0
  12. data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_train_data.csv +3 -0
  13. data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_val_data.csv +3 -0
  14. data_preprocessing_scripts/issue_data_splits/closing_an_account_data.csv +3 -0
  15. data_preprocessing_scripts/issue_data_splits/closing_an_account_train_data.csv +3 -0
  16. data_preprocessing_scripts/issue_data_splits/closing_an_account_val_data.csv +3 -0
  17. data_preprocessing_scripts/issue_data_splits/closing_your_account_data.csv +3 -0
  18. data_preprocessing_scripts/issue_data_splits/closing_your_account_train_data.csv +3 -0
  19. data_preprocessing_scripts/issue_data_splits/closing_your_account_val_data.csv +3 -0
  20. data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_data.csv +3 -0
  21. data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_train_data.csv +3 -0
  22. data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_val_data.csv +3 -0
  23. data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_data.csv +3 -0
  24. data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_train_data.csv +3 -0
  25. data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_val_data.csv +3 -0
  26. data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_data.csv +3 -0
  27. data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_train_data.csv +3 -0
  28. data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_val_data.csv +3 -0
  29. data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_data.csv +3 -0
  30. data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_train_data.csv +3 -0
  31. data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_val_data.csv +3 -0
  32. data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_data.csv +3 -0
  33. data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_train_data.csv +3 -0
  34. data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_val_data.csv +3 -0
  35. data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_data.csv +3 -0
  36. data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_train_data.csv +3 -0
  37. data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_val_data.csv +3 -0
  38. data_preprocessing_scripts/issue_data_splits/managing_an_account_data.csv +3 -0
  39. data_preprocessing_scripts/issue_data_splits/managing_an_account_train_data.csv +3 -0
  40. data_preprocessing_scripts/issue_data_splits/managing_an_account_val_data.csv +3 -0
  41. data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_data.csv +3 -0
  42. data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_train_data.csv +3 -0
  43. data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_val_data.csv +3 -0
  44. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_data.csv +3 -0
  45. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_train_data.csv +3 -0
  46. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_val_data.csv +3 -0
  47. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_data.csv +3 -0
  48. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_train_data.csv +3 -0
  49. data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_val_data.csv +3 -0
  50. data_preprocessing_scripts/issue_data_splits/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem_data.csv +3 -0
data_preprocessing_scripts/.DS_Store ADDED
Binary file (6.15 kB). View file
 
data_preprocessing_scripts/Data Preprocessing Script.ipynb ADDED
@@ -0,0 +1,576 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "markdown",
5
+ "id": "5619ac0c-7398-4eb5-bdc0-8d338bf4a41f",
6
+ "metadata": {},
7
+ "source": [
8
+ "### Data Preprocessing"
9
+ ]
10
+ },
11
+ {
12
+ "cell_type": "markdown",
13
+ "id": "8774cfd1-91b0-4d2d-b0f1-f057a5940cea",
14
+ "metadata": {
15
+ "jp-MarkdownHeadingCollapsed": true
16
+ },
17
+ "source": [
18
+ "### Importing Libraries"
19
+ ]
20
+ },
21
+ {
22
+ "cell_type": "code",
23
+ "execution_count": 1,
24
+ "id": "b0b6e81d-c547-41ae-8a2b-4f8864cbc8d4",
25
+ "metadata": {},
26
+ "outputs": [],
27
+ "source": [
28
+ "import warnings\n",
29
+ "warnings.filterwarnings(\"ignore\")"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": 2,
35
+ "id": "27dd12eb-2975-4b6f-9010-845ae2d23c8f",
36
+ "metadata": {},
37
+ "outputs": [],
38
+ "source": [
39
+ "import pandas as pd\n",
40
+ "from sklearn.model_selection import train_test_split\n",
41
+ "import os"
42
+ ]
43
+ },
44
+ {
45
+ "cell_type": "markdown",
46
+ "id": "c49948e7-3018-4cf8-b3bc-0bae7e6a051f",
47
+ "metadata": {},
48
+ "source": [
49
+ "### Data Preprocessing Function"
50
+ ]
51
+ },
52
+ {
53
+ "cell_type": "code",
54
+ "execution_count": 3,
55
+ "id": "02ee5dbe-dc71-4839-a19b-34699133f2be",
56
+ "metadata": {},
57
+ "outputs": [],
58
+ "source": [
59
+ "def load_and_clean_data(file_path):\n",
60
+ " \"\"\"\n",
61
+ " Load and clean the data from the specified CSV file.\n",
62
+ "\n",
63
+ " Parameters:\n",
64
+ " - file_path (str): Path to the CSV file containing the data.\n",
65
+ "\n",
66
+ " Returns:\n",
67
+ " - DataFrame: DataFrame restricted to the selected columns, with every row that contains a NaN value dropped.\n",
68
+ " \"\"\"\n",
69
+ " df = pd.read_csv(file_path)\n",
70
+ " df['Date received'] = pd.to_datetime(df['Date received'])\n",
71
+ " \n",
72
+ " cols_to_consider = ['Product', 'Sub-product', 'Issue', 'Sub-issue', 'Consumer complaint narrative',\n",
73
+ " 'Company public response', 'Company', 'State', 'ZIP code', 'Date received']\n",
74
+ " \n",
75
+ " df_new = df[cols_to_consider]\n",
76
+ " \n",
77
+ " df_new = df_new.dropna()\n",
78
+ " \n",
79
+ " return df_new"
80
+ ]
81
+ },
82
+ {
83
+ "cell_type": "code",
84
+ "execution_count": 4,
85
+ "id": "413135b7-1fb8-4cef-876e-99bfd1f148ac",
86
+ "metadata": {},
87
+ "outputs": [],
88
+ "source": [
89
+ "def filter_by_years(df, years):\n",
90
+ " \"\"\"\n",
91
+ " Filter the DataFrame to include only the rows corresponding to specified years.\n",
92
+ "\n",
93
+ " Parameters:\n",
94
+ " - df (DataFrame): The DataFrame containing data to filter.\n",
95
+ " - years (list of int): List of years to filter by.\n",
96
+ "\n",
97
+ " Returns:\n",
98
+ " - DataFrame: Filtered DataFrame containing rows corresponding to specified years.\n",
99
+ " \"\"\"\n",
100
+ " filtered_df = df[df['Date received'].dt.year.isin(years)].reset_index(drop=True)\n",
101
+ " return filtered_df"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": 5,
107
+ "id": "9e3f199f-0ad9-40a3-82f1-065af9efa9f5",
108
+ "metadata": {},
109
+ "outputs": [],
110
+ "source": [
111
+ "def map_product_column(df):\n",
112
+ " \"\"\"\n",
113
+ " Map values in the 'Product' column of the DataFrame to a standardized set of categories.\n",
114
+ "\n",
115
+ " Parameters:\n",
116
+ " - df (DataFrame): The DataFrame containing the 'Product' column to map.\n",
117
+ "\n",
118
+ " Returns:\n",
119
+ " - DataFrame: DataFrame with the 'Product' column values mapped to standardized categories.\n",
120
+ " \"\"\"\n",
121
+ "\n",
122
+ " product_map = {'Credit reporting or other personal consumer reports': 'Credit Reporting',\n",
123
+ " 'Credit reporting, credit repair services, or other personal consumer reports': 'Credit Reporting',\n",
124
+ " 'Payday loan, title loan, personal loan, or advance loan': 'Loans / Mortgage',\n",
125
+ " 'Payday loan, title loan, or personal loan': 'Loans / Mortgage',\n",
126
+ " 'Student loan': 'Loans / Mortgage',\n",
127
+ " 'Vehicle loan or lease': 'Loans / Mortgage',\n",
128
+ " 'Debt collection': 'Debt collection',\n",
129
+ " 'Credit card or prepaid card': 'Credit/Prepaid Card',\n",
130
+ " 'Credit card': 'Credit/Prepaid Card',\n",
131
+ " 'Prepaid card': 'Credit/Prepaid Card',\n",
132
+ " 'Mortgage': 'Loans / Mortgage',\n",
133
+ " 'Checking or savings account': 'Checking or savings account' \n",
134
+ " }\n",
135
+ " # Map 'Product' column\n",
136
+ " df.loc[:,'Product'] = df['Product'].map(product_map)\n",
137
+ " \n",
138
+ " return df\n"
139
+ ]
140
+ },
141
+ {
142
+ "cell_type": "code",
143
+ "execution_count": 6,
144
+ "id": "03a2e0a2-75ff-4b33-b081-58e3d5c791c9",
145
+ "metadata": {},
146
+ "outputs": [],
147
+ "source": [
148
+ "def clean_narrative(df):\n",
149
+ "\n",
150
+ " \"\"\"\n",
151
+ " Clean the consumer complaint narratives in the DataFrame by dropping complaints of 20 characters or fewer and complaints that exactly match a known placeholder text.\n",
152
+ "\n",
153
+ " Parameters:\n",
154
+ " - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
155
+ "\n",
156
+ " Returns:\n",
157
+ " - DataFrame: DataFrame with cleaned consumer complaint narratives.\n",
158
+ " \"\"\"\n",
159
+ "# Compute complaint length\n",
160
+ " df['complaint length'] = df['Consumer complaint narrative'].apply(lambda x : len(x))\n",
161
+ "\n",
162
+ " df = df[df['complaint length'] > 20]\n",
163
+ " \n",
164
+ " complaints_to_exclude = ['See document attached', 'See the attached documents.', 'Incorrect information on my credit report', 'incorrect information on my credit report',\n",
165
+ " 'please see attached file','Please see documents Attached','Incorrect information on my credit report.', 'Please see attached file', 'see attached',\n",
166
+ " 'See attached', 'SEE ATTACHED DOCUMENTS', 'See Attached', 'SEE ATTACHMENT', 'SEE ATTACHMENTS', \n",
167
+ " 'XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX']\n",
168
+ " \n",
169
+ " df = df[~df['Consumer complaint narrative'].isin(complaints_to_exclude)]\n",
170
+ " return df"
171
+ ]
172
+ },
173
+ {
174
+ "cell_type": "code",
175
+ "execution_count": 7,
176
+ "id": "0cb28135-12d8-41fc-94a8-2968e558473b",
177
+ "metadata": {},
178
+ "outputs": [],
179
+ "source": [
180
+ "def filter_by_frequency(df):\n",
181
+ " \"\"\"\n",
182
+ " Filter the DataFrame by frequency: keep sub-issues that occur more than 500 times, then keep sub-products that occur more than 100 times in the remaining rows.\n",
183
+ "\n",
184
+ " Parameters:\n",
185
+ " - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
186
+ "\n",
187
+ " Returns:\n",
188
+ " - DataFrame: DataFrame filtered based on the frequency of sub-issues and sub-products.\n",
189
+ " \"\"\"\n",
190
+ " # Select sub-issues with frequency > 500\n",
191
+ " sub_issues_to_consider = df['Sub-issue'].value_counts()[df['Sub-issue'].value_counts() > 500].index\n",
192
+ "\n",
193
+ " # Filter DataFrame based on selected sub-issues\n",
194
+ " reduced_subissues = df[df['Sub-issue'].isin(sub_issues_to_consider)]\n",
195
+ " # Select sub-products with frequency > 100\n",
196
+ " sub_products_to_consider = reduced_subissues['Sub-product'].value_counts()[reduced_subissues['Sub-product'].value_counts() > 100].index\n",
197
+ "\n",
198
+ " # Filter DataFrame based on selected sub-products\n",
199
+ " final_df = reduced_subissues[reduced_subissues['Sub-product'].isin(sub_products_to_consider)]\n",
200
+ "\n",
201
+ " return final_df"
202
+ ]
203
+ },
204
+ {
205
+ "cell_type": "code",
206
+ "execution_count": 8,
207
+ "id": "e102c902-645e-453e-9a67-07781ba6fc55",
208
+ "metadata": {},
209
+ "outputs": [],
210
+ "source": [
211
+ "def map_issue(df):\n",
212
+ " \"\"\"\n",
213
+ " Map the issues in the DataFrame to a smaller set of better-defined issues.\n",
214
+ "\n",
215
+ " Parameters:\n",
216
+ " - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
217
+ "\n",
218
+ " Returns:\n",
219
+ " - DataFrame: DataFrame with the 'Issue' column mapped to appropriate issues.\n",
220
+ " \"\"\"\n",
221
+ " # Create a dictionary mapping issues to sub-issues\n",
222
+ " issues_to_subissues = {}\n",
223
+ " for issue in df['Issue'].value_counts().index:\n",
224
+ " issues_to_subissues[issue] = list(df[df['Issue'] == issue]['Sub-issue'].value_counts().to_dict().keys())\n",
225
+ "\n",
226
+ " # Separate issues with only one sub-issue and more than one sub-issue\n",
227
+ " one_subissue = {key: value for key, value in issues_to_subissues.items() if len(issues_to_subissues[key]) == 1}\n",
228
+ " more_than_one_subissue = {key: value for key, value in issues_to_subissues.items() if len(issues_to_subissues[key]) > 1}\n",
229
+ "\n",
230
+ " # Existing issue mapping for issues with more than one sub-issue\n",
231
+ " existing_issue_mapping = {issue: issue for issue in more_than_one_subissue}\n",
232
+ "\n",
233
+ " # Issue renaming based on provided mapping\n",
234
+ " issue_renaming = {\n",
235
+ " 'Problem with a lender or other company charging your account': 'Account Operations and Unauthorized Transaction Issues',\n",
236
+ " 'Opening an account': 'Account Operations and Unauthorized Transaction Issues',\n",
237
+ " 'Getting a credit card': 'Account Operations and Unauthorized Transaction Issues',\n",
238
+ "\n",
239
+ " 'Unable to get your credit report or credit score': 'Credit Report and Monitoring Issues',\n",
240
+ " 'Credit monitoring or identity theft protection services': 'Credit Report and Monitoring Issues',\n",
241
+ " 'Identity theft protection or other monitoring services': 'Credit Report and Monitoring Issues',\n",
242
+ "\n",
243
+ " 'Problem caused by your funds being low': 'Payment and Funds Management',\n",
244
+ " 'Problem when making payments': 'Payment and Funds Management',\n",
245
+ " 'Managing the loan or lease': 'Payment and Funds Management',\n",
246
+ "\n",
247
+ " 'False statements or representation': 'Disputes and Misrepresentations',\n",
248
+ " 'Fees or interest': 'Disputes and Misrepresentations',\n",
249
+ " 'Other features, terms, or problems': 'Disputes and Misrepresentations',\n",
250
+ "\n",
251
+ " 'Took or threatened to take negative or legal action': 'Legal and Threat Actions'\n",
252
+ " }\n",
253
+ "\n",
254
+ " # Combine issue renaming and existing issue mapping\n",
255
+ " issues_mapping = {**issue_renaming, **existing_issue_mapping}\n",
256
+ "\n",
257
+ " # Map 'Issue' column using the defined mapping dictionary\n",
258
+ " df['Issue'] = df['Issue'].apply(lambda x: issues_mapping[x])\n",
259
+ "\n",
260
+ " return df"
261
+ ]
262
+ },
263
+ {
264
+ "cell_type": "code",
265
+ "execution_count": 9,
266
+ "id": "b8a50ac3-1a5b-4c78-92bb-da14d38a679c",
267
+ "metadata": {},
268
+ "outputs": [],
269
+ "source": [
270
+ "def split_and_save_data(df,year, test_size=0.25, random_state=42, directory_to_save='./data_splits/'):\n",
271
+ " \"\"\"\n",
272
+ " Split the input DataFrame into train and test sets, and save them as CSV files with the specified year included in the file names.\n",
273
+ "\n",
274
+ " Parameters:\n",
275
+ " - df (DataFrame): The input DataFrame containing consumer complaint data.\n",
276
+ " - year (int): The year associated with the data split.\n",
277
+ " - test_size (float, optional): The proportion of the dataset to include in the test split. Default is 0.25.\n",
278
+ " - random_state (int, optional): The seed used by the random number generator. Default is 42.\n",
279
+ " - directory_to_save (str, optional): The directory path to save the data splits. Default is './data_splits/'.\n",
280
+ "\n",
281
+ " Returns:\n",
282
+ " - None\n",
283
+ " \"\"\"\n",
284
+ " # Split the data into train and test sets\n",
285
+ " X = df['Consumer complaint narrative']\n",
286
+ " y = df[['Product', 'Sub-product', 'Issue', 'Sub-issue']]\n",
287
+ " X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y['Product'], test_size=test_size, random_state=random_state)\n",
288
+ "\n",
289
+ " # Concatenate X_train and y_train, and X_test and y_test respectively\n",
290
+ " train_df = pd.concat([X_train, y_train], axis=1).reset_index(drop=True)\n",
291
+ " test_df = pd.concat([X_test, y_test], axis=1).reset_index(drop=True)\n",
292
+ "\n",
293
+ " # Create directory if it doesn't exist\n",
294
+ " if not os.path.exists(directory_to_save):\n",
295
+ " os.makedirs(directory_to_save)\n",
296
+ " \n",
297
+ " # Save train and test data as CSV files with the year included in the file names\n",
298
+ " train_df.to_csv(os.path.join(directory_to_save, f'train-data-split_{year}.csv'), index=False)\n",
299
+ " test_df.to_csv(os.path.join(directory_to_save, f'test-data-split_{year}.csv'), index=False)"
300
+ ]
301
+ },
302
+ {
303
+ "cell_type": "markdown",
304
+ "id": "a11fbb2c-548e-4bf8-bb41-abe22a4bd485",
305
+ "metadata": {
306
+ "jp-MarkdownHeadingCollapsed": true
307
+ },
308
+ "source": [
309
+ "### Main Cleaning Pipeline"
310
+ ]
311
+ },
312
+ {
313
+ "cell_type": "code",
314
+ "execution_count": 13,
315
+ "id": "479d2872-0202-4eac-85d9-ce2b532881dd",
316
+ "metadata": {},
317
+ "outputs": [],
318
+ "source": [
319
+ "def main(file_path, year,year_name):\n",
320
+ " # Load and clean the data\n",
321
+ " df_cleaned = load_and_clean_data(file_path)\n",
322
+ " \n",
323
+ " # Filter the data by years\n",
324
+ " df_filtered = filter_by_years(df_cleaned, year)\n",
325
+ " \n",
326
+ " # Map the 'Product' column\n",
327
+ " df_mapped = map_product_column(df_filtered)\n",
328
+ " \n",
329
+ " # Clean the customer narratives in the data\n",
330
+ " df_clean_narrative = clean_narrative(df_mapped)\n",
331
+ "\n",
332
+ " # Clean the data by frequency\n",
333
+ " df_freq = filter_by_frequency(df_clean_narrative)\n",
334
+ "\n",
335
+ " # Map the issues to the consolidated issue categories\n",
335
+ " df_final = map_issue(df_freq)\n",
337
+ " \n",
338
+ " # Split and save the data\n",
339
+ " split_and_save_data(df_final,year_name)\n",
340
+ " return df_final"
341
+ ]
342
+ },
343
+ {
344
+ "cell_type": "markdown",
345
+ "id": "60c6f92a-de2b-4c8a-817e-fb49a68f87ba",
346
+ "metadata": {},
347
+ "source": [
348
+ "### Calling the data preprocessing script"
349
+ ]
350
+ },
351
+ {
352
+ "cell_type": "code",
353
+ "execution_count": 14,
354
+ "id": "76932edd-4de8-47e1-a1a1-3cc4b07d6850",
355
+ "metadata": {},
356
+ "outputs": [],
357
+ "source": [
358
+ "file_path = 'complaints.csv'\n",
359
+ "years_to_include = [2023]\n",
360
+ "year_name=2023\n",
361
+ "df_final=main(file_path, years_to_include,year_name)"
362
+ ]
363
+ },
364
+ {
365
+ "cell_type": "code",
366
+ "execution_count": 15,
367
+ "id": "b55f4b09-6c89-470b-a4f0-60c0148534ec",
368
+ "metadata": {},
369
+ "outputs": [
370
+ {
371
+ "data": {
372
+ "text/plain": [
373
+ "(247517, 11)"
374
+ ]
375
+ },
376
+ "execution_count": 15,
377
+ "metadata": {},
378
+ "output_type": "execute_result"
379
+ }
380
+ ],
381
+ "source": [
382
+ "df_final.shape"
383
+ ]
384
+ },
385
+ {
386
+ "cell_type": "code",
387
+ "execution_count": 16,
388
+ "id": "ed38bd8a-2004-435a-87b2-2bcc43dbd565",
389
+ "metadata": {},
390
+ "outputs": [
391
+ {
392
+ "data": {
393
+ "text/plain": [
394
+ "Credit Reporting 211695\n",
395
+ "Checking or savings account 12285\n",
396
+ "Credit/Prepaid Card 11975\n",
397
+ "Debt collection 9380\n",
398
+ "Loans / Mortgage 2182\n",
399
+ "Name: Product, dtype: int64"
400
+ ]
401
+ },
402
+ "execution_count": 16,
403
+ "metadata": {},
404
+ "output_type": "execute_result"
405
+ }
406
+ ],
407
+ "source": [
408
+ "df_final.Product.value_counts()"
409
+ ]
410
+ },
411
+ {
412
+ "cell_type": "code",
413
+ "execution_count": 17,
414
+ "id": "dab29296-8a5b-44c3-8c6e-7b60e062343f",
415
+ "metadata": {},
416
+ "outputs": [
417
+ {
418
+ "data": {
419
+ "text/plain": [
420
+ "Credit reporting 210735\n",
421
+ "General-purpose credit card or charge card 10668\n",
422
+ "Checking account 10409\n",
423
+ "Other debt 3041\n",
424
+ "I do not know 2316\n",
425
+ "Credit card debt 1652\n",
426
+ "Federal student loan servicing 1344\n",
427
+ "Store credit card 1307\n",
428
+ "Medical debt 1053\n",
429
+ "Savings account 989\n",
430
+ "Other personal consumer report 960\n",
431
+ "Loan 732\n",
432
+ "Other banking product or service 725\n",
433
+ "Auto debt 581\n",
434
+ "Telecommunications debt 419\n",
435
+ "Rental debt 179\n",
436
+ "CD (Certificate of Deposit) 162\n",
437
+ "Mortgage debt 139\n",
438
+ "Conventional home mortgage 106\n",
439
+ "Name: Sub-product, dtype: int64"
440
+ ]
441
+ },
442
+ "execution_count": 17,
443
+ "metadata": {},
444
+ "output_type": "execute_result"
445
+ }
446
+ ],
447
+ "source": [
448
+ "df_final['Sub-product'].value_counts()"
449
+ ]
450
+ },
451
+ {
452
+ "cell_type": "code",
453
+ "execution_count": 19,
454
+ "id": "10127c9e-ea4a-49ce-a5f2-76991abde850",
455
+ "metadata": {},
456
+ "outputs": [
457
+ {
458
+ "data": {
459
+ "text/plain": [
460
+ "Incorrect information on your report 87200\n",
461
+ "Improper use of your report 61868\n",
462
+ "Problem with a credit reporting company's investigation into an existing problem 45371\n",
463
+ "Problem with a company's investigation into an existing problem 20985\n",
464
+ "Managing an account 7367\n",
465
+ "Attempts to collect debt not owed 5453\n",
466
+ "Problem with a purchase shown on your statement 3253\n",
467
+ "Account Operations and Unauthorized Transaction Issues 2450\n",
468
+ "Written notification about debt 2404\n",
469
+ "Disputes and Misrepresentations 2311\n",
470
+ "Payment and Funds Management 2259\n",
471
+ "Closing an account 1975\n",
472
+ "Credit Report and Monitoring Issues 1630\n",
473
+ "Dealing with your lender or servicer 1293\n",
474
+ "Closing your account 813\n",
475
+ "Legal and Threat Actions 662\n",
476
+ "Problem with a company's investigation into an existing issue 223\n",
477
+ "Name: Issue, dtype: int64"
478
+ ]
479
+ },
480
+ "execution_count": 19,
481
+ "metadata": {},
482
+ "output_type": "execute_result"
483
+ }
484
+ ],
485
+ "source": [
486
+ "df_final['Issue'].value_counts()"
487
+ ]
488
+ },
489
+ {
490
+ "cell_type": "code",
491
+ "execution_count": 20,
492
+ "id": "e774f715-6d3d-4035-9f28-753801490d13",
493
+ "metadata": {},
494
+ "outputs": [
495
+ {
496
+ "data": {
497
+ "text/plain": [
498
+ "Information belongs to someone else 57850\n",
499
+ "Reporting company used your report improperly 48732\n",
500
+ "Their investigation did not fix an error on your report 45395\n",
501
+ "Credit inquiries on your report that you don't recognize 13136\n",
502
+ "Account status incorrect 10208\n",
503
+ "Account information incorrect 9267\n",
504
+ "Was not notified of investigation status or results 9200\n",
505
+ "Investigation took more than 30 days 8928\n",
506
+ "Personal information incorrect 5900\n",
507
+ "Debt is not yours 2785\n",
508
+ "Deposits and withdrawals 2626\n",
509
+ "Credit card company isn't resolving a dispute about a purchase on your statement 2289\n",
510
+ "Didn't receive enough information to verify debt 1777\n",
511
+ "Debt was result of identity theft 1727\n",
512
+ "Old information reappears or never goes away 1714\n",
513
+ "Difficulty submitting a dispute or getting information about a dispute over the phone 1704\n",
514
+ "Company closed your account 1517\n",
515
+ "Problem using a debit or ATM card 1503\n",
516
+ "Public record information inaccurate 1384\n",
517
+ "Transaction was not authorized 1378\n",
518
+ "Problem with personal statement of dispute 1352\n",
519
+ "Other problem getting your report or credit score 1109\n",
520
+ "Card was charged for something you did not purchase with the card 964\n",
521
+ "Banking errors 958\n",
522
+ "Funds not handled or disbursed as instructed 955\n",
523
+ "Overdrafts and overdraft fees 951\n",
524
+ "Debt was paid 941\n",
525
+ "Information is missing that should be on the report 877\n",
526
+ "Attempted to collect wrong amount 861\n",
527
+ "Problem during payment process 840\n",
528
+ "Fee problem 764\n",
529
+ "Problem with fees 749\n",
530
+ "Other problem 701\n",
531
+ "Received bad information about your loan 677\n",
532
+ "Funds not received from closed account 673\n",
533
+ "Threatened or suggested your credit would be damaged 662\n",
534
+ "Didn't receive notice of right to dispute 627\n",
535
+ "Trouble with how payments are being handled 616\n",
536
+ "Can't close your account 598\n",
537
+ "Problem accessing account 561\n",
538
+ "Account opened as a result of fraud 561\n",
539
+ "Problem canceling credit monitoring or identify theft protection service 521\n",
540
+ "Card opened as result of identity theft or fraud 511\n",
541
+ "Billing problem 468\n",
542
+ "Name: Sub-issue, dtype: int64"
543
+ ]
544
+ },
545
+ "execution_count": 20,
546
+ "metadata": {},
547
+ "output_type": "execute_result"
548
+ }
549
+ ],
550
+ "source": [
551
+ "df_final['Sub-issue'].value_counts()"
552
+ ]
553
+ }
554
+ ],
555
+ "metadata": {
556
+ "kernelspec": {
557
+ "display_name": "Python 3 (ipykernel)",
558
+ "language": "python",
559
+ "name": "python3"
560
+ },
561
+ "language_info": {
562
+ "codemirror_mode": {
563
+ "name": "ipython",
564
+ "version": 3
565
+ },
566
+ "file_extension": ".py",
567
+ "mimetype": "text/x-python",
568
+ "name": "python",
569
+ "nbconvert_exporter": "python",
570
+ "pygments_lexer": "ipython3",
571
+ "version": "3.9.19"
572
+ }
573
+ },
574
+ "nbformat": 4,
575
+ "nbformat_minor": 5
576
+ }
data_preprocessing_scripts/data_splits/test-data-split_2022.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4c203087801339af843b99a42a2187585d4af3445aab36c59c7cd7653b87587e
3
+ size 62357439
data_preprocessing_scripts/data_splits/test-data-split_2023.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:9a253ea22f1b90530a803e61f1624d3b270ef62c88b89aece879cbe793e64eea
3
+ size 68758811
data_preprocessing_scripts/data_splits/train-data-balanced.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:acbfac231d968f341f65ce49dd5369f75fea7ab792e29c6116dfafe113589186
3
+ size 70018604
data_preprocessing_scripts/data_splits/train-data-split_2022.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53dac829454c1da6425cfa6d07ad668e2150d4854b6a470c413bb267f7f0f51d
3
+ size 185727392
data_preprocessing_scripts/data_splits/train-data-split_2023.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:fd0cab77284db3c21e6d186d5a2d46ba27557a3320de250c19784822c73fc443
3
+ size 205853579
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:204e335acf3df6981acc48e8aa845b8f2f1ca356d961f99f3e973aca273599bc
3
+ size 2090623
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:d1f00513b1d44dbc7190ff68aa135655fdd22fe4b38bbe042a7713e68fc85ed8
3
+ size 1883764
data_preprocessing_scripts/issue_data_splits/account_operations_and_unauthorized_transaction_issues_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3e46867c68fee83e0994f36aa6e39be17a2e5e67eb05f17378aedc7bc01350ce
3
+ size 206915
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:126f320e8e8df97c756c507ff4eb5c669f6058c92a3ec25f52c5f7b81599719b
3
+ size 4135658
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e96f4e5342aa61f35cdd4eb42bd81a092fc25bc168f70e1f7be3322b8ea64cc4
3
+ size 3718608
data_preprocessing_scripts/issue_data_splits/attempts_to_collect_debt_not_owed_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:036e70ce151a32c8a6757c2abef7f2547542d4c11c6541c6b6d705bd0063fd85
3
+ size 417106
data_preprocessing_scripts/issue_data_splits/closing_an_account_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84dbff3ce91e5c3433d3a0ae1cf543671c4025c2a9c3b9bcb40656975e17852d
3
+ size 1799450
data_preprocessing_scripts/issue_data_splits/closing_an_account_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:216a3640f091ef01b069809e4895528912bc2665358971d0eb30eee103892a68
3
+ size 1587242
data_preprocessing_scripts/issue_data_splits/closing_an_account_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:704275d826033848072c0d747e2595745185cd385c4a65b4b34cb46b849eb388
3
+ size 212264
data_preprocessing_scripts/issue_data_splits/closing_your_account_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e8428ebcc053869342b7343c709c2eef18d1a88973601dd4cead6141034c0ff2
3
+ size 799995
data_preprocessing_scripts/issue_data_splits/closing_your_account_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2a4de5779cddd84b99a6695dca6e58d36ef93cd5d670fbee821a3b9ff2b39fc8
3
+ size 736684
data_preprocessing_scripts/issue_data_splits/closing_your_account_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f0e2e7589b709c008ff61ee8a2c5ac805462c4b0028074c5abb0e6c824437903
3
+ size 63367
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7029133836244969f80daa7f682a53951e7bfdd1fc304ccb65dcd4f126adc526
3
+ size 2353757
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:791d59058ebd88fec2f991f66b1e4fd557322179b4435215dcfcbc33f8984429
3
+ size 2115089
data_preprocessing_scripts/issue_data_splits/credit_report_and_monitoring_issues_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:53fbdf8fca538f24db0dc091d8f39a155a1e2952fdf2b96a62b9317d7dab7368
3
+ size 238724
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:e65c17415c7deba2c51c98fe5a96e563aff73cb9d2354a3c318f9d04ddffe746
3
+ size 1500034
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:235e66bcf2e35fd99d858d301ffd7a2831b1f33c09098cd2fd503a6642a18048
3
+ size 1370056
data_preprocessing_scripts/issue_data_splits/dealing_with_your_lender_or_servicer_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5f8289ab48f58adcacb8a81078cb8c9e216d61dedb87b907ac5f850ea7244218
3
+ size 130034
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:cfa794e53b61d14225d6c03bc749fb83ad315ac5f485c4bcf946a6e0cc0fb34a
3
+ size 2207971
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:00081a0b4c6c429f141df193159b55c550c2f15007970a54c12b10e25166e970
3
+ size 1981586
data_preprocessing_scripts/issue_data_splits/disputes_and_misrepresentations_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:66990f81784bae88083bb6cc6fa0582d94c0410050d70c261b5561547d18492d
3
+ size 226441
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:22ced2ef476e18679dee486e7b81ad35e6d66a6eb7ad936b92efeb85d30e65c9
3
+ size 51426802
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:b2dde33856c2cb76ef0674aa869cd6ccc6b96b548bb283d0c39c010aca0f722a
3
+ size 46289254
data_preprocessing_scripts/issue_data_splits/improper_use_of_your_report_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:df20ab17ff8751b8251244684294af3ebc11979b2bc699d7e207abe2bc511942
3
+ size 5137604
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:c557f260383639a5248caae11d252de71cd72e88a23e20c7d32c1f65a42232bb
3
+ size 67002391
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:83a947015eb1b0d8a3958d70e6a86df6dd79436fe4bb42cf7dcae5238f5a1b95
3
+ size 60306312
data_preprocessing_scripts/issue_data_splits/incorrect_information_on_your_report_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:db7603a2b5d168bf2a6ee56dfaa5ad909a06e35ad345c5772344d6f77765ce96
3
+ size 6696135
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:06f9c755e499c83d36f8c5cb20d813cb5ac0db73d188c32a334ea9a47b6efb52
3
+ size 612393
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3dd9fd5013473734be09aac6b9563f2f10fbea3156a72d548e42bbdfef5c891a
3
+ size 543877
data_preprocessing_scripts/issue_data_splits/legal_and_threat_actions_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:7623b91317d2f8746f340422183fd089a3b0ace3647bdf3b71692928dbf5e3a9
3
+ size 68572
data_preprocessing_scripts/issue_data_splits/managing_an_account_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f46c2f03a742087ee42a05c22444648e56ad25bf551d19e28bd09b06a3425ef7
3
+ size 7169538
data_preprocessing_scripts/issue_data_splits/managing_an_account_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:09d0d485f89d83edb41c61262e2fe5963bfb9100f6f54ef6b16788dc967326ff
3
+ size 6420679
data_preprocessing_scripts/issue_data_splits/managing_an_account_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ac9882076a622b2c2bc1399fdb27b4ca396c6d6cd7623da4ac41af0456471261
3
+ size 748915
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:15719bc2484dc054a68e9c7d38bc110aedf6fa82962058088f8f2c1dc9d2384f
3
+ size 1854714
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:2d12880274bfdccc9e99f2b973c5f7bf7239e2e8fdda2e08833d1a68a013c3b0
3
+ size 1680205
data_preprocessing_scripts/issue_data_splits/payment_and_funds_management_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:ec358880a0bc9e79f74843b91e0607e5ed2930831715a539231a0b28d9158681
3
+ size 174565
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:51956f542a4924d8e6f5c363210406a4ccfa553f5703bdb61b9c252d3c74198b
3
+ size 320657
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:f1011d1017bebc41a4df1217ee9d99604c9e1aa1e11f4f386fb34896383e8dca
3
+ size 284289
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_issue_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:84f7f02d80205cf532ebbd6307e849f71eaeab724e0c5c94cf7de9cca4397d32
3
+ size 36424
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:bbf2935aed3387bc5bcf55dc3afaf29794eb8f4b084ec843b092da96b2c7590c
3
+ size 15288595
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_train_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:81ae7dd9f926f0cce7be50f1a21327b4bd2d416edd2524b87a1a5617457447ef
3
+ size 13714937
data_preprocessing_scripts/issue_data_splits/problem_with_a_company's_investigation_into_an_existing_problem_val_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:4ff329c35b163e0ff2d3e868cc41a1046f045903ae0fede9cd341655f906b310
3
+ size 1573714
data_preprocessing_scripts/issue_data_splits/problem_with_a_credit_reporting_company's_investigation_into_an_existing_problem_data.csv ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3324a8147b120f23f086f69a7a3ccea1b893ec6769ee470e3121dfe9bec059a0
3
+ size 35696420