from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils.class_weight import compute_class_weight


def train_model(training_df, validation_df, subproduct_to_predict, classifier_model,
                subproducts_to_drop=None, random_state=None):
    """Train and evaluate a TF-IDF + classifier pipeline for 'Sub-product' prediction.

    Parameters
    ----------
    training_df, validation_df : pd.DataFrame
        Must contain 'Consumer complaint narrative' (text) and 'Sub-product'
        (label) columns.
    subproduct_to_predict : str
        Unused by the current implementation; kept for interface compatibility
        with existing callers.
    classifier_model : sklearn estimator
        Classifier placed at the end of the pipeline. If it exposes a
        `class_weight` parameter, balanced class weights are applied.
    subproducts_to_drop : list[str] | None
        Sub-product labels removed from BOTH splits before training/evaluation.
    random_state : int | None
        Forwarded to the classifier when it exposes a `random_state` parameter.

    Returns
    -------
    Pipeline
        The fitted TF-IDF + classifier pipeline.
    """
    # Drop specified subproducts from training and validation dataframes.
    if subproducts_to_drop:
        training_df = training_df[~training_df['Sub-product'].isin(subproducts_to_drop)]
        validation_df = validation_df[~validation_df['Sub-product'].isin(subproducts_to_drop)]

    # Balanced class weights to counter label imbalance in the training split.
    labels = np.unique(training_df['Sub-product'])
    weights = compute_class_weight('balanced', classes=labels, y=training_df['Sub-product'])
    class_weight = dict(zip(labels, weights))
    # NOTE: the original "default weight for missing classes" loop was removed —
    # it iterated over the same `labels` that are already the dict's keys, so
    # the `label not in class_weight` branch could never trigger (dead code).

    # BUG FIX: the original computed `class_weight` (and accepted `random_state`)
    # but never passed either to the classifier, so class balancing silently
    # did nothing. Apply them only when the estimator supports the parameter
    # (e.g. MultinomialNB has no `class_weight`).
    supported = classifier_model.get_params()
    if 'class_weight' in supported:
        classifier_model.set_params(class_weight=class_weight)
    if random_state is not None and 'random_state' in supported:
        classifier_model.set_params(random_state=random_state)

    # TF-IDF features feeding the supplied classifier.
    pipeline = Pipeline([
        ('tfidf', TfidfVectorizer()),
        ('classifier', classifier_model),
    ])

    pipeline.fit(training_df['Consumer complaint narrative'], training_df['Sub-product'])

    # Evaluate on the held-out validation split.
    y_pred = pipeline.predict(validation_df['Consumer complaint narrative'])
    accuracy = accuracy_score(validation_df['Sub-product'], y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification Report:")
    print(classification_report(validation_df['Sub-product'], y_pred))

    return pipeline
# Load the pre-split debt-collection training and validation sets produced by
# the preprocessing scripts. Paths are relative to this notebook's directory —
# TODO confirm the notebook is always launched from its own folder.
debt_training_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_train_data.csv')
debt_val_df= pd.read_csv('../data_preprocessing_scripts/product_data_splits/debt_collection_val_data.csv')
\n", " | Consumer complaint narrative | \n", "Product | \n", "Sub-product | \n", "
---|---|---|---|
0 | \n", "{$37.00} on XXXX XXXX XXXX I paid for gas thro... | \n", "Debt collection | \n", "Other debt | \n", "
1 | \n", "Debt from XXXX XXXX is result of identity thef... | \n", "Debt collection | \n", "Credit card debt | \n", "
2 | \n", "My son attended XXXX XXXX XXXX XXXX for severa... | \n", "Debt collection | \n", "Medical debt | \n", "
3 | \n", "XXXX is claiming I owe a debt for utilities ba... | \n", "Debt collection | \n", "Other debt | \n", "
4 | \n", "This debt collector engaged in abusive, decept... | \n", "Debt collection | \n", "I do not know | \n", "