""" This script trains classification models using scikit-learn. It handles data loading, preprocessing, hyperparameter tuning, model evaluation with classification metrics, and saving of models, metrics, and visualizations. Usage: python train_classification_model.py --model_module MODEL_MODULE --data_path DATA_PATH/DATA_NAME.csv --target_variable TARGET_VARIABLE Optional arguments: --test_size TEST_SIZE --random_state RANDOM_STATE --cv_folds CV_FOLDS --scoring_metric SCORING_METRIC --model_path MODEL_PATH --results_path RESULTS_PATH --visualize --drop_columns COLUMN_NAMES Example: python train_classification_model.py --model_module logistic_regression --data_path data/adult_income/train.csv --target_variable income_bracket --drop_columns Id --scoring_metric accuracy --visualize """ import os import sys import argparse import importlib import pandas as pd import numpy as np import matplotlib.pyplot as plt import seaborn as sns from sklearn.model_selection import train_test_split from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, ConfusionMatrixDisplay) import joblib from timeit import default_timer as timer def main(args): # Change to the root directory of the project project_root = os.path.abspath(os.path.join(os.path.dirname(__file__), "..")) os.chdir(project_root) sys.path.insert(0, project_root) # Import the hyperparameter tuning and the model modules from utils.supervised_hyperparameter_tuning import classification_hyperparameter_tuning model_module_path = f"models.supervised.classification.{args.model_module}" model_module = importlib.import_module(model_module_path) # Get the model estimator, parameters grid, and scoring metric estimator = model_module.estimator param_grid = model_module.param_grid scoring_metric = args.scoring_metric or getattr(model_module, 'default_scoring', 'accuracy') model_name = estimator.__class__.__name__ # Set default paths if not provided args.model_path = args.model_path or os.path.join('saved_models', model_name) args.results_path = args.results_path or os.path.join('results', model_name) os.makedirs(args.results_path, exist_ok=True) # Load the dataset df = pd.read_csv(os.path.join(args.data_path)) # Drop specified columns if args.drop_columns: columns_to_drop = args.drop_columns.split(',') df = df.drop(columns=columns_to_drop) # Define target variable and features target_variable = args.target_variable X = df.drop(columns=[target_variable]) y = df[target_variable] # Ensure target variable is not numeric (or at least, is categorical) # It's fine if it's numeric labels for classes, but typically classification is categorical. # We'll just run as is and rely on the estimator to handle it. # If needed, we can print a note: if np.issubdtype(y.dtype, np.number) and len(np.unique(y)) > 20: # Large number of unique values might indicate a regression-like problem print(f"Warning: The target variable '{target_variable}' seems to have many unique numeric values. Ensure it's truly a classification problem.") # Encode target variable if not numeric if y.dtype == 'object' or not np.issubdtype(y.dtype, np.number): from sklearn.preprocessing import LabelEncoder le = LabelEncoder() y = le.fit_transform(y) # Save label encoder so that we can interpret predictions later # Create model_path directory if not exists os.makedirs(args.model_path, exist_ok=True) joblib.dump(le, os.path.join(args.model_path, 'label_encoder.pkl')) print("LabelEncoder applied to target variable. Classes:", le.classes_) # Split the data X_train, X_test, y_train, y_test = train_test_split( X, y, test_size=args.test_size, random_state=args.random_state) # Start the timer start_time = timer() # Perform hyperparameter tuning (classification) best_model, best_params = classification_hyperparameter_tuning( X_train, y_train, estimator, param_grid, cv=args.cv_folds, scoring=scoring_metric) # End the timer and calculate how long it took end_time = timer() train_time = end_time - start_time # Evaluate the best model on the test set y_pred = best_model.predict(X_test) # Calculate classification metrics accuracy = accuracy_score(y_test, y_pred) precision = precision_score(y_test, y_pred, average='weighted', zero_division=0) recall = recall_score(y_test, y_pred, average='weighted', zero_division=0) f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0) print(f"\n{model_name} Classification Metrics on Test Set:") print(f"- Accuracy: {accuracy:.4f}") print(f"- Precision: {precision:.4f}") print(f"- Recall: {recall:.4f}") print(f"- F1 Score: {f1:.4f}") print(f"- Training Time: {train_time:.4f} seconds") # Save the trained model model_output_path = os.path.join(args.model_path, 'best_model.pkl') os.makedirs(args.model_path, exist_ok=True) joblib.dump(best_model, model_output_path) print(f"Trained model saved to {model_output_path}") # Save metrics to CSV metrics = { 'Accuracy': [accuracy], 'Precision': [precision], 'Recall': [recall], 'F1 Score': [f1], 'train_time': [train_time] } results_df = pd.DataFrame(metrics) results_df.to_csv(os.path.join(args.results_path, 'metrics.csv'), index=False) print(f"\nMetrics saved to {os.path.join(args.results_path, 'metrics.csv')}") if args.visualize: # Plot Classification Metrics plt.figure(figsize=(8, 6)) metric_names = list(metrics.keys()) metric_values = [value[0] for value in metrics.values() if value[0] is not None and isinstance(value[0], (int,float))] plt.bar(metric_names[:-1], metric_values[:-1], color='skyblue', alpha=0.8) # exclude train_time from plotting plt.ylim(0, 1) plt.xlabel('Metrics') plt.ylabel('Scores') plt.title('Classification Metrics') plt.savefig(os.path.join(args.results_path, 'classification_metrics.png')) plt.show() print(f"Visualization saved to {os.path.join(args.results_path, 'classification_metrics.png')}") # Display and save the confusion matrix from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay # Load the label encoder (if it exists) label_encoder_path = os.path.join(args.model_path, "label_encoder.pkl") if os.path.exists(label_encoder_path): label_encoder = joblib.load(label_encoder_path) # Decode the predicted and true labels y_test_decoded = label_encoder.inverse_transform(y_test) y_pred_decoded = label_encoder.inverse_transform(y_pred) display_labels = label_encoder.classes_ else: # If no encoder, use the original numeric labels y_test_decoded = y_test y_pred_decoded = y_pred display_labels = None # Numeric labels will be used by default # Save confusion matrix conf_mat = confusion_matrix(y_test_decoded, y_pred_decoded) plt.figure(figsize=(10, 8)) # Increased figure size for better spacing disp = ConfusionMatrixDisplay(conf_mat, display_labels=display_labels) # Customize the plot disp.plot(cmap="Blues", values_format="d", ax=plt.gca()) plt.title("Confusion Matrix", fontsize=16, pad=20) # Increased font size and added padding plt.xticks(rotation=45, ha="right", fontsize=12) # Rotated x-axis labels and increased font size plt.yticks(fontsize=12) # Increased font size for y-axis labels plt.xlabel("Predicted Label", fontsize=14) # Added font size for x-axis label plt.ylabel("True Label", fontsize=14) # Added font size for y-axis label # Save the improved plot cm_path = os.path.join(args.results_path, "confusion_matrix.png") plt.savefig(cm_path, bbox_inches="tight") # Ensures no clipping of labels plt.show() print(f"Confusion matrix saved to {cm_path}") if __name__ == "__main__": parser = argparse.ArgumentParser(description="Train a classification model.") # Model module argument parser.add_argument('--model_module', type=str, required=True, help='Name of the classification model module to import.') # Data arguments parser.add_argument('--data_path', type=str, required=True, help='Path to the dataset file including data name.') parser.add_argument('--target_variable', type=str, required=True, help='Name of the target variable (categorical).') parser.add_argument('--drop_columns', type=str, default='', help='Columns to drop from the dataset.') # Model arguments parser.add_argument('--test_size', type=float, default=0.2, help='Proportion for test split.') parser.add_argument('--random_state', type=int, default=42, help='Random seed.') parser.add_argument('--cv_folds', type=int, default=5, help='Number of cross-validation folds.') parser.add_argument('--scoring_metric', type=str, default=None, help='Scoring metric for model evaluation (e.g., accuracy, f1, roc_auc).') # Output arguments parser.add_argument('--model_path', type=str, default=None, help='Path to save the trained model.') parser.add_argument('--results_path', type=str, default=None, help='Path to save results and metrics.') parser.add_argument('--visualize', action='store_true', help='Generate and save visualizations (classification metrics chart and confusion matrix).') args = parser.parse_args() main(args)