# utils/prediction_models.py
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    mean_squared_error,
    precision_recall_fscore_support,
    r2_score,
)
from sklearn.model_selection import train_test_split


def _encode_class_target(y):
    """Prepare a classification target for model fitting.

    Args:
        y (pd.Series): Raw target column.

    Returns:
        tuple: ``(y_model, class_labels, matrix_labels)`` where ``y_model`` is
        the series to fit on, ``class_labels`` are the original class values
        and ``matrix_labels`` are the values (as they appear in ``y_model``)
        to use as confusion-matrix rows/columns, in ``class_labels`` order.
    """
    needs_encoding = (
        y.dtype == 'object'
        or y.dtype.name == 'category'
        or pd.api.types.is_string_dtype(y)
    )
    if needs_encoding:
        # Encode the WHOLE column once so that a given integer code means the
        # same class in the train split, the test split and `class_labels`.
        categorical = y.astype('category')
        class_labels = categorical.cat.categories.tolist()
        return categorical.cat.codes, class_labels, list(range(len(class_labels)))

    class_labels = sorted(y.unique().tolist())
    return y, class_labels, class_labels


def train_predict_random_forest(data_list, target_col, feature_cols, prediction_type='regression'):
    """
    Trains a Random Forest model and evaluates it on a held-out split.

    Features are one-hot encoded with ``pd.get_dummies(drop_first=True)`` and
    the data is split 80/20 with ``random_state=42``; the model uses 100 trees
    and the same seed.

    Args:
        data_list (list of dict): List of dictionaries representing the dataset.
        target_col (str): Name of the target variable.
        feature_cols (list): Names of the feature variables.
        prediction_type (str): 'regression' or 'classification'.

    Returns:
        dict: Model results. Always contains 'model_type' and
        'feature_importances' (sorted descending). Regression adds
        'r2_score', 'mean_squared_error', 'root_mean_squared_error' and
        'actual_vs_predicted'. Classification adds 'class_labels',
        'accuracy', 'precision', 'recall', 'f1_score' (weighted averages),
        'confusion_matrix' (rows/columns ordered like 'class_labels') and
        'classification_report'.

    Raises:
        ValueError: If a requested column is missing from the data, or if
            ``prediction_type`` is not 'regression' or 'classification'.
    """
    df = pd.DataFrame(data_list)
    feature_cols = list(feature_cols)

    missing_cols = [col for col in feature_cols + [target_col] if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in data: {missing_cols}")

    # Reject an unknown mode before doing any encoding or splitting work.
    if prediction_type not in ('regression', 'classification'):
        raise ValueError("prediction_type must be 'regression' or 'classification'")

    # One-hot encode categorical features; numeric columns pass through.
    X = pd.get_dummies(df[feature_cols], drop_first=True)
    y = df[target_col]

    results = {}
    matrix_labels = None
    if prediction_type == 'classification':
        y, class_labels, matrix_labels = _encode_class_target(y)
        results['class_labels'] = class_labels

    # Hold out 20% of the rows for evaluation.
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    if prediction_type == 'regression':
        model = RandomForestRegressor(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        mse = mean_squared_error(y_test, y_pred)
        results['model_type'] = 'Regression'
        results['r2_score'] = r2_score(y_test, y_pred)
        results['mean_squared_error'] = mse
        results['root_mean_squared_error'] = np.sqrt(mse)
        results['actual_vs_predicted'] = pd.DataFrame(
            {'Actual': y_test, 'Predicted': y_pred}
        ).to_dict(orient='list')
    else:
        model = RandomForestClassifier(n_estimators=100, random_state=42)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

        results['model_type'] = 'Classification'
        results['accuracy'] = accuracy_score(y_test, y_pred)

        # Precision, Recall, F1-score - use 'weighted' average for multi-class
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_test, y_pred, average='weighted', zero_division=0
        )
        results['precision'] = precision
        results['recall'] = recall
        results['f1_score'] = f1

        # Pass the full label list so the matrix keeps one row/column per
        # entry of `class_labels`, even when a class is absent from the
        # test split.
        results['confusion_matrix'] = confusion_matrix(
            y_test, y_pred, labels=matrix_labels
        ).tolist()
        results['classification_report'] = classification_report(
            y_test, y_pred, output_dict=True, zero_division=0
        )

    # Feature Importance (common for both)
    feature_importances = pd.Series(
        model.feature_importances_, index=X.columns
    ).sort_values(ascending=False)
    results['feature_importances'] = feature_importances.to_dict()

    return results