# utils/prediction_models.py
import pandas as pd
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
import numpy as np
def train_predict_random_forest(data_list, target_col, feature_cols, prediction_type='regression'):
"""
    Trains a Random Forest model and evaluates it on a held-out test split.
Args:
data_list (list of dict): List of dictionaries representing the dataset.
target_col (str): Name of the target variable.
feature_cols (list): List of names of feature variables.
prediction_type (str): 'regression' or 'classification'.
Returns:
dict: A dictionary containing model results (metrics, predictions, feature importances).
"""
df = pd.DataFrame(data_list)
    # Validate that all requested columns are present before building the matrices.
    missing_cols = [col for col in feature_cols + [target_col] if col not in df.columns]
    if missing_cols:
        raise ValueError(f"Missing columns in data: {missing_cols}")
X = df[feature_cols]
y = df[target_col]
# Handle categorical features if any
X = pd.get_dummies(X, drop_first=True) # One-hot encode categorical features
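    # Encoding before the split keeps the train and test feature columns identical;
    # in a stricter pipeline the encoder would typically be fit on the training split only.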
# Split data for robust evaluation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
results = {}
if prediction_type == 'regression':
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
results['model_type'] = 'Regression'
results['r2_score'] = r2_score(y_test, y_pred)
        mse = mean_squared_error(y_test, y_pred)
        results['mean_squared_error'] = mse
        results['root_mean_squared_error'] = np.sqrt(mse)
results['actual_vs_predicted'] = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred}).to_dict(orient='list')
elif prediction_type == 'classification':
        # Encode a string/categorical target as integer codes for the classifier.
        # A single category mapping is derived from the full target column so the
        # train and test splits use consistent codes even if a class is missing
        # from one split.
        if y.dtype == 'object' or y.dtype.name == 'category':
            y_categories = df[target_col].astype('category').cat.categories
            y_train = pd.Categorical(y_train, categories=y_categories).codes
            y_test = pd.Categorical(y_test, categories=y_categories).codes
            results['class_labels'] = y_categories.tolist()
        else:
            results['class_labels'] = sorted(y.unique().tolist())
model = RandomForestClassifier(n_estimators=100, random_state=42)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
results['model_type'] = 'Classification'
results['accuracy'] = accuracy_score(y_test, y_pred)
# Precision, Recall, F1-score - use 'weighted' average for multi-class
precision, recall, f1, _ = precision_recall_fscore_support(y_test, y_pred, average='weighted', zero_division=0)
results['precision'] = precision
results['recall'] = recall
results['f1_score'] = f1
results['confusion_matrix'] = confusion_matrix(y_test, y_pred).tolist()
results['classification_report'] = classification_report(y_test, y_pred, output_dict=True, zero_division=0)
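        # When the target was label-encoded above, the confusion matrix and
        # classification_report are expressed in terms of integer codes; map them
        # back to the original labels via results['class_labels'].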
else:
raise ValueError("prediction_type must be 'regression' or 'classification'")
# Feature Importance (common for both)
if hasattr(model, 'feature_importances_'):
feature_importances = pd.Series(model.feature_importances_, index=X.columns).sort_values(ascending=False)
results['feature_importances'] = feature_importances.to_dict()
return results
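

# --- Example usage (illustrative sketch, not part of the module's public API) ---
# The dataset, column names, and values below are made up purely to show how
# train_predict_random_forest can be called for both prediction types.
if __name__ == "__main__":
    rng = np.random.default_rng(42)
    example_rows = [
        {
            'age': int(rng.integers(20, 60)),
            'dose': float(rng.uniform(0.0, 10.0)),
            'group': str(rng.choice(['A', 'B'])),
            'outcome': float(rng.normal(50.0, 10.0)),
            'responded': str(rng.choice(['yes', 'no'])),
        }
        for _ in range(200)
    ]

    # Regression: predict a continuous outcome from mixed numeric/categorical features.
    reg_results = train_predict_random_forest(
        example_rows,
        target_col='outcome',
        feature_cols=['age', 'dose', 'group'],
        prediction_type='regression',
    )
    print("Regression R^2:", reg_results['r2_score'])

    # Classification: predict a string-labelled target; labels are encoded internally.
    clf_results = train_predict_random_forest(
        example_rows,
        target_col='responded',
        feature_cols=['age', 'dose', 'group'],
        prediction_type='classification',
    )
    print("Classification accuracy:", clf_results['accuracy'])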