bach-or-bot/scripts/evaluate.py
"""
MLP Model Evaluation Script for AI vs Human Music Detection
==========================================================
This script evaluates the performance of the trained MLP classifier on test data.
It produces a complete performance report showing how well the model distinguishes
between AI-generated and human-composed music.
What this script does:
- Loads the saved, trained MLP model
- Tests it on held-out test data (music the model has never seen)
- Calculates accuracy, precision, recall, and F1-score
- Reports confusion statistics (true positives, true negatives, false positives, false negatives)
- Displays sample predictions with probabilities for transparency
Quick Start:
---------------------------
# Basic evaluation with default model path
python evaluate.py
# Evaluate a specific model
python evaluate.py --model "models/fusion/mlp_multimodal.pth"
# From code
from evaluate import evaluate_model
results = evaluate_model("models/fusion/mlp_multimodal.pth")
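# The returned dict holds accuracy, loss, precision, recall, F1-score,
# and the raw confusion counts (see the return value of evaluate_model below)
print(f"F1-Score: {results['f1_score']:.4f}")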
Performance Metrics Explained:
------------------------------
- Accuracy: Overall correctness (the fraction of songs classified correctly)
- Precision: Of the songs predicted as human, the fraction that actually are human
- Recall: Of all human songs, the fraction the model correctly identifies
- F1-Score: The harmonic mean of precision and recall (see the worked example below)
- Confusion stats (human is the positive class, label 1; AI is label 0):
TP = Human songs correctly identified
TN = AI songs correctly identified
FP = AI songs incorrectly labeled as human
FN = Human songs incorrectly labeled as AI
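Worked example (hypothetical counts, for illustration only):
    TP = 90, TN = 80, FP = 20, FN = 10
    Precision = TP / (TP + FP) = 90 / 110 ≈ 0.818
    Recall    = TP / (TP + FN) = 90 / 100 = 0.900
    F1-Score  = 2 * Precision * Recall / (Precision + Recall) ≈ 0.857
    Accuracy  = (TP + TN) / (TP + TN + FP + FN) = 170 / 200 = 0.850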
Expected Output:
----------------
Loading model from: models/fusion/mlp_multimodal.pth
Loaded dataset: (50000, 684), Labels: 50000
Test set size: (10000, 684)
Evaluating model on test set...
Sample predictions:
True: 1, Pred: 1, Prob: 0.8234 # Correctly identified human song
True: 0, Pred: 0, Prob: 0.1456 # Correctly identified AI song
True: 1, Pred: 0, Prob: 0.4123 # Missed a human song (false negative)
=== Evaluation Results ===
Test Accuracy: 87.54%
Test Loss: 0.3412
Precision: 0.8832
Recall: 0.8654
F1-Score: 0.8742
"""
import argparse
import logging
import numpy as np
from pathlib import Path
from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ
from sklearn.model_selection import train_test_split
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def evaluate_model(model_path: str = "models/fusion/mlp_multimodal.pth"):
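    """
    Evaluate the trained MLP classifier on the held-out test split.

    Loads the dataset from DATASET_NPZ, rebuilds the model architecture from
    config/model_config.yml, loads the saved weights from `model_path`, and
    returns a dict with accuracy, loss, precision, recall, F1-score, and the
    confusion counts.
    """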
logger.info(f"Loading model from: {model_path}")
# Check if dataset exists
if not Path(DATASET_NPZ).exists():
raise FileNotFoundError(f"Dataset not found at {DATASET_NPZ}. Run train.py first.")
# Load the full dataset
loaded_data = np.load(DATASET_NPZ)
X = loaded_data["X"]
Y = loaded_data["Y"]
logger.info(f"Loaded dataset: {X.shape}, Labels: {len(Y)}")
    # Split and scale the data (same split as used in training)
from src.utils.dataset import dataset_scaler
data = dataset_scaler(X, Y)
X_test, y_test = data["test"]
logger.info(f"Test set size: {X_test.shape}")
# Load configuration
config = load_config("config/model_config.yml")
# Build model architecture (needed for loading weights)
mlp_classifier = build_mlp(input_dim=X_test.shape[1], config=config)
# Load trained model
mlp_classifier.load_model(model_path)
# Evaluate on test set
logger.info("Evaluating model on test set...")
test_results = mlp_classifier.evaluate(X_test, y_test)
# Get predictions for detailed analysis
probabilities, predictions = mlp_classifier.predict(X_test)
    # Show a few sample predictions for transparency
    print("Sample predictions:")
    for i in range(min(10, len(y_test))):
        print(f"True: {y_test[i]}, Pred: {predictions[i]}, Prob: {probabilities[i]:.4f}")
logger.info("=== Evaluation Results ===")
logger.info(f"Test Accuracy: {test_results['test_accuracy']:.2f}%")
logger.info(f"Test Loss: {test_results['test_loss']:.4f}")
    # Additional statistics (human, label 1, is the positive class)
true_positives = np.sum((y_test == 1) & (predictions == 1))
true_negatives = np.sum((y_test == 0) & (predictions == 0))
false_positives = np.sum((y_test == 0) & (predictions == 1))
false_negatives = np.sum((y_test == 1) & (predictions == 0))
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
logger.info(f"Precision: {precision:.4f}")
logger.info(f"Recall: {recall:.4f}")
logger.info(f"F1-Score: {f1_score:.4f}")
# Include all metrics in return dict
return {
"test_accuracy": test_results["test_accuracy"],
"test_loss": test_results["test_loss"],
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"true_positives": int(true_positives),
"true_negatives": int(true_negatives),
"false_positives": int(false_positives),
"false_negatives": int(false_negatives)
}
def main():
"""Main evaluation function."""
parser = argparse.ArgumentParser(description='Evaluate Bach-or-Bot MLP classifier')
parser.add_argument('--model', default='models/fusion/mlp_multimodal.pth',
help='Path to trained model')
args = parser.parse_args()
try:
results = evaluate_model(args.model)
logger.info("Evaluation completed successfully!")
except Exception as e:
logger.error(f"Evaluation failed: {str(e)}")
raise
if __name__ == "__main__":
main()