bach-or-bot/scripts/evaluate.py
"""
MLP Model Evaluation Script for AI vs Human Music Detection
==========================================================
This script evaluates the performance of the trained MLP classifier on test data.
It produces a complete performance report showing how well the model distinguishes
between AI-generated and human-composed music.
What this script does:
- Loads the saved, trained MLP model
- Tests it on held-out test data (music the model has never seen)
- Calculates accuracy, precision, recall, and F1-score
- Reports confusion statistics (true positives, true negatives, false positives, false negatives)
- Displays sample predictions with probabilities for transparency
Quick Start:
---------------------------
# Basic evaluation with default model path
python evaluate.py
# Evaluate a specific model
python evaluate.py --model "models/fusion/mlp_multimodal.pth"
# From code
from evaluate import evaluate_model
results = evaluate_model("models/fusion/mlp_multimodal.pth")
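# The returned dict holds accuracy, loss, precision, recall, F1-score,
# and the raw confusion counts (see the return value of evaluate_model below)
print(f"F1-Score: {results['f1_score']:.4f}")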
Performance Metrics Explained:
------------------------------
- Accuracy: Overall correctness (the fraction of songs classified correctly)
- Precision: Of the songs predicted as human, the fraction that actually are human
- Recall: Of all human songs, the fraction the model correctly identifies
- F1-Score: The harmonic mean of precision and recall (see the worked example below)
- Confusion stats (human is the positive class, label 1; AI is label 0):
TP = Human songs correctly identified
TN = AI songs correctly identified
FP = AI songs incorrectly labeled as human
FN = Human songs incorrectly labeled as AI
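Worked example (hypothetical counts, for illustration only):
    TP = 90, TN = 80, FP = 20, FN = 10
    Precision = TP / (TP + FP) = 90 / 110 ≈ 0.818
    Recall    = TP / (TP + FN) = 90 / 100 = 0.900
    F1-Score  = 2 * Precision * Recall / (Precision + Recall) ≈ 0.857
    Accuracy  = (TP + TN) / (TP + TN + FP + FN) = 170 / 200 = 0.850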
Expected Output:
----------------
Loading model from: models/fusion/mlp_multimodal.pth
Loaded dataset: (50000, 684), Labels: 50000
Test set size: (10000, 684)
Evaluating model on test set...
Sample predictions:
True: 1, Pred: 1, Prob: 0.8234 # Correctly identified human song
True: 0, Pred: 0, Prob: 0.1456 # Correctly identified AI song
True: 1, Pred: 0, Prob: 0.4123 # Missed a human song (false negative)
=== Evaluation Results ===
Test Accuracy: 87.54%
Test Loss: 0.3412
Precision: 0.8832
Recall: 0.8654
F1-Score: 0.8742
"""
import argparse
import logging
import numpy as np
from pathlib import Path
from src.models.mlp import build_mlp, load_config
from src.utils.config_loader import DATASET_NPZ
from sklearn.model_selection import train_test_split
# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)
def evaluate_model(model_path: str = "models/fusion/mlp_multimodal.pth"):
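    """
    Evaluate the trained MLP classifier on the held-out test split.

    Loads the dataset from DATASET_NPZ, rebuilds the model architecture from
    config/model_config.yml, loads the saved weights from `model_path`, and
    returns a dict with accuracy, loss, precision, recall, F1-score, and the
    confusion counts.
    """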
logger.info(f"Loading model from: {model_path}")
# Check if dataset exists
if not Path(DATASET_NPZ).exists():
raise FileNotFoundError(f"Dataset not found at {DATASET_NPZ}. Run train.py first.")
# Load the full dataset
loaded_data = np.load(DATASET_NPZ)
X = loaded_data["X"]
Y = loaded_data["Y"]
logger.info(f"Loaded dataset: {X.shape}, Labels: {len(Y)}")
    # Split and scale the data (same split as used in training)
from src.utils.dataset import dataset_scaler
data = dataset_scaler(X, Y)
X_test, y_test = data["test"]
logger.info(f"Test set size: {X_test.shape}")
# Load configuration
config = load_config("config/model_config.yml")
# Build model architecture (needed for loading weights)
mlp_classifier = build_mlp(input_dim=X_test.shape[1], config=config)
# Load trained model
mlp_classifier.load_model(model_path)
# Evaluate on test set
logger.info("Evaluating model on test set...")
test_results = mlp_classifier.evaluate(X_test, y_test)
# Get predictions for detailed analysis
probabilities, predictions = mlp_classifier.predict(X_test)
    # Show a few sample predictions for transparency
    print("Sample predictions:")
    for i in range(min(10, len(y_test))):
        print(f"True: {y_test[i]}, Pred: {predictions[i]}, Prob: {probabilities[i]:.4f}")
logger.info("=== Evaluation Results ===")
logger.info(f"Test Accuracy: {test_results['test_accuracy']:.2f}%")
logger.info(f"Test Loss: {test_results['test_loss']:.4f}")
    # Additional statistics (human, label 1, is the positive class)
true_positives = np.sum((y_test == 1) & (predictions == 1))
true_negatives = np.sum((y_test == 0) & (predictions == 0))
false_positives = np.sum((y_test == 0) & (predictions == 1))
false_negatives = np.sum((y_test == 1) & (predictions == 0))
precision = true_positives / (true_positives + false_positives) if (true_positives + false_positives) > 0 else 0
recall = true_positives / (true_positives + false_negatives) if (true_positives + false_negatives) > 0 else 0
f1_score = 2 * (precision * recall) / (precision + recall) if (precision + recall) > 0 else 0
logger.info(f"Precision: {precision:.4f}")
logger.info(f"Recall: {recall:.4f}")
logger.info(f"F1-Score: {f1_score:.4f}")
# Include all metrics in return dict
return {
"test_accuracy": test_results["test_accuracy"],
"test_loss": test_results["test_loss"],
"precision": precision,
"recall": recall,
"f1_score": f1_score,
"true_positives": int(true_positives),
"true_negatives": int(true_negatives),
"false_positives": int(false_positives),
"false_negatives": int(false_negatives)
}
def main():
"""Main evaluation function."""
parser = argparse.ArgumentParser(description='Evaluate Bach-or-Bot MLP classifier')
parser.add_argument('--model', default='models/fusion/mlp_multimodal.pth',
help='Path to trained model')
args = parser.parse_args()
try:
results = evaluate_model(args.model)
logger.info("Evaluation completed successfully!")
except Exception as e:
logger.error(f"Evaluation failed: {str(e)}")
raise
if __name__ == "__main__":
main()