# image-eval/modules/aggregator.py
"""
Module for aggregating results from different evaluation metrics.
"""
import pandas as pd
import numpy as np


class ResultsAggregator:
"""Class for aggregating and analyzing image evaluation results."""
def __init__(self):
"""Initialize the aggregator."""
# Weights for different metric categories
self.default_weights = {
# Technical metrics
'sharpness': 1.0,
'noise': 1.0,
'contrast': 1.0,
'saturation': 1.0,
'entropy': 1.0,
'compression_artifacts': 1.0,
'dynamic_range': 1.0,
# Aesthetic metrics
'aesthetic_score': 1.5,
'composition_score': 1.2,
'color_harmony': 1.2,
# Prompt metrics
'prompt_similarity': 2.0,
}
# Metrics where lower is better
self.inverse_metrics = ['noise', 'compression_artifacts']

    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values so that
        # higher always means better before normalizing
        if metric_name in self.inverse_metrics:
            values = [max(values) - v + min(values) for v in values]

        # Normalize to 0-10 scale
        min_val = min(values)
        max_val = max(values)

        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same

        return [10 * (v - min_val) / (max_val - min_val) for v in values]
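    # Illustrative example (not from the original file): normalize_metric([1.0, 3.0, 5.0], 'sharpness')
    # returns [0.0, 5.0, 10.0]; for an inverse metric such as 'noise', the same input
    # maps to [10.0, 5.0, 0.0] because lower raw values are treated as better.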

    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights are not provided. Note that a
        # custom dict replaces the defaults entirely, so any metric missing
        # from it is excluded from the overall score.
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect the union of all metric names across the results
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]

            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }

        # Calculate the overall score as a weighted average. The metric means
        # are assumed to already be on a 0-10 scale; no re-normalization is
        # performed here.
        score_components = []
        weight_sum = 0

        for metric, stats in aggregated.items():
            if metric in weights:
                normalized_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the scale
                    normalized_value = 10 - normalized_value

                # Apply weight
                weight = weights[metric]
                score_components.append(normalized_value * weight)
                weight_sum += weight

        # Weighted average, or a default middle score if nothing was weighted
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score

        return aggregated
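    # Worked example (illustrative numbers): with default weights and metric means
    # sharpness=7.0 (weight 1.0), noise=2.0 (weight 1.0, inverted to 8.0) and
    # prompt_similarity=8.0 (weight 2.0), the overall score is
    # (7.0*1.0 + 8.0*1.0 + 8.0*2.0) / (1.0 + 1.0 + 2.0) = 31.0 / 4.0 = 7.75.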

    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }

            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']

            comparison[model_name] = model_comparison

        return comparison
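    # Illustrative shape of the returned dict (model names and numbers are placeholders):
    # {'model_a': {'overall_score': 7.4, 'sharpness': 7.05, 'noise': 2.3, ...},
    #  'model_b': {'overall_score': 6.8, 'sharpness': 7.95, 'noise': 2.9, ...}}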

    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}

        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best-scoring model for this prompt
            best_model = None
            best_score = -1

            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }

        return prompt_analysis

    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to DataFrame (models as rows, metrics as columns)
        df = pd.DataFrame.from_dict(comparison_results, orient='index')

        # Sort by overall score, best model first
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)

        return df
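

# Minimal usage sketch (illustrative only). The model names and metric values
# below are made-up placeholders, not real evaluation output, and the metric
# values are assumed to already be on the 0-10 scale expected by the scorer.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image metric dictionaries for two models
    model_results_dict = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'aesthetic_score': 6.8, 'prompt_similarity': 8.0},
            {'sharpness': 6.9, 'noise': 2.5, 'aesthetic_score': 7.1, 'prompt_similarity': 7.5},
        ],
        'model_b': [
            {'sharpness': 8.1, 'noise': 3.0, 'aesthetic_score': 6.2, 'prompt_similarity': 6.9},
            {'sharpness': 7.8, 'noise': 2.8, 'aesthetic_score': 6.5, 'prompt_similarity': 7.2},
        ],
    }

    # Compare the models and print the sorted comparison table
    comparison = aggregator.compare_models(model_results_dict)
    print(aggregator.create_comparison_dataframe(comparison))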