""" | |
Module for aggregating results from different evaluation metrics. | |
""" | |
import pandas as pd | |
import numpy as np | |
from collections import defaultdict | |
class ResultsAggregator:
    """Class for aggregating and analyzing image evaluation results."""

    def __init__(self):
        """Initialize the aggregator."""
        # Weights for different metric categories
        self.default_weights = {
            # Technical metrics
            'sharpness': 1.0,
            'noise': 1.0,
            'contrast': 1.0,
            'saturation': 1.0,
            'entropy': 1.0,
            'compression_artifacts': 1.0,
            'dynamic_range': 1.0,
            # Aesthetic metrics
            'aesthetic_score': 1.5,
            'composition_score': 1.2,
            'color_harmony': 1.2,
            # Prompt metrics
            'prompt_similarity': 2.0,
        }
        # Metrics where lower is better
        self.inverse_metrics = ['noise', 'compression_artifacts']
    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values so that
        # higher always means better before scaling
        if metric_name in self.inverse_metrics:
            max_val, min_val = max(values), min(values)
            values = [max_val + min_val - v for v in values]

        # Min-max normalize to a 0-10 scale
        min_val = min(values)
        max_val = max(values)
        if max_val == min_val:
            # All values identical: fall back to the middle of the scale
            return [5.0] * len(values)
        return [10 * (v - min_val) / (max_val - min_val) for v in values]
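    # Illustrative example with hypothetical values: for a lower-is-better
    # metric such as 'noise', normalize_metric([2.0, 5.0, 8.0], 'noise')
    # first inverts the values and then min-max scales them, giving
    # [10.0, 5.0, 0.0].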
    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights are not provided
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect the names of all metrics present in any result
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]
            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }
        # Calculate the overall score as a weighted average of per-metric means.
        # Metric values are assumed to already be on a 0-10 scale.
        score_components = []
        weight_sum = 0
        for metric, stats in aggregated.items():
            if metric in weights:
                metric_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the 0-10 scale
                    metric_value = 10 - metric_value
                # Apply the metric's weight
                weight = weights[metric]
                score_components.append(metric_value * weight)
                weight_sum += weight

        # Weighted average of the components
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score
        return aggregated
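    # Illustrative usage with hypothetical scores (assumed to already be on a
    # 0-10 scale): aggregate_model_results([{'sharpness': 7.0, 'noise': 2.0},
    # {'sharpness': 9.0, 'noise': 4.0}]) yields per-metric stats such as
    # result['sharpness']['mean'] == 8.0 and an 'overall_score' of 7.5
    # (sharpness mean 8.0 and inverted noise 10 - 3.0 = 7.0, equally weighted).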
    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }
            # Add the mean value of every aggregated metric
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']
            comparison[model_name] = model_comparison
        return comparison
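    # Illustrative output shape (hypothetical model names): compare_models(
    # {'model_a': [...], 'model_b': [...]}) returns one flat dict per model,
    # e.g. {'model_a': {'overall_score': 7.5, 'sharpness': 8.0, ...}, ...},
    # which create_comparison_dataframe below turns into a sortable table.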
    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}
        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best-scoring model for this prompt
            best_model = None
            best_score = -1
            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }
        return prompt_analysis
    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to a DataFrame with one row per model
        df = pd.DataFrame.from_dict(comparison_results, orient='index')
        # Sort by overall score, best model first
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)
        return df
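# Minimal usage sketch (illustrative only). The metric values below are
# made-up sample scores, assumed to already be on a 0-10 scale; in practice
# they would come from whatever evaluation pipeline produces the per-image
# metric dictionaries consumed above.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image results for two models on the same prompt
    sample_results = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'aesthetic_score': 6.8, 'prompt_similarity': 7.5},
            {'sharpness': 8.0, 'noise': 1.8, 'aesthetic_score': 7.1, 'prompt_similarity': 8.0},
        ],
        'model_b': [
            {'sharpness': 6.5, 'noise': 3.0, 'aesthetic_score': 7.4, 'prompt_similarity': 6.9},
            {'sharpness': 6.9, 'noise': 2.6, 'aesthetic_score': 7.0, 'prompt_similarity': 7.2},
        ],
    }

    # Per-model aggregation, cross-model comparison, and a sortable table
    comparison = aggregator.compare_models(sample_results)
    print(aggregator.create_comparison_dataframe(comparison))

    # Grouping by prompt: a single hypothetical prompt is used here
    by_prompt = aggregator.analyze_by_prompt({'a red bicycle at sunset': sample_results})
    print(by_prompt['a red bicycle at sunset']['best_model'])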