"""
Module for aggregating results from different evaluation metrics.
"""
import pandas as pd
import numpy as np
from collections import defaultdict


class ResultsAggregator:
    """Class for aggregating and analyzing image evaluation results."""

    def __init__(self):
        """Initialize the aggregator."""
        # Weights for different metric categories
        self.default_weights = {
            # Technical metrics
            'sharpness': 1.0,
            'noise': 1.0,
            'contrast': 1.0,
            'saturation': 1.0,
            'entropy': 1.0,
            'compression_artifacts': 1.0,
            'dynamic_range': 1.0,
            # Aesthetic metrics
            'aesthetic_score': 1.5,
            'composition_score': 1.2,
            'color_harmony': 1.2,
            # Prompt metrics
            'prompt_similarity': 2.0,
        }
        # Metrics where lower is better
        self.inverse_metrics = ['noise', 'compression_artifacts']
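
    # Note: a custom_weights dict passed to the aggregation methods below
    # replaces this mapping entirely, so any metric omitted from it simply
    # does not contribute to the overall score.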

    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values
        if metric_name in self.inverse_metrics:
            lo, hi = min(values), max(values)
            values = [hi - v + lo for v in values]

        # Normalize to 0-10 scale
        min_val = min(values)
        max_val = max(values)
        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same

        return [10 * (v - min_val) / (max_val - min_val) for v in values]
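
    # Illustrative example (values are made up): normalize_metric([2.0, 4.0, 6.0],
    # 'sharpness') returns [0.0, 5.0, 10.0], while normalize_metric([1.0, 3.0],
    # 'noise') returns [10.0, 0.0] because lower noise should score higher.
    # The aggregation methods below do not call this helper; they assume the
    # incoming metric values are already on a 0-10 scale.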

    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights not provided
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect all metric names seen across the results
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]
            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }

        # Calculate overall score as a weighted average of the per-metric means
        score_components = []
        weight_sum = 0
        for metric, stats in aggregated.items():
            if metric in weights:
                # Use the mean value, which is assumed to already be on a 0-10 scale
                normalized_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the scale
                    normalized_value = 10 - normalized_value
                # Apply weight
                weight = weights[metric]
                score_components.append(normalized_value * weight)
                weight_sum += weight

        # Calculate weighted average
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score

        return aggregated
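
    # Illustrative example (metric values are hypothetical): for
    # results = [{'sharpness': 7.0, 'noise': 2.0}, {'sharpness': 9.0, 'noise': 4.0}],
    # aggregate_model_results(results) reports mean/median/std/min/max/count per
    # metric and an overall_score of (8.0 * 1.0 + (10 - 3.0) * 1.0) / 2.0 == 7.5.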

    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }
            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']
            comparison[model_name] = model_comparison

        return comparison
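
    # Illustrative example (hypothetical names): compare_models({'model_a': results_a,
    # 'model_b': results_b}) returns one flat row per model, e.g.
    # {'model_a': {'overall_score': 7.5, 'sharpness': 8.0, 'noise': 3.0}, ...},
    # holding the per-metric means plus the overall score.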

    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}
        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best model for this prompt
            best_model = None
            best_score = -1
            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }

        return prompt_analysis
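
    # Note on input shape: results_by_prompt maps each prompt string to a
    # {model_name: [per-image metric dicts]} mapping, i.e. the same structure
    # compare_models expects, grouped by prompt.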

    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to DataFrame
        df = pd.DataFrame.from_dict(comparison_results, orient='index')

        # Sort by overall score
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)

        return df
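

# Minimal usage sketch (not part of the module itself): the metric names match
# default_weights, but the numbers are made up for illustration and assume each
# metric is already reported on a 0-10 scale.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image results for two models on the same prompt
    results_by_model = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'aesthetic_score': 6.8, 'prompt_similarity': 7.5},
            {'sharpness': 8.0, 'noise': 1.8, 'aesthetic_score': 7.4, 'prompt_similarity': 8.1},
        ],
        'model_b': [
            {'sharpness': 6.1, 'noise': 3.4, 'aesthetic_score': 7.9, 'prompt_similarity': 6.9},
            {'sharpness': 5.8, 'noise': 3.0, 'aesthetic_score': 8.2, 'prompt_similarity': 7.2},
        ],
    }

    comparison = aggregator.compare_models(results_by_model)
    df = aggregator.create_comparison_dataframe(comparison)
    print(df)  # rows sorted by overall_score, best model first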