# image-eval/modules/aggregator.py
"""
Module for aggregating results from different evaluation metrics.
"""
import pandas as pd
import numpy as np


class ResultsAggregator:
"""Class for aggregating and analyzing image evaluation results."""
def __init__(self):
"""Initialize the aggregator."""
# Weights for different metric categories
self.default_weights = {
# Technical metrics
'sharpness': 1.0,
'noise': 1.0,
'contrast': 1.0,
'saturation': 1.0,
'entropy': 1.0,
'compression_artifacts': 1.0,
'dynamic_range': 1.0,
# Aesthetic metrics
'aesthetic_score': 1.5,
'composition_score': 1.2,
'color_harmony': 1.2,
# Prompt metrics
'prompt_similarity': 2.0,
}
# Metrics where lower is better
self.inverse_metrics = ['noise', 'compression_artifacts']

    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values so that
        # higher always means better before normalizing
        if metric_name in self.inverse_metrics:
            values = [max(values) - v + min(values) for v in values]

        # Normalize to 0-10 scale
        min_val = min(values)
        max_val = max(values)

        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same

        return [10 * (v - min_val) / (max_val - min_val) for v in values]
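    # Illustrative example (not from the original file): normalize_metric([1.0, 3.0, 5.0], 'sharpness')
    # returns [0.0, 5.0, 10.0]; for an inverse metric such as 'noise', the same input
    # maps to [10.0, 5.0, 0.0] because lower raw values are treated as better.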

    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights are not provided. Note that a
        # custom dict replaces the defaults entirely, so any metric missing
        # from it is excluded from the overall score.
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect the union of all metric names across the results
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]

            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }

        # Calculate the overall score as a weighted average. The metric means
        # are assumed to already be on a 0-10 scale; no re-normalization is
        # performed here.
        score_components = []
        weight_sum = 0

        for metric, stats in aggregated.items():
            if metric in weights:
                normalized_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the scale
                    normalized_value = 10 - normalized_value

                # Apply weight
                weight = weights[metric]
                score_components.append(normalized_value * weight)
                weight_sum += weight

        # Weighted average, or a default middle score if nothing was weighted
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score

        return aggregated
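    # Worked example (illustrative numbers): with default weights and metric means
    # sharpness=7.0 (weight 1.0), noise=2.0 (weight 1.0, inverted to 8.0) and
    # prompt_similarity=8.0 (weight 2.0), the overall score is
    # (7.0*1.0 + 8.0*1.0 + 8.0*2.0) / (1.0 + 1.0 + 2.0) = 31.0 / 4.0 = 7.75.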

    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }

            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']

            comparison[model_name] = model_comparison

        return comparison
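    # Illustrative shape of the returned dict (model names and numbers are placeholders):
    # {'model_a': {'overall_score': 7.4, 'sharpness': 7.05, 'noise': 2.3, ...},
    #  'model_b': {'overall_score': 6.8, 'sharpness': 7.95, 'noise': 2.9, ...}}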

    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}

        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best-scoring model for this prompt
            best_model = None
            best_score = -1

            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }

        return prompt_analysis

    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to DataFrame (models as rows, metrics as columns)
        df = pd.DataFrame.from_dict(comparison_results, orient='index')

        # Sort by overall score, best model first
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)

        return df
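

# Minimal usage sketch (illustrative only). The model names and metric values
# below are made-up placeholders, not real evaluation output, and the metric
# values are assumed to already be on the 0-10 scale expected by the scorer.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image metric dictionaries for two models
    model_results_dict = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'aesthetic_score': 6.8, 'prompt_similarity': 8.0},
            {'sharpness': 6.9, 'noise': 2.5, 'aesthetic_score': 7.1, 'prompt_similarity': 7.5},
        ],
        'model_b': [
            {'sharpness': 8.1, 'noise': 3.0, 'aesthetic_score': 6.2, 'prompt_similarity': 6.9},
            {'sharpness': 7.8, 'noise': 2.8, 'aesthetic_score': 6.5, 'prompt_similarity': 7.2},
        ],
    }

    # Compare the models and print the sorted comparison table
    comparison = aggregator.compare_models(model_results_dict)
    print(aggregator.create_comparison_dataframe(comparison))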