"""
Module for aggregating results from different evaluation metrics.
"""
import pandas as pd
import numpy as np
from collections import defaultdict


class ResultsAggregator:
    """Class for aggregating and analyzing image evaluation results."""

    def __init__(self):
        """Initialize the aggregator."""
        # Weights for different metric categories
        self.default_weights = {
            # Technical metrics
            'sharpness': 1.0,
            'noise': 1.0,
            'contrast': 1.0,
            'saturation': 1.0,
            'entropy': 1.0,
            'compression_artifacts': 1.0,
            'dynamic_range': 1.0,
            # Aesthetic metrics
            'aesthetic_score': 1.5,
            'composition_score': 1.2,
            'color_harmony': 1.2,
            # Prompt metrics
            'prompt_similarity': 2.0,
        }
        # Metrics where lower is better
        self.inverse_metrics = ['noise', 'compression_artifacts']
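
    # Note: a custom_weights dict passed to the aggregation methods below
    # replaces this mapping entirely, so any metric omitted from it simply
    # does not contribute to the overall score.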

    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to a 0-10 scale.

        Args:
            values: list of metric values
            metric_name: name of the metric

        Returns:
            list: normalized values
        """
        if not values:
            return []

        # For metrics where lower is better, invert the values
        if metric_name in self.inverse_metrics:
            lo, hi = min(values), max(values)
            values = [hi - v + lo for v in values]

        # Normalize to 0-10 scale
        min_val = min(values)
        max_val = max(values)
        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same

        return [10 * (v - min_val) / (max_val - min_val) for v in values]
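
    # Illustrative example (values are made up): normalize_metric([2.0, 4.0, 6.0],
    # 'sharpness') returns [0.0, 5.0, 10.0], while normalize_metric([1.0, 3.0],
    # 'noise') returns [10.0, 0.0] because lower noise should score higher.
    # The aggregation methods below do not call this helper; they assume the
    # incoming metric values are already on a 0-10 scale.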

    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.

        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}

        # Use default weights if custom weights not provided
        weights = custom_weights if custom_weights else self.default_weights

        # Initialize aggregated results
        aggregated = {}

        # Collect all metric names seen across the results
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())

        # Aggregate each metric, skipping non-numeric values
        for metric in all_metrics:
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]
            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }

        # Calculate overall score as a weighted average of the per-metric means
        score_components = []
        weight_sum = 0
        for metric, stats in aggregated.items():
            if metric in weights:
                # Use the mean value, which is assumed to already be on a 0-10 scale
                normalized_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, invert the scale
                    normalized_value = 10 - normalized_value
                # Apply weight
                weight = weights[metric]
                score_components.append(normalized_value * weight)
                weight_sum += weight

        # Calculate weighted average
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score

        return aggregated
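
    # Illustrative example (metric values are hypothetical): for
    # results = [{'sharpness': 7.0, 'noise': 2.0}, {'sharpness': 9.0, 'noise': 4.0}],
    # aggregate_model_results(results) reports mean/median/std/min/max/count per
    # metric and an overall_score of (8.0 * 1.0 + (10 - 3.0) * 1.0) / 2.0 == 7.5.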

    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.

        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)

        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }
            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[metric] = stats['mean']
            comparison[model_name] = model_comparison

        return comparison
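
    # Illustrative example (hypothetical names): compare_models({'model_a': results_a,
    # 'model_b': results_b}) returns one flat row per model, e.g.
    # {'model_a': {'overall_score': 7.5, 'sharpness': 8.0, 'noise': 3.0}, ...},
    # holding the per-metric means plus the overall score.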

    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.

        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics

        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}
        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)

            # Find the best model for this prompt
            best_model = None
            best_score = -1
            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model

            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }

        return prompt_analysis
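
    # Note on input shape: results_by_prompt maps each prompt string to a
    # {model_name: [per-image metric dicts]} mapping, i.e. the same structure
    # compare_models expects, grouped by prompt.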

    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.

        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values

        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to DataFrame
        df = pd.DataFrame.from_dict(comparison_results, orient='index')

        # Sort by overall score
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)

        return df
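

# Minimal usage sketch (not part of the module itself): the metric names match
# default_weights, but the numbers are made up for illustration and assume each
# metric is already reported on a 0-10 scale.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image results for two models on the same prompt
    results_by_model = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'aesthetic_score': 6.8, 'prompt_similarity': 7.5},
            {'sharpness': 8.0, 'noise': 1.8, 'aesthetic_score': 7.4, 'prompt_similarity': 8.1},
        ],
        'model_b': [
            {'sharpness': 6.1, 'noise': 3.4, 'aesthetic_score': 7.9, 'prompt_similarity': 6.9},
            {'sharpness': 5.8, 'noise': 3.0, 'aesthetic_score': 8.2, 'prompt_similarity': 7.2},
        ],
    }

    comparison = aggregator.compare_models(results_by_model)
    df = aggregator.create_comparison_dataframe(comparison)
    print(df)  # rows sorted by overall_score, best model first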