"""
Module for aggregating results from different evaluation metrics.
"""

import pandas as pd
import numpy as np


class ResultsAggregator:
    """Class for aggregating and analyzing image evaluation results."""
    
    def __init__(self):
        """Initialize the aggregator."""
        # Weights for different metric categories
        self.default_weights = {
            # Technical metrics
            'sharpness': 1.0,
            'noise': 1.0,
            'contrast': 1.0,
            'saturation': 1.0,
            'entropy': 1.0,
            'compression_artifacts': 1.0,
            'dynamic_range': 1.0,
            
            # Aesthetic metrics
            'aesthetic_score': 1.5,
            'composition_score': 1.2,
            'color_harmony': 1.2,
            
            # Prompt metrics
            'prompt_similarity': 2.0,
        }
        
        # Metrics where lower is better
        self.inverse_metrics = ['noise', 'compression_artifacts']
    
    def normalize_metric(self, values, metric_name):
        """
        Normalize metric values to 0-10 scale.
        
        Args:
            values: list of metric values
            metric_name: name of the metric
            
        Returns:
            list: normalized values
        """
        if not values:
            return []
            
        # For metrics where lower is better, reflect each value across the
        # range midpoint (v -> max + min - v) so lower raw values score higher
        if metric_name in self.inverse_metrics:
            lo, hi = min(values), max(values)
            values = [hi + lo - v for v in values]
        
        # Normalize to 0-10 scale
        min_val = min(values)
        max_val = max(values)
        
        if max_val == min_val:
            return [5.0] * len(values)  # Default to middle value if all values are the same
            
        return [10 * (v - min_val) / (max_val - min_val) for v in values]
    
    def aggregate_model_results(self, model_results, custom_weights=None):
        """
        Aggregate results for a single model across multiple images.
        
        Args:
            model_results: list of metric dictionaries for images from the same model
            custom_weights: optional dictionary of custom weights for metrics
            
        Returns:
            dict: aggregated metrics
        """
        if not model_results:
            return {}
            
        # Use default weights if custom weights not provided
        weights = custom_weights if custom_weights else self.default_weights
        
        # Initialize aggregated results
        aggregated = {}
        
        # Collect all metrics
        all_metrics = set()
        for result in model_results:
            all_metrics.update(result.keys())
        
        # Aggregate each metric
        for metric in all_metrics:
            # Collect only numeric values for this metric, skipping anything else
            values = [result[metric] for result in model_results
                      if metric in result and isinstance(result[metric], (int, float))]
            
            if values:
                aggregated[metric] = {
                    'mean': np.mean(values),
                    'median': np.median(values),
                    'std': np.std(values),
                    'min': np.min(values),
                    'max': np.max(values),
                    'count': len(values)
                }
        
        # Calculate overall score
        score_components = []
        weight_sum = 0
        
        for metric, stats in aggregated.items():
            if metric in weights:
                # The mean is assumed to already be on a 0-10 scale
                # (see normalize_metric); no further normalization happens here.
                metric_value = stats['mean']
                if metric in self.inverse_metrics:
                    # For metrics where lower is better, flip the 0-10 scale
                    metric_value = 10 - metric_value
                
                # Weight this metric's contribution to the overall score
                weight = weights[metric]
                score_components.append(metric_value * weight)
                weight_sum += weight
        
        # Calculate weighted average
        if weight_sum > 0:
            aggregated['overall_score'] = sum(score_components) / weight_sum
        else:
            aggregated['overall_score'] = 5.0  # Default middle score
        
        return aggregated
    
    def compare_models(self, model_results_dict, custom_weights=None):
        """
        Compare results across different models.
        
        Args:
            model_results_dict: dictionary with model names as keys and lists of results as values
            custom_weights: optional dictionary of custom weights for metrics
            
        Returns:
            dict: comparison results
        """
        # Aggregate results for each model
        aggregated_results = {}
        for model_name, results in model_results_dict.items():
            aggregated_results[model_name] = self.aggregate_model_results(results, custom_weights)
        
        # Extract key metrics for comparison
        comparison = {}
        for model_name, agg_results in aggregated_results.items():
            model_comparison = {
                'overall_score': agg_results.get('overall_score', 5.0)
            }
            
            # Add mean values of all metrics
            for metric, stats in agg_results.items():
                if metric != 'overall_score' and isinstance(stats, dict) and 'mean' in stats:
                    model_comparison[f"{metric}"] = stats['mean']
            
            comparison[model_name] = model_comparison
        
        return comparison
    
    def analyze_by_prompt(self, results_by_prompt, custom_weights=None):
        """
        Analyze results grouped by prompt.
        
        Args:
            results_by_prompt: dictionary with prompts as keys and dictionaries of model results as values
            custom_weights: optional dictionary of custom weights for metrics
            
        Returns:
            dict: analysis results by prompt
        """
        prompt_analysis = {}
        
        for prompt, model_results in results_by_prompt.items():
            # Compare models for this prompt
            prompt_comparison = self.compare_models(model_results, custom_weights)
            
            # Find best model for this prompt
            best_model = None
            best_score = -1
            
            for model, metrics in prompt_comparison.items():
                score = metrics.get('overall_score', 0)
                if score > best_score:
                    best_score = score
                    best_model = model
            
            prompt_analysis[prompt] = {
                'model_comparison': prompt_comparison,
                'best_model': best_model,
                'best_score': best_score
            }
        
        return prompt_analysis
    
    def create_comparison_dataframe(self, comparison_results):
        """
        Create a pandas DataFrame from comparison results.
        
        Args:
            comparison_results: dictionary with model names as keys and metric dictionaries as values
            
        Returns:
            pandas.DataFrame: comparison table
        """
        # Convert to DataFrame
        df = pd.DataFrame.from_dict(comparison_results, orient='index')
        
        # Sort by overall score
        if 'overall_score' in df.columns:
            df = df.sort_values('overall_score', ascending=False)
        
        return df
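

# Minimal usage sketch (illustrative only): the model names and metric values
# below are made-up assumptions, not real evaluation output, and each metric is
# assumed to already be on the 0-10 scale expected by the overall-score weighting.
if __name__ == "__main__":
    aggregator = ResultsAggregator()

    # Hypothetical per-image metric dictionaries for two models
    results_by_model = {
        'model_a': [
            {'sharpness': 7.2, 'noise': 2.1, 'prompt_similarity': 8.0},
            {'sharpness': 6.8, 'noise': 3.4, 'prompt_similarity': 7.5},
        ],
        'model_b': [
            {'sharpness': 5.9, 'noise': 1.8, 'prompt_similarity': 8.6},
            {'sharpness': 6.3, 'noise': 2.7, 'prompt_similarity': 8.1},
        ],
    }

    comparison = aggregator.compare_models(results_by_model)
    print(aggregator.create_comparison_dataframe(comparison))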