import os
import pandas as pd
import math
from datetime import datetime
from .models import models
from huggingface_hub import CommitScheduler, hf_hub_download

# Default K-factor (determines how much a single match affects ratings)
DEFAULT_K_FACTOR = 32
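# With K=32, the largest possible single-match swing is just under ±32
# points (against an opponent with a near-certain expected score).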

# Default starting Elo
DEFAULT_ELO = 1500

LEADERBOARD_FN = './utils/leaderboard/arena_elo_leaderboard.csv'
REPO_ID = "aizip-dev/Arena-Metadata"

try:
    hub_leaderboard_path = hf_hub_download(
        repo_id=REPO_ID,
        filename="arena_elo_leaderboard.csv",
        repo_type="dataset",
    )
    df = pd.read_csv(hub_leaderboard_path)
    print(f"Successfully loaded leaderboard from the Hub. {len(df)} models.")
    # Ensure the local folder exists before writing the working copy.
    os.makedirs(os.path.dirname(LEADERBOARD_FN), exist_ok=True)
    df.to_csv(LEADERBOARD_FN, index=False)
    print(f"Leaderboard copied to {LEADERBOARD_FN} for CommitScheduler.")
except Exception as e:
    # If the Hub copy can't be fetched, keep whatever is on disk;
    # load_leaderboard_data() falls back to default values if nothing exists.
    print(f"Could not fetch leaderboard from the Hub: {e}")

leaderboard_scheduler = CommitScheduler(
    repo_id=REPO_ID,
    folder_path="utils/leaderboard",
    repo_type="dataset",
    every=1,  # minutes between scheduled commits to the Hub
)


def prepare_url(model_dict: dict):
    """
    Build a mapping from model names to their Hugging Face URLs.

    Parameters:
    - model_dict: Dictionary mapping model names to Hub repo paths (e.g. "org/repo")

    Returns:
    - Dictionary mapping each model name to its full Hugging Face URL
    """
    url_dict = {}
    for name, repo_path in model_dict.items():
        url_dict[name] = f"https://huggingface.co/{repo_path}"

    return url_dict


# Example of the resulting mapping:
# model_to_hf = {
#     "Qwen2.5-1.5b-Instruct": "https://huggingface.co/qwen/qwen2.5-1.5b-instruct",
#     "Qwen2.5-3b-Instruct": "https://huggingface.co/qwen/qwen2.5-3b-instruct",
# }

model_to_hf = prepare_url(models)

def calculate_elo_changes(winner_rating, loser_rating, k_factor=DEFAULT_K_FACTOR, draw=False):
    """
    Calculate Elo rating changes for two models.
    
    Parameters:
    - winner_rating: Winner's current rating
    - loser_rating: Loser's current rating
    - k_factor: How much a single match affects ratings
    - draw: Whether the match was a draw
    
    Returns:
    - (winner_change, loser_change): Rating changes to apply
    """
    # Calculate expected scores (probability of winning)
    expected_winner = 1 / (1 + 10 ** ((loser_rating - winner_rating) / 400))
    expected_loser = 1 / (1 + 10 ** ((winner_rating - loser_rating) / 400))
    
    if draw:
        # For a draw, both get 0.5 points
        actual_winner = 0.5
        actual_loser = 0.5
    else:
        # For a win, winner gets 1 point, loser gets 0
        actual_winner = 1.0
        actual_loser = 0.0
    
    # Calculate rating changes
    winner_change = k_factor * (actual_winner - expected_winner)
    loser_change = k_factor * (actual_loser - expected_loser)
    
    return winner_change, loser_change
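
# Sanity check for the formula above: with equal ratings (1500 vs. 1500) and
# K=32, a decisive result yields +16 / -16 and a draw yields 0 / 0; a
# 400-point underdog who wins gains about +29 (expected score ~0.09).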

def calculate_confidence_interval(elo_rating, num_games, confidence=0.95):
    """
    Calculate a confidence interval for an Elo rating.
    
    Parameters:
    - elo_rating: The current Elo rating (unused; the margin depends only on num_games)
    - num_games: Number of games played
    - confidence: Confidence level (default: 0.95 for 95% confidence)
    
    Returns:
    - margin: The margin of error for the confidence interval
    """
    if num_games == 0:
        return float('inf')
    
    # Z-scores for common confidence levels (default to 1.96 for 95%)
    z_scores = {0.90: 1.645, 0.95: 1.96, 0.99: 2.576}
    z = z_scores.get(confidence, 1.96)
    
    # Standard deviation of the Elo rating
    # The factor 400/sqrt(num_games) is a common approximation
    std_dev = 400 / math.sqrt(num_games)
    
    # Margin of error
    margin = z * std_dev
    
    return margin
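
# Example with the defaults: after 64 games the 95% margin is
# 1.96 * 400 / sqrt(64) ≈ ±98 Elo points; the interval narrows only as
# more games are played.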

def load_leaderboard_data():
    """
    Loads the leaderboard data from the leaderboard CSV file.
    Returns the data in a format compatible with the application.
    """
    # Initialize the results structure with both win/loss/tie counts and Elo ratings
    results = {
        "wins": {}, 
        "losses": {}, 
        "ties": {}, 
        "votes": 0,
        "elo": {}, 
        "games_played": {},
        "last_updated": datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    }
    
    try:
        # Define the path to the CSV file for leaderboard
        csv_path = LEADERBOARD_FN
        # Check if the file exists and load it
        if os.path.exists(csv_path):
            df = pd.read_csv(LEADERBOARD_FN)
            # Process the data into our structure
            for _, row in df.iterrows():
                model = row['model']
                results["wins"][model] = row['wins']
                results["losses"][model] = row['losses']
                results["ties"][model] = row['ties']
                results["elo"][model] = row['elo']
                results["games_played"][model] = row['games_played']
                
            # Calculate total votes: each match is counted once for each of
            # its two participants, so the total is half the combined tally.
            total = sum(
                results["wins"][m] + results["losses"][m] + results["ties"][m]
                for m in results["wins"]
            )
            results["votes"] = total // 2
        else:
            # If the file doesn't exist, initialize every known model with default values
            print("Leaderboard file not found. Initializing with default values.")
            from .models import model_names
            for model in model_names:
                results["wins"][model] = 0
                results["losses"][model] = 0
                results["ties"][model] = 0
                results["elo"][model] = DEFAULT_ELO  # Start everyone at 1500 Elo
                results["games_played"][model] = 0
            
        return results
    except Exception as e:
        print(f"Error loading leaderboard data: {e}")
        # Return the initialized structure if file can't be loaded
        return results
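
# In the structure returned above, "wins", "losses", "ties", "elo", and
# "games_played" each map model names to values, while "votes" and
# "last_updated" are board-wide scalars.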

def update_elo_ratings(results, model_a, model_b, winner, k_factor=DEFAULT_K_FACTOR):
    """
    Updates Elo ratings based on a match result.
    
    Parameters:
    - results: The current leaderboard results dictionary
    - model_a: Name of model A
    - model_b: Name of model B
    - winner: 'left' for model A, 'right' for model B, 'tie' for a tie, 'neither' for no winner
    - k_factor: How much this match affects ratings
    
    Returns:
    - Updated results dictionary
    """
    # Initialize ratings if not present
    if model_a not in results["elo"]:
        results["elo"][model_a] = DEFAULT_ELO
        results["games_played"][model_a] = 0
    
    if model_b not in results["elo"]:
        results["elo"][model_b] = DEFAULT_ELO
        results["games_played"][model_b] = 0
    
    # Get current ratings
    rating_a = results["elo"][model_a]
    rating_b = results["elo"][model_b]
    
    # Handle different winning scenarios
    if winner == 'left':
        # Model A won
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=False)
        results["wins"][model_a] = results["wins"].get(model_a, 0) + 1
        results["losses"][model_b] = results["losses"].get(model_b, 0) + 1
    elif winner == 'right':
        # Model B won
        change_b, change_a = calculate_elo_changes(rating_b, rating_a, k_factor, draw=False)
        results["wins"][model_b] = results["wins"].get(model_b, 0) + 1
        results["losses"][model_a] = results["losses"].get(model_a, 0) + 1
    elif winner == 'tie':
        # It's a tie
        change_a, change_b = calculate_elo_changes(rating_a, rating_b, k_factor, draw=True)
        results["ties"][model_a] = results["ties"].get(model_a, 0) + 1
        results["ties"][model_b] = results["ties"].get(model_b, 0) + 1
    else:  # 'neither' case - no winner
        # No rating changes, but still log the game
        change_a, change_b = 0, 0
    
    # Apply rating changes
    results["elo"][model_a] = rating_a + change_a
    results["elo"][model_b] = rating_b + change_b
    
    # Update games played counters
    results["games_played"][model_a] = results["games_played"].get(model_a, 0) + 1
    results["games_played"][model_b] = results["games_played"].get(model_b, 0) + 1
    
    # Update timestamp
    results["last_updated"] = datetime.now().strftime("%Y-%m-%d %H:%M:%S")
    
    return results
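
# Usage sketch (hypothetical model names): update_elo_ratings(results,
# "model-a", "model-b", winner="left") credits model-a with a win, debits
# model-b with a loss, shifts both Elo scores, and bumps games_played.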

def save_leaderboard_data(results):
    """
    Saves the current leaderboard results back to the CSV file.
    
    Parameters:
    - results: The results dictionary with wins, losses, ties, elo, etc.
    """
    try:
        # Define the path to the CSV file
        csv_path = LEADERBOARD_FN
        
        # Convert the results dictionary to a DataFrame
        data = []
        for model in results["elo"].keys():
            # Calculate confidence interval
            games_played = results["games_played"].get(model, 0)
            confidence_interval = calculate_confidence_interval(results["elo"][model], games_played)
            
            data.append({
                'model': model,
                'elo': round(results["elo"].get(model, DEFAULT_ELO), 1),
                'wins': results["wins"].get(model, 0),
                'losses': results["losses"].get(model, 0),
                'ties': results["ties"].get(model, 0),
                'games_played': results["games_played"].get(model, 0),
                'confidence_interval': round(confidence_interval, 1)
            })
        
        df = pd.DataFrame(data)
        
        # Sort by Elo rating (descending)
        df = df.sort_values(by='elo', ascending=False)
        
        # Save to CSV
        with leaderboard_scheduler.lock:
            df.to_csv(csv_path, index=False)
            print(f"Leaderboard data saved successfully to {csv_path}")
    except Exception as e:
        print(f"Error saving leaderboard data: {e}")

def generate_leaderboard_html(results):
    """
    Generate HTML for displaying the leaderboard with Elo ratings.
    
    Parameters:
    - results: The current leaderboard results dictionary
    
    Returns:
    - HTML string for the leaderboard
    """
    # Prepare model data for the HTML table
    model_data = []
    for model in results["elo"]:
        elo = results["elo"].get(model, DEFAULT_ELO)
        wins = results["wins"].get(model, 0)
        losses = results["losses"].get(model, 0)
        ties = results["ties"].get(model, 0)
        total_comparisons = wins + losses + ties
        win_rate = (wins + 0.5 * ties) / total_comparisons if total_comparisons > 0 else 0.0
        
        # Calculate confidence interval
        games_played = results["games_played"].get(model, 0)
        confidence = calculate_confidence_interval(elo, games_played)
        
        model_data.append({
            "model": model,
            "elo": elo,
            "wins": wins,
            "losses": losses,
            "ties": ties,
            "comparisons": total_comparisons,
            "win_rate": win_rate,
            "confidence": confidence
        })
    
    # Sort by Elo rating (descending)
    model_data.sort(key=lambda x: x["elo"], reverse=True)
    
    # Start building HTML table
    html = """
    <table class="leaderboard-table">
        <thead>
            <tr>
                <th class="centered">Rank</th>
                <th>Model</th>
                <th>Elo Rating</th>
                <th class="centered">Win Rate (%)</th>
                <th class="centered">Wins</th>
                <th class="centered">Losses</th>
                <th class="centered">Ties</th>
                <th class="centered">Comparisons</th>
            </tr>
        </thead>
        <tbody>
    """
    
    # Add rows to the HTML table
    for rank, data in enumerate(model_data, 1):
        model = data["model"]
        elo = data["elo"]
        wins = data["wins"]
        losses = data["losses"]
        ties = data["ties"]
        comparisons = data["comparisons"]
        win_rate = data["win_rate"]
        confidence = data["confidence"]
        
        # Create model link if in the mapping
        if model in model_to_hf:
            model_html = f'<a href="{model_to_hf[model]}" target="_blank" rel="noopener noreferrer" class="model-link">{model}<span class="external-icon">↗</span></a>'
        else:
            model_html = model
        
        # Format Elo with confidence interval
        elo_html = f"{elo:.1f} <span class='confidence-value'>± {confidence:.1f}</span>"
        
        # Add row to table
        html += f"""
        <tr>
            <td class="centered"><strong>{rank}</strong></td>
            <td>{model_html}</td>
            <td class="elo-col">{elo_html}</td>
            <td class="centered">{win_rate:.1%}</td>
            <td class="centered">{wins}</td>
            <td class="centered">{losses}</td>
            <td class="centered">{ties}</td>
            <td class="centered">{comparisons}</td>
        </tr>
        """
    
    # Close the HTML table
    html += """
        </tbody>
    </table>
    """
    
    return html

def submit_vote_with_elo(m_a, m_b, winner, feedback, current_results):
    """
    Enhanced version of submit_vote that calculates and applies Elo rating changes.
    This replaces the original submit_vote_fixed function.
    
    Parameters:
    - m_a: Model A name
    - m_b: Model B name
    - winner: 'left', 'right', 'tie', or 'neither'
    - feedback: List of feedback options selected
    - current_results: The current leaderboard state
    
    Returns:
    - Updated results and UI components
    """
    if winner is None:
        # Note: this early return hands back an empty dict rather than the
        # usual 16-element list of UI updates; the caller must handle it.
        print("Warning: Submit called without a winner selected.")
        return {}

    # Update Elo ratings
    updated_results = update_elo_ratings(current_results.copy(), m_a, m_b, winner)

    # Update vote count
    updated_results["votes"] = updated_results.get("votes", 0) + 1
    
    # Save updated results
    save_leaderboard_data(updated_results)

    # Generate HTML leaderboard
    leaderboard_html = generate_leaderboard_html(updated_results)
    
    # Import gradio for the gr.update objects
    import gradio as gr
    
    return [
        True, updated_results,
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(interactive=False),
        gr.update(interactive=False), gr.update(visible=True),
        gr.update(visible=False), gr.update(visible=True),
        gr.update(interactive=False), gr.update(value=leaderboard_html, visible=True),
        gr.update(elem_classes=["results-revealed"]),
        gr.update(interactive=True), gr.update(value=m_a), gr.update(value=m_b)
    ]
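
# Minimal end-to-end sketch of this module's flow (hypothetical model names):
#
#   results = load_leaderboard_data()
#   results = update_elo_ratings(results, "model-a", "model-b", winner="left")
#   save_leaderboard_data(results)             # written locally, then synced
#   html = generate_leaderboard_html(results)  # by leaderboard_scheduler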