kaikaidai committed on
Commit
5267683
1 Parent(s): ab62ff3

Create leaderboard.py

Browse files
Files changed (1) hide show
  1. leaderboard.py +114 -0
leaderboard.py ADDED
@@ -0,0 +1,114 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from collections import defaultdict
2
+ from datetime import datetime, timezone
3
+ from typing import Dict, List
4
+
5
# Rating-system constants
DEFAULT_ELO: int = 1200  # Rating a model holds before any vote has moved it
K_FACTOR: int = 32  # Scale of a single update; also its largest possible swing
8
+
9
def get_leaderboard(
    model_data: Dict,
    voting_data: List,
    show_preliminary=True,
    min_votes: int = 500,
):
    """Build leaderboard rows by replaying every vote through an ELO update.

    Args:
        model_data: Mapping of model name to its metadata. Each value must
            provide the "organization" and "license" keys.
        voting_data: Vote records exposing ``.get`` for the "model_a",
            "model_b" and "winner" fields. A winner of "A" or "B" awards the
            win to that side; any other truthy value is scored as a tie.
        show_preliminary: When False, models with fewer than ``min_votes``
            counted votes are omitted from the result.
        min_votes: Vote threshold applied when ``show_preliminary`` is False.

    Returns:
        A list of dicts with the keys "Model", "ELO Score", "95% CI",
        "# Votes", "Organization" and "License", ordered from the highest
        rating to the lowest. Models with equal ratings keep the order in
        which they appear in ``model_data``.
    """
    ratings = defaultdict(lambda: DEFAULT_ELO)
    matches = defaultdict(int)

    for vote in voting_data:
        try:
            model_a = vote.get("model_a")
            model_b = vote.get("model_b")
            winner = vote.get("winner")

            # Skip incomplete votes and votes for models that are no longer
            # part of model_data.
            if (
                not all([model_a, model_b, winner])
                or model_a not in model_data
                or model_b not in model_data
            ):
                continue

            # Reuse the module's ELO helper so the update rule lives in
            # exactly one place.
            change_a, change_b = calculate_elo_change(
                ratings[model_a], ratings[model_b], winner
            )
        except Exception as e:
            # Best-effort: one malformed record must not abort the whole
            # leaderboard, so report it and move on.
            print(f"Error processing vote: {e}")
            continue

        # Only touch the tallies once the vote is known to be usable, so the
        # match counts and the ratings can never drift apart.
        ratings[model_a] += change_a
        ratings[model_b] += change_b
        matches[model_a] += 1
        matches[model_b] += 1

    # Collect (rating, row) pairs so sorting can use the real rating rather
    # than the truncated string shown to the user.
    scored_rows = []
    for model, info in model_data.items():
        votes = matches[model]
        if not show_preliminary and votes < min_votes:
            continue

        elo = ratings[model]
        # Interval half-width shrinks with the square root of the vote count.
        ci = 1.96 * (400 / (votes + 1) ** 0.5) if votes > 0 else 0
        row = {
            "Model": model,
            "ELO Score": f"{int(elo)}",
            "95% CI": f"±{int(ci)}",
            "# Votes": votes,
            "Organization": info["organization"],
            "License": info["license"],
        }
        scored_rows.append((elo, row))

    # list.sort is stable, also with reverse=True, so ties keep input order.
    scored_rows.sort(key=lambda pair: pair[0], reverse=True)

    return [row for _, row in scored_rows]
78
+
79
def get_leaderboard_stats(model_data: Dict, voting_data: List) -> str:
    """Return a Markdown summary of the leaderboard.

    Args:
        model_data: Collection of models; only its length is used.
        voting_data: Collection of votes; only its length is used.

    Returns:
        A Markdown string with a heading followed by the model count, the
        vote count and the current UTC time reported to the hour.
    """
    # The format string hard-codes ":00" and has no seconds field, so the
    # timestamp is reported at hour granularity without zeroing anything.
    last_updated = datetime.now(timezone.utc).strftime("%B %d, %Y at %H:00 UTC")
    total_models = len(model_data)
    total_votes = len(voting_data)

    return f"""
### Leaderboard Stats
- **Total Models**: {total_models}
- **Total Votes**: {total_votes}
- **Last Updated**: {last_updated}
"""
94
+
95
def calculate_elo_change(rating_a: float, rating_b: float, winner: str) -> tuple[float, float]:
    """Return the rating deltas for both sides of one comparison.

    Args:
        rating_a: Current rating of side A.
        rating_b: Current rating of side B.
        winner: "A" or "B" for a decisive result; any other value is scored
            as a tie.

    Returns:
        A ``(change_a, change_b)`` tuple of the amounts to add to each
        side's rating.
    """
    # Logistic expectation on a 400-point scale; the two sides sum to 1.
    rating_gap = (rating_b - rating_a) / 400
    expected_a = 1 / (1 + 10 ** rating_gap)
    expected_b = 1 - expected_a

    # A win scores 1, a loss 0, and everything else is a tie at 0.5.
    score_a = 1 if winner == "A" else (0 if winner == "B" else 0.5)
    score_b = 1 - score_a

    return (
        K_FACTOR * (score_a - expected_a),
        K_FACTOR * (score_b - expected_b),
    )
111
+
112
def get_model_rankings(leaderboard: List[Dict]) -> Dict[str, int]:
    """Map each model name to its 1-based position in the leaderboard.

    Args:
        leaderboard: Rows in ranking order, each with a "Model" key.

    Returns:
        A dict from model name to rank, where the first row has rank 1. If a
        name appears more than once, its last position wins.
    """
    rankings = {}
    for position, row in enumerate(leaderboard, start=1):
        rankings[row["Model"]] = position
    return rankings