suhyun.kang committed on
Commit
a19f11e
1 Parent(s): 0ac094d

[#1] Add leaderboard based on Elo rating

Browse files

Changes:
- Added leaderboards for Summarization and Translation categories.
- Implemented Elo rating computation for each category.
- Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc

Screenshot: https://screen.yanolja.in/j7inrSXCtFtnJije.png

Files changed (2) hide show
  1. app.py +4 -0
  2. leaderboard.py +78 -0
app.py CHANGED
@@ -13,6 +13,8 @@ import firebase_admin
13
  from firebase_admin import firestore
14
  import gradio as gr
15
 
 
 
16
  db_app = firebase_admin.initialize_app()
17
  db = firestore.client()
18
 
@@ -214,6 +216,8 @@ with gr.Blocks() as app:
214
  submit.click(user, prompt, states + model_names,
215
  queue=False).then(bot, states, states + responses)
216
 
 
 
217
  if __name__ == "__main__":
218
  # We need to enable queue to use generators.
219
  app.queue()
 
13
  from firebase_admin import firestore
14
  import gradio as gr
15
 
16
+ from leaderboard import build_leaderboard
17
+
18
  db_app = firebase_admin.initialize_app()
19
  db = firestore.client()
20
 
 
216
  submit.click(user, prompt, states + model_names,
217
  queue=False).then(bot, states, states + responses)
218
 
219
+ build_leaderboard(db)
220
+
221
  if __name__ == "__main__":
222
  # We need to enable queue to use generators.
223
  app.queue()
leaderboard.py ADDED
@@ -0,0 +1,78 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ It provides a leaderboard component.
3
+ """
4
+
5
+ from collections import defaultdict
6
+ import enum
7
+ import math
8
+
9
+ import gradio as gr
10
+ import pandas as pd
11
+
12
+
13
class LeaderboardTab(enum.Enum):
    """Leaderboard categories; each value is the label shown on its tab."""

    # Member order is the order in which the tabs are listed.
    SUMMARIZATION = "Summarization"
    TRANSLATION = "Translation"
16
+
17
+
18
+ # Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
19
def compute_elo(battles, k=4, scale=400, base=10, initial_rating=1000):
    """Compute Elo ratings by replaying pairwise battles in row order.

    Args:
        battles: A pandas DataFrame with "model_a", "model_b" and "winner"
            columns. A "winner" of "tie" gives each side half a point,
            "model_a" gives model_a the full point, and any other value is
            scored as a win for model_b.
        k: Multiplier applied to the difference between the scored and the
            expected points, i.e. the largest possible change per battle.
        scale: Divisor applied to the rating difference inside the exponent
            of the expected-score formula.
        base: Base of that exponent.
        initial_rating: Rating a model has before its first battle.

    Returns:
        A defaultdict mapping each model name seen in `battles` to its final
        rating. It is empty when `battles` has no rows.
    """
    rating = defaultdict(lambda: initial_rating)

    # A frame built from an empty list has no columns at all, so selecting
    # them below would raise KeyError. No battles simply means no ratings.
    if battles.empty:
        return rating

    outcomes = battles[["model_a", "model_b", "winner"]]
    for model_a, model_b, winner in outcomes.itertuples(index=False):
        rating_a = rating[model_a]
        rating_b = rating[model_b]

        # Expected points for each side given the current rating gap.
        expected_score_a = 1 / (1 + base**((rating_b - rating_a) / scale))
        expected_score_b = 1 / (1 + base**((rating_a - rating_b) / scale))

        # Points actually scored by model_a; model_b gets the remainder.
        scored_point_a = 0.5 if winner == "tie" else int(winner == "model_a")

        rating[model_a] += k * (scored_point_a - expected_score_a)
        rating[model_b] += k * (1 - scored_point_a - expected_score_b)

    return rating
36
+
37
+
38
def get_docs(tab, db):
    """Stream the battle documents that back a leaderboard tab.

    Args:
        tab: Object whose `label` attribute equals one of the
            LeaderboardTab values.
        db: Client exposing `collection(name).order_by(field).stream()`.

    Returns:
        The result of `stream()` on the tab's collection, ordered by the
        "timestamp" field.

    Raises:
        ValueError: If `tab.label` does not match any known leaderboard tab.
    """
    collection_by_label = {
        LeaderboardTab.SUMMARIZATION.value: "arena-summarizations",
        LeaderboardTab.TRANSLATION.value: "arena-translations",
    }

    collection_name = collection_by_label.get(tab.label)
    if collection_name is None:
        # Fail here with a clear message instead of returning None and
        # letting the caller crash while iterating over it.
        raise ValueError(f"Unsupported leaderboard tab: {tab.label!r}")

    return db.collection(collection_name).order_by("timestamp").stream()
44
+
45
+
46
+ # TODO(#8): Update the value periodically.
47
def load_elo_ratings(tab, db):
    """Build the leaderboard rows for one tab from its stored battles.

    Args:
        tab: Tab object passed through to get_docs to select the collection.
        db: Database client passed through to get_docs.

    Returns:
        A list of `[rank, model, rating]` rows sorted by rating, highest
        first. Rank starts at 1 and the rating is rounded to the nearest
        integer with halves rounded up. The list is empty when the tab has
        no stored battles.

    Raises:
        KeyError: If a document lacks "model_a", "model_b" or "winner".
    """
    columns = ["model_a", "model_b", "winner"]

    battles = []
    for doc in get_docs(tab, db):
        data = doc.to_dict()
        battles.append({column: data[column] for column in columns})

    # Naming the columns explicitly keeps them present even when there are
    # no documents; otherwise the frame has no columns and the Elo
    # computation fails with KeyError on an empty collection.
    battles = pd.DataFrame(battles, columns=columns)
    ratings = compute_elo(battles)

    sorted_ratings = sorted(ratings.items(), key=lambda x: x[1], reverse=True)
    return [[i + 1, model, math.floor(rating + 0.5)]
            for i, (model, rating) in enumerate(sorted_ratings)]
65
+
66
+
67
def build_leaderboard(db):
    """Render one Elo leaderboard table per category inside a tab group.

    The ratings are loaded once, while the components are being built.
    NOTE(review): presumably called inside an active gr.Blocks context;
    confirm against the caller.

    Args:
        db: Database client forwarded to load_elo_ratings.
    """
    # TODO(#9): Add language filter options to the translation tab.
    categories = (LeaderboardTab.SUMMARIZATION, LeaderboardTab.TRANSLATION)

    with gr.Tabs():
        for category in categories:
            with gr.Tab(category.value) as category_tab:
                rows = load_elo_ratings(category_tab, db)
                gr.Dataframe(headers=["Rank", "Model", "Elo rating"],
                             datatype=["number", "str", "number"],
                             value=rows)