arena / leaderboard.py
Kang Suhyun
[#48] Fix Secret Manager initialization failure on Space (#49)
50a5912 unverified
raw
history blame
No virus
3.2 kB
"""
It provides a leaderboard component.
"""
from collections import defaultdict
import enum
import math
import firebase_admin
from firebase_admin import credentials
from firebase_admin import firestore
import gradio as gr
import pandas as pd
from credentials import get_credentials_json
# Firebase is set up as an import-time side effect: the default app is
# initialized with a certificate credential built from get_credentials_json(),
# and `db` is the module-level Firestore client that get_docs() queries.
# TODO(#21): Fix auto-reload issue related to the initialization of Firebase.
firebase_admin.initialize_app(credentials.Certificate(get_credentials_json()))
db = firestore.client()
class LeaderboardTab(enum.Enum):
    """Leaderboard tabs; each member's value is the label shown on its tab."""
    SUMMARIZATION = "Summarization"
    TRANSLATION = "Translation"
# Ref: https://colab.research.google.com/drive/1RAWb22-PFNI-X1gPVzc927SGUdfr6nsR?usp=sharing#scrollTo=QLGc6DwxyvQc pylint: disable=line-too-long
def compute_elo(battles, k=4, scale=400, base=10, initial_rating=1000):
    """Compute Elo ratings by replaying the battles in row order.

    Args:
        battles: DataFrame with "model_a", "model_b" and "winner" columns.
            A winner of "tie" gives each side half a point, "model_a" gives
            model A the full point, and any other value gives it to model B.
        k: Maximum rating change caused by a single battle.
        scale: Divisor applied to the rating difference in the exponent.
        base: Base of the exponent in the expected-score formula.
        initial_rating: Rating assigned to a model on its first battle.

    Returns:
        A defaultdict mapping each model name to its rating. It is empty when
        there are no battles; unseen keys default to `initial_rating`.
    """
    rating = defaultdict(lambda: initial_rating)

    # A DataFrame built from zero records has no columns at all, so the column
    # selection below would raise KeyError. No battles simply means no ratings.
    if battles.empty:
        return rating

    rows = battles[["model_a", "model_b", "winner"]].itertuples(index=False)
    for model_a, model_b, winner in rows:
        rating_a = rating[model_a]
        rating_b = rating[model_b]

        expected_score_a = 1 / (1 + base**((rating_b - rating_a) / scale))
        expected_score_b = 1 / (1 + base**((rating_a - rating_b) / scale))

        scored_point_a = 0.5 if winner == "tie" else int(winner == "model_a")

        # Model B's actual score is the complement of model A's.
        rating[model_a] += k * (scored_point_a - expected_score_a)
        rating[model_b] += k * (1 - scored_point_a - expected_score_b)

    return rating
def get_docs(tab):
    """Stream the stored battle documents for a leaderboard tab.

    Args:
        tab: The LeaderboardTab whose Firestore collection should be read.

    Returns:
        The document stream of the tab's collection, ordered by "timestamp".

    Raises:
        ValueError: If `tab` is not a supported LeaderboardTab. Previously
            this fell through and returned None, which only failed later
            when the caller tried to iterate the result.
    """
    if tab == LeaderboardTab.SUMMARIZATION:
        collection_name = "arena-summarizations"
    elif tab == LeaderboardTab.TRANSLATION:
        collection_name = "arena-translations"
    else:
        raise ValueError(f"Unsupported leaderboard tab: {tab!r}")

    return db.collection(collection_name).order_by("timestamp").stream()
def load_elo_ratings(tab):
    """Build the leaderboard rows for a tab from its stored battles.

    Args:
        tab: The LeaderboardTab whose battles should be rated.

    Returns:
        A list of [rank, model, rating] rows sorted by rating, highest first.
        Ranks start at 1 and ratings are rounded to the nearest integer. The
        list is empty when no battles have been recorded yet.
    """
    columns = ["model_a", "model_b", "winner"]
    records = [{
        "model_a": data["model_a"],
        "model_b": data["model_b"],
        "winner": data["winner"]
    } for data in (doc.to_dict() for doc in get_docs(tab))]

    # Passing the columns explicitly keeps them present even with zero
    # records; otherwise the empty frame has no columns and the Elo
    # computation fails with KeyError on a fresh collection.
    battles = pd.DataFrame(records, columns=columns)

    ratings = compute_elo(battles)
    ranked = sorted(ratings.items(), key=lambda item: item[1], reverse=True)

    # floor(x + 0.5) rounds halves up, unlike round(), which rounds halves to
    # the nearest even integer.
    return [[position, model, math.floor(score + 0.5)]
            for position, (model, score) in enumerate(ranked, start=1)]
def load_summarization_elo_ratings():
    """Return the leaderboard rows for the summarization tab."""
    summarization_tab = LeaderboardTab.SUMMARIZATION
    return load_elo_ratings(summarization_tab)
def load_translation_elo_ratings():
    """Return the leaderboard rows for the translation tab."""
    translation_tab = LeaderboardTab.TRANSLATION
    return load_elo_ratings(translation_tab)
# Refresh period passed to the leaderboard tables: 10 minutes, in seconds.
LEADERBOARD_UPDATE_INTERVAL = 10 * 60
# Note displayed with each table; keep its wording in sync with the interval.
LEADERBOARD_INFO = "The leaderboard is updated every 10 minutes."
def build_leaderboard():
    """Lay out the leaderboard: one tab per LeaderboardTab member.

    Each tab holds a ratings table, re-populated from its loader every
    LEADERBOARD_UPDATE_INTERVAL, followed by the LEADERBOARD_INFO note.
    """
    # NOTE(review): the source's indentation was lost; the Markdown note is
    # assumed to sit inside each tab, right after its table - confirm.

    def render_ratings(loader):
        # Same table and note for every tab; only the data loader differs.
        gr.Dataframe(headers=["Rank", "Model", "Elo rating"],
                     datatype=["number", "str", "number"],
                     value=loader,
                     every=LEADERBOARD_UPDATE_INTERVAL)
        gr.Markdown(LEADERBOARD_INFO)

    with gr.Tabs():
        with gr.Tab(LeaderboardTab.SUMMARIZATION.value):
            render_ratings(load_summarization_elo_ratings)

        # TODO(#9): Add language filter options.
        with gr.Tab(LeaderboardTab.TRANSLATION.value):
            render_ratings(load_translation_elo_ratings)