import numpy as np
import pandas as pd
import streamlit as st


def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3):
    """Create a synthetic dataframe with human ratings of model performance on a set of tasks.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per (task, model) pair.

    Returns
    -------
    pandas.DataFrame
        DataFrame with one row per (task, model) pair and a 'score' column holding the
        mean rating for that pair, measured relative to the weakest model on the task.
    """
    # build one row per (task, model, rating-slot) combination with a random rating in [0, 4]
    df = pd.DataFrame({'task': np.repeat(range(n_tasks), n_models * n_ratings),
                       'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
                       'rating': np.random.randint(0, 5, n_tasks * n_models * n_ratings)})
    # mean rating for each (task, model) pair
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # the lowest mean score on each task is used as that task's baseline
    df['baseline'] = df.groupby('task')['score'].transform('min')
    # express each model's score relative to the task baseline
    df['score'] = df['score'] - df['baseline']
    # drop the raw ratings and the baseline, then keep one row per (task, model) pair
    df = df.drop(['rating', 'baseline'], axis=1)
    df = df.drop_duplicates()
    return df
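
# Illustrative usage (a sketch, not part of the app's normal flow): with the defaults
# above, the returned frame has one row per (task, model) pair, e.g.
#     df = create_synthetic_data(n_tasks=10, n_models=4, n_ratings=3)
#     # -> 10 * 4 = 40 rows with columns ['task', 'model', 'score']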


def calculate_elo_rating(df, k=32, initial_rating=0):
    """Calculate ELORating for each model based on human ratings of model performance on a set of tasks.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame of per-(task, model) scores, as returned by ``create_synthetic_data``.
    k : int
        The Elo K-factor controlling the size of rating updates.
    initial_rating : int
        The rating every model starts from.

    Returns
    -------
    pandas.DataFrame
        DataFrame with a 'new_rating' column holding the Elo-style rating for each (task, model) pair.
    """
    df = df.copy()
    # build every (task, model) combination so pairs without a score can be filled with 0
    df_all = pd.DataFrame({'task': np.repeat(range(df['task'].max() + 1), df['model'].max() + 1),
                           'model': np.tile(range(df['model'].max() + 1), df['task'].max() + 1)})
    # merge with original dataframe
    df = df_all.merge(df, on=['task', 'model'], how='left')
    # models with no recorded score on a task get a relative score of 0
    df['score'] = df['score'].fillna(0)
    # expected score from the Elo logistic curve applied to the relative score
    df['expected_score'] = 1 / (1 + 10 ** (-df['score'] / 400))
    # actual score: 1 if the model beat the task baseline, 0 otherwise
    df['actual_score'] = (df['score'] > 0).astype(int)
    # Elo-style rating and rating change, scaled by the K-factor
    df['rating'] = df['expected_score'] * k + initial_rating
    df['rating_change'] = df['actual_score'] * k
    # new rating for each (task, model) pair
    df['new_rating'] = df['rating'] + df['rating_change']
    # keep only the task, model and new rating columns
    df = df.drop(['score', 'expected_score', 'actual_score', 'rating', 'rating_change'], axis=1)
    return df
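

# For reference, a minimal sketch of the standard pairwise Elo update. This helper is
# illustrative only (its name and signature are not part of the app); calculate_elo_rating
# above instead applies the Elo logistic transform directly to per-task relative scores.
def _pairwise_elo_update(rating_a, rating_b, score_a, k=32):
    """Return updated (rating_a, rating_b) after one comparison.

    score_a is 1.0 if model A wins, 0.5 for a tie, and 0.0 if model B wins.
    """
    # expected score of A from the Elo logistic curve
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    new_a = rating_a + k * (score_a - expected_a)
    new_b = rating_b + k * ((1 - score_a) - (1 - expected_a))
    return new_a, new_b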


def display_leaderboard(elo, n_models=4):
    """Display each model's average Elo-style rating as a ranked leaderboard.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame returned by ``calculate_elo_rating``, containing a 'new_rating' column.
    n_models : int
        The number of models expected in the leaderboard.
    """
    # calculate average Elo rating for each model
    elo = elo.groupby('model')['new_rating'].mean().reset_index()
    # sort models by Elo rating
    elo = elo.sort_values('new_rating', ascending=False)
    # add rank column (1 = highest rating)
    elo['rank'] = range(1, len(elo) + 1)
    # render the leaderboard in the Streamlit app
    st.write(elo)
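

# A minimal sketch of how the pieces could be wired together when this file is run with
# `streamlit run`. The sidebar controls and their ranges are assumptions, not part of the
# original functions above.
if __name__ == '__main__':
    st.title('Model leaderboard from synthetic human ratings')
    n_tasks = st.sidebar.slider('Number of tasks', 10, 500, 100)
    k = st.sidebar.slider('K-factor', 1, 64, 32)
    data = create_synthetic_data(n_tasks=n_tasks)
    elo = calculate_elo_rating(data, k=k)
    display_leaderboard(elo)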