import numpy as np
import pandas as pd

try:
    import streamlit as st
except ImportError:
    # streamlit is only needed to render the leaderboard; the data and rating
    # helpers below are pure numpy/pandas and stay importable without it.
    st = None


def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3):
    """Create a synthetic dataframe of per-task, per-model relative scores.

    Random integer ratings in ``[0, 5)`` are drawn for every
    (task, model, rater) combination, averaged per (task, model), and then
    shifted so that the lowest-scoring model of each task has score 0.

    Parameters
    ----------
    n_tasks : int
        The number of tasks.
    n_models : int
        The number of models.
    n_ratings : int
        The number of human ratings per (task, model) pair.

    Returns
    -------
    pandas.DataFrame
        One row per (task, model) with columns ``task``, ``model`` and
        ``score`` (mean rating minus the task's minimum mean rating).
    """
    rows_per_task = n_models * n_ratings
    df = pd.DataFrame({
        'task': np.repeat(np.arange(n_tasks), rows_per_task),
        'model': np.tile(np.repeat(np.arange(n_models), n_ratings), n_tasks),
        # Draw an independent rating for every row. Tiling one small sample
        # across tasks would make every task carry identical ratings.
        'rating': np.random.randint(0, 5, n_tasks * rows_per_task),
    })

    # Mean rating of each model on each task.
    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
    # Baseline of a task is the score of its weakest model.
    df['baseline'] = df.groupby('task')['score'].transform('min')
    df['score'] = df['score'] - df['baseline']

    # Collapse the per-rater rows into a single row per (task, model).
    df = df.drop(['rating', 'baseline'], axis=1)
    df = df.drop_duplicates()
    return df


def calculate_elo_rating(df, k=32, initial_rating=0):
    """Calculate an Elo-style rating for every (task, model) pair.

    The frame is first expanded to the full task x model grid (ids are
    assumed to be non-negative integers starting at 0); combinations missing
    from ``df`` get a score of 0. For each row the rating is then::

        new_rating = k / (1 + 10 ** (-score / 400)) + initial_rating
                     + k * (score > 0)

    NOTE: this is a per-row formula, not a sequential pairwise Elo update.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with at least the columns ``task``, ``model`` and
        ``score``. It is not modified.
    k : int
        The k-factor.
    initial_rating : int
        The initial rating.

    Returns
    -------
    pandas.DataFrame
        One row per (task, model) combination with the ``score`` column
        replaced by ``new_rating``.
    """
    if df.empty:
        # max() of an empty column is NaN, which cannot size the grid below.
        result = df.drop(columns=['score'])
        result['new_rating'] = np.array([], dtype=float)
        return result

    n_tasks = int(df['task'].max()) + 1
    n_models = int(df['model'].max()) + 1

    # Every possible (task, model) combination, task-major order.
    grid = pd.DataFrame({
        'task': np.repeat(np.arange(n_tasks), n_models),
        'model': np.tile(np.arange(n_models), n_tasks),
    })
    df = grid.merge(df, on=['task', 'model'], how='left')

    # Combinations absent from the input count as a score of 0.
    score = df['score'].fillna(0)

    # All of the following are element-wise, so no grouping is required.
    expected_score = 1 / (1 + 10 ** (-score / 400))
    actual_score = (score > 0).astype(int)
    rating = expected_score * k + initial_rating
    rating_change = actual_score * k

    df['new_rating'] = rating + rating_change
    return df.drop(columns=['score'])


def build_leaderboard(elo):
    """Aggregate per-row ratings into a ranked leaderboard.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame with the columns ``model`` and ``new_rating``.

    Returns
    -------
    pandas.DataFrame
        One row per model with its mean ``new_rating``, sorted from highest
        to lowest, plus a 1-based ``rank`` column.
    """
    board = elo.groupby('model')['new_rating'].mean().reset_index()
    board = board.sort_values('new_rating', ascending=False)
    # Derive the ranks from the data itself so the column length always
    # matches the number of models actually present.
    board['rank'] = np.arange(1, len(board) + 1)
    return board


def display_leaderboard(elo, n_models=4):
    """Display the mean rating of each model as a ranked leaderboard.

    Parameters
    ----------
    elo : pandas.DataFrame
        DataFrame with the columns ``model`` and ``new_rating``.
    n_models : int
        Kept for backward compatibility and no longer used: ranks are
        derived from the models present in ``elo``, so a value that does
        not match the data no longer raises.

    Raises
    ------
    ImportError
        If streamlit is not installed.
    """
    board = build_leaderboard(elo)
    if st is None:
        raise ImportError('streamlit is required to display the leaderboard')
    st.write(board)