nazneen committed on
Commit f1d79c0
1 Parent(s): c4e54a6

Upload 4 files

Files changed (4)
  1. README.md +2 -2
  2. app.py +42 -0
  3. requirements.txt +2 -0
  4. utils.py +98 -0
README.md CHANGED
@@ -1,7 +1,7 @@
 ---
-title: Elo
+title: Instruction Model Outputs Filtered
 emoji: πŸ’©
-colorFrom: purple
+colorFrom: blue
 colorTo: pink
 sdk: streamlit
 sdk_version: 1.17.0
app.py ADDED
@@ -0,0 +1,42 @@
+import os
+from pathlib import Path
+
+import pandas as pd
+import streamlit as st
+import utils as ut
+
+st.set_page_config(layout="wide")
+
+
+st.markdown("# Elo Rating of Models")
+st.markdown(
+    """This app shows the Elo rating of models on the H4 Hub based on their performance on the H4 eval dataset.""")
+st.markdown(
+    """**Notes**
+* This is currently using synthetic data
+* You can tweak the number of tasks, models, and human ratings per task to generate different datasets
+"""
+)
+# user input
+
+num_tasks = st.number_input("Number of tasks", min_value=1, max_value=5000, value=100)
+num_models = st.number_input("Number of models", min_value=1, max_value=100, value=4)
+num_human_ratings = st.number_input(
+    "Number of human ratings per task", min_value=1, max_value=10, value=3
+)
+
+button = st.button("Show me the leaderboard!")
+
+if button:
+    # generate synthetic data
+    df = ut.create_synthetic_data(n_tasks=num_tasks, n_models=num_models, n_ratings=num_human_ratings)
+    # calculate the Elo rating
+    elo_df = ut.calculate_elo_rating(df)
+    # show the leaderboard
+    ut.display_leaderboard(elo_df)
+
+
+
+
+
+
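Taken together, app.py only wires the three utils.py helpers to Streamlit widgets. As a minimal sketch (not part of the commit), the same pipeline can be exercised from a plain Python script, assuming the utils.py from this commit is on the import path; the groupby/sort/print at the end mirrors what display_leaderboard does via st.write:

# sketch: run the pipeline without the Streamlit UI (script name and layout are assumptions)
import utils as ut

df = ut.create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3)  # same defaults as the widgets
elo_df = ut.calculate_elo_rating(df)

# replicate display_leaderboard(), printing instead of calling st.write
leaderboard = elo_df.groupby("model")["new_rating"].mean().reset_index()
leaderboard = leaderboard.sort_values("new_rating", ascending=False)
leaderboard["rank"] = range(1, len(leaderboard) + 1)
print(leaderboard)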
requirements.txt ADDED
@@ -0,0 +1,2 @@
+datasets
+python-dotenv
utils.py ADDED
@@ -0,0 +1,98 @@
+import numpy as np
+import pandas as pd
+import streamlit as st
+
+
+def create_synthetic_data(n_tasks=100, n_models=4, n_ratings=3):
+    """Create a synthetic dataframe with human ratings of model performance on a set of tasks.
+
+    Parameters
+    ----------
+    n_tasks : int
+        The number of tasks.
+    n_models : int
+        The number of models.
+    n_ratings : int
+        The number of human ratings per model per task.
+
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame containing human ratings of model performance on a set of tasks.
+    """
+    # create a synthetic dataframe with n_ratings human ratings of n_models models on n_tasks tasks
+    df = pd.DataFrame({'task': np.repeat(range(n_tasks), n_models * n_ratings),
+                       'model': np.tile(np.repeat(range(n_models), n_ratings), n_tasks),
+                       'rating': np.tile(np.random.randint(0, 5, n_models * n_ratings), n_tasks)})
+    # calculate the mean score of each model on each task
+    df['score'] = df.groupby(['task', 'model'])['rating'].transform('mean')
+    # calculate the baseline (lowest) score for each task
+    df['baseline'] = df.groupby('task')['score'].transform('min')
+    # express each model's score relative to the task baseline
+    df['score'] = df['score'] - df['baseline']
+    # drop unnecessary columns
+    df = df.drop(['rating', 'baseline'], axis=1)
+    # drop duplicates
+    df = df.drop_duplicates()
+    return df
+
+
+def calculate_elo_rating(df, k=32, initial_rating=0):
+    """Calculate an Elo rating for each model based on human ratings of model performance on a set of tasks.
+
+    Parameters
+    ----------
+    df : pandas.DataFrame
+        DataFrame containing human ratings of model performance on a set of tasks.
+    k : int
+        The k-factor.
+    initial_rating : int
+        The initial rating.
+
+    Returns
+    -------
+    pandas.DataFrame
+        DataFrame containing an Elo rating for each model.
+    """
+    # calculate an Elo rating for each model from its per-task scores
+    # work on a copy so the caller's dataframe is not modified
+    df = df.copy()
+    # create a dataframe with all possible combinations of tasks and models
+    df_all = pd.DataFrame({'task': np.repeat(range(df['task'].max() + 1), df['model'].max() + 1),
+                           'model': np.tile(range(df['model'].max() + 1), df['task'].max() + 1)})
+    # merge with the original dataframe
+    df = df_all.merge(df, on=['task', 'model'], how='left')
+    # fill missing values with 0
+    df['score'] = df['score'].fillna(0)
+    # expected score for each model (the logistic curve used by Elo)
+    df['expected_score'] = df.groupby('model')['score'].transform(lambda x: 1 / (1 + 10 ** (-x / 400)))
+    # actual score for each model (1 if it beat the task baseline, 0 otherwise)
+    df['actual_score'] = df.groupby('model')['score'].transform(lambda x: x > 0).astype(int)
+    # rating for each model
+    df['rating'] = df.groupby('model')['expected_score'].transform(lambda x: x * k + initial_rating)
+    # rating change for each model
+    df['rating_change'] = df.groupby('model')['actual_score'].transform(lambda x: x * k)
+    # new rating for each model
+    df['new_rating'] = df['rating'] + df['rating_change']
+    # drop unnecessary columns
+    df = df.drop(['score', 'expected_score', 'actual_score', 'rating', 'rating_change'], axis=1)
+    return df
+
+def display_leaderboard(elo, n_models=4):
+    """Display the Elo rating of each model as a leaderboard sorted by rank.
+
+    Parameters
+    ----------
+    elo : pandas.DataFrame
+        DataFrame containing an Elo rating for each model.
+    n_models : int
+        The number of models (unused; the rank is derived from the dataframe itself).
+    """
+    # calculate the average Elo rating for each model
+    elo = elo.groupby('model')['new_rating'].mean().reset_index()
+    # sort models by Elo rating
+    elo = elo.sort_values('new_rating', ascending=False)
+    # add a rank column (1 = best)
+    elo['rank'] = range(1, len(elo) + 1)
+    # display the leaderboard
+    st.write(elo)
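For reference, the `1 / (1 + 10 ** (-x / 400))` term in calculate_elo_rating is the logistic expected-score curve from the standard Elo system, applied here to a model's score margin over the task baseline rather than to the rating difference between two opponents. A minimal sketch of the textbook pairwise update, for comparison only (the name elo_update and the starting rating of 1000 are illustrative, not part of this commit):

def elo_update(rating_a, rating_b, score_a, k=32):
    # expected score of A against B, from the rating difference
    expected_a = 1 / (1 + 10 ** ((rating_b - rating_a) / 400))
    # score_a is 1.0 if A wins, 0.5 for a draw, 0.0 if A loses
    new_a = rating_a + k * (score_a - expected_a)
    new_b = rating_b + k * ((1 - score_a) - (1 - expected_a))
    return new_a, new_b

# example: both models start at 1000 and model A wins one comparison
print(elo_update(1000, 1000, 1.0))  # (1016.0, 984.0)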