Spaces:
Sleeping
Sleeping
Fix elo ratings model links
Browse files
- app.py +1 -1
- elo_utils.py +59 -11
app.py
CHANGED
@@ -205,7 +205,7 @@ def get_leaderboard_df():
|
|
205 |
def get_evaluation_queue_df():
|
206 |
if repo:
|
207 |
print("Pulling changes for the evaluation queue.")
|
208 |
-
|
209 |
|
210 |
entries = [
|
211 |
entry
|
|
|
205 |
def get_evaluation_queue_df():
|
206 |
if repo:
|
207 |
print("Pulling changes for the evaluation queue.")
|
208 |
+
repo.git_pull()
|
209 |
|
210 |
entries = [
|
211 |
entry
|
elo_utils.py
CHANGED
@@ -8,10 +8,37 @@ from datasets import load_dataset
|
|
8 |
|
9 |
from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
|
10 |
from utils import make_clickable_model
|
11 |
-
from visualizations import (
|
12 |
-
|
13 |
-
|
14 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
15 |
|
16 |
|
17 |
@dataclass
|
@@ -26,7 +53,7 @@ class EloEvalResult:
|
|
26 |
def to_dict(self):
|
27 |
base_model = f"{self.model}"
|
28 |
data_dict = {}
|
29 |
-
data_dict["Model"] =
|
30 |
data_dict["GPT-4 (all)"] = self.gpt_4_all
|
31 |
data_dict["Human (all)"] = self.human_all
|
32 |
data_dict["Human (instruct)"] = self.human_instruct
|
@@ -61,7 +88,13 @@ def create_eval_df(df, tie_allowed):
|
|
61 |
}
|
62 |
|
63 |
if tie_allowed:
|
64 |
-
response["win"] =
|
|
|
|
|
|
|
|
|
|
|
|
|
65 |
else:
|
66 |
response["win"] = "model_a" if response["rating"] < 5 else "model_b"
|
67 |
|
@@ -84,7 +117,13 @@ def create_eval_df_for_gpt(df, tie_allowed):
|
|
84 |
}
|
85 |
|
86 |
if tie_allowed:
|
87 |
-
response["win"] =
|
|
|
|
|
|
|
|
|
|
|
|
|
88 |
else:
|
89 |
response["win"] = "model_a" if response["rating"] < 5 else "model_b"
|
90 |
|
@@ -124,13 +163,20 @@ def get_elo_results(df_instruct, df_code_instruct, tie_allowed):
|
|
124 |
df_all = pd.concat([df_instruct, df_code_instruct])
|
125 |
|
126 |
df_gpt_4 = load_dataset(
|
127 |
-
"gpt_4_evals/data/",
|
|
|
|
|
128 |
).to_pandas()
|
129 |
|
130 |
dfs = [df_instruct, df_code_instruct, df_all]
|
131 |
-
elo_ratings = [
|
|
|
|
|
|
|
132 |
|
133 |
-
gpt_4_elo_ratings = convert_rating_from_float_to_int(
|
|
|
|
|
134 |
elo_ratings.append(gpt_4_elo_ratings)
|
135 |
|
136 |
results = [
|
@@ -166,7 +212,9 @@ def get_elo_plots(df_instruct, df_code_instruct, tie_allowed):
|
|
166 |
|
167 |
BOOTSTRAP_ROUNDS = 1000
|
168 |
if "bootstrap_elo_lu" not in globals():
|
169 |
-
bootstrap_elo_lu = get_bootstrap_result(
|
|
|
|
|
170 |
|
171 |
plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
|
172 |
|
|
|
8 |
|
9 |
from content import PLOT_1_TITLE, PLOT_2_TITLE, PLOT_3_TITLE, PLOT_4_TITLE
|
10 |
from utils import make_clickable_model
|
11 |
+
from visualizations import (
|
12 |
+
get_bootstrap_result,
|
13 |
+
switch_model_a_b,
|
14 |
+
visualize_battle_count,
|
15 |
+
visualize_bootstrap_scores,
|
16 |
+
visualize_pairwise_win_fraction,
|
17 |
+
visualize_rating_count,
|
18 |
+
)
|
19 |
+
|
20 |
+
|
21 |
+
# Hugging Face pages for the models that have a dedicated link.
KOALA_LINK = "https://huggingface.co/TheBloke/koala-13B-HF"
VICUNA_LINK = "https://huggingface.co/HuggingFaceH4/stable-vicuna-13b-2904"
OASST_LINK = "https://huggingface.co/OpenAssistant/oasst-sft-4-pythia-12b-epoch-3.5"
DOLLY_LINK = "https://huggingface.co/databricks/dolly-v2-12b"
# Fallback target used for any model name without a dedicated link above.
MODEL_PAGE = "https://huggingface.co/models"

# Model name -> page URL; names not listed here fall back to MODEL_PAGE.
_ELO_MODEL_LINKS = {
    "dolly-12b": DOLLY_LINK,
    "vicuna-13b": VICUNA_LINK,
    "koala-13b": KOALA_LINK,
    "oasst-12b": OASST_LINK,
}


def make_clickable_model_elo(model_name):
    """Return an HTML anchor that shows ``model_name`` and links to its page.

    Args:
        model_name: Model name to display. The names "dolly-12b",
            "vicuna-13b", "koala-13b" and "oasst-12b" map to their dedicated
            pages; every other name links to ``MODEL_PAGE``.

    Returns:
        An ``<a target="_blank" ...>`` HTML string whose visible text is
        ``model_name`` exactly as passed in (it is not HTML-escaped).
    """
    # A single table lookup replaces the former if/elif chain; the default
    # covers the old ``else`` branch.
    link = _ELO_MODEL_LINKS.get(model_name, MODEL_PAGE)
    return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
|
42 |
|
43 |
|
44 |
@dataclass
|
|
|
53 |
def to_dict(self):
|
54 |
base_model = f"{self.model}"
|
55 |
data_dict = {}
|
56 |
+
data_dict["Model"] = make_clickable_model_elo(base_model)
|
57 |
data_dict["GPT-4 (all)"] = self.gpt_4_all
|
58 |
data_dict["Human (all)"] = self.human_all
|
59 |
data_dict["Human (instruct)"] = self.human_instruct
|
|
|
88 |
}
|
89 |
|
90 |
if tie_allowed:
|
91 |
+
response["win"] = (
|
92 |
+
"model_a"
|
93 |
+
if response["rating"] < 4
|
94 |
+
else "model_b"
|
95 |
+
if response["rating"] > 5
|
96 |
+
else "tie"
|
97 |
+
)
|
98 |
else:
|
99 |
response["win"] = "model_a" if response["rating"] < 5 else "model_b"
|
100 |
|
|
|
117 |
}
|
118 |
|
119 |
if tie_allowed:
|
120 |
+
response["win"] = (
|
121 |
+
"model_a"
|
122 |
+
if response["rating"] < 4
|
123 |
+
else "model_b"
|
124 |
+
if response["rating"] > 5
|
125 |
+
else "tie"
|
126 |
+
)
|
127 |
else:
|
128 |
response["win"] = "model_a" if response["rating"] < 5 else "model_b"
|
129 |
|
|
|
163 |
df_all = pd.concat([df_instruct, df_code_instruct])
|
164 |
|
165 |
df_gpt_4 = load_dataset(
|
166 |
+
"gpt_4_evals/data/",
|
167 |
+
split="train",
|
168 |
+
revision="e007baaf6e505731c08a0bc1a833a1f8f8cb8846",
|
169 |
).to_pandas()
|
170 |
|
171 |
dfs = [df_instruct, df_code_instruct, df_all]
|
172 |
+
elo_ratings = [
|
173 |
+
convert_rating_from_float_to_int(create_eval_df(df, tie_allowed=tie_allowed))
|
174 |
+
for df in dfs
|
175 |
+
]
|
176 |
|
177 |
+
gpt_4_elo_ratings = convert_rating_from_float_to_int(
|
178 |
+
create_eval_df_for_gpt(df_gpt_4, tie_allowed=tie_allowed)
|
179 |
+
)
|
180 |
elo_ratings.append(gpt_4_elo_ratings)
|
181 |
|
182 |
results = [
|
|
|
212 |
|
213 |
BOOTSTRAP_ROUNDS = 1000
|
214 |
if "bootstrap_elo_lu" not in globals():
|
215 |
+
bootstrap_elo_lu = get_bootstrap_result(
|
216 |
+
game_switch, compute_elo, BOOTSTRAP_ROUNDS
|
217 |
+
)
|
218 |
|
219 |
plot_3 = visualize_bootstrap_scores(bootstrap_elo_lu, PLOT_3_TITLE)
|
220 |
|