Merge pull request #64 from yeonir/yeonirhee/sotopia-task
Files changed:
- sotopia_space/benchmark.py  +6 -39
- sotopia_space/chat.py  +0 -2
- sotopia_space/constants.py  +4 -22
- sotopia_space/utils.py  +6 -190
sotopia_space/benchmark.py
CHANGED
@@ -1,42 +1,13 @@
 import gradio as gr # type: ignore
 import pandas as pd
 from sotopia_space.constants import MODEL_OPTIONS
-from sotopia_space.utils import
+from sotopia_space.utils import post_processing

 LP_MODE = "v2"
 original_df, ablation_df = None, None
 LP_original_dfs = {}
 DEFAULT_LP = 0.5

-available_models = [] # to be filled in later
-original_df, ablation_df = None, None
-
-def slider_change_main(length_penalty):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    adjusted_df = adjusted_df[["Model", "Overall Elo", "Task-Avg Elo", "# battles", "Length"]]
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    # adjusted_df = add_winrates(adjusted_df, LP=length_penalty)
-    # adjusted_df = adjusted_df.drop(columns=["Length"])
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
-def slider_change_full(length_penalty, show_winrate):
-    global original_df, ablation_df, LP_MODE
-    adjusted_df = apply_length_penalty(original_df, ablation_df, length_penalty, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
-    # sort the model by the "Task-Avg Elo" column
-    adjusted_df = adjusted_df.sort_values(by="Overall Elo", ascending=False)
-    adjusted_df.drop(columns=["Overall Elo", "Task-Avg Elo", "# battles", "Length"], inplace=True)
-    if show_winrate == "none":
-        adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-        return adjusted_df
-    elif show_winrate == "gpt-3.5":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-3.5", LP=length_penalty)
-    elif show_winrate == "gpt-4":
-        adjusted_df = add_winrates_tasks(adjusted_df, ref="gpt-4", LP=length_penalty)
-    adjusted_df.insert(0, "Rank", range(1, 1 + len(adjusted_df)))
-    return adjusted_df
-
 def benchmark_table():
     global original_df, ablation_df
     global LP_original_dfs, LP_MODE
@@ -44,18 +15,15 @@ def benchmark_table():
     gr.Markdown(f"**Version**: sotopia (v1.01; 2024.04.22) | **# Examples**: 7200 | **# Models**: {len(MODEL_OPTIONS)} | **# Comparisons**: x", elem_classes="markdown-text")

     with gr.TabItem("Vs GPT-3.5", elem_id="od-benchmark-tab-table-ablation", id=0, elem_classes="subtab"):
-
-        original_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
-        default_main_df = apply_length_penalty(original_df, ablation_df, length_penalty=DEFAULT_LP, mode=LP_MODE, LP_original_dfs=LP_original_dfs)
+        default_main_df = pd.read_json('data_dir/models_vs_gpt35.jsonl', lines=True)
         default_main_df = default_main_df.sort_values(by="GOAL [0, 10]", ascending=False)
+        default_main_df = post_processing(default_main_df, None)
         # add a Rank column to the first columnn (starting from 1)
         default_main_df.insert(0, "Rank", range(1, 1 + len(default_main_df)))
+
         with gr.Row():
             with gr.Column(scale=4):
-                gr.Markdown("
-            with gr.Column(scale=1):
-                length_penlty_slider = gr.Slider(minimum=0.1, maximum=1, step=0.1, value=DEFAULT_LP, label="Length Penalty", elem_id="length-penalty-slider")
-                # checkbox_skip_empty = gr.Checkbox(label="Skip empty results", value=False, elem_id="skip-empty-checkbox", scale=2)
+                gr.Markdown("<h3>**Vs GPT3.5**: The interlocutors are compared against GPT-3.5, the baseline model.")
         TYPES = ["number", "markdown", "number"]
         leaderboard_table = gr.components.Dataframe(
             value=default_main_df,
@@ -66,5 +34,4 @@ def benchmark_table():
             interactive=False,
             visible=True,
             min_width=60,
-            )
-        #length_penlty_slider.change(fn=slider_change_main, inputs=[length_penlty_slider], outputs=[leaderboard_table])
+        )
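
For review, the new table-building path in benchmark_table() can be exercised outside Gradio with the short script below. This is only a sketch: it assumes the bundled data_dir/models_vs_gpt35.jsonl exposes the "model_name" and "GOAL [0, 10]" columns that the diff relies on, and it simply repeats the calls shown above.

import pandas as pd
from sotopia_space.utils import post_processing

# Mirrors the new code path: load the per-model scores, sort by goal score,
# then let post_processing render model names and round numeric columns.
df = pd.read_json("data_dir/models_vs_gpt35.jsonl", lines=True)
df = df.sort_values(by="GOAL [0, 10]", ascending=False)
df = post_processing(df, None)  # no model_len_info, so no "Length" column is added
df.insert(0, "Rank", range(1, 1 + len(df)))
print(df.head())
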
sotopia_space/chat.py
CHANGED
@@ -91,8 +91,6 @@ def chat_introduction():
     🗄️ **Disclaimer**: User prompts and generated replies from the model may be collected solely for the purpose of pure academic research. By using this demo, users implicitly agree to these terms.
     """
     )
-    # with gr.Column(scale=1):
-    #     toggle_dark = gr.Button(value="Toggle Dark")

 def create_user_agent_dropdown(environment_id):
     _, environment_dict, agent_dict, relationship_dict = get_sotopia_profiles()
sotopia_space/constants.py
CHANGED
@@ -14,26 +14,8 @@ MODEL_OPTIONS = [
 ]

 MODEL_INFO = {
-    "
-    "
-    "Llama-2
-    "
-    "Mistral-7B-Instruct-v0.1": {"pretty_name": "Mistral-7B-Instruct", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.1"},
-    "Mistral-7B-Instruct-v0.2": {"pretty_name": "Mistral-7B-Instruct (v0.2)", "hf_model_id": "mistralai/Mistral-7B-Instruct-v0.2"},
-    "Mixtral-8x7B-Instruct-v0.1": {"pretty_name": "Mixtral-8x7B-Instruct", "hf_model_id": "mistralai/Mixtral-8x7B-Instruct-v0.1"},
-    "Nous-Hermes-2-Mixtral-8x7B-DPO": {"pretty_name": "Nous-Hermes-2-Mixtral-8x7B-DPO", "hf_model_id": "NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO"},
-    "Yi-34B-Chat": {"pretty_name": "Yi-34B-Chat", "hf_model_id": "01-ai/Yi-34B"},
-    "gemini-1.0-pro": {"pretty_name": "gemini-1.0-pro", "hf_model_id": "https://blog.google/technology/ai/google-gemini-ai/"},
-    "gemma-7b-it": {"pretty_name": "Gemma-7B-it", "hf_model_id": "google/gemma-7b"},
-    "gpt-3.5-turbo-0125": {"pretty_name": "gpt-3.5-turbo-0125", "hf_model_id": "https://platform.openai.com/"},
-    "gpt-4-0125-preview": {"pretty_name": "gpt-4-0125-preview", "hf_model_id": "https://platform.openai.com/"},
-    "tulu-2-dpo-70b": {"pretty_name": "Tulu-2-dpo-70b", "hf_model_id": "cmu-lti/tulu-2-dpo-70b"},
-    "vicuna-13b-v1.5": {"pretty_name": "Vicuna-13b-v1.5", "hf_model_id": "lmsys/vicuna-13b-v1.5"},
-    "zephyr-7b-beta": {"pretty_name": "Zephyr-7b-beta", "hf_model_id": "HuggingFaceH4/zephyr-7b-beta"},
-    "mistral-large-2402": {"pretty_name": "Mistral-Large", "hf_model_id": "https://mistral.ai/news/mistral-large/"},
-    "claude-3-opus-20240229": {"pretty_name": "Claude 3 Opus", "hf_model_id": "https://www.anthropic.com/claude"},
-    "claude-3-sonnet-20240229": {"pretty_name": "Claude 3 Sonnet", "hf_model_id": "https://www.anthropic.com/claude"},
-    "zephyr-7b-gemma-v0.1": {"pretty_name": "Zephyr-7b-Gemma", "hf_model_id": "HuggingFaceH4/zephyr-7b-gemma-v0.1"},
-    "Starling-LM-7B-beta": {"pretty_name": "StarlingLM-7B-beta", "hf_model_id": "Nexusflow/Starling-LM-7B-beta"},
-    "dbrx-instruct": {"pretty_name": "DBRX Instruct", "hf_model_id": "databricks/dbrx-instruct"}
+    "GPT-4": {"pretty_name": "GPT-4", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
+    "GPT-3.5": {"pretty_name": "GPT-3.5", "hf_model_id": "https://openai.com/blog/new-models-and-developer-products-announced-at-devday"},
+    "Llama-2": {"pretty_name": "Llama-2", "hf_model_id": "https://llama.meta.com/llama2/"},
+    "MPT": {"pretty_name": "MPT", "hf_model_id": "https://huggingface.co/docs/transformers/main/en/model_doc/mpt"}
 }
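
The four remaining MODEL_INFO entries feed the clickable model cells in the leaderboard. The body of make_clickable_model is not shown in this diff, so the helper below is an assumption about how one of these entries could be turned into a markdown link; to_markdown_link is a hypothetical name, not part of the codebase.

from sotopia_space.constants import MODEL_INFO

def to_markdown_link(model_name: str) -> str:
    # Hypothetical illustration: build a markdown link from a MODEL_INFO entry,
    # falling back to the bare name (the fallback branch is visible in make_clickable_model).
    info = MODEL_INFO.get(model_name)
    if info is None:
        return model_name
    return f"[{info['pretty_name']}]({info['hf_model_id']})"

print(to_markdown_link("Llama-2"))           # [Llama-2](https://llama.meta.com/llama2/)
print(to_markdown_link("some-other-model"))  # some-other-model
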
sotopia_space/utils.py
CHANGED
@@ -1,17 +1,6 @@
-from
-import os
-import json
-from datasets import load_dataset
-from datasets.utils.logging import disable_progress_bar # type: ignore
-from ui_constants import column_names, all_task_types
-import random
-disable_progress_bar()
-import math
+from ui_constants import column_names
 from sotopia_space.constants import MODEL_INFO

-id_to_data = None
-model_len_info = None
-

 def make_clickable_model(model_name):
     global MODEL_INFO
@@ -25,199 +14,26 @@ def make_clickable_model(model_name):
     else:
         return model_name

-
-def styled_error(error):
-    return f"<p style='color: red; font-size: 20px; text-align: center;'>{error}</p>"
-
-def styled_warning(warn):
-    return f"<p style='color: orange; font-size: 20px; text-align: center;'>{warn}</p>"
-
-def styled_message(message):
-    return f"<p style='color: green; font-size: 20px; text-align: center;'>{message}</p>"
-
-
-def estimated_win_rate(elo_a, elo_b, LP=0):
-    """
-    Calculate the estimated win rate for player A against player B using their Elo ratings.
-    :param elo_a: Elo rating of player A
-    :param elo_b: Elo rating of player B
-    :return: Estimated win rate for player A
-    """
-    exponent = (elo_b - elo_a)*(10**LP) / 400
-    probability_a_wins = 1 / (1 + 10 ** exponent)
-    return (1-probability_a_wins)*100
-
-
-
 # Formats the columns
 def formatter(x):
     if type(x) is str:
         x = x
     else:
-        x = round(x,
+        x = round(x, 2)
     return x

-
-def add_winrates(current_df, LP=0):
-    df = current_df.copy()
-    elo_column = "Task-Avg Elo"
-
-    # Correct way to filter the DataFrame and get the Elo rating for "gpt-4-0125-preview"
-    model_a_elo = df[df["Model"].str.contains("gpt-4")][elo_column].iloc[0]
-
-    # Correct way to filter the DataFrame and get the Elo rating for "gpt-3.5-turbo-0125"
-    model_b_elo = df[df["Model"].str.contains("gpt-3.5")][elo_column].iloc[0]
-
-
-    # Calculate the win rate of "gpt-4-0125-preview" against all models
-    df['Win% vs GPT-4'] = df[elo_column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
-    df['Win% vs GPT-3.5T'] = df[elo_column].apply(lambda x: estimated_win_rate(model_b_elo, x, LP=LP)).apply(formatter)
-    # apply the formatter for the two new columns
-    cols = list(df.columns)
-    cols.remove("# battles"); cols.append("# battles")
-    cols.remove("Length"); cols.append("Length")
-    df = df[cols]
-    return df
-
-def add_winrates_tasks(current_df, ref="gpt-4", LP=0):
-    new_df = current_df.copy()
-    for t in all_task_types:
-        column = column_names[t]
-        model_a_elo = current_df[current_df["Model"].str.contains(ref)][column].iloc[0]
-        new_df[column] = current_df[column].apply(lambda x: estimated_win_rate(model_a_elo, x, LP=LP)).apply(formatter)
-    return new_df
-
-
 def post_processing(df, model_len_info):
     if model_len_info:
-        df["Length"] = df["
+        df["Length"] = df["model_name"].apply(lambda x: model_len_info[x]["avg_len"])

     for col in df.columns:
-        if col == "
+        if col == "model_name":
             df[col] = df[col].apply(lambda x: x.replace(x, make_clickable_model(x)))
         else:
             df[col] = df[col].apply(formatter) # For numerical values
     df.rename(columns=column_names, inplace=True)
-    df.sort_values(by="
+    df.sort_values(by="GOAL [0, 10]", inplace=True, ascending=False)
     # put the "Overall Elo" and "Task-Avg Elo" column to the front
     # add the length info
-    df = df[["
+    df = df[["model_name", "GOAL [0, 10]"] + [col for col in df.columns if col not in ["model_name", "GOAL [0, 10]"]]]
     return df
-
-def apply_length_penalty(original_df, ablation_df, length_penalty=0.2, mode='v1', LP_original_dfs=None):
-    """
-    Temporarily disable the length penalty feature
-    if mode == 'v2' and LP_original_dfs is not None:
-        L = f"{length_penalty:.1f}"
-        return LP_original_dfs[L]
-    original_df = original_df.copy()
-    ablation_df = ablation_df.copy()
-    # replace all values in original_df with the values as z = x - y * length_penalty where y is from ablation_df at the same row and column
-    # except for the "Model" column and the "# battles" column
-    # do not assume the order of the rows are the same in both dataframes
-    for i, row in original_df.iterrows():
-        for col in original_df.columns:
-            if col == "Model" or col == "# battles" or col == "Length":
-                continue
-            # assert that the model names are the same in both dataframes
-            assert original_df.at[i, "Model"] == ablation_df[ablation_df["Model"] == row["Model"]]["Model"].values[0]
-            original_df[col] = original_df[col].astype(float)
-            if mode == "v1":
-                original_df.at[i, col] = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0] * length_penalty
-            elif mode == "v1.1":
-                diff = original_df.at[i, col] - ablation_df[ablation_df["Model"] == row["Model"]][col].values[0]
-                original_df.at[i, col] = original_df.at[i, col] * (1-length_penalty) + diff*length_penalty
-    # post_processing
-    original_df = post_processing(original_df, model_len_info=None)
-    """
-    return original_df
-
-def load_benchdata():
-    print("Loading sotopia data...")
-    bench_data = load_dataset("cmu-lti/sotopia", split="test")
-    return bench_data
-
-def load_benchdata_dict():
-    print("Loading sotopia data....")
-    bench_data = load_dataset("cmu-lti/sotopia", data_files="sotopia_episodes_v1_hf.jsonl")['train']
-    id_to_data = {}
-    for item in bench_data:
-        id_to_data[item["session_id"]] = item
-    return id_to_data
-
-def load_eval_results():
-    print("Loading sotopia Evaluation data...")
-    eval_results = load_dataset("WildEval/sotopia-Evaluation", "all", split="train")
-    return eval_results
-
-def load_infer_results(model_name):
-    print(f"Loading sotopia Results for {model_name}...")
-    infer_results = load_dataset("WildEval/sotopia-Results", model_name, split="train")
-    return infer_results
-
-def sample_an_eval_result(eval_results, model_list=[], tag_list=[]):
-    global id_to_data
-    eval_results = list(eval_results)
-    random.shuffle(eval_results)
-    for eval_item in eval_results:
-        # print(json.dumps(eval_item, indent=2))
-        # print(f"## Session ID: {eval_item['session_id']}")
-        # eval_item["eval_id"]
-        assignment = eval_item['assignment']
-        model_1, model_2 = eval_item['model_1'], eval_item['model_2']
-        model_A = model_1 if assignment['A'] == model_1 else model_2
-        model_B = model_2 if assignment['B'] == model_2 else model_1
-        if len(model_list) >= 2:
-            if model_A not in model_list or model_B not in model_list:
-                continue
-        elif len(model_list) == 1:
-            if model_A != model_list[0] and model_B != model_list[0]:
-                continue
-        else:
-            pass
-        if tag_list:
-            if set(tag_list).isdisjoint(set(eval_item['tags'])):
-                continue
-        winner = eval_item['winner']
-        # print(f"## Model A: {model_A} | Model B: {model_B} | Winner: {winner}")
-        task_type = eval_item['tags'][0] # primary task type
-        chat_history = eval_item['history']
-        last_query = eval_item['last_query']
-        # print(f"## Task Type: {task_type}")
-        # print(f"## Chat History: {chat_history}")
-        # print(f"## Last Query --> USER: {last_query}")
-
-        model_A_output = eval_item['model_1_output'] if model_1 == model_A else eval_item['model_2_output']
-        model_B_output = eval_item['model_2_output'] if model_2 == model_B else eval_item['model_1_output']
-
-        if len(model_A_output.strip()) == 0 or len(model_B_output.strip()) == 0:
-            continue
-
-        conversation_input = id_to_data[eval_item['session_id']]["conversation_input"]
-        # print(f"\n\n\n## Model A ({model_A}) Output ##\n{model_A_output}")
-        # print(f"\n\n\n## Model B ({model_B}) Output ##\n{model_B_output}")
-
-        # print(f"\n\n\n## Winner ##\n{winner}")
-        # print(f"\n\n\n## GPT-4 Judgement ##\n{eval_item['parsed_result']}")
-
-        result_dict = {
-            "session_id": eval_item['session_id'],
-            "model_A": model_A,
-            "model_B": model_B,
-            "winner": winner,
-            "intent": id_to_data[eval_item['session_id']]["intent"],
-            "task_type": task_type,
-            "all_tags": eval_item['tags'],
-            "chat_history": chat_history,
-            "last_query": last_query,
-            "conversation_input": conversation_input,
-            "model_A_output": model_A_output,
-            "model_B_output": model_B_output,
-            "reason": eval_item['parsed_result']["reason"],
-            "choice": eval_item['parsed_result']["choice"],
-            "checklist": id_to_data[eval_item['session_id']]["checklist"],
-        }
-        break
-    return result_dict
-
-#id_to_data = load_benchdata_dict()
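
Among the removed helpers is estimated_win_rate, the Elo-based estimate that fed the old add_winrates columns. For reference while reviewing the removal, the snippet below restates it verbatim with two worked values; as the old callers used it, the returned percentage is the expected win rate of the model whose Elo is passed second.

def estimated_win_rate(elo_a, elo_b, LP=0):
    # Copied from the removed utils code: logistic Elo win-rate estimate,
    # with the exponent scaled by 10**LP (the length-penalty knob).
    exponent = (elo_b - elo_a) * (10 ** LP) / 400
    probability_a_wins = 1 / (1 + 10 ** exponent)
    return (1 - probability_a_wins) * 100

print(estimated_win_rate(1000, 1000))  # 50.0  -> equal ratings, even odds
print(estimated_win_rate(1400, 1000))  # ~9.09 -> a model rated 400 below wins about 9% of the time
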