Joschka Strueber committed
Commit 0a42e99 · Parent(s): 4077e51
[Add] add bbh and gpqa benchmarks again with correct answer_index selection
Browse files
- app.py +1 -1
- src/app_util.py +2 -2
- src/dataloading.py +46 -16
- src/utils.py +14 -1
app.py
CHANGED
@@ -21,7 +21,7 @@ metric_init = "CAPA"
 with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
     gr.Markdown("# Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
-    gr.Markdown('This is
+    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')
 
     with gr.Row():
         dataset_dropdown = gr.Dropdown(
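For context, here is a minimal standalone sketch of the Gradio pattern this hunk touches: a Blocks layout with a Markdown description placed above a dataset dropdown. The description text, dropdown choices, and launch call are illustrative stand-ins, not the Space's actual app.py.

```python
import gradio as gr

# Illustrative sketch of the layout touched by the app.py hunk above;
# not the Space's actual code.
demo_description = (
    "This is an interactive demo for the paper 'Great Models Think Alike and "
    "this Undermines AI Oversight'. You can compare the functional similarity "
    "of Language Models on Open LLM Leaderboard v2 benchmark datasets."
)

with gr.Blocks(title="LLM Similarity Analyzer") as demo:
    gr.Markdown("# Model Similarity Comparison Tool")
    gr.Markdown(demo_description)

    with gr.Row():
        dataset_dropdown = gr.Dropdown(
            choices=["bbh_navigate", "gpqa_main", "mmlu_pro"],  # illustrative subset
            value="mmlu_pro",
            label="Select a benchmark dataset",
        )

if __name__ == "__main__":
    demo.launch()
```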
src/app_util.py
CHANGED
@@ -90,9 +90,9 @@ def update_datasets_based_on_models(selected_models, current_dataset):
 
 custom_css = """
 .image-container img {
-    width:
+    width: 80% !important; /* Make it 80% of the parent container */
     height: auto !important; /* Maintain aspect ratio */
-    max-width:
+    max-width: 800px; /* Optional: Set a max limit */
     display: block;
     margin: auto; /* Center the image */
 }
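A CSS string like this is applied by passing it to gr.Blocks(css=...) and tagging the target component with the matching class. The sketch below shows that wiring under the assumption that the similarity heatmap is rendered by a component carrying the image-container class; it is not the repo's actual app code.

```python
import gradio as gr

# Assumed wiring for the custom_css edited above; illustrative only.
custom_css = """
.image-container img {
    width: 80% !important;   /* 80% of the parent container */
    height: auto !important; /* maintain aspect ratio */
    max-width: 800px;        /* optional upper bound */
    display: block;
    margin: auto;            /* center the image */
}
"""

with gr.Blocks(css=custom_css) as demo:
    # elem_classes tags the component so the .image-container rule applies to it
    gr.Image(value=None, label="Similarity heatmap", elem_classes=["image-container"])

if __name__ == "__main__":
    demo.launch()
```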
src/dataloading.py
CHANGED
@@ -4,6 +4,7 @@ from huggingface_hub import HfApi
 
 from functools import lru_cache
 
+from utils import opt_in_pars_to_index, get_test_target
 
 def get_leaderboard_models_reload():
     api = HfApi()
@@ -69,7 +70,7 @@ def get_leaderboard_models_cached():
 
 def get_leaderboard_datasets(model_ids):
     if model_ids is None:
-        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', '
+        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']
 
     # Map each model to its corresponding leaderboard version
     leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]
@@ -87,7 +88,7 @@ def get_leaderboard_datasets(model_ids):
     common_datasets = set.intersection(*model_datasets.values())
 
     # Filter datasets that are not MCQ or currently do not work
-    ignore = ["
+    ignore = ["math_", "ifeval"]
     discard = []
     for dataset in common_datasets:
         for ignore_data in ignore:
@@ -98,22 +99,56 @@ def get_leaderboard_datasets(model_ids):
     return sorted(common_datasets)
 
 
-def filter_labels(doc):
+def filter_labels(dataset_name, doc):
     labels = []
+    test_target, target_key = get_test_target(doc[0])
     if "answer_index" in doc[0].keys():
         for d in doc:
             labels.append(d["answer_index"])
-
+    elif test_target.starts_with("("):
         for d in doc:
-
+            labels.append(opt_in_pars_to_index(d[target_key]))
+    elif dataset_name in ["bbh_boolean_expression"]:
+        for d in doc:
+            if d[target_key] == "True":
+                labels.append(1)
+            elif d[target_key] == "False":
                 labels.append(0)
-
+    elif dataset_name in ["bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"]:
+        for d in doc:
+            if d[target_key] == "Yes":
+                labels.append(0)
+            elif d[target_key] == "No":
                 labels.append(1)
-
-
+    elif dataset_name in ["bbh_formal_fallacies"]:
+        for d in doc:
+            if d[target_key] == "valid":
+                labels.append(0)
+            elif d[target_key] == "invalid":
+                labels.append(1)
+    elif dataset_name in ["bbh_sports_understanding"]:
+        for d in doc:
+            if d[target_key] == "yes":
+                labels.append(0)
+            elif d[target_key] == "no":
+                labels.append(1)
+    elif test_target.is_digit():
+        for d in doc:
+            labels.append(int(d[target_key]))
+
     return labels
 
 
+def filter_responses(data):
+    # Get log probabilities for each response
+    log_probs = []
+
+    for resp in data["filtered_resps"]:
+        log_prob = np.array([float(option[0]) for option in resp])
+        log_probs.append(log_prob)
+
+    return log_probs
+
 
 def load_run_data(model_name, dataset_name):
     try:
@@ -125,14 +160,9 @@ def load_run_data(model_name, dataset_name):
         data = data.sort("doc_id")
         data = data.to_dict()
 
-        # Get
-        log_probs =
-
-            log_prob = np.array([float(option[0]) for option in resp])
-            log_probs.append(log_prob)
-
-        # Get ground truth labels
-        labels = filter_labels(data["doc"])
+        # Get ground truth labels and logits
+        log_probs = filter_responses(dataset_name, data)
+        labels = filter_labels(dataset_name, data["doc"])
 
     except Exception as e:
         print(e)
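As a sanity check on the new answer_index selection, here is a small standalone sketch of the same mapping: use answer_index when the doc provides it, convert "(A)"-style targets via the parentheses helper, map Yes/No and True/False targets per dataset, and fall back to numeric targets. The sample docs and the label_for helper are invented for illustration (they are not the Space's code), and the sketch uses Python's built-in str.startswith and str.isdigit.

```python
# Standalone sketch of the label selection added above; sample docs are invented.

def opt_in_pars_to_index(s: str) -> int:
    """Map an '(A)'-style option string to a zero-based index."""
    if s.startswith("(") and s.endswith(")"):
        return ord(s[1]) - ord("A")
    raise ValueError("Invalid format")

def label_for(dataset_name: str, doc: dict) -> int:
    """Resolve one doc's gold label to an option index (hypothetical helper)."""
    if "answer_index" in doc:                      # index stored directly (e.g. MMLU-Pro)
        return doc["answer_index"]
    target = doc.get("target", doc.get("answer"))
    if target.startswith("("):                     # '(B)' -> 1, as in GPQA / most BBH subtasks
        return opt_in_pars_to_index(target)
    if dataset_name == "bbh_boolean_expressions":  # True/False targets
        return 1 if target == "True" else 0
    if dataset_name in ("bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"):
        return 0 if target == "Yes" else 1         # Yes/No targets
    if target.isdigit():                           # numeric targets, e.g. object counting
        return int(target)
    raise ValueError(f"Unhandled target {target!r} for {dataset_name}")

print(label_for("gpqa_main", {"target": "(C)"}))          # -> 2
print(label_for("bbh_navigate", {"target": "No"}))        # -> 1
print(label_for("mmlu_pro", {"answer_index": 4}))         # -> 4
print(label_for("bbh_object_counting", {"target": "7"}))  # -> 7
```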
src/utils.py
CHANGED
@@ -8,4 +8,17 @@ def softmax(logits: np.ndarray) -> np.ndarray:
 def one_hot(probs: np.array) -> np.array:
     one_hot = np.zeros_like(probs)
     one_hot[np.argmax(probs)] = 1
-    return one_hot
+    return one_hot
+
+def opt_in_pars_to_index(s):
+    if s.startswith("(") and s.endswith(")"):
+        letter = s[1]  # Extract the letter inside the parentheses
+        return ord(letter) - ord("A")  # Convert to zero-based index
+    else:
+        raise ValueError("Invalid format")
+
+def get_test_target(doc):
+    if "target" in doc:
+        return doc["target"], "target"
+    elif "answer" in doc:
+        return doc["answer"], "answer"
|