Joschka Strueber committed
Commit 0a42e99 · 1 Parent(s): 4077e51

[Add] add bbh and gpqa benchmarks again with correct answer_index selection

Files changed (4):
  1. app.py +1 -1
  2. src/app_util.py +2 -2
  3. src/dataloading.py +46 -16
  4. src/utils.py +14 -1
app.py CHANGED
@@ -21,7 +21,7 @@ metric_init = "CAPA"
 with gr.Blocks(title="LLM Similarity Analyzer", css=app_util.custom_css) as demo:
     gr.Markdown("# Model Similarity Comparison Tool")
     gr.Markdown(links_markdown)
-    gr.Markdown('This is a demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)."')
+    gr.Markdown('This is an interactive demo for the recent publication "[Great Models Think Alike and this Undermines AI Oversight](https://huggingface.co/papers/2502.04313)." You can compare the functional similarity of hundreds of Language Models on the Open LLM Leaderboard v2 benchmark datasets.')

     with gr.Row():
         dataset_dropdown = gr.Dropdown(
src/app_util.py CHANGED
@@ -90,9 +90,9 @@ def update_datasets_based_on_models(selected_models, current_dataset):

 custom_css = """
 .image-container img {
-    width: 90% !important; /* Make it 90% of the parent container */
+    width: 80% !important; /* Make it 80% of the parent container */
     height: auto !important; /* Maintain aspect ratio */
-    max-width: 1000px; /* Optional: Set a max limit */
+    max-width: 800px; /* Optional: Set a max limit */
     display: block;
     margin: auto; /* Center the image */
 }
src/dataloading.py CHANGED
@@ -4,6 +4,7 @@ from huggingface_hub import HfApi

 from functools import lru_cache

+from utils import opt_in_pars_to_index, get_test_target

 def get_leaderboard_models_reload():
     api = HfApi()
@@ -69,7 +70,7 @@ def get_leaderboard_models_cached():

 def get_leaderboard_datasets(model_ids):
     if model_ids is None:
-        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'ifeval', 'math_algebra_hard', 'math_counting_and_prob_hard', 'math_geometry_hard', 'math_intermediate_algebra_hard', 'math_num_theory_hard', 'math_prealgebra_hard', 'math_precalculus_hard', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']
+        return ['bbh_boolean_expressions', 'bbh_causal_judgement', 'bbh_date_understanding', 'bbh_disambiguation_qa', 'bbh_formal_fallacies', 'bbh_geometric_shapes', 'bbh_hyperbaton', 'bbh_logical_deduction_five_objects', 'bbh_logical_deduction_seven_objects', 'bbh_logical_deduction_three_objects', 'bbh_movie_recommendation', 'bbh_navigate', 'bbh_object_counting', 'bbh_penguins_in_a_table', 'bbh_reasoning_about_colored_objects', 'bbh_ruin_names', 'bbh_salient_translation_error_detection', 'bbh_snarks', 'bbh_sports_understanding', 'bbh_temporal_sequences', 'bbh_tracking_shuffled_objects_five_objects', 'bbh_tracking_shuffled_objects_seven_objects', 'bbh_tracking_shuffled_objects_three_objects', 'bbh_web_of_lies', 'gpqa_diamond', 'gpqa_extended', 'gpqa_main', 'mmlu_pro', 'musr_murder_mysteries', 'musr_object_placements', 'musr_team_allocation']

     # Map each model to its corresponding leaderboard version
     leaderboard_model_ids = [f"open-llm-leaderboard/{model_id.replace('/', '__')}-details" for model_id in model_ids]
@@ -87,7 +88,7 @@ def get_leaderboard_datasets(model_ids):
     common_datasets = set.intersection(*model_datasets.values())

     # Filter datasets that are not MCQ or currently do not work
-    ignore = ["bbh_", "gpqa_", "math_", "ifeval"]
+    ignore = ["math_", "ifeval"]
     discard = []
     for dataset in common_datasets:
         for ignore_data in ignore:
@@ -98,22 +99,56 @@ def get_leaderboard_datasets(model_ids):
     return sorted(common_datasets)


-def filter_labels(doc):
-    labels = []
-    if "answer_index" in doc[0].keys():
-        for d in doc:
-            labels.append(d["answer_index"])
-    else:
-        for d in doc:
-            if d["target"] == "False":
-                labels.append(0)
-            elif d["target"] == "True":
-                labels.append(1)
-            else:
-                raise ValueError("Invalid label")
+def filter_labels(dataset_name, doc):
+    labels = []
+    test_target, target_key = get_test_target(doc[0])
+    if "answer_index" in doc[0].keys():
+        for d in doc:
+            labels.append(d["answer_index"])
+    elif test_target.startswith("("):
+        for d in doc:
+            labels.append(opt_in_pars_to_index(d[target_key]))
+    elif dataset_name in ["bbh_boolean_expressions"]:
+        for d in doc:
+            if d[target_key] == "True":
+                labels.append(1)
+            elif d[target_key] == "False":
+                labels.append(0)
+    elif dataset_name in ["bbh_causal_judgement", "bbh_navigate", "bbh_web_of_lies"]:
+        for d in doc:
+            if d[target_key] == "Yes":
+                labels.append(0)
+            elif d[target_key] == "No":
+                labels.append(1)
+    elif dataset_name in ["bbh_formal_fallacies"]:
+        for d in doc:
+            if d[target_key] == "valid":
+                labels.append(0)
+            elif d[target_key] == "invalid":
+                labels.append(1)
+    elif dataset_name in ["bbh_sports_understanding"]:
+        for d in doc:
+            if d[target_key] == "yes":
+                labels.append(0)
+            elif d[target_key] == "no":
+                labels.append(1)
+    elif test_target.isdigit():
+        for d in doc:
+            labels.append(int(d[target_key]))
+
     return labels


+def filter_responses(data):
+    # Get log probabilities for each response
+    log_probs = []
+
+    for resp in data["filtered_resps"]:
+        log_prob = np.array([float(option[0]) for option in resp])
+        log_probs.append(log_prob)
+
+    return log_probs
+

 def load_run_data(model_name, dataset_name):
     try:
@@ -125,14 +160,9 @@ def load_run_data(model_name, dataset_name):
         data = data.sort("doc_id")
         data = data.to_dict()

-        # Get log probabilities for each response
-        log_probs = []
-        for resp in data["filtered_resps"]:
-            log_prob = np.array([float(option[0]) for option in resp])
-            log_probs.append(log_prob)
-
-        # Get ground truth labels
-        labels = filter_labels(data["doc"])
+        # Get ground truth labels and logits
+        log_probs = filter_responses(data)
+        labels = filter_labels(dataset_name, data["doc"])

     except Exception as e:
         print(e)
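
A minimal sketch of how the new label and response handling is expected to behave; the flat import path, the doc dictionaries, and the filtered_resps entry below are hypothetical stand-ins for the Open LLM Leaderboard details format, not taken from the actual datasets:

    from dataloading import filter_labels, filter_responses  # assumed import path

    # Hypothetical GPQA-style docs: parenthesized option letters as targets
    print(filter_labels("gpqa_main", [{"target": "(A)"}, {"target": "(C)"}]))  # [0, 2]

    # Hypothetical BBH boolean-expressions docs: "True"/"False" targets
    print(filter_labels("bbh_boolean_expressions", [{"target": "False"}, {"target": "True"}]))  # [0, 1]

    # Hypothetical run data: one (log prob, is_greedy) pair per answer option
    data = {"filtered_resps": [[["-0.2", "True"], ["-1.7", "False"]]]}
    print(filter_responses(data))  # [array([-0.2, -1.7])]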
src/utils.py CHANGED
@@ -8,4 +8,17 @@ def softmax(logits: np.ndarray) -> np.ndarray:
 def one_hot(probs: np.array) -> np.array:
     one_hot = np.zeros_like(probs)
     one_hot[np.argmax(probs)] = 1
-    return one_hot
+    return one_hot
+
+def opt_in_pars_to_index(s):
+    if s.startswith("(") and s.endswith(")"):
+        letter = s[1]  # Extract the letter inside the parentheses
+        return ord(letter) - ord("A")  # Convert to zero-based index
+    else:
+        raise ValueError("Invalid format")
+
+def get_test_target(doc):
+    if "target" in doc:
+        return doc["target"], "target"
+    elif "answer" in doc:
+        return doc["answer"], "answer"
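
For reference, a rough usage sketch of the two new utility functions (the example inputs are hypothetical, and the flat `utils` import path is assumed from the dataloading.py change above):

    from utils import opt_in_pars_to_index, get_test_target

    # Parenthesized option letters map to zero-based answer indices
    opt_in_pars_to_index("(A)")  # -> 0
    opt_in_pars_to_index("(D)")  # -> 3
    opt_in_pars_to_index("D")    # -> raises ValueError("Invalid format")

    # get_test_target returns whichever ground-truth field the doc provides, plus its key
    get_test_target({"target": "(B)"})    # -> ("(B)", "target")
    get_test_target({"answer": "valid"})  # -> ("valid", "answer")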