natolambert committed
Commit 9f4ce43
1 Parent(s): bb95637

major update

Files changed (5)
  1. app.py +16 -12
  2. src/constants.py +3 -2
  3. src/logo.png +0 -0
  4. src/md.py +14 -5
  5. src/utils.py +3 -0
app.py CHANGED
@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
     3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
     4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5. Classic Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
+    5. Prior Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
     """
     new_df = dataframe_core.copy()
     dataframe_prefs = dataframe_prefs.copy()
@@ -61,28 +61,28 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]

     # selected average from pref_sets
-    pref_columns = ["anthropic_helpful", "anthropic_hhh", "mtbench_human", "shp", "summarize"]
+    pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
     pref_data = dataframe_prefs[pref_columns].values

     # add column test sets knowing the rows are not identical, take superset
-    dataframe_prefs["Classic Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)
+    dataframe_prefs["Prior Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

     # add column Test Sets empty to new_df
-    new_df["Classic Sets"] = np.nan
-    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Classic Sets"]
+    new_df["Prior Sets"] = np.nan
+    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets"]
     values = []
     for i, row in new_df.iterrows():
         model = row["model"]
         if model in dataframe_prefs["model"].values:
-            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0])
-            # new_df.at[i, "Classic Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0]
+            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0])
+            # new_df.at[i, "Prior Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0]
         else:
             values.append(np.nan)

-    new_df["Classic Sets"] = values
+    new_df["Prior Sets"] = values

     # add total average
-    data_cols += ["Classic Sets"]
+    data_cols += ["Prior Sets"]
     new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

     # make average third column
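The merge in this hunk is worth a compact illustration. Below is a minimal, self-contained sketch (toy model names and scores, not leaderboard data) of how the Prior Sets column is built with a NaN-aware mean and then aligned to the core table; the `map` call is equivalent to the diff's row loop when model names are unique:

```python
import numpy as np
import pandas as pd

# toy frames standing in for dataframe_core / dataframe_prefs
core = pd.DataFrame({"model": ["rm-a", "rm-b"], "Chat": [90.1, 85.3]})
prefs = pd.DataFrame({
    "model": ["rm-a"],
    "anthropic_helpful": [68.2], "anthropic_hhh": [71.0],
    "shp": [62.5], "summarize": [70.4],
})

pref_columns = ["anthropic_helpful", "anthropic_hhh", "shp", "summarize"]
# nanmean ignores missing subset scores instead of propagating NaN
prefs["Prior Sets"] = np.round(np.nanmean(prefs[pref_columns].values, axis=1), 2)

# align by model name; models absent from prefs get NaN, as in the loop above
core["Prior Sets"] = core["model"].map(prefs.set_index("model")["Prior Sets"])
print(core)  # rm-b gets NaN in Prior Sets, which the overall nanmean then skips
```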
@@ -197,7 +197,11 @@ def regex_table(dataframe, regex, filter_button):
     if "Custom Classifiers" not in filter_button:
         dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
     # Filter the dataframe such that 'model' contains any of the regex patterns
-    return dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+    data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
+
+    # replace column '' with count/rank
+    data[''] = np.arange(1, 1 + len(data))
+    return data


 with gr.Blocks(css=custom_css) as app:
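Since the rank column is the behavioral change here, a small sketch (hypothetical frame and search terms) shows why the rank is recomputed after filtering rather than carried over:

```python
import numpy as np
import pandas as pd

df = pd.DataFrame({"": [1, 2, 3],
                   "Model": ["berkeley/starling", "allenai/tulu", "openbmb/ultrarm"]})

# comma-delimited search terms are OR-ed into one regex, as in the search box
combined_regex = "|".join(["starling", "ultra"])

data = df[df["Model"].str.contains(combined_regex, case=False, na=False)].copy()
# renumber 1..N so ranks stay contiguous in the filtered view
data[""] = np.arange(1, 1 + len(data))
print(data)
```

Note the `.copy()` in the sketch: assigning a column on a filtered slice without it triggers pandas' SettingWithCopyWarning, which the version in the diff would emit.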
@@ -280,7 +284,7 @@ with gr.Blocks(css=custom_css) as app:
             # elem_id="rewardbench_dataframe_length",
             # height=1000,
             # )
-        with gr.TabItem("Classic Sets"):
+        with gr.TabItem("Prior Test Sets"):
             with gr.Row():
                 search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                 model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
@@ -291,7 +295,7 @@ with gr.Blocks(css=custom_css) as app:
                 )
             with gr.Row():
                 PREF_SET_TEXT = """
-For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Harmless, Anthropic HHH, MTBench Human, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard.
+For more information, see the [dataset](https://huggingface.co/datasets/allenai/pref-test-sets). Only the subsets Anthropic Helpful, Anthropic HHH, Stanford SHP, and OpenAI's Summarize data are used in the leaderboard ranking.
 """
                 gr.Markdown(PREF_SET_TEXT)
             with gr.Row():
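For readers unfamiliar with the Blocks API, here is a hypothetical, stripped-down version of how a tab like this wires its search box and checkbox group back into `regex_table`; the widget names, toy frame, and lambda plumbing are illustrative, not the app's actual code:

```python
import gradio as gr
import numpy as np
import pandas as pd

df = pd.DataFrame({"": [1, 2], "Model": ["rm-a", "rm-b"],
                   "Model Type": ["DPO", "Seq. Classifier"]})

def regex_table(dataframe, regex, filter_button):
    # checkbox filtering elided for brevity; see the real regex_table above
    combined = "|".join(p.strip() for p in regex.split(",")) if regex else ""
    data = dataframe[dataframe["Model"].str.contains(combined, case=False, na=False)].copy()
    data[""] = np.arange(1, 1 + len(data))
    return data

with gr.Blocks() as demo:
    with gr.TabItem("Prior Test Sets"):
        search = gr.Textbox(show_label=False, placeholder="Model Search (delimit with , )")
        types = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers"],
                                 label="Model Types")
        table = gr.Dataframe(df)
        # re-run the filter whenever either control changes
        search.change(lambda r, t: regex_table(df, r, t), [search, types], table)
        types.change(lambda r, t: regex_table(df, r, t), [search, types], table)
```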
 
src/constants.py CHANGED
@@ -15,6 +15,7 @@ length_categories = {
     'llmbar-adver-manual': 'False',
     'llmbar-adver-neighbor': 'False',
     'llmbar-natural': 'Neutral',
+    'math-prm': 'Neutral',
     'mt-bench-easy': 'False',
     'mt-bench-hard': 'False',
     'mt-bench-med': 'Neutral',
@@ -31,7 +32,7 @@ example_counts = {
     "mt-bench-easy": 28,
     "mt-bench-med": 40,
     "mt-bench-hard": 37,
-    # "math-prm": 984, # actual length 447, upweighting to be equal to code
+    "math-prm": 984, # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
     "llmbar-natural": 100,
@@ -54,6 +55,6 @@ subset_mapping = {
     "Chat": ["alpacaeval-easy", "alpacaeval-length", "alpacaeval-hard", "mt-bench-easy", "mt-bench-med"],
     "Chat Hard": ["mt-bench-hard", "llmbar-natural", "llmbar-adver-neighbor", "llmbar-adver-GPTInst", "llmbar-adver-GPTOut", "llmbar-adver-manual"],
     "Safety": ["refusals-dangerous", "refusals-offensive", "xstest-should-refuse", "xstest-should-respond", "donotanswer"],
-    "Reasoning": [#"math-prm",
+    "Reasoning": ["math-prm",
                   "hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust"]
 }
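Un-commenting math-prm changes the section averages because the leaderboard weights subsets by prompt count ("per prompt weighting" in src/md.py). A sketch under that assumption: each HumanEvalPack subset has 164 prompts, so counting math-prm as 984 (6 × 164) makes math and code contribute equally to Reasoning:

```python
example_counts = {
    "math-prm": 984,  # actual length 447, upweighted to match the code subsets
    "hep-cpp": 164, "hep-go": 164, "hep-java": 164,
    "hep-js": 164, "hep-python": 164, "hep-rust": 164,
}
reasoning = ["math-prm", "hep-cpp", "hep-go", "hep-java",
             "hep-js", "hep-python", "hep-rust"]

def section_score(accs: dict, subsets: list) -> float:
    """Mean of subset accuracies weighted by example counts."""
    total = sum(example_counts[s] for s in subsets)
    return sum(accs[s] * example_counts[s] for s in subsets) / total

# a model at 0.50 on math and 0.80 on every code subset lands exactly halfway
accs = {"math-prm": 0.50, **{s: 0.80 for s in reasoning[1:]}}
print(round(section_score(accs, reasoning), 2))  # 0.65
```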
 
src/logo.png CHANGED
src/md.py CHANGED
@@ -9,7 +9,7 @@ We average over 4 core sections (per prompt weighting):
 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
 4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-5. **Classic Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))
+5. **Prior Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))

 We include multiple types of reward models in this evaluation:
 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
@@ -20,9 +20,18 @@ We include multiple types of reward models in this evaluation:
 All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge**, are coming soon.

+### Model Types
+
+Currently, we evaluate the following model types:
+1. **Sequence Classifiers**: A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
+2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
+3. **DPO**: Models trained with Direct Preference Optimization (DPO) with a reference model being either the base or supervised fine-tuning checkpoint.
+
+Support of DPO models without a reference model is coming soon.
+
 ### Subset Details

-Total number of the prompts is: 2538, filtered from 4676.
+Total number of the prompts is: 2985, filtered from 5123.

 | Subset | Num. Samples (Pre-filtering, post-filtering) | Description |
 | :---------- | :-----: | :---------: |
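The DPO entry deserves one line of math: a DPO-trained model has no reward head, so (per the DPO paper) its implicit reward is β·[log π(y|x) − log π_ref(y|x)], and a preference is scored by comparing that quantity for the chosen and rejected responses. A toy sketch with placeholder log-probabilities:

```python
def dpo_reward(logp_policy: float, logp_ref: float, beta: float = 0.1) -> float:
    # implicit reward of a DPO model relative to its reference checkpoint
    return beta * (logp_policy - logp_ref)

# placeholder sequence log-probs for one chosen/rejected pair
chosen = dpo_reward(logp_policy=-12.3, logp_ref=-15.0)
rejected = dpo_reward(logp_policy=-14.1, logp_ref=-14.0)
print(chosen > rejected)  # True counts as a correct preference
```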
@@ -77,11 +86,11 @@ Lengths (mean, std. dev.) include the prompt
 | xstest-should-refuse | False | 584 (419) | 904 (493) | 129 (89) | 217 (115) | 81 (47) | 116 (53) |
 | xstest-should-respond | True | 771 (420) | 466 (427) | 189 (105) | 107 (94) | 104 (48) | 67 (48) |

-For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-collab/rm-benchmark-dev).
+For more details, see the [dataset](https://huggingface.co/datasets/allenai/reward-bench).
 """

 TOP_TEXT = """
-# RewardBench: Benchmarking Reward Models
+# RewardBench: Evaluating Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
-[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Classic Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
+[Code](https://github.com/allenai/reward-bench) | [Eval. Dataset](https://huggingface.co/datasets/allenai/reward-bench) | [Classic Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/allenai/reward-bench-results) | Paper (coming soon)
 """
 
src/utils.py CHANGED
@@ -116,4 +116,7 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
     cols.insert(1, cols.pop(cols.index('model_type')))
     df = df.loc[:, cols]

+    # remove models with DPO Ref. Free as type (future work)
+    df = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
+
     return df
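One detail worth flagging in this new filter is `na=False`. A minimal sketch (toy frame, not leaderboard data) of why it matters: without it, `str.contains` returns NaN for missing model types and the boolean mask then fails at indexing time.

```python
import pandas as pd

df = pd.DataFrame({"model": ["a", "b", "c"],
                   "model_type": ["DPO", "DPO Ref. Free", None]})

# na=False treats a missing model_type as "no match", so those rows survive ~mask
kept = df[~df["model_type"].str.contains("DPO Ref. Free", na=False)]
print(kept["model"].tolist())  # ['a', 'c']
```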