natolambert committed on
Commit 7e0e569
1 Parent(s): 62907d5

update to reasoning

Files changed (3)
  1. src/constants.py +2 -1
  2. src/md.py +2 -1
  3. src/utils.py +7 -0
src/constants.py CHANGED
@@ -31,6 +31,7 @@ example_counts = {
     "mt-bench-easy": 28,
     "mt-bench-med": 40,
     "mt-bench-hard": 37,
+    "math-prm": 984, # actual length 447, upweighting to be equal to code
     "refusals-dangerous": 100,
     "refusals-offensive": 100,
     "llmbar-natural": 100,
@@ -53,5 +54,5 @@ subset_mapping = {
     "Chat": ["alpacaeval-easy", "alpacaeval-length", "alpacaeval-hard", "mt-bench-easy", "mt-bench-med"],
     "Chat Hard": ["mt-bench-hard", "llmbar-natural", "llmbar-adver-neighbor", "llmbar-adver-GPTInst", "llmbar-adver-GPTOut", "llmbar-adver-manual"],
     "Safety": ["refusals-dangerous", "refusals-offensive", "xstest-should-refuse", "xstest-should-respond", "donotanswer"],
-    "Code": ["hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust"]
+    "Reasoning": ["math-prm", "hep-cpp", "hep-go", "hep-java", "hep-js", "hep-python", "hep-rust"]
 }
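For context on the new `math-prm` count, here is a minimal sketch (not code from this commit) of how a per-prompt weighted section score could be computed from `example_counts` and `subset_mapping`; the `section_score` helper and the accuracy values are hypothetical, but it shows why upweighting `math-prm` to 984 (= 6 × 164) makes the math subset carry the same weight as the six code subsets combined:

```python
# Sketch only: per-prompt weighted averaging over the Reasoning subsets.
# example_counts / subset_mapping mirror src/constants.py; section_score
# and the accuracy numbers below are illustrative, not from the repo.
example_counts = {
    "math-prm": 984,  # actual length 447, upweighted to match 6 * 164
    "hep-cpp": 164, "hep-go": 164, "hep-java": 164,
    "hep-js": 164, "hep-python": 164, "hep-rust": 164,
}
subset_mapping = {
    "Reasoning": ["math-prm", "hep-cpp", "hep-go", "hep-java",
                  "hep-js", "hep-python", "hep-rust"],
}

def section_score(section: str, accuracies: dict) -> float:
    """Average subset accuracies weighted by example_counts (per prompt)."""
    subsets = subset_mapping[section]
    total = sum(example_counts[s] for s in subsets)
    return sum(example_counts[s] * accuracies[s] for s in subsets) / total

# Perfect math, 50% on every code subset -> 0.75: math and code contribute
# equally because 984 == 6 * 164.
accs = {"math-prm": 1.0, **{s: 0.5 for s in subset_mapping["Reasoning"][1:]}}
print(section_score("Reasoning", accs))  # 0.75
```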
src/md.py CHANGED
@@ -8,7 +8,7 @@ We average over 4 core sections (per prompt weighting):
 1. **Chat**: Includes the easy chat subsets (alpacaeval-easy, alpacaeval-length, alpacaeval-hard, mt-bench-easy, mt-bench-medium)
 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
-4. **Code**: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
+4. **Reasoning**: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
 5. **Classic Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))

 We include multiple types of reward models in this evaluation:
@@ -42,6 +42,7 @@ Total number of the prompts is: 2538, filtered from 4676.
 | xstest-should-refuse | 450, 250 | False response dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
 | xstest-should-respond | 450, 154 | False refusal dataset (see [paper](https://arxiv.org/abs/2308.01263)) |
 | do not answer | 939, 136 | [Prompts which responsible LLMs do not answer](https://huggingface.co/datasets/LibrAI/do-not-answer) |
+| math-prm | 447 | Human references vs. model error from OpenAI's Let's Verify Step by Step |
 | hep-cpp | 164 | C++ code revisions (See [dataset](https://huggingface.co/datasets/bigcode/humanevalpack) or [paper](https://arxiv.org/abs/2308.07124)) |
 | hep-go | 164 | Go code |
 | hep-java | 164 | Java code |
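The list above says the four core sections are averaged with per-prompt weighting; below is a hedged sketch of the top-level aggregation, assuming the headline score is the plain mean of the four per-prompt-weighted section scores (the section values are placeholders, and the exact aggregation is an assumption rather than something stated in this diff):

```python
# Hypothetical per-section accuracies for one reward model (placeholders).
section_scores = {"Chat": 0.95, "Chat Hard": 0.60, "Safety": 0.85, "Reasoning": 0.75}

# Assumed aggregation: unweighted mean over the 4 core sections; each section
# score is itself a per-prompt weighted average over its subsets (see above).
core_score = sum(section_scores.values()) / len(section_scores)
print(round(core_score, 4))  # 0.7875
```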
src/utils.py CHANGED
@@ -88,6 +88,13 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
     if "summarize_prompted" in cols:
         df = df.drop(columns=["summarize_prompted"])
         cols.remove("summarize_prompted")
+    # remove pku_better and pku_safer (removed from the leaderboard)
+    if "pku_better" in cols:
+        df = df.drop(columns=["pku_better"])
+        cols.remove("pku_better")
+    if "pku_safer" in cols:
+        df = df.drop(columns=["pku_safer"])
+        cols.remove("pku_safer")

     # round
     df[cols] = df[cols].round(2)
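The new `pku_better`/`pku_safer` handling repeats the existing `summarize_prompted` pattern; one possible refactor (not part of this commit) would drop all deprecated columns in a single loop. `DEPRECATED_COLS` and `drop_removed_subsets` are hypothetical names:

```python
import pandas as pd

# Hypothetical refactor: subsets that have been removed from the leaderboard.
DEPRECATED_COLS = ["summarize_prompted", "pku_better", "pku_safer"]

def drop_removed_subsets(df: pd.DataFrame, cols: list) -> pd.DataFrame:
    """Drop deprecated subset columns and keep the score-column list in sync,
    mirroring the repeated per-column checks in load_all_data."""
    for col in DEPRECATED_COLS:
        if col in cols:
            df = df.drop(columns=[col])
            cols.remove(col)
    return df
```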