natolambert committed • 6fda62c
1 Parent(s): 054ed2d
up
app.py
CHANGED
@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
     3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
     4. Code: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5.
+    5. Classic Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
     """
     new_df = dataframe_core.copy()
     dataframe_prefs = dataframe_prefs.copy()
@@ -61,28 +61,28 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     new_df = new_df[keep_columns]

     # selected average from pref_sets
-    pref_columns = ["anthropic_helpful", "
+    pref_columns = ["anthropic_helpful", "anthropic_hhh", "mtbench_human", "shp", "summarize"]
     pref_data = dataframe_prefs[pref_columns].values

     # add column test sets knowing the rows are not identical, take superset
-    dataframe_prefs["
+    dataframe_prefs["Classic Sets"] = np.round(np.nanmean(pref_data, axis=1), 2)

     # add column Test Sets empty to new_df
-    new_df["
-    # per row in new_df if model is in dataframe_prefs, add the value to new_df["
+    new_df["Classic Sets"] = np.nan
+    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Classic Sets"]
     values = []
     for i, row in new_df.iterrows():
         model = row["model"]
         if model in dataframe_prefs["model"].values:
-            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["
-            # new_df.at[i, "
+            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0])
+            # new_df.at[i, "Classic Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Classic Sets"].values[0]
         else:
             values.append(np.nan)

-    new_df["
+    new_df["Classic Sets"] = values

     # add total average
-    data_cols += ["
+    data_cols += ["Classic Sets"]
     new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 2)

     # make average third column
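The second hunk fills the new "Classic Sets" column by looping over `new_df` with `iterrows()` and looking each model up in `dataframe_prefs`. As a minimal sketch of the same per-model alignment using `Series.map` (the model names and scores below are invented for illustration; the real frames carry many more columns):

```python
import numpy as np
import pandas as pd

# Toy stand-ins for dataframe_core / dataframe_prefs.
new_df = pd.DataFrame({"model": ["rm-a", "rm-b", "rm-c"]})
dataframe_prefs = pd.DataFrame({
    "model": ["rm-a", "rm-b"],
    "anthropic_helpful": [0.71, 0.64],
    "anthropic_hhh": [0.80, 0.75],
    "mtbench_human": [0.66, 0.61],
    "shp": [0.69, 0.58],
    "summarize": [0.72, 0.67],
})

pref_columns = ["anthropic_helpful", "anthropic_hhh", "mtbench_human", "shp", "summarize"]
# Mean over the classic test sets, ignoring missing entries, as in the diff above.
dataframe_prefs["Classic Sets"] = np.round(
    np.nanmean(dataframe_prefs[pref_columns].values, axis=1), 2
)

# Equivalent to the iterrows() loop: align on "model", leaving NaN where a model
# has no classic-set results.
new_df["Classic Sets"] = new_df["model"].map(
    dataframe_prefs.set_index("model")["Classic Sets"]
)
print(new_df)
```

Either way, models without classic-set results keep NaN, which `np.nanmean` then skips when the overall average is computed.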
@@ -280,7 +280,7 @@ with gr.Blocks(css=custom_css) as app:
             # elem_id="rewardbench_dataframe_length",
             # height=1000,
             # )
-        with gr.TabItem("
+        with gr.TabItem("Classic Sets"):
             with gr.Row():
                 search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
                 model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
@@ -318,7 +318,8 @@ with gr.Blocks(css=custom_css) as app:
         with gr.TabItem("Dataset Viewer"):
             with gr.Row():
                 # loads one sample
-                gr.Markdown("## Random Dataset Sample Viewer
+                gr.Markdown("""## Random Dataset Sample Viewer
+                Warning, refusals, XSTest, and donotanswer datasets have sensitive content.""")
                 subset_selector = gr.Dropdown(subsets, label="Subset", value=None, multiselect=True)
                 button = gr.Button("Show Random Sample")

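The UI hunks retitle an existing leaderboard tab to "Classic Sets" and add a content warning to the dataset viewer. For reference, a stripped-down sketch of the tab pattern those hunks touch (the `filter_table` callback and the demo dataframe are placeholders, not the app's actual wiring):

```python
import gradio as gr
import pandas as pd

# Placeholder leaderboard; the real app builds this from the evaluation results.
df = pd.DataFrame({"model": ["rm-a", "rm-b", "rm-c"], "Classic Sets": [0.71, 0.64, 0.59]})

def filter_table(query):
    # Comma-delimited substring search over model names, as in the search box.
    terms = [t.strip() for t in query.split(",") if t.strip()]
    if not terms:
        return df
    return df[df["model"].apply(lambda m: any(t in m for t in terms))]

with gr.Blocks() as demo:
    with gr.Tabs():
        with gr.TabItem("Classic Sets"):
            with gr.Row():
                search = gr.Textbox(show_label=False, placeholder="Model Search (delimit with , )")
                model_types = gr.CheckboxGroup(
                    ["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
                    label="Model Types",  # shown but not wired in this sketch
                )
            table = gr.Dataframe(value=df)
            search.change(filter_table, inputs=search, outputs=table)

if __name__ == "__main__":
    demo.launch()
```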
src/md.py
CHANGED
@@ -9,7 +9,7 @@ We average over 4 core sections (per prompt weighting):
 2. **Chat Hard**: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
 3. **Safety**: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
 4. **Code**: Includes the code subsets (hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-5. **
+5. **Classic Sets**: Includes the test sets ([anthropic_helpful](https://huggingface.co/datasets/Anthropic/hh-rlhf), [anthropic_hhh](https://huggingface.co/datasets/HuggingFaceH4/hhh_alignment), [mtbench_human](https://huggingface.co/datasets/lmsys/mt_bench_human_judgments), [shp](https://huggingface.co/datasets/stanfordnlp/SHP), [summarize](https://huggingface.co/datasets/openai/summarize_from_feedback))

 We include multiple types of reward models in this evaluation:
 1. **Sequence Classifiers** (Seq. Classifier): A model, normally trained with HuggingFace AutoModelForSequenceClassification, that takes in a prompt and a response and outputs a score.
@@ -82,5 +82,5 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
 TOP_TEXT = """
 # RewardBench: Benchmarking Reward Models
 ### Evaluating the capabilities, safety, and pitfalls of reward models
-[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [
+[Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Classic Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
 """
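The strings edited in src/md.py are what the Space renders at the top of the leaderboard and in its about text. A minimal sketch of that usage (the import path and surrounding layout are assumptions, not shown in this diff):

```python
import gradio as gr

# Assumed import path; this commit only edits the string contents.
from src.md import TOP_TEXT

with gr.Blocks() as demo:
    # Renders the title, tagline, and the link row that now includes "Classic Test Sets".
    gr.Markdown(TOP_TEXT)

if __name__ == "__main__":
    demo.launch()
```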