natolambert committed
Commit bbe05a0 • 1 Parent(s): c8a4819

Files changed (3):
  1. app.py +21 -4
  2. src/css.py +22 -0
  3. src/md.py +3 -6
app.py CHANGED
@@ -7,6 +7,7 @@ from src.utils import load_all_data
 from src.md import ABOUT_TEXT, TOP_TEXT
 from src.plt import plot_avg_correlation
 from src.constants import subset_mapping, length_categories, example_counts
+from src.css import custom_css
 import numpy as np
 
 api = HfApi()
@@ -185,18 +186,18 @@ def regex_table(dataframe, regex, filter_button):
     return dataframe[dataframe["model"].str.contains(combined_regex, case=False, na=False)]
 
 
-with gr.Blocks() as app:
+with gr.Blocks(css=custom_css) as app:
     # create tabs for the app, moving the current table to one titled "rewardbench" and the benchmark_text to a tab called "About"
     with gr.Row():
-        with gr.Column(scale=3):
-            gr.Markdown(TOP_TEXT)
-        with gr.Column(scale=2.2):
+        with gr.Column(scale=1.65):
             # search = gr.Textbox(label="Model Search (delimit with , )", placeholder="Regex search for a model")
             # filter_button = gr.Checkbox(label="Include AI2 training runs (or type ai2 above).", interactive=True)
             # img = gr.Image(value="https://private-user-images.githubusercontent.com/10695622/310698241-24ed272a-0844-451f-b414-fde57478703e.png", width=500)
             gr.Markdown("""
![](file/src/logo.png)
             """)
+        with gr.Column(scale=3):
+            gr.Markdown(TOP_TEXT)
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
         with gr.TabItem("🏆 RewardBench Leaderboard"):
             with gr.Row():
@@ -321,6 +322,22 @@ with gr.Blocks() as app:
     model_types_2.change(regex_table, inputs=[rewardbench_table_detailed_hidden, search_2, model_types_2], outputs=rewardbench_table_detailed)
     model_types_3.change(regex_table, inputs=[pref_sets_table_hidden, search_3, model_types_3], outputs=pref_sets_table)
 
+    with gr.Row():
+        with gr.Accordion("📚 Citation", open=False):
+            citation_button = gr.Textbox(
+                value=r"""
+@misc{RewardBench,
+    title={RewardBench: Benchmarking Reward Models},
+    author={Lambert, Nathan and Pyatkin, Valentina and Morrison, Jacob and Miranda, LJ and Lin, Bill Yuchen and Chandu, Khyathi and Dziri, Nouha and Kumar, Sachin and Zick, Tom and Choi, Yejin and Smith, Noah A. and Hajishirzi, Hannaneh},
+    year={2024},
+    howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}}
+}
+""",
+                height=15,
+                label="Copy the following to cite these results.",
+                elem_id="citation-button",
+                show_copy_button=True,
+            )
 # Load data when app starts, TODO make this used somewhere...
 # def load_data_on_start():
 #     data_rewardbench = load_all_data(repo_dir_rewardbench)
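For readers unfamiliar with Gradio, here is a minimal, self-contained sketch of the layout pattern this commit introduces: a CSS string passed to `gr.Blocks(css=...)`, a two-column header row (logo beside title text), and a collapsible citation box with a copy button. The names `demo` and `CITATION` and the markdown contents are illustrative placeholders, not code from this repo, and the sketch sizes the textbox with the long-standing `lines` parameter rather than the `height` kwarg used above.

```python
# Sketch only: assumes a recent gradio release is installed; `demo`,
# `CITATION`, and the markdown contents are placeholders, not repo code.
import gradio as gr

custom_css = """
.gradio-container { max-width: 95%; }
"""

CITATION = r"""@misc{RewardBench,
    title={RewardBench: Benchmarking Reward Models},
    year={2024},
    howpublished={\url{https://huggingface.co/spaces/allenai/reward-bench}}
}"""

with gr.Blocks(css=custom_css) as demo:
    with gr.Row():
        with gr.Column(scale=2):      # narrower column for the logo
            gr.Markdown("![](file/src/logo.png)")
        with gr.Column(scale=3):      # wider column for the title text
            gr.Markdown("# RewardBench: Benchmarking Reward Models")
    with gr.Row():
        with gr.Accordion("📚 Citation", open=False):
            gr.Textbox(
                value=CITATION,
                lines=6,              # `lines` sizes a Textbox portably
                label="Copy the following to cite these results.",
                show_copy_button=True,
            )

if __name__ == "__main__":
    demo.launch()
```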
src/css.py ADDED
@@ -0,0 +1,22 @@
+custom_css = """
+
+/* Full width space */
+.gradio-container {
+    max-width: 95%;
+}
+
+/* Text style and margins */
+.markdown-text {
+    font-size: 17px !important;
+}
+
+.tab-buttons button {
+    font-size: 20px;
+}
+
+h1 {
+    font-size: 32px !important;
+    margin-top: 0px !important;
+}
+
+"""
src/md.py CHANGED
@@ -16,6 +16,7 @@ We include multiple types of reward models in this evaluation:
 3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
 4. **Random**: Random choice baseline.
 
+All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 Others, such as **Generative Judge**, are coming soon.
 
 ### Subset Details
@@ -78,11 +79,7 @@ For more details, see the [dataset](https://huggingface.co/datasets/ai2-rlhf-col
 """
 
 TOP_TEXT = """
-# RewardBench from AI2
-
-Evaluating the capabilities, safety, and pitfalls of reward models.
-
+# RewardBench: Benchmarking Reward Models
+### Evaluating the capabilities, safety, and pitfalls of reward models
 [Code](https://github.com/allenai/herm) | [Eval. Dataset](https://huggingface.co/datasets/ai2-adapt-dev/rm-benchmark-dev) | [Existing Test Sets](https://huggingface.co/datasets/allenai/pref-test-sets) | [Results](https://huggingface.co/datasets/ai2-adapt-dev/HERM-Results) | Paper (coming soon)
-
-All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
 """