natolambert committed on
Commit
1d33a30
•
1 Parent(s): 9af70d6

add generative default off

Browse files
Files changed (3) hide show
  1. app.py +7 -3
  2. src/md.py +1 -0
  3. src/utils.py +5 -1
app.py CHANGED
@@ -203,6 +203,8 @@ def regex_table(dataframe, regex, filter_button):
203
  dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
204
  if "Custom Classifiers" not in filter_button:
205
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
 
 
206
  # Filter the dataframe such that 'model' contains any of the regex patterns
207
  data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
208
 
@@ -217,6 +219,8 @@ def regex_table(dataframe, regex, filter_button):
217
  # round all others to 1 decimal
218
  for col in data.columns:
219
  if col not in ["", "Model", "Model Type", "Score", "Average"]:
 
 
220
  data[col] = np.round(np.array(data[col].values).astype(float), 1)
221
  return data
222
 
@@ -242,7 +246,7 @@ with gr.Blocks(css=custom_css) as app:
242
  search_1 = gr.Textbox(label="Model Search (delimit with , )",
243
  placeholder="Model Search (delimit with , )",
244
  show_label=False)
245
- model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
246
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
247
  label="Model Types",
248
  show_label=False,
@@ -267,7 +271,7 @@ with gr.Blocks(css=custom_css) as app:
267
  with gr.TabItem("🔍 RewardBench - Detailed"):
268
  with gr.Row():
269
  search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
270
- model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
271
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
272
  label="Model Types",
273
  show_label=False,
@@ -307,7 +311,7 @@ with gr.Blocks(css=custom_css) as app:
307
  with gr.TabItem("Prior Test Sets"):
308
  with gr.Row():
309
  search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
310
- model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "AI2 Experiments"],
311
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
312
  label="Model Types",
313
  show_label=False,
 
203
  dataframe = dataframe[~dataframe["Model Type"].str.contains("DPO", case=False, na=False)]
204
  if "Custom Classifiers" not in filter_button:
205
  dataframe = dataframe[~dataframe["Model Type"].str.contains("Custom Classifier", case=False, na=False)]
206
+ if "Generative" not in filter_button:
207
+ dataframe = dataframe[~dataframe["Model Type"].str.contains("generative", case=False, na=False)]
208
  # Filter the dataframe such that 'model' contains any of the regex patterns
209
  data = dataframe[dataframe["Model"].str.contains(combined_regex, case=False, na=False)]
210
 
 
219
  # round all others to 1 decimal
220
  for col in data.columns:
221
  if col not in ["", "Model", "Model Type", "Score", "Average"]:
222
+ # replace any data[col].values == '' with np.NaN
223
+ data[col] = data[col].replace('', np.NaN)
224
  data[col] = np.round(np.array(data[col].values).astype(float), 1)
225
  return data
226
 
 
246
  search_1 = gr.Textbox(label="Model Search (delimit with , )",
247
  placeholder="Model Search (delimit with , )",
248
  show_label=False)
249
+ model_types_1 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
250
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
251
  label="Model Types",
252
  show_label=False,
 
271
  with gr.TabItem("🔍 RewardBench - Detailed"):
272
  with gr.Row():
273
  search_2 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
274
+ model_types_2 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
275
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
276
  label="Model Types",
277
  show_label=False,
 
311
  with gr.TabItem("Prior Test Sets"):
312
  with gr.Row():
313
  search_3 = gr.Textbox(label="Model Search (delimit with , )", show_label=False, placeholder="Model Search (delimit with , )")
314
+ model_types_3 = gr.CheckboxGroup(["Seq. Classifiers", "DPO", "Custom Classifiers", "Generative", "AI2 Experiments"],
315
  value=["Seq. Classifiers", "DPO", "Custom Classifiers"],
316
  label="Model Types",
317
  show_label=False,
src/md.py CHANGED
@@ -22,6 +22,7 @@ We include multiple types of reward models in this evaluation:
22
  2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
23
  3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
24
  4. **Random**: Random choice baseline.
 
25
 
26
  All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
27
  Others, such as **Generative Judge**, are coming soon.
 
22
  2. **Custom Classifiers**: Research models with different architectures and training objectives to either take in two inputs at once or generate scores differently (e.g. PairRM and Stanford SteamSHP).
23
  3. **DPO**: Models trained with Direct Preference Optimization (DPO), with modifiers such as `-ref-free` or `-norm` changing how scores are computed.
24
  4. **Random**: Random choice baseline.
25
+ 5. **Generative**: Prompting fine-tuned models to choose between two answers, similar to MT Bench and AlpacaEval.
26
 
27
  All models are evaluated in fp16 except for Starling-7B, which is evaluated in fp32.
28
  Others, such as **Generative Judge**, are coming soon.
src/utils.py CHANGED
@@ -9,8 +9,12 @@ import re
9
  def model_hyperlink(link, model_name):
10
  if model_name == "random":
11
  return "random"
12
- if model_name == "Cohere March 2024":
13
  return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
 
 
 
 
14
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
15
 
16
  def undo_hyperlink(html_string):
 
9
  def model_hyperlink(link, model_name):
10
  if model_name == "random":
11
  return "random"
12
+ elif model_name == "Cohere March 2024":
13
  return f'<a target="_blank" href="https://huggingface.co/Cohere" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
14
+ elif "openai" == model_name.split("/")[0]:
15
+ return f'<a target="_blank" href="https://huggingface.co/openai" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
16
+ elif "Anthropic" == model_name.split("/")[0]:
17
+ return f'<a target="_blank" href="https://huggingface.co/Anthropic" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
18
  return f'<a target="_blank" href="{link}" style="color: var(--link-text-color); text-decoration: underline;text-decoration-style: dotted;">{model_name}</a>'
19
 
20
  def undo_hyperlink(html_string):