natolambert committed
Commit 7eaa6d2 (parent: fc699be)
Files changed (2)
  1. app.py +29 -12
  2. src/utils.py +3 -3
app.py CHANGED
@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
    3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
    4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-   5. Prior Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
+   5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
    """
    new_df = dataframe_core.copy()
    dataframe_prefs = dataframe_prefs.copy()
@@ -52,7 +52,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    subset_cols = [col for col in new_df.columns if col in sub_subsets]
    sub_data = new_df[subset_cols].values # take the relevant column values
    sub_counts = [example_counts[s] for s in sub_subsets] # take the example counts
-   new_df[subset] = np.round(np.average(sub_data, axis=1, weights=sub_counts), 1) # take the weighted average
+   new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts) # take the weighted average
    # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)

    data_cols = list(subset_mapping.keys())
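Note: each section score above is an example-count weighted mean of its subset columns; the change only drops the intermediate rounding. A minimal standalone sketch of the pattern, with hypothetical subset names and counts (the real example_counts mapping is defined elsewhere in the app):

```python
import numpy as np

# Hypothetical counts: weighting by example count makes a 447-prompt subset
# pull the section score harder than a 164-prompt one.
example_counts = {"hep-python": 164, "math-prm": 447}
sub_subsets = ["hep-python", "math-prm"]

sub_data = np.array([[95.1, 70.2]])  # one model's scores for the two subsets
sub_counts = [example_counts[s] for s in sub_subsets]

# Weighted mean: (95.1 * 164 + 70.2 * 447) / (164 + 447)
section_score = np.average(sub_data, axis=1, weights=sub_counts)
print(section_score)  # ~[76.88]
```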
@@ -65,25 +65,30 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
    pref_data = dataframe_prefs[pref_columns].values

    # add column test sets knowing the rows are not identical, take superset
-   dataframe_prefs["Prior Sets"] = np.round(np.nanmean(pref_data, axis=1), 1)
+   dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)

    # add column Test Sets empty to new_df
-   new_df["Prior Sets"] = np.nan
-   # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets"]
+   new_df["Prior Sets (0.5 weight)"] = np.nan
+   # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets (0.5 weight)"]
    values = []
    for i, row in new_df.iterrows():
        model = row["model"]
        if model in dataframe_prefs["model"].values:
-           values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0])
-           # new_df.at[i, "Prior Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0]
+           values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0])
+           # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
        else:
            values.append(np.nan)

-   new_df["Prior Sets"] = values
+   new_df["Prior Sets (0.5 weight)"] = values

    # add total average
-   data_cols += ["Prior Sets"]
-   new_df["average"] = np.round(np.nanmean(new_df[data_cols].values, axis=1), 1)
+   data_cols += ["Prior Sets (0.5 weight)"]
+   final_data = new_df[data_cols].values
+   masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
+   weights = [2, 2, 2, 2, 1]
+   average = np.ma.average(masked_data, axis=1, weights=weights)
+   new_df["average"] = average.filled(np.nan)
+   # new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)

    # make average third column
    keep_columns = ["model", "model_type", "average"] + data_cols
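Note: the new overall average gives Prior Sets (0.5 weight) half the weight of each core section and, via the masked array, still yields a score for models with no Prior Sets results (plain np.average would propagate NaN for those rows). A small self-contained sketch of the same pattern, with made-up scores:

```python
import numpy as np

# Two hypothetical models; the last column stands in for
# "Prior Sets (0.5 weight)" and is missing for the second model.
scores = np.array([
    [90.0, 80.0, 70.0, 60.0, 50.0],
    [85.0, 75.0, 65.0, 55.0, np.nan],
])

masked = np.ma.masked_array(scores, np.isnan(scores))
weights = [2, 2, 2, 2, 1]  # core sections count double, Prior Sets half

# np.ma.average renormalizes the weights over the unmasked entries, so the
# second row is averaged over its four present scores with weights [2, 2, 2, 2].
average = np.ma.average(masked, axis=1, weights=weights).filled(np.nan)
print(average)  # ~[72.22, 70.0]
```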
@@ -121,7 +126,7 @@ def length_bias_check(dataframe):
 
    # take average of new_data and add to new_df (removing other columns than model)
    for subset in final_subsets:
-       new_df[subset] = np.round(np.nanmean(new_data[subset], axis=0), 2)
+       new_df[subset] = np.nanmean(new_data[subset], axis=0)
    keep_columns = ["model"] + final_subsets
    new_df = new_df[keep_columns]
    # recompute average
@@ -148,7 +153,9 @@ def prep_df(df):
 
    # add count column to all dataframes
    rewardbench_data = prep_df(rewardbench_data)
-   rewardbench_data_avg = prep_df(rewardbench_data_avg)
+   rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
+   # adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
+
    rewardbench_data_length = prep_df(rewardbench_data_length)
    prefs_data = prep_df(prefs_data)
@@ -201,6 +208,16 @@ def regex_table(dataframe, regex, filter_button):
 
    # replace column '' with count/rank
    data[''] = np.arange(1, 1 + len(data))
+
+   # if Score exists, round to 2 decimals
+   if "Score" in data.columns:
+       data["Score"] = data["Score"].round(2)
+   if "Average" in data.columns:
+       data["Average"] = data["Average"].round(1)
+   # round all others to 1 decimal
+   for col in data.columns:
+       if col not in ["", "Model", "Model Type", "Score", "Average"]:
+           data[col] = data[col].round(1)
    return data
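Note: with rounding removed from every aggregation step, regex_table rounds once at render time: Score to 2 decimals, other numeric columns to 1, and the rank/model columns untouched. Roughly, on a hypothetical slice of the leaderboard dataframe:

```python
import numpy as np
import pandas as pd

# Hypothetical rendered slice; "" is the rank column added just above.
data = pd.DataFrame({
    "": [1, 2],
    "Model": ["rm-a", "rm-b"],
    "Score": [72.2222, np.nan],
    "Safety": [88.8888, 91.0],
})

if "Score" in data.columns:
    data["Score"] = data["Score"].round(2)   # -> 72.22, NaN
for col in data.columns:
    if col not in ["", "Model", "Model Type", "Score", "Average"]:
        data[col] = data[col].round(1)       # -> 88.9, 91.0
```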
 
src/utils.py CHANGED
@@ -96,9 +96,9 @@ def load_all_data(data_repo, subdir:str, subsubsets=False): # use HF api to p
    df = df.drop(columns=["pku_safer"])
    cols.remove("pku_safer")
 
-   # round
-   df[cols] = (df[cols]*100).round(1)
-   avg = np.nanmean(df[cols].values,axis=1).round(1)
+   # convert to score
+   df[cols] = (df[cols]*100)
+   avg = np.nanmean(df[cols].values,axis=1)
    # add average column
    df["average"] = avg
 
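Note: the utils change is the other half of the same cleanup: subset accuracies are scaled to a 0-100 score at load time but no longer rounded, so full precision flows through the weighted averages and rounding happens only at display time in regex_table. A sketch of the intent, with made-up column names and accuracies:

```python
import numpy as np
import pandas as pd

# Hypothetical per-subset accuracies in [0, 1], as loaded from the results repo.
df = pd.DataFrame({"model": ["rm-a"], "subset_a": [0.98765], "subset_b": [0.91234]})
cols = ["subset_a", "subset_b"]

df[cols] = df[cols] * 100                            # convert to score, keep precision
df["average"] = np.nanmean(df[cols].values, axis=1)

print(df["average"].values)    # [94.9995] -- full precision for later weighting
print(df["average"].round(1))  # 95.0      -- rounded only when displayed
```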