natolambert committed 7eaa6d2 (parent: fc699be): "update"

Files changed:
- app.py (+29 −12)
- src/utils.py (+3 −3)
app.py
CHANGED
@@ -42,7 +42,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     2. Chat Hard: Includes the hard chat subsets (mt-bench-hard, llmbar-natural, llmbar-adver-neighbor, llmbar-adver-GPTInst, llmbar-adver-GPTOut, llmbar-adver-manual)
     3. Safety: Includes the safety subsets (refusals-dangerous, refusals-offensive, xstest-should-refuse, xstest-should-respond, do not answer)
     4. Reasoning: Includes the code and math subsets (math-prm, hep-cpp, hep-go, hep-java, hep-js, hep-python, hep-rust)
-    5. Prior Sets: Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
+    5. Prior Sets (0.5 weight): Includes the test sets (anthropic_helpful, mtbench_human, shp, summarize)
     """
     new_df = dataframe_core.copy()
     dataframe_prefs = dataframe_prefs.copy()
@@ -52,7 +52,7 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
         subset_cols = [col for col in new_df.columns if col in sub_subsets]
         sub_data = new_df[subset_cols].values  # take the relevant column values
         sub_counts = [example_counts[s] for s in sub_subsets]  # take the example counts
-        new_df[subset] = np.
+        new_df[subset] = np.average(sub_data, axis=1, weights=sub_counts)  # take the weighted average
         # new_df[subset] = np.round(np.nanmean(new_df[subset_cols].values, axis=1), 2)
 
     data_cols = list(subset_mapping.keys())
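The restored right-hand side weights each subset by its example count, so larger subsets dominate the section score. A minimal sketch of that behavior, with made-up counts and scores (not the real RewardBench numbers):

```python
import numpy as np
import pandas as pd

# Illustrative counts and scores only -- not the real RewardBench values.
example_counts = {"mt-bench-hard": 37, "llmbar-natural": 100}
sub_subsets = list(example_counts)
new_df = pd.DataFrame({"mt-bench-hard": [80.0, 60.0],
                       "llmbar-natural": [90.0, 70.0]})

sub_data = new_df[sub_subsets].values
sub_counts = [example_counts[s] for s in sub_subsets]
# Row-wise weighted mean: (80*37 + 90*100) / 137 ~= 87.3 for the first model.
print(np.average(sub_data, axis=1, weights=sub_counts))  # [87.3 67.3] approx.
```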
@@ -65,25 +65,30 @@ def avg_over_rewardbench(dataframe_core, dataframe_prefs):
     pref_data = dataframe_prefs[pref_columns].values
 
     # add column test sets knowing the rows are not identical, take superset
-    dataframe_prefs["Prior Sets"] = np.
+    dataframe_prefs["Prior Sets (0.5 weight)"] = np.nanmean(pref_data, axis=1)
 
     # add column Test Sets empty to new_df
-    new_df["Prior Sets"] = np.nan
-    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets"]
+    new_df["Prior Sets (0.5 weight)"] = np.nan
+    # per row in new_df if model is in dataframe_prefs, add the value to new_df["Prior Sets (0.5 weight)"]
     values = []
     for i, row in new_df.iterrows():
         model = row["model"]
         if model in dataframe_prefs["model"].values:
-            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0])
-            # new_df.at[i, "Prior Sets"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets"].values[0]
+            values.append(dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0])
+            # new_df.at[i, "Prior Sets (0.5 weight)"] = dataframe_prefs[dataframe_prefs["model"] == model]["Prior Sets (0.5 weight)"].values[0]
         else:
            values.append(np.nan)
 
-    new_df["Prior Sets"] = values
+    new_df["Prior Sets (0.5 weight)"] = values
 
     # add total average
-    data_cols += ["Prior Sets"]
-    new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)
+    data_cols += ["Prior Sets (0.5 weight)"]
+    final_data = new_df[data_cols].values
+    masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
+    weights = [2, 2, 2, 2, 1]
+    average = np.ma.average(masked_data, axis=1, weights=weights)
+    new_df["average"] = average.filled(np.nan)
+    # new_df["average"] = np.nanmean(new_df[data_cols].values, axis=1)
 
     # make average third column
     keep_columns = ["model", "model_type", "average"] + data_cols
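This is the substantive change in the commit: the overall average becomes a weighted mean with Prior Sets at half the weight of each core section ([2, 2, 2, 2, 1]), and the masked array lets models without Prior Sets results keep an average over the remaining columns. A small sketch of how the masking interacts with the weights (the rows are invented):

```python
import numpy as np

# Columns: Chat, Chat Hard, Safety, Reasoning, Prior Sets (0.5 weight).
final_data = np.array([
    [90.0, 60.0, 80.0, 70.0, 50.0],
    [90.0, 60.0, 80.0, 70.0, np.nan],  # no Prior Sets score for this model
])
masked_data = np.ma.masked_array(final_data, np.isnan(final_data))
weights = [2, 2, 2, 2, 1]
average = np.ma.average(masked_data, axis=1, weights=weights)
print(average.filled(np.nan))
# Row 1: (2*90 + 2*60 + 2*80 + 2*70 + 1*50) / 9 ~= 72.2
# Row 2: the masked NaN and its weight drop out -> 600 / 8 = 75.0
```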
@@ -121,7 +126,7 @@ def length_bias_check(dataframe):
 
     # take average of new_data and add to new_df (removing other columns than model)
     for subset in final_subsets:
-        new_df[subset] = np.
+        new_df[subset] = np.nanmean(new_data[subset], axis=0)
     keep_columns = ["model"] + final_subsets
     new_df = new_df[keep_columns]
     # recompute average
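Here new_data[subset] appears to hold per-example values, so the axis=0 mean collapses the examples into one score per column while skipping NaNs; a toy illustration under that assumption about the layout:

```python
import numpy as np

# Hypothetical layout: rows = examples, columns = models.
scores = np.array([[1.0, 0.0],
                   [1.0, np.nan]])  # second column missing one example
print(np.nanmean(scores, axis=0))  # [1.0, 0.0]
```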
@@ -148,7 +153,9 @@ def prep_df(df):
 
 # add count column to all dataframes
 rewardbench_data = prep_df(rewardbench_data)
-rewardbench_data_avg = prep_df(rewardbench_data_avg)
+rewardbench_data_avg = prep_df(rewardbench_data_avg).rename(columns={"Average": "Score"})
+# adjust weight of this average to 50% for Prior Sets (0.5 weight), 1 for others
+
 rewardbench_data_length = prep_df(rewardbench_data_length)
 prefs_data = prep_df(prefs_data)
@@ -201,6 +208,16 @@ def regex_table(dataframe, regex, filter_button):
 
     # replace column '' with count/rank
     data[''] = np.arange(1, 1 + len(data))
+
+    # if Score exists, round to 2 decimals
+    if "Score" in data.columns:
+        data["Score"] = data["Score"].round(2)
+    if "Average" in data.columns:
+        data["Average"] = data["Average"].round(1)
+    # round all others to 1 decimal
+    for col in data.columns:
+        if col not in ["", "Model", "Model Type", "Score", "Average"]:
+            data[col] = data[col].round(1)
     return data
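The new display logic rounds the headline Score to two decimals and every numeric per-subset column to one, leaving label columns untouched. Roughly, on invented data:

```python
import pandas as pd

# Invented leaderboard slice; column names mirror the checks above.
data = pd.DataFrame({"Model": ["example-rm"], "Score": [87.2993], "Chat Hard": [67.29]})
if "Score" in data.columns:
    data["Score"] = data["Score"].round(2)
for col in data.columns:
    if col not in ["", "Model", "Model Type", "Score", "Average"]:
        data[col] = data[col].round(1)
print(data)  # Score 87.3, Chat Hard 67.3
```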
src/utils.py
CHANGED
@@ -96,9 +96,9 @@ def load_all_data(data_repo, subdir:str, subsubsets=False):    # use HF api to p
     df = df.drop(columns=["pku_safer"])
     cols.remove("pku_safer")
 
-    #
-    df[cols] = (df[cols]*100)
-    avg = np.nanmean(df[cols].values,axis=1)
+    # convert to score
+    df[cols] = (df[cols]*100)
+    avg = np.nanmean(df[cols].values,axis=1)
     # add average column
     df["average"] = avg
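The change here only documents what the loader already did: raw per-subset accuracies in [0, 1] are rescaled to 0-100 scores before the unweighted average. A self-contained sketch with invented column names:

```python
import numpy as np
import pandas as pd

# Invented accuracy columns in [0, 1].
df = pd.DataFrame({"subset_a": [0.95], "subset_b": [0.90]})
cols = ["subset_a", "subset_b"]
df[cols] = df[cols] * 100                            # convert to score
df["average"] = np.nanmean(df[cols].values, axis=1)  # NaN-safe unweighted mean
print(df)  # subset_a 95.0, subset_b 90.0, average 92.5
```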