Spaces:
Running
Running
translation set updated
Browse files
app.py
CHANGED
@@ -15,7 +15,13 @@ Evaluating the chat, safety, reasoning, and translation capabilities of Multilin
|
|
15 |
|
16 |
π https://m-rewardbench.github.io/'''
|
17 |
|
18 |
-
GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/export?gid=0&format=csv"
|
|
|
|
|
|
|
|
|
|
|
|
|
19 |
# ABOUT_TEXT = """
|
20 |
# <h1>
|
21 |
# <span style="font-variant: small-caps;">M-RewardBench</span>: Evaluating Reward Models in Multilingual Settings
|
@@ -51,11 +57,41 @@ class AutoEvalColumn:
|
|
51 |
})
|
52 |
|
53 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
54 |
def get_result_data():
|
55 |
-
return pd.read_csv(
|
|
|
56 |
|
|
|
|
|
57 |
|
58 |
-
|
|
|
59 |
if dataframe is None or dataframe.empty:
|
60 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
61 |
|
@@ -63,18 +99,18 @@ def init_leaderboard(dataframe):
|
|
63 |
value=dataframe,
|
64 |
datatype=[
|
65 |
col["type"]
|
66 |
-
for col in
|
67 |
if isinstance(col, dict)
|
68 |
],
|
69 |
select_columns=SelectColumns(
|
70 |
default_selection=[
|
71 |
col["name"]
|
72 |
-
for col in
|
73 |
if isinstance(col, dict) and col["displayed_by_default"]
|
74 |
],
|
75 |
cant_deselect=[
|
76 |
col["name"]
|
77 |
-
for col in
|
78 |
if isinstance(col, dict) and col.get("never_hidden", False)
|
79 |
],
|
80 |
label="Select Columns to Display:",
|
@@ -99,7 +135,7 @@ emojis = "π’ π¬ π―"
|
|
99 |
model_types = {"Generative RM": "π¬", "DPO": "π―", "Sequence Classifier": "π’"}
|
100 |
|
101 |
from functools import partial
|
102 |
-
def format_with_color(val, min_val=50, max_val=100):
|
103 |
"""
|
104 |
Formats a value with inline green color gradient CSS.
|
105 |
Returns an HTML string with bold, black text and muted green background.
|
@@ -111,6 +147,7 @@ def format_with_color(val, min_val=50, max_val=100):
|
|
111 |
|
112 |
# Normalize value between 50 and 100 to 0-1 range
|
113 |
normalized = (val - min_val) / (max_val - min_val)
|
|
|
114 |
# Clamp value between 0 and 1
|
115 |
normalized = max(0, min(1, normalized))
|
116 |
|
@@ -119,7 +156,12 @@ def format_with_color(val, min_val=50, max_val=100):
|
|
119 |
intensity = int(50 + (150 * (1 - normalized)))
|
120 |
|
121 |
# Return HTML with inline CSS - bold black text
|
122 |
-
|
|
|
|
|
|
|
|
|
|
|
123 |
|
124 |
except (ValueError, TypeError):
|
125 |
return str(val)
|
@@ -131,12 +173,10 @@ with demo:
|
|
131 |
gr.Markdown(INTRODUCTION_TEXT)
|
132 |
|
133 |
with gr.Tabs() as tabs:
|
134 |
-
with gr.TabItem("π
|
135 |
df = get_result_data()
|
136 |
df["Model_Type"] = df["Model_Type"].map(model_types)
|
137 |
-
|
138 |
df["Model"] = df.apply(format_model_link, axis=1)
|
139 |
-
|
140 |
df["zho"] = df[["zho_Hans", "zho_Hant"]].mean(axis=1)
|
141 |
|
142 |
columns = lang_ids.split("\t")
|
@@ -152,22 +192,63 @@ with demo:
|
|
152 |
|
153 |
# df = df.style.applymap(apply_color_gradient, subset=['eng'])
|
154 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
|
|
|
|
155 |
|
156 |
|
157 |
for col in numeric_cols:
|
158 |
lang_format_with_color = partial(format_with_color,
|
159 |
-
min_val=df[col].min(),
|
160 |
-
max_val=df[col].max()
|
|
|
|
|
|
|
161 |
|
162 |
df[col] = df[col].apply(lang_format_with_color)
|
163 |
|
164 |
-
|
165 |
# for col in numeric_cols:
|
166 |
# df[col] = (df[col] * 100).round(1).astype(str)
|
167 |
|
168 |
AutoEvalColumn.add_columns_from_df(df, numeric_cols)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
169 |
|
170 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
171 |
|
172 |
with gr.Row():
|
173 |
with gr.Accordion("π Citation", open=False):
|
|
|
15 |
|
16 |
π https://m-rewardbench.github.io/'''
|
17 |
|
18 |
+
# GOOGLE_SHEET_URL = "https://docs.google.com/spreadsheets/d/1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4/export?gid=0&format=csv"
|
19 |
+
|
20 |
+
# CSV endpoints of the results spreadsheet, one per worksheet.
# Index 0 (sheet "gt") is read by get_result_data(); index 1 (sheet "maple")
# is read by get_translation_data().
GOOGLE_SHEET_URLS = [
    "https://docs.google.com/spreadsheets/d/"
    "1qrD7plUdrBwAw7G6UeDVZAaV9ihxaNAcoiKwSaqotR4"
    f"/gviz/tq?tqx=out:csv&sheet={sheet_name}"
    for sheet_name in ("gt", "maple")
]
|
24 |
+
|
25 |
# ABOUT_TEXT = """
|
26 |
# <h1>
|
27 |
# <span style="font-variant: small-caps;">M-RewardBench</span>: Evaluating Reward Models in Multilingual Settings
|
|
|
57 |
})
|
58 |
|
59 |
|
60 |
+
class AutoEvalColumnTranslation:
    """Column descriptors for the translation leaderboard table.

    Every class attribute whose value is a dict describes one column through
    the keys ``name``, ``type``, ``displayed_by_default`` and ``never_hidden``.
    ``init_leaderboard`` collects these descriptors by walking the class
    ``__dict__``.
    """

    # Model column: displayed by default and flagged as never hidden.
    model = {
        "name": "Model",
        "type": "markdown",
        "displayed_by_default": True,
        "never_hidden": True,
    }

    # Model-type column; its display name is the short header "MT".
    model_type = {
        "name": "MT",
        "type": "markdown",
        "displayed_by_default": True,
        "never_hidden": True,
    }

    @classmethod
    def add_columns_from_df(cls, df, columns):
        """Register a markdown column descriptor for each name in ``columns``.

        Each new descriptor is stored as a class attribute named after the
        column, displayed by default and hideable.

        Args:
            df: Not used by this method; kept so the call signature stays
                the same for existing callers.
            columns: Iterable of column-name strings to register.

        A name is skipped when it equals "model" (case-insensitive) or when
        an existing descriptor already uses it as its ``name``. Without the
        second check a column called "MT" would add a second descriptor with
        the same display name as ``model_type``.
        """
        # Snapshot the display names already claimed before mutating the class.
        taken = {
            spec["name"]
            for spec in vars(cls).values()
            if isinstance(spec, dict) and "name" in spec
        }
        for col in columns:
            if col.lower() == "model" or col in taken:
                continue
            setattr(cls, col, {
                "name": col,
                "type": "markdown",
                "displayed_by_default": True,
                "never_hidden": False,
            })
            taken.add(col)
|
85 |
+
|
86 |
def get_result_data():
    """Read the first URL in GOOGLE_SHEET_URLS as CSV and return the result of pd.read_csv."""
    main_sheet_url = GOOGLE_SHEET_URLS[0]
    return pd.read_csv(main_sheet_url)
|
88 |
+
|
89 |
|
90 |
+
def get_translation_data():
    """Read the second URL in GOOGLE_SHEET_URLS as CSV and return the result of pd.read_csv."""
    translation_sheet_url = GOOGLE_SHEET_URLS[1]
    return pd.read_csv(translation_sheet_url)
|
92 |
|
93 |
+
|
94 |
+
def init_leaderboard(dataframe, df_class):
|
95 |
if dataframe is None or dataframe.empty:
|
96 |
raise ValueError("Leaderboard DataFrame is empty or None.")
|
97 |
|
|
|
99 |
value=dataframe,
|
100 |
datatype=[
|
101 |
col["type"]
|
102 |
+
for col in df_class.__dict__.values()
|
103 |
if isinstance(col, dict)
|
104 |
],
|
105 |
select_columns=SelectColumns(
|
106 |
default_selection=[
|
107 |
col["name"]
|
108 |
+
for col in df_class.__dict__.values()
|
109 |
if isinstance(col, dict) and col["displayed_by_default"]
|
110 |
],
|
111 |
cant_deselect=[
|
112 |
col["name"]
|
113 |
+
for col in df_class.__dict__.values()
|
114 |
if isinstance(col, dict) and col.get("never_hidden", False)
|
115 |
],
|
116 |
label="Select Columns to Display:",
|
|
|
135 |
# Emoji badge substituted for each Model_Type value (applied with
# df["Model_Type"].map(model_types) when the tabs are built).
# NOTE(review): the glyphs arrived as mojibake ("π¬", "π―", "π’"), i.e. UTF-8
# emoji bytes decoded as ISO-8859-7. They were reconstructed from the surviving
# trailing byte of each sequence; confirm against the repository copy.
model_types = {"Generative RM": "💬", "DPO": "🎯", "Sequence Classifier": "🔢"}
|
136 |
|
137 |
from functools import partial
|
138 |
+
def format_with_color(val, min_val=50, max_val=100, scale=True):
|
139 |
"""
|
140 |
Formats a value with inline green color gradient CSS.
|
141 |
Returns an HTML string with bold, black text and muted green background.
|
|
|
147 |
|
148 |
# Normalize value between 50 and 100 to 0-1 range
|
149 |
normalized = (val - min_val) / (max_val - min_val)
|
150 |
+
# print(normalized)
|
151 |
# Clamp value between 0 and 1
|
152 |
normalized = max(0, min(1, normalized))
|
153 |
|
|
|
156 |
intensity = int(50 + (150 * (1 - normalized)))
|
157 |
|
158 |
# Return HTML with inline CSS - bold black text
|
159 |
+
show_val = val
|
160 |
+
|
161 |
+
if scale:
|
162 |
+
show_val = val*100
|
163 |
+
|
164 |
+
return f'<div val={val} style="background-color: rgb({intensity}, 200, {intensity}); color: black; font-weight: bold; text-align: center; vertical-align: middle;">{show_val:.1f}</div>'
|
165 |
|
166 |
except (ValueError, TypeError):
|
167 |
return str(val)
|
|
|
173 |
gr.Markdown(INTRODUCTION_TEXT)
|
174 |
|
175 |
with gr.Tabs() as tabs:
|
176 |
+
with gr.TabItem("🏅 Main"):
|
177 |
df = get_result_data()
|
178 |
df["Model_Type"] = df["Model_Type"].map(model_types)
|
|
|
179 |
df["Model"] = df.apply(format_model_link, axis=1)
|
|
|
180 |
df["zho"] = df[["zho_Hans", "zho_Hant"]].mean(axis=1)
|
181 |
|
182 |
columns = lang_ids.split("\t")
|
|
|
192 |
|
193 |
# df = df.style.applymap(apply_color_gradient, subset=['eng'])
|
194 |
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
195 |
+
global_min = df.select_dtypes(include='number').min().min().astype(float)
|
196 |
+
global_max = df.select_dtypes(include='number').max().max().astype(float)
|
197 |
|
198 |
|
199 |
for col in numeric_cols:
|
200 |
lang_format_with_color = partial(format_with_color,
|
201 |
+
# min_val=df[col].min(),
|
202 |
+
# max_val=df[col].max(),
|
203 |
+
min_val=global_min,
|
204 |
+
max_val=global_max,
|
205 |
+
)
|
206 |
|
207 |
df[col] = df[col].apply(lang_format_with_color)
|
208 |
|
|
|
209 |
# for col in numeric_cols:
|
210 |
# df[col] = (df[col] * 100).round(1).astype(str)
|
211 |
|
212 |
AutoEvalColumn.add_columns_from_df(df, numeric_cols)
|
213 |
+
leaderboard = init_leaderboard(df, AutoEvalColumn)
|
214 |
+
|
215 |
+
with gr.TabItem("🏅 Translation"):
|
216 |
+
df = get_translation_data()
|
217 |
+
df["Model_Type"] = df["Model_Type"].map(model_types)
|
218 |
+
df["Model"] = df.apply(format_model_link, axis=1)
|
219 |
+
|
220 |
+
df.rename(columns={
|
221 |
+
"Model_Type": "MT",
|
222 |
+
"Avg": "AVG",
|
223 |
+
}, inplace=True)
|
224 |
+
|
225 |
+
numeric_cols = df.select_dtypes(include=[np.number]).columns
|
226 |
+
# print(df[numeric_cols].min().min())
|
227 |
+
# print(df[numeric_cols].max().max())
|
228 |
+
global_min = df.select_dtypes(include='number').min().min().astype(float)
|
229 |
+
global_max = df.select_dtypes(include='number').max().max().astype(float)
|
230 |
+
# print(global_max)
|
231 |
+
|
232 |
+
for col in numeric_cols:
|
233 |
+
# print(df[col].min())
|
234 |
+
lang_format_with_color = partial(format_with_color,
|
235 |
+
min_val=global_min,
|
236 |
+
max_val=global_max,
|
237 |
+
# min_val=df[col].min(),
|
238 |
+
# max_val=df[col].max(),
|
239 |
+
scale=False)
|
240 |
+
df[col] = df[col].apply(lang_format_with_color)
|
241 |
+
|
242 |
+
|
243 |
|
244 |
+
|
245 |
+
# for col in numeric_cols:
|
246 |
+
# df[col] = (df[col] * 100).round(1).astype(str)
|
247 |
+
|
248 |
+
AutoEvalColumnTranslation.add_columns_from_df(df, numeric_cols)
|
249 |
+
leaderboard = init_leaderboard(df, AutoEvalColumnTranslation)
|
250 |
+
|
251 |
+
|
252 |
|
253 |
with gr.Row():
|
254 |
with gr.Accordion("📙 Citation", open=False):
|