Spaces:
Paused
Paused
Clémentine
committed on
Commit
•
d350941
1
Parent(s):
35763fc
fix rounding
Browse files- app.py +3 -1
- src/auto_leaderboard/load_results.py +2 -4
app.py
CHANGED
@@ -18,6 +18,8 @@ from src.assets.css_html_js import custom_css, get_window_url_params
|
|
18 |
from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
|
19 |
from src.init import get_all_requested_models, load_all_info_from_hub
|
20 |
|
|
|
|
|
21 |
# clone / pull the lmeh eval data
|
22 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
23 |
|
@@ -91,7 +93,7 @@ def get_leaderboard_df():
|
|
91 |
|
92 |
df = pd.DataFrame.from_records(all_data)
|
93 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
94 |
-
df = df[COLS]
|
95 |
|
96 |
# filter out if any of the benchmarks have not been produced
|
97 |
df = df[has_no_nan_values(df, BENCHMARK_COLS)]
|
|
|
18 |
from src.utils_display import AutoEvalColumn, EvalQueueColumn, fields, styled_error, styled_warning, styled_message
|
19 |
from src.init import get_all_requested_models, load_all_info_from_hub
|
20 |
|
21 |
+
pd.set_option('display.precision', 1)
|
22 |
+
|
23 |
# clone / pull the lmeh eval data
|
24 |
H4_TOKEN = os.environ.get("H4_TOKEN", None)
|
25 |
|
|
|
93 |
|
94 |
df = pd.DataFrame.from_records(all_data)
|
95 |
df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
|
96 |
+
df = df[COLS].round(decimals=2)
|
97 |
|
98 |
# filter out if any of the benchmarks have not been produced
|
99 |
df = df[has_no_nan_values(df, BENCHMARK_COLS)]
|
src/auto_leaderboard/load_results.py
CHANGED
@@ -44,9 +44,7 @@ class EvalResult:
|
|
44 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
45 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
46 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
47 |
-
data_dict[AutoEvalColumn.average.name] = round(
|
48 |
-
sum([v for k, v in self.results.items()]) / 4.0, 1
|
49 |
-
)
|
50 |
|
51 |
for benchmark in BENCHMARKS:
|
52 |
if benchmark not in self.results.keys():
|
@@ -95,7 +93,7 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, list[dict]]:
|
|
95 |
accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
|
96 |
if accs.size == 0:
|
97 |
continue
|
98 |
-
mean_acc = round(np.mean(accs) * 100.0, 1)
|
99 |
eval_results.append(EvalResult(
|
100 |
eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
|
101 |
))
|
|
|
44 |
data_dict[AutoEvalColumn.model.name] = make_clickable_model(base_model)
|
45 |
data_dict[AutoEvalColumn.dummy.name] = base_model
|
46 |
data_dict[AutoEvalColumn.revision.name] = self.revision
|
47 |
+
data_dict[AutoEvalColumn.average.name] = sum([v for k, v in self.results.items()]) / 4.0
|
|
|
|
|
48 |
|
49 |
for benchmark in BENCHMARKS:
|
50 |
if benchmark not in self.results.keys():
|
|
|
93 |
accs = np.array([v[metric] for k, v in data["results"].items() if benchmark in k])
|
94 |
if accs.size == 0:
|
95 |
continue
|
96 |
+
mean_acc = np.mean(accs) * 100.0
|
97 |
eval_results.append(EvalResult(
|
98 |
eval_name=result_key, org=org, model=model, revision=model_sha, results={benchmark: mean_acc}, #todo model_type=, weight_type=
|
99 |
))
|