Spaces:
Paused
Paused
edbeeching
committed on
Commit
•
fcb01e3
1
Parent(s):
b2c063a
updates table to include revision
Browse files
app.py
CHANGED
@@ -46,8 +46,8 @@ def load_results(model, benchmark, metric):
|
|
46 |
return mean_acc, data["config"]["model_args"]
|
47 |
|
48 |
|
49 |
-
COLS = ["
|
50 |
-
TYPES = ["str",
|
51 |
|
52 |
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
|
53 |
EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
|
@@ -59,7 +59,7 @@ def get_leaderboard():
|
|
59 |
all_data = get_eval_results_dicts()
|
60 |
dataframe = pd.DataFrame.from_records(all_data)
|
61 |
dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
|
62 |
-
|
63 |
dataframe = dataframe[COLS]
|
64 |
return dataframe
|
65 |
|
|
|
46 |
return mean_acc, data["config"]["model_args"]
|
47 |
|
48 |
|
49 |
+
COLS = ["base_model", "revision", "8bit", "total ⬆️", "ARC (25-shot) ⬆️", "HellaSwag (10-shot) ⬆️", "MMLU (5-shot) ⬆️", "TruthQA (0-shot) ⬆️"]
|
50 |
+
TYPES = ["markdown","str", "bool", "number", "number", "number", "number", "number", ]
|
51 |
|
52 |
EVAL_COLS = ["model", "revision", "private", "8bit_eval", "is_delta_weight", "status"]
|
53 |
EVAL_TYPES = ["markdown","str", "bool", "bool", "bool", "str"]
|
|
|
59 |
all_data = get_eval_results_dicts()
|
60 |
dataframe = pd.DataFrame.from_records(all_data)
|
61 |
dataframe = dataframe.sort_values(by=['total ⬆️'], ascending=False)
|
62 |
+
print(dataframe)
|
63 |
dataframe = dataframe[COLS]
|
64 |
return dataframe
|
65 |
|
utils.py
CHANGED
@@ -50,6 +50,7 @@ class EvalResult:
|
|
50 |
eval_name : str
|
51 |
org : str
|
52 |
model : str
|
|
|
53 |
is_8bit : bool
|
54 |
results : dict
|
55 |
|
@@ -60,8 +61,11 @@ class EvalResult:
|
|
60 |
else:
|
61 |
base_model =f"{self.model}"
|
62 |
data_dict = {}
|
|
|
63 |
data_dict["eval_name"] = self.eval_name
|
|
|
64 |
data_dict["base_model"] = make_clickable_model(base_model)
|
|
|
65 |
data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
|
66 |
data_dict["# params"] = get_n_params(base_model)
|
67 |
|
@@ -83,21 +87,22 @@ def parse_eval_result(json_filepath: str) -> Tuple[str, dict]:
|
|
83 |
|
84 |
path_split = json_filepath.split("/")
|
85 |
org = None
|
86 |
-
model = path_split[-
|
87 |
is_8bit = path_split[-2] == "8bit"
|
88 |
-
|
|
|
89 |
# handles gpt2 type models that don't have an org
|
90 |
-
result_key = f"{path_split[-3]}_{path_split[-2]}"
|
91 |
-
else:
|
92 |
result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
|
93 |
-
|
|
|
|
|
94 |
|
95 |
eval_result = None
|
96 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
97 |
if benchmark in json_filepath:
|
98 |
accs = np.array([v[metric] for k, v in data["results"].items()])
|
99 |
mean_acc = round(np.mean(accs),3)
|
100 |
-
eval_result = EvalResult(result_key, org, model, is_8bit, {benchmark:mean_acc})
|
101 |
|
102 |
return result_key, eval_result
|
103 |
|
|
|
50 |
eval_name : str
|
51 |
org : str
|
52 |
model : str
|
53 |
+
revision : str
|
54 |
is_8bit : bool
|
55 |
results : dict
|
56 |
|
|
|
61 |
else:
|
62 |
base_model =f"{self.model}"
|
63 |
data_dict = {}
|
64 |
+
|
65 |
data_dict["eval_name"] = self.eval_name
|
66 |
+
data_dict["8bit"] = self.is_8bit
|
67 |
data_dict["base_model"] = make_clickable_model(base_model)
|
68 |
+
data_dict["revision"] = self.revision
|
69 |
data_dict["total ⬆️"] = round(sum([v for k,v in self.results.items()]),3)
|
70 |
data_dict["# params"] = get_n_params(base_model)
|
71 |
|
|
|
87 |
|
88 |
path_split = json_filepath.split("/")
|
89 |
org = None
|
90 |
+
model = path_split[-4]
|
91 |
is_8bit = path_split[-2] == "8bit"
|
92 |
+
revision = path_split[-3]
|
93 |
+
if len(path_split)== 6:
|
94 |
# handles gpt2 type models that don't have an org
|
|
|
|
|
95 |
result_key = f"{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
|
96 |
+
else:
|
97 |
+
result_key = f"{path_split[-5]}_{path_split[-4]}_{path_split[-3]}_{path_split[-2]}"
|
98 |
+
org = path_split[-5]
|
99 |
|
100 |
eval_result = None
|
101 |
for benchmark, metric in zip(BENCHMARKS, METRICS):
|
102 |
if benchmark in json_filepath:
|
103 |
accs = np.array([v[metric] for k, v in data["results"].items()])
|
104 |
mean_acc = round(np.mean(accs),3)
|
105 |
+
eval_result = EvalResult(result_key, org, model, revision, is_8bit, {benchmark:mean_acc})
|
106 |
|
107 |
return result_key, eval_result
|
108 |
|