simple fix of columns
- app.py +2 -2
- src/display/utils.py +15 -8
- src/populate.py +4 -4
app.py
CHANGED
@@ -156,7 +156,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("
+        with gr.TabItem("open-moe-llm-leaderboard", elem_id="llm-benchmark-tab-table", id=0):
             with gr.Row():
                 with gr.Column():
                     with gr.Row():
@@ -324,7 +324,7 @@ with demo:
                     value=None,
                     interactive=True,
                 )
-
+
                 with gr.Row():
                     with gr.Column():
                         model_name_textbox = gr.Textbox(label="Model name")
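For orientation, the hunks above sit inside the Gradio Blocks layout of app.py. Below is a minimal, runnable sketch of that nesting; it assumes a recent Gradio release, stubs INTRODUCTION_TEXT, and uses a placeholder where the real app builds the leaderboard table:

import gradio as gr

# Stub; the real app imports INTRODUCTION_TEXT from its display/about module.
INTRODUCTION_TEXT = "Placeholder introduction text."

with gr.Blocks() as demo:
    gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")

    with gr.Tabs(elem_classes="tab-buttons") as tabs:
        # The first hunk renames this tab to "open-moe-llm-leaderboard".
        with gr.TabItem("open-moe-llm-leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    with gr.Row():
                        gr.Markdown("The leaderboard table is rendered here in the real app.")

if __name__ == "__main__":
    demo.launch()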
src/display/utils.py
CHANGED
@@ -7,6 +7,11 @@ import pandas as pd
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]

+E2Es = "E2E(s)" #"End-to-end time (s)"
+PREs = "PRE(s)" #"Prefilling time (s)"
+TS = "T/s" #Decoding throughput (tok/s)
+InFrame = "Method" #"Inference framework"
+MULTIPLE_CHOICEs = ["mmlu"]

 @dataclass
 class Task:
@@ -46,7 +51,7 @@ class Tasks(Enum):

     # # XXX include me back at some point
     selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
-    mmlu = Task("mmlu", "acc", "MMLU/Acc (5-shot)")
+    mmlu = Task("mmlu", "acc", "MMLU") #MMLU/Acc (5-shot)


 # These classes are for user facing column names,
@@ -71,20 +76,22 @@ auto_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model", "ma
 # # auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Avg", "number", True)])

 # Inference framework
-auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent("
+auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])

 for task in Tasks:
     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
     # System performance metrics
-    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}
-
-
+    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
+    if task.value.benchmark in MULTIPLE_CHOICEs:
+        continue
+    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
+    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])

 # Model information
 auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
 auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
 auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str",
+auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", True)])
 auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
 auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
 auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
@@ -144,7 +151,7 @@ class InferenceFramework(Enum):

     def to_str(self):
         return self.value.name
-
+
     @staticmethod
     def from_str(inference_framework: str):
         if inference_framework in ["moe-infinity"]:
@@ -152,7 +159,7 @@ class InferenceFramework(Enum):
         if inference_framework in ["hf-chat"]:
             return InferenceFramework.HF_Chat
         return InferenceFramework.Unknown
-
+

 class WeightType(Enum):
     Adapter = ModelDetails("Adapter")
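To see what the new constants and the per-task loop produce, here is a small self-contained sketch; the Task and ColumnContent definitions are simplified stand-ins for the real ones in src/display/utils.py (the actual ColumnContent carries extra flags), and only the two tasks from the diff are included:

from dataclasses import dataclass
from enum import Enum

# Display-label constants added by the commit.
E2Es = "E2E(s)"      # end-to-end time (s)
PREs = "PRE(s)"      # prefilling time (s)
TS = "T/s"           # decoding throughput (tok/s)
InFrame = "Method"   # inference framework
MULTIPLE_CHOICEs = ["mmlu"]

@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str

class Tasks(Enum):
    selfcheck = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT")
    mmlu = Task("mmlu", "acc", "MMLU")

@dataclass
class ColumnContent:
    # Simplified: the real class has additional fields (hidden, never_hidden, ...).
    name: str
    type: str
    displayed_by_default: bool = True

auto_eval_column_dict = []
auto_eval_column_dict.append(["inference_framework", ColumnContent, ColumnContent(f"{InFrame}", "str", True)])
for task in Tasks:
    auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
    auto_eval_column_dict.append([f"{task.name}_end_to_end_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{E2Es}", "number", True)])
    # For multiple-choice benchmarks, only the accuracy and end-to-end time columns are kept.
    if task.value.benchmark in MULTIPLE_CHOICEs:
        continue
    auto_eval_column_dict.append([f"{task.name}_prefilling_time", ColumnContent, ColumnContent(f"{task.value.col_name}-{PREs}", "number", True)])
    auto_eval_column_dict.append([f"{task.name}_decoding_throughput", ColumnContent, ColumnContent(f"{task.value.col_name}-{TS}", "number", True)])

print([col.name for _, _, col in auto_eval_column_dict])
# ['Method', 'SelfCheckGPT', 'SelfCheckGPT-E2E(s)', 'SelfCheckGPT-PRE(s)',
#  'SelfCheckGPT-T/s', 'MMLU', 'MMLU-E2E(s)']

Because mmlu is listed in MULTIPLE_CHOICEs, only the MMLU accuracy and MMLU-E2E(s) columns are created for it; its prefilling-time and decoding-throughput columns are skipped.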
src/populate.py
CHANGED
@@ -12,7 +12,7 @@ from src.leaderboard.read_evals import get_raw_eval_results, EvalResult, update_

 from src.backend.envs import Tasks as BackendTasks
 from src.display.utils import Tasks
-
+from src.display.utils import E2Es, PREs, TS

 def get_leaderboard_df(
     results_path: str,
@@ -47,9 +47,9 @@ def get_leaderboard_df(

     # bm_to_name_map = {bm: name for name, bm in name_to_bm_map.items()}
     system_metrics_to_name_map = {
-        "end_to_end_time": "
-        "prefilling_time": "
-        "decoding_throughput": "
+        "end_to_end_time": f"{E2Es}",
+        "prefilling_time": f"{PREs}",
+        "decoding_throughput": f"{TS}",
     }

     all_data_json = []
|