add latency cost table
Files changed:
- app.py +146 -59
- crm-results/hf_leaderboard_latency_cost.csv +37 -37
- src/display/utils.py +22 -5
- src/populate.py +65 -56
app.py
CHANGED
@@ -11,8 +11,11 @@ from src.about import ( # CITATION_BUTTON_LABEL,; CITATION_BUTTON_TEXT,; EVALUA
 from src.display.css_html_js import custom_css
 from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EVAL_COLS,; NUMERIC_INTERVALS,; ModelType,; Precision,
     COLS,
+    COST_COLS,
+    COST_TYPES,
     TYPES,
     AutoEvalColumn,
+    CostEvalColumn,
     fields,
 )
 
@@ -20,10 +23,11 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
 from src.envs import CRM_RESULTS_PATH
 from src.populate import get_leaderboard_df_crm
 
-original_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS)
+original_df, cost_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
 
 # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
 leaderboard_df = original_df.copy()
+leaderboard_cost_df = cost_df.copy()
 # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
 
 
@@ -38,19 +42,12 @@ def update_table(
     use_case_area_query: list,
     use_case_query: list,
     use_case_type_query: list,
-    # type_query: list,
-    # precision_query: str,
-    # size_query: list,
-    # show_deleted: bool,
-    # query: str,
 ):
-    # filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
-    # filtered_df = filter_queries(query, filtered_df)
     filtered_df = filter_llm_func(hidden_df, llm_query)
     filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
     filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)
-    filtered_df = filter_accuracy_threshold_func(filtered_df, accuracy_threshold_query)
-
+    filtered_df["Accuracy Threshold"] = filter_accuracy_threshold_func(filtered_df, accuracy_threshold_query)
+    filtered_df = filtered_df[filtered_df["Accuracy Threshold"]]
     filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
     filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
@@ -59,6 +56,32 @@ def update_table(
     return df
 
 
+def update_cost_table(
+    hidden_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+    use_case_type_query: list,
+):
+    filtered_df = filter_llm_func(hidden_df, llm_query)
+    filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
+    filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
+    df = select_columns_cost_table(filtered_df, columns)
+    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
+
+
+# def highlight_cols(x):
+#     df = x.copy()
+#     df.loc[:, :] = "color: black"
+#     df.loc[, ["Accuracy"]] = "background-color: #b3d5a4"
+#     return df
+
+
+def highlight_cost_band_low(s, props=""):
+
+    return props if s == "Low" else None
+
+
 def init_leaderboard_df(
     leaderboard_df: pd.DataFrame,
     columns: list,
@@ -72,7 +95,6 @@ def init_leaderboard_df(
 ):
 
     # Applying the style function
-    # df = leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value]
     # return df.style.apply(highlight_cols, axis=None)
     return update_table(
         leaderboard_df,
@@ -87,13 +109,30 @@ def init_leaderboard_df(
     )
 
 
+def init_leaderboard_cost_df(
+    leaderboard_df: pd.DataFrame,
+    columns: list,
+    llm_query: list,
+    llm_provider_query: list,
+    use_case_type_query: list,
+):
+
+    return update_cost_table(
+        leaderboard_df,
+        columns,
+        llm_query,
+        llm_provider_query,
+        use_case_type_query,
+    )
+
+
 def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
     return df[df["Accuracy Method"] == accuracy_method_query]
 
 
 def filter_accuracy_threshold_func(df: pd.DataFrame, accuracy_threshold_query: str) -> pd.DataFrame:
     accuracy_cols = ["Instruction Following", "Conciseness", "Completeness", "Accuracy"]
-    return
+    return (df.loc[:, accuracy_cols] >= float(accuracy_threshold_query)).all(axis=1)
 
 
 def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
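Note (illustrative, not part of the diff): `filter_accuracy_threshold_func` now returns a boolean Series instead of nothing, and `update_table` stores it in an "Accuracy Threshold" column and uses it as a row mask, keeping only rows where every listed accuracy column meets the threshold. A small worked sketch with made-up scores and an assumed threshold of "4.0" (the query arrives as a string, hence the float() cast):

import pandas as pd

df = pd.DataFrame(
    {
        "Instruction Following": [4.5, 3.0],
        "Conciseness": [4.0, 4.8],
        "Completeness": [4.2, 4.9],
        "Accuracy": [4.7, 4.1],
    }
)
accuracy_cols = ["Instruction Following", "Conciseness", "Completeness", "Accuracy"]
# True only where every accuracy column in the row meets the threshold.
mask = (df.loc[:, accuracy_cols] >= float("4.0")).all(axis=1)
print(mask.tolist())  # [True, False]: the second row fails on Instruction Following
print(df[mask])       # only the first row survives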
@@ -130,45 +169,12 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
     return filtered_df
 
 
-
-
-
-
-
-
-    # if _q != "":
-    #     temp_filtered_df = search_table(filtered_df, _q)
-    #     if len(temp_filtered_df) > 0:
-    #         final_df.append(temp_filtered_df)
-    # if len(final_df) > 0:
-    #     filtered_df = pd.concat(final_df)
-    #     filtered_df = filtered_df.drop_duplicates(
-    #         subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
-    #     )
-
-    # return filtered_df
-
-
-# def filter_models(
-#     df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
-# ) -> pd.DataFrame:
-#     # Show all models
-#     filtered_df = df
-#     # if show_deleted:
-#     #     filtered_df = df
-#     # else: # Show only still on the hub models
-#     #     filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
-
-#     type_emoji = [t[0] for t in type_query]
-#     filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
-#     filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
-
-#     numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
-#     params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
-#     mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
-#     filtered_df = filtered_df.loc[mask]
-
-#     return filtered_df
+def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
+    always_here_cols = [
+        CostEvalColumn.model.name,
+    ]
+    filtered_df = df[always_here_cols + [c for c in COST_COLS if c in df.columns and c in columns]]
+    return filtered_df
 
 
 demo = gr.Blocks(css=custom_css)
@@ -259,14 +265,14 @@ with demo:
                     # multiselect=True,
                    # interactive=True,
                    # )
-                with gr.Column():
-
-
-
-
-
-
-
+                # with gr.Column():
+                #     filter_metric_area = gr.CheckboxGroup(
+                #         choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
+                #         value=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
+                #         label="Metric Area",
+                #         info="",
+                #         interactive=True,
+                #     )
                 with gr.Column():
                     filter_accuracy_method = gr.Radio(
                         choices=["Manual", "Auto"],
@@ -374,6 +380,87 @@ with demo:
                    leaderboard_table,
                    queue=True,
                )
+        with gr.TabItem("🏅 Latency & Cost", elem_id="llm-benchmark-tab-table", id=1):
+            with gr.Row():
+                with gr.Column():
+                    with gr.Row():
+                        shown_columns = gr.CheckboxGroup(
+                            choices=[c.name for c in fields(CostEvalColumn) if not c.hidden and not c.never_hidden],
+                            value=[
+                                c.name
+                                for c in fields(CostEvalColumn)
+                                if c.displayed_by_default and not c.hidden and not c.never_hidden
+                            ],
+                            label="Select columns to show",
+                            elem_id="column-select",
+                            interactive=True,
+                        )
+            with gr.Row():
+                with gr.Column():
+                    filter_llm = gr.CheckboxGroup(
+                        choices=list(cost_df["Model Name"].unique()),
+                        value=list(cost_df["Model Name"].unique()),
+                        label="Model Name",
+                        info="",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    filter_llm_provider = gr.CheckboxGroup(
+                        choices=list(cost_df["LLM Provider"].unique()),
+                        value=list(cost_df["LLM Provider"].unique()),
+                        label="LLM Provider",
+                        info="",
+                        interactive=True,
+                    )
+                with gr.Column():
+                    filter_use_case_type = gr.CheckboxGroup(
+                        choices=["Long", "Short"],
+                        value=["Long", "Short"],
+                        label="Use Case Type",
+                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
+                        interactive=True,
+                    )
+
+            leaderboard_table = gr.components.Dataframe(
+                value=init_leaderboard_cost_df(
+                    leaderboard_cost_df,
+                    shown_columns.value,
+                    filter_llm.value,
+                    filter_llm_provider.value,
+                    filter_use_case_type.value,
+                ),
+                headers=[c.name for c in fields(CostEvalColumn) if c.never_hidden] + shown_columns.value,
+                datatype=COST_TYPES,
+                elem_id="leaderboard-table",
+                interactive=False,
+                visible=True,
+            )
+
+            hidden_leaderboard_table_for_search = gr.components.Dataframe(
+                value=cost_df[COST_COLS],
+                headers=COST_COLS,
+                datatype=COST_TYPES,
+                visible=False,
+            )
+
+            for selector in [
+                shown_columns,
+                filter_llm,
+                filter_llm_provider,
+                filter_use_case_type,
+            ]:
+                selector.change(
+                    update_cost_table,
+                    [
+                        hidden_leaderboard_table_for_search,
+                        shown_columns,
+                        filter_llm,
+                        filter_llm_provider,
+                        filter_use_case_type,
+                    ],
+                    leaderboard_table,
+                    queue=True,
+                )
 
         with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
             gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
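Note (illustrative, not part of the diff): the new tab wires every selector's .change event to update_cost_table with a hidden, unfiltered Dataframe as the first input, so each refresh recomputes the visible table from the full data rather than filtering an already-filtered view. A stripped-down sketch of that pattern with toy data and hypothetical names (not the repo's):

import gradio as gr
import pandas as pd

full_df = pd.DataFrame({"Model Name": ["A", "B"], "Cost Band": ["Low", "High"]})

def refresh(hidden_df, bands):
    # hidden_df arrives as a pandas DataFrame because a gr.Dataframe is passed as an input.
    return hidden_df[hidden_df["Cost Band"].isin(bands)]

with gr.Blocks() as demo:
    band_filter = gr.CheckboxGroup(choices=["Low", "High"], value=["Low", "High"], label="Cost Band")
    visible_table = gr.Dataframe(value=full_df, interactive=False)
    hidden_table = gr.Dataframe(value=full_df, visible=False)
    band_filter.change(refresh, [hidden_table, band_filter], visible_table)

# demo.launch()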
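Note (illustrative, not part of the diff): update_cost_table above returns a pandas Styler rather than a plain DataFrame; Styler.map (pandas >= 2.1, formerly applymap) calls the highlighter once per cell and forwards extra keyword arguments to it. A minimal self-contained sketch of that mechanism:

import pandas as pd

df = pd.DataFrame(
    {
        "Model Name": ["Claude 3 Haiku", "GPT 4 Turbo"],
        "Cost Band": ["Low", "High"],
    }
)

def highlight_low(value, props=""):
    # Return a CSS string for cells that should be styled, None to leave the rest alone.
    return props if value == "Low" else None

styled = df.style.map(highlight_low, props="background-color: #b3d5a4")
print(styled.to_html())  # only the "Low" cell carries the green background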
crm-results/hf_leaderboard_latency_cost.csv
CHANGED
@@ -1,37 +1,37 @@
-Model Name,Use Case Type
-AI21 Jamba-Instruct,Long
-AI21 Jamba-Instruct,Short
-Claude 3 Haiku,Long
-Claude 3 Haiku,Short
-Claude 3 Opus,Long
-Claude 3 Opus,Short
-Cohere Command R+,Long
-Cohere Command R+,Short
-Cohere Command Text,Long
-Cohere Command Text,Short
-Gemini Pro 1.5,Long
-Gemini Pro 1.5,Short
-Gemini Pro 1,Long
-Gemini Pro 1,Short
-GPT 3.5 Turbo,Long
-GPT 3.5 Turbo,Short
-GPT 4 Turbo,Long
-GPT 4 Turbo,Short
-GPT4-o,Long
-GPT4-o,Short
-Mistral 7B,Long,Self-host (g5.48xlarge),8.83,242.0,16.5,High
-Mistral 7B,Short,Self-host (g5.48xlarge),8.31,247.0,15.5,High
-LLaMA 3 8B,Long,Self-host (g5.48xlarge),3.76,251.5,7.0,
-LLaMA 3 8B,Short,Self-host (g5.48xlarge),3.23,243.6,6.0,
-LLaMA 3 70B,Long,Self-host (p4d.24xlarge),20.1,243.9,67.7,High
-LLaMA 3 70B,Short,Self-host (p4d.24xlarge),29.4,251.2,99.0,High
-Mixtral 8x7B,Long,Self-host (p4d.24xlarge),2.44,248.5,8.22,
-Mixtral 8x7B,Short,Self-host (p4d.24xlarge),2.41,250.0,8.11,
-SF-TextBase 7B,Long,Self-host (g5.48xlarge),8.99,248.5,16.80,High
-SF-TextBase 7B,Short,Self-host (g5.48xlarge),8.29,248.7,15.50,High
-SF-TextBase 70B,Long,Self-host (p4de.24xlarge),6.52,253.7,28.17,High
-SF-TextBase 70B,Short,Self-host (p4de.24xlarge),6.24,249.7,26.96,High
-SF-TextSum,Long,Self-host (g5.48xlarge),8.85,244.0,16.55,High
-SF-TextSum,Short,Self-host (g5.48xlarge),8.34,250.4,15.60,High
-XGen
-XGen
+Model Name,Use Case Type,Version,Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
+AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
+AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
+Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
+Claude 3 Haiku,Short,,Bedrock,2.2,245.4,0.4,Low,,Claude 3 Haiku,0.25,1.25,,,100%,61.11,,,,,
+Claude 3 Opus,Long,,Bedrock,12.2,242.7,61.1,High,,Claude 3 Opus,15,75,,,,,,,,,
+Claude 3 Opus,Short,,Bedrock,8.4,243.2,25.4,High,,AI21 Jamba-Instruct,0.5,0.7,,,,,,,,,
+Cohere Command R+,Long,,Bedrock,7.7,245.7,11.7,High,,Cohere Command Text,1.5,2,,,,,,,,,
+Cohere Command R+,Short,,Bedrock,7.1,249.9,5.1,Medium,,Cohere Command R+,3,15,,,,,,,,,
+Cohere Command Text,Long,,Bedrock,12.9,238.7,4.3,Medium,,Gemini Pro 1,0.5,1.5,,,,,,,,,
+Cohere Command Text,Short,,Bedrock,9.6,245.6,1.1,Low,,Gemini Pro 1.5,3.5,7,,,,,,,,,
+Gemini Pro 1.5,Long,,Google,5.5,245.7,11.0,High,,,,,,,,,,,,,
+Gemini Pro 1.5,Short,,Google,5.4,247.5,3.3,Medium,,,,,,,,,,,,,
+Gemini Pro 1,Long,,Google,6.0,228.9,1.7,Medium,,,,,,,,,,,,,
+Gemini Pro 1,Short,,Google,4.4,247.4,0.6,Low,,,,,,,,,,,,,
+GPT 3.5 Turbo,Long,,OpenAI,4.5,249.9,1.6,Low,,,,,,,,,,,,,
+GPT 3.5 Turbo,Short,,OpenAI,4.2,238.3,0.6,Low,,,,,,,,,,,,,
+GPT 4 Turbo,Long,,OpenAI,12.3,247.6,32.0,High,,,,,,,,,,,,,
+GPT 4 Turbo,Short,,OpenAI,12.3,250.0,11.7,High,,,,,,,,,,,,,
+GPT4-o,Long,,OpenAI,5.1,248.4,15.9,High,,,,,,,,,,,,,
+GPT4-o,Short,,OpenAI,5.0,250.0,5.8,Medium,,,,,,,,,,,,,
+Mistral 7B,Long,Mistral-7B-Instruct-v0.2,Self-host (g5.48xlarge),8.83,242.0,16.5,High,,,,,,,,,,,,,
+Mistral 7B,Short,Mistral-7B-Instruct-v0.2,Self-host (g5.48xlarge),8.31,247.0,15.5,High,,,,,,,,,,,,,
+LLaMA 3 8B,Long,Meta-Llama-3-8B-Instruct,Self-host (g5.48xlarge),3.76,251.5,7.0,Medium,,,,,,,,,,,,,
+LLaMA 3 8B,Short,Meta-Llama-3-8B-Instruct,Self-host (g5.48xlarge),3.23,243.6,6.0,Medium,,,,,,,,,,,,,
+LLaMA 3 70B,Long,llama-3-70b-instruct,Self-host (p4d.24xlarge),20.1,243.9,67.7,High,,,,,,,,,,,,,
+LLaMA 3 70B,Short,llama-3-70b-instruct,Self-host (p4d.24xlarge),29.4,251.2,99.0,High,,,,,,,,,,,,,
+Mixtral 8x7B,Long,mixtral-8x7b-instruct,Self-host (p4d.24xlarge),2.44,248.5,8.22,Medium,,,,,,,,,,,,,
+Mixtral 8x7B,Short,mixtral-8x7b-instruct,Self-host (p4d.24xlarge),2.41,250.0,8.11,Medium,,,,,,,,,,,,,
+SF-TextBase 7B,Long,CRM-TextBase-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.99,248.5,16.80,High,,,,,,,,,,,,,
+SF-TextBase 7B,Short,CRM-TextBase-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.29,248.7,15.50,High,,,,,,,,,,,,,
+SF-TextBase 70B,Long,TextBase-70B-8K,Self-host (p4de.24xlarge),6.52,253.7,28.17,High,,,,,,,,,,,,,
+SF-TextBase 70B,Short,TextBase-70B-8K,Self-host (p4de.24xlarge),6.24,249.7,26.96,High,,,,,,,,,,,,,
+SF-TextSum,Long,CRM-TSUM-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.85,244.0,16.55,High,,,,,,,,,,,,,
+SF-TextSum,Short,CRM-TSUM-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.34,250.4,15.60,High,,,,,,,,,,,,,
+XGen 22B,Long,EinsteinXgen2E4DSStreaming (endpoint),Self-host (p4de.24xlarge),3.71,250.0,16.03,High,not able to get response for large token requests (5K-token input),,,,,,,,,,,,
+XGen 22B,Short,EinsteinXgen2E4DSStreaming (endpoint),Self-host (p4de.24xlarge),2.64,250.0,11.40,High,,,,,,,,,,,,,
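Note (illustrative, not part of the diff): the rewritten CSV adds Version, Platform, Mean Latency (sec) per Request, Mean Output Tokens, Mean Cost per 1K Requests and Cost Band columns, plus reference columns with per-million-token prices and the percentile breakpoints that appear to back the Low/Medium/High bands. As a rough sanity check only (the published figures are presumably computed from measured token counts, so they will not match exactly), a per-1K-request cost can be derived from such prices like this:

def cost_per_1k_requests(input_tokens, output_tokens, usd_per_1m_input, usd_per_1m_output):
    # Cost of one request, then scaled to 1,000 requests.
    per_request = (input_tokens * usd_per_1m_input + output_tokens * usd_per_1m_output) / 1_000_000
    return per_request * 1_000

# Example: $0.5 / $1.5 per million input/output tokens, a "Long" use case
# (~3k input tokens) and ~250 output tokens, as quoted in the filter tooltip.
print(round(cost_per_1k_requests(3_000, 250, 0.5, 1.5), 2))  # -> 1.88 USD per 1K requests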
src/display/utils.py
CHANGED
@@ -51,9 +51,26 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(
     ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
-#
+# We use make dataclass to dynamically fill the scores from Tasks
+AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
+
 
-# Cost metrics
+# Speed (Latency) & Cost metrics
+cost_eval_column_dict = []
+# Init
+cost_eval_column_dict.append(
+    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
+)
+cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
+cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
+cost_eval_column_dict.append(
+    ["latency", ColumnContent, ColumnContent("Mean Latency (sec) per Request", "markdown", True)]
+)
+cost_eval_column_dict.append(
+    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
+)
+cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
+CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)
 
 # Trust & Safety metrics
 
@@ -73,9 +90,6 @@ auto_eval_column_dict.append(
 # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
 # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
 
-# We use make dataclass to dynamically fill the scores from Tasks
-AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
-
 
 ## For the queue columns in the submission tab
 @dataclass(frozen=True)
@@ -156,6 +170,9 @@ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
 COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
 
+COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
+COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
+
 EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
 EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
 
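Note (illustrative, not part of the diff): CostEvalColumn mirrors the AutoEvalColumn pattern — each [attr, type, ColumnContent(...)] entry becomes a dataclass field whose default is a ColumnContent, and the module's fields() helper reads those defaults back so COST_COLS / COST_TYPES list the display names and datatypes. A self-contained sketch with stand-in definitions (the real ColumnContent and fields live elsewhere in src/display/utils.py):

from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Return the ColumnContent defaults attached to the generated dataclass.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

cost_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)],
    ["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)],
]
# Each entry becomes a field whose default value is its ColumnContent instance.
CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)

COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
print(COST_COLS, COST_TYPES)  # ['Model Name', 'Cost Band'] ['markdown', 'markdown']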
src/populate.py
CHANGED
@@ -1,17 +1,19 @@
-import json
 import os
 
 import pandas as pd
 
-from src.display.formatting import has_no_nan_values, make_clickable_model
-from src.display.utils import AutoEvalColumn, EvalQueueColumn
-from src.leaderboard.read_evals import get_raw_eval_results
+# from src.display.formatting import has_no_nan_values, make_clickable_model
+# from src.display.utils import AutoEvalColumn, EvalQueueColumn
+# from src.leaderboard.read_evals import get_raw_eval_results
 
 
-def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
+def get_leaderboard_df_crm(
+    crm_results_path: str, accuracy_cols: list, cost_cols: list
+) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
-    leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+
+    leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
     # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
     #     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
@@ -19,55 +21,62 @@ def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
     # print(leaderboard_accuracy_df)
     # print(leaderboard_accuracy_df.columns)
     # print(leaderboard_accuracy_df["Model Name"].nunique())
-    leaderboard_accuracy_df = leaderboard_accuracy_df[
-    return leaderboard_accuracy_df
+    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
 
+    ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
+    leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
+    leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
+    leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
+    leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
+    return leaderboard_accuracy_df, leaderboard_cost_df
+
+
+# def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
+#     """Creates a dataframe from all the individual experiment results"""
+#     raw_data = get_raw_eval_results(results_path, requests_path)
+#     all_data_json = [v.to_dict() for v in raw_data]
+
+#     df = pd.DataFrame.from_records(all_data_json)
+#     df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
+#     df = df[cols].round(decimals=2)
+
+#     # filter out if any of the benchmarks have not been produced
+#     df = df[has_no_nan_values(df, benchmark_cols)]
+#     return raw_data, df
+
+# def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
+#     """Creates the different dataframes for the evaluation queues requestes"""
+#     entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
+#     all_evals = []
+
+#     for entry in entries:
+#         if ".json" in entry:
+#             file_path = os.path.join(save_path, entry)
+#             with open(file_path) as fp:
+#                 data = json.load(fp)
+
+#             data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#             data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+
+#             all_evals.append(data)
+#         elif ".md" not in entry:
+#             # this is a folder
+#             sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
+#             for sub_entry in sub_entries:
+#                 file_path = os.path.join(save_path, entry, sub_entry)
+#                 with open(file_path) as fp:
+#                     data = json.load(fp)
+
+#                 data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
+#                 data[EvalQueueColumn.revision.name] = data.get("revision", "main")
+#                 all_evals.append(data)
+
+#     pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
+#     running_list = [e for e in all_evals if e["status"] == "RUNNING"]
+#     finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
+#     df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
+#     df_running = pd.DataFrame.from_records(running_list, columns=cols)
+#     df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
+#     return df_finished[cols], df_running[cols], df_pending[cols]
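Note (illustrative, not part of the diff): the cost sheet carries no "LLM Provider" column, so get_leaderboard_df_crm joins it in from the accuracy sheet on "Model Name"; the hard-coded fillna("Google") suggests the Gemini rows have no counterpart in the accuracy data. A toy reproduction of that join with made-up rows:

import pandas as pd

accuracy = pd.DataFrame(
    {"Model Name": ["GPT 4 Turbo", "Claude 3 Opus"], "LLM Provider": ["OpenAI", "Bedrock"]}
)
cost = pd.DataFrame({"Model Name": ["GPT 4 Turbo", "Claude 3 Opus", "Gemini Pro 1.5"]})

# Deduplicate (the accuracy sheet can repeat a model across use cases), then join on the model name.
ref = accuracy[["Model Name", "LLM Provider"]].drop_duplicates()
cost = cost.join(ref.set_index("Model Name"), on="Model Name")
cost["LLM Provider"] = cost["LLM Provider"].fillna("Google")
print(cost)  # Gemini Pro 1.5 gets the fallback provider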