yibum committed on
Commit ada4cd8 · 1 parent: e9c359b

add latency cost table

app.py CHANGED
@@ -11,8 +11,11 @@ from src.about import ( # CITATION_BUTTON_LABEL,; CITATION_BUTTON_TEXT,; EVALUA
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EVAL_COLS,; NUMERIC_INTERVALS,; ModelType,; Precision,
13
  COLS,
14
  TYPES,
15
  AutoEvalColumn,
 
16
  fields,
17
  )
18
 
@@ -20,10 +23,11 @@ from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EV
20
  from src.envs import CRM_RESULTS_PATH
21
  from src.populate import get_leaderboard_df_crm
22
 
23
- original_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS)
24
 
25
  # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
26
  leaderboard_df = original_df.copy()
 
27
  # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
28
 
29
 
@@ -38,19 +42,12 @@ def update_table(
38
  use_case_area_query: list,
39
  use_case_query: list,
40
  use_case_type_query: list,
41
- # type_query: list,
42
- # precision_query: str,
43
- # size_query: list,
44
- # show_deleted: bool,
45
- # query: str,
46
  ):
47
- # filtered_df = filter_models(hidden_df, type_query, size_query, precision_query, show_deleted)
48
- # filtered_df = filter_queries(query, filtered_df)
49
  filtered_df = filter_llm_func(hidden_df, llm_query)
50
  filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
51
  filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)
52
- filtered_df = filter_accuracy_threshold_func(filtered_df, accuracy_threshold_query)
53
-
54
  filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
55
  filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
56
  filtered_df = filter_use_case_func(filtered_df, use_case_query)
@@ -59,6 +56,32 @@ def update_table(
59
  return df
60
 
61
 
62
  def init_leaderboard_df(
63
  leaderboard_df: pd.DataFrame,
64
  columns: list,
@@ -72,7 +95,6 @@ def init_leaderboard_df(
72
  ):
73
 
74
  # Applying the style function
75
- # df = leaderboard_df[[c.name for c in fields(AutoEvalColumn) if c.never_hidden] + shown_columns.value]
76
  # return df.style.apply(highlight_cols, axis=None)
77
  return update_table(
78
  leaderboard_df,
@@ -87,13 +109,30 @@ def init_leaderboard_df(
87
  )
88
 
89
 
90
  def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
91
  return df[df["Accuracy Method"] == accuracy_method_query]
92
 
93
 
94
  def filter_accuracy_threshold_func(df: pd.DataFrame, accuracy_threshold_query: str) -> pd.DataFrame:
95
  accuracy_cols = ["Instruction Following", "Conciseness", "Completeness", "Accuracy"]
96
- return df[(df.loc[:, accuracy_cols] >= float(accuracy_threshold_query)).all(axis=1)]
97
 
98
 
99
  def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
@@ -130,45 +169,12 @@ def select_columns(df: pd.DataFrame, columns: list) -> pd.DataFrame:
130
  return filtered_df
131
 
132
 
133
- # def filter_queries(query: str, filtered_df: pd.DataFrame) -> pd.DataFrame:
134
- # final_df = []
135
- # if query != "":
136
- # queries = [q.strip() for q in query.split(";")]
137
- # for _q in queries:
138
- # _q = _q.strip()
139
- # if _q != "":
140
- # temp_filtered_df = search_table(filtered_df, _q)
141
- # if len(temp_filtered_df) > 0:
142
- # final_df.append(temp_filtered_df)
143
- # if len(final_df) > 0:
144
- # filtered_df = pd.concat(final_df)
145
- # filtered_df = filtered_df.drop_duplicates(
146
- # subset=[AutoEvalColumn.model.name, AutoEvalColumn.precision.name, AutoEvalColumn.revision.name]
147
- # )
148
-
149
- # return filtered_df
150
-
151
-
152
- # def filter_models(
153
- # df: pd.DataFrame, type_query: list, size_query: list, precision_query: list, show_deleted: bool
154
- # ) -> pd.DataFrame:
155
- # # Show all models
156
- # filtered_df = df
157
- # # if show_deleted:
158
- # # filtered_df = df
159
- # # else: # Show only still on the hub models
160
- # # filtered_df = df[df[AutoEvalColumn.still_on_hub.name] is True]
161
-
162
- # type_emoji = [t[0] for t in type_query]
163
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.model_type_symbol.name].isin(type_emoji)]
164
- # filtered_df = filtered_df.loc[df[AutoEvalColumn.precision.name].isin(precision_query + ["None"])]
165
-
166
- # numeric_interval = pd.IntervalIndex(sorted([NUMERIC_INTERVALS[s] for s in size_query]))
167
- # params_column = pd.to_numeric(df[AutoEvalColumn.params.name], errors="coerce")
168
- # mask = params_column.apply(lambda x: any(numeric_interval.contains(x)))
169
- # filtered_df = filtered_df.loc[mask]
170
-
171
- # return filtered_df
172
 
173
 
174
  demo = gr.Blocks(css=custom_css)
@@ -259,14 +265,14 @@ with demo:
259
  # multiselect=True,
260
  # interactive=True,
261
  # )
262
- with gr.Column():
263
- filter_metric_area = gr.CheckboxGroup(
264
- choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
265
- value=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
266
- label="Metric Area",
267
- info="",
268
- interactive=True,
269
- )
270
  with gr.Column():
271
  filter_accuracy_method = gr.Radio(
272
  choices=["Manual", "Auto"],
@@ -374,6 +380,87 @@ with demo:
374
  leaderboard_table,
375
  queue=True,
376
  )
377
 
378
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
379
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
 
11
  from src.display.css_html_js import custom_css
12
  from src.display.utils import ( # EVAL_TYPES,; WeightType,; BENCHMARK_COLS,; EVAL_COLS,; NUMERIC_INTERVALS,; ModelType,; Precision,
13
  COLS,
14
+ COST_COLS,
15
+ COST_TYPES,
16
  TYPES,
17
  AutoEvalColumn,
18
+ CostEvalColumn,
19
  fields,
20
  )
21
 
 
23
  from src.envs import CRM_RESULTS_PATH
24
  from src.populate import get_leaderboard_df_crm
25
 
26
+ original_df, cost_df = get_leaderboard_df_crm(CRM_RESULTS_PATH, COLS, COST_COLS)
27
 
28
  # raw_data, original_df = get_leaderboard_df(EVAL_RESULTS_PATH, EVAL_REQUESTS_PATH, COLS, BENCHMARK_COLS)
29
  leaderboard_df = original_df.copy()
30
+ leaderboard_cost_df = cost_df.copy()
31
  # leaderboard_df = leaderboard_df.style.format({"accuracy_metric_average": "{0:.2f}"})
32
 
33
 
 
42
  use_case_area_query: list,
43
  use_case_query: list,
44
  use_case_type_query: list,
45
  ):
46
  filtered_df = filter_llm_func(hidden_df, llm_query)
47
  filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
48
  filtered_df = filter_accuracy_method_func(filtered_df, accuracy_method_query)
49
+ filtered_df["Accuracy Threshold"] = filter_accuracy_threshold_func(filtered_df, accuracy_threshold_query)
50
+ filtered_df = filtered_df[filtered_df["Accuracy Threshold"]]
51
  filtered_df["Use Case Area"] = filtered_df["Use Case Name"].apply(lambda x: x.split(": ")[0])
52
  filtered_df = filter_use_case_area_func(filtered_df, use_case_area_query)
53
  filtered_df = filter_use_case_func(filtered_df, use_case_query)
 
56
  return df
57
 
58
 
59
+ def update_cost_table(
60
+ hidden_df: pd.DataFrame,
61
+ columns: list,
62
+ llm_query: list,
63
+ llm_provider_query: list,
64
+ use_case_type_query: list,
65
+ ):
66
+ filtered_df = filter_llm_func(hidden_df, llm_query)
67
+ filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
68
+ filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
69
+ df = select_columns_cost_table(filtered_df, columns)
70
+ return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
71
+
72
+
73
+ # def highlight_cols(x):
74
+ # df = x.copy()
75
+ # df.loc[:, :] = "color: black"
76
+ # df.loc[, ["Accuracy"]] = "background-color: #b3d5a4"
77
+ # return df
78
+
79
+
80
+ def highlight_cost_band_low(s, props=""):
81
+
82
+ return props if s == "Low" else None
83
+
84
+
85
  def init_leaderboard_df(
86
  leaderboard_df: pd.DataFrame,
87
  columns: list,
 
95
  ):
96
 
97
  # Applying the style function
 
98
  # return df.style.apply(highlight_cols, axis=None)
99
  return update_table(
100
  leaderboard_df,
 
109
  )
110
 
111
 
112
+ def init_leaderboard_cost_df(
113
+ leaderboard_df: pd.DataFrame,
114
+ columns: list,
115
+ llm_query: list,
116
+ llm_provider_query: list,
117
+ use_case_type_query: list,
118
+ ):
119
+
120
+ return update_cost_table(
121
+ leaderboard_df,
122
+ columns,
123
+ llm_query,
124
+ llm_provider_query,
125
+ use_case_type_query,
126
+ )
127
+
128
+
129
  def filter_accuracy_method_func(df: pd.DataFrame, accuracy_method_query: str) -> pd.DataFrame:
130
  return df[df["Accuracy Method"] == accuracy_method_query]
131
 
132
 
133
  def filter_accuracy_threshold_func(df: pd.DataFrame, accuracy_threshold_query: str) -> pd.DataFrame:
134
  accuracy_cols = ["Instruction Following", "Conciseness", "Completeness", "Accuracy"]
135
+ return (df.loc[:, accuracy_cols] >= float(accuracy_threshold_query)).all(axis=1)
136
 
137
 
138
  def filter_use_case_area_func(df: pd.DataFrame, use_case_area_query: list) -> pd.DataFrame:
 
169
  return filtered_df
170
 
171
 
172
+ def select_columns_cost_table(df: pd.DataFrame, columns: list) -> pd.DataFrame:
173
+ always_here_cols = [
174
+ CostEvalColumn.model.name,
175
+ ]
176
+ filtered_df = df[always_here_cols + [c for c in COST_COLS if c in df.columns and c in columns]]
177
+ return filtered_df
178
 
179
 
180
  demo = gr.Blocks(css=custom_css)
 
265
  # multiselect=True,
266
  # interactive=True,
267
  # )
268
+ # with gr.Column():
269
+ # filter_metric_area = gr.CheckboxGroup(
270
+ # choices=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
271
+ # value=["Accuracy", "Speed (Latency)", "Trust & Safety", "Cost"],
272
+ # label="Metric Area",
273
+ # info="",
274
+ # interactive=True,
275
+ # )
276
  with gr.Column():
277
  filter_accuracy_method = gr.Radio(
278
  choices=["Manual", "Auto"],
 
380
  leaderboard_table,
381
  queue=True,
382
  )
383
+ with gr.TabItem("🏅 Latency & Cost", elem_id="llm-benchmark-tab-table", id=1):
384
+ with gr.Row():
385
+ with gr.Column():
386
+ with gr.Row():
387
+ shown_columns = gr.CheckboxGroup(
388
+ choices=[c.name for c in fields(CostEvalColumn) if not c.hidden and not c.never_hidden],
389
+ value=[
390
+ c.name
391
+ for c in fields(CostEvalColumn)
392
+ if c.displayed_by_default and not c.hidden and not c.never_hidden
393
+ ],
394
+ label="Select columns to show",
395
+ elem_id="column-select",
396
+ interactive=True,
397
+ )
398
+ with gr.Row():
399
+ with gr.Column():
400
+ filter_llm = gr.CheckboxGroup(
401
+ choices=list(cost_df["Model Name"].unique()),
402
+ value=list(cost_df["Model Name"].unique()),
403
+ label="Model Name",
404
+ info="",
405
+ interactive=True,
406
+ )
407
+ with gr.Column():
408
+ filter_llm_provider = gr.CheckboxGroup(
409
+ choices=list(cost_df["LLM Provider"].unique()),
410
+ value=list(cost_df["LLM Provider"].unique()),
411
+ label="LLM Provider",
412
+ info="",
413
+ interactive=True,
414
+ )
415
+ with gr.Column():
416
+ filter_use_case_type = gr.CheckboxGroup(
417
+ choices=["Long", "Short"],
418
+ value=["Long", "Short"],
419
+ label="Use Case Type",
420
+ info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
421
+ interactive=True,
422
+ )
423
+
424
+ leaderboard_table = gr.components.Dataframe(
425
+ value=init_leaderboard_cost_df(
426
+ leaderboard_cost_df,
427
+ shown_columns.value,
428
+ filter_llm.value,
429
+ filter_llm_provider.value,
430
+ filter_use_case_type.value,
431
+ ),
432
+ headers=[c.name for c in fields(CostEvalColumn) if c.never_hidden] + shown_columns.value,
433
+ datatype=COST_TYPES,
434
+ elem_id="leaderboard-table",
435
+ interactive=False,
436
+ visible=True,
437
+ )
438
+
439
+ hidden_leaderboard_table_for_search = gr.components.Dataframe(
440
+ value=cost_df[COST_COLS],
441
+ headers=COST_COLS,
442
+ datatype=COST_TYPES,
443
+ visible=False,
444
+ )
445
+
446
+ for selector in [
447
+ shown_columns,
448
+ filter_llm,
449
+ filter_llm_provider,
450
+ filter_use_case_type,
451
+ ]:
452
+ selector.change(
453
+ update_cost_table,
454
+ [
455
+ hidden_leaderboard_table_for_search,
456
+ shown_columns,
457
+ filter_llm,
458
+ filter_llm_provider,
459
+ filter_use_case_type,
460
+ ],
461
+ leaderboard_table,
462
+ queue=True,
463
+ )
464
 
465
  with gr.TabItem("📝 About", elem_id="llm-benchmark-tab-table", id=2):
466
  gr.Markdown(LLM_BENCHMARKS_TEXT, elem_classes="markdown-text")
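
Note on the styling in the new cost tab above: update_cost_table returns a pandas Styler rather than a plain DataFrame, and the green highlight comes from Styler.map applying highlight_cost_band_low to every cell. Below is a minimal, self-contained sketch of that pattern; the toy frame is illustrative only (not leaderboard data), and Styler.map requires pandas ≥ 2.1 (older releases call it Styler.applymap).

# Minimal sketch of the element-wise styling pattern used by update_cost_table.
import pandas as pd

def highlight_cost_band_low(s, props=""):
    # Return a CSS declaration for cells equal to "Low"; None leaves the cell unstyled.
    return props if s == "Low" else None

toy = pd.DataFrame({"Model Name": ["A", "B"], "Cost Band": ["Low", "High"]})  # illustrative data
styled = toy.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
# `styled` is the kind of object the app hands back to the gr.Dataframe output;
# only the "Low" cell carries the green background.
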
crm-results/hf_leaderboard_latency_cost.csv CHANGED
@@ -1,37 +1,37 @@
1
- Model Name,Use Case Type (Long vs Short),Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band
2
- AI21 Jamba-Instruct,Long,AI21,4.0,232.9,1.6,High
3
- AI21 Jamba-Instruct,Short,AI21,4.0,243.9,0.5,High
4
- Claude 3 Haiku,Long,Bedrock,2.8,236.9,1.0,High
5
- Claude 3 Haiku,Short,Bedrock,2.2,245.4,0.4,High
6
- Claude 3 Opus,Long,Bedrock,12.2,242.7,61.1,High
7
- Claude 3 Opus,Short,Bedrock,8.4,243.2,25.4,High
8
- Cohere Command R+,Long,Bedrock,7.7,245.7,11.7,High
9
- Cohere Command R+,Short,Bedrock,7.1,249.9,5.1,High
10
- Cohere Command Text,Long,Bedrock,12.9,238.7,4.3,High
11
- Cohere Command Text,Short,Bedrock,9.6,245.6,1.1,High
12
- Gemini Pro 1.5,Long,Google,5.5,245.7,11.0,High
13
- Gemini Pro 1.5,Short,Google,5.4,247.5,3.3,High
14
- Gemini Pro 1,Long,Google,6.0,228.9,1.7,High
15
- Gemini Pro 1,Short,Google,4.4,247.4,0.6,High
16
- GPT 3.5 Turbo,Long,OpenAI,4.5,249.9,1.6,High
17
- GPT 3.5 Turbo,Short,OpenAI,4.2,238.3,0.6,High
18
- GPT 4 Turbo,Long,OpenAI,12.3,247.6,32.0,High
19
- GPT 4 Turbo,Short,OpenAI,12.3,250.0,11.7,High
20
- GPT4-o,Long,OpenAI,5.1,248.4,15.9,High
21
- GPT4-o,Short,OpenAI,5.0,250.0,5.8,High
22
- Mistral 7B,Long,Self-host (g5.48xlarge),8.83,242.0,16.5,High
23
- Mistral 7B,Short,Self-host (g5.48xlarge),8.31,247.0,15.5,High
24
- LLaMA 3 8B,Long,Self-host (g5.48xlarge),3.76,251.5,7.0,High
25
- LLaMA 3 8B,Short,Self-host (g5.48xlarge),3.23,243.6,6.0,High
26
- LLaMA 3 70B,Long,Self-host (p4d.24xlarge),20.1,243.9,67.7,High
27
- LLaMA 3 70B,Short,Self-host (p4d.24xlarge),29.4,251.2,99.0,High
28
- Mixtral 8x7B,Long,Self-host (p4d.24xlarge),2.44,248.5,8.22,High
29
- Mixtral 8x7B,Short,Self-host (p4d.24xlarge),2.41,250.0,8.11,High
30
- SF-TextBase 7B,Long,Self-host (g5.48xlarge),8.99,248.5,16.80,High
31
- SF-TextBase 7B,Short,Self-host (g5.48xlarge),8.29,248.7,15.50,High
32
- SF-TextBase 70B,Long,Self-host (p4de.24xlarge),6.52,253.7,28.17,High
33
- SF-TextBase 70B,Short,Self-host (p4de.24xlarge),6.24,249.7,26.96,High
34
- SF-TextSum,Long,Self-host (g5.48xlarge),8.85,244.0,16.55,High
35
- SF-TextSum,Short,Self-host (g5.48xlarge),8.34,250.4,15.60,High
36
- XGen 2,Long,Self-host (p4de.24xlarge),3.71,250.0,16.03,High
37
- XGen 2,Short,Self-host (p4de.24xlarge),2.64,250.0,11.40,High
 
1
+ Model Name,Use Case Type,Version,Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
2
+ AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
3
+ AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
4
+ Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
5
+ Claude 3 Haiku,Short,,Bedrock,2.2,245.4,0.4,Low,,Claude 3 Haiku,0.25,1.25,,,100%,61.11,,,,,
6
+ Claude 3 Opus,Long,,Bedrock,12.2,242.7,61.1,High,,Claude 3 Opus,15,75,,,,,,,,,
7
+ Claude 3 Opus,Short,,Bedrock,8.4,243.2,25.4,High,,AI21 Jamba-Instruct,0.5,0.7,,,,,,,,,
8
+ Cohere Command R+,Long,,Bedrock,7.7,245.7,11.7,High,,Cohere Command Text,1.5,2,,,,,,,,,
9
+ Cohere Command R+,Short,,Bedrock,7.1,249.9,5.1,Medium,,Cohere Command R+,3,15,,,,,,,,,
10
+ Cohere Command Text,Long,,Bedrock,12.9,238.7,4.3,Medium,,Gemini Pro 1,0.5,1.5,,,,,,,,,
11
+ Cohere Command Text,Short,,Bedrock,9.6,245.6,1.1,Low,,Gemini Pro 1.5,3.5,7,,,,,,,,,
12
+ Gemini Pro 1.5,Long,,Google,5.5,245.7,11.0,High,,,,,,,,,,,,,
13
+ Gemini Pro 1.5,Short,,Google,5.4,247.5,3.3,Medium,,,,,,,,,,,,,
14
+ Gemini Pro 1,Long,,Google,6.0,228.9,1.7,Medium,,,,,,,,,,,,,
15
+ Gemini Pro 1,Short,,Google,4.4,247.4,0.6,Low,,,,,,,,,,,,,
16
+ GPT 3.5 Turbo,Long,,OpenAI,4.5,249.9,1.6,Low,,,,,,,,,,,,,
17
+ GPT 3.5 Turbo,Short,,OpenAI,4.2,238.3,0.6,Low,,,,,,,,,,,,,
18
+ GPT 4 Turbo,Long,,OpenAI,12.3,247.6,32.0,High,,,,,,,,,,,,,
19
+ GPT 4 Turbo,Short,,OpenAI,12.3,250.0,11.7,High,,,,,,,,,,,,,
20
+ GPT4-o,Long,,OpenAI,5.1,248.4,15.9,High,,,,,,,,,,,,,
21
+ GPT4-o,Short,,OpenAI,5.0,250.0,5.8,Medium,,,,,,,,,,,,,
22
+ Mistral 7B,Long,Mistral-7B-Instruct-v0.2,Self-host (g5.48xlarge),8.83,242.0,16.5,High,,,,,,,,,,,,,
23
+ Mistral 7B,Short,Mistral-7B-Instruct-v0.2,Self-host (g5.48xlarge),8.31,247.0,15.5,High,,,,,,,,,,,,,
24
+ LLaMA 3 8B,Long,Meta-Llama-3-8B-Instruct,Self-host (g5.48xlarge),3.76,251.5,7.0,Medium,,,,,,,,,,,,,
25
+ LLaMA 3 8B,Short,Meta-Llama-3-8B-Instruct,Self-host (g5.48xlarge),3.23,243.6,6.0,Medium,,,,,,,,,,,,,
26
+ LLaMA 3 70B,Long,llama-3-70b-instruct,Self-host (p4d.24xlarge),20.1,243.9,67.7,High,,,,,,,,,,,,,
27
+ LLaMA 3 70B,Short,llama-3-70b-instruct,Self-host (p4d.24xlarge),29.4,251.2,99.0,High,,,,,,,,,,,,,
28
+ Mixtral 8x7B,Long,mixtral-8x7b-instruct,Self-host (p4d.24xlarge),2.44,248.5,8.22,Medium,,,,,,,,,,,,,
29
+ Mixtral 8x7B,Short,mixtral-8x7b-instruct,Self-host (p4d.24xlarge),2.41,250.0,8.11,Medium,,,,,,,,,,,,,
30
+ SF-TextBase 7B,Long,CRM-TextBase-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.99,248.5,16.80,High,,,,,,,,,,,,,
31
+ SF-TextBase 7B,Short,CRM-TextBase-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.29,248.7,15.50,High,,,,,,,,,,,,,
32
+ SF-TextBase 70B,Long,TextBase-70B-8K,Self-host (p4de.24xlarge),6.52,253.7,28.17,High,,,,,,,,,,,,,
33
+ SF-TextBase 70B,Short,TextBase-70B-8K,Self-host (p4de.24xlarge),6.24,249.7,26.96,High,,,,,,,,,,,,,
34
+ SF-TextSum,Long,CRM-TSUM-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.85,244.0,16.55,High,,,,,,,,,,,,,
35
+ SF-TextSum,Short,CRM-TSUM-7b-22k-g5 (endpoint),Self-host (g5.48xlarge),8.34,250.4,15.60,High,,,,,,,,,,,,,
36
+ XGen 22B,Long,EinsteinXgen2E4DSStreaming (endpoint),Self-host (p4de.24xlarge),3.71,250.0,16.03,High,not able to get response for large token requests (5K-token input),,,,,,,,,,,,
37
+ XGen 22B,Short,EinsteinXgen2E4DSStreaming (endpoint),Self-host (p4de.24xlarge),2.64,250.0,11.40,High,,,,,,,,,,,,,
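
The updated CSV replaces the uniform "High" Cost Band with Low/Medium/High labels and embeds, in scratch columns on the right, the cut points the bands appear to be based on (0% → 0.43, 33% → 1.61, 67% → 9.28, 100% → 61.11, i.e. terciles of Mean Cost per 1K Requests). The sketch below shows how such bands could be derived; the rule is inferred from those scratch columns rather than confirmed by the commit, and the hand-assigned labels may differ for values sitting right on a boundary (e.g. the 1.6 rows).

# Hypothetical derivation of the cost bands from the committed CSV (path assumes the repo root).
import pandas as pd

df = pd.read_csv("crm-results/hf_leaderboard_latency_cost.csv")
cost = df["Mean Cost per 1K Requests"]
bins = cost.quantile([0.0, 1 / 3, 2 / 3, 1.0]).tolist()  # roughly [0.43, 1.61, 9.28, 61.11]
df["Cost Band (derived)"] = pd.cut(
    cost, bins=bins, labels=["Low", "Medium", "High"], include_lowest=True
)
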
src/display/utils.py CHANGED
@@ -51,9 +51,26 @@ auto_eval_column_dict.append(
51
  auto_eval_column_dict.append(
52
  ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
53
  )
54
- # Speed (Latency) metrics
55
 
56
- # Cost metrics
57
 
58
  # Trust & Safety metrics
59
 
@@ -73,9 +90,6 @@ auto_eval_column_dict.append(
73
  # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
74
  # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
75
 
76
- # We use make dataclass to dynamically fill the scores from Tasks
77
- AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
78
-
79
 
80
  ## For the queue columns in the submission tab
81
  @dataclass(frozen=True)
@@ -156,6 +170,9 @@ TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
156
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
157
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
158
 
159
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
160
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
161
 
 
51
  auto_eval_column_dict.append(
52
  ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
53
  )
54
+ # We use make dataclass to dynamically fill the scores from Tasks
55
+ AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
56
+
57
 
58
+ # Speed (Latency) & Cost metrics
59
+ cost_eval_column_dict = []
60
+ # Init
61
+ cost_eval_column_dict.append(
62
+ ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
63
+ )
64
+ cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
65
+ cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
66
+ cost_eval_column_dict.append(
67
+ ["latency", ColumnContent, ColumnContent("Mean Latency (sec) per Request", "markdown", True)]
68
+ )
69
+ cost_eval_column_dict.append(
70
+ ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
71
+ )
72
+ cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
73
+ CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)
74
 
75
  # Trust & Safety metrics
76
 
 
90
  # auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
91
  # auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
92
 
93
 
94
  ## For the queue columns in the submission tab
95
  @dataclass(frozen=True)
 
170
  COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
171
  TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
172
 
173
+ COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
174
+ COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
175
+
176
  EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
177
  EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
178
 
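For context on the COST_COLS / COST_TYPES plumbing added above: the ColumnContent dataclass and the fields() helper live elsewhere in src/display/utils.py and are not shown in this diff. The sketch below reconstructs the pattern under the assumption that ColumnContent carries the attributes the code accesses (name, type, displayed_by_default, hidden, never_hidden) and that fields() returns the ColumnContent defaults stored on the dynamically built class; treat both definitions as assumptions, not the repository's exact code.

# Assumed shape of ColumnContent and fields(); only the attributes used in the diff are modeled.
from dataclasses import dataclass, make_dataclass

@dataclass(frozen=True)
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool = True
    hidden: bool = False
    never_hidden: bool = False

def fields(raw_class):
    # Returns the ColumnContent defaults attached to the generated dataclass,
    # which is why app.py can read c.hidden / c.displayed_by_default on them.
    return [v for k, v in raw_class.__dict__.items() if not k.startswith("__")]

cost_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)],
    ["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)],
]
CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)

COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]   # ['Model Name', 'Cost Band']
COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]  # ['markdown', 'markdown']
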
src/populate.py CHANGED
@@ -1,17 +1,19 @@
1
- import json
2
  import os
3
 
4
  import pandas as pd
5
 
6
- from src.display.formatting import has_no_nan_values, make_clickable_model
7
- from src.display.utils import AutoEvalColumn, EvalQueueColumn
8
- from src.leaderboard.read_evals import get_raw_eval_results
9
 
10
 
11
- def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
 
  """Creates a dataframe from all the individual experiment results"""
13
- leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
14
  sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
15
  leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
16
  # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
17
  # by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
@@ -19,55 +21,62 @@ def get_leaderboard_df_crm(crm_results_path: str, cols: list) -> pd.DataFrame:
19
  # print(leaderboard_accuracy_df)
20
  # print(leaderboard_accuracy_df.columns)
21
  # print(leaderboard_accuracy_df["Model Name"].nunique())
22
- leaderboard_accuracy_df = leaderboard_accuracy_df[cols].round(decimals=2)
23
- return leaderboard_accuracy_df
24
 
 
25
 
26
- def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
27
- """Creates a dataframe from all the individual experiment results"""
28
- raw_data = get_raw_eval_results(results_path, requests_path)
29
- all_data_json = [v.to_dict() for v in raw_data]
30
-
31
- df = pd.DataFrame.from_records(all_data_json)
32
- df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
33
- df = df[cols].round(decimals=2)
34
-
35
- # filter out if any of the benchmarks have not been produced
36
- df = df[has_no_nan_values(df, benchmark_cols)]
37
- return raw_data, df
38
-
39
-
40
- def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
41
- """Creates the different dataframes for the evaluation queues requestes"""
42
- entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
43
- all_evals = []
44
-
45
- for entry in entries:
46
- if ".json" in entry:
47
- file_path = os.path.join(save_path, entry)
48
- with open(file_path) as fp:
49
- data = json.load(fp)
50
-
51
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
52
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
53
-
54
- all_evals.append(data)
55
- elif ".md" not in entry:
56
- # this is a folder
57
- sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
58
- for sub_entry in sub_entries:
59
- file_path = os.path.join(save_path, entry, sub_entry)
60
- with open(file_path) as fp:
61
- data = json.load(fp)
62
-
63
- data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
64
- data[EvalQueueColumn.revision.name] = data.get("revision", "main")
65
- all_evals.append(data)
66
-
67
- pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
68
- running_list = [e for e in all_evals if e["status"] == "RUNNING"]
69
- finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
70
- df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
71
- df_running = pd.DataFrame.from_records(running_list, columns=cols)
72
- df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
73
- return df_finished[cols], df_running[cols], df_pending[cols]
 
1
  import os
2
 
3
  import pandas as pd
4
 
5
+ # from src.display.formatting import has_no_nan_values, make_clickable_model
6
+ # from src.display.utils import AutoEvalColumn, EvalQueueColumn
7
+ # from src.leaderboard.read_evals import get_raw_eval_results
8
 
9
 
10
+ def get_leaderboard_df_crm(
11
+ crm_results_path: str, accuracy_cols: list, cost_cols: list
12
+ ) -> tuple[pd.DataFrame, pd.DataFrame]:
13
  """Creates a dataframe from all the individual experiment results"""
 
14
  sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
15
+
16
+ leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
17
  leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
18
  # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
19
  # by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
 
21
  # print(leaderboard_accuracy_df)
22
  # print(leaderboard_accuracy_df.columns)
23
  # print(leaderboard_accuracy_df["Model Name"].nunique())
24
+ leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
 
25
 
26
+ ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
27
 
28
+ leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
29
+ leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
30
+ leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
31
+ leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
32
+ leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
33
+ return leaderboard_accuracy_df, leaderboard_cost_df
34
+
35
+
36
+ # def get_leaderboard_df(results_path: str, requests_path: str, cols: list, benchmark_cols: list) -> pd.DataFrame:
37
+ # """Creates a dataframe from all the individual experiment results"""
38
+ # raw_data = get_raw_eval_results(results_path, requests_path)
39
+ # all_data_json = [v.to_dict() for v in raw_data]
40
+
41
+ # df = pd.DataFrame.from_records(all_data_json)
42
+ # df = df.sort_values(by=[AutoEvalColumn.average.name], ascending=False)
43
+ # df = df[cols].round(decimals=2)
44
+
45
+ # # filter out if any of the benchmarks have not been produced
46
+ # df = df[has_no_nan_values(df, benchmark_cols)]
47
+ # return raw_data, df
48
+
49
+ # def get_evaluation_queue_df(save_path: str, cols: list) -> list[pd.DataFrame]:
50
+ # """Creates the different dataframes for the evaluation queues requestes"""
51
+ # entries = [entry for entry in os.listdir(save_path) if not entry.startswith(".")]
52
+ # all_evals = []
53
+
54
+ # for entry in entries:
55
+ # if ".json" in entry:
56
+ # file_path = os.path.join(save_path, entry)
57
+ # with open(file_path) as fp:
58
+ # data = json.load(fp)
59
+
60
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
61
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
62
+
63
+ # all_evals.append(data)
64
+ # elif ".md" not in entry:
65
+ # # this is a folder
66
+ # sub_entries = [e for e in os.listdir(f"{save_path}/{entry}") if not e.startswith(".")]
67
+ # for sub_entry in sub_entries:
68
+ # file_path = os.path.join(save_path, entry, sub_entry)
69
+ # with open(file_path) as fp:
70
+ # data = json.load(fp)
71
+
72
+ # data[EvalQueueColumn.model.name] = make_clickable_model(data["model"])
73
+ # data[EvalQueueColumn.revision.name] = data.get("revision", "main")
74
+ # all_evals.append(data)
75
+
76
+ # pending_list = [e for e in all_evals if e["status"] in ["PENDING", "RERUN"]]
77
+ # running_list = [e for e in all_evals if e["status"] == "RUNNING"]
78
+ # finished_list = [e for e in all_evals if e["status"].startswith("FINISHED") or e["status"] == "PENDING_NEW_EVAL"]
79
+ # df_pending = pd.DataFrame.from_records(pending_list, columns=cols)
80
+ # df_running = pd.DataFrame.from_records(running_list, columns=cols)
81
+ # df_finished = pd.DataFrame.from_records(finished_list, columns=cols)
82
+ # return df_finished[cols], df_running[cols], df_pending[cols]
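
One detail of the new get_leaderboard_df_crm worth calling out: hf_leaderboard_latency_cost.csv has no "LLM Provider" column, so the function borrows it from the accuracy table by joining on "Model Name" and falls back to "Google" for rows with no match. A toy illustration of that join-and-fill step, using made-up frames rather than the real leaderboard data:

import pandas as pd

# Made-up frames standing in for the accuracy and latency/cost CSVs.
accuracy = pd.DataFrame({"Model Name": ["GPT4-o", "Claude 3 Haiku"],
                         "LLM Provider": ["OpenAI", "Bedrock"]})
cost = pd.DataFrame({"Model Name": ["GPT4-o", "Gemini Pro 1.5"],
                     "Mean Cost per 1K Requests": [15.9, 11.0]})

ref = accuracy[["Model Name", "LLM Provider"]].drop_duplicates()
cost = cost.join(ref.set_index("Model Name"), on="Model Name")
cost["LLM Provider"] = cost["LLM Provider"].fillna("Google")
# Gemini Pro 1.5 has no accuracy row here, so it receives the "Google" fallback.
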