yibum committed
Commit 84ee137
Parent: 4c0cc56

join cost table

README.md CHANGED
@@ -9,36 +9,4 @@ pinned: true
 license: apache-2.0
 ---
 
-# Start the configuration
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as JSON files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter a problem on the Space, don't hesitate to restart it to remove the created eval-queue, eval-queue-bk, eval-results, and eval-results-bk folders.
-
-# Code logic for more complex edits
-
-You'll find:
-- the main table's column names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them into dataframe lines, in `src/leaderboard/read_evals.py` and `src/populate.py`
-- the logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# Generative AI Leaderboard for CRM
app.py CHANGED
@@ -57,7 +57,7 @@ def update_table(
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
-    return df
+    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
 def update_cost_table(
@@ -65,11 +65,11 @@ def update_cost_table(
     columns: list,
     llm_query: list,
     llm_provider_query: list,
-    use_case_type_query: list,
+    use_case_flavor_query: list,
 ):
     filtered_df = filter_llm_func(hidden_df, llm_query)
     filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
+    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
     df = select_columns_cost_table(filtered_df, columns)
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
@@ -183,6 +183,10 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
+def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
+    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
+
+
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -222,7 +226,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 LLM Benchmark", elem_id="llm-benchmark-tab-table", id=0):
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    # with gr.Row():
@@ -455,7 +459,7 @@ with demo:
                    filter_use_case_type = gr.CheckboxGroup(
                        choices=["Long", "Short"],
                        value=["Long", "Short"],
-                        label="Use Case Type",
+                        label="Use Case Flavor",
                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
                        interactive=True,
                    )
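Both `update_table` and `update_cost_table` now return a pandas `Styler` so that "Low" cost-band cells are shaded green. The `highlight_cost_band_low` helper lives outside the hunks shown above, so the following is only a minimal sketch of what such a function usually looks like, assuming pandas ≥ 2.1, where `Styler.map` forwards extra keyword arguments (here `props`) to the per-cell function:

```python
import pandas as pd


def highlight_cost_band_low(value, props=""):
    # Called once per cell by Styler.map; `props` arrives via the .map(..., props=...) kwarg.
    # Return the CSS string for "Low" cells and None to leave every other cell unstyled.
    return props if value == "Low" else None


# Usage sketch on a toy frame mirroring the cost table's "Cost Band" column.
toy = pd.DataFrame({"Model Name": ["A", "B"], "Cost Band": ["Low", "Medium"]})
styled = toy.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
```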
crm-results/hf_leaderboard_flavor_mapping.csv ADDED
@@ -0,0 +1,12 @@
+Use Case Name,Use Case Type,Cost and Speed: Flavor
+Service: Conversation summary,Summary,Short
+Service: Reply Recommendations,Generation,Short
+Sales: Email Generation,Generation,Short
+Sales & Service: Update CRM Info,Generation,Long
+Service: Call Summary,Summary,Long
+Sales: Call Summary,Summary,Long
+Service: Live Chat Insights,Summary,Short
+Service: Live Chat Summary,Summary,Long
+Service: Email Summary,Summary,Long
+Service: Knowledge creation from Case Info,Generation,Long
+Sales: Email Summary,Summary,Long
crm-results/hf_leaderboard_latency_cost.csv CHANGED
@@ -1,4 +1,4 @@
-Model Name,Use Case Type,Version,Platform,Mean Latency (sec) per Request,Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
+Model Name,Cost and Speed: Flavor,Version,Platform,Response Time (Sec),Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
 AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
 AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
 Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
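The new mapping file assigns each use case a "Cost and Speed: Flavor", and the renamed flavor column in `hf_leaderboard_latency_cost.csv` provides the matching key on the cost side. A toy sketch of how the two keys chain together (illustrative frames only; the actual join is done in `src/populate.py` below):

```python
import pandas as pd

# Illustrative stand-ins for the CSVs; column names are taken from the files above.
accuracy = pd.DataFrame(
    {"Model Name": ["Claude 3 Haiku"], "Use Case Name": ["Service: Call Summary"]}
)
flavor_map = pd.DataFrame(
    {"Use Case Name": ["Service: Call Summary"], "Cost and Speed: Flavor": ["Long"]}
)
cost = pd.DataFrame(
    {"Model Name": ["Claude 3 Haiku"], "Cost and Speed: Flavor": ["Long"], "Cost Band": ["Low"]}
)

# Use case -> flavor, then (model, flavor) -> cost row, so every accuracy row picks up its cost band.
joined = accuracy.merge(flavor_map, on="Use Case Name").merge(
    cost, on=["Model Name", "Cost and Speed: Flavor"]
)
print(joined[["Model Name", "Use Case Name", "Cost Band"]])
```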
src/about.py CHANGED
@@ -1,26 +1,3 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-NUM_FEWSHOT = 0 # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Generative AI Leaderboard for CRM</h1>
 <h3>Assess which LLMs are accurate enough or need fine-tuning, and weigh this versus tradeoffs of speed, costs, and trust and safety. This is based on human manual and automated evaluation with real operational CRM data per use case.</h3>
src/display/utils.py CHANGED
@@ -1,10 +1,7 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
-
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -29,9 +26,8 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", False)])
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
-
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
 auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
@@ -51,6 +47,14 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(
     ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
+# auto_eval_column_dict.append(
+#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
+# )
+auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
+auto_eval_column_dict.append(
+    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
+)
+auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -62,10 +66,10 @@ cost_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
 cost_eval_column_dict.append(
-    ["latency", ColumnContent, ColumnContent("Mean Latency (sec) per Request", "markdown", True)]
+    ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)]
 )
+cost_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 cost_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
@@ -85,96 +89,6 @@ ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias",
 # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
 TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
-
-# Scores
-# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    # qt_8bit = ModelDetails("8bit")
-    # qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        # if precision in ["8bit"]:
-        #     return Precision.qt_8bit
-        # if precision in ["4bit"]:
-        #     return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #     return Precision.qt_GPTQ
-        return Precision.Unknown
-
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -187,10 +101,7 @@ COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
 TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
 
-EVAL_COLS = [c.name for c in fields(EvalQueueColumn)]
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
src/populate.py CHANGED
@@ -2,26 +2,35 @@ import os
 
 import pandas as pd
 
+from src.display.utils import AutoEvalColumn
+
 
 def get_leaderboard_df_crm(
     crm_results_path: str, accuracy_cols: list, cost_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
+    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    # sf_finetuned_models = []
 
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
-    # leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
-    #     by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
-    # )
-    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
+        on="Use Case Name",
+    )
 
     ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
 
     leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
     leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
+        on=["Model Name", "Cost and Speed: Flavor"],
+    )
+
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
 
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
@@ -29,7 +38,6 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
     leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_ts_df["LLM Provider"] = leaderboard_ts_df["LLM Provider"].fillna("Google")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -52,4 +60,8 @@ def get_leaderboard_df_crm(
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
     leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
 
+    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
+        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
+    )
+    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
     return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df