join cost table
Files changed:
- README.md (+1, -33)
- app.py (+9, -5)
- crm-results/hf_leaderboard_flavor_mapping.csv (+12, -0)
- crm-results/hf_leaderboard_latency_cost.csv (+1, -1)
- src/about.py (+0, -23)
- src/display/utils.py (+12, -101)
- src/populate.py (+18, -6)
README.md CHANGED
@@ -9,36 +9,4 @@ pinned: true
 license: apache-2.0
 ---
 
-#
-
-Most of the variables to change for a default leaderboard are in `src/env.py` (replace the path for your leaderboard) and `src/about.py` (for tasks).
-
-Results files should have the following format and be stored as json files:
-```json
-{
-    "config": {
-        "model_dtype": "torch.float16", # or torch.bfloat16 or 8bit or 4bit
-        "model_name": "path of the model on the hub: org/model",
-        "model_sha": "revision on the hub",
-    },
-    "results": {
-        "task_name": {
-            "metric_name": score,
-        },
-        "task_name2": {
-            "metric_name": score,
-        }
-    }
-}
-```
-
-Request files are created automatically by this tool.
-
-If you encounter problem on the space, don't hesitate to restart it to remove the create eval-queue, eval-queue-bk, eval-results and eval-results-bk created folder.
-
-# Code logic for more complex edits
-
-You'll find
-- the main table' columns names and properties in `src/display/utils.py`
-- the logic to read all results and request files, then convert them in dataframe lines, in `src/leaderboard/read_evals.py`, and `src/populate.py`
-- teh logic to allow or filter submissions in `src/submission/submit.py` and `src/submission/check_validity.py`
+# Generative AI Leaderboard for CRM
app.py CHANGED
@@ -57,7 +57,7 @@ def update_table(
     filtered_df = filter_use_case_func(filtered_df, use_case_query)
     filtered_df = filter_use_case_type_func(filtered_df, use_case_type_query)
     df = select_columns(filtered_df, columns)
-    return df
+    return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
 
 
 def update_cost_table(
@@ -65,11 +65,11 @@ def update_cost_table(
     columns: list,
     llm_query: list,
     llm_provider_query: list,
-
+    use_case_flavor_query: list,
 ):
     filtered_df = filter_llm_func(hidden_df, llm_query)
     filtered_df = filter_llm_provider_func(filtered_df, llm_provider_query)
-    filtered_df = 
+    filtered_df = filter_use_case_flavor_func(filtered_df, use_case_flavor_query)
     df = select_columns_cost_table(filtered_df, columns)
     return df.style.map(highlight_cost_band_low, props="background-color: #b3d5a4")
@@ -183,6 +183,10 @@ def filter_use_case_type_func(df: pd.DataFrame, use_case_type_query: list) -> pd
     return df[df["Use Case Type"].isin(use_case_type_query)]
 
 
+def filter_use_case_flavor_func(df: pd.DataFrame, use_case_flavor_query: list) -> pd.DataFrame:
+    return df[df["Cost and Speed: Flavor"].isin(use_case_flavor_query)]
+
+
 def filter_llm_func(df: pd.DataFrame, llm_query: list) -> pd.DataFrame:
     return df[df["Model Name"].isin(llm_query)]
 
@@ -222,7 +226,7 @@ with demo:
     gr.Markdown(INTRODUCTION_TEXT, elem_classes="markdown-text")
 
     with gr.Tabs(elem_classes="tab-buttons") as tabs:
-        with gr.TabItem("🏅 
+        with gr.TabItem("🏅 Leaderboard", elem_id="llm-benchmark-tab-table", id=0):
            with gr.Row():
                with gr.Column():
                    # with gr.Row():
@@ -455,7 +459,7 @@ with demo:
                    filter_use_case_type = gr.CheckboxGroup(
                        choices=["Long", "Short"],
                        value=["Long", "Short"],
-                        label="Use Case 
+                        label="Use Case Flavor",
                        info="Output: 250 tokens, Long input: 3k tokens, Short input: 500 tokens",
                        interactive=True,
                    )
crm-results/hf_leaderboard_flavor_mapping.csv ADDED
@@ -0,0 +1,12 @@
+Use Case Name,Use Case Type,Cost and Speed: Flavor
+Service: Conversation summary,Summary,Short
+Service: Reply Recommendations,Generation,Short
+Sales: Email Generation,Generation,Short
+Sales & Service: Update CRM Info,Generation,Long
+Service: Call Summary,Summary,Long
+Sales: Call Summary,Summary,Long
+Service: Live Chat Insights,Summary,Short
+Service: Live Chat Summary,Summary,Long
+Service: Email Summary,Summary,Long
+Service: Knowledge creation from Case Info,Generation,Long
+Sales: Email Summary,Summary,Long
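For reference, a short sketch (file path assumed relative to the Space root) of using this new mapping as a use-case-to-flavor lookup, which is what the populate.py change further below relies on:

```python
import pandas as pd

# Index the mapping by use case so flavors can be looked up by name.
flavor_map = pd.read_csv("crm-results/hf_leaderboard_flavor_mapping.csv").set_index("Use Case Name")

# e.g. "Service: Call Summary" -> "Long", "Sales: Email Generation" -> "Short"
print(flavor_map.loc["Service: Call Summary", "Cost and Speed: Flavor"])
print(flavor_map.loc["Sales: Email Generation", "Cost and Speed: Flavor"])
```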
crm-results/hf_leaderboard_latency_cost.csv CHANGED
@@ -1,4 +1,4 @@
-Model Name,
+Model Name,Cost and Speed: Flavor,Version,Platform,Response Time (Sec),Mean Output Tokens,Mean Cost per 1K Requests,Cost Band,,Model id,Cost per 1m input tokens,Cost per 1m output tokens,,,,Percentile,From,To,,min,Max
 AI21 Jamba-Instruct,Long,,AI21,4.0,232.9,1.6,Medium,,GPT 3.5 Turbo,0.5,1.5,,,0%,0.43,0.43,1.61,,0.43,61.11
 AI21 Jamba-Instruct,Short,,AI21,4.0,243.9,0.5,Low,,GPT 4 Turbo,10,30,,,33%,1.61,1.61,9.28,,,
 Claude 3 Haiku,Long,,Bedrock,2.8,236.9,1.0,Low,,GPT4-o,5,15,,,67%,9.28,9.28,61.11,,,
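With the header row now naming every column (including the new "Cost and Speed: Flavor"), downstream code can select fields by name rather than position. A hedged sketch follows; the hand-picked column list is illustrative, since the app derives its actual subset from `CostEvalColumn` in src/display/utils.py.

```python
import pandas as pd

cost_df = pd.read_csv("crm-results/hf_leaderboard_latency_cost.csv")

# Illustrative subset; the leaderboard's real selection comes from CostEvalColumn.
cols = [
    "Model Name",
    "Cost and Speed: Flavor",
    "Platform",
    "Response Time (Sec)",
    "Mean Output Tokens",
    "Mean Cost per 1K Requests",
    "Cost Band",
]
print(cost_df[cols].head())
```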
src/about.py CHANGED
@@ -1,26 +1,3 @@
-from dataclasses import dataclass
-from enum import Enum
-
-
-@dataclass
-class Task:
-    benchmark: str
-    metric: str
-    col_name: str
-
-
-# Select your tasks here
-# ---------------------------------------------------
-class Tasks(Enum):
-    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
-    task0 = Task("anli_r1", "acc", "ANLI")
-    task1 = Task("logiqa", "acc_norm", "LogiQA")
-
-
-NUM_FEWSHOT = 0  # Change with your few shot
-# ---------------------------------------------------
-
-
 # Your leaderboard name
 TITLE = """<h1 align="center" id="space-title">Generative AI Leaderboard for CRM</h1>
 <h3>Assess which LLMs are accurate enough or need fine-tuning, and weigh this versus tradeoffs of speed, costs, and trust and safety. This is based on human manual and automated evaluation with real operational CRM data per use case.</h3>
src/display/utils.py CHANGED
@@ -1,10 +1,7 @@
 from dataclasses import dataclass, make_dataclass
-from enum import Enum
 
 import pandas as pd
 
-from src.about import Tasks
-
 
 def fields(raw_class):
     return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
@@ -29,9 +26,8 @@ auto_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", 
+auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
 auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
-
 auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
 # Accuracy metrics
 auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
@@ -51,6 +47,14 @@ auto_eval_column_dict.append(
 auto_eval_column_dict.append(
     ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
 )
+# auto_eval_column_dict.append(
+#     ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
+# )
+auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
+auto_eval_column_dict.append(
+    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
+)
+auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
 # We use make dataclass to dynamically fill the scores from Tasks
 AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
 
@@ -62,10 +66,10 @@ cost_eval_column_dict.append(
     ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
 )
 cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
-cost_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", True)])
 cost_eval_column_dict.append(
-    ["
+    ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)]
 )
+cost_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
 cost_eval_column_dict.append(
     ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
 )
@@ -85,96 +89,6 @@ ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias",
 # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
 TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
 
-
-# Scores
-# auto_eval_column_dict.append(["average", ColumnContent, ColumnContent("Average ⬆️", "number", True)])
-# for task in Tasks:
-#     auto_eval_column_dict.append([task.name, ColumnContent, ColumnContent(task.value.col_name, "number", True)])
-# Model information
-# auto_eval_column_dict.append(["model_type", ColumnContent, ColumnContent("Type", "str", False)])
-# auto_eval_column_dict.append(["architecture", ColumnContent, ColumnContent("Architecture", "str", False)])
-# auto_eval_column_dict.append(["weight_type", ColumnContent, ColumnContent("Weight type", "str", False, True)])
-# auto_eval_column_dict.append(["precision", ColumnContent, ColumnContent("Precision", "str", False)])
-# auto_eval_column_dict.append(["license", ColumnContent, ColumnContent("Hub License", "str", False)])
-# auto_eval_column_dict.append(["params", ColumnContent, ColumnContent("#Params (B)", "number", False)])
-# auto_eval_column_dict.append(["likes", ColumnContent, ColumnContent("Hub ❤️", "number", False)])
-# auto_eval_column_dict.append(["still_on_hub", ColumnContent, ColumnContent("Available on the hub", "bool", False)])
-# auto_eval_column_dict.append(["revision", ColumnContent, ColumnContent("Model sha", "str", False, False)])
-
-
-## For the queue columns in the submission tab
-@dataclass(frozen=True)
-class EvalQueueColumn:  # Queue column
-    model = ColumnContent("model", "markdown", True)
-    revision = ColumnContent("revision", "str", True)
-    private = ColumnContent("private", "bool", True)
-    precision = ColumnContent("precision", "str", True)
-    weight_type = ColumnContent("weight_type", "str", "Original")
-    status = ColumnContent("status", "str", True)
-
-
-## All the model information that we might need
-@dataclass
-class ModelDetails:
-    name: str
-    display_name: str = ""
-    symbol: str = ""  # emoji
-
-
-class ModelType(Enum):
-    PT = ModelDetails(name="pretrained", symbol="🟢")
-    FT = ModelDetails(name="fine-tuned", symbol="🔶")
-    IFT = ModelDetails(name="instruction-tuned", symbol="⭕")
-    RL = ModelDetails(name="RL-tuned", symbol="🟦")
-    Unknown = ModelDetails(name="", symbol="?")
-
-    def to_str(self, separator=" "):
-        return f"{self.value.symbol}{separator}{self.value.name}"
-
-    @staticmethod
-    def from_str(type):
-        if "fine-tuned" in type or "🔶" in type:
-            return ModelType.FT
-        if "pretrained" in type or "🟢" in type:
-            return ModelType.PT
-        if "RL-tuned" in type or "🟦" in type:
-            return ModelType.RL
-        if "instruction-tuned" in type or "⭕" in type:
-            return ModelType.IFT
-        return ModelType.Unknown
-
-
-class WeightType(Enum):
-    Adapter = ModelDetails("Adapter")
-    Original = ModelDetails("Original")
-    Delta = ModelDetails("Delta")
-
-
-class Precision(Enum):
-    float16 = ModelDetails("float16")
-    bfloat16 = ModelDetails("bfloat16")
-    float32 = ModelDetails("float32")
-    # qt_8bit = ModelDetails("8bit")
-    # qt_4bit = ModelDetails("4bit")
-    # qt_GPTQ = ModelDetails("GPTQ")
-    Unknown = ModelDetails("?")
-
-    def from_str(precision):
-        if precision in ["torch.float16", "float16"]:
-            return Precision.float16
-        if precision in ["torch.bfloat16", "bfloat16"]:
-            return Precision.bfloat16
-        if precision in ["float32"]:
-            return Precision.float32
-        # if precision in ["8bit"]:
-        #     return Precision.qt_8bit
-        # if precision in ["4bit"]:
-        #     return Precision.qt_4bit
-        # if precision in ["GPTQ", "None"]:
-        #     return Precision.qt_GPTQ
-        return Precision.Unknown
-
-
 # Column selection
 COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
 TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
@@ -187,10 +101,7 @@ COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
 TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
 TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
 
-
-EVAL_TYPES = [c.type for c in fields(EvalQueueColumn)]
-
-BENCHMARK_COLS = [t.value.col_name for t in Tasks]
+# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
 
 NUMERIC_INTERVALS = {
     "?": pd.Interval(-1, 0, closed="right"),
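The column lists above are built by appending `ColumnContent` entries and materializing them with `make_dataclass`. Below is a self-contained sketch of that pattern; `ColumnContent` and `fields` are re-declared here purely for illustration (their real definitions sit earlier in src/display/utils.py and may differ in detail, e.g. this sketch freezes `ColumnContent` so the instances are valid dataclass defaults).

```python
from dataclasses import dataclass, make_dataclass


@dataclass(frozen=True)
class ColumnContent:
    # Re-declared for illustration only.
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


def fields(raw_class):
    # Same helper as in src/display/utils.py: collect the ColumnContent defaults.
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


cost_eval_column_dict = [
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)],
    ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)],
    ["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)],
    ["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)],
]
CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)

COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
print(COST_COLS)  # ['Model Name', 'Cost and Speed: Flavor', 'Response Time (Sec)', 'Cost Band']
```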
src/populate.py CHANGED
@@ -2,26 +2,35 @@ import os
 
 import pandas as pd
 
+from src.display.utils import AutoEvalColumn
+
 
 def get_leaderboard_df_crm(
     crm_results_path: str, accuracy_cols: list, cost_cols: list
 ) -> tuple[pd.DataFrame, pd.DataFrame]:
     """Creates a dataframe from all the individual experiment results"""
+    use_case_flavor_mapping_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_flavor_mapping.csv"))
     sf_finetuned_models = ["SF-TextBase 70B", "SF-TextBase 7B", "SF-TextSum"]
+    # sf_finetuned_models = []
 
     leaderboard_accuracy_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_accuracy.csv"))
     leaderboard_accuracy_df = leaderboard_accuracy_df[~leaderboard_accuracy_df["Model Name"].isin(sf_finetuned_models)]
-
-
-
-
+
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        use_case_flavor_mapping_df[["Use Case Name", "Cost and Speed: Flavor"]].set_index("Use Case Name"),
+        on="Use Case Name",
+    )
 
     ref_df = leaderboard_accuracy_df[["Model Name", "LLM Provider"]].drop_duplicates()
 
     leaderboard_cost_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_latency_cost.csv"))
     leaderboard_cost_df = leaderboard_cost_df[~leaderboard_cost_df["Model Name"].isin(sf_finetuned_models)]
+    leaderboard_accuracy_df = leaderboard_accuracy_df.join(
+        leaderboard_cost_df.set_index(["Model Name", "Cost and Speed: Flavor"]),
+        on=["Model Name", "Cost and Speed: Flavor"],
+    )
+
     leaderboard_cost_df = leaderboard_cost_df.join(ref_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_cost_df["LLM Provider"] = leaderboard_cost_df["LLM Provider"].fillna("Google")
     leaderboard_cost_df = leaderboard_cost_df[cost_cols].round(decimals=2)
 
     leaderboard_ts_df = pd.read_csv(os.path.join(crm_results_path, "hf_leaderboard_ts.csv"))
@@ -29,7 +38,6 @@ def get_leaderboard_df_crm(
     leaderboard_ts_df = leaderboard_ts_df[~leaderboard_ts_df["Model Name"].isin(sf_finetuned_models)]
     leaderboard_ts_df = leaderboard_ts_df.join(ref_df.set_index("Model Name"), on="Model Name")
     leaderboard_ts_df = leaderboard_ts_df.join(leaderboard_ts__crm_bias_df.set_index("Model Name"), on="Model Name")
-    # leaderboard_ts_df["LLM Provider"] = leaderboard_ts_df["LLM Provider"].fillna("Google")
     privacy_cols = leaderboard_ts_df[
         [
             "Privacy Zero-Shot Match Avoidance",
@@ -52,4 +60,8 @@ def get_leaderboard_df_crm(
     ].apply(lambda x: x.str.rstrip("%").astype("float") / 100.0, axis=1)
     leaderboard_ts_df["Trust & Safety"] = ts_cols.mean(axis=1).transform(lambda x: "{:,.2%}".format(x))
 
+    leaderboard_accuracy_df = leaderboard_accuracy_df.sort_values(
+        by=[AutoEvalColumn.accuracy_metric_average.name], ascending=False
+    )
+    leaderboard_accuracy_df = leaderboard_accuracy_df[accuracy_cols].round(decimals=2)
     return leaderboard_accuracy_df, leaderboard_cost_df, leaderboard_ts_df
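A hypothetical call site for the updated function follows (the real one lives in app.py, which is not part of this commit); the column-list names are assumptions based on src/display/utils.py, and note that the function returns three frames even though its annotation still says two.

```python
from src.display.utils import COLS, COST_COLS  # COST_COLS assumed to mirror COLS for CostEvalColumn
from src.populate import get_leaderboard_df_crm

# After this commit, the accuracy frame also carries the joined flavor and cost columns.
accuracy_df, cost_df, ts_df = get_leaderboard_df_crm(
    crm_results_path="crm-results",
    accuracy_cols=COLS,
    cost_cols=COST_COLS,
)
print(accuracy_df.head())
print(cost_df.head())
```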