from dataclasses import dataclass, make_dataclass
import pandas as pd
def fields(raw_class):
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]
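# Illustrative note: for the classes generated below with make_dataclass, the
# ColumnContent defaults are stored as non-dunder class attributes, so fields()
# returns them in declaration order, e.g. (assuming AutoEvalColumn as defined
# further down):
#   [c.name for c in fields(AutoEvalColumn)]  ->  ["Model Name", "LLM Provider", ...]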
# These classes hold the user-facing column names, so that a rename only needs
# to happen here rather than everywhere the column is referenced in the code.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False
## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(
["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
auto_eval_column_dict.append(["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True)])
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
auto_eval_column_dict.append(["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False)])
# Accuracy metrics
auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
auto_eval_column_dict.append(
    [
        "accuracy_metric_instruction_following",
        ColumnContent,
        ColumnContent("Instruction Following", "markdown", True),
    ]
)
auto_eval_column_dict.append(
["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", True)]
)
auto_eval_column_dict.append(
["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", True)]
)
auto_eval_column_dict.append(
["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
)
# auto_eval_column_dict.append(
# ["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", False)]
# )
auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
auto_eval_column_dict.append(
["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
)
auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
# We use make_dataclass to build the column class dynamically from the entries above
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
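# Illustrative access pattern: each entry above becomes a class-level attribute on
# the generated frozen dataclass, holding its ColumnContent metadata, e.g.
#   AutoEvalColumn.model.name        -> "Model Name"
#   AutoEvalColumn.cost_band.hidden  -> False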
# Speed (Latency) & Cost metrics
cost_eval_column_dict = []
# Init
cost_eval_column_dict.append(
["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
cost_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
cost_eval_column_dict.append(
["use_case_flavor", ColumnContent, ColumnContent("Cost and Speed: Flavor", "markdown", True)]
)
cost_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
cost_eval_column_dict.append(
["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
)
cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True)
# Trust & Safety metrics
ts_eval_column_dict = []
# Init
ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)])
ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
ts_eval_column_dict.append(["crm_bias", ColumnContent, ColumnContent("CRM Bias", "markdown", False)])
# ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)])
TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True)
# Column selection
COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden]
TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden]
COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden]
COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden]
COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden]
TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden]
TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden]
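# Downstream usage (illustrative, assuming a Gradio table component): these lists
# are meant to feed the leaderboard table, roughly like
#   gr.Dataframe(value=df[COLS], headers=COLS, datatype=TYPES)
# with COLS_LITE / TYPES_LITE as the default visible subset.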
# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
NUMERIC_INTERVALS = {
"?": pd.Interval(-1, 0, closed="right"),
"~1.5": pd.Interval(0, 2, closed="right"),
"~3": pd.Interval(2, 4, closed="right"),
"~7": pd.Interval(4, 9, closed="right"),
"~13": pd.Interval(9, 20, closed="right"),
"~35": pd.Interval(20, 45, closed="right"),
"~60": pd.Interval(45, 70, closed="right"),
"70+": pd.Interval(70, 10000, closed="right"),
}
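# Minimal sketch of the assumed usage (the helper name is illustrative, not part
# of the original module): map a raw parameter count, in billions, to the size
# bucket used for filtering. pd.Interval supports `in` containment checks.
def _params_to_size_bucket(num_params_b: float) -> str:
    for label, interval in NUMERIC_INTERVALS.items():
        if num_params_b in interval:
            return label
    return "?"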