from dataclasses import dataclass, make_dataclass

import pandas as pd


def fields(raw_class):
    """Return the ColumnContent defaults declared on a column class, skipping dunder attributes."""
    return [v for k, v in raw_class.__dict__.items() if k[:2] != "__" and k[-2:] != "__"]


# These classes hold the user-facing column names, so that a change only has
# to be made here rather than everywhere the columns are used in the code.
@dataclass
class ColumnContent:
    name: str
    type: str
    displayed_by_default: bool
    hidden: bool = False
    never_hidden: bool = False


## Leaderboard columns
auto_eval_column_dict = []
# Init
auto_eval_column_dict.append(
    ["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)])
auto_eval_column_dict.append(
    ["use_case_name", ColumnContent, ColumnContent("Use Case Name", "markdown", True, never_hidden=True)]
)
auto_eval_column_dict.append(["use_case_type", ColumnContent, ColumnContent("Use Case Type", "markdown", False)])
auto_eval_column_dict.append(
    ["accuracy_method", ColumnContent, ColumnContent("Accuracy Method", "markdown", False, never_hidden=True)]
)
# Accuracy metrics
auto_eval_column_dict.append(["accuracy_metric_average", ColumnContent, ColumnContent("Accuracy", "markdown", True)])
auto_eval_column_dict.append(
    [
        "accuracy_metric_instruction_following",
        ColumnContent,
        ColumnContent("Instruction Following", "markdown", True),
    ]
)
auto_eval_column_dict.append(
    ["accuracy_metric_completeness", ColumnContent, ColumnContent("Completeness", "markdown", True)]
)
auto_eval_column_dict.append(
    ["accuracy_metric_conciseness", ColumnContent, ColumnContent("Conciseness", "markdown", True)]
)
auto_eval_column_dict.append(
    ["accuracy_metric_factuality", ColumnContent, ColumnContent("Factuality", "markdown", True)]
)
# Speed (Latency) & Cost metrics
auto_eval_column_dict.append(["latency", ColumnContent, ColumnContent("Response Time (Sec)", "markdown", True)])
auto_eval_column_dict.append(
    ["mean_output_tokens", ColumnContent, ColumnContent("Mean Output Tokens", "markdown", True)]
)
auto_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)])
# Trust & Safety metrics
auto_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)])
auto_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)])
auto_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)])
auto_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)])
auto_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)])

# We use make_dataclass to dynamically fill the scores from Tasks
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict, frozen=True)
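# Illustrative note (not part of the original module): make_dataclass attaches
# each ColumnContent default to the generated class, so column metadata can be
# read straight off the class without instantiating it, e.g.:
#
#     AutoEvalColumn.model.name                      # -> "Model Name"
#     AutoEvalColumn.model.never_hidden              # -> True
#     AutoEvalColumn.use_case_type.displayed_by_default  # -> False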
cost_eval_column_dict.append(["cost_band", ColumnContent, ColumnContent("Cost Band", "markdown", True)]) CostEvalColumn = make_dataclass("CostEvalColumn", cost_eval_column_dict, frozen=True) # Trust & Safety metrics ts_eval_column_dict = [] # Init ts_eval_column_dict.append(["model", ColumnContent, ColumnContent("Model Name", "markdown", True, never_hidden=True)]) # ts_eval_column_dict.append(["model_provider", ColumnContent, ColumnContent("LLM Provider", "markdown", True)]) ts_eval_column_dict.append(["ts", ColumnContent, ColumnContent("Trust & Safety", "markdown", True)]) ts_eval_column_dict.append(["safety", ColumnContent, ColumnContent("Safety", "markdown", False)]) ts_eval_column_dict.append(["privacy", ColumnContent, ColumnContent("Privacy", "markdown", False)]) ts_eval_column_dict.append(["truthfulness", ColumnContent, ColumnContent("Truthfulness", "markdown", False)]) ts_eval_column_dict.append(["crm_fairness", ColumnContent, ColumnContent("CRM Fairness", "markdown", False)]) # ts_eval_column_dict.append(["bias_no_ci", ColumnContent, ColumnContent("Bias No CI", "markdown", True)]) TSEvalColumn = make_dataclass("TSEvalColumn", ts_eval_column_dict, frozen=True) # Column selection COLS = [c.name for c in fields(AutoEvalColumn) if not c.hidden] TYPES = [c.type for c in fields(AutoEvalColumn) if not c.hidden] COLS_LITE = [c.name for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] TYPES_LITE = [c.type for c in fields(AutoEvalColumn) if c.displayed_by_default and not c.hidden] COST_COLS = [c.name for c in fields(CostEvalColumn) if not c.hidden] COST_TYPES = [c.type for c in fields(CostEvalColumn) if not c.hidden] TS_COLS = [c.name for c in fields(TSEvalColumn) if not c.hidden] TS_TYPES = [c.type for c in fields(TSEvalColumn) if not c.hidden] # BENCHMARK_COLS = [t.value.col_name for t in Tasks] NUMERIC_INTERVALS = { "?": pd.Interval(-1, 0, closed="right"), "~1.5": pd.Interval(0, 2, closed="right"), "~3": pd.Interval(2, 4, closed="right"), "~7": pd.Interval(4, 9, closed="right"), "~13": pd.Interval(9, 20, closed="right"), "~35": pd.Interval(20, 45, closed="right"), "~60": pd.Interval(45, 70, closed="right"), "70+": pd.Interval(70, 10000, closed="right"), }