Kung-Hsiang Huang
update initial lb results
a708f96
raw
history blame
2.92 kB
from dataclasses import dataclass, make_dataclass
import pandas as pd
def fields(raw_class):
    """Return the values of the non-dunder attributes set on ``raw_class``.

    Only entries stored directly in ``raw_class.__dict__`` are considered
    (inherited attributes are not looked up). An entry is skipped when its
    name starts with ``"__"`` or ends with ``"__"``; the remaining values are
    returned as a list, in the order the mapping yields them.
    """
    kept_values = []
    for attr_name, attr_value in raw_class.__dict__.items():
        # Drop anything that looks like a dunder on either side.
        if attr_name.startswith("__") or attr_name.endswith("__"):
            continue
        kept_values.append(attr_value)
    return kept_values
# These classes are for user facing column names,
# to avoid having to change them all around the code
# when a modification is needed
@dataclass(frozen=True)
class ColumnContent:
    """Description of one user-facing leaderboard column.

    The dataclass is frozen so that its instances are hashable. Instances are
    used as field *default values* when a dataclass is built dynamically with
    ``make_dataclass``; since Python 3.11 ``dataclasses`` rejects unhashable
    defaults with ``ValueError: mutable default ... use default_factory``, and
    a plain ``@dataclass`` (``eq=True``, not frozen) is unhashable.

    As a consequence instances are immutable: build a new ``ColumnContent``
    (or use ``dataclasses.replace``) instead of assigning to an attribute.
    """

    # Label of the column as shown to the user.
    name: str
    # Display type of the column, given as a string (e.g. "markdown").
    type: str
    # Whether the column is part of the default ("lite") column selection.
    displayed_by_default: bool
    # Hidden columns are left out of the column selections entirely.
    hidden: bool = False
    # NOTE(review): not read in this file; presumably marks columns the UI
    # must always show -- confirm against the caller.
    never_hidden: bool = False
## Leaderboard columns
# Column order: 'Model', 'Agentic Framework', 'NCR', 'HTU', 'TCU', 'NED', 'PVI',
# 'KQA', 'TII', 'MTA', 'BRI', 'Overall'.
# Each row below is (attribute name, displayed label, never_hidden flag); every
# column uses the "markdown" type and is displayed by default.
auto_eval_column_dict = [
    [attr, ColumnContent, ColumnContent(label, "markdown", True, never_hidden=pinned)]
    for attr, label, pinned in (
        ("model", "Model", True),
        ("agentic_framework", "Agentic Framework", True),
        ("ncr", "NCR", False),
        ("htu", "HTU", False),
        ("tcu", "TCU", False),
        ("ned", "NED", False),
        ("pvi", "PVI", False),
        ("kqa", "KQA", False),
        ("tii", "TII", False),
        ("mta", "MTA", False),
        ("bri", "BRI", False),
        ("overall", "Overall ⬆️", True),
    )
]

# Create AutoEvalColumn class: one field per entry above, annotated as
# ColumnContent, with the ColumnContent instance as the field's default value.
AutoEvalColumn = make_dataclass("AutoEvalColumn", auto_eval_column_dict)
# Column selection
# Filter the non-hidden columns once, then narrow them to the ones displayed by
# default; names and types are projected from those two lists.
_shown = [col for col in fields(AutoEvalColumn) if not col.hidden]
_shown_by_default = [col for col in _shown if col.displayed_by_default]

COLS = [col.name for col in _shown]
TYPES = [col.type for col in _shown]
COLS_LITE = [col.name for col in _shown_by_default]
TYPES_LITE = [col.type for col in _shown_by_default]
# BENCHMARK_COLS = [t.value.col_name for t in Tasks]
# Maps a display label to a right-closed interval ``(low, high]``.
# Consecutive rows share a boundary, so the intervals tile (-1, 10000]
# without gaps or overlaps.
NUMERIC_INTERVALS = {
    label: pd.Interval(low, high, closed="right")
    for label, low, high in (
        ("?", -1, 0),
        ("~1.5", 0, 2),
        ("~3", 2, 4),
        ("~7", 4, 9),
        ("~13", 9, 20),
        ("~35", 20, 45),
        ("~60", 45, 70),
        ("70+", 70, 10000),
    )
}