mistral-nanotron / custom_evaluation_utils.py
"""
Custom evaluation tasks for lighteval
"""
from dataclasses import dataclass
from enum import Enum, auto
from typing import Optional, Tuple, Union


class Metrics(Enum):
    any_target_loglikelihood_acc = auto()
    bert_score = auto()
    bias = auto()
    bits_per_byte = auto()
    bleu = auto()
    bleu_1 = auto()
    bleu_4 = auto()
    byte_perplexity = auto()
    chrf = auto()
    code_eval_APPS = auto()
    code_eval_HE = auto()
    copyright = auto()
    disinformation = auto()
    exact_match = auto()
    exact_set_match = auto()
    extractiveness = auto()
    f1_from_bags = auto()
    f1_quasi = auto()
    f1_sequence = auto()
    f1_set_match = auto()
    faithfulness = auto()
    iou_set_match = auto()
    log_prob = auto()
    loglikelihood_acc = auto()
    loglikelihood_acc_norm = auto()
    loglikelihood_acc_norm_nospace = auto()
    loglikelihood_acc_norm_single_token = auto()
    loglikelihood_acc_single_token = auto()
    loglikelihood_f1 = auto()
    loglikelihood_f1_single_token = auto()
    math_quasi_exact_match = auto()
    mc_taco = auto()
    mcc = auto()
    mcc_single_token = auto()
    mrr = auto()
    mrr_single_token = auto()
    multi_f1_numeric = auto()
    one_choice_loglikelihood_acc = auto()
    perfect_exact_match = auto()
    prediction_perplexity = auto()
    prefix_exact_match = auto()
    prefix_quasi_exact_match = auto()
    quasi_exact_match = auto()
    ranking = auto()
    recall_at_1_single_token = auto()
    recall_at_2_single_token = auto()
    recall_at_1 = auto()
    recall_at_2 = auto()
    rouge = auto()
    rouge_1 = auto()
    rouge_2 = auto()
    rouge_l = auto()
    target_perplexity = auto()
    ter = auto()
    toxicity = auto()
    truthfulqa_mc_metrics = auto()
    word_perplexity = auto()

    def __str__(self):
        # Render "_at_" as "@" so that e.g. recall_at_1 is reported as "recall@1"
        return self.name.replace("_at_", "@")
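
# Illustrative usage (not part of the original file): the __str__ override above maps
# "_at_" to "@" in metric names, so recall-style metrics print in the usual "@k" form.
#
#     >>> str(Metrics.recall_at_1)
#     'recall@1'
#     >>> str(Metrics.loglikelihood_acc_norm)
#     'loglikelihood_acc_norm'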


NEEDS_GENERATION_ONLY = [
    "perfect_exact_match",
    "exact_match",
    "quasi_exact_match",
    "quasi_exact_match2",
    "prefix_exact_match",
    "prefix_quasi_exact_match",
    "math_quasi_exact_match",
    "iou_set_match",
    "exact_set_match",
    "f1_sequence",
    "f1_quasi",
    "f1_set_match",
    "f1_from_bags",
    "chrf",
    "ter",
    "rouge",
    "rouge_1",
    "rouge_2",
    "rouge_l",
    "faithfulness",
    "extractiveness",
    "bert_score",
    "bleu",
    "bleu_1",
    "bleu_4",
    "bias",
    "toxicity",
    "code_eval_HE",
    "code_eval_APPS",
    "copyright",
]
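

# Sketch (hypothetical, not part of the original file): NEEDS_GENERATION_ONLY is presumably
# consumed by the evaluation runner elsewhere in the repo. A check along these lines would
# tell whether any of a task's metrics require free-form generation rather than
# loglikelihood scoring. The helper name `_task_needs_generation` is an assumption.
def _task_needs_generation(metrics: Tuple[Union[str, Metrics], ...]) -> bool:
    # True if at least one metric can only be computed from generated text
    return any(str(metric) in NEEDS_GENERATION_ONLY for metric in metrics)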


@dataclass(unsafe_hash=True)
class CustomEvaluationTask:
    name: str
    prompt_function: str
    hf_repo: str
    hf_subset: str
    metric: Tuple[Union[str, Metrics]]
    hf_avail_splits: Optional[Tuple[str]] = None
    evaluation_splits: Optional[Tuple[str]] = None
    few_shots_split: Optional[str] = None
    few_shots_select: Optional[str] = None
    generation_size: int = -1
    stop_sequence: Optional[Tuple[str]] = None
    output_regex: Optional[str] = None
    frozen: bool = False
    suite: Optional[Tuple[str]] = None  # we use this to know if we should use a custom lighteval or bigcode task

    def __post_init__(self):
        self.metric = [str(m) for m in self.metric]
        if self.suite is None:
            self.suite = ["custom"]
        if self.hf_avail_splits is None:
            self.hf_avail_splits = ["train", "validation", "test"]
        if self.evaluation_splits is None:
            self.evaluation_splits = ["validation"]
        if self.stop_sequence is None:
            self.stop_sequence = ["\n"]

        # Convert list to tuple for hashing
        self.metric = tuple(self.metric)
        self.hf_avail_splits = tuple(self.hf_avail_splits) if self.hf_avail_splits else None
        self.evaluation_splits = tuple(self.evaluation_splits) if self.evaluation_splits else None
        self.suite = tuple(self.suite) if self.suite else None
        self.stop_sequence = tuple(self.stop_sequence) if self.stop_sequence else None
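
# Illustrative example (hypothetical task values, not from the original file): after
# __post_init__ runs, list-like fields are normalised to tuples and the defaults above
# are filled in, so instances remain hashable.
#
#     task = CustomEvaluationTask(
#         name="hellaswag",
#         prompt_function="hellaswag_prompt",
#         hf_repo="hellaswag",
#         hf_subset="default",
#         metric=(Metrics.loglikelihood_acc_norm,),
#     )
#     task.metric             # ("loglikelihood_acc_norm",)
#     task.evaluation_splits  # ("validation",)
#     task.suite              # ("custom",)
#     task.stop_sequence      # ("\n",)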


@dataclass(unsafe_hash=True)
class BigCodeEvaluationTask:
    name: str
    bigcode_task: str
    bigcode_task_kwargs: Optional[dict] = None
    n_samples: int = 1
    prefix: Optional[str] = None
    suite: Optional[Tuple[str]] = None

    def __post_init__(self):
        if self.suite is None:
            self.suite = ("bigcode",)

        # Convert list to tuple for hashing
        self.suite = tuple(self.suite)
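

if __name__ == "__main__":
    # Quick sanity check / illustrative usage only (hypothetical task names and values,
    # not part of the original file): running this module directly prints the normalised
    # fields of a sample task of each kind.
    sample_task = CustomEvaluationTask(
        name="arc:challenge",
        prompt_function="arc",
        hf_repo="ai2_arc",
        hf_subset="ARC-Challenge",
        metric=(Metrics.loglikelihood_acc, Metrics.loglikelihood_acc_norm),
    )
    print(sample_task.metric, sample_task.suite, sample_task.evaluation_splits)

    sample_bigcode_task = BigCodeEvaluationTask(name="humaneval", bigcode_task="humaneval")
    print(sample_bigcode_task.suite)  # ("bigcode",)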