|
""" |
|
Custom evaluation tasks for lighteval |
|
""" |
|
from dataclasses import dataclass, field
from enum import Enum, auto
from typing import Optional, Tuple, Union
|
|
|
|
|
class Metrics(Enum):
    """Enumeration of metric identifiers.

    Each member gets its value from ``auto()``, so values follow declaration
    order starting at 1; do not reorder members if the numeric values matter
    to callers.  ``str(member)`` yields the member name with every ``"_at_"``
    replaced by ``"@"`` (for example ``recall_at_1`` becomes ``"recall@1"``).
    """

    any_target_loglikelihood_acc = auto()
    bert_score = auto()
    bias = auto()
    bits_per_byte = auto()
    bleu = auto()
    bleu_1 = auto()
    bleu_4 = auto()
    byte_perplexity = auto()
    chrf = auto()
    code_eval_APPS = auto()
    code_eval_HE = auto()
    copyright = auto()
    disinformation = auto()
    exact_match = auto()
    exact_set_match = auto()
    extractiveness = auto()
    f1_from_bags = auto()
    f1_quasi = auto()
    f1_sequence = auto()
    f1_set_match = auto()
    faithfulness = auto()
    iou_set_match = auto()
    log_prob = auto()
    loglikelihood_acc = auto()
    loglikelihood_acc_norm = auto()
    loglikelihood_acc_norm_nospace = auto()
    loglikelihood_acc_norm_single_token = auto()
    loglikelihood_acc_single_token = auto()
    loglikelihood_f1 = auto()
    loglikelihood_f1_single_token = auto()
    math_quasi_exact_match = auto()
    mc_taco = auto()
    mcc = auto()
    mcc_single_token = auto()
    mrr = auto()
    mrr_single_token = auto()
    # NOTE(review): possibly a typo for "multi_f1_numeric"; kept unchanged
    # because callers may reference this exact name -- confirm before renaming.
    multi_fi_numeric = auto()
    one_choice_loglikelihood_acc = auto()
    perfect_exact_match = auto()
    prediction_perplexity = auto()
    prefix_exact_match = auto()
    prefix_quasi_exact_match = auto()
    quasi_exact_match = auto()
    ranking = auto()
    recall_at_1_single_token = auto()
    recall_at_2_single_token = auto()
    recall_at_1 = auto()
    recall_at_2 = auto()
    rouge = auto()
    rouge_1 = auto()
    rouge_2 = auto()
    rouge_l = auto()
    target_perplexity = auto()
    ter = auto()
    toxicity = auto()
    truthfulqa_mc_metrics = auto()
    word_perplexity = auto()

    def __str__(self) -> str:
        """Return the member name with each ``"_at_"`` rewritten as ``"@"``."""
        member_name = self.name
        return member_name.replace("_at_", "@")
|
|
|
|
|
# Metric names kept as plain strings, in a fixed order.
# NOTE(review): judging by the constant's name these are presumably the
# metrics that can only be scored from generated text -- confirm against the
# code that consumes this list (not visible here).
NEEDS_GENERATION_ONLY = [
    # Exact-match variants.
    "perfect_exact_match",
    "exact_match",
    "quasi_exact_match",
    # NOTE(review): no ``Metrics`` member with this name is visible in this
    # file; kept as-is because consumers may still match on it.
    "quasi_exact_match2",
    "prefix_exact_match",
    "prefix_quasi_exact_match",
    "math_quasi_exact_match",
    # Set-match and F1 variants.
    "iou_set_match",
    "exact_set_match",
    "f1_sequence",
    "f1_quasi",
    "f1_set_match",
    "f1_from_bags",
    # chrf / ter / rouge.
    "chrf",
    "ter",
    "rouge",
    "rouge_1",
    "rouge_2",
    "rouge_l",
    # faithfulness / extractiveness / bert_score.
    "faithfulness",
    "extractiveness",
    "bert_score",
    # bleu.
    "bleu",
    "bleu_1",
    "bleu_4",
    # bias / toxicity.
    "bias",
    "toxicity",
    # Code evaluation and copyright.
    "code_eval_HE",
    "code_eval_APPS",
    "copyright",
]
|
|
|
|
|
@dataclass(unsafe_hash=True)
class CustomEvaluationTask:
    """Configuration record describing one custom evaluation task.

    The class is declared with ``unsafe_hash=True``, so instances are hashed
    from their fields.  To keep that working when callers pass lists,
    ``__post_init__`` stores every sequence-valued field as a tuple:

    * ``metric``: each entry is converted with ``str()`` (so enum members are
      stored by their string form) and the result is stored as a tuple.
    * ``hf_avail_splits``: ``None`` becomes ``("train", "validation", "test")``.
    * ``evaluation_splits``: ``None`` becomes ``("validation",)``.
    * ``stop_sequence``: ``None`` becomes ``("\\n",)``.
    * ``suite``: ``None`` becomes ``("custom",)``.
    * For the four optional fields above, an explicitly empty value (such as
      ``[]``) is stored as ``None``.
    * A lone string given for any sequence field, or a lone enum member given
      for ``metric``, is treated as a one-element sequence instead of being
      iterated character by character (or failing as non-iterable).
    """

    name: str
    prompt_function: str
    hf_repo: str
    hf_subset: str
    # Forward reference: ``Metrics`` is defined earlier in this module.
    metric: Tuple[Union[str, "Metrics"], ...]
    hf_avail_splits: Optional[Tuple[str, ...]] = None
    evaluation_splits: Optional[Tuple[str, ...]] = None
    few_shots_split: Optional[str] = None
    few_shots_select: Optional[str] = None
    generation_size: int = -1
    stop_sequence: Optional[Tuple[str, ...]] = None
    output_regex: Optional[str] = None

    # Plain data field; it does not make the dataclass itself immutable.
    frozen: bool = False
    suite: Optional[Tuple[str, ...]] = None

    @staticmethod
    def _as_tuple(value, default):
        """Return ``value`` (or ``default`` when it is ``None``) as a tuple.

        Falsy values other than ``None`` (e.g. ``[]``) yield ``None``; a lone
        string is wrapped rather than split into characters.
        """
        chosen = default if value is None else value
        if not chosen:
            return None
        if isinstance(chosen, str):
            return (chosen,)
        return tuple(chosen)

    def __post_init__(self):
        """Normalise sequence fields to tuples and fill in defaults."""
        raw_metrics = self.metric
        # A bare string would be iterated per character and a bare enum
        # member is not iterable at all, so wrap either in a 1-tuple first.
        if isinstance(raw_metrics, (str, Enum)):
            raw_metrics = (raw_metrics,)
        self.metric = tuple(str(entry) for entry in raw_metrics)

        self.suite = self._as_tuple(self.suite, ("custom",))
        self.hf_avail_splits = self._as_tuple(
            self.hf_avail_splits, ("train", "validation", "test")
        )
        self.evaluation_splits = self._as_tuple(
            self.evaluation_splits, ("validation",)
        )
        self.stop_sequence = self._as_tuple(self.stop_sequence, ("\n",))
|
|
|
|
|
@dataclass(unsafe_hash=True)
class BigCodeEvaluationTask:
    """Configuration record describing one BigCode evaluation task.

    ``suite`` defaults to ``("bigcode",)`` and is always stored as a tuple; a
    lone string is wrapped in a one-element tuple rather than split into
    characters.

    The class is declared with ``unsafe_hash=True``.  ``bigcode_task_kwargs``
    is a ``dict`` and therefore unhashable, so it is excluded from the
    generated ``__hash__`` (it still takes part in ``__eq__``); otherwise
    hashing any instance that carries kwargs would raise ``TypeError``.
    """

    name: str
    bigcode_task: str
    # hash=False: dicts are unhashable; equality still compares this field.
    bigcode_task_kwargs: Optional[dict] = field(default=None, hash=False)
    n_samples: int = 1
    prefix: Optional[str] = None

    suite: Optional[Tuple[str, ...]] = None

    def __post_init__(self):
        """Fill in the default suite and store ``suite`` as a tuple."""
        chosen_suite = ("bigcode",) if self.suite is None else self.suite
        # Guard against tuple("abc") == ("a", "b", "c").
        if isinstance(chosen_suite, str):
            chosen_suite = (chosen_suite,)
        self.suite = tuple(chosen_suite)
|
|