import collections
import itertools
import numpy as np
import random
import lm_eval.metrics
import lm_eval.models
import lm_eval.tasks
import lm_eval.base
from lm_eval.utils import positional_deprecated, run_task_tests


@positional_deprecated
def simple_evaluate(
    model,
    model_args=None,
    tasks=[],
    num_fewshot=0,
    batch_size=None,
    device=None,
    no_cache=False,
    limit=None,
    bootstrap_iters=100000,
    description_dict=None,
    check_integrity=False,
    decontamination_ngrams_path=None,
    verbose=False,
):
    """Instantiate and evaluate a model on a list of tasks.

    :param model: Union[str, LM]
        Name of model or LM object, see lm_eval.models.get_model
    :param model_args: Optional[str]
        String arguments for each model class, see LM.create_from_arg_string.
        Ignored if `model` argument is a LM object.
    :param tasks: list[Union[str, Task]]
        List of task names or Task objects. Task objects will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param num_fewshot: int or list of int
        Number of examples in the few-shot context, either a single value for all tasks or one value per task
    :param batch_size: int, optional
        Batch size for model
    :param device: str, optional
        PyTorch device (e.g. "cpu" or "cuda:0") for running models
    :param no_cache: bool
        If True, do not use the on-disk request cache (CachingLM)
    :param limit: int, float, or list, optional
        Limit the number of examples per task; a float is treated as a fraction of the task's documents (only use this for testing)
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param description_dict: dict[str, str]
        Dictionary of custom task descriptions of the form: `task_name: description`
    :param check_integrity: bool
        Whether to run the relevant part of the test suite for the tasks
    :param decontamination_ngrams_path: str, optional
        Path to pre-computed training-set n-grams used for decontamination
    :param verbose: bool
        Whether to include per-document evaluation details in the results
    :return
        Dictionary of results
    """
    random.seed(1234)
    np.random.seed(1234)

    assert tasks != [], "No tasks specified"

    if isinstance(model, str):
        if model_args is None:
            model_args = ""
        lm = lm_eval.models.get_model(model).create_from_arg_string(
            model_args, {"batch_size": batch_size, "device": device}
        )
    else:
        assert isinstance(model, lm_eval.base.LM)
        lm = model
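
    # Unless no_cache is set, wrap the LM in a CachingLM so that identical requests
    # are answered from an on-disk cache under lm_cache/, keyed by model name and args.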
    if not no_cache:
        lm = lm_eval.base.CachingLM(
            lm,
            "lm_cache/"
            + model
            + "_"
            + model_args.replace("=", "-").replace(",", "_").replace("/", "-")
            + ".db",
        )

    task_dict = lm_eval.tasks.get_task_dict(tasks)

    if check_integrity:
        run_task_tests(task_list=tasks)

    results = evaluate(
        lm=lm,
        task_dict=task_dict,
        num_fewshot=num_fewshot,
        limit=limit,
        bootstrap_iters=bootstrap_iters,
        description_dict=description_dict,
        decontamination_ngrams_path=decontamination_ngrams_path,
        verbose=verbose,
    )
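
    # Record the evaluation configuration alongside the results so runs are reproducible.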
results["config"] = { |
|
"model": model, |
|
"model_args": model_args, |
|
"num_fewshot": num_fewshot, |
|
"batch_size": batch_size, |
|
"device": device, |
|
"no_cache": no_cache, |
|
"limit": limit, |
|
"bootstrap_iters": bootstrap_iters, |
|
"description_dict": description_dict, |
|
} |
|
|
|
return results |
|
|
|
|
|
decontaminate_suffix = "_decontaminate" |
|
|
|
|
|
@positional_deprecated |
|
def evaluate( |
|
lm, |
|
task_dict, |
|
provide_description=None, |
|
num_fewshot=0, |
|
limit=None, |
|
bootstrap_iters=100000, |
|
description_dict=None, |
|
decontamination_ngrams_path=None, |
|
verbose=False, |
|
): |
    """Instantiate and evaluate a model on a list of tasks.

    :param lm: obj
        Language Model
    :param task_dict: dict[str, Task]
        Dictionary of tasks. Tasks will be taken to have name task.EVAL_HARNESS_NAME if defined and type(task).__name__ otherwise.
    :param provide_description: bool
        Not implemented. This option is deprecated and will be removed in a future version in favor of a different description-providing method.
    :param num_fewshot: int or list of int
        Number of examples in the few-shot context, either a single value for all tasks or one value per task
    :param limit: int, float, or list, optional
        Limit the number of examples per task; a float is treated as a fraction of the task's documents (only use this for testing)
    :param bootstrap_iters: int
        Number of iterations for bootstrap statistics
    :param description_dict: dict[str, str]
        Dictionary of custom task descriptions of the form: `task_name: description`
    :param decontamination_ngrams_path: str, optional
        Path to pre-computed training-set n-grams used for decontamination
    :param verbose: bool
        Whether to include per-document evaluation details in the results
    :return
        Dictionary of results
    """
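
    # provide_description is deprecated and not implemented; anything truthy is rejected.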
    assert not provide_description
    if provide_description is not None:
        print(
            "WARNING: provide_description is deprecated and will be removed in a future version in favor of description_dict"
        )
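
    # num_fewshot and limit may be a single value (applied to every task) or a list with
    # one entry per task; broadcast scalars so the rest of the code can index by task.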
    if isinstance(num_fewshot, list):
        assert len(task_dict) == len(
            num_fewshot
        ), f"The number of tasks ({len(task_dict)}) must be the same as the number of elements in `num_fewshot` ({len(num_fewshot)})"
    else:
        num_fewshot = [num_fewshot] * len(task_dict)

    if isinstance(limit, list):
        assert len(task_dict) == len(
            limit
        ), f"The number of tasks ({len(task_dict)}) must be the same as the number of elements in `limit` ({len(limit)})"
    else:
        limit = [limit] * len(task_dict)

    decontaminate = decontamination_ngrams_path is not None

    task_dict_items = [
        (name, task)
        for name, task in task_dict.items()
        if (task.has_validation_docs() or task.has_test_docs())
    ]

    results = collections.defaultdict(dict)
    versions = collections.defaultdict(dict)

    requests = collections.defaultdict(list)
    requests_origin = collections.defaultdict(list)

    overlaps = collections.defaultdict(list)  # {task_name: [overlapping doc ids]}

    docs = {}
    docs_for_decontamination = collections.defaultdict(list)
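
    # Build every request up front, grouped by request type, so that each LM method
    # can be run once over all requests of its kind.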
    for idx, (task_name, task) in enumerate(task_dict_items):
        versions[task_name] = task.VERSION
        if task.has_test_docs():
            task_doc_func = task.test_docs
            task_set = "test"
        elif task.has_validation_docs():
            task_set = "val"
            task_doc_func = task.validation_docs
        else:
            raise RuntimeError("Task has neither test_docs nor validation_docs")
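
        # Shuffle the docs deterministically (seed 42) so that any `limit` takes a
        # random subset instead of the head of the dataset, which may be ordered.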
        task_docs = list(task_doc_func())
        rnd = random.Random()
        rnd.seed(42)
        rnd.shuffle(task_docs)

        description = (
            description_dict[task_name]
            if description_dict and task_name in description_dict
            else ""
        )
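
        # The task pulls the tokenizer (when LOAD_TOKENIZER is set) and the generation
        # limits from the underlying LM, unwrapping CachingLM if the cache is enabled.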
        if task.LOAD_TOKENIZER:
            if isinstance(lm, lm_eval.base.CachingLM):
                task.set_tokenizer(lm.lm.tokenizer)
            else:
                task.set_tokenizer(lm.tokenizer)

        task.max_length = (
            lm.lm.max_length
            if isinstance(lm, lm_eval.base.CachingLM)
            else lm.max_length
        )
        task.max_gen_toks = (
            lm.lm.max_gen_toks
            if isinstance(lm, lm_eval.base.CachingLM)
            else lm.max_gen_toks
        )
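
        # An integer limit caps the number of docs; a float limit is interpreted as a
        # fraction of the task's documents.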
        limit_local = limit[idx]
        if isinstance(limit_local, float):
            limit_local = int(limit_local * len(task_docs))
            print(
                f"Using {limit_local}/{len(task_docs)} samples, corresponding to a ratio of {limit[idx]}"
            )

        for doc_id, doc in enumerate(itertools.islice(task_docs, 0, limit_local)):
            if decontaminate and task.should_decontaminate():
                docs_for_decontamination[(task_name, task_set)].append(
                    task.doc_to_decontamination_query(doc)
                )

            docs[(task_name, doc_id)] = doc
            ctx = task.fewshot_context(
                doc=doc, num_fewshot=num_fewshot[idx], rnd=rnd, description=description
            )
            reqs = task.construct_requests(doc, ctx)
            if not isinstance(reqs, (list, tuple)):
                reqs = [reqs]
            for i, req in enumerate(reqs):
                requests[req.request_type].append(req)
                # i: index of the request within a single task instance
                # doc_id: unique id that lets us look the doc back up in `docs`
                requests_origin[req.request_type].append((i, task_name, doc, doc_id))
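
    # Compare all tasks/splits at once so the training-set n-grams only need to be scanned once.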
    if decontaminate:
        from lm_eval.decontamination.decontaminate import get_train_overlap

        print("Finding train/test overlap, please wait...")
        overlaps = get_train_overlap(
            docs_for_decontamination, decontamination_ngrams_path, limit
        )
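
    # Collects all responses for each (task_name, doc_id) pair.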
    process_res_queue = collections.defaultdict(list)
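
    # Execute each request type (e.g. loglikelihood, greedy_until) as a single batched
    # call to the LM method of the same name.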
    for reqtype, reqs in requests.items():
        print("Running", reqtype, "requests")
        resps = getattr(lm, reqtype)([req.args for req in reqs])
        resps = [
            x if req.index is None else x[req.index] for x, req in zip(resps, reqs)
        ]

        for resp, (i, task_name, doc, doc_id) in zip(resps, requests_origin[reqtype]):
            process_res_queue[(task_name, doc_id)].append((i, resp))

    vals = collections.defaultdict(list)
    details = collections.defaultdict(list)
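
    # Unpack the responses, restore their per-doc order, and hand them back to each
    # task's process_results to compute per-document metrics.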
    for (task_name, doc_id), requests in process_res_queue.items():
        requests.sort(key=lambda x: x[0])
        requests = [x[1] for x in requests]

        task = task_dict[task_name]
        doc = docs[(task_name, doc_id)]

        metrics = task.process_results(doc, requests)
        if "details" in metrics:
            details[task_name].append(metrics["details"])
            del metrics["details"]
        for metric, value in metrics.items():
            vals[(task_name, metric)].append(value)
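
            # Reuse the same per-doc value for the decontaminated variant of the metric,
            # skipping docs flagged as overlapping with the training set.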
            if decontaminate and task_name in overlaps:
                if doc_id not in overlaps[task_name]:
                    vals[(task_name, metric + decontaminate_suffix)].append(value)
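
    # Aggregate the per-doc values for each (task, metric) and attach bootstrap standard errors.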
    for (task_name, metric), items in vals.items():
        task = task_dict[task_name]
        real_metric = metric  # key used to look up the aggregation/stderr functions
        if metric.endswith(decontaminate_suffix):
            real_metric = metric.replace(decontaminate_suffix, "")
        results[task_name][metric] = task.aggregation()[real_metric](items)
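
        # bleu, chrf and ter bootstrap slowly, so their iterations are capped at 1000.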
        stderr = lm_eval.metrics.stderr_for_metric(
            metric=task.aggregation()[real_metric],
            bootstrap_iters=min(bootstrap_iters, 1000)
            if metric in ["bleu", "chrf", "ter"]
            else bootstrap_iters,
        )

        if stderr is not None:
            results[task_name][metric + "_stderr"] = stderr(items)

        if verbose and task_name in details:
            results[task_name]["details"] = details[task_name]

    return {"results": dict(results), "versions": dict(versions)}


def make_table(result_dict):
    """Generate table of results."""
    from pytablewriter import MarkdownTableWriter, LatexTableWriter

    md_writer = MarkdownTableWriter()
    latex_writer = LatexTableWriter()
    md_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]
    latex_writer.headers = ["Task", "Version", "Metric", "Value", "", "Stderr"]

    values = []
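
    # One row per (task, metric); the unnamed column holds "±" between a value and its
    # stderr, and the task name/version are only shown on the first row for each task.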
for k, dic in result_dict["results"].items(): |
|
version = result_dict["versions"][k] |
|
for m, v in dic.items(): |
|
if m == "details": |
|
continue |
|
|
|
if m.endswith("_stderr"): |
|
continue |
|
|
|
if m + "_stderr" in dic: |
|
se = dic[m + "_stderr"] |
|
values.append([k, version, m, "%.4f" % v, "±", "%.4f" % se]) |
|
else: |
|
values.append([k, version, m, "%.4f" % v, "", ""]) |
|
k = "" |
|
version = "" |
|
md_writer.value_matrix = values |
|
latex_writer.value_matrix = values |
|
|
|
|
|
|
|
|
|
return md_writer.dumps() |
|
|