# tlem/tlem.py
# %%
try:
    from ipytorch import logging
except Exception as e:
    import logging

from typing import Any, Optional, Protocol, Iterable, Callable

from tqdm.auto import tqdm
from evaluate.evaluation_suite import EvaluationSuite
import evaluate
import numpy as np
import datasets

from .tasks import Task, Metrics
from .utils import is_equiv
# %%
# %cd ../tlem
# %load_ext ipytorch
# %ls
# TODO: Add BibTeX citation
_CITATION = """\
@InProceedings{huggingface:module,
title = {A great new module},
authors={huggingface, Inc.},
year={2020}
}
"""

_DESCRIPTION = """\
Reasoning-benchmark metric: scores model responses against reference answers
for the configured task (e.g. "gsm8k", "svamp") and reports mean accuracy.
"""

_KWARGS_DESCRIPTION = """
Scores model responses against reference answers for the configured task.
Args:
    responses: list of model outputs (strings).
    references: list of reference answers (strings; floats for the "svamp" config).
    verbose: if True, also return the responses and references in the result.
Returns:
    accuracy: mean of the per-example scores.
    scores: list of per-example scores.
Examples:
    >>> metric = evaluate.load("sustech/tlem", "gsm8k")
    >>> results = metric.compute(responses=responses, references=references)
"""
# TODO: Define external resources urls if needed
BAD_WORDS_URL = "http://url/to/external/resource/bad_words.txt"


@evaluate.utils.file_utils.add_start_docstrings(_DESCRIPTION, _KWARGS_DESCRIPTION)
class ReasoningMetric(evaluate.Metric):
    """Scores reasoning-task responses against references and reports accuracy."""

    def _info(self):
        features = datasets.Features(
            {
                "responses": datasets.Value("string"),
                "references": datasets.Value("string"),
            }
        )
        # The "svamp" config uses numeric gold answers.
        if self.config_name == "svamp":
            features = datasets.Features(
                {
                    "responses": datasets.Value("string"),
                    "references": datasets.Value("float"),
                }
            )

        # TODO: Specifies the evaluate.EvaluationModuleInfo object
        return evaluate.EvaluationModuleInfo(
            # This is the description that will appear on the modules page.
            # module_type="measurement",
            description=_DESCRIPTION,
            citation=_CITATION,
            inputs_description=_KWARGS_DESCRIPTION,
            # This defines the format of each prediction and reference
            features=features,
            # Homepage of the module for documentation
            homepage="http://module.homepage",
            # Additional links to the codebase or references
            codebase_urls=["http://github.com/path/to/codebase/of/new_module"],
            reference_urls=["http://path.to.reference.url/new_module"],
        )

    def _compute(self, responses, references, verbose=False):
        # Dispatch to the task-specific scorer in Metrics (e.g. Metrics.gsm8k).
        scores = getattr(Metrics, self.config_name)(responses, references)
        acc = np.asarray(scores).mean()
        results = {
            "accuracy": acc,
            "scores": scores,
        }
        if verbose:
            results["references"] = references
            results["answers"] = responses
            # results["scores"] = scores
        return results
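
# %%
# Usage sketch for calling the metric on its own. Assumptions: this module is
# published on the Hub as "sustech/tlem" (the path used by Suite below), and
# the exact per-example score format depends on Metrics.gsm8k in .tasks.
# Uncomment to try interactively:
#
# gsm8k_metric = evaluate.load("sustech/tlem", "gsm8k")
# gsm8k_metric.compute(
#     responses=["... so the answer is 18."],  # hypothetical model output
#     references=["18"],                       # hypothetical gold answer
# )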


class Suite(EvaluationSuite):
    def run(
        self, model_or_pipeline: Any, prompt: str = "{instruction}"
    ) -> dict[str, float]:
        self.assert_suite_nonempty()

        results_all = {}
        for task in tqdm(self.suite, desc="Running tasks"):
            task_name = task.name
            results = task.run(model_or_pipeline)
            results_all[task_name] = results
        return results_all

    def __init__(self, name):
        super().__init__(name)
        self.suite = [
            Task(
                dataset_name=("gsm8k", "main"),
                metric_name=("sustech/tlem", "gsm8k"),
                input_column="question",
                label_column="answer",
            )
            # TASK_REGISTRY["gsm8k"],
            # TASK_REGISTRY["competition_math"],
        ]
# %%
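# Minimal end-to-end sketch. Assumptions: the suite name "tlem" is arbitrary,
# and Task.run accepts whatever `model_or_pipeline` you pass (e.g. a Hub model
# id or a transformers pipeline); "gpt2" is a hypothetical example, not a
# recommendation. Only runs when the file is executed as a script.
if __name__ == "__main__":
    suite = Suite("tlem")
    results = suite.run("gpt2")
    print(results)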