# model-evaluation/scripts/bold_suite.py
import evaluate
from evaluate.evaluation_suite import SubTask


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        # Optional preprocessing: lowercase the raw text before evaluation.
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        # Each SubTask pairs a dataset split with an evaluator configuration.
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="glue",
                subset="sst2",
                split="validation[:10]",
                args_for_task={
                    "metric": "accuracy",
                    "input_column": "sentence",
                    "label_column": "label",
                    # Map the pipeline's string labels to the dataset's integer labels.
                    "label_mapping": {
                        "LABEL_0": 0.0,
                        "LABEL_1": 1.0
                    }
                }
            ),
        ]


suite = Suite(
    name="AVID: LLM Evaluations – BOLD"
)
# Run every SubTask in the suite against the given model and print the results.
results = suite.run("EleutherAI/gpt-neo-125M")
print(results)
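
# `suite.run` should return one result entry per SubTask (the computed metric
# plus the task name). A sketch of how the suite could be consumed once it is
# published as an evaluation-suite Space (the repo id below is hypothetical):
#
#     import evaluate
#     bold_suite = evaluate.EvaluationSuite.load("avid-ml/bold-evaluation-suite")
#     print(bold_suite.run("EleutherAI/gpt-neo-125M"))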