# test-suite / test-suite.py
# daniel-de-leon's picture
# only accuracy
# 88ac74b
import evaluate
from evaluate.evaluation_suite import SubTask
# NOTE: This suite is odd because the first dataset is multi-class while
# the second dataset is binary. The model I'm using has 4 labels and is
# fine-tuned on the first dataset.
# Open question: what does it mean for this model to be evaluated on the
# second dataset?
# One combined-metric object, created once and handed to every subtask below.
metric = evaluate.combine(evaluations=["accuracy"])
class Suite(evaluate.EvaluationSuite):
    """Evaluation suite made of two text-classification subtasks.

    The subtasks use the ``hate_speech18`` dataset (split ``train[:1000]``)
    and the ``mteb/toxic_conversations_50k`` dataset (split ``test[:1000]``).
    Both are scored with the module-level ``metric`` object.
    """

    def __init__(self, name):
        """Set up the preprocessor and the list of subtasks.

        Args:
            name: Suite name, forwarded unchanged to the parent constructor.
        """
        super().__init__(name)

        # Returns a dict whose "text" entry is the lower-cased input text.
        # NOTE(review): this callable is not passed to either SubTask below;
        # confirm whether it was meant to be supplied as a per-task
        # preprocessor.
        self.preprocessor = lambda x: {"text": x["text"].lower()}

        # Every label name other than NO_HATE is mapped to 1.0 here.
        hate_speech_args = {
            "metric": metric,
            "input_column": "text",
            "label_column": "label",
            "label_mapping": {
                "NO_HATE": 0.0,
                "HATE": 1.0,
                "RELATION": 1.0,
                "IDK": 1.0,
            },
        }
        hate_speech_task = SubTask(
            task_type="text-classification",
            data="hate_speech18",
            split="train[:1000]",
            args_for_task=hate_speech_args,
        )

        # Two-label mapping for the second dataset.
        toxic_conversations_args = {
            "metric": metric,
            "input_column": "text",
            "label_column": "label",
            "label_mapping": {
                "NO_HATE": 0.0,
                "HATE": 1.0,
            },
        }
        toxic_conversations_task = SubTask(
            task_type="text-classification",
            data="mteb/toxic_conversations_50k",
            split="test[:1000]",
            args_for_task=toxic_conversations_args,
        )

        # Order matters to readers of the results: hate_speech18 first.
        self.suite = [hate_speech_task, toxic_conversations_task]