import evaluate
from evaluate.evaluation_suite import SubTask

# This is odd because the first dataset is multi-class and
# the second dataset is binary. The model I'm using has 4 labels
# and is fine-tuned on the first dataset.
# So what does it mean for this model to be evaluated on the second
# dataset?

metric = evaluate.combine(["accuracy"])


class Suite(evaluate.EvaluationSuite):
    def __init__(self, name):
        super().__init__(name)
        # Lowercase the input text before each sub-task runs.
        self.preprocessor = lambda x: {"text": x["text"].lower()}
        self.suite = [
            SubTask(
                task_type="text-classification",
                data="hate_speech18",
                split="train[:1000]",
                args_for_task={
                    "metric": metric,
                    "input_column": "text",
                    "label_column": "label",
                    # Map the model's predicted label names to the dataset's label values.
                    "label_mapping": {
                        "NO_HATE": 0.0,
                        "HATE": 1.0,
                        "RELATION": 1.0,
                        "IDK": 1.0,
                    },
                },
            ),
            SubTask(
                task_type="text-classification",
                data="mteb/toxic_conversations_50k",
                split="test[:1000]",
                args_for_task={
                    "metric": metric,
                    "input_column": "text",
                    "label_column": "label",
                    "label_mapping": {
                        "NO_HATE": 0.0,
                        "HATE": 1.0,
                    },
                },
            ),
        ]
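

# A minimal sketch of how this suite might be run, for completeness. The model id
# "your-org/your-4-label-model" is a placeholder for the fine-tuned checkpoint, not
# a real repository. EvaluationSuite.run accepts a model id or an instantiated
# pipeline and returns one result dict per SubTask, in the order listed above.
if __name__ == "__main__":
    suite = Suite("hate-speech-suite")
    results = suite.run("your-org/your-4-label-model")
    for task_result in results:
        print(task_result)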