Added snli task
- src/about.py +1 -1
- src/custom_tasks/snli_task.py +59 -0
src/about.py
CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu']
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
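A minimal sketch of the task string this join produces after the change, assuming the updated list above:

tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
','.join(f'custom|{t}|0|0' for t in tasks)
# -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0,custom|he-en-trans-bleu|0|0,custom|snli-acc|0|0'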
src/custom_tasks/snli_task.py
ADDED
@@ -0,0 +1,59 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def snli_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some sanitization, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub(r'^100%', '', pred) # remove 100% at beginning, some gemma weirdness
+    pred = pred.strip()
+
+    return 1 if pred == golds[0] else 0
+
+snli_acc_metric = CorpusLevelMetric(
+    metric="snli_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=snli_eval_fn
+)
+extend_enum(Metrics, 'snli_acc_metric', snli_acc_metric)
+
+def snli_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+snli_task = LightevalTaskConfig(
+    name="snli-acc",
+    prompt_function="snli_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["snli"],
+    evaluation_splits=["snli"],
+    metric=['snli_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=16
+)
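A rough sanity check of the exact-match behaviour in snli_eval_fn; the gold and prediction strings below are placeholders, not real dataset labels:

snli_eval_fn(golds=["label"], predictions=["<b>label</b>"])  # -> 1: xml tags are stripped before comparing
snli_eval_fn(golds=["label"], predictions=["100% label"])    # -> 1: a leading '100%' is dropped, then whitespace stripped
snli_eval_fn(golds=["label"], predictions=["other"])         # -> 0: anything that is not an exact match scores zero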