Shaltiel committed on
Commit
c2465e1
1 Parent(s): 73adc1c

Added snli task

Files changed (2):
  1. src/about.py +1 -1
  2. src/custom_tasks/snli_task.py +59 -0
src/about.py CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
  # ---------------------------------------------------
 
  # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
- tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu']
+ tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
  TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
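
For reference, the join above expands each task name into a lighteval task spec; a minimal sketch of the resulting string once 'snli-acc' is included (same expression as in the diff, shown standalone):

tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
# TASKS_LIGHTEVAL is now:
# 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0,custom|he-en-trans-bleu|0|0,custom|snli-acc|0|0'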
src/custom_tasks/snli_task.py ADDED
@@ -0,0 +1,59 @@
+ import re
+ import string
+ from lighteval.tasks.lighteval_task import LightevalTaskConfig
+ from lighteval.metrics import Metrics, MetricCategory
+ from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+ from aenum import extend_enum
+ import numpy as np
+ from lighteval.tasks.requests import Doc
+ from Levenshtein import distance
+ import collections
+ from lighteval.utils import as_list
+
+ def snli_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+     if len(predictions) > 1:
+         raise ValueError("Predictions should have one item")
+     # do some sanitization, since some models produce extra text around the answer
+     pred = re.sub('<[^>]+>', '', predictions[0])  # remove xml tags
+     pred = re.sub(r'^100%', '', pred)  # remove '100%' at the beginning, some gemma weirdness
+     pred = pred.strip()
+
+     return 1 if pred == golds[0] else 0
+
+ snli_acc_metric = CorpusLevelMetric(
+     metric="snli_acc",
+     higher_is_better=True,
+     category=MetricCategory.GENERATIVE,
+     use_case=MetricUseCase.ACCURACY,
+     corpus_level_fn=np.mean,
+     sample_level_fn=snli_eval_fn
+ )
+ extend_enum(Metrics, 'snli_acc_metric', snli_acc_metric)
+
+ def snli_prompt_fn(line, task_name: str = None):
+     """Defines how to go from a dataset line to a doc object.
+     Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+     about what this function should do in the README.
+     """
+     return Doc(
+         task_name=task_name,
+         query=line["prompt"].strip(),
+         choices=[resp.strip() for resp in line["response"]],
+         gold_index=0,
+         instruction="",
+     )
+
+ # This is how you create a simple task (like hellaswag) which has one single subset
+ # attached to it, and one evaluation possible.
+ snli_task = LightevalTaskConfig(
+     name="snli-acc",
+     prompt_function="snli_prompt_fn",  # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+     suite=["custom"],
+     hf_repo="dicta-hebrew-llm-leaderboard/tests",
+     hf_subset="default",
+     hf_avail_splits=["snli"],
+     evaluation_splits=["snli"],
+     metric=['snli_acc_metric'],
+     stop_sequence=['\n'],
+     generation_size=16
+ )
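
As a quick sanity check of the sanitization in snli_eval_fn, a minimal sketch with hypothetical model outputs and gold labels (the real dataset labels may differ; assumes src/custom_tasks is on the import path and the lighteval dependencies are installed):

from custom_tasks.snli_task import snli_eval_fn

# XML tags are stripped before comparison
snli_eval_fn(golds=["entailment"], predictions=["<b>entailment</b>"])  # -> 1
# a leading '100%' is removed (the gemma quirk noted in the comment above)
snli_eval_fn(golds=["entailment"], predictions=["100%entailment"])     # -> 1
# otherwise the prediction must match the gold string exactly after stripping whitespace
snli_eval_fn(golds=["entailment"], predictions=["neutral"])            # -> 0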