Added snli task
- src/about.py +1 -1
- src/custom_tasks/snli_task.py +59 -0
src/about.py
CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu']
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
 TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
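A minimal sketch of the task string this join produces after the change, assuming the updated list above:

tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc', 'he-en-trans-bleu', 'snli-acc']
','.join(f'custom|{t}|0|0' for t in tasks)
# -> 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0,custom|he-en-trans-bleu|0|0,custom|snli-acc|0|0'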
src/custom_tasks/snli_task.py
ADDED
@@ -0,0 +1,59 @@
+import re
+import string
+from lighteval.tasks.lighteval_task import LightevalTaskConfig
+from lighteval.metrics import Metrics, MetricCategory
+from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
+from aenum import extend_enum
+import numpy as np
+from lighteval.tasks.requests import Doc
+from Levenshtein import distance
+import collections
+from lighteval.utils import as_list
+
+def snli_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
+    if len(predictions) > 1:
+        raise ValueError("Predictions should have one item")
+    # do some sanitization, since some models produce more info
+    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub(r'^100%', '', pred) # remove 100% at beginning, some gemma weirdness
+    pred = pred.strip()
+
+    return 1 if pred == golds[0] else 0
+
+snli_acc_metric = CorpusLevelMetric(
+    metric="snli_acc",
+    higher_is_better=True,
+    category=MetricCategory.GENERATIVE,
+    use_case=MetricUseCase.ACCURACY,
+    corpus_level_fn=np.mean,
+    sample_level_fn=snli_eval_fn
+)
+extend_enum(Metrics, 'snli_acc_metric', snli_acc_metric)
+
+def snli_prompt_fn(line, task_name: str = None):
+    """Defines how to go from a dataset line to a doc object.
+    Follow examples in src/lighteval/tasks/tasks_prompt_formatting.py, or get more info
+    about what this function should do in the README.
+    """
+    return Doc(
+        task_name=task_name,
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
+        gold_index=0,
+        instruction="",
+    )
+
+# This is how you create a simple task (like hellaswag) which has one single subset
+# attached to it, and one evaluation possible.
+snli_task = LightevalTaskConfig(
+    name="snli-acc",
+    prompt_function="snli_prompt_fn", # must be defined in the file or imported from src/lighteval/tasks/tasks_prompt_formatting.py
+    suite=["custom"],
+    hf_repo="dicta-hebrew-llm-leaderboard/tests",
+    hf_subset="default",
+    hf_avail_splits=["snli"],
+    evaluation_splits=["snli"],
+    metric=['snli_acc_metric'],
+    stop_sequence=['\n'],
+    generation_size=16
+)
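A rough sanity check of the exact-match behaviour in snli_eval_fn; the gold and prediction strings below are placeholders, not real dataset labels:

snli_eval_fn(golds=["label"], predictions=["<b>label</b>"])  # -> 1: xml tags are stripped before comparing
snli_eval_fn(golds=["label"], predictions=["100% label"])    # -> 1: a leading '100%' is dropped, then whitespace stripped
snli_eval_fn(golds=["label"], predictions=["other"])         # -> 0: anything that is not an exact match scores zero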