Fixed whitespace for prediction
- custom_tasks.py +1 -3
- src/about.py +2 -2
- src/custom_tasks/arc_challenge_task.py +0 -24
- src/custom_tasks/commonsense_task.py +0 -31
- src/custom_tasks/heq_task.py +3 -3
- src/custom_tasks/sentiment_task.py +2 -2
- src/custom_tasks/winograd_task.py +2 -2
custom_tasks.py
CHANGED
@@ -9,13 +9,11 @@ Author:
 from src.custom_tasks.heq_task import *
 from src.custom_tasks.sentiment_task import *
 from src.custom_tasks.winograd_task import *
-from src.custom_tasks.commonsense_task import *
-from src.custom_tasks.arc_challenge_task import *
 
 ## MODULE LOGIC
 # You should not need to touch this
 # Convert to dict for lighteval
-TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task
+TASKS_TABLE = [task.as_dict() for task in [heq_task, sentiment_task, winograd_task]]
 
 if __name__ == "__main__":
     print(t["name"] for t in TASKS_TABLE)
src/about.py
CHANGED
@@ -21,5 +21,5 @@ TASKS_HARNESS = [task.value.benchmark for task in Tasks]
 # ---------------------------------------------------
 
 # TASKS_LIGHTEVAL = "lighteval|anli:r1|0|0,lighteval|logiqa|0|0"
-tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc'
-TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
+tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
+TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)# + ',leaderboard|arc:challenge|0|0'
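For context, with the three remaining custom tasks the join above should resolve to a task string along these lines (a minimal sketch of the resulting value, mirroring the two lines in the diff; not copied from the repository):

tasks = ['heq-qa-tlnls', 'sentiment-acc', 'winograd-acc']
TASKS_LIGHTEVAL = ','.join(f'custom|{t}|0|0' for t in tasks)
# TASKS_LIGHTEVAL == 'custom|heq-qa-tlnls|0|0,custom|sentiment-acc|0|0,custom|winograd-acc|0|0'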
src/custom_tasks/arc_challenge_task.py
DELETED
@@ -1,24 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-arc_challenge_task = LightevalTaskConfig(
-    name="arc:challenge",
-    prompt_function="arc",
-    hf_repo="ai2_arc",
-    hf_subset="ARC-Challenge",
-    evaluation_splits=["test"],
-    generation_size=1,
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-)
src/custom_tasks/commonsense_task.py
DELETED
@@ -1,31 +0,0 @@
-import re
-import string
-from lighteval.tasks.lighteval_task import LightevalTaskConfig
-from lighteval.metrics import Metrics, MetricCategory
-from lighteval.metrics.utils import CorpusLevelMetric, MetricUseCase
-from aenum import extend_enum
-import numpy as np
-from lighteval.tasks.requests import Doc
-from Levenshtein import distance
-import collections
-from lighteval.utils import as_list
-from lighteval.tasks.tasks_prompt_formatting import LETTER_INDICES
-
-def commonsense_qa_prompt(line, task_name: str = None):
-    return Doc(
-        task_name=task_name,
-        query=line["question"],
-        choices=[f" {c}" for c in line["choices"]["text"]],
-        gold_index=LETTER_INDICES.index(line["answerKey"].strip()),
-        instruction="",
-    )
-
-commonsense_qa_task = LightevalTaskConfig(
-    name="commonsense_qa",
-    prompt_function="commonsense_qa_prompt",
-    hf_repo="commonsense_qa",
-    hf_subset="default",
-    metric=["loglikelihood_acc", "loglikelihood_acc_norm_nospace"],
-    trust_dataset=True,
-    stop_sequence=["\n"],
-),
src/custom_tasks/heq_task.py
CHANGED
@@ -73,7 +73,7 @@ def tlnls(a_gold, a_pred):
 def heq_eval_fn(golds: list[str], predictions: list[str], formatted_doc: Doc = None):
     if len(predictions) > 1:
         raise ValueError("Predictions should have one item")
-    pred = re.sub('<[^>]+>', '', predictions[0]) # remove xml tags
+    pred = re.sub('<[^>]+>', '', predictions[0]).strip() # remove xml tags
     return max([tlnls(x, pred) for x in golds])
 
 heq_tlnls_metric = CorpusLevelMetric(
@@ -93,8 +93,8 @@ def heq_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=list(range(len(line["response"]))),
         instruction="",
     )
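The whitespace fix in heq_eval_fn is easiest to see on a concrete prediction. A minimal sketch (the raw_pred value is made up for illustration):

import re

raw_pred = "<p>Tel Aviv</p> "
old_pred = re.sub('<[^>]+>', '', raw_pred)          # 'Tel Aviv ' -- tags removed, stray space kept
new_pred = re.sub('<[^>]+>', '', raw_pred).strip()  # 'Tel Aviv'  -- what heq_eval_fn now passes to tlnls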
src/custom_tasks/sentiment_task.py
CHANGED
@@ -37,8 +37,8 @@ def sentiment_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )
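The same normalization is applied in the sentiment and Winograd prompt functions (the Winograd diff follows below). A minimal sketch with a made-up dataset row:

line = {"prompt": " Classify the sentiment:\n", "response": [" positive ", " negative "]}
query = line["prompt"].strip()                        # 'Classify the sentiment:'
choices = [resp.strip() for resp in line["response"]] # ['positive', 'negative']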
src/custom_tasks/winograd_task.py
CHANGED
@@ -34,8 +34,8 @@ def winograd_prompt_fn(line, task_name: str = None):
     """
     return Doc(
         task_name=task_name,
-        query=line["prompt"],
-        choices=line["response"],
+        query=line["prompt"].strip(),
+        choices=[resp.strip() for resp in line["response"]],
         gold_index=0,
         instruction="",
     )