import os
from dataclasses import dataclass
from enum import Enum

import torch

from src.envs import CACHE_PATH


@dataclass
class Task:
    benchmark: str    # task name as registered in the evaluation harness, e.g. "nq_open"
    metric: str       # metric key reported for this task, e.g. "em"
    col_name: str     # column name displayed on the leaderboard, e.g. "NQ Open"
    num_fewshot: int  # number of few-shot examples used at evaluation time


class Tasks(Enum):
    """Benchmarks evaluated by the backend, with the metric, leaderboard
    column name, and few-shot count for each."""
    task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
    task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper

    task11 = Task("nq8", "em", "NQ Open 8", 8)
    task12 = Task("tqa8", "em", "TriviaQA 8", 8)

    task21 = Task("popqa", "em", "PopQA", 8)

    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
    task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)

    task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
    task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)

    # task8 = Task("xsum", "rougeL", "XSum", 2)
    # task9 = Task("cnndm", "rougeL", "CNN/DM", 2)

    task8_1 = Task("xsum_v2", "rougeL", "XSum", 0)
    task9_1 = Task("cnndm_v2", "rougeL", "CNN/DM", 0)

    task10 = Task("memo-trap", "acc", "memo-trap", 0)
    task10_2 = Task("memo-trap_v2", "acc", "memo-trap", 0)

    task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)

    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)

    # task15 = Task("fever10", "acc", "FEVER", 16)
    task15_1 = Task("fever11", "acc", "FEVER", 8)

    task16 = Task("squadv2", "exact", "SQuADv2", 4)
    task22 = Task("nq_swap", "em", "NQ-Swap", 4)

    task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)

    # task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)
    task19 = Task("faithdial_hallu_v2", "acc", "FaithDial", 8)

    task20 = Task("race", "acc", "RACE", 0)

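
# A minimal usage sketch (illustrative; these helpers are not an existing API
# in this module): downstream code typically iterates over Tasks to build the
# harness arguments and to look up which metric to read back from the results.
def get_task_fewshot_pairs() -> list[tuple[str, int]]:
    """Return (benchmark, num_fewshot) pairs for every registered task."""
    return [(t.value.benchmark, t.value.num_fewshot) for t in Tasks]


def metric_for(benchmark: str) -> str:
    """Look up the reported metric for a benchmark, e.g. "em" for "nq_open"."""
    return next(t.value.metric for t in Tasks if t.value.benchmark == benchmark)
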

EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_REQUESTS_PATH_BACKEND_SYNC = os.path.join(CACHE_PATH, "eval-queue-bk-sync")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")
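# These directories are assumed to be created elsewhere (e.g. by the sync
# code); a defensive sketch would be:
#     for _path in (EVAL_REQUESTS_PATH_BACKEND,
#                   EVAL_REQUESTS_PATH_BACKEND_SYNC,
#                   EVAL_RESULTS_PATH_BACKEND):
#         os.makedirs(_path, exist_ok=True)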

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LIMIT = None  # Optional cap on examples per task; set a small int only for testing, must be None for full evaluation runs
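
# Hypothetical sketch of how these settings might be passed to an
# lm-evaluation-harness run (the surrounding arguments are illustrative):
#     from lm_eval import evaluator
#     results = evaluator.simple_evaluate(
#         model="hf", model_args=..., tasks=[t.value.benchmark for t in Tasks],
#         device=DEVICE, limit=LIMIT,
#     )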