import os

import torch

from dataclasses import dataclass
from enum import Enum

from src.envs import CACHE_PATH


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str
    num_fewshot: int


class Tasks(Enum):
    # task_key in the json file, metric_key in the json file, name to display in the leaderboard
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")
    task0 = Task("nq_open", "em", "NQ Open", 64)  # 64, as in the ATLAS paper
    task1 = Task("triviaqa", "em", "TriviaQA", 64)  # 64, as in the ATLAS paper

    # TruthfulQA is intended as a zero-shot benchmark [5, 47]. https://owainevans.github.io/pdfs/truthfulQA_lin_evans.pdf
    # task2 = Task("truthfulqa_gen", "rougeL_acc", "TruthfulQA Gen", 0)
    task3 = Task("truthfulqa_mc1", "acc", "TruthfulQA MC1", 0)
    task4 = Task("truthfulqa_mc2", "acc", "TruthfulQA MC2", 0)

    task5 = Task("halueval_qa", "acc", "HaluEval QA", 0)
    task6 = Task("halueval_dialogue", "acc", "HaluEval Dialogue", 0)
    task7 = Task("halueval_summarization", "acc", "HaluEval Summarization", 0)

    task8 = Task("xsum", "rougeL", "XSum", 2)
    task9 = Task("cnndm", "rougeL", "CNN/DM", 2)

    task10 = Task("memo-trap", "acc", "memo-trap", 0)

    task11 = Task("nq8", "em", "NQ Open 8", 8)
    task12 = Task("tqa8", "em", "TriviaQA 8", 8)

    task13 = Task("ifeval", "prompt_level_strict_acc", "IFEval", 0)

    task14 = Task("selfcheckgpt", "max-selfcheckgpt", "SelfCheckGPT", 0)

    task15 = Task("fever10", "acc", "FEVER", 16)

    task16 = Task("squadv2", "exact", "SQuADv2", 4)

    task17 = Task("truefalse_cieacf", "acc", "TrueFalse", 8)

    task18 = Task("faithdial_hallu", "acc", "FaithDial", 8)

    task19 = Task("race", "acc", "RACE", 0)


# NUM_FEWSHOT = 64  # Change with your few shot

EVAL_REQUESTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-queue-bk")
EVAL_RESULTS_PATH_BACKEND = os.path.join(CACHE_PATH, "eval-results-bk")

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

LIMIT = None  # Testing; needs to be None
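

# A minimal usage sketch, assuming only the Task/Tasks definitions above: a backend
# could flatten the enum into per-benchmark settings keyed by benchmark name.
# The variable name `task_settings` is illustrative and not part of the leaderboard code.
if __name__ == "__main__":
    task_settings = {
        task.value.benchmark: {
            "metric": task.value.metric,
            "col_name": task.value.col_name,
            "num_fewshot": task.value.num_fewshot,
        }
        for task in Tasks
    }
    # For example: task_settings["nq_open"] == {"metric": "em", "col_name": "NQ Open", "num_fewshot": 64}
    print(task_settings["nq_open"])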