from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    benchmark: str
    metric: str
    col_name: str


# Benchmark tasks shown on the leaderboard: each entry maps a benchmark key to its metric and display column name
class Tasks(Enum):
    task0 = Task("bbq", "aggregate_score", "Prejudiced Answers: BBQ")
    task1 = Task("bold", "aggregate_score", "Biased Completions: BOLD")
    task2 = Task("toxicity", "aggregate_score", "Toxic Completions of Benign Text: RealToxicityPrompts")
    task3 = Task("toxicity_advbench", "aggregate_score", "Following Harmful Instructions: AdvBench")
    task4 = Task("forecasting_consistency", "aggregate_score", "Monotonicity Checks")
    task5 = Task("self_check_consistency", "aggregate_score", "Self-Check Consistency")
    task6 = Task("boolq_contrast_robustness", "aggregate_score", "BoolQ Contrast Set")
    task7 = Task("imdb_contrast_robustness", "aggregate_score", "IMDB Contrast Set")
    task8 = Task("calibration_big_bench", "aggregate_score", "Logit Calibration: BIG-Bench")
    task9 = Task("calibration_big_bench_i_know", "aggregate_score", "Self-Assessment: TriviaQA")
    task10 = Task("decoding_trust", "aggregate_score", "Income Fairness: DecodingTrust")
    task11 = Task("hellaswag", "aggregate_score", "Common Sense Reasoning: HellaSwag")
    task12 = Task("human_eval", "aggregate_score", "Coding: HumanEval")
    task13 = Task("instruction_goal_hijacking", "aggregate_score", "Goal Hijacking and Prompt Leakage")
    task14 = Task("multiturn_goal_hijacking", "aggregate_score", "Rule Following")
    task15 = Task("reddit_bias", "aggregate_score", "Representation Bias: RedditBias")
    task16 = Task("truthful_qa_mc2", "aggregate_score", "Truthfulness: TruthfulQA MC2")
    task17 = Task("mmlu", "aggregate_score", "General Knowledge: MMLU")
    task18 = Task("ai2_reasoning", "aggregate_score", "Reasoning: AI2 Reasoning Challenge")
    task19 = Task("human_deception", "aggregate_score", "Denying Human Presence")
    task20 = Task("memorization", "aggregate_score", "Copyrighted Material Memorization")
    task21 = Task("privacy", "aggregate_score", "PII Extraction by Association")
    task22 = Task("fairllm", "aggregate_score", "Recommendation Consistency: FaiRLLM")
    task23 = Task("mmlu_robustness", "aggregate_score", "MMLU: Robustness")
    # task24 = Task("training_data_suitability", "aggregate_score", "Training Data Suitability")
    task24 = Task("watermarking", "aggregate_score", "Watermark Reliability & Robustness")
    task25 = Task("dataset_bias", "aggregate_score", "Bias of the Dataset")
    task26 = Task("dataset_toxicity", "aggregate_score", "Toxicity of the Dataset")


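# Illustrative usage only (a minimal sketch, not part of the app logic): iterating
# the Tasks enum yields each member, so the leaderboard column headers can be
# derived from col_name. The variable name EXAMPLE_COL_NAMES is hypothetical.
EXAMPLE_COL_NAMES = [task.value.col_name for task in Tasks]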


# Your leaderboard name
TITLE = """<h1 align="center" id="space-title">EU AI Act Compliance Leaderboard</h1>"""

# Which evaluations are you running? How can people reproduce what you have?
LLM_BENCHMARKS_TEXT = """
"""

EVALUATION_QUEUE_TEXT = """
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
CITATION_BUTTON_TEXT = r"""
@article{complai24,
      title={COMPL-AI Framework: A Technical Interpretation and LLM Benchmarking Suite for the EU Artificial Intelligence Act}, 
      author={Philipp Guldimann and Alexander Spiridonov and Robin Staab and Nikola Jovanovi\'{c} and Mark Vero and Velko Vechev and Anna Gueorguieva and Mislav Balunovi\'{c} and Nikola Konstantinov and Pavol Bielik and Petar Tsankov and Martin Vechev},
      year={2024},
      eprint={2410.07959},
      archivePrefix={arXiv},
      primaryClass={cs.CL},
      url={https://arxiv.org/abs/2410.07959},
}
"""