"""Task definitions and display text for the ContextualBench leaderboard.

This module only declares constants: the benchmark columns (``Tasks``), the
few-shot setting, and the Markdown/plain-text snippets shown in the UI.
"""

from dataclasses import dataclass
from enum import Enum


@dataclass
class Task:
    """One benchmark column of the leaderboard."""

    benchmark: str  # task key in the results JSON file
    metric: str  # metric key in the results JSON file
    col_name: str  # name to display in the leaderboard


# Select your tasks here
# ---------------------------------------------------
class Tasks(Enum):
    """Benchmarks reported on the leaderboard, one member per column."""

    # Task(task_key in the json file, metric_key in the json file,
    #      name to display in the leaderboard)
    # Currently disabled:
    # task0 = Task("boolq", "acc", "BoolQA")
    task1 = Task("trivia", "EM", "TriviaQA")
    task2 = Task("truthfulqa", "EM", "TruthfulQA")
    task3 = Task("popqa", "acc", "PopQA")
    task4 = Task("hpqa", "EM", "HotpotQA")
    task5 = Task("nq", "EM", "Natural Questions")
    task6 = Task("2wiki", "EM", "2WikiMultiHop")
    task7 = Task("musique", "EM", "MuSiQue")
    # Template examples, kept for reference:
    # task0 = Task("anli_r1", "acc", "ANLI")
    # task1 = Task("logiqa", "acc_norm", "LogiQA")


NUM_FEWSHOT = 0  # Change with your few shot
# ---------------------------------------------------


# Your leaderboard name
TITLE = """

Contextual Evaluation Leaderboard

"""

# What does your leaderboard evaluate?
INTRODUCTION_TEXT = """
The ContextualBench leaderboard serves as a dynamic platform for showcasing the performance of Large Language Models (LLMs) evaluated using the ContextualBench framework. This leaderboard provides a comprehensive overview of how different LLMs perform across a variety of contextual tasks, enabling researchers and developers to compare and contrast the capabilities of various models effectively. By presenting results derived from rigorous evaluations on diverse datasets, the leaderboard aims to facilitate a deeper understanding of each model's strengths and weaknesses in handling large context inputs. It also encourages continuous improvement and innovation in the development of LLMs, ensuring that the latest advancements are reflected in the leaderboard's rankings.
"""

# Which evaluations are you running? how can people reproduce what you have?
# NOTE: deliberately a plain (non-f) string: nothing is interpolated, and the
# instructions must be free to contain literal braces. Every code fence sits on
# its own line with three backticks so Markdown renders it as a code block.
LLM_BENCHMARKS_TEXT = """
How It Works

To participate in the ContextualBench leaderboard, follow these steps to evaluate your Large Language Model (LLM) using the ContextualBench framework:

Clone the Repository: Start by cloning the ContextualBench GitHub repository to your local machine using the following command:
```bash
git clone https://github.com/SalesforceAIResearch/SFR-RAG
```

Navigate to the Directory: Move into the cloned repository's directory:
```bash
cd ContextualBench
```

Install Dependencies: Install all necessary dependencies by executing:
```bash
pip install -r requirements.txt
```

Prepare Your Model and Dataset: Set up your model and dataset according to the guidelines provided in the repository's documentation.

Run the Evaluation Script: Execute the evaluation script to generate outputs for your model on the specified dataset:
```bash
python run.py [dataset_name]
```

Collect and Format Outputs: Gather the outputs generated for each dataset and format them according to the leaderboard submission guidelines.

Submit Your Results: Email the formatted outputs to the author's email address for evaluation. Our team will assess the performance and update the leaderboard accordingly.

Reproducibility

Ensuring reproducibility is a key aspect of the ContextualBench leaderboard. By following the standardized steps outlined above, participants can consistently reproduce evaluation results. This process not only facilitates fair comparisons across different models but also encourages transparency and reliability in model assessments. Participants are encouraged to adhere strictly to the submission guidelines to ensure their results are accurately reflected on the leaderboard.
"""

# Intentionally blank: no submission-queue text is defined.
EVALUATION_QUEUE_TEXT = """
"""

CITATION_BUTTON_LABEL = "Copy the following snippet to cite these results"
# Raw string so BibTeX braces/backslashes are never treated as escapes.
CITATION_BUTTON_TEXT = r"""
```
@article{nguyen2024sfrrag,
  title={SFR-RAG: Towards Contextually Faithful LLMs},
  author={Nguyen, Xuan-Phi and Pandit, Shrey and Purushwalkam, Senthil and Xu, Austin and Chen, Hailin and Ming, Yifei and Ke, Zixuan and Savarese, Silvio and Xiong, Caiming and Joty, Shafiq},
  year={2024}
}
```
"""