# Page title shown at the top of the leaderboard UI.
# NOTE: the original used a single-quoted string spanning multiple lines,
# which is a SyntaxError in Python; a triple-quoted string preserves the
# intended value (leading and trailing newline included).
TITLE = '''
Open Multilingual LLM Evaluation Leaderboard
'''
# Markdown "About" section: scope of the leaderboard, the 29 covered
# languages, and the four evaluation benchmarks.
# The `f` prefix was dropped: the string contains no placeholders (F541).
INTRO_TEXT = """
## About
This leaderboard tracks progress and ranks performance of large language models (LLMs) developed for different languages,
emphasizing on non-English languages to democratize benefits of LLMs to broader society.
Our current leaderboard provides evaluation data for 29 languages, i.e.,
Arabic, Armenian, Basque, Bengali, Catalan, Chinese, Croatian, Danish, Dutch,
French, German, Gujarati, Hindi, Hungarian, Indonesian, Italian, Kannada, Malayalam,
Marathi, Nepali, Portuguese, Romanian, Russian, Serbian, Slovak, Spanish, Swedish,
Tamil, Telugu, Ukrainian, and Vietnamese, that will be expanded along the way.
Both multilingual and language-specific LLMs are welcome in this leaderboard.
We currently evaluate models over four benchmarks:
- AI2 Reasoning Challenge (25-shot)
- HellaSwag (0-shot)
- MMLU (25-shot)
- TruthfulQA (0-shot)
The evaluation data was translated into these languages using ChatGPT (gpt-35-turbo).
"""
# Markdown section with submission instructions (run the eval harness,
# then open a pull request with the evaluation log).
# The `f` prefix was dropped: the string contains no placeholders (F541).
HOW_TO = """
## How to list your model performance on this leaderboard:
Run the evaluation of your model using this repo: https://github.com/laiviet/lm-evaluation-harness.
And then, push the evaluation log and make a pull request.
"""
# Markdown credits section acknowledging datasets, funding, and the
# upstream evaluation/leaderboard codebases.
# The `f` prefix was dropped: the string contains no placeholders (F541).
CREDIT = """
## Credit
To make this website, we use the following resources:
- Datasets (AI2_ARC, HellaSwag, MMLU, TruthfulQA)
- Funding and GPU access (Adobe Research)
- Evaluation code (EleutherAI's lm_evaluation_harness repo)
- Leaderboard code (Huggingface4's open_llm_leaderboard repo)
"""
# Markdown citation section containing the BibTeX entry for this project.
# The original used an f-string with doubled braces ({{ / }}) only to emit
# literal braces; a plain string with single braces yields the identical
# runtime value and is far easier to read and edit.
CITATION = """
## Citation
```
@misc{lai2023openllmbenchmark,
author = {Viet Lai and Nghia Trung Ngo and Amir Pouran Ben Veyseh and Franck Dernoncourt and Thien Huu Nguyen},
title={Open Multilingual LLM Evaluation Leaderboard},
year={2023}
}
```
"""