diff --git "a/lm_evaluation_harness.ipynb" "b/lm_evaluation_harness.ipynb" new file mode 100644--- /dev/null +++ "b/lm_evaluation_harness.ipynb" @@ -0,0 +1,713 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "id": "Ac6wadk3rmkK" + }, + "source": [ + "# LM Evaluation Harness (by [EleutherAI](https://www.eleuther.ai/))\n", + "\n", + "This [`LM-Evaluation-Harness`](https://github.com/EleutherAI/lm-evaluation-harness) provides a unified framework to test generative language models on a large number of different evaluation tasks. For a complete list of available tasks, see the [task table](https://github.com/EleutherAI/lm-evaluation-harness/blob/master/docs/task_table.md), or scroll to the bottom of the page.\n", + "\n", + "1. Clone the [lm-evaluation-harness](https://github.com/EleutherAI/lm-evaluation-harness) and install the necessary libraries (`sentencepiece` is required for the Llama tokenizer)." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "UA5I86u91e0A", + "outputId": "d74b3cab-b292-43db-bd5d-523424d2c97a" + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "Cloning into 'lm-evaluation-harness'...\n", + "remote: Enumerating objects: 22343, done.\u001b[K\n", + "remote: Counting objects: 100% (7096/7096), done.\u001b[K\n", + "remote: Compressing objects: 100% (703/703), done.\u001b[K\n", + "remote: Total 22343 (delta 6540), reused 6659 (delta 6392), pack-reused 15247\u001b[K\n", + "Receiving objects: 100% (22343/22343), 20.57 MiB | 11.37 MiB/s, done.\n", + "Resolving deltas: 100% (15456/15456), done.\n", + "Obtaining file:///content/lm-evaluation-harness\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Collecting datasets>=2.0.0 (from lm-eval==0.3.0)\n", + " Downloading datasets-2.14.5-py3-none-any.whl (519 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m519.6/519.6 kB\u001b[0m \u001b[31m8.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting einops (from lm-eval==0.3.0)\n", + " Downloading einops-0.7.0-py3-none-any.whl (44 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m44.6/44.6 kB\u001b[0m \u001b[31m5.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting jsonlines (from lm-eval==0.3.0)\n", + " Downloading jsonlines-4.0.0-py3-none-any.whl (8.7 kB)\n", + "Requirement already satisfied: numexpr in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (2.8.7)\n", + "Collecting openai>=0.6.4 (from lm-eval==0.3.0)\n", + " Downloading openai-0.28.1-py3-none-any.whl (76 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m77.0/77.0 kB\u001b[0m \u001b[31m10.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting omegaconf>=2.2 (from lm-eval==0.3.0)\n", + " Downloading omegaconf-2.3.0-py3-none-any.whl (79 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m79.5/79.5 kB\u001b[0m \u001b[31m9.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting peft>=0.2.0 (from lm-eval==0.3.0)\n", + " Downloading peft-0.5.0-py3-none-any.whl (85 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m85.6/85.6 kB\u001b[0m \u001b[31m11.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pybind11>=2.6.2 (from lm-eval==0.3.0)\n", + " Downloading pybind11-2.11.1-py3-none-any.whl (227 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m227.7/227.7 kB\u001b[0m \u001b[31m26.2 MB/s\u001b[0m eta 
\u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting pycountry (from lm-eval==0.3.0)\n", + " Downloading pycountry-22.3.5.tar.gz (10.1 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m10.1/10.1 MB\u001b[0m \u001b[31m85.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Installing build dependencies ... \u001b[?25l\u001b[?25hdone\n", + " Getting requirements to build wheel ... \u001b[?25l\u001b[?25hdone\n", + " Preparing metadata (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting pytablewriter (from lm-eval==0.3.0)\n", + " Downloading pytablewriter-1.2.0-py3-none-any.whl (111 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m111.1/111.1 kB\u001b[0m \u001b[31m14.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting rouge-score>=0.0.4 (from lm-eval==0.3.0)\n", + " Downloading rouge_score-0.1.2.tar.gz (17 kB)\n", + " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting sacrebleu==1.5.0 (from lm-eval==0.3.0)\n", + " Downloading sacrebleu-1.5.0-py3-none-any.whl (65 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m65.6/65.6 kB\u001b[0m \u001b[31m9.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: scikit-learn>=0.24.1 in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (1.2.2)\n", + "Collecting sqlitedict (from lm-eval==0.3.0)\n", + " Downloading sqlitedict-2.1.0.tar.gz (21 kB)\n", + " Preparing metadata (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + "Requirement already satisfied: torch>=1.7 in /usr/local/lib/python3.10/dist-packages (from lm-eval==0.3.0) (2.0.1+cu118)\n", + "Collecting tqdm-multiprocess (from lm-eval==0.3.0)\n", + " Downloading tqdm_multiprocess-0.0.11-py3-none-any.whl (9.8 kB)\n", + "Collecting transformers>=4.1 (from lm-eval==0.3.0)\n", + " Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m7.7/7.7 MB\u001b[0m \u001b[31m63.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting zstandard (from lm-eval==0.3.0)\n", + " Downloading zstandard-0.21.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m85.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting accelerate>=0.17.1 (from lm-eval==0.3.0)\n", + " Downloading accelerate-0.23.0-py3-none-any.whl (258 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m258.1/258.1 kB\u001b[0m \u001b[31m25.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting portalocker (from sacrebleu==1.5.0->lm-eval==0.3.0)\n", + " Downloading portalocker-2.8.2-py3-none-any.whl (17 kB)\n", + "Requirement already satisfied: numpy>=1.17 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (1.23.5)\n", + "Requirement already satisfied: packaging>=20.0 in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (23.2)\n", + "Requirement already satisfied: psutil in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (5.9.5)\n", + "Requirement already satisfied: pyyaml in /usr/local/lib/python3.10/dist-packages (from accelerate>=0.17.1->lm-eval==0.3.0) (6.0.1)\n", + "Collecting huggingface-hub (from 
accelerate>=0.17.1->lm-eval==0.3.0)\n", + " Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m302.0/302.0 kB\u001b[0m \u001b[31m27.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pyarrow>=8.0.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (9.0.0)\n", + "Collecting dill<0.3.8,>=0.3.0 (from datasets>=2.0.0->lm-eval==0.3.0)\n", + " Downloading dill-0.3.7-py3-none-any.whl (115 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m115.3/115.3 kB\u001b[0m \u001b[31m13.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: pandas in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (1.5.3)\n", + "Requirement already satisfied: requests>=2.19.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (2.31.0)\n", + "Requirement already satisfied: tqdm>=4.62.1 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (4.66.1)\n", + "Collecting xxhash (from datasets>=2.0.0->lm-eval==0.3.0)\n", + " Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m194.1/194.1 kB\u001b[0m \u001b[31m21.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting multiprocess (from datasets>=2.0.0->lm-eval==0.3.0)\n", + " Downloading multiprocess-0.70.15-py310-none-any.whl (134 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m134.8/134.8 kB\u001b[0m \u001b[31m16.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: fsspec[http]<2023.9.0,>=2023.1.0 in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) 
(2023.6.0)\n", + "Requirement already satisfied: aiohttp in /usr/local/lib/python3.10/dist-packages (from datasets>=2.0.0->lm-eval==0.3.0) (3.8.6)\n", + "Collecting antlr4-python3-runtime==4.9.* (from omegaconf>=2.2->lm-eval==0.3.0)\n", + " Downloading antlr4-python3-runtime-4.9.3.tar.gz (117 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m117.0/117.0 kB\u001b[0m \u001b[31m15.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + "Collecting safetensors (from peft>=0.2.0->lm-eval==0.3.0)\n", + " Downloading safetensors-0.4.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m66.8 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: absl-py in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (1.4.0)\n", + "Requirement already satisfied: nltk in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (3.8.1)\n", + "Requirement already satisfied: six>=1.14.0 in /usr/local/lib/python3.10/dist-packages (from rouge-score>=0.0.4->lm-eval==0.3.0) (1.16.0)\n", + "Requirement already satisfied: scipy>=1.3.2 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (1.11.3)\n", + "Requirement already satisfied: joblib>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (1.3.2)\n", + "Requirement already satisfied: threadpoolctl>=2.0.0 in /usr/local/lib/python3.10/dist-packages (from scikit-learn>=0.24.1->lm-eval==0.3.0) (3.2.0)\n", + "Requirement already satisfied: filelock in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.12.4)\n", + "Requirement already satisfied: typing-extensions in /usr/local/lib/python3.10/dist-packages 
(from torch>=1.7->lm-eval==0.3.0) (4.5.0)\n", + "Requirement already satisfied: sympy in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (1.12)\n", + "Requirement already satisfied: networkx in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.1)\n", + "Requirement already satisfied: jinja2 in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (3.1.2)\n", + "Requirement already satisfied: triton==2.0.0 in /usr/local/lib/python3.10/dist-packages (from torch>=1.7->lm-eval==0.3.0) (2.0.0)\n", + "Requirement already satisfied: cmake in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.7->lm-eval==0.3.0) (3.27.6)\n", + "Requirement already satisfied: lit in /usr/local/lib/python3.10/dist-packages (from triton==2.0.0->torch>=1.7->lm-eval==0.3.0) (17.0.2)\n", + "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.10/dist-packages (from transformers>=4.1->lm-eval==0.3.0) (2023.6.3)\n", + "Collecting tokenizers<0.15,>=0.14 (from transformers>=4.1->lm-eval==0.3.0)\n", + " Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.8 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.8/3.8 MB\u001b[0m \u001b[31m118.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: attrs>=19.2.0 in /usr/local/lib/python3.10/dist-packages (from jsonlines->lm-eval==0.3.0) (23.1.0)\n", + "Requirement already satisfied: setuptools in /usr/local/lib/python3.10/dist-packages (from pycountry->lm-eval==0.3.0) (67.7.2)\n", + "Collecting DataProperty<2,>=1.0.1 (from pytablewriter->lm-eval==0.3.0)\n", + " Downloading DataProperty-1.0.1-py3-none-any.whl (27 kB)\n", + "Collecting mbstrdecoder<2,>=1.0.0 (from pytablewriter->lm-eval==0.3.0)\n", + " Downloading mbstrdecoder-1.1.3-py3-none-any.whl (7.8 kB)\n", + "Collecting pathvalidate<4,>=2.3.0 (from 
pytablewriter->lm-eval==0.3.0)\n", + " Downloading pathvalidate-3.2.0-py3-none-any.whl (23 kB)\n", + "Collecting tabledata<2,>=1.3.1 (from pytablewriter->lm-eval==0.3.0)\n", + " Downloading tabledata-1.3.3-py3-none-any.whl (11 kB)\n", + "Collecting tcolorpy<1,>=0.0.5 (from pytablewriter->lm-eval==0.3.0)\n", + " Downloading tcolorpy-0.1.4-py3-none-any.whl (7.9 kB)\n", + "Collecting typepy[datetime]<2,>=1.3.2 (from pytablewriter->lm-eval==0.3.0)\n", + " Downloading typepy-1.3.2-py3-none-any.whl (31 kB)\n", + "Collecting colorama (from tqdm-multiprocess->lm-eval==0.3.0)\n", + " Downloading colorama-0.4.6-py2.py3-none-any.whl (25 kB)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (3.3.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp->datasets>=2.0.0->lm-eval==0.3.0) (1.3.1)\n", + "Requirement already satisfied: chardet<6,>=3.0.4 in /usr/local/lib/python3.10/dist-packages (from mbstrdecoder<2,>=1.0.0->pytablewriter->lm-eval==0.3.0) (5.2.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (3.4)\n", + "Requirement already satisfied: urllib3<3,>=1.21.1 in 
/usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (2.0.6)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests>=2.19.0->datasets>=2.0.0->lm-eval==0.3.0) (2023.7.22)\n", + "Collecting huggingface-hub (from accelerate>=0.17.1->lm-eval==0.3.0)\n", + " Downloading huggingface_hub-0.17.3-py3-none-any.whl (295 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m295.0/295.0 kB\u001b[0m \u001b[31m34.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: python-dateutil<3.0.0,>=2.8.0 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==0.3.0) (2.8.2)\n", + "Requirement already satisfied: pytz>=2018.9 in /usr/local/lib/python3.10/dist-packages (from typepy[datetime]<2,>=1.3.2->pytablewriter->lm-eval==0.3.0) (2023.3.post1)\n", + "Requirement already satisfied: MarkupSafe>=2.0 in /usr/local/lib/python3.10/dist-packages (from jinja2->torch>=1.7->lm-eval==0.3.0) (2.1.3)\n", + "Requirement already satisfied: click in /usr/local/lib/python3.10/dist-packages (from nltk->rouge-score>=0.0.4->lm-eval==0.3.0) (8.1.7)\n", + "Requirement already satisfied: mpmath>=0.19 in /usr/local/lib/python3.10/dist-packages (from sympy->torch>=1.7->lm-eval==0.3.0) (1.3.0)\n", + "Building wheels for collected packages: antlr4-python3-runtime, rouge-score, pycountry, sqlitedict\n", + " Building wheel for antlr4-python3-runtime (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for antlr4-python3-runtime: filename=antlr4_python3_runtime-4.9.3-py3-none-any.whl size=144554 sha256=a2f0b8953193e72a5cc4d402cd57becdaf2e11c29b664a7bc1dd0a2be7b14c34\n", + " Stored in directory: /root/.cache/pip/wheels/12/93/dd/1f6a127edc45659556564c5730f6d4e300888f4bca2d4c5a88\n", + " Building wheel for rouge-score (setup.py) ... 
\u001b[?25l\u001b[?25hdone\n", + " Created wheel for rouge-score: filename=rouge_score-0.1.2-py3-none-any.whl size=24932 sha256=b800533290e8b115b69386f5528faaeec21bdaf0b27df954f91293ce884d2fae\n", + " Stored in directory: /root/.cache/pip/wheels/5f/dd/89/461065a73be61a532ff8599a28e9beef17985c9e9c31e541b4\n", + " Building wheel for pycountry (pyproject.toml) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for pycountry: filename=pycountry-22.3.5-py2.py3-none-any.whl size=10681833 sha256=c76dd8d8880795167eba1833e4b4f85fd1d2989d3e3c2a3c14ac581d784ec607\n", + " Stored in directory: /root/.cache/pip/wheels/03/57/cc/290c5252ec97a6d78d36479a3c5e5ecc76318afcb241ad9dbe\n", + " Building wheel for sqlitedict (setup.py) ... \u001b[?25l\u001b[?25hdone\n", + " Created wheel for sqlitedict: filename=sqlitedict-2.1.0-py3-none-any.whl size=16864 sha256=38ab29686a73c7df8c33252ed6b8986475d0548f9adf841373e4ecaf8d995201\n", + " Stored in directory: /root/.cache/pip/wheels/79/d6/e7/304e0e6cb2221022c26d8161f7c23cd4f259a9e41e8bbcfabd\n", + "Successfully built antlr4-python3-runtime rouge-score pycountry sqlitedict\n", + "Installing collected packages: sqlitedict, antlr4-python3-runtime, zstandard, xxhash, tcolorpy, safetensors, pycountry, pybind11, portalocker, pathvalidate, omegaconf, mbstrdecoder, jsonlines, einops, dill, colorama, typepy, tqdm-multiprocess, sacrebleu, rouge-score, multiprocess, huggingface-hub, tokenizers, openai, transformers, datasets, DataProperty, tabledata, pytablewriter, accelerate, peft, lm-eval\n", + " Running setup.py develop for lm-eval\n", + "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\n", + "llmx 0.0.15a0 requires cohere, which is not installed.\n", + "llmx 0.0.15a0 requires tiktoken, which is not installed.\u001b[0m\u001b[31m\n", + "\u001b[0mSuccessfully installed DataProperty-1.0.1 accelerate-0.23.0 antlr4-python3-runtime-4.9.3 colorama-0.4.6 datasets-2.14.5 dill-0.3.7 einops-0.7.0 huggingface-hub-0.17.3 jsonlines-4.0.0 lm-eval-0.3.0 mbstrdecoder-1.1.3 multiprocess-0.70.15 omegaconf-2.3.0 openai-0.28.1 pathvalidate-3.2.0 peft-0.5.0 portalocker-2.8.2 pybind11-2.11.1 pycountry-22.3.5 pytablewriter-1.2.0 rouge-score-0.1.2 sacrebleu-1.5.0 safetensors-0.4.0 sqlitedict-2.1.0 tabledata-1.3.3 tcolorpy-0.1.4 tokenizers-0.14.1 tqdm-multiprocess-0.0.11 transformers-4.34.0 typepy-1.3.2 xxhash-3.4.1 zstandard-0.21.0\n", + "Collecting cohere\n", + " Downloading cohere-4.30-py3-none-any.whl (47 kB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m47.8/47.8 kB\u001b[0m \u001b[31m1.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting tiktoken\n", + " Downloading tiktoken-0.5.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.0 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.0/2.0 MB\u001b[0m \u001b[31m30.2 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hCollecting sentencepiece\n", + " Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.3/1.3 MB\u001b[0m \u001b[31m75.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: aiohttp<4.0,>=3.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (3.8.6)\n", + "Collecting backoff<3.0,>=2.0 (from cohere)\n", + " Downloading backoff-2.2.1-py3-none-any.whl (15 kB)\n", + "Collecting fastavro==1.8.2 (from cohere)\n", + " Downloading 
fastavro-1.8.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (2.7 MB)\n", + "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m2.7/2.7 MB\u001b[0m \u001b[31m97.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", + "\u001b[?25hRequirement already satisfied: importlib_metadata<7.0,>=6.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (6.8.0)\n", + "Requirement already satisfied: requests<3.0.0,>=2.25.0 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.31.0)\n", + "Requirement already satisfied: urllib3<3,>=1.26 in /usr/local/lib/python3.10/dist-packages (from cohere) (2.0.6)\n", + "Requirement already satisfied: regex>=2022.1.18 in /usr/local/lib/python3.10/dist-packages (from tiktoken) (2023.6.3)\n", + "Requirement already satisfied: attrs>=17.3.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (23.1.0)\n", + "Requirement already satisfied: charset-normalizer<4.0,>=2.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (3.3.0)\n", + "Requirement already satisfied: multidict<7.0,>=4.5 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (6.0.4)\n", + "Requirement already satisfied: async-timeout<5.0,>=4.0.0a3 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (4.0.3)\n", + "Requirement already satisfied: yarl<2.0,>=1.0 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.9.2)\n", + "Requirement already satisfied: frozenlist>=1.1.1 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.4.0)\n", + "Requirement already satisfied: aiosignal>=1.1.2 in /usr/local/lib/python3.10/dist-packages (from aiohttp<4.0,>=3.0->cohere) (1.3.1)\n", + "Requirement already satisfied: zipp>=0.5 in /usr/local/lib/python3.10/dist-packages (from importlib_metadata<7.0,>=6.0->cohere) (3.17.0)\n", + "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.10/dist-packages 
(from requests<3.0.0,>=2.25.0->cohere) (3.4)\n", + "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.10/dist-packages (from requests<3.0.0,>=2.25.0->cohere) (2023.7.22)\n", + "Installing collected packages: sentencepiece, fastavro, backoff, tiktoken, cohere\n", + "Successfully installed backoff-2.2.1 cohere-4.30 fastavro-1.8.2 sentencepiece-0.1.99 tiktoken-0.5.1\n" + ] + } + ], + "source": [ + "!git clone https://github.com/EleutherAI/lm-evaluation-harness\n", + "%pip install -e ./lm-evaluation-harness\n", + "%pip install cohere tiktoken sentencepiece" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "colab": { + "base_uri": "https://localhost:8080/" + }, + "id": "pnHoAVK25QZn", + "outputId": "4253b115-702c-4f31-f1b3-f0483c527841" + }, + "outputs": [], + "source": [ + "!cd lm-evaluation-harness && python main.py \\\n", + " --model hf-causal \\\n", + " --model_args pretrained=nicholasKluge/Aira-2-1B1 \\\n", + " --tasks hendrycksTest-abstract_algebra,hendrycksTest-anatomy,hendrycksTest-astronomy,hendrycksTest-business_ethics,hendrycksTest-clinical_knowledge,hendrycksTest-college_biology,hendrycksTest-college_chemistry,hendrycksTest-college_computer_science,hendrycksTest-college_mathematics,hendrycksTest-college_medicine,hendrycksTest-college_physics,hendrycksTest-computer_security,hendrycksTest-conceptual_physics,hendrycksTest-econometrics,hendrycksTest-electrical_engineering,hendrycksTest-elementary_mathematics,hendrycksTest-formal_logic,hendrycksTest-global_facts,hendrycksTest-high_school_biology,hendrycksTest-high_school_chemistry,hendrycksTest-high_school_computer_science,hendrycksTest-high_school_european_history,hendrycksTest-high_school_geography,hendrycksTest-high_school_government_and_politics,hendrycksTest-high_school_macroeconomics,hendrycksTest-high_school_mathematics,hendrycksTest-high_school_microeconomics,hendrycksTest-high_school_physics,hendrycksTest-high_school_psychology,hendrycksTest-
high_school_statistics,hendrycksTest-high_school_us_history,hendrycksTest-high_school_world_history,hendrycksTest-human_aging,hendrycksTest-human_sexuality,hendrycksTest-international_law,hendrycksTest-jurisprudence,hendrycksTest-logical_fallacies,hendrycksTest-machine_learning,hendrycksTest-management,hendrycksTest-marketing,hendrycksTest-medical_genetics,hendrycksTest-miscellaneous,hendrycksTest-moral_disputes,hendrycksTest-moral_scenarios,hendrycksTest-nutrition,hendrycksTest-philosophy,hendrycksTest-prehistory,hendrycksTest-professional_accounting,hendrycksTest-professional_law,hendrycksTest-professional_medicine,hendrycksTest-professional_psychology,hendrycksTest-public_relations,hendrycksTest-security_studies,hendrycksTest-sociology,hendrycksTest-us_foreign_policy,hendrycksTest-virology,hendrycksTest-world_religions \\\n", + " --device cuda:0" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "id": "4Bm78wiZ4Own" + }, + "source": [ + "## Task Table 📚\n", + "\n", + "| Task Name |Train|Val|Test|Val/Test Docs| Metrics |\n", + "|---------------------------------------------------------|-----|---|----|------------:|---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|\n", + "|anagrams1 | |✓ | | 10000|acc |\n", + "|anagrams2 | |✓ | | 10000|acc |\n", + "|anli_r1 |✓ |✓ |✓ | 1000|acc |\n", + "|anli_r2 |✓ |✓ |✓ | 1000|acc |\n", + "|anli_r3 |✓ |✓ |✓ | 1200|acc |\n", + "|arc_challenge |✓ |✓ |✓ | 1172|acc, acc_norm |\n", + "|arc_easy |✓ |✓ |✓ | 2376|acc, acc_norm |\n", + "|arithmetic_1dc | |✓ | | 2000|acc |\n", + "|arithmetic_2da | |✓ | | 2000|acc |\n", + "|arithmetic_2dm | |✓ | | 2000|acc |\n", + "|arithmetic_2ds | |✓ | | 2000|acc |\n", + "|arithmetic_3da | |✓ | | 2000|acc |\n", + "|arithmetic_3ds | |✓ | | 2000|acc |\n", + "|arithmetic_4da | |✓ | | 2000|acc |\n", + "|arithmetic_4ds | |✓ | | 2000|acc |\n", + "|arithmetic_5da | |✓ | | 
2000|acc |\n", + "|arithmetic_5ds | |✓ | | 2000|acc |\n", + "|bigbench_causal_judgement | | |✓ | 190|multiple_choice_grade, exact_str_match |\n", + "|bigbench_date_understanding | | |✓ | 369|multiple_choice_grade, exact_str_match |\n", + "|bigbench_disambiguation_qa | | |✓ | 258|multiple_choice_grade, exact_str_match |\n", + "|bigbench_dyck_languages | | |✓ | 1000|multiple_choice_grade, exact_str_match |\n", + "|bigbench_formal_fallacies_syllogisms_negation | | |✓ | 14200|multiple_choice_grade, exact_str_match |\n", + "|bigbench_geometric_shapes | | |✓ | 359|multiple_choice_grade, exact_str_match |\n", + "|bigbench_hyperbaton | | |✓ | 50000|multiple_choice_grade, exact_str_match |\n", + "|bigbench_logical_deduction_five_objects | | |✓ | 500|multiple_choice_grade, exact_str_match |\n", + "|bigbench_logical_deduction_seven_objects | | |✓ | 700|multiple_choice_grade, exact_str_match |\n", + "|bigbench_logical_deduction_three_objects | | |✓ | 300|multiple_choice_grade, exact_str_match |\n", + "|bigbench_movie_recommendation | | |✓ | 500|multiple_choice_grade, exact_str_match |\n", + "|bigbench_navigate | | |✓ | 1000|multiple_choice_grade, exact_str_match |\n", + "|bigbench_reasoning_about_colored_objects | | |✓ | 2000|multiple_choice_grade, exact_str_match |\n", + "|bigbench_ruin_names | | |✓ | 448|multiple_choice_grade, exact_str_match |\n", + "|bigbench_salient_translation_error_detection | | |✓ | 998|multiple_choice_grade, exact_str_match |\n", + "|bigbench_snarks | | |✓ | 181|multiple_choice_grade, exact_str_match |\n", + "|bigbench_sports_understanding | | |✓ | 986|multiple_choice_grade, exact_str_match |\n", + "|bigbench_temporal_sequences | | |✓ | 1000|multiple_choice_grade, exact_str_match |\n", + "|bigbench_tracking_shuffled_objects_five_objects | | |✓ | 1250|multiple_choice_grade, exact_str_match |\n", + "|bigbench_tracking_shuffled_objects_seven_objects | | |✓ | 1750|multiple_choice_grade, exact_str_match |\n", + 
"|bigbench_tracking_shuffled_objects_three_objects | | |✓ | 300|multiple_choice_grade, exact_str_match |\n", + "|blimp_adjunct_island | |✓ | | 1000|acc |\n", + "|blimp_anaphor_gender_agreement | |✓ | | 1000|acc |\n", + "|blimp_anaphor_number_agreement | |✓ | | 1000|acc |\n", + "|blimp_animate_subject_passive | |✓ | | 1000|acc |\n", + "|blimp_animate_subject_trans | |✓ | | 1000|acc |\n", + "|blimp_causative | |✓ | | 1000|acc |\n", + "|blimp_complex_NP_island | |✓ | | 1000|acc |\n", + "|blimp_coordinate_structure_constraint_complex_left_branch| |✓ | | 1000|acc |\n", + "|blimp_coordinate_structure_constraint_object_extraction | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_1 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_2 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_irregular_1 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_irregular_2 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_with_adj_2 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_with_adj_irregular_1 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_with_adj_irregular_2 | |✓ | | 1000|acc |\n", + "|blimp_determiner_noun_agreement_with_adjective_1 | |✓ | | 1000|acc |\n", + "|blimp_distractor_agreement_relational_noun | |✓ | | 1000|acc |\n", + "|blimp_distractor_agreement_relative_clause | |✓ | | 1000|acc |\n", + "|blimp_drop_argument | |✓ | | 1000|acc |\n", + "|blimp_ellipsis_n_bar_1 | |✓ | | 1000|acc |\n", + "|blimp_ellipsis_n_bar_2 | |✓ | | 1000|acc |\n", + "|blimp_existential_there_object_raising | |✓ | | 1000|acc |\n", + "|blimp_existential_there_quantifiers_1 | |✓ | | 1000|acc |\n", + "|blimp_existential_there_quantifiers_2 | |✓ | | 1000|acc |\n", + "|blimp_existential_there_subject_raising | |✓ | | 1000|acc |\n", + "|blimp_expletive_it_object_raising | |✓ | | 1000|acc |\n", + "|blimp_inchoative | |✓ | | 1000|acc |\n", + "|blimp_intransitive | |✓ | | 1000|acc |\n", + "|blimp_irregular_past_participle_adjectives | |✓ | 
| 1000|acc |\n", + "|blimp_irregular_past_participle_verbs | |✓ | | 1000|acc |\n", + "|blimp_irregular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc |\n", + "|blimp_irregular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc |\n", + "|blimp_left_branch_island_echo_question | |✓ | | 1000|acc |\n", + "|blimp_left_branch_island_simple_question | |✓ | | 1000|acc |\n", + "|blimp_matrix_question_npi_licensor_present | |✓ | | 1000|acc |\n", + "|blimp_npi_present_1 | |✓ | | 1000|acc |\n", + "|blimp_npi_present_2 | |✓ | | 1000|acc |\n", + "|blimp_only_npi_licensor_present | |✓ | | 1000|acc |\n", + "|blimp_only_npi_scope | |✓ | | 1000|acc |\n", + "|blimp_passive_1 | |✓ | | 1000|acc |\n", + "|blimp_passive_2 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_c_command | |✓ | | 1000|acc |\n", + "|blimp_principle_A_case_1 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_case_2 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_domain_1 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_domain_2 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_domain_3 | |✓ | | 1000|acc |\n", + "|blimp_principle_A_reconstruction | |✓ | | 1000|acc |\n", + "|blimp_regular_plural_subject_verb_agreement_1 | |✓ | | 1000|acc |\n", + "|blimp_regular_plural_subject_verb_agreement_2 | |✓ | | 1000|acc |\n", + "|blimp_sentential_negation_npi_licensor_present | |✓ | | 1000|acc |\n", + "|blimp_sentential_negation_npi_scope | |✓ | | 1000|acc |\n", + "|blimp_sentential_subject_island | |✓ | | 1000|acc |\n", + "|blimp_superlative_quantifiers_1 | |✓ | | 1000|acc |\n", + "|blimp_superlative_quantifiers_2 | |✓ | | 1000|acc |\n", + "|blimp_tough_vs_raising_1 | |✓ | | 1000|acc |\n", + "|blimp_tough_vs_raising_2 | |✓ | | 1000|acc |\n", + "|blimp_transitive | |✓ | | 1000|acc |\n", + "|blimp_wh_island | |✓ | | 1000|acc |\n", + "|blimp_wh_questions_object_gap | |✓ | | 1000|acc |\n", + "|blimp_wh_questions_subject_gap | |✓ | | 1000|acc |\n", + "|blimp_wh_questions_subject_gap_long_distance | |✓ | | 1000|acc |\n", + 
"|blimp_wh_vs_that_no_gap | |✓ | | 1000|acc |\n", + "|blimp_wh_vs_that_no_gap_long_distance | |✓ | | 1000|acc |\n", + "|blimp_wh_vs_that_with_gap | |✓ | | 1000|acc |\n", + "|blimp_wh_vs_that_with_gap_long_distance | |✓ | | 1000|acc |\n", + "|boolq |✓ |✓ | | 3270|acc |\n", + "|cb |✓ |✓ | | 56|acc, f1 |\n", + "|cola |✓ |✓ | | 1043|mcc |\n", + "|copa |✓ |✓ | | 100|acc |\n", + "|coqa |✓ |✓ | | 500|f1, em |\n", + "|crows_pairs_english | |✓ | | 1677|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_age | |✓ | | 91|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_autre | |✓ | | 11|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_disability | |✓ | | 65|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_gender | |✓ | | 320|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_nationality | |✓ | | 216|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_physical_appearance | |✓ | | 72|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_race_color | |✓ | | 508|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_religion | |✓ | | 111|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_sexual_orientation | |✓ | | 93|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_english_socioeconomic | |✓ | | 190|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french | |✓ | | 1677|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_age | |✓ | | 90|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_autre | |✓ | | 13|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_disability | |✓ | | 66|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_gender | |✓ | | 321|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_nationality | |✓ | | 253|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_physical_appearance | |✓ | | 
72|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_race_color | |✓ | | 460|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_religion | |✓ | | 115|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_sexual_orientation | |✓ | | 91|likelihood_difference, pct_stereotype |\n", + "|crows_pairs_french_socioeconomic | |✓ | | 196|likelihood_difference, pct_stereotype |\n", + "|cycle_letters | |✓ | | 10000|acc |\n", + "|drop |✓ |✓ | | 9536|em, f1 |\n", + "|ethics_cm |✓ | |✓ | 3885|acc |\n", + "|ethics_deontology |✓ | |✓ | 3596|acc, em |\n", + "|ethics_justice |✓ | |✓ | 2704|acc, em |\n", + "|ethics_utilitarianism |✓ | |✓ | 4808|acc |\n", + "|ethics_utilitarianism_original | | |✓ | 4808|acc |\n", + "|ethics_virtue |✓ | |✓ | 4975|acc, em |\n", + "|gsm8k |✓ | |✓ | 1319|acc |\n", + "|headqa |✓ |✓ |✓ | 2742|acc, acc_norm |\n", + "|headqa_en |✓ |✓ |✓ | 2742|acc, acc_norm |\n", + "|headqa_es |✓ |✓ |✓ | 2742|acc, acc_norm |\n", + "|hellaswag |✓ |✓ | | 10042|acc, acc_norm |\n", + "|hendrycksTest-abstract_algebra | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-anatomy | |✓ |✓ | 135|acc, acc_norm |\n", + "|hendrycksTest-astronomy | |✓ |✓ | 152|acc, acc_norm |\n", + "|hendrycksTest-business_ethics | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-clinical_knowledge | |✓ |✓ | 265|acc, acc_norm |\n", + "|hendrycksTest-college_biology | |✓ |✓ | 144|acc, acc_norm |\n", + "|hendrycksTest-college_chemistry | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-college_computer_science | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-college_mathematics | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-college_medicine | |✓ |✓ | 173|acc, acc_norm |\n", + "|hendrycksTest-college_physics | |✓ |✓ | 102|acc, acc_norm |\n", + "|hendrycksTest-computer_security | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-conceptual_physics | |✓ |✓ | 235|acc, acc_norm |\n", + "|hendrycksTest-econometrics | |✓ |✓ | 114|acc, acc_norm |\n", + 
"|hendrycksTest-electrical_engineering | |✓ |✓ | 145|acc, acc_norm |\n", + "|hendrycksTest-elementary_mathematics | |✓ |✓ | 378|acc, acc_norm |\n", + "|hendrycksTest-formal_logic | |✓ |✓ | 126|acc, acc_norm |\n", + "|hendrycksTest-global_facts | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-high_school_biology | |✓ |✓ | 310|acc, acc_norm |\n", + "|hendrycksTest-high_school_chemistry | |✓ |✓ | 203|acc, acc_norm |\n", + "|hendrycksTest-high_school_computer_science | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-high_school_european_history | |✓ |✓ | 165|acc, acc_norm |\n", + "|hendrycksTest-high_school_geography | |✓ |✓ | 198|acc, acc_norm |\n", + "|hendrycksTest-high_school_government_and_politics | |✓ |✓ | 193|acc, acc_norm |\n", + "|hendrycksTest-high_school_macroeconomics | |✓ |✓ | 390|acc, acc_norm |\n", + "|hendrycksTest-high_school_mathematics | |✓ |✓ | 270|acc, acc_norm |\n", + "|hendrycksTest-high_school_microeconomics | |✓ |✓ | 238|acc, acc_norm |\n", + "|hendrycksTest-high_school_physics | |✓ |✓ | 151|acc, acc_norm |\n", + "|hendrycksTest-high_school_psychology | |✓ |✓ | 545|acc, acc_norm |\n", + "|hendrycksTest-high_school_statistics | |✓ |✓ | 216|acc, acc_norm |\n", + "|hendrycksTest-high_school_us_history | |✓ |✓ | 204|acc, acc_norm |\n", + "|hendrycksTest-high_school_world_history | |✓ |✓ | 237|acc, acc_norm |\n", + "|hendrycksTest-human_aging | |✓ |✓ | 223|acc, acc_norm |\n", + "|hendrycksTest-human_sexuality | |✓ |✓ | 131|acc, acc_norm |\n", + "|hendrycksTest-international_law | |✓ |✓ | 121|acc, acc_norm |\n", + "|hendrycksTest-jurisprudence | |✓ |✓ | 108|acc, acc_norm |\n", + "|hendrycksTest-logical_fallacies | |✓ |✓ | 163|acc, acc_norm |\n", + "|hendrycksTest-machine_learning | |✓ |✓ | 112|acc, acc_norm |\n", + "|hendrycksTest-management | |✓ |✓ | 103|acc, acc_norm |\n", + "|hendrycksTest-marketing | |✓ |✓ | 234|acc, acc_norm |\n", + "|hendrycksTest-medical_genetics | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-miscellaneous | |✓ |✓ | 
783|acc, acc_norm |\n", + "|hendrycksTest-moral_disputes | |✓ |✓ | 346|acc, acc_norm |\n", + "|hendrycksTest-moral_scenarios | |✓ |✓ | 895|acc, acc_norm |\n", + "|hendrycksTest-nutrition | |✓ |✓ | 306|acc, acc_norm |\n", + "|hendrycksTest-philosophy | |✓ |✓ | 311|acc, acc_norm |\n", + "|hendrycksTest-prehistory | |✓ |✓ | 324|acc, acc_norm |\n", + "|hendrycksTest-professional_accounting | |✓ |✓ | 282|acc, acc_norm |\n", + "|hendrycksTest-professional_law | |✓ |✓ | 1534|acc, acc_norm |\n", + "|hendrycksTest-professional_medicine | |✓ |✓ | 272|acc, acc_norm |\n", + "|hendrycksTest-professional_psychology | |✓ |✓ | 612|acc, acc_norm |\n", + "|hendrycksTest-public_relations | |✓ |✓ | 110|acc, acc_norm |\n", + "|hendrycksTest-security_studies | |✓ |✓ | 245|acc, acc_norm |\n", + "|hendrycksTest-sociology | |✓ |✓ | 201|acc, acc_norm |\n", + "|hendrycksTest-us_foreign_policy | |✓ |✓ | 100|acc, acc_norm |\n", + "|hendrycksTest-virology | |✓ |✓ | 166|acc, acc_norm |\n", + "|hendrycksTest-world_religions | |✓ |✓ | 171|acc, acc_norm |\n", + "|iwslt17-ar-en | | |✓ | 1460|bleu, chrf, ter |\n", + "|iwslt17-en-ar | | |✓ | 1460|bleu, chrf, ter |\n", + "|lambada_openai | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_cloze | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_mt_de | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_mt_en | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_mt_es | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_mt_fr | | |✓ | 5153|ppl, acc |\n", + "|lambada_openai_mt_it | | |✓ | 5153|ppl, acc |\n", + "|lambada_standard | |✓ |✓ | 5153|ppl, acc |\n", + "|lambada_standard_cloze | |✓ |✓ | 5153|ppl, acc |\n", + "|logiqa |✓ |✓ |✓ | 651|acc, acc_norm |\n", + "|math_algebra |✓ | |✓ | 1187|acc |\n", + "|math_asdiv | |✓ | | 2305|acc |\n", + "|math_counting_and_prob |✓ | |✓ | 474|acc |\n", + "|math_geometry |✓ | |✓ | 479|acc |\n", + "|math_intermediate_algebra |✓ | |✓ | 903|acc |\n", + "|math_num_theory |✓ | |✓ | 540|acc |\n", + "|math_prealgebra |✓ | |✓ | 871|acc |\n", + 
"|math_precalc |✓ | |✓ | 546|acc |\n", + "|mathqa |✓ |✓ |✓ | 2985|acc, acc_norm |\n", + "|mc_taco | |✓ |✓ | 9442|f1, em |\n", + "|mgsm_bn |✓ | |✓ | 250|acc |\n", + "|mgsm_de |✓ | |✓ | 250|acc |\n", + "|mgsm_en |✓ | |✓ | 250|acc |\n", + "|mgsm_es |✓ | |✓ | 250|acc |\n", + "|mgsm_fr |✓ | |✓ | 250|acc |\n", + "|mgsm_ja |✓ | |✓ | 250|acc |\n", + "|mgsm_ru |✓ | |✓ | 250|acc |\n", + "|mgsm_sw |✓ | |✓ | 250|acc |\n", + "|mgsm_te |✓ | |✓ | 250|acc |\n", + "|mgsm_th |✓ | |✓ | 250|acc |\n", + "|mgsm_zh |✓ | |✓ | 250|acc |\n", + "|mnli |✓ |✓ | | 9815|acc |\n", + "|mnli_mismatched |✓ |✓ | | 9832|acc |\n", + "|mrpc |✓ |✓ | | 408|acc, f1 |\n", + "|multirc |✓ |✓ | | 4848|acc |\n", + "|mutual |✓ |✓ | | 886|r@1, r@2, mrr |\n", + "|mutual_plus |✓ |✓ | | 886|r@1, r@2, mrr |\n", + "|openbookqa |✓ |✓ |✓ | 500|acc, acc_norm |\n", + "|pawsx_de |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_en |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_es |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_fr |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_ja |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_ko |✓ |✓ |✓ | 2000|acc |\n", + "|pawsx_zh |✓ |✓ |✓ | 2000|acc |\n", + "|pile_arxiv | |✓ |✓ | 2407|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_bookcorpus2 | |✓ |✓ | 28|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_books3 | |✓ |✓ | 269|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_dm-mathematics | |✓ |✓ | 1922|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_enron | |✓ |✓ | 1010|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_europarl | |✓ |✓ | 157|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_freelaw | |✓ |✓ | 5101|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_github | |✓ |✓ | 18195|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_gutenberg | |✓ |✓ | 80|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_hackernews | |✓ |✓ | 1632|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_nih-exporter | |✓ |✓ | 
1884|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_opensubtitles | |✓ |✓ | 642|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_openwebtext2 | |✓ |✓ | 32925|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_philpapers | |✓ |✓ | 68|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_pile-cc | |✓ |✓ | 52790|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_pubmed-abstracts | |✓ |✓ | 29895|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_pubmed-central | |✓ |✓ | 5911|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_stackexchange | |✓ |✓ | 30378|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_ubuntu-irc | |✓ |✓ | 22|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_uspto | |✓ |✓ | 11415|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_wikipedia | |✓ |✓ | 17511|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|pile_youtubesubtitles | |✓ |✓ | 342|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|piqa |✓ |✓ | | 1838|acc, acc_norm |\n", + "|prost | | |✓ | 18736|acc, acc_norm |\n", + "|pubmedqa | | |✓ | 1000|acc |\n", + "|qa4mre_2011 | | |✓ | 120|acc, acc_norm |\n", + "|qa4mre_2012 | | |✓ | 160|acc, acc_norm |\n", + "|qa4mre_2013 | | |✓ | 284|acc, acc_norm |\n", + "|qasper |✓ |✓ | | 1764|f1_yesno, f1_abstractive |\n", + "|qnli |✓ |✓ | | 5463|acc |\n", + "|qqp |✓ |✓ | | 40430|acc, f1 |\n", + "|race |✓ |✓ |✓ | 1045|acc |\n", + "|random_insertion | |✓ | | 10000|acc |\n", + "|record |✓ |✓ | | 10000|f1, em |\n", + "|reversed_words | |✓ | | 10000|acc |\n", + "|rte |✓ |✓ | | 277|acc |\n", + "|sciq |✓ |✓ |✓ | 1000|acc, acc_norm |\n", + "|scrolls_contractnli |✓ |✓ | | 1037|em, acc, acc_norm |\n", + "|scrolls_govreport |✓ |✓ | | 972|rouge1, rouge2, rougeL |\n", + "|scrolls_narrativeqa |✓ |✓ | | 3425|f1 |\n", + "|scrolls_qasper |✓ |✓ | | 984|f1 |\n", + "|scrolls_qmsum |✓ |✓ | | 272|rouge1, rouge2, rougeL |\n", + "|scrolls_quality |✓ 
|✓ | | 2086|em, acc, acc_norm |\n", + "|scrolls_summscreenfd |✓ |✓ | | 338|rouge1, rouge2, rougeL |\n", + "|squad2 |✓ |✓ | | 11873|exact, f1, HasAns_exact, HasAns_f1, NoAns_exact, NoAns_f1, best_exact, best_f1 |\n", + "|sst |✓ |✓ | | 872|acc |\n", + "|swag |✓ |✓ | | 20006|acc, acc_norm |\n", + "|toxigen |✓ | |✓ | 940|acc, acc_norm |\n", + "|triviaqa |✓ |✓ | | 11313|acc |\n", + "|truthfulqa_gen | |✓ | | 817|bleurt_max, bleurt_acc, bleurt_diff, bleu_max, bleu_acc, bleu_diff, rouge1_max, rouge1_acc, rouge1_diff, rouge2_max, rouge2_acc, rouge2_diff, rougeL_max, rougeL_acc, rougeL_diff|\n", + "|truthfulqa_mc | |✓ | | 817|mc1, mc2 |\n", + "|webqs |✓ | |✓ | 2032|acc |\n", + "|wic |✓ |✓ | | 638|acc |\n", + "|wikitext |✓ |✓ |✓ | 62|word_perplexity, byte_perplexity, bits_per_byte |\n", + "|winogrande |✓ |✓ | | 1267|acc |\n", + "|wmt14-en-fr | | |✓ | 3003|bleu, chrf, ter |\n", + "|wmt14-fr-en | | |✓ | 3003|bleu, chrf, ter |\n", + "|wmt16-de-en | | |✓ | 2999|bleu, chrf, ter |\n", + "|wmt16-en-de | | |✓ | 2999|bleu, chrf, ter |\n", + "|wmt16-en-ro | | |✓ | 1999|bleu, chrf, ter |\n", + "|wmt16-ro-en | | |✓ | 1999|bleu, chrf, ter |\n", + "|wmt20-cs-en | | |✓ | 664|bleu, chrf, ter |\n", + "|wmt20-de-en | | |✓ | 785|bleu, chrf, ter |\n", + "|wmt20-de-fr | | |✓ | 1619|bleu, chrf, ter |\n", + "|wmt20-en-cs | | |✓ | 1418|bleu, chrf, ter |\n", + "|wmt20-en-de | | |✓ | 1418|bleu, chrf, ter |\n", + "|wmt20-en-iu | | |✓ | 2971|bleu, chrf, ter |\n", + "|wmt20-en-ja | | |✓ | 1000|bleu, chrf, ter |\n", + "|wmt20-en-km | | |✓ | 2320|bleu, chrf, ter |\n", + "|wmt20-en-pl | | |✓ | 1000|bleu, chrf, ter |\n", + "|wmt20-en-ps | | |✓ | 2719|bleu, chrf, ter |\n", + "|wmt20-en-ru | | |✓ | 2002|bleu, chrf, ter |\n", + "|wmt20-en-ta | | |✓ | 1000|bleu, chrf, ter |\n", + "|wmt20-en-zh | | |✓ | 1418|bleu, chrf, ter |\n", + "|wmt20-fr-de | | |✓ | 1619|bleu, chrf, ter |\n", + "|wmt20-iu-en | | |✓ | 2971|bleu, chrf, ter |\n", + "|wmt20-ja-en | | |✓ | 993|bleu, chrf, ter |\n", + "|wmt20-km-en | | |✓ | 
2320|bleu, chrf, ter |\n", + "|wmt20-pl-en | | |✓ | 1001|bleu, chrf, ter |\n", + "|wmt20-ps-en | | |✓ | 2719|bleu, chrf, ter |\n", + "|wmt20-ru-en | | |✓ | 991|bleu, chrf, ter |\n", + "|wmt20-ta-en | | |✓ | 997|bleu, chrf, ter |\n", + "|wmt20-zh-en | | |✓ | 2000|bleu, chrf, ter |\n", + "|wnli |✓ |✓ | | 71|acc |\n", + "|wsc |✓ |✓ | | 104|acc |\n", + "|wsc273 | | |✓ | 273|acc |\n", + "|xcopa_et | |✓ |✓ | 500|acc |\n", + "|xcopa_ht | |✓ |✓ | 500|acc |\n", + "|xcopa_id | |✓ |✓ | 500|acc |\n", + "|xcopa_it | |✓ |✓ | 500|acc |\n", + "|xcopa_qu | |✓ |✓ | 500|acc |\n", + "|xcopa_sw | |✓ |✓ | 500|acc |\n", + "|xcopa_ta | |✓ |✓ | 500|acc |\n", + "|xcopa_th | |✓ |✓ | 500|acc |\n", + "|xcopa_tr | |✓ |✓ | 500|acc |\n", + "|xcopa_vi | |✓ |✓ | 500|acc |\n", + "|xcopa_zh | |✓ |✓ | 500|acc |\n", + "|xnli_ar |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_bg |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_de |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_el |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_en |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_es |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_fr |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_hi |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_ru |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_sw |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_th |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_tr |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_ur |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_vi |✓ |✓ |✓ | 5010|acc |\n", + "|xnli_zh |✓ |✓ |✓ | 5010|acc |\n", + "|xstory_cloze_ar |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_en |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_es |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_eu |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_hi |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_id |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_my |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_ru |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_sw |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_te |✓ |✓ | | 1511|acc |\n", + "|xstory_cloze_zh |✓ |✓ | | 1511|acc |\n", + "|xwinograd_en | | |✓ | 2325|acc |\n", + "|xwinograd_fr | | |✓ | 83|acc |\n", + "|xwinograd_jp | | |✓ | 959|acc |\n", + "|xwinograd_pt | | |✓ 
| 263|acc |\n", + "|xwinograd_ru | | |✓ | 315|acc |\n", + "|xwinograd_zh | | |✓ | 504|acc |\n", + "| Ceval-valid-computer_network | | ✓ | | 19 | acc |\n", + "| Ceval-valid-operating_system | | ✓ | | 19 | acc |\n", + "| Ceval-valid-computer_architecture | | ✓ | | 21 | acc |\n", + "| Ceval-valid-college_programming | | ✓ | | 37 | acc |\n", + "| Ceval-valid-college_physics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-college_chemistry | | ✓ | | 24 | acc |\n", + "| Ceval-valid-advanced_mathematics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-probability_and_statistics | | ✓ | | 18 | acc |\n", + "| Ceval-valid-discrete_mathematics | | ✓ | | 16 | acc |\n", + "| Ceval-valid-electrical_engineer | | ✓ | | 37 | acc |\n", + "| Ceval-valid-metrology_engineer | | ✓ | | 24 | acc |\n", + "| Ceval-valid-high_school_mathematics | | ✓ | | 18 | acc |\n", + "| Ceval-valid-high_school_physics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-high_school_chemistry | | ✓ | | 19 | acc |\n", + "| Ceval-valid-high_school_biology | | ✓ | | 19 | acc |\n", + "| Ceval-valid-middle_school_mathematics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-middle_school_biology | | ✓ | | 21 | acc |\n", + "| Ceval-valid-middle_school_physics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-middle_school_chemistry | | ✓ | | 20 | acc |\n", + "| Ceval-valid-veterinary_medicine | | ✓ | | 23 | acc |\n", + "| Ceval-valid-college_economics | | ✓ | | 55 | acc |\n", + "| Ceval-valid-business_administration | | ✓ | | 33 | acc |\n", + "| Ceval-valid-marxism | | ✓ | | 19 | acc |\n", + "| Ceval-valid-mao_zedong_thought | | ✓ | | 24 | acc |\n", + "| Ceval-valid-education_science | | ✓ | | 29 | acc |\n", + "| Ceval-valid-teacher_qualification | | ✓ | | 44 | acc |\n", + "| Ceval-valid-high_school_politics | | ✓ | | 19 | acc |\n", + "| Ceval-valid-high_school_geography | | ✓ | | 19 | acc |\n", + "| Ceval-valid-middle_school_politics | | ✓ | | 21 | acc |\n", + "| Ceval-valid-middle_school_geography | | ✓ | | 12 | acc |\n", + "| 
Ceval-valid-modern_chinese_history | | ✓ | | 23 | acc |\n", + "| Ceval-valid-ideological_and_moral_cultivation | | ✓ | | 19 | acc |\n", + "| Ceval-valid-logic | | ✓ | | 22 | acc |\n", + "| Ceval-valid-law | | ✓ | | 24 | acc |\n", + "| Ceval-valid-chinese_language_and_literature | | ✓ | | 23 | acc |\n", + "| Ceval-valid-art_studies | | ✓ | | 33 | acc |\n", + "| Ceval-valid-professional_tour_guide | | ✓ | | 29 | acc |\n", + "| Ceval-valid-legal_professional | | ✓ | | 23 | acc |\n", + "| Ceval-valid-high_school_chinese | | ✓ | | 19 | acc |\n", + "| Ceval-valid-high_school_history | | ✓ | | 20 | acc |\n", + "| Ceval-valid-middle_school_history | | ✓ | | 22 | acc |\n", + "| Ceval-valid-civil_servant | | ✓ | | 47 | acc |\n", + "| Ceval-valid-sports_science | | ✓ | | 19 | acc |\n", + "| Ceval-valid-plant_protection | | ✓ | | 22 | acc |\n", + "| Ceval-valid-basic_medicine | | ✓ | | 19 | acc |\n", + "| Ceval-valid-clinical_medicine | | ✓ | | 22 | acc |\n", + "| Ceval-valid-urban_and_rural_planner | | ✓ | | 46 | acc |\n", + "| Ceval-valid-accountant | | ✓ | | 49 | acc |\n", + "| Ceval-valid-fire_engineer | | ✓ | | 31 | acc |\n", + "| Ceval-valid-environmental_impact_assessment_engineer | | ✓ | | 31 | acc |\n", + "| Ceval-valid-tax_accountant | | ✓ | | 49 | acc |\n", + "| Ceval-valid-physician | | ✓ | | 49 | acc |" + ] + } + ], + "metadata": { + "accelerator": "GPU", + "colab": { + "gpuType": "T4", + "provenance": [] + }, + "kernelspec": { + "display_name": "Python 3", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +}