{ "cells": [ { "cell_type": "markdown", "id": "8745f6ee-a9b0-4f68-9f2c-2e27ea86c2a3", "metadata": {}, "source": [ "# Load Libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "20508587-c46c-4645-a3d5-845cd55f1512", "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import huggingface_hub\n", "import datasets\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import matplotlib.patches as mpatches\n", "\n", "from datetime import datetime, timezone\n", "\n", "import plotly\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "\n", "plotly.offline.init_notebook_mode(connected=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "a166b52f-d5a4-4422-9564-3bd09c1bb321", "metadata": {}, "outputs": [], "source": [ "# Create the directory for plots.\n", "# exist_ok=True makes this a no-op when the directory already exists and avoids\n", "# the check-then-create race of a separate os.path.exists() test.\n", "directory = \"./plots\"\n", "os.makedirs(directory, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 3, "id": "ffbf9842-cf52-4989-9de1-91f108b1b146", "metadata": {}, "outputs": [], "source": [ "# Do not truncate columns when displaying DataFrames\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": 4, "id": "5c4cb1e2-a571-4f8b-98d0-61848c833ac9", "metadata": {}, "outputs": [], "source": [ "# Set colours: named palette entries first, so each hex code is written only once\n", "ORANGE = \"#FF9D00\"\n", "YELLOW = \"#FFD21E\"\n", "RED = \"#FF323D\"\n", "BLACK = \"#32343D\"\n", "GREEN = \"#297373\"\n", "DARK_ORANGE = \"#CD4631\"\n", "\n", "# Ordered palette built from the named colours above\n", "colors = [ORANGE, YELLOW, RED, BLACK, GREEN, DARK_ORANGE]" ] }, { "cell_type": "markdown", "id": "d37bd88b-f89d-440d-9541-6b6e589376e9", "metadata": {}, "source": [ "# Data Loading and Preprocessing" ] }, { "cell_type": "markdown", "id": "e1befb4e-f1a3-4c47-a46f-724660e08f31", "metadata": {}, "source": [ "## Load V2" ] }, { "cell_type": "code", "execution_count": 5, "id": 
"e398b673-20c7-4a83-8230-221829078cb2", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(203, 32)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the v2 leaderboard data with datasets.load_dataset (train split of\n", "# open-llm-leaderboard/contents, not a local JSONL file) and convert it to pandas\n", "ds = datasets.load_dataset(\"open-llm-leaderboard/contents\", split=\"train\")\n", "data_v2 = ds.to_pandas()\n", "data_v2.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "464175a6-0034-4c6d-b7c8-51cb69be9db4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", " | eval_name | \n", "Precision | \n", "Type | \n", "T | \n", "Weight type | \n", "Architecture | \n", "Model | \n", "fullname | \n", "Model sha | \n", "Average ⬆️ | \n", "Hub License | \n", "Hub ❤️ | \n", "#Params (B) | \n", "Available on the hub | \n", "Merged | \n", "MoE | \n", "Flagged | \n", "date | \n", "Chat Template | \n", "IFEval Raw | \n", "IFEval | \n", "BBH Raw | \n", "BBH | \n", "MATH Lvl 5 Raw | \n", "MATH Lvl 5 | \n", "GPQA Raw | \n", "GPQA | \n", "MUSR Raw | \n", "MUSR | \n", "MMLU-PRO Raw | \n", "MMLU-PRO | \n", "Maintainer's Highlight | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "upstage_SOLAR-10.7B-v1.0_float16 | \n", "float16 | \n", "🟢 pretrained | \n", "🟢 | \n", "Original | \n", "LlamaForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "upstage/SOLAR-10.7B-v1.0 | \n", "a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n", "17.072003 | \n", "apache-2.0 | \n", "248 | \n", "10 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2024-06-12T12:27:42Z | \n", "False | \n", "0.242126 | \n", "24.212645 | \n", "0.509387 | \n", "29.789358 | \n", "0.021148 | \n", "2.114804 | \n", "0.281040 | \n", "4.138702 | \n", "0.437156 | \n", "13.677865 | \n", "0.340010 | \n", "26.667775 | \n", "True | \n", "
1 | \n", "upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n", "float16 | \n", "💬 chat models (RLHF, DPO, IFT, ...) | \n", "💬 | \n", "Original | \n", "LlamaForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "upstage/SOLAR-10.7B-Instruct-v1.0 | \n", "c08c25ed66414a878fe0401a3596d536c083606c | \n", "19.961989 | \n", "cc-by-nc-4.0 | \n", "592 | \n", "10 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2024-06-12T12:06:58Z | \n", "True | \n", "0.473661 | \n", "47.366100 | \n", "0.516249 | \n", "31.872402 | \n", "0.000000 | \n", "0.000000 | \n", "0.308725 | \n", "7.829978 | \n", "0.389937 | \n", "6.942188 | \n", "0.313830 | \n", "23.758865 | \n", "True | \n", "
2 | \n", "togethercomputer_RedPajama-INCITE-Instruct-3B-... | \n", "float16 | \n", "🔶 fine-tuned on domain-specific datasets | \n", "🔶 | \n", "Original | \n", "GPTNeoXForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "togethercomputer/RedPajama-INCITE-Instruct-3B-v1 | \n", "0c66778ee09a036886741707733620b91057909a | \n", "5.877290 | \n", "apache-2.0 | \n", "91 | \n", "3 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2024-06-12T12:07:46Z | \n", "False | \n", "0.212426 | \n", "21.242636 | \n", "0.314602 | \n", "4.510786 | \n", "0.006042 | \n", "0.604230 | \n", "0.247483 | \n", "0.000000 | \n", "0.388604 | \n", "6.408854 | \n", "0.110954 | \n", "1.217125 | \n", "True | \n", "
3 | \n", "togethercomputer_RedPajama-INCITE-Chat-3B-v1_f... | \n", "float16 | \n", "🔶 fine-tuned on domain-specific datasets | \n", "🔶 | \n", "Original | \n", "GPTNeoXForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "togethercomputer/RedPajama-INCITE-Chat-3B-v1 | \n", "f0e0995eba801096ed04cb87931d96a8316871af | \n", "4.950649 | \n", "apache-2.0 | \n", "147 | \n", "3 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2024-06-13T17:58:59Z | \n", "False | \n", "0.165215 | \n", "16.521496 | \n", "0.321669 | \n", "5.164728 | \n", "0.003021 | \n", "0.302115 | \n", "0.244128 | \n", "0.000000 | \n", "0.368448 | \n", "5.089323 | \n", "0.112699 | \n", "1.411052 | \n", "True | \n", "
4 | \n", "togethercomputer_RedPajama-INCITE-Base-3B-v1_f... | \n", "float16 | \n", "🟢 pretrained | \n", "🟢 | \n", "Original | \n", "GPTNeoXForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "togethercomputer/RedPajama-INCITE-Base-3B-v1 | \n", "094fbdd0c911feb485ce55de1952ab2e75277e1e | \n", "5.645099 | \n", "apache-2.0 | \n", "90 | \n", "3 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2024-06-12T12:28:23Z | \n", "False | \n", "0.229363 | \n", "22.936254 | \n", "0.306040 | \n", "3.518608 | \n", "0.009063 | \n", "0.906344 | \n", "0.243289 | \n", "0.000000 | \n", "0.373875 | \n", "4.001042 | \n", "0.111120 | \n", "1.235594 | \n", "True | \n", "
\n", " | eval_name | \n", "Precision | \n", "Type | \n", "T | \n", "Weight type | \n", "Architecture | \n", "Model | \n", "fullname | \n", "Model sha | \n", "Average ⬆️ | \n", "Hub License | \n", "Hub ❤️ | \n", "#Params (B) | \n", "Available on the hub | \n", "Merged | \n", "MoE | \n", "Flagged | \n", "date | \n", "Chat Template | \n", "IFEval Raw | \n", "IFEval | \n", "BBH Raw | \n", "BBH | \n", "MATH Lvl 5 Raw | \n", "MATH Lvl 5 | \n", "GPQA Raw | \n", "GPQA | \n", "MUSR Raw | \n", "MUSR | \n", "MMLU-PRO Raw | \n", "MMLU-PRO | \n", "Maintainer's Highlight | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "upstage_SOLAR-10.7B-v1.0_float16 | \n", "float16 | \n", "🟢 pretrained | \n", "🟢 | \n", "Original | \n", "LlamaForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "upstage/SOLAR-10.7B-v1.0 | \n", "a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n", "17.072003 | \n", "apache-2.0 | \n", "248 | \n", "10 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2023-12-12 14:57:41+00:00 | \n", "False | \n", "0.242126 | \n", "24.212645 | \n", "0.509387 | \n", "29.789358 | \n", "0.021148 | \n", "2.114804 | \n", "0.281040 | \n", "4.138702 | \n", "0.437156 | \n", "13.677865 | \n", "0.34001 | \n", "26.667775 | \n", "True | \n", "
1 | \n", "upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n", "float16 | \n", "💬 chat models (RLHF, DPO, IFT, ...) | \n", "💬 | \n", "Original | \n", "LlamaForCausalLM | \n", "<a target=\"_blank\" href=\"https://huggingface.c... | \n", "upstage/SOLAR-10.7B-Instruct-v1.0 | \n", "c08c25ed66414a878fe0401a3596d536c083606c | \n", "19.961989 | \n", "cc-by-nc-4.0 | \n", "592 | \n", "10 | \n", "True | \n", "True | \n", "True | \n", "False | \n", "2023-12-12 12:39:22+00:00 | \n", "True | \n", "0.473661 | \n", "47.366100 | \n", "0.516249 | \n", "31.872402 | \n", "0.000000 | \n", "0.000000 | \n", "0.308725 | \n", "7.829978 | \n", "0.389937 | \n", "6.942188 | \n", "0.31383 | \n", "23.758865 | \n", "True | \n", "
\n", " | Type | \n", "IFEval | \n", "MATH Lvl 5 | \n", "Average ⬆️ | \n", "Color | \n", "
---|---|---|---|---|---|
0 | \n", "💬 chat models (RLHF, DPO, IFT, ...) | \n", "47.014196 | \n", "4.729041 | \n", "19.054433 | \n", "#FF323D | \n", "
1 | \n", "🔶 fine-tuned on domain-specific datasets | \n", "26.169263 | \n", "2.531290 | \n", "12.222311 | \n", "#FF9D00 | \n", "
2 | \n", "🟢 pretrained | \n", "21.958824 | \n", "3.236317 | \n", "11.288183 | \n", "#32343D | \n", "
\n", " | IFEval | \n", "BBH | \n", "MATH Lvl 5 | \n", "GPQA | \n", "MUSR | \n", "MMLU-PRO | \n", "
---|---|---|---|---|---|---|
IFEval | \n", "1.00 | \n", "0.64 | \n", "0.44 | \n", "0.39 | \n", "0.37 | \n", "0.60 | \n", "
BBH | \n", "0.64 | \n", "1.00 | \n", "0.69 | \n", "0.82 | \n", "0.58 | \n", "0.95 | \n", "
MATH Lvl 5 | \n", "0.44 | \n", "0.69 | \n", "1.00 | \n", "0.69 | \n", "0.32 | \n", "0.72 | \n", "
GPQA | \n", "0.39 | \n", "0.82 | \n", "0.69 | \n", "1.00 | \n", "0.44 | \n", "0.86 | \n", "
MUSR | \n", "0.37 | \n", "0.58 | \n", "0.32 | \n", "0.44 | \n", "1.00 | \n", "0.52 | \n", "
MMLU-PRO | \n", "0.60 | \n", "0.95 | \n", "0.72 | \n", "0.86 | \n", "0.52 | \n", "1.00 | \n", "
\n", " | eval_name | \n", "mean_score | \n", "IFEval | \n", "BBH | \n", "MATH Lvl 5 | \n", "GPQA | \n", "MUSR | \n", "MMLU-PRO | \n", "
---|---|---|---|---|---|---|---|---|
124 | \n", "Qwen_Qwen2-72B-Instruct_bfloat16 | \n", "42.486308 | \n", "79.891687 | \n", "57.483009 | \n", "35.120846 | \n", "16.331096 | \n", "17.167969 | \n", "48.923242 | \n", "
66 | \n", "meta-llama_Meta-Llama-3-70B-Instruct_bfloat16 | \n", "36.183402 | \n", "80.990771 | \n", "50.185133 | \n", "23.338369 | \n", "4.921700 | \n", "10.920573 | \n", "46.743868 | \n", "
67 | \n", "meta-llama_Meta-Llama-3-70B_bfloat16 | \n", "26.365471 | \n", "16.031906 | \n", "48.709813 | \n", "16.540785 | \n", "19.686801 | \n", "16.011198 | \n", "41.212323 | \n", "
62 | \n", "microsoft_Orca-2-13b_bfloat16 | \n", "18.136816 | \n", "31.279339 | \n", "27.308019 | \n", "0.981873 | \n", "4.026846 | \n", "25.787760 | \n", "19.437057 | \n", "