{ "cells": [ { "cell_type": "markdown", "id": "d6ea681d-9793-4d55-9bf1-c6e3ec034c6e", "metadata": {}, "source": [ "# Load Libraries" ] }, { "cell_type": "code", "execution_count": 1, "id": "9a85640e-fa8b-4b7c-9574-008d0ab22664", "metadata": {}, "outputs": [ { "data": { "text/html": [ " \n", " " ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "import os\n", "\n", "import pandas as pd\n", "import numpy as np\n", "\n", "import huggingface_hub\n", "import datasets\n", "\n", "import matplotlib.pyplot as plt\n", "import seaborn as sns\n", "import matplotlib.patches as mpatches\n", "\n", "from datetime import datetime, timezone\n", "\n", "import plotly\n", "import plotly.express as px\n", "import plotly.graph_objects as go\n", "\n", "plotly.offline.init_notebook_mode(connected=True)" ] }, { "cell_type": "code", "execution_count": 2, "id": "cfb984b0-697f-45b5-b785-72be578b0ea0", "metadata": {}, "outputs": [], "source": [ "# Create the directory for plots.\n", "# exist_ok=True keeps this cell safe to re-run and avoids the\n", "# check-then-create race of testing os.path.exists() first.\n", "directory = \"./plots\"\n", "os.makedirs(directory, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 3, "id": "ca02c49a-c351-4183-b5ce-e6825296ef2d", "metadata": {}, "outputs": [], "source": [ "# Do not truncate columns when displaying wide DataFrames\n", "pd.set_option('display.max_columns', None)" ] }, { "cell_type": "code", "execution_count": 4, "id": "8c5b803d-74f3-4206-be76-e08f23e1d05d", "metadata": {}, "outputs": [], "source": [ "# Set colours: the named constants are the single source of truth\n", "ORANGE = \"#FF9D00\"\n", "YELLOW = \"#FFD21E\"\n", "RED = \"#FF323D\"\n", "BLACK = \"#32343D\"\n", "GREEN = \"#297373\"\n", "DARK_ORANGE = \"#CD4631\"\n", "\n", "# The same six colours as an ordered list (order unchanged)\n", "colors = [ORANGE, YELLOW, RED, BLACK, GREEN, DARK_ORANGE]" ] }, { "cell_type": "markdown", "id": "b564cc3c-78c4-42bc-b415-aca228e8b87e", "metadata": {}, "source": [ "# Data Loading and Preprocessing" ] }, { "cell_type": "markdown", "id": "80d0739a-b3a3-4331-b47b-2a99029affed", "metadata": {}, "source": [ "## Load V1" ] }, { "cell_type": "code", "execution_count": 5, "id":
"63c5d448-6473-46df-b5a7-abbb479bea9b", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7260, 26)" ] }, "execution_count": 5, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Load the v1 leaderboard contents dataset (train split)\n", "# and convert it to a pandas DataFrame\n", "ds = datasets.load_dataset(\"open-llm-leaderboard/contents\", split=\"train\")\n", "data_v1 = ds.to_pandas()\n", "data_v1.shape" ] }, { "cell_type": "code", "execution_count": 6, "id": "0fdce37e-681a-4b5e-962d-c3e7156d64ab", "metadata": {}, "outputs": [], "source": [ "# Drop contaminated models\n", "# There are two of them, identified by \"contaminated\" in eval_name\n", "# regex=False: plain substring match, no regex interpretation of the pattern\n", "# na=False: a missing eval_name counts as not contaminated, so the mask stays\n", "# boolean and can be negated with ~\n", "is_contaminated = data_v1[\"eval_name\"].str.contains(\"contaminated\", regex=False, na=False)\n", "data_v1 = data_v1[~is_contaminated]" ] }, { "cell_type": "code", "execution_count": 7, "id": "9f85eaba-5053-4a81-9cf8-1da450197ada", "metadata": {}, "outputs": [ { "data": { "text/plain": [ "(7258, 26)" ] }, "execution_count": 7, "metadata": {}, "output_type": "execute_result" } ], "source": [ "data_v1.shape" ] }, { "cell_type": "code", "execution_count": 8, "id": "f5fea314-f133-4d95-8ef8-c2491a10eaaa", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eval_namePrecisionTypeTWeight typeArchitectureModelfullnameModel shaAverage ⬆️Hub LicenseHub ❤️#Params (B)Available on the hubMergedMoEFlaggeddateChat TemplateARCHellaSwagMMLUTruthfulQAWinograndeGSM8KMaintainers Choice
00-hero_Matter-0.1-7B_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B035c8193ce71be90be7d90098669afb9164ec6cb63.391248apache-2.007TrueTrueTrueTrue2024-03-21T06:05:50ZFalse61.77474482.13503362.42373142.43951377.82162653.752843False
10-hero_Matter-0.1-7B-DPO-preview_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B-DPO-preview78040e4754051df49dd907cf1fd46a6b8a6cc30f64.870290apache-2.007TrueTrueTrueTrue2024-03-23T04:13:58ZFalse62.71331182.99143662.70029945.79010178.84767256.178923False
20-hero_Matter-0.1-7B-boost_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B-boostba56089eed1211f02e8d0ff47901e77b0cd48f8363.223517apache-2.007TrueTrueTrueTrue2024-03-21T06:05:38ZFalse62.62798681.50766861.96761854.70240475.92738842.608036False
30-hero_Matter-0.1-7B-boost-DPO_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B-boost-DPO5bee9978fcf2188f1070b67f6d94be344fdd99c065.98585807FalseTrueTrueTrue2024-03-22T15:02:21ZFalse65.01706583.08106061.87380560.29363275.61168150.037908False
40-hero_Matter-0.1-7B-boost-DPO-preview_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B-boost-DPO-previewd390fb35a781129efd26d53f7ecdb513c0c3da2765.767435apache-2.027TrueTrueTrueTrue2024-03-22T07:40:42ZFalse64.59044482.87193862.01762558.85916275.84846150.416983False
\n", "
" ], "text/plain": [ " eval_name Precision \\\n", "0 0-hero_Matter-0.1-7B_bfloat16 bfloat16 \n", "1 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 bfloat16 \n", "2 0-hero_Matter-0.1-7B-boost_bfloat16 bfloat16 \n", "3 0-hero_Matter-0.1-7B-boost-DPO_bfloat16 bfloat16 \n", "4 0-hero_Matter-0.1-7B-boost-DPO-preview_bfloat16 bfloat16 \n", "\n", " Type T Weight type Architecture \\\n", "0 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "2 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "3 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "4 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "\n", " Model \\\n", "0 \n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", 
" \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eval_namePrecisionTypeTWeight typeArchitectureModelfullnameModel shaAverage ⬆️Hub LicenseHub ❤️#Params (B)Available on the hubMergedMoEFlaggeddateChat TemplateIFEval RawIFEvalBBH RawBBHMATH Lvl 5 RawMATH Lvl 5GPQA RawGPQAMUSR RawMUSRMMLU-PRO RawMMLU-PROMaintainer's Highlight
0upstage_SOLAR-10.7B-v1.0_float16float16🟢 pretrained🟢OriginalLlamaForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...upstage/SOLAR-10.7B-v1.0a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa517.072003apache-2.024810TrueTrueTrueFalse2024-06-12T12:27:42ZFalse0.24212624.2126450.50938729.7893580.0211482.1148040.2810404.1387020.43715613.6778650.34001026.667775True
1upstage_SOLAR-10.7B-Instruct-v1.0_float16float16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalLlamaForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...upstage/SOLAR-10.7B-Instruct-v1.0c08c25ed66414a878fe0401a3596d536c083606c19.961989cc-by-nc-4.059210TrueTrueTrueFalse2024-06-12T12:06:58ZTrue0.47366147.3661000.51624931.8724020.0000000.0000000.3087257.8299780.3899376.9421880.31383023.758865True
2togethercomputer_RedPajama-INCITE-Instruct-3B-...float16🔶 fine-tuned on domain-specific datasets🔶OriginalGPTNeoXForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...togethercomputer/RedPajama-INCITE-Instruct-3B-v10c66778ee09a036886741707733620b91057909a5.877290apache-2.0913TrueTrueTrueFalse2024-06-12T12:07:46ZFalse0.21242621.2426360.3146024.5107860.0060420.6042300.2474830.0000000.3886046.4088540.1109541.217125True
3togethercomputer_RedPajama-INCITE-Chat-3B-v1_f...float16🔶 fine-tuned on domain-specific datasets🔶OriginalGPTNeoXForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...togethercomputer/RedPajama-INCITE-Chat-3B-v1f0e0995eba801096ed04cb87931d96a8316871af4.950649apache-2.01473TrueTrueTrueFalse2024-06-13T17:58:59ZFalse0.16521516.5214960.3216695.1647280.0030210.3021150.2441280.0000000.3684485.0893230.1126991.411052True
4togethercomputer_RedPajama-INCITE-Base-3B-v1_f...float16🟢 pretrained🟢OriginalGPTNeoXForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...togethercomputer/RedPajama-INCITE-Base-3B-v1094fbdd0c911feb485ce55de1952ab2e75277e1e5.645099apache-2.0903TrueTrueTrueFalse2024-06-12T12:28:23ZFalse0.22936322.9362540.3060403.5186080.0090630.9063440.2432890.0000000.3738754.0010420.1111201.235594True
\n", "" ], "text/plain": [ " eval_name Precision \\\n", "0 upstage_SOLAR-10.7B-v1.0_float16 float16 \n", "1 upstage_SOLAR-10.7B-Instruct-v1.0_float16 float16 \n", "2 togethercomputer_RedPajama-INCITE-Instruct-3B-... float16 \n", "3 togethercomputer_RedPajama-INCITE-Chat-3B-v1_f... float16 \n", "4 togethercomputer_RedPajama-INCITE-Base-3B-v1_f... float16 \n", "\n", " Type T Weight type \\\n", "0 🟢 pretrained 🟢 Original \n", "1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original \n", "2 🔶 fine-tuned on domain-specific datasets 🔶 Original \n", "3 🔶 fine-tuned on domain-specific datasets 🔶 Original \n", "4 🟢 pretrained 🟢 Original \n", "\n", " Architecture Model \\\n", "0 LlamaForCausalLM
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eval_namePrecisionTypeTWeight typeArchitectureModelfullnameModel shaAverage ⬆️Hub LicenseHub ❤️#Params (B)Available on the hubMergedMoEFlaggeddateChat TemplateARCHellaSwagMMLUTruthfulQAWinograndeGSM8KMaintainers Choice
00-hero_Matter-0.1-7B_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B035c8193ce71be90be7d90098669afb9164ec6cb63.391248apache-2.007TrueTrueTrueTrue2024-03-20 05:57:38+00:00False61.77474482.13503362.42373142.43951377.82162653.752843False
10-hero_Matter-0.1-7B-DPO-preview_bfloat16bfloat16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalMistralForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...0-hero/Matter-0.1-7B-DPO-preview78040e4754051df49dd907cf1fd46a6b8a6cc30f64.870290apache-2.007TrueTrueTrueTrue2024-03-19 11:27:26+00:00False62.71331182.99143662.70029945.79010178.84767256.178923False
\n", "" ], "text/plain": [ " eval_name Precision \\\n", "0 0-hero_Matter-0.1-7B_bfloat16 bfloat16 \n", "1 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 bfloat16 \n", "\n", " Type T Weight type Architecture \\\n", "0 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n", "\n", " Model \\\n", "0
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
eval_namePrecisionTypeTWeight typeArchitectureModelfullnameModel shaAverage ⬆️Hub LicenseHub ❤️#Params (B)Available on the hubMergedMoEFlaggeddateChat TemplateIFEval RawIFEvalBBH RawBBHMATH Lvl 5 RawMATH Lvl 5GPQA RawGPQAMUSR RawMUSRMMLU-PRO RawMMLU-PROMaintainer's Highlight
0upstage_SOLAR-10.7B-v1.0_float16float16🟢 pretrained🟢OriginalLlamaForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...upstage/SOLAR-10.7B-v1.0a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa517.072003apache-2.024810TrueTrueTrueFalse2023-12-12 14:57:41+00:00False0.24212624.2126450.50938729.7893580.0211482.1148040.2810404.1387020.43715613.6778650.3400126.667775True
1upstage_SOLAR-10.7B-Instruct-v1.0_float16float16💬 chat models (RLHF, DPO, IFT, ...)💬OriginalLlamaForCausalLM<a target=\"_blank\" href=\"https://huggingface.c...upstage/SOLAR-10.7B-Instruct-v1.0c08c25ed66414a878fe0401a3596d536c083606c19.961989cc-by-nc-4.059210TrueTrueTrueFalse2023-12-12 12:39:22+00:00True0.47366147.3661000.51624931.8724020.0000000.0000000.3087257.8299780.3899376.9421880.3138323.758865True
\n", "" ], "text/plain": [ " eval_name Precision \\\n", "0 upstage_SOLAR-10.7B-v1.0_float16 float16 \n", "1 upstage_SOLAR-10.7B-Instruct-v1.0_float16 float16 \n", "\n", " Type T Weight type Architecture \\\n", "0 🟢 pretrained 🟢 Original LlamaForCausalLM \n", "1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original LlamaForCausalLM \n", "\n", " Model \\\n", "0
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fullnameMMLUGSM8KMMLU-PROMATH Lvl 5GPQA
001-ai/Yi-1.5-34B77.99571973.23730140.73212214.04833815.436242
101-ai/Yi-1.5-34B-32K78.1530320.00000041.21232313.44410915.100671
201-ai/Yi-1.5-34B-Chat77.08284071.64518639.11606123.33836915.324385
301-ai/Yi-1.5-6B64.72689550.34116823.3433075.6646538.277405
401-ai/Yi-1.5-6B65.00272049.81046223.3433075.6646538.277405
\n", "" ], "text/plain": [ " fullname MMLU GSM8K MMLU-PRO MATH Lvl 5 \\\n", "0 01-ai/Yi-1.5-34B 77.995719 73.237301 40.732122 14.048338 \n", "1 01-ai/Yi-1.5-34B-32K 78.153032 0.000000 41.212323 13.444109 \n", "2 01-ai/Yi-1.5-34B-Chat 77.082840 71.645186 39.116061 23.338369 \n", "3 01-ai/Yi-1.5-6B 64.726895 50.341168 23.343307 5.664653 \n", "4 01-ai/Yi-1.5-6B 65.002720 49.810462 23.343307 5.664653 \n", "\n", " GPQA \n", "0 15.436242 \n", "1 15.100671 \n", "2 15.324385 \n", "3 8.277405 \n", "4 8.277405 " ] }, "execution_count": 19, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_data.head()" ] }, { "cell_type": "markdown", "id": "c2f7240f-8a99-4463-bd63-ea32fcc9e4fe", "metadata": {}, "source": [ "- MMLU and MMLU-PRO are well correlated: overall, a model with a high MMLU score also has a high MMLU-PRO score.\n", "- For MATH vs GSM8K, we identify 3 groups:\n", "    - \"High\" MATH score but very low GSM8K (v1) score (2 outliers): possibly overfitting on MATH or, more likely, models that had issues with EOS tokens on GSM8K.\n", "    - MATH (v2) and GSM8K (v1) scores are correlated (most models).\n", "    - Low MATH score but high GSM8K score: likely overfitting on GSM8K." ] }, { "cell_type": "code", "execution_count": 22, "id": "3a0e4442-cb52-4062-85bb-e6facb171ae3", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "marker": { "color": "#FFD21E" }, "mode": "markers", "name": "MMLU-PRO=f(MMLU)", "type": "scatter", "x": [ 77.9957192730956, 78.15303198667405, 77.08283992203879, 64.72689535017567, 65.00272042688377, 64.24095801586486, 70.73451954519469, 70.97203740649417, 71.01349610420871, 70.95946328062593, 76.35156727391681, 75.56180045111563, 74.86694971720243, 74.89765262226099, 63.84606128348609, 64.10709139827398, 64.64867451074461, 64.64930135205712, 70.06330822380845, 70.34013960436471, 65.86320847049085, 56.33899126009816, 75.72824616707308, 74.02494713760785, 68.19530224441294,
59.762938924439105, 60.24359775384097, 26.783999358640752, 24.821655499137048, 26.45397884474629, 25.002595295235746, 26.75606660161476, 24.950593393502686, 27.245507410753593, 61.3896585162216, 61.061389920377685, 61.06911135758283, 60.70490193884751, 58.975890703009014, 60.682678692224144, 27.03799242337422, 62.26105246211977, 60.791623329684086, 62.374203705324284, 62.44460409845261, 62.11841694257893, 63.550105055987174, 63.07073351412039, 44.87880425591219, 64.79850156958106, 62.20609448217756, 62.1246057414326, 66.51191133673278, 66.55303373586858, 63.356659125507434, 63.245294478077106, 72.27737838264045, 72.17054547051251, 72.20739527153786, 66.29541259257003, 48.335234653692204, 63.02324877855956, 62.96478981633615, 62.24441734951871, 26.193572189883703, 25.730836314001447, 39.35334081217581, 33.81983654590648, 46.70838530175783, 45.86651800528403, 80.19575908781562, 78.03946337687869, 69.36026470831474, 68.51540963742676, 68.57472555756478, 56.523519639933376, 55.55462512266324, 61.9700216814056, 61.64766854906246, 61.69764020685532, 62.54071682523817, 60.970670017639726, 45.17037929235293, 57.25919892204937, 70.4515868286805, 47.49331993287192, 52.577029472650906, 61.74163205184785, 76.66476969478123, 77.15452603998116, 26.234231797093017, 26.306692338086823, 71.38894203721195, 64.70438661462165, 64.66799008434704, 51.34583865906561, 38.64860206508296, 41.21331864479665, 26.70214825620786, 27.47846514752041, 26.592509199327857, 24.21541212062042, 26.246222375725694, 61.03678038351008, 61.04002830421494, 25.916845651522607, 24.696321332710696, 25.183937022975865, 51.74656618684998, 51.77213745416527, 46.306113561052754, 24.963045825616692, 26.658730035291097, 60.91642429212937, 41.679249280698635, 41.77314644002599, 37.648186828291585, 64.5598699571695, 66.02745725247713, 53.5170310018511, 34.611140710007234, 34.38230443417011, 40.86948703219191, 25.833717804860335, 25.404197294686202, 61.917687321536896, 62.84956375246854, 64.90605869704184, 
51.037449016607304, 50.8231434865938, 54.63627087850933, 55.77103633323489, 63.90701952816292, 69.83208921288625, 48.318825530104384, 46.866075004498, 43.79609636092417, 79.23293386179718, 80.05922317995427, 66.70480406193239, 66.49495492503577, 67.0721995784583, 25.838967331895883, 60.37069885710225, 60.301159128932504, 56.371153419443566, 77.83091143434687, 68.70399293043886, 69.08301379498349, 43.886467784197876, 58.1123279817493, 57.92026667108576, 63.689429588238426, 63.617802578997775, 55.37547435463056, 60.778011275770005, 64.16448378492244, 63.46289361741425, 77.79499653225467, 71.16228271617086, 71.39615714693673, 71.81543655466032, 70.17346542519046, 71.87839332236227, 64.40157643349518, 64.87016580791871, 65.76439370548329, 30.795325425385716, 25.833717804860335, 26.067403063531568, 65.03521927528267, 64.62163338646481, 64.97554013321405, 64.94207335284045, 62.892016034379076, 56.823746005051746, 56.68268469601176, 55.89931756667269, 64.73176939664393, 64.68760308408112, 62.039927374602314, 61.05866061151978, 38.94657259915192, 41.47262454451639, 42.03478315758355, 45.225737544229936, 46.1679465865397, 62.349785875539155, 56.34727779629169, 63.8216682847302, 63.636962953801635, 48.617757574363246, 58.3670122238434, 56.8938511726681, 27.785470153285935, 25.660046534596926, 25.836963794035032, 29.919101184126255, 43.32570701968771, 46.105045377255685, 45.53223866033279, 27.68214018695477, 26.942822650634863, 37.61836019303777, 27.02787422464126, 26.231263349013634, 25.032214024865144, 66.21064893071143, 65.47877433224612 ], "y": [ 40.732121749408975, 41.21232269503546, 39.11606087470449, 23.343306737588648, 23.343306737588648, 24.414524231678485, 32.402482269503544, 30.721778959810877, 33.05814125295508, 33.261303191489354, 37.90632387706855, 39.273049645390074, 34.369459219858165, 34.369459219858165, 22.12433510638298, 22.12433510638298, 20.489804964539008, 20.489804964539008, 28.59781323877068, 29.1334219858156, 24.682328605200944, 14.2010195035461, 
33.242833924349874, 33.242833924349874, 26.326093380614658, 18.799867021276594, 17.86716903073286, 2.6761968085106376, 1.8173758865248217, 1.8081412529550822, 1.725029550827422, 1.2078900709219857, 1.3279403073286051, 1.4202866430260035, 19.94496158392435, 19.78797281323877, 19.78797281323877, 19.78797281323877, 20.526743498817968, 20.526743498817968, 1.124778368794326, 18.873744089834513, 18.642878250591018, 18.642878250591018, 18.642878250591018, 18.642878250591018, 18.522828014184395, 18.05186170212766, 9.685283687943262, 22.798463356973993, 21.625664893617017, 21.625664893617017, 26.316858747044915, 26.316858747044915, 22.392139479905435, 22.392139479905435, 29.63209219858156, 29.63209219858156, 22.95545212765957, 27.314199172576835, 10.442523640661937, 21.034648345153663, 21.265514184397162, 18.36583924349882, 1.2540632387706852, 2.0390070921985806, 3.4149674940898342, 2.362219267139479, 9.79609929078014, 8.928043735224584, 48.45227541371159, 42.495936761229316, 29.373522458628837, 29.08724881796691, 29.08724881796691, 16.22340425531915, 15.512337470449172, 21.293218085106382, 21.681072695035464, 21.681072695035464, 19.751034278959807, 21.3670951536643, 7.7552452718676115, 17.239213947990542, 35.366799645390074, 9.011155437352246, 10.507166075650117, 19.61251477541371, 39.36539598108747, 40.45508274231678, 1.9281914893617011, 1.9281914893617011, 29.659796099290777, 24.128250591016545, 24.128250591016545, 15.032136524822693, 7.0718823877068555, 7.1365248226950335, 1.1986554373522447, 0.9585549645390061, 1.4756944444444438, 1.8266105200945615, 1.1617169030732852, 19.67715721040189, 19.67715721040189, 1.4295212765957446, 1.6142139479905429, 1.6603871158392434, 12.594193262411347, 12.594193262411347, 5.612810283687943, 1.1894208037825047, 1.8173758865248217, 17.5993646572104, 4.061391843971631, 4.061391843971631, 3.9228723404255303, 21.644134160756497, 21.644134160756497, 7.7183067375886525, 1.9928339243498814, 1.9928339243498814, 4.4123079196217505, 
1.7712027186761226, 1.7712027186761226, 21.56102245862884, 21.56102245862884, 22.983156028368796, 12.741947399527188, 12.741947399527188, 10.257830969267138, 15.309175531914892, 15.918661347517732, 30.1954048463357, 7.524379432624113, 9.56523345153664, 9.56523345153664, 41.21232269503546, 46.74386820330969, 24.553043735224584, 24.553043735224584, 29.604388297872337, 1.3187056737588652, 19.437056737588648, 19.437056737588648, 14.653516548463358, 40.84293735224587, 30.38009751773049, 31.848404255319146, 7.6813682033096935, 18.088800236406616, 18.088800236406616, 21.699541962174944, 21.699541962174944, 15.336879432624112, 19.076906028368796, 22.364435579196215, 21.699541962174944, 40.4366134751773, 29.364287825059105, 29.364287825059105, 31.49748817966903, 31.49748817966903, 31.49748817966903, 16.36192375886525, 16.805186170212764, 18.947621158392433, 2.288342198581559, 1.7712027186761226, 1.5772754137115832, 25.458037825059098, 23.805038416075647, 23.92508865248227, 23.92508865248227, 23.92508865248227, 15.79861111111111, 15.79861111111111, 15.83554964539007, 23.278664302600472, 23.278664302600472, 23.020094562647756, 19.270833333333332, 5.1510786052009445, 6.905658983451536, 7.930703309692672, 7.432033096926712, 8.530954491725769, 20.40669326241135, 15.438460401891252, 22.826167257683213, 22.826167257683213, 10.368646572104018, 15.438460401891252, 16.722074468085104, 1.392582742316784, 1.725029550827422, 1.725029550827422, 1.6142139479905429, 8.530954491725769, 8.678708628841607, 8.678708628841607, 2.186761229314421, 1.3464095744680846, 3.027112884160755, 1.2355939716312052, 1.4110520094562635, 1.2171247044917257, 23.758865248226947, 26.667774822695034 ] }, { "marker": { "color": "#FF9D00" }, "mode": "markers", "name": "GPQA=f(MMLU)", "type": "scatter", "x": [ 77.9957192730956, 78.15303198667405, 77.08283992203879, 64.72689535017567, 65.00272042688377, 64.24095801586486, 70.73451954519469, 70.97203740649417, 71.01349610420871, 70.95946328062593, 76.35156727391681, 
75.56180045111563, 74.86694971720243, 74.89765262226099, 63.84606128348609, 64.10709139827398, 64.64867451074461, 64.64930135205712, 70.06330822380845, 70.34013960436471, 65.86320847049085, 56.33899126009816, 75.72824616707308, 74.02494713760785, 68.19530224441294, 59.762938924439105, 60.24359775384097, 26.783999358640752, 24.821655499137048, 26.45397884474629, 25.002595295235746, 26.75606660161476, 24.950593393502686, 27.245507410753593, 61.3896585162216, 61.061389920377685, 61.06911135758283, 60.70490193884751, 58.975890703009014, 60.682678692224144, 27.03799242337422, 62.26105246211977, 60.791623329684086, 62.374203705324284, 62.44460409845261, 62.11841694257893, 63.550105055987174, 63.07073351412039, 44.87880425591219, 64.79850156958106, 62.20609448217756, 62.1246057414326, 66.51191133673278, 66.55303373586858, 63.356659125507434, 63.245294478077106, 72.27737838264045, 72.17054547051251, 72.20739527153786, 66.29541259257003, 48.335234653692204, 63.02324877855956, 62.96478981633615, 62.24441734951871, 26.193572189883703, 25.730836314001447, 39.35334081217581, 33.81983654590648, 46.70838530175783, 45.86651800528403, 80.19575908781562, 78.03946337687869, 69.36026470831474, 68.51540963742676, 68.57472555756478, 56.523519639933376, 55.55462512266324, 61.9700216814056, 61.64766854906246, 61.69764020685532, 62.54071682523817, 60.970670017639726, 45.17037929235293, 57.25919892204937, 70.4515868286805, 47.49331993287192, 52.577029472650906, 61.74163205184785, 76.66476969478123, 77.15452603998116, 26.234231797093017, 26.306692338086823, 71.38894203721195, 64.70438661462165, 64.66799008434704, 51.34583865906561, 38.64860206508296, 41.21331864479665, 26.70214825620786, 27.47846514752041, 26.592509199327857, 24.21541212062042, 26.246222375725694, 61.03678038351008, 61.04002830421494, 25.916845651522607, 24.696321332710696, 25.183937022975865, 51.74656618684998, 51.77213745416527, 46.306113561052754, 24.963045825616692, 26.658730035291097, 60.91642429212937, 
41.679249280698635, 41.77314644002599, 37.648186828291585, 64.5598699571695, 66.02745725247713, 53.5170310018511, 34.611140710007234, 34.38230443417011, 40.86948703219191, 25.833717804860335, 25.404197294686202, 61.917687321536896, 62.84956375246854, 64.90605869704184, 51.037449016607304, 50.8231434865938, 54.63627087850933, 55.77103633323489, 63.90701952816292, 69.83208921288625, 48.318825530104384, 46.866075004498, 43.79609636092417, 79.23293386179718, 80.05922317995427, 66.70480406193239, 66.49495492503577, 67.0721995784583, 25.838967331895883, 60.37069885710225, 60.301159128932504, 56.371153419443566, 77.83091143434687, 68.70399293043886, 69.08301379498349, 43.886467784197876, 58.1123279817493, 57.92026667108576, 63.689429588238426, 63.617802578997775, 55.37547435463056, 60.778011275770005, 64.16448378492244, 63.46289361741425, 77.79499653225467, 71.16228271617086, 71.39615714693673, 71.81543655466032, 70.17346542519046, 71.87839332236227, 64.40157643349518, 64.87016580791871, 65.76439370548329, 30.795325425385716, 25.833717804860335, 26.067403063531568, 65.03521927528267, 64.62163338646481, 64.97554013321405, 64.94207335284045, 62.892016034379076, 56.823746005051746, 56.68268469601176, 55.89931756667269, 64.73176939664393, 64.68760308408112, 62.039927374602314, 61.05866061151978, 38.94657259915192, 41.47262454451639, 42.03478315758355, 45.225737544229936, 46.1679465865397, 62.349785875539155, 56.34727779629169, 63.8216682847302, 63.636962953801635, 48.617757574363246, 58.3670122238434, 56.8938511726681, 27.785470153285935, 25.660046534596926, 25.836963794035032, 29.919101184126255, 43.32570701968771, 46.105045377255685, 45.53223866033279, 27.68214018695477, 26.942822650634863, 37.61836019303777, 27.02787422464126, 26.231263349013634, 25.032214024865144, 66.21064893071143, 65.47877433224612 ], "y": [ 15.436241610738257, 15.100671140939594, 15.324384787472036, 8.277404921700223, 8.277404921700223, 9.060402684563762, 17.225950782997764, 14.541387024608499, 
11.297539149888143, 7.829977628635347, 15.548098434004473, 14.205816554809845, 11.74496644295302, 11.74496644295302, 2.572706935123044, 2.572706935123044, 4.250559284116329, 4.250559284116329, 9.060402684563762, 8.7248322147651, 7.046979865771815, 4.5861297539149914, 7.38255033557047, 7.38255033557047, 7.606263982102905, 6.040268456375841, 5.257270693512303, 0, 0.7829977628635317, 2.1252796420581683, 0, 0, 1.1185682326621946, 1.230425055928408, 6.375838926174497, 5.369127516778524, 5.369127516778524, 5.369127516778524, 5.92841163310962, 5.92841163310962, 0, 5.592841163310966, 5.369127516778524, 5.369127516778524, 5.369127516778524, 5.369127516778524, 5.369127516778524, 5.257270693512303, 1.342281879194629, 5.7046979865771785, 3.1319910514541416, 3.1319910514541416, 7.158836689038028, 7.158836689038028, 5.7046979865771785, 5.7046979865771785, 9.61968680089485, 9.61968680089485, 6.935123042505594, 5.8165548098433995, 1.7897091722595053, 6.487695749440718, 5.369127516778524, 2.9082774049216997, 1.0067114093959737, 0, 0.5592841163310973, 2.572706935123044, 7.38255033557047, 6.375838926174497, 13.646532438478745, 12.192393736017896, 5.92841163310962, 2.684563758389265, 2.684563758389265, 3.5794183445190177, 2.2371364653243813, 6.487695749440718, 7.046979865771815, 7.046979865771815, 1.230425055928408, 3.243847874720355, 0.7829977628635317, 1.9015659955257262, 7.270693512304249, 1.342281879194629, 3.243847874720355, 4.026845637583895, 10.626398210290827, 9.61968680089485, 1.5659955257270708, 1.5659955257270708, 7.494407158836691, 6.263982102908276, 6.263982102908276, 2.9082774049216997, 0, 0.22371364653244186, 1.230425055928408, 1.1185682326621946, 0, 1.5659955257270708, 1.9015659955257262, 4.921700223713646, 4.921700223713646, 0, 1.4541387024608499, 2.460850111856823, 2.1252796420581683, 2.1252796420581683, 0.5592841163310973, 0, 2.572706935123044, 5.8165548098433995, 0.6711409395973182, 0.6711409395973182, 3.8031319910514525, 4.921700223713646, 4.921700223713646, 
4.5861297539149914, 0, 0, 1.0067114093959737, 1.1185682326621946, 1.1185682326621946, 3.6912751677852316, 3.6912751677852316, 6.263982102908276, 1.1185682326621946, 1.1185682326621946, 0, 4.138702460850116, 1.9015659955257262, 7.046979865771815, 0.5592841163310973, 2.2371364653243813, 2.2371364653243813, 19.686800894854585, 4.921700223713646, 7.38255033557047, 7.38255033557047, 1.230425055928408, 0.5592841163310973, 4.026845637583895, 4.026845637583895, 0.7829977628635317, 11.521252796420578, 9.060402684563762, 9.284116331096197, 2.348993288590602, 2.9082774049216997, 2.9082774049216997, 5.592841163310966, 5.592841163310966, 0, 3.467561521252797, 5.592841163310966, 5.592841163310966, 16.778523489932887, 7.606263982102905, 7.606263982102905, 9.284116331096197, 9.284116331096197, 9.284116331096197, 2.684563758389265, 4.697986577181204, 3.9149888143176734, 1.342281879194629, 1.1185682326621946, 1.230425055928408, 8.165548098434002, 6.823266219239373, 6.487695749440718, 6.487695749440718, 6.487695749440718, 2.684563758389265, 2.684563758389265, 1.9015659955257262, 6.599552572706939, 6.599552572706939, 3.8031319910514525, 2.2371364653243813, 0, 0, 0, 0, 0, 4.921700223713646, 3.0201342281879207, 4.4742729306487705, 4.4742729306487705, 2.572706935123044, 2.796420581655479, 3.1319910514541416, 0, 0, 0, 0, 0, 0.22371364653244186, 0.22371364653244186, 0.6711409395973182, 0.33557046979865535, 0.11185682326622093, 0, 0, 0, 7.829977628635347, 4.138702460850116 ] } ], "layout": { "height": 350, "legend": { "title": { "text": "Evaluations" } }, "showlegend": true, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], 
"carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 
0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 
0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": 
"#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "New scores as functions of old evaluations" }, "width": 600, "xaxis": { "range": [ 0, 100 ], "title": { "text": "v1 scores" }, "type": "linear" }, "yaxis": { "range": [ -5, 80 ], "title": { "text": "v2 scores" }, "type": "linear" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# (v1 benchmark, v2 benchmark, marker colour) pairs to compare on a single figure\n", "mapping = [(\"MMLU\", \"MMLU-PRO\", colors[1]), (\"MMLU\", \"GPQA\", colors[0])]\n", "\n", "# Create a new figure object\n", "fig = go.Figure()\n", "\n", "# Add one scatter trace per pair: x is the v1 column, y is the v2 column of merged_data\n", "for old, new, color in mapping:\n", "    fig.add_trace(go.Scatter(\n", "        x=merged_data[old],\n", "        y=merged_data[new],\n", "        mode='markers',\n", "        marker_color=color,\n", "        name=f'{new}=f({old})'\n", "    ))\n", "\n", "# Fixed axis ranges and figure size\n", "fig.update_layout(\n", "    title='New scores as functions of old evaluations',\n", "    xaxis=dict(title='v1 scores', range=[0, 100]),\n", "    yaxis=dict(title='v2 scores', range=[-5, 80]),\n", "    legend_title=\"Evaluations\",\n", "    showlegend=True,\n", "    width=600,\n", "    height=350,\n", ")\n", "\n", "# Export the figure as an HTML snippet (full_html=False). The encoding is explicit so the\n", "# write does not depend on the platform's default text encoding.\n", "with open(\"./plots/new_scores_vs_old.html\", \"w\", encoding=\"utf-8\") as f:\n", "    f.write(fig.to_html(full_html=False))\n", "\n", "# Display the plot\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": 40, "id": "2adade8b-f25f-4b98-a06a-89f36c74bfdd", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "hovertemplate": "color=default
GSM8K (v1)=%{x}
MATH (v2)=%{y}", "legendgroup": "default", "marker": { "color": "#FF9D00", "symbol": "circle" }, "mode": "markers", "name": "default", "orientation": "v", "showlegend": true, "type": "scatter", "x": [ 73.23730098559514, 71.64518574677786, 50.34116755117514, 49.81046247156937, 67.09628506444277, 62.69901440485216, 71.87263078089462, 59.66641394996209, 50.644427596664144, 34.874905231235786, 19.787717968157693, 31.91811978771797, 12.661106899166036, 12.130401819560273, 31.387414708112203, 30.32600454890068, 48.9764973464746, 47.61182714177407, 58.605003790750565, 43.51781652767248, 70.73540561031083, 47.30856709628507, 47.384382107657316, 46.019711902956786, 2.9567854435178167, 0.45489006823351025, 1.288855193328279, 5.458680818802123, 1.7437452615617892, 0.22744503411675512, 0.6823351023502654, 14.025777103866567, 11.372251705837757, 29.037149355572407, 27.065959059893856, 44.04852160727824, 45.564821834723276, 0, 1.2130401819560273, 18.11978771796816, 19.56027293404094, 19.408642911296436, 20.0909780136467, 55.117513267627, 61.10689916603488, 20.92494313874147, 67.93025018953753, 60.424564063684606, 61.25852918877938, 70.73540561031083, 70.43214556482184, 60.424564063684606, 60.50037907505686, 70.65959059893859, 71.64518574677786, 69.59818043972706, 69.44655041698256, 5.761940864291129, 32.60045489006823, 33.2827899924185, 19.9393479909022, 0.6065200909780136, 2.047005307050796, 16.300227445034114, 7.657316148597422, 33.586050037907505, 19.02956785443518, 81.04624715693708, 30.09855951478393, 67.62699014404852, 30.856709628506447, 30.62926459438969, 52.23654283548142, 2.4260803639120545, 53.525398028809704, 13.570887035633056, 13.191811978771797, 16.982562547384383, 28.203184230477635, 35.70887035633055, 55.799848369977255, 72.93404094010614, 17.81652767247915, 44.200151630022745, 34.19257012888552, 78.69598180439728, 1.8953752843062925, 1.8953752843062925, 61.10689916603488, 62.3199393479909, 62.39575435936315, 52.23654283548142, 19.63608794541319, 
25.094768764215313, 0.22744503411675512, 0.8339651250947688, 1.5163002274450341, 0.3032600454890068, 1.3646702047005308, 64.6702047005307, 64.44275966641395, 1.2130401819560273, 1.061410159211524, 1.1372251705837757, 46.550416982562545, 45.94389689158454, 17.28582259287339, 0.1516300227445034, 2.1986353297952994, 42.98711144806672, 17.36163760424564, 16.90674753601213, 5.458680818802123, 50.87187263078089, 52.76724791508719, 29.18877937831691, 16.148597422289612, 15.693707354056102, 10.083396512509477, 0.6823351023502654, 0.37907505686125853, 44.351781652767244, 55.420773313116, 41.09173616376042, 8.188021228203183, 8.112206216830932, 15.238817285822591, 22.820318423047762, 26.686884003032603, 54.05610310841546, 7.354056103108415, 14.480667172100075, 5.382865807429871, 76.87642153146324, 85.44351781652767, 45.185746777862015, 45.33737680060652, 68.68840030326004, 0, 17.968157695223656, 37.831690674753595, 14.70811220621683, 79.37831690674754, 69.52236542835482, 74.52615617892343, 12.43366186504928, 54.814253222137985, 54.965883244882484, 34.950720242608035, 34.72327520849128, 14.25322213798332, 40.0303260045489, 37.831690674753595, 34.495830174374525, 73.69219105382867, 60.72782410917361, 61.10689916603488, 57.4677786201668, 46.85367702805156, 57.619408642911296, 66.71721000758151, 70.58377558756634, 45.94389689158454, 4.01819560272934, 0.6823351023502654, 0.8339651250947688, 68.15769522365429, 65.95905989385898, 26.611068991660346, 26.838514025777105, 25.777103866565582, 13.419257012888552, 13.646702047005308, 13.495072024260804, 61.48597422289613, 61.63760424564063, 56.02729340409402, 57.84685367702805, 17.437452615617893, 38.81728582259287, 35.329795299469296, 3.3358605003790753, 42.15314632297195, 35.860500379075056, 11.599696739954512, 25.473843821076574, 26.080363912054587, 5.003790750568612, 53.82865807429871, 21.455648218347235, 4.624715693707354, 4.624715693707354, 4.700530705079606, 6.899166034874906, 4.321455648218348, 5.686125852918878, 
4.700530705079606, 3.0326004548900682, 0.45489006823351025, 1.592115238817286, 1.288855193328279, 0.530705079605762, 1.3646702047005308, 55.49658832448825 ], "xaxis": "x", "y": [ 14.04833836858006, 23.338368580060422, 5.664652567975831, 5.664652567975831, 12.537764350453173, 10.196374622356496, 11.63141993957704, 12.613293051359516, 4.45619335347432, 4.45619335347432, 4.305135951661631, 4.305135951661631, 1.5105740181268883, 1.5105740181268883, 1.2084592145015105, 1.2084592145015105, 4.380664652567976, 5.81570996978852, 3.3232628398791544, 1.4350453172205437, 7.552870090634441, 7.552870090634441, 2.416918429003021, 2.794561933534743, 1.2084592145015105, 0.6797583081570997, 0.5287009063444109, 0.6042296072507553, 0.906344410876133, 0.22658610271903326, 0.3021148036253776, 1.5105740181268883, 2.416918429003021, 2.416918429003021, 2.416918429003021, 6.646525679758309, 6.646525679758309, 0, 2.190332326283988, 3.1722054380664653, 3.1722054380664653, 3.1722054380664653, 3.1722054380664653, 4.531722054380665, 0.6797583081570997, 1.812688821752266, 5.740181268882175, 4.833836858006042, 4.833836858006042, 8.685800604229607, 8.685800604229607, 4.229607250755287, 4.229607250755287, 10.27190332326284, 10.27190332326284, 1.7371601208459215, 5.211480362537765, 0.6797583081570997, 2.492447129909366, 3.0211480362537766, 2.9456193353474323, 1.4350453172205437, 0.6042296072507553, 0.4531722054380665, 0, 2.2658610271903323, 0.4531722054380665, 23.036253776435046, 0, 16.46525679758308, 0, 0, 2.416918429003021, 0.9818731117824773, 4.45619335347432, 0, 0, 0.1510574018126888, 0, 2.56797583081571, 6.268882175226587, 18.806646525679756, 1.6616314199395772, 1.6616314199395772, 5.664652567975831, 17.749244712990937, 0.7552870090634441, 0.7552870090634441, 8.685800604229607, 7.175226586102719, 7.175226586102719, 4.833836858006042, 1.4350453172205437, 2.56797583081571, 0.1510574018126888, 0.0755287009063444, 0.0755287009063444, 0.0755287009063444, 0, 5.0604229607250755, 5.0604229607250755, 
1.4350453172205437, 0.6042296072507553, 0.9818731117824773, 1.7371601208459215, 1.7371601208459215, 1.812688821752266, 0.7552870090634441, 0.6042296072507553, 3.1722054380664653, 2.719033232628399, 2.719033232628399, 0.4531722054380665, 6.419939577039275, 6.419939577039275, 1.5861027190332326, 1.6616314199395772, 1.6616314199395772, 1.7371601208459215, 0.22658610271903326, 0.22658610271903326, 4.380664652567976, 4.380664652567976, 2.3413897280966767, 0.7552870090634441, 0.7552870090634441, 0.6042296072507553, 1.0574018126888218, 0.906344410876133, 2.492447129909366, 0.6797583081570997, 1.2084592145015105, 1.2084592145015105, 16.540785498489427, 23.338368580060422, 3.2477341389728096, 3.2477341389728096, 8.685800604229607, 0, 0.9818731117824773, 0.9818731117824773, 0.8308157099697886, 16.993957703927492, 8.91238670694864, 11.63141993957704, 1.1329305135951662, 2.416918429003021, 2.416918429003021, 2.643504531722054, 2.643504531722054, 1.5105740181268883, 2.643504531722054, 2.492447129909366, 2.643504531722054, 16.842900302114806, 9.06344410876133, 9.06344410876133, 8.836858006042297, 8.836858006042297, 8.836858006042297, 3.8519637462235647, 4.758308157099698, 3.8519637462235647, 1.283987915407855, 0.22658610271903326, 0.6797583081570997, 6.495468277945619, 6.873111782477341, 6.570996978851963, 6.570996978851963, 6.570996978851963, 1.1329305135951662, 1.1329305135951662, 1.5861027190332326, 3.927492447129909, 3.927492447129909, 3.474320241691843, 2.0392749244712993, 0.1510574018126888, 1.0574018126888218, 2.1148036253776437, 0.6797583081570997, 4.078549848942599, 2.9456193353474323, 0.8308157099697886, 4.45619335347432, 4.45619335347432, 1.0574018126888218, 2.3413897280966767, 1.3595166163141994, 0.5287009063444109, 0.6042296072507553, 0.6042296072507553, 1.1329305135951662, 0.6797583081570997, 1.0574018126888218, 1.0574018126888218, 0.9818731117824773, 0.1510574018126888, 1.3595166163141994, 0.906344410876133, 0.3021148036253776, 0.6042296072507553, 
2.1148036253776437 ], "yaxis": "y" }, { "hovertemplate": "color=green
GSM8K (v1)=%{x}
MATH (v2)=%{y}", "legendgroup": "green", "marker": { "color": "#297373", "symbol": "circle" }, "mode": "markers", "name": "green", "orientation": "v", "showlegend": true, "type": "scatter", "x": [ 0, 0 ], "xaxis": "x", "y": [ 13.444108761329304, 9.592145015105741 ], "yaxis": "y" }, { "hovertemplate": "color=red
GSM8K (v1)=%{x}
MATH (v2)=%{y}", "legendgroup": "red", "marker": { "color": "#FF323D", "symbol": "circle" }, "mode": "markers", "name": "red", "orientation": "v", "showlegend": true, "type": "scatter", "x": [ 56.633813495072026, 72.17589082638362, 64.74601971190296 ], "xaxis": "x", "y": [ 0, 0, 0 ], "yaxis": "y" } ], "layout": { "height": 350, "legend": { "title": { "text": "color" }, "tracegroupgap": 0 }, "showlegend": false, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 
0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 
0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 
0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": 
"white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "MATH (v2) as Function of GSM8K (v1)" }, "width": 600, "xaxis": { "anchor": "y", "autorange": true, "domain": [ 0, 1 ], "range": [ -5.663244745883108, 91.10676256241078 ], "title": { "text": "GSM8K (v1)" }, "type": "linear" }, "yaxis": { "anchor": "x", "autorange": true, "domain": [ 0, 1 ], "range": [ -1.9649141931665992, 25.30328277322702 ], "title": { "text": "MATH (v2)" }, "type": "linear" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Marker colour for each group label\n", "color_map = {'green': GREEN, 'red': RED, 'default': ORANGE}\n", "\n", "# Group conditions. np.select uses the first condition that matches a row; rows that match\n", "# none of them (e.g. MATH == 0 with GSM8K <= 40) get the 'default' label.\n", "conditions = [\n", "    # both scores non-zero\n", "    (merged_data['MATH Lvl 5'] > 0) & (merged_data['GSM8K'] > 0),\n", "    # zero on MATH but above 40 on GSM8K\n", "    (merged_data['MATH Lvl 5'] == 0) & (merged_data['GSM8K'] > 40),\n", "    # non-zero on MATH but zero on GSM8K\n", "    (merged_data['MATH Lvl 5'] > 0) & (merged_data['GSM8K'] == 0)\n", "]\n", "\n", "# Labels for each condition, in the same order as `conditions`\n", "labels = ['default', 'red', 'green']\n", "\n", "# Store the group label of every row in a new 'color' column\n", "merged_data['color'] = np.select(conditions, labels, default='default')\n", "\n", "# Create the scatter plot using the new 'color' column for color-coding\n", "fig = px.scatter(\n", "    merged_data,\n", "    x='GSM8K',\n", "    y='MATH Lvl 5',\n", "    labels={\n", "        'GSM8K': 'GSM8K (v1)',\n", "        'MATH Lvl 5': 'MATH (v2)'\n", "    },\n", "    title='MATH (v2) as Function of GSM8K (v1)',\n", "    color='color',\n", "    color_discrete_map=color_map\n", ")\n", "\n", "# Update axes and layout\n", "fig.update_xaxes(title_text='GSM8K (v1)')\n", "fig.update_yaxes(title_text='MATH (v2)')\n", "fig.update_layout(\n", "    showlegend=False,\n", "    width=600,\n", "    height=350\n", ")\n", "\n", "# Export next to the other figures in ./plots (created at the top of the notebook).\n", "# The encoding is explicit so the write does not depend on the platform's default.\n", "with open(\"./plots/math_vs_gsm8k.html\", \"w\", encoding=\"utf-8\") as f:\n", "    f.write(fig.to_html(full_html=False))\n", "\n", "fig.show()" ] }, { "cell_type": "markdown", "id": "f22371aa-3f2c-4bf8-84e0-d887eefbb4ac", "metadata": {}, "source": [ "- MMLU and MMLU-PRO are well correlated - overall, a model with a high MMLU score has a high MMLU-PRO score\n", "- For MATH vs GSM8K, we identify 3 groups:\n", "    - \"High\" MATH score, very low v1 score (2 outliers): possible overfitting on MATH, or, more likely, these are among the models with EOS token issues on GSM8K\n", "    - Correlation between v2 and v1 score (most models)\n", "    - Low MATH score, high GSM8K score: likely overfitting on GSM8K" ] }, { "cell_type": "markdown", "id":
"cbac9655-0697-4b5c-88f3-2286038d4c67", "metadata": {}, "source": [ "# Ranking Analysis between V1 and V2" ] }, { "cell_type": "code", "execution_count": 41, "id": "dd7a03bc-183e-4e57-9791-025ff634c76f", "metadata": {}, "outputs": [], "source": [ "# Extracting the relevant information for ranking comparison between v1 and v2\n", "# NOTE: ideally rows would be matched on the same precision when available; for now only fullname is used\n", "v2_rank_data = data_v2[[\"fullname\", \"Average ⬆️\"]]\n", "v1_rank_data = data_v1[[\"fullname\", \"Average ⬆️\"]]" ] }, { "cell_type": "code", "execution_count": 42, "id": "597ed839-45af-49cf-a160-f9b47eccfc63", "metadata": {}, "outputs": [], "source": [ "# Renaming columns for clarity\n", "v2_rank_data = v2_rank_data.rename(columns={\"Average ⬆️\": \"v2_score\"})\n", "v1_rank_data = v1_rank_data.rename(columns={\"Average ⬆️\": \"v1_score\"})" ] }, { "cell_type": "code", "execution_count": 43, "id": "865a4e21-0a4b-4a7e-b3e6-ebd23b4d9ec9", "metadata": {}, "outputs": [], "source": [ "# Merging the two dataframes on 'fullname' (inner join: only models present in both versions)\n", "merged_rank_data = pd.merge(v1_rank_data, v2_rank_data, on=\"fullname\", how=\"inner\")" ] }, { "cell_type": "code", "execution_count": 44, "id": "edbe899e-4603-4e52-a64d-39a5db8097b0", "metadata": {}, "outputs": [], "source": [ "merged_rank_data = merged_rank_data.drop_duplicates(subset=[\"fullname\"]).dropna()" ] }, { "cell_type": "code", "execution_count": 45, "id": "63cda705-29b5-41ff-b535-a4a8d1ee4e32", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fullnamev1_scorev2_score
001-ai/Yi-1.5-34B73.50461825.812197
101-ai/Yi-1.5-34B-32K60.70097726.787600
201-ai/Yi-1.5-34B-Chat74.82376333.076818
301-ai/Yi-1.5-6B61.56652016.778059
501-ai/Yi-1.5-6B-Chat66.16730322.405532
\n", "
" ], "text/plain": [ " fullname v1_score v2_score\n", "0 01-ai/Yi-1.5-34B 73.504618 25.812197\n", "1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600\n", "2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818\n", "3 01-ai/Yi-1.5-6B 61.566520 16.778059\n", "5 01-ai/Yi-1.5-6B-Chat 66.167303 22.405532" ] }, "execution_count": 45, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_rank_data.head()" ] }, { "cell_type": "code", "execution_count": 46, "id": "9e98fda2-590e-4333-9d32-44bcfc961464", "metadata": {}, "outputs": [], "source": [ "# Rank the models within each version: the highest score gets rank 1.\n", "# Tied scores share the average of their ranks (pandas' default method),\n", "# which is why fractional ranks such as 165.5 can appear.\n", "merged_rank_data[\"v1_rank\"] = merged_rank_data[\"v1_score\"].rank(\n", "    method=\"average\", ascending=False\n", ")\n", "merged_rank_data[\"v2_rank\"] = merged_rank_data[\"v2_score\"].rank(\n", "    method=\"average\", ascending=False\n", ")" ] }, { "cell_type": "code", "execution_count": 47, "id": "3d9a8416-8c50-4290-83a3-34651e0af8a7", "metadata": {}, "outputs": [], "source": [ "# Rank movement from v1 to v2: negative values mean the rank number got\n", "# smaller, i.e. the model moved up the leaderboard\n", "merged_rank_data[\"rank_change\"] = merged_rank_data[\"v2_rank\"].sub(merged_rank_data[\"v1_rank\"])\n", "\n", "# Order by rank change, then discard rows with missing values\n", "merged_rank_data = merged_rank_data.sort_values(\"rank_change\").dropna()" ] }, { "cell_type": "code", "execution_count": 48, "id": "f738b928-000e-41d5-aadc-da47748cff35", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fullnamev1_scorev2_scorev1_rankv2_rankrank_change
89abacusai/Smaug-72B-v0.180.48141529.9760221.05.04.0
138meta-llama/Meta-Llama-3-70B-Instruct77.88205136.6702252.01.0-1.0
88abacusai/Smaug-34B-v0.177.28564424.1272143.022.019.0
164mlabonne/AlphaMonarch-7B75.98830017.9136144.059.055.0
165mlabonne/Beyonder-4x7B-v375.65471819.6422375.047.042.0
70Qwen/Qwen1.5-110B75.41518829.9753756.06.00.0
201-ai/Yi-1.5-34B-Chat74.82376333.0768187.03.0-4.0
22CohereForAI/c4ai-command-r-plus74.61847331.2953368.04.0-4.0
158mistralai/Mixtral-8x22B-v0.174.47141825.8715319.014.05.0
207upstage/SOLAR-10.7B-Instruct-v1.074.20069819.96198910.045.035.0
\n", "
" ], "text/plain": [ " fullname v1_score v2_score v1_rank \\\n", "89 abacusai/Smaug-72B-v0.1 80.481415 29.976022 1.0 \n", "138 meta-llama/Meta-Llama-3-70B-Instruct 77.882051 36.670225 2.0 \n", "88 abacusai/Smaug-34B-v0.1 77.285644 24.127214 3.0 \n", "164 mlabonne/AlphaMonarch-7B 75.988300 17.913614 4.0 \n", "165 mlabonne/Beyonder-4x7B-v3 75.654718 19.642237 5.0 \n", "70 Qwen/Qwen1.5-110B 75.415188 29.975375 6.0 \n", "2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818 7.0 \n", "22 CohereForAI/c4ai-command-r-plus 74.618473 31.295336 8.0 \n", "158 mistralai/Mixtral-8x22B-v0.1 74.471418 25.871531 9.0 \n", "207 upstage/SOLAR-10.7B-Instruct-v1.0 74.200698 19.961989 10.0 \n", "\n", " v2_rank rank_change \n", "89 5.0 4.0 \n", "138 1.0 -1.0 \n", "88 22.0 19.0 \n", "164 59.0 55.0 \n", "165 47.0 42.0 \n", "70 6.0 0.0 \n", "2 3.0 -4.0 \n", "22 4.0 -4.0 \n", "158 14.0 5.0 \n", "207 45.0 35.0 " ] }, "execution_count": 48, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The ten best-ranked models on v1 (sort_values is ascending by default)\n", "merged_rank_data.sort_values(\"v1_rank\").head(10)" ] }, { "cell_type": "code", "execution_count": 49, "id": "fe727afb-0207-4c6c-9a77-a0a1e026e793", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fullnamev1_scorev2_scorev1_rankv2_rankrank_change
138meta-llama/Meta-Llama-3-70B-Instruct77.88205136.6702252.01.0-1.0
146microsoft/Phi-3-medium-4k-instruct73.44855333.11686413.02.0-11.0
201-ai/Yi-1.5-34B-Chat74.82376333.0768187.03.0-4.0
22CohereForAI/c4ai-command-r-plus74.61847331.2953368.04.0-4.0
89abacusai/Smaug-72B-v0.180.48141529.9760221.05.04.0
70Qwen/Qwen1.5-110B75.41518829.9753756.06.00.0
71Qwen/Qwen1.5-110B-Chat68.00641529.63956034.07.0-27.0
801-ai/Yi-1.5-9B-Chat69.55569528.11141821.08.0-13.0
56NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO73.12219627.34344114.09.0-5.0
101-ai/Yi-1.5-34B-32K60.70097726.78760071.010.0-61.0
\n", "
" ], "text/plain": [ " fullname v1_score v2_score \\\n", "138 meta-llama/Meta-Llama-3-70B-Instruct 77.882051 36.670225 \n", "146 microsoft/Phi-3-medium-4k-instruct 73.448553 33.116864 \n", "2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818 \n", "22 CohereForAI/c4ai-command-r-plus 74.618473 31.295336 \n", "89 abacusai/Smaug-72B-v0.1 80.481415 29.976022 \n", "70 Qwen/Qwen1.5-110B 75.415188 29.975375 \n", "71 Qwen/Qwen1.5-110B-Chat 68.006415 29.639560 \n", "8 01-ai/Yi-1.5-9B-Chat 69.555695 28.111418 \n", "56 NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO 73.122196 27.343441 \n", "1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600 \n", "\n", " v1_rank v2_rank rank_change \n", "138 2.0 1.0 -1.0 \n", "146 13.0 2.0 -11.0 \n", "2 7.0 3.0 -4.0 \n", "22 8.0 4.0 -4.0 \n", "89 1.0 5.0 4.0 \n", "70 6.0 6.0 0.0 \n", "71 34.0 7.0 -27.0 \n", "8 21.0 8.0 -13.0 \n", "56 14.0 9.0 -5.0 \n", "1 71.0 10.0 -61.0 " ] }, "execution_count": 49, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# The ten best-ranked models on v2 (sort_values is ascending by default)\n", "merged_rank_data.sort_values(\"v2_rank\").head(10)" ] }, { "cell_type": "code", "execution_count": 50, "id": "1c4101f6-01aa-4bdb-b1d4-216d773c9dd4", "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
fullnamev1_scorev2_scorev1_rankv2_rankrank_changecolorrank_change_info
101-ai/Yi-1.5-34B-32K60.70097726.78760071.010.0-61.0#32343D71 → 10
701-ai/Yi-1.5-9B-32K55.21718419.93783296.046.0-50.0#32343D96 → 46
42Intel/neural-chat-7b-v3-159.90031921.34857376.040.0-36.0#32343D76 → 40
123gpt228.5304256.754202165.5131.0-34.5#32343D165 → 131
1201-ai/Yi-34B-Chat63.17338624.26841955.021.0-34.0#32343D55 → 21
\n", "
" ], "text/plain": [ " fullname v1_score v2_score v1_rank v2_rank \\\n", "1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600 71.0 10.0 \n", "7 01-ai/Yi-1.5-9B-32K 55.217184 19.937832 96.0 46.0 \n", "42 Intel/neural-chat-7b-v3-1 59.900319 21.348573 76.0 40.0 \n", "123 gpt2 28.530425 6.754202 165.5 131.0 \n", "12 01-ai/Yi-34B-Chat 63.173386 24.268419 55.0 21.0 \n", "\n", " rank_change color rank_change_info \n", "1 -61.0 #32343D 71 → 10 \n", "7 -50.0 #32343D 96 → 46 \n", "42 -36.0 #32343D 76 → 40 \n", "123 -34.5 #32343D 165 → 131 \n", "12 -34.0 #32343D 55 → 21 " ] }, "execution_count": 50, "metadata": {}, "output_type": "execute_result" } ], "source": [ "# Sorting merged_rank_data by 'rank_change'\n", "merged_rank_data = merged_rank_data.sort_values('rank_change')\n", "\n", "# Colour flag per model, taken from the notebook palette: ORANGE when the rank\n", "# number increased from v1 to v2 (rank_change > 0), BLACK otherwise\n", "merged_rank_data['color'] = np.where(merged_rank_data['rank_change'] > 0, ORANGE, BLACK)\n", "\n", "# The 10 largest rank changes in each direction\n", "top_pos = merged_rank_data.nlargest(10, 'rank_change')\n", "top_neg = merged_rank_data.nsmallest(10, 'rank_change')\n", "\n", "# Combine both selections. With fewer than 20 models the two selections\n", "# overlap, so keep each model only once.\n", "top_changes = (\n", "    pd.concat([top_pos, top_neg])\n", "    .drop_duplicates(subset='fullname')\n", "    .sort_values('rank_change')\n", ")\n", "\n", "\n", "def format_rank(rank):\n", "    \"\"\"Format a rank for display, keeping the decimal of tied (averaged) ranks.\n", "\n", "    Whole ranks are shown without decimals (71.0 -> '71'); fractional ranks\n", "    keep one decimal (165.5 -> '165.5') so the label agrees with rank_change.\n", "    \"\"\"\n", "    return f\"{rank:.0f}\" if float(rank).is_integer() else f\"{rank:.1f}\"\n", "\n", "\n", "# Create a column for detailed rank change information (\"v1 rank → v2 rank\")\n", "top_changes[\"rank_change_info\"] = top_changes.apply(\n", "    lambda row: f\"{format_rank(row['v1_rank'])} → {format_rank(row['v2_rank'])}\", axis=1\n", ")\n", "\n", "top_changes.head()" ] }, { "cell_type": "code", "execution_count": 59, "id": "6cf0127f-979e-4e20-8a5c-9cd1d1766810", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "alignmentgroup": "True", "hovertemplate": "rank_change=%{marker.color}
fullname=%{y}
rank_change_info=%{text}", "legendgroup": "", "marker": { "color": [ 55, 42, 41, 38, 36, 35, 33, 32, 30, 29, -30, -30, -32, -33, -33.5, -34, -34.5, -36, -50, -61 ], "coloraxis": "coloraxis", "pattern": { "shape": "" } }, "name": "", "offsetgroup": "", "orientation": "h", "showlegend": false, "text": [ "4 → 59", "5 → 47", "46 → 87", "30 → 68", "59 → 95", "10 → 45", "58 → 91", "53 → 85", "125 → 155", "128 → 157", "79 → 49", "83 → 53", "97 → 65", "68 → 35", "165 → 132", "55 → 21", "165 → 131", "76 → 40", "96 → 46", "71 → 10" ], "textposition": "outside", "type": "bar", "x": [ 55, 42, 41, 38, 36, 35, 33, 32, 30, 29, -30, -30, -32, -33, -33.5, -34, -34.5, -36, -50, -61 ], "xaxis": "x", "y": [ "mlabonne/AlphaMonarch-7B", "mlabonne/Beyonder-4x7B-v3", "tiiuae/falcon-11B", "stabilityai/stablelm-2-12b-chat", "meta-llama/Llama-2-70b-chat-hf", "upstage/SOLAR-10.7B-Instruct-v1.0", "meta-llama/Meta-Llama-3-8B", "stabilityai/stablelm-2-12b", "tiiuae/falcon-7b", "togethercomputer/GPT-NeoXT-Chat-Base-20B", "HuggingFaceH4/zephyr-7b-alpha", "microsoft/Orca-2-13b", "Qwen/Qwen1.5-7B-Chat", "openchat/openchat_3.5", "openai-community/gpt2", "01-ai/Yi-34B-Chat", "gpt2", "Intel/neural-chat-7b-v3-1", "01-ai/Yi-1.5-9B-32K", "01-ai/Yi-1.5-34B-32K" ], "yaxis": "y" } ], "layout": { "bargap": 0.1, "barmode": "relative", "coloraxis": { "colorbar": { "title": { "text": "rank_change" } }, "colorscale": [ [ 0, "#32343D" ], [ 1, "#FF9D00" ] ] }, "height": 400, "legend": { "tracegroupgap": 0 }, "paper_bgcolor": "rgba(0, 0, 0, 0)", "plot_bgcolor": "rgba(0, 0, 0, 0)", "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { 
"endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, 
"#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, 
"type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { "color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": 
true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Top and Bottom Changes in Rankings from v1 to v2" }, "width": 650, "xaxis": { "anchor": "y", "domain": [ 0, 1 ], "range": [ -85, 85 ], "tickfont": { "size": 14 }, "title": { "text": "rank_change" }, "type": "linear" }, "yaxis": { "anchor": "x", "autorange": true, "domain": [ 0, 1 ], "range": [ 19.5, -0.5 ], "title": { "text": "Model" }, "type": "category" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Largest rank_change first, so the models that lost the most positions\n", "# come first\n", "top_changes_sorted = top_changes.sort_values(\"rank_change\", ascending=False)\n", "\n", "# Using Plotly Express to create a horizontal bar chart; each bar is\n", "# labelled with the \"v1 rank → v2 rank\" text from 'rank_change_info'\n", "fig = px.bar(\n", "    top_changes_sorted,\n", "    y=\"fullname\",\n", "    x=\"rank_change\",\n", "    text=\"rank_change_info\",\n", "    orientation='h',\n", "    title=\"Top and Bottom Changes in Rankings from v1 to v2\",\n", "    # Figure size in pixels, set once here\n", "    height=400,\n", "    width=650,\n", "    template='plotly',\n", "    color='rank_change',\n", "    color_continuous_scale=[BLACK, ORANGE]\n", ")\n", "\n", "# Additional layout settings\n", "fig.update_layout(\n", "    xaxis_tickfont_size=14,\n", "    yaxis=dict(title='Model'),\n", "    bargap=0.1, # gap between bars of adjacent location coordinates\n", "    xaxis_range=[-85, 85], # fixed, symmetric x range\n", "    paper_bgcolor=\"rgba(0,0,0,0)\", # transparent backgrounds\n", "    plot_bgcolor=\"rgba(0,0,0,0)\"\n", ")\n", "\n", "fig.update_yaxes(autorange=\"reversed\") # Reverse the y-axis order\n", "fig.update_traces(textposition='outside') # Set text labels to be outside the bars\n", "\n", "# Save the figure as HTML\n", "fig.write_html(\"./plots/rankings_change.html\", full_html=False)\n", "\n", "# Display the figure\n", "fig.show()" ] }, { "cell_type": "markdown", "id": "a1adaffc-c98d-42c2-966e-69c0a228a6b5", "metadata": {}, "source": [ "# Params and Performance" ] }, { "cell_type": "code", "execution_count": 52, "id": "d78410a9-a826-4b07-b4d6-896b4bc5c39d", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Thresholds in correct order: True\n" ] } ], "source": [ "# Set date\n", "threshold_1 = datetime(2023, 10, 1, tzinfo=timezone.utc)\n", "threshold_2 = datetime(2023, 12, 1, tzinfo=timezone.utc)\n", "threshold_3 = datetime(2024, 2, 1, tzinfo=timezone.utc)\n", "threshold_4 = datetime(2024, 5, 1, tzinfo=timezone.utc)\n", "\n", "# Checking if thresholds are in the correct order\n", "thresholds = 
[threshold_1, threshold_2, threshold_3, threshold_4]\n", "print(\"Thresholds in correct order:\", all(x < y for x, y in zip(thresholds, thresholds[1:])))" ] }, { "cell_type": "code", "execution_count": 53, "id": "60b4c31f-deba-4648-983b-74da5615755a", "metadata": {}, "outputs": [], "source": [ "# Convert date columns to timezone-aware (UTC) datetimes if not already\n", "data_v1['date'] = pd.to_datetime(data_v1['date'], utc=True)\n", "data_v2['date'] = pd.to_datetime(data_v2['date'], utc=True)" ] }, { "cell_type": "code", "execution_count": 54, "id": "5ee768cd-f0ee-4272-83a4-667b9147efdc", "metadata": {}, "outputs": [], "source": [ "# Calculate the mean performance score across the tasks for both versions\n", "data_v1['mean_score_v1'] = data_v1[tasks_v1].mean(axis=1)\n", "data_v2['mean_score_v2'] = data_v2[tasks_v2].mean(axis=1)\n", "\n", "# Extract the necessary columns for plotting.\n", "# full=True also takes 'date' from v1 and uses it as a merge key;\n", "# full=False takes 'date' from v2 only.\n", "full=False\n", "if full:\n", "    v1_data = data_v1[['fullname', '#Params (B)', 'mean_score_v1', 'date']]\n", "    v2_data = data_v2[['fullname', '#Params (B)', 'mean_score_v2', 'date']]\n", "\n", "    # Outer merge on model name, date and parameter count\n", "    merged_data = pd.merge(v1_data, v2_data, on=['fullname', 'date', \"#Params (B)\"], how='outer')\n", "else:\n", "    v1_data = data_v1[['fullname', '#Params (B)', 'mean_score_v1']]\n", "    v2_data = data_v2[['fullname', '#Params (B)', 'mean_score_v2', 'date']]\n", "\n", "    # Outer merge on model name and parameter count; rows that exist only\n", "    # in v1 end up without a date\n", "    merged_data = pd.merge(v1_data, v2_data, on=['fullname', \"#Params (B)\"], how='outer')" ] }, { "cell_type": "code", "execution_count": 55, "id": "03e4d74c-96c2-493d-85e0-b9bacca5475b", "metadata": {}, "outputs": [], "source": [ "# Keep only the rows that have a date\n", "merged_data.dropna(subset=\"date\", inplace=True)\n", "\n", "merged_data[\"date\"] = pd.to_datetime(merged_data[\"date\"])\n", "\n", "# Sanity check: report any value that is not a pandas Timestamp.\n", "# pd.Timestamp is a datetime subclass, so this single public-API check\n", "# covers the datetime case as well.\n", "for row in merged_data[\"date\"]:\n", "    if not isinstance(row, pd.Timestamp):\n", "        print(row, type(row))" ] }, { "cell_type": "code", "execution_count": 58, "id": 
"c68c9dc6-e9a1-4290-a5a0-9c2bf50d8054", "metadata": {}, "outputs": [ { "data": { "application/vnd.plotly.v1+json": { "config": { "plotlyServerURL": "https://plot.ly" }, "data": [ { "marker": { "color": "#CD4631" }, "mode": "markers", "name": "V1 - before Oct 2023", "type": "scatter", "x": [ 6, 1, 2, 20, 12, 0, 2, 0, 6, 13, 6, 7, 7, 7, 12, 6, 13, 70, 1, 1, 3, 0, 7, 6, 12, 3, 7, 0, 1, 30, 0, 0, 7, 7, 13, 13, 68, 68, 6, 6, 0, 1, 1, 7, 7, 7, 0, 0, 0, 1, 13, 13, 2, 13, 7, 40, 40, 7, 7, 7, 20, 7, 7, 7, 7, 7, 7, 3, 3, 3 ], "y": [ 40.10074896682374, 33.58450647496717, 36.19824768495068, 41.69377240765692, 38.822040906736184, 29.017386733865294, null, 31.55133377215884, null, 29.824105477736065, 51.86594648645109, null, null, 60.17299741784677, 40.76571366769813, 38.46964052270226, null, null, 32.474427835547964, 33.98180945576432, 36.07003025101667, 30.132439377906426, 39.17746104462925, null, 39.45681851901109, 27.39878579922899, 39.240691196349985, null, 34.60432345850859, 41.99813222281348, 28.530424870857342, 28.276436827607895, 52.05662078097521, 51.9937163576299, 54.913330129300256, 55.68584772658749, 62.395586113277396, 67.86780441186171, 50.73977386071763, 50.96641959745035, 28.86118440538931, null, 47.686833800767765, 54.964081774475865, 60.96941505813863, 44.27659791450193, 28.530424870857342, 32.06876883377193, null, null, null, null, 46.579707042613045, 55.244824022683325, 51.26466794865067, 58.07206671286895, null, 44.17474040638268, 43.16491363154527, 43.263888814685636, 43.01545408366647, 47.07421603909338, 50.02368236244343, 49.65440227183998, 41.49146241534145, 39.367740655697816, 42.375750704310896, 38.53785244348318, 39.5271935863245, 39.05504906819049 ] }, { "marker": { "color": "#FF9D00" }, "mode": "markers", "name": "V1 - Oct 2023 to Dec 2023", "type": "scatter", "x": [ 34, 34, 34, 34, 6, 6, 6, 6, 6, 14, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, 3, 7, 7, 7, 13, 13, 7, 7, 2, 7, 7, 7, 7 ], "y": [ 69.42302680552211, 63.95868835473126, 63.173385805389785, 
65.31580196597741, 54.02206986885943, 54.08409847286497, 56.76083349753073, 56.692333561254124, null, 63.80841685680014, 59.495598231916354, 59.08319544895303, 61.947425364774716, 61.58637451862594, 58.4569342704247, 59.90031882855004, 61.59483128758199, 61.544437738990545, 61.59009701817113, 68.28528460648407, 59.4171929373125, 59.6305845818351, 67.04979846164085, 67.12789429905199, 39.24771029913045, null, 59.380403851463505, 59.272077233588966, 58.644617981010164, 61.98135744692983, 54.54868763409007, null, 53.425863722230076, 62.91531136327541, null, 61.45490056133619, 61.520319007341975 ] }, { "marker": { "color": "#FFD21E" }, "mode": "markers", "name": "V1 - Dec 2023 and after", "type": "scatter", "x": [ 34, 34, 34, 6, 6, 6, 8, 8, 8, 8, 8, 8, 34, 8, 103, 34, 7, 7, 8, 8, 7, 65, 13, 8, 7, 7, 8, 8, 7, 7, 46, 46, 46, 10, 10, 10, 0, 0, 1, 1, 111, 111, 14, 14, 14, 3, 3, 7, 7, 7, 14, 14, 0, 0, 1, 1, 72, 7, 7, 8, 8, 8, 8, 1, 34, 72, 1, 1, 7, 46, 15, 7, 8, 8, 16, 16, 2, 8, 2, 2, 2, 8, 8, 8, 2, 2, 2, 8, 8, 6, 7, 8, 1, 70, 70, 8, 8, 8, 13, 3, 3, 7, 2, 2, 7, 7, 140, 7, 7, 7, 140, 46, 46, 46, 46, 7, 24, 8, 8, 4, 7, 7, 8, 8, 12, 12, 1, 1, 1, 11, 10, 10 ], "y": [ 73.50461754455968, 60.70097659346695, 74.82376289192528, 61.56651964419019, 61.600186822769984, 66.16730292388148, 66.73139669787327, 55.21718433605241, 69.55569489674636, 66.978059635592, 63.16505555597532, 61.943561698500645, null, null, 74.61847256780356, 68.53563242970485, 61.55422124172559, 63.18561329425248, 61.47974712141572, 62.40634929999012, 69.83091004812034, null, 48.208404569684866, 68.73192270625397, 67.35497164911531, 67.43127118086376, 69.2083869245962, 69.1681152594907, 68.0972249353676, 68.10382079751825, 73.12219560989375, 73.35333284905784, 72.07085598541813, 70.99611494328558, null, null, 38.61887256539875, 35.61159106465575, 46.55469286468076, 43.98954132886814, null, null, 66.69637898264064, 62.36628426209001, 62.2708188237114, 57.049366448825, 46.78568768990244, 61.756593209412564, 
55.1547091552192, 55.1313193568431, 56.02722477794456, 57.22045479057058, 42.84986444317679, null, 55.80148346984753, null, null, 68.4038966179358, null, 51.67122869413651, 58.06414000695881, null, 61.06304789634692, null, 77.28564380843183, 80.48141483852343, 36.78193724145438, 36.72778183442903, null, 72.96938623240759, 52.79219769707651, 42.95318061588656, 65.92164049813488, 65.83198774238207, 51.067658055604085, null, null, 60.09140737121073, 46.508533827664245, 46.36834706441763, 42.750104341098215, 63.753519741904825, 64.28649887507693, 53.56017516680193, 40.44283900884822, 45.46087707986595, 40.86040726469349, 59.83882000643251, 62.62392839776545, null, 64.00461028850997, null, null, 73.95719822524191, 77.88205055819269, 62.623813022146, 62.35440585976382, 66.86966239077128, 73.44855327322567, 68.07092217507007, 69.90517093265896, null, 61.32508370227487, 61.08764545284802, 60.41385031955034, 60.37214776353206, null, 65.71304519417167, null, 60.28212129096002, 74.4714175673166, 72.61571999808636, 72.70471388912233, 68.41996524230247, 68.47189963839766, 75.98830023246877, 75.65471846500263, null, 63.99067496819134, null, 69.30492308781548, 68.88728572928933, 63.62325427218718, 63.631348521524046, 63.48339124883886, 68.37627183501046, 45.254773923509426, 50.71050223243154, 49.99090871621005, 64.28123118509966, 74.20069762794799, 66.03783597882489 ] }, { "marker": { "color": "#32343D" }, "mode": "markers", "name": "V2", "type": "scatter", "x": [ 34, 34, 34, 6, 6, 6, 8, 8, 8, 8, 34, 34, 34, 34, 6, 6, 6, 6, 6, 8, 8, 14, 34, 8, 103, 34, 7, 7, 6, 1, 2, 20, 12, 0, 2, 0, 6, 7, 7, 7, 7, 8, 8, 13, 7, 7, 7, 7, 7, 7, 7, 65, 13, 8, 7, 7, 8, 8, 7, 7, 46, 46, 46, 10, 6, 7, 7, 7, 7, 10, 10, 7, 12, 6, 0, 0, 1, 1, 111, 111, 14, 14, 14, 3, 3, 7, 7, 7, 14, 14, 0, 0, 1, 1, 72, 7, 7, 8, 8, 8, 8, 1, 13, 70, 34, 72, 1, 1, 7, 46, 7, 7, 15, 3, 7, 1, 1, 3, 0, 7, 8, 8, 6, 12, 3, 7, 7, 7, 7, 16, 16, 0, 1, 30, 2, 8, 2, 2, 2, 8, 8, 8, 2, 2, 2, 0, 0, 8, 8, 6, 7, 8, 1, 7, 7, 13, 13, 68, 68, 
6, 6, 70, 70, 8, 8, 8, 0, 13, 13, 7, 13, 3, 3, 7, 1, 1, 2, 2, 7, 7, 140, 7, 7, 7, 7, 7, 140, 46, 46, 46, 46, 7, 24, 8, 8, 4, 7, 0, 0, 0, 1, 7, 7, 7, 13, 13, 8, 8, 12, 12, 1, 1, 1, 2, 2, 7, 13, 7, 7, 7, 7, 11, 40, 40, 7, 7, 7, 20, 7, 7, 7, 7, 7, 7, 3, 3, 3, 10, 10 ], "y": [ 25.432496208391797, 26.40062187124731, 32.627882895328185, 16.473483734974884, 16.473483734974884, 22.048528999300473, 21.952491645101215, 19.608376416791778, 27.70559528169099, 22.896812289511804, 22.259833967577137, 19.79947735007302, 23.89937161554255, 23.89937161554255, 13.599029368558334, 13.599029368558334, 11.895393364291047, 11.895393364291047, 14.004356953877243, 17.61045749866248, 17.59108250111942, 16.530645612803767, 24.616939161944696, 15.973218797715552, 30.86054191171216, 25.34997846133653, 14.947949239530876, 17.43232787053986, 6.545235535293089, 5.328150264736912, 6.342930983263378, 5.990640984297092, 5.9339603247654615, 5.6171015655565055, 5.441653230243495, 5.113779260124896, 5.853253723382477, 18.52326653659808, 17.71670852646412, 17.71670852646412, 17.71670852646412, 15.827429986338219, 15.827429986338219, 3.9064248386004103, 17.943646116016044, 21.004986176864318, 21.004986176864318, 21.004986176864318, 21.004986176864318, 21.43364681812902, 19.99112025734428, 22.92692360397488, 8.299243034538407, 21.629391632905307, 21.639166938124102, 21.639166938124102, 24.62473094217658, 24.62473094217658, 21.01247015664927, 21.01247015664927, 26.945095621201215, 26.945095621201215, 21.77807030737311, 23.324426341826598, 9.278951588465933, 6.688919512253983, 7.122178211045956, 13.155462341901265, 13.427164944166408, 15.62055648816441, 15.061345512111492, 17.620946178257416, 3.6692423765809075, 5.392359658909203, 5.137017087672389, 5.564869039793773, 9.118435120286238, 9.006021162921042, 29.556738934879004, 29.224836684325613, 20.224674221574386, 21.02330687787111, 21.02330687787111, 11.289834319444326, 12.325165307166374, 15.219034679073014, 16.57617293158245, 16.57617293158245, 
12.42275797734545, 14.823498043433531, 7.062282757592702, 6.385370764204122, 10.319571767384213, 13.91535071246266, 42.48630818371823, 23.66081168731019, 24.764482344118388, 8.778934275693588, 15.144990895303266, 12.013002201474537, 14.195345928021323, 4.698672676403544, 15.152356507248276, 22.321913398423053, 23.757346989413076, 29.555658111270684, 6.470278440392426, 6.470278440392426, 10.729523947697116, 24.133285606044453, 20.63795117818425, 20.63795117818425, 12.21390414514177, 6.536559509561811, 8.205321558755733, 3.962215291979836, 3.9712257798358466, 4.262012960471914, 3.4568911318914637, 3.7073934241241133, 18.302168187437868, 18.302168187437868, 6.893114892840058, 6.383023820314098, 5.448600841258123, 5.571831773906653, 8.101217266693423, 14.772804383415462, 14.772804383415462, 7.365628857118445, 10.139557822520734, 3.9015688926785734, 5.251513100569197, 6.201345407060512, 7.776435284352048, 17.40405754216496, 7.271607343217184, 7.271607343217184, 7.221453677142921, 15.279173051641893, 15.279173051641893, 12.840501007747184, 6.936700148776539, 6.936700148776539, 8.06417710652117, 6.5362026677856875, 6.5362026677856875, 18.11968845841446, 18.11968845841446, 7.745056411205006, 16.738445769074524, 8.58448433464234, 10.503331167244141, 10.784447380313544, 10.784447380313544, 11.00375415839273, 10.989657280367652, 12.733816621748955, 18.246717437525763, 9.39662427983811, 8.718240778815948, 26.36547101753479, 36.18340237700426, 13.412859085784765, 13.412859085784765, 23.908735693936837, 5.251433606790305, 18.136815704093205, 18.136815704093205, 14.152729107559473, 32.6696634675738, 25.4878179882604, 25.967732638041607, 28.749320369267338, 5.523965728106273, 7.057673794439714, 15.446174740490832, 15.446174740490832, 14.152421858603484, 14.152421858603484, 25.550232388991247, 13.57156168964474, 18.444951008649024, 19.11180572554635, 14.499830223176792, 14.166820006751633, 25.489173938868174, 24.351944449795166, 24.351944449795166, 19.233464672927635, 
19.233464672927635, 17.592856841556475, 19.306153763784838, 22.043842751078802, 14.86751004330882, 15.389468073057905, 5.981676871872839, 6.510807087761722, 5.479590375205572, 5.8142234694303765, 4.980187627399172, 22.59157975164075, 22.564204090028692, 21.52253406020508, 13.807969316891429, 12.835458185676481, 22.728276100836258, 22.728276100836258, 13.860193776162257, 16.224300881101808, 5.216126538850885, 8.628186472564332, 9.256757722537667, 7.2632507075969786, 12.331442611850518, 14.23122081884668, 12.119323740918015, 21.327183071003265, 21.21648409288127, 21.21648409288127, 9.481131901453509, 13.776373885273868, 11.325775761393743, 10.408978081192403, 5.097916019413136, 5.015868974143408, 5.015868974143408, 4.938885587628826, 6.711834699417412, 8.170425368064842, 8.170425368064842, 5.461109359493478, 3.962783773521173, 6.330844324541082, 5.432973566930115, 4.748118992215374, 5.66393850876747, 19.628255331894646, 16.76685804216143 ] } ], "layout": { "height": 350, "legend": { "title": { "text": "Version and Period" } }, "template": { "data": { "bar": [ { "error_x": { "color": "#2a3f5f" }, "error_y": { "color": "#2a3f5f" }, "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "bar" } ], "barpolar": [ { "marker": { "line": { "color": "#E5ECF6", "width": 0.5 }, "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "barpolar" } ], "carpet": [ { "aaxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "baxis": { "endlinecolor": "#2a3f5f", "gridcolor": "white", "linecolor": "white", "minorgridcolor": "white", "startlinecolor": "#2a3f5f" }, "type": "carpet" } ], "choropleth": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "choropleth" } ], "contour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], 
[ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "contour" } ], "contourcarpet": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "contourcarpet" } ], "heatmap": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmap" } ], "heatmapgl": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "heatmapgl" } ], "histogram": [ { "marker": { "pattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 } }, "type": "histogram" } ], "histogram2d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2d" } ], "histogram2dcontour": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 
0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "histogram2dcontour" } ], "mesh3d": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "type": "mesh3d" } ], "parcoords": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "parcoords" } ], "pie": [ { "automargin": true, "type": "pie" } ], "scatter": [ { "fillpattern": { "fillmode": "overlay", "size": 10, "solidity": 0.2 }, "type": "scatter" } ], "scatter3d": [ { "line": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatter3d" } ], "scattercarpet": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattercarpet" } ], "scattergeo": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergeo" } ], "scattergl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattergl" } ], "scattermapbox": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scattermapbox" } ], "scatterpolar": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolar" } ], "scatterpolargl": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterpolargl" } ], "scatterternary": [ { "marker": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "type": "scatterternary" } ], "surface": [ { "colorbar": { "outlinewidth": 0, "ticks": "" }, "colorscale": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "type": "surface" } ], "table": [ { "cells": { "fill": { "color": "#EBF0F8" }, "line": { 
"color": "white" } }, "header": { "fill": { "color": "#C8D4E3" }, "line": { "color": "white" } }, "type": "table" } ] }, "layout": { "annotationdefaults": { "arrowcolor": "#2a3f5f", "arrowhead": 0, "arrowwidth": 1 }, "autotypenumbers": "strict", "coloraxis": { "colorbar": { "outlinewidth": 0, "ticks": "" } }, "colorscale": { "diverging": [ [ 0, "#8e0152" ], [ 0.1, "#c51b7d" ], [ 0.2, "#de77ae" ], [ 0.3, "#f1b6da" ], [ 0.4, "#fde0ef" ], [ 0.5, "#f7f7f7" ], [ 0.6, "#e6f5d0" ], [ 0.7, "#b8e186" ], [ 0.8, "#7fbc41" ], [ 0.9, "#4d9221" ], [ 1, "#276419" ] ], "sequential": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ], "sequentialminus": [ [ 0, "#0d0887" ], [ 0.1111111111111111, "#46039f" ], [ 0.2222222222222222, "#7201a8" ], [ 0.3333333333333333, "#9c179e" ], [ 0.4444444444444444, "#bd3786" ], [ 0.5555555555555556, "#d8576b" ], [ 0.6666666666666666, "#ed7953" ], [ 0.7777777777777778, "#fb9f3a" ], [ 0.8888888888888888, "#fdca26" ], [ 1, "#f0f921" ] ] }, "colorway": [ "#636efa", "#EF553B", "#00cc96", "#ab63fa", "#FFA15A", "#19d3f3", "#FF6692", "#B6E880", "#FF97FF", "#FECB52" ], "font": { "color": "#2a3f5f" }, "geo": { "bgcolor": "white", "lakecolor": "white", "landcolor": "#E5ECF6", "showlakes": true, "showland": true, "subunitcolor": "white" }, "hoverlabel": { "align": "left" }, "hovermode": "closest", "mapbox": { "style": "light" }, "paper_bgcolor": "white", "plot_bgcolor": "#E5ECF6", "polar": { "angularaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "radialaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "scene": { "xaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, 
"ticks": "", "zerolinecolor": "white" }, "yaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" }, "zaxis": { "backgroundcolor": "#E5ECF6", "gridcolor": "white", "gridwidth": 2, "linecolor": "white", "showbackground": true, "ticks": "", "zerolinecolor": "white" } }, "shapedefaults": { "line": { "color": "#2a3f5f" } }, "ternary": { "aaxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "baxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" }, "bgcolor": "#E5ECF6", "caxis": { "gridcolor": "white", "linecolor": "white", "ticks": "" } }, "title": { "x": 0.05 }, "xaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 }, "yaxis": { "automargin": true, "gridcolor": "white", "linecolor": "white", "ticks": "", "title": { "standoff": 15 }, "zerolinecolor": "white", "zerolinewidth": 2 } } }, "title": { "text": "Model Complexity vs Performance" }, "width": 600, "xaxis": { "range": [ -10, 150 ], "title": { "text": "Model Complexity (#Params in B)" }, "type": "linear" }, "yaxis": { "range": [ -10, 100 ], "title": { "text": "Mean Performance Score" }, "type": "linear" } } }, "text/html": [ "
" ] }, "metadata": {}, "output_type": "display_data" } ], "source": [ "# Create a new figure\n", "fig = go.Figure()\n", "\n", "# Add traces for each version and period\n", "fig.add_trace(go.Scatter(\n", " x=merged_data['#Params (B)'][merged_data['date'] < threshold_1],\n", " y=merged_data['mean_score_v1'][merged_data['date'] < threshold_1],\n", " mode='markers',\n", " name='V1 - before Oct 2023',\n", " marker=dict(color=DARK_ORANGE)\n", "))\n", "fig.add_trace(go.Scatter(\n", " x=merged_data['#Params (B)'][(threshold_1 <= merged_data['date']) & (merged_data['date'] < threshold_2)],\n", " y=merged_data['mean_score_v1'][(threshold_1 <= merged_data['date']) & (merged_data['date'] < threshold_2)],\n", " mode='markers',\n", " name='V1 - Oct 2023 to Dec 2023',\n", " marker=dict(color=ORANGE)\n", "))\n", "fig.add_trace(go.Scatter(\n", " x=merged_data['#Params (B)'][merged_data['date'] >= threshold_2],\n", " y=merged_data['mean_score_v1'][merged_data['date'] >= threshold_2],\n", " mode='markers',\n", " name='V1 - Dec 2023 and after',\n", " marker=dict(color=YELLOW)\n", "))\n", "\n", "# Version 2: All data points\n", "fig.add_trace(go.Scatter(\n", " x=merged_data['#Params (B)'],\n", " y=merged_data['mean_score_v2'],\n", " mode='markers',\n", " name='V2',\n", " marker=dict(color=BLACK)\n", "))\n", "\n", "# Update axes and layout\n", "fig.update_layout(\n", " title=\"Model Complexity vs Performance\",\n", " xaxis_title=\"Model Complexity (#Params in B)\",\n", " yaxis_title=\"Mean Performance Score\",\n", " legend_title=\"Version and Period\",\n", " yaxis=dict(range=[-10, 100]),\n", " xaxis=dict(range=[-10, 150]),\n", " width=600,\n", " height=350\n", ")\n", "\n", "with open(\"./plots/model_size_vs_perf.html\", \"w\") as f:\n", " f.write(fig.to_html(full_html=False))\n", "\n", "# Show the figure\n", "fig.show()" ] }, { "cell_type": "code", "execution_count": null, "id": "62d02821-a296-4fba-996c-0934d414519b", "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", 
"execution_count": null, "id": "12e276ba-5333-44a6-92b8-9c3f01e61eff", "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "Leaderboard EDA", "language": "python", "name": "leaderboard_eda" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.1" } }, "nbformat": 4, "nbformat_minor": 5 }