{
"cells": [
{
"cell_type": "markdown",
"id": "d6ea681d-9793-4d55-9bf1-c6e3ec034c6e",
"metadata": {},
"source": [
"# Load Libraries"
]
},
{
"cell_type": "code",
"execution_count": 1,
"id": "9a85640e-fa8b-4b7c-9574-008d0ab22664",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
" \n",
" "
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import os\n",
"\n",
"import pandas as pd\n",
"import numpy as np\n",
"\n",
"import huggingface_hub\n",
"import datasets\n",
"\n",
"import matplotlib.pyplot as plt\n",
"import seaborn as sns\n",
"import matplotlib.patches as mpatches\n",
"\n",
"from datetime import datetime, timezone\n",
"\n",
"import plotly\n",
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"plotly.offline.init_notebook_mode(connected=True)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "cfb984b0-697f-45b5-b785-72be578b0ea0",
"metadata": {},
"outputs": [],
"source": [
"# Create the directory for plots.\n",
"# exist_ok=True makes the cell safe to re-run and removes the\n",
"# check-then-create race of a separate os.path.exists() test.\n",
"directory = \"./plots\"\n",
"os.makedirs(directory, exist_ok=True)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "ca02c49a-c351-4183-b5ce-e6825296ef2d",
"metadata": {},
"outputs": [],
"source": [
"# Do not truncate columns when displaying wide DataFrames\n",
"pd.options.display.max_columns = None"
]
},
{
"cell_type": "code",
"execution_count": 4,
"id": "8c5b803d-74f3-4206-be76-e08f23e1d05d",
"metadata": {},
"outputs": [],
"source": [
"# Colour palette: named constants first, then the ordered list built from them\n",
"ORANGE = \"#FF9D00\"\n",
"YELLOW = \"#FFD21E\"\n",
"RED = \"#FF323D\"\n",
"BLACK = \"#32343D\"\n",
"GREEN = \"#297373\"\n",
"DARK_ORANGE = \"#CD4631\"\n",
"\n",
"colors = [ORANGE, YELLOW, RED, BLACK, GREEN, DARK_ORANGE]"
]
},
{
"cell_type": "markdown",
"id": "b564cc3c-78c4-42bc-b415-aca228e8b87e",
"metadata": {},
"source": [
"# Data Loading and Preprocessing"
]
},
{
"cell_type": "markdown",
"id": "80d0739a-b3a3-4331-b47b-2a99029affed",
"metadata": {},
"source": [
"## Load V1"
]
},
{
"cell_type": "code",
"execution_count": 5,
"id": "63c5d448-6473-46df-b5a7-abbb479bea9b",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7260, 26)"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Load the v1 leaderboard `contents` dataset (train split) via `datasets`\n",
"# and convert it to a pandas DataFrame\n",
"ds = datasets.load_dataset(path=\"open-llm-leaderboard/contents\", split=\"train\")\n",
"data_v1 = ds.to_pandas()\n",
"data_v1.shape"
]
},
{
"cell_type": "code",
"execution_count": 6,
"id": "0fdce37e-681a-4b5e-962d-c3e7156d64ab",
"metadata": {},
"outputs": [],
"source": [
"# Drop contaminated models\n",
"# There are two of them, identified by \"contaminated\" in their eval_name.\n",
"# na=False: rows with a missing eval_name are kept rather than yielding a NaN\n",
"# mask entry, which `~` cannot invert.\n",
"# regex=False: plain substring match, no regex interpretation of the pattern.\n",
"data_v1 = data_v1[\n",
"    ~data_v1[\"eval_name\"].str.contains(\"contaminated\", regex=False, na=False)\n",
"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "9f85eaba-5053-4a81-9cf8-1da450197ada",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"(7258, 26)"
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"data_v1.shape"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "f5fea314-f133-4d95-8ef8-c2491a10eaaa",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" eval_name | \n",
" Precision | \n",
" Type | \n",
" T | \n",
" Weight type | \n",
" Architecture | \n",
" Model | \n",
" fullname | \n",
" Model sha | \n",
" Average ⬆️ | \n",
" Hub License | \n",
" Hub ❤️ | \n",
" #Params (B) | \n",
" Available on the hub | \n",
" Merged | \n",
" MoE | \n",
" Flagged | \n",
" date | \n",
" Chat Template | \n",
" ARC | \n",
" HellaSwag | \n",
" MMLU | \n",
" TruthfulQA | \n",
" Winogrande | \n",
" GSM8K | \n",
" Maintainers Choice | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0-hero_Matter-0.1-7B_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B | \n",
" 035c8193ce71be90be7d90098669afb9164ec6cb | \n",
" 63.391248 | \n",
" apache-2.0 | \n",
" 0 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-21T06:05:50Z | \n",
" False | \n",
" 61.774744 | \n",
" 82.135033 | \n",
" 62.423731 | \n",
" 42.439513 | \n",
" 77.821626 | \n",
" 53.752843 | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B-DPO-preview | \n",
" 78040e4754051df49dd907cf1fd46a6b8a6cc30f | \n",
" 64.870290 | \n",
" apache-2.0 | \n",
" 0 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-23T04:13:58Z | \n",
" False | \n",
" 62.713311 | \n",
" 82.991436 | \n",
" 62.700299 | \n",
" 45.790101 | \n",
" 78.847672 | \n",
" 56.178923 | \n",
" False | \n",
"
\n",
" \n",
" 2 | \n",
" 0-hero_Matter-0.1-7B-boost_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B-boost | \n",
" ba56089eed1211f02e8d0ff47901e77b0cd48f83 | \n",
" 63.223517 | \n",
" apache-2.0 | \n",
" 0 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-21T06:05:38Z | \n",
" False | \n",
" 62.627986 | \n",
" 81.507668 | \n",
" 61.967618 | \n",
" 54.702404 | \n",
" 75.927388 | \n",
" 42.608036 | \n",
" False | \n",
"
\n",
" \n",
" 3 | \n",
" 0-hero_Matter-0.1-7B-boost-DPO_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B-boost-DPO | \n",
" 5bee9978fcf2188f1070b67f6d94be344fdd99c0 | \n",
" 65.985858 | \n",
" | \n",
" 0 | \n",
" 7 | \n",
" False | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-22T15:02:21Z | \n",
" False | \n",
" 65.017065 | \n",
" 83.081060 | \n",
" 61.873805 | \n",
" 60.293632 | \n",
" 75.611681 | \n",
" 50.037908 | \n",
" False | \n",
"
\n",
" \n",
" 4 | \n",
" 0-hero_Matter-0.1-7B-boost-DPO-preview_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B-boost-DPO-preview | \n",
" d390fb35a781129efd26d53f7ecdb513c0c3da27 | \n",
" 65.767435 | \n",
" apache-2.0 | \n",
" 2 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-22T07:40:42Z | \n",
" False | \n",
" 64.590444 | \n",
" 82.871938 | \n",
" 62.017625 | \n",
" 58.859162 | \n",
" 75.848461 | \n",
" 50.416983 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" eval_name Precision \\\n",
"0 0-hero_Matter-0.1-7B_bfloat16 bfloat16 \n",
"1 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 bfloat16 \n",
"2 0-hero_Matter-0.1-7B-boost_bfloat16 bfloat16 \n",
"3 0-hero_Matter-0.1-7B-boost-DPO_bfloat16 bfloat16 \n",
"4 0-hero_Matter-0.1-7B-boost-DPO-preview_bfloat16 bfloat16 \n",
"\n",
" Type T Weight type Architecture \\\n",
"0 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"2 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"3 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"4 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"\n",
" Model \\\n",
"0 \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" eval_name | \n",
" Precision | \n",
" Type | \n",
" T | \n",
" Weight type | \n",
" Architecture | \n",
" Model | \n",
" fullname | \n",
" Model sha | \n",
" Average ⬆️ | \n",
" Hub License | \n",
" Hub ❤️ | \n",
" #Params (B) | \n",
" Available on the hub | \n",
" Merged | \n",
" MoE | \n",
" Flagged | \n",
" date | \n",
" Chat Template | \n",
" IFEval Raw | \n",
" IFEval | \n",
" BBH Raw | \n",
" BBH | \n",
" MATH Lvl 5 Raw | \n",
" MATH Lvl 5 | \n",
" GPQA Raw | \n",
" GPQA | \n",
" MUSR Raw | \n",
" MUSR | \n",
" MMLU-PRO Raw | \n",
" MMLU-PRO | \n",
" Maintainer's Highlight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" upstage_SOLAR-10.7B-v1.0_float16 | \n",
" float16 | \n",
" 🟢 pretrained | \n",
" 🟢 | \n",
" Original | \n",
" LlamaForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" upstage/SOLAR-10.7B-v1.0 | \n",
" a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n",
" 17.072003 | \n",
" apache-2.0 | \n",
" 248 | \n",
" 10 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2024-06-12T12:27:42Z | \n",
" False | \n",
" 0.242126 | \n",
" 24.212645 | \n",
" 0.509387 | \n",
" 29.789358 | \n",
" 0.021148 | \n",
" 2.114804 | \n",
" 0.281040 | \n",
" 4.138702 | \n",
" 0.437156 | \n",
" 13.677865 | \n",
" 0.340010 | \n",
" 26.667775 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n",
" float16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" LlamaForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" upstage/SOLAR-10.7B-Instruct-v1.0 | \n",
" c08c25ed66414a878fe0401a3596d536c083606c | \n",
" 19.961989 | \n",
" cc-by-nc-4.0 | \n",
" 592 | \n",
" 10 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2024-06-12T12:06:58Z | \n",
" True | \n",
" 0.473661 | \n",
" 47.366100 | \n",
" 0.516249 | \n",
" 31.872402 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.308725 | \n",
" 7.829978 | \n",
" 0.389937 | \n",
" 6.942188 | \n",
" 0.313830 | \n",
" 23.758865 | \n",
" True | \n",
"
\n",
" \n",
" 2 | \n",
" togethercomputer_RedPajama-INCITE-Instruct-3B-... | \n",
" float16 | \n",
" 🔶 fine-tuned on domain-specific datasets | \n",
" 🔶 | \n",
" Original | \n",
" GPTNeoXForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" togethercomputer/RedPajama-INCITE-Instruct-3B-v1 | \n",
" 0c66778ee09a036886741707733620b91057909a | \n",
" 5.877290 | \n",
" apache-2.0 | \n",
" 91 | \n",
" 3 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2024-06-12T12:07:46Z | \n",
" False | \n",
" 0.212426 | \n",
" 21.242636 | \n",
" 0.314602 | \n",
" 4.510786 | \n",
" 0.006042 | \n",
" 0.604230 | \n",
" 0.247483 | \n",
" 0.000000 | \n",
" 0.388604 | \n",
" 6.408854 | \n",
" 0.110954 | \n",
" 1.217125 | \n",
" True | \n",
"
\n",
" \n",
" 3 | \n",
" togethercomputer_RedPajama-INCITE-Chat-3B-v1_f... | \n",
" float16 | \n",
" 🔶 fine-tuned on domain-specific datasets | \n",
" 🔶 | \n",
" Original | \n",
" GPTNeoXForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" togethercomputer/RedPajama-INCITE-Chat-3B-v1 | \n",
" f0e0995eba801096ed04cb87931d96a8316871af | \n",
" 4.950649 | \n",
" apache-2.0 | \n",
" 147 | \n",
" 3 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2024-06-13T17:58:59Z | \n",
" False | \n",
" 0.165215 | \n",
" 16.521496 | \n",
" 0.321669 | \n",
" 5.164728 | \n",
" 0.003021 | \n",
" 0.302115 | \n",
" 0.244128 | \n",
" 0.000000 | \n",
" 0.368448 | \n",
" 5.089323 | \n",
" 0.112699 | \n",
" 1.411052 | \n",
" True | \n",
"
\n",
" \n",
" 4 | \n",
" togethercomputer_RedPajama-INCITE-Base-3B-v1_f... | \n",
" float16 | \n",
" 🟢 pretrained | \n",
" 🟢 | \n",
" Original | \n",
" GPTNeoXForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" togethercomputer/RedPajama-INCITE-Base-3B-v1 | \n",
" 094fbdd0c911feb485ce55de1952ab2e75277e1e | \n",
" 5.645099 | \n",
" apache-2.0 | \n",
" 90 | \n",
" 3 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2024-06-12T12:28:23Z | \n",
" False | \n",
" 0.229363 | \n",
" 22.936254 | \n",
" 0.306040 | \n",
" 3.518608 | \n",
" 0.009063 | \n",
" 0.906344 | \n",
" 0.243289 | \n",
" 0.000000 | \n",
" 0.373875 | \n",
" 4.001042 | \n",
" 0.111120 | \n",
" 1.235594 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" eval_name Precision \\\n",
"0 upstage_SOLAR-10.7B-v1.0_float16 float16 \n",
"1 upstage_SOLAR-10.7B-Instruct-v1.0_float16 float16 \n",
"2 togethercomputer_RedPajama-INCITE-Instruct-3B-... float16 \n",
"3 togethercomputer_RedPajama-INCITE-Chat-3B-v1_f... float16 \n",
"4 togethercomputer_RedPajama-INCITE-Base-3B-v1_f... float16 \n",
"\n",
" Type T Weight type \\\n",
"0 🟢 pretrained 🟢 Original \n",
"1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original \n",
"2 🔶 fine-tuned on domain-specific datasets 🔶 Original \n",
"3 🔶 fine-tuned on domain-specific datasets 🔶 Original \n",
"4 🟢 pretrained 🟢 Original \n",
"\n",
" Architecture Model \\\n",
"0 LlamaForCausalLM \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" eval_name | \n",
" Precision | \n",
" Type | \n",
" T | \n",
" Weight type | \n",
" Architecture | \n",
" Model | \n",
" fullname | \n",
" Model sha | \n",
" Average ⬆️ | \n",
" Hub License | \n",
" Hub ❤️ | \n",
" #Params (B) | \n",
" Available on the hub | \n",
" Merged | \n",
" MoE | \n",
" Flagged | \n",
" date | \n",
" Chat Template | \n",
" ARC | \n",
" HellaSwag | \n",
" MMLU | \n",
" TruthfulQA | \n",
" Winogrande | \n",
" GSM8K | \n",
" Maintainers Choice | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 0-hero_Matter-0.1-7B_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B | \n",
" 035c8193ce71be90be7d90098669afb9164ec6cb | \n",
" 63.391248 | \n",
" apache-2.0 | \n",
" 0 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-20 05:57:38+00:00 | \n",
" False | \n",
" 61.774744 | \n",
" 82.135033 | \n",
" 62.423731 | \n",
" 42.439513 | \n",
" 77.821626 | \n",
" 53.752843 | \n",
" False | \n",
"
\n",
" \n",
" 1 | \n",
" 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 | \n",
" bfloat16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" MistralForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" 0-hero/Matter-0.1-7B-DPO-preview | \n",
" 78040e4754051df49dd907cf1fd46a6b8a6cc30f | \n",
" 64.870290 | \n",
" apache-2.0 | \n",
" 0 | \n",
" 7 | \n",
" True | \n",
" True | \n",
" True | \n",
" True | \n",
" 2024-03-19 11:27:26+00:00 | \n",
" False | \n",
" 62.713311 | \n",
" 82.991436 | \n",
" 62.700299 | \n",
" 45.790101 | \n",
" 78.847672 | \n",
" 56.178923 | \n",
" False | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" eval_name Precision \\\n",
"0 0-hero_Matter-0.1-7B_bfloat16 bfloat16 \n",
"1 0-hero_Matter-0.1-7B-DPO-preview_bfloat16 bfloat16 \n",
"\n",
" Type T Weight type Architecture \\\n",
"0 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original MistralForCausalLM \n",
"\n",
" Model \\\n",
"0 \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" eval_name | \n",
" Precision | \n",
" Type | \n",
" T | \n",
" Weight type | \n",
" Architecture | \n",
" Model | \n",
" fullname | \n",
" Model sha | \n",
" Average ⬆️ | \n",
" Hub License | \n",
" Hub ❤️ | \n",
" #Params (B) | \n",
" Available on the hub | \n",
" Merged | \n",
" MoE | \n",
" Flagged | \n",
" date | \n",
" Chat Template | \n",
" IFEval Raw | \n",
" IFEval | \n",
" BBH Raw | \n",
" BBH | \n",
" MATH Lvl 5 Raw | \n",
" MATH Lvl 5 | \n",
" GPQA Raw | \n",
" GPQA | \n",
" MUSR Raw | \n",
" MUSR | \n",
" MMLU-PRO Raw | \n",
" MMLU-PRO | \n",
" Maintainer's Highlight | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" upstage_SOLAR-10.7B-v1.0_float16 | \n",
" float16 | \n",
" 🟢 pretrained | \n",
" 🟢 | \n",
" Original | \n",
" LlamaForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" upstage/SOLAR-10.7B-v1.0 | \n",
" a45090b8e56bdc2b8e32e46b3cd782fc0bea1fa5 | \n",
" 17.072003 | \n",
" apache-2.0 | \n",
" 248 | \n",
" 10 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2023-12-12 14:57:41+00:00 | \n",
" False | \n",
" 0.242126 | \n",
" 24.212645 | \n",
" 0.509387 | \n",
" 29.789358 | \n",
" 0.021148 | \n",
" 2.114804 | \n",
" 0.281040 | \n",
" 4.138702 | \n",
" 0.437156 | \n",
" 13.677865 | \n",
" 0.34001 | \n",
" 26.667775 | \n",
" True | \n",
"
\n",
" \n",
" 1 | \n",
" upstage_SOLAR-10.7B-Instruct-v1.0_float16 | \n",
" float16 | \n",
" 💬 chat models (RLHF, DPO, IFT, ...) | \n",
" 💬 | \n",
" Original | \n",
" LlamaForCausalLM | \n",
" <a target=\"_blank\" href=\"https://huggingface.c... | \n",
" upstage/SOLAR-10.7B-Instruct-v1.0 | \n",
" c08c25ed66414a878fe0401a3596d536c083606c | \n",
" 19.961989 | \n",
" cc-by-nc-4.0 | \n",
" 592 | \n",
" 10 | \n",
" True | \n",
" True | \n",
" True | \n",
" False | \n",
" 2023-12-12 12:39:22+00:00 | \n",
" True | \n",
" 0.473661 | \n",
" 47.366100 | \n",
" 0.516249 | \n",
" 31.872402 | \n",
" 0.000000 | \n",
" 0.000000 | \n",
" 0.308725 | \n",
" 7.829978 | \n",
" 0.389937 | \n",
" 6.942188 | \n",
" 0.31383 | \n",
" 23.758865 | \n",
" True | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" eval_name Precision \\\n",
"0 upstage_SOLAR-10.7B-v1.0_float16 float16 \n",
"1 upstage_SOLAR-10.7B-Instruct-v1.0_float16 float16 \n",
"\n",
" Type T Weight type Architecture \\\n",
"0 🟢 pretrained 🟢 Original LlamaForCausalLM \n",
"1 💬 chat models (RLHF, DPO, IFT, ...) 💬 Original LlamaForCausalLM \n",
"\n",
" Model \\\n",
"0 \n",
"\n",
"\n",
" \n",
" \n",
" | \n",
" fullname | \n",
" MMLU | \n",
" GSM8K | \n",
" MMLU-PRO | \n",
" MATH Lvl 5 | \n",
" GPQA | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01-ai/Yi-1.5-34B | \n",
" 77.995719 | \n",
" 73.237301 | \n",
" 40.732122 | \n",
" 14.048338 | \n",
" 15.436242 | \n",
"
\n",
" \n",
" 1 | \n",
" 01-ai/Yi-1.5-34B-32K | \n",
" 78.153032 | \n",
" 0.000000 | \n",
" 41.212323 | \n",
" 13.444109 | \n",
" 15.100671 | \n",
"
\n",
" \n",
" 2 | \n",
" 01-ai/Yi-1.5-34B-Chat | \n",
" 77.082840 | \n",
" 71.645186 | \n",
" 39.116061 | \n",
" 23.338369 | \n",
" 15.324385 | \n",
"
\n",
" \n",
" 3 | \n",
" 01-ai/Yi-1.5-6B | \n",
" 64.726895 | \n",
" 50.341168 | \n",
" 23.343307 | \n",
" 5.664653 | \n",
" 8.277405 | \n",
"
\n",
" \n",
" 4 | \n",
" 01-ai/Yi-1.5-6B | \n",
" 65.002720 | \n",
" 49.810462 | \n",
" 23.343307 | \n",
" 5.664653 | \n",
" 8.277405 | \n",
"
\n",
" \n",
"
\n",
""
],
"text/plain": [
" fullname MMLU GSM8K MMLU-PRO MATH Lvl 5 \\\n",
"0 01-ai/Yi-1.5-34B 77.995719 73.237301 40.732122 14.048338 \n",
"1 01-ai/Yi-1.5-34B-32K 78.153032 0.000000 41.212323 13.444109 \n",
"2 01-ai/Yi-1.5-34B-Chat 77.082840 71.645186 39.116061 23.338369 \n",
"3 01-ai/Yi-1.5-6B 64.726895 50.341168 23.343307 5.664653 \n",
"4 01-ai/Yi-1.5-6B 65.002720 49.810462 23.343307 5.664653 \n",
"\n",
" GPQA \n",
"0 15.436242 \n",
"1 15.100671 \n",
"2 15.324385 \n",
"3 8.277405 \n",
"4 8.277405 "
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_data.head()"
]
},
{
"cell_type": "markdown",
"id": "c2f7240f-8a99-4463-bd63-ea32fcc9e4fe",
"metadata": {},
"source": [
"- MMLU and MMLU-PRO are well correlated: overall, a model with a high MMLU score also has a high MMLU-PRO score\n",
"- For MATH vs GSM8K, we identify 3 groups:\n",
" - \"High\" MATH score, very low GSM8K (v1) score (2 outliers): possible overfitting on MATH or, more likely, these are among the models that had EOS-token issues on GSM8K\n",
" - Correlation between the MATH (v2) and GSM8K (v1) scores (most models)\n",
" - Low MATH score, high GSM8K score: likely overfitting on GSM8K"
]
},
{
"cell_type": "code",
"execution_count": 22,
"id": "3a0e4442-cb52-4062-85bb-e6facb171ae3",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"marker": {
"color": "#FFD21E"
},
"mode": "markers",
"name": "MMLU-PRO=f(MMLU)",
"type": "scatter",
"x": [
77.9957192730956,
78.15303198667405,
77.08283992203879,
64.72689535017567,
65.00272042688377,
64.24095801586486,
70.73451954519469,
70.97203740649417,
71.01349610420871,
70.95946328062593,
76.35156727391681,
75.56180045111563,
74.86694971720243,
74.89765262226099,
63.84606128348609,
64.10709139827398,
64.64867451074461,
64.64930135205712,
70.06330822380845,
70.34013960436471,
65.86320847049085,
56.33899126009816,
75.72824616707308,
74.02494713760785,
68.19530224441294,
59.762938924439105,
60.24359775384097,
26.783999358640752,
24.821655499137048,
26.45397884474629,
25.002595295235746,
26.75606660161476,
24.950593393502686,
27.245507410753593,
61.3896585162216,
61.061389920377685,
61.06911135758283,
60.70490193884751,
58.975890703009014,
60.682678692224144,
27.03799242337422,
62.26105246211977,
60.791623329684086,
62.374203705324284,
62.44460409845261,
62.11841694257893,
63.550105055987174,
63.07073351412039,
44.87880425591219,
64.79850156958106,
62.20609448217756,
62.1246057414326,
66.51191133673278,
66.55303373586858,
63.356659125507434,
63.245294478077106,
72.27737838264045,
72.17054547051251,
72.20739527153786,
66.29541259257003,
48.335234653692204,
63.02324877855956,
62.96478981633615,
62.24441734951871,
26.193572189883703,
25.730836314001447,
39.35334081217581,
33.81983654590648,
46.70838530175783,
45.86651800528403,
80.19575908781562,
78.03946337687869,
69.36026470831474,
68.51540963742676,
68.57472555756478,
56.523519639933376,
55.55462512266324,
61.9700216814056,
61.64766854906246,
61.69764020685532,
62.54071682523817,
60.970670017639726,
45.17037929235293,
57.25919892204937,
70.4515868286805,
47.49331993287192,
52.577029472650906,
61.74163205184785,
76.66476969478123,
77.15452603998116,
26.234231797093017,
26.306692338086823,
71.38894203721195,
64.70438661462165,
64.66799008434704,
51.34583865906561,
38.64860206508296,
41.21331864479665,
26.70214825620786,
27.47846514752041,
26.592509199327857,
24.21541212062042,
26.246222375725694,
61.03678038351008,
61.04002830421494,
25.916845651522607,
24.696321332710696,
25.183937022975865,
51.74656618684998,
51.77213745416527,
46.306113561052754,
24.963045825616692,
26.658730035291097,
60.91642429212937,
41.679249280698635,
41.77314644002599,
37.648186828291585,
64.5598699571695,
66.02745725247713,
53.5170310018511,
34.611140710007234,
34.38230443417011,
40.86948703219191,
25.833717804860335,
25.404197294686202,
61.917687321536896,
62.84956375246854,
64.90605869704184,
51.037449016607304,
50.8231434865938,
54.63627087850933,
55.77103633323489,
63.90701952816292,
69.83208921288625,
48.318825530104384,
46.866075004498,
43.79609636092417,
79.23293386179718,
80.05922317995427,
66.70480406193239,
66.49495492503577,
67.0721995784583,
25.838967331895883,
60.37069885710225,
60.301159128932504,
56.371153419443566,
77.83091143434687,
68.70399293043886,
69.08301379498349,
43.886467784197876,
58.1123279817493,
57.92026667108576,
63.689429588238426,
63.617802578997775,
55.37547435463056,
60.778011275770005,
64.16448378492244,
63.46289361741425,
77.79499653225467,
71.16228271617086,
71.39615714693673,
71.81543655466032,
70.17346542519046,
71.87839332236227,
64.40157643349518,
64.87016580791871,
65.76439370548329,
30.795325425385716,
25.833717804860335,
26.067403063531568,
65.03521927528267,
64.62163338646481,
64.97554013321405,
64.94207335284045,
62.892016034379076,
56.823746005051746,
56.68268469601176,
55.89931756667269,
64.73176939664393,
64.68760308408112,
62.039927374602314,
61.05866061151978,
38.94657259915192,
41.47262454451639,
42.03478315758355,
45.225737544229936,
46.1679465865397,
62.349785875539155,
56.34727779629169,
63.8216682847302,
63.636962953801635,
48.617757574363246,
58.3670122238434,
56.8938511726681,
27.785470153285935,
25.660046534596926,
25.836963794035032,
29.919101184126255,
43.32570701968771,
46.105045377255685,
45.53223866033279,
27.68214018695477,
26.942822650634863,
37.61836019303777,
27.02787422464126,
26.231263349013634,
25.032214024865144,
66.21064893071143,
65.47877433224612
],
"y": [
40.732121749408975,
41.21232269503546,
39.11606087470449,
23.343306737588648,
23.343306737588648,
24.414524231678485,
32.402482269503544,
30.721778959810877,
33.05814125295508,
33.261303191489354,
37.90632387706855,
39.273049645390074,
34.369459219858165,
34.369459219858165,
22.12433510638298,
22.12433510638298,
20.489804964539008,
20.489804964539008,
28.59781323877068,
29.1334219858156,
24.682328605200944,
14.2010195035461,
33.242833924349874,
33.242833924349874,
26.326093380614658,
18.799867021276594,
17.86716903073286,
2.6761968085106376,
1.8173758865248217,
1.8081412529550822,
1.725029550827422,
1.2078900709219857,
1.3279403073286051,
1.4202866430260035,
19.94496158392435,
19.78797281323877,
19.78797281323877,
19.78797281323877,
20.526743498817968,
20.526743498817968,
1.124778368794326,
18.873744089834513,
18.642878250591018,
18.642878250591018,
18.642878250591018,
18.642878250591018,
18.522828014184395,
18.05186170212766,
9.685283687943262,
22.798463356973993,
21.625664893617017,
21.625664893617017,
26.316858747044915,
26.316858747044915,
22.392139479905435,
22.392139479905435,
29.63209219858156,
29.63209219858156,
22.95545212765957,
27.314199172576835,
10.442523640661937,
21.034648345153663,
21.265514184397162,
18.36583924349882,
1.2540632387706852,
2.0390070921985806,
3.4149674940898342,
2.362219267139479,
9.79609929078014,
8.928043735224584,
48.45227541371159,
42.495936761229316,
29.373522458628837,
29.08724881796691,
29.08724881796691,
16.22340425531915,
15.512337470449172,
21.293218085106382,
21.681072695035464,
21.681072695035464,
19.751034278959807,
21.3670951536643,
7.7552452718676115,
17.239213947990542,
35.366799645390074,
9.011155437352246,
10.507166075650117,
19.61251477541371,
39.36539598108747,
40.45508274231678,
1.9281914893617011,
1.9281914893617011,
29.659796099290777,
24.128250591016545,
24.128250591016545,
15.032136524822693,
7.0718823877068555,
7.1365248226950335,
1.1986554373522447,
0.9585549645390061,
1.4756944444444438,
1.8266105200945615,
1.1617169030732852,
19.67715721040189,
19.67715721040189,
1.4295212765957446,
1.6142139479905429,
1.6603871158392434,
12.594193262411347,
12.594193262411347,
5.612810283687943,
1.1894208037825047,
1.8173758865248217,
17.5993646572104,
4.061391843971631,
4.061391843971631,
3.9228723404255303,
21.644134160756497,
21.644134160756497,
7.7183067375886525,
1.9928339243498814,
1.9928339243498814,
4.4123079196217505,
1.7712027186761226,
1.7712027186761226,
21.56102245862884,
21.56102245862884,
22.983156028368796,
12.741947399527188,
12.741947399527188,
10.257830969267138,
15.309175531914892,
15.918661347517732,
30.1954048463357,
7.524379432624113,
9.56523345153664,
9.56523345153664,
41.21232269503546,
46.74386820330969,
24.553043735224584,
24.553043735224584,
29.604388297872337,
1.3187056737588652,
19.437056737588648,
19.437056737588648,
14.653516548463358,
40.84293735224587,
30.38009751773049,
31.848404255319146,
7.6813682033096935,
18.088800236406616,
18.088800236406616,
21.699541962174944,
21.699541962174944,
15.336879432624112,
19.076906028368796,
22.364435579196215,
21.699541962174944,
40.4366134751773,
29.364287825059105,
29.364287825059105,
31.49748817966903,
31.49748817966903,
31.49748817966903,
16.36192375886525,
16.805186170212764,
18.947621158392433,
2.288342198581559,
1.7712027186761226,
1.5772754137115832,
25.458037825059098,
23.805038416075647,
23.92508865248227,
23.92508865248227,
23.92508865248227,
15.79861111111111,
15.79861111111111,
15.83554964539007,
23.278664302600472,
23.278664302600472,
23.020094562647756,
19.270833333333332,
5.1510786052009445,
6.905658983451536,
7.930703309692672,
7.432033096926712,
8.530954491725769,
20.40669326241135,
15.438460401891252,
22.826167257683213,
22.826167257683213,
10.368646572104018,
15.438460401891252,
16.722074468085104,
1.392582742316784,
1.725029550827422,
1.725029550827422,
1.6142139479905429,
8.530954491725769,
8.678708628841607,
8.678708628841607,
2.186761229314421,
1.3464095744680846,
3.027112884160755,
1.2355939716312052,
1.4110520094562635,
1.2171247044917257,
23.758865248226947,
26.667774822695034
]
},
{
"marker": {
"color": "#FF9D00"
},
"mode": "markers",
"name": "GPQA=f(MMLU)",
"type": "scatter",
"x": [
77.9957192730956,
78.15303198667405,
77.08283992203879,
64.72689535017567,
65.00272042688377,
64.24095801586486,
70.73451954519469,
70.97203740649417,
71.01349610420871,
70.95946328062593,
76.35156727391681,
75.56180045111563,
74.86694971720243,
74.89765262226099,
63.84606128348609,
64.10709139827398,
64.64867451074461,
64.64930135205712,
70.06330822380845,
70.34013960436471,
65.86320847049085,
56.33899126009816,
75.72824616707308,
74.02494713760785,
68.19530224441294,
59.762938924439105,
60.24359775384097,
26.783999358640752,
24.821655499137048,
26.45397884474629,
25.002595295235746,
26.75606660161476,
24.950593393502686,
27.245507410753593,
61.3896585162216,
61.061389920377685,
61.06911135758283,
60.70490193884751,
58.975890703009014,
60.682678692224144,
27.03799242337422,
62.26105246211977,
60.791623329684086,
62.374203705324284,
62.44460409845261,
62.11841694257893,
63.550105055987174,
63.07073351412039,
44.87880425591219,
64.79850156958106,
62.20609448217756,
62.1246057414326,
66.51191133673278,
66.55303373586858,
63.356659125507434,
63.245294478077106,
72.27737838264045,
72.17054547051251,
72.20739527153786,
66.29541259257003,
48.335234653692204,
63.02324877855956,
62.96478981633615,
62.24441734951871,
26.193572189883703,
25.730836314001447,
39.35334081217581,
33.81983654590648,
46.70838530175783,
45.86651800528403,
80.19575908781562,
78.03946337687869,
69.36026470831474,
68.51540963742676,
68.57472555756478,
56.523519639933376,
55.55462512266324,
61.9700216814056,
61.64766854906246,
61.69764020685532,
62.54071682523817,
60.970670017639726,
45.17037929235293,
57.25919892204937,
70.4515868286805,
47.49331993287192,
52.577029472650906,
61.74163205184785,
76.66476969478123,
77.15452603998116,
26.234231797093017,
26.306692338086823,
71.38894203721195,
64.70438661462165,
64.66799008434704,
51.34583865906561,
38.64860206508296,
41.21331864479665,
26.70214825620786,
27.47846514752041,
26.592509199327857,
24.21541212062042,
26.246222375725694,
61.03678038351008,
61.04002830421494,
25.916845651522607,
24.696321332710696,
25.183937022975865,
51.74656618684998,
51.77213745416527,
46.306113561052754,
24.963045825616692,
26.658730035291097,
60.91642429212937,
41.679249280698635,
41.77314644002599,
37.648186828291585,
64.5598699571695,
66.02745725247713,
53.5170310018511,
34.611140710007234,
34.38230443417011,
40.86948703219191,
25.833717804860335,
25.404197294686202,
61.917687321536896,
62.84956375246854,
64.90605869704184,
51.037449016607304,
50.8231434865938,
54.63627087850933,
55.77103633323489,
63.90701952816292,
69.83208921288625,
48.318825530104384,
46.866075004498,
43.79609636092417,
79.23293386179718,
80.05922317995427,
66.70480406193239,
66.49495492503577,
67.0721995784583,
25.838967331895883,
60.37069885710225,
60.301159128932504,
56.371153419443566,
77.83091143434687,
68.70399293043886,
69.08301379498349,
43.886467784197876,
58.1123279817493,
57.92026667108576,
63.689429588238426,
63.617802578997775,
55.37547435463056,
60.778011275770005,
64.16448378492244,
63.46289361741425,
77.79499653225467,
71.16228271617086,
71.39615714693673,
71.81543655466032,
70.17346542519046,
71.87839332236227,
64.40157643349518,
64.87016580791871,
65.76439370548329,
30.795325425385716,
25.833717804860335,
26.067403063531568,
65.03521927528267,
64.62163338646481,
64.97554013321405,
64.94207335284045,
62.892016034379076,
56.823746005051746,
56.68268469601176,
55.89931756667269,
64.73176939664393,
64.68760308408112,
62.039927374602314,
61.05866061151978,
38.94657259915192,
41.47262454451639,
42.03478315758355,
45.225737544229936,
46.1679465865397,
62.349785875539155,
56.34727779629169,
63.8216682847302,
63.636962953801635,
48.617757574363246,
58.3670122238434,
56.8938511726681,
27.785470153285935,
25.660046534596926,
25.836963794035032,
29.919101184126255,
43.32570701968771,
46.105045377255685,
45.53223866033279,
27.68214018695477,
26.942822650634863,
37.61836019303777,
27.02787422464126,
26.231263349013634,
25.032214024865144,
66.21064893071143,
65.47877433224612
],
"y": [
15.436241610738257,
15.100671140939594,
15.324384787472036,
8.277404921700223,
8.277404921700223,
9.060402684563762,
17.225950782997764,
14.541387024608499,
11.297539149888143,
7.829977628635347,
15.548098434004473,
14.205816554809845,
11.74496644295302,
11.74496644295302,
2.572706935123044,
2.572706935123044,
4.250559284116329,
4.250559284116329,
9.060402684563762,
8.7248322147651,
7.046979865771815,
4.5861297539149914,
7.38255033557047,
7.38255033557047,
7.606263982102905,
6.040268456375841,
5.257270693512303,
0,
0.7829977628635317,
2.1252796420581683,
0,
0,
1.1185682326621946,
1.230425055928408,
6.375838926174497,
5.369127516778524,
5.369127516778524,
5.369127516778524,
5.92841163310962,
5.92841163310962,
0,
5.592841163310966,
5.369127516778524,
5.369127516778524,
5.369127516778524,
5.369127516778524,
5.369127516778524,
5.257270693512303,
1.342281879194629,
5.7046979865771785,
3.1319910514541416,
3.1319910514541416,
7.158836689038028,
7.158836689038028,
5.7046979865771785,
5.7046979865771785,
9.61968680089485,
9.61968680089485,
6.935123042505594,
5.8165548098433995,
1.7897091722595053,
6.487695749440718,
5.369127516778524,
2.9082774049216997,
1.0067114093959737,
0,
0.5592841163310973,
2.572706935123044,
7.38255033557047,
6.375838926174497,
13.646532438478745,
12.192393736017896,
5.92841163310962,
2.684563758389265,
2.684563758389265,
3.5794183445190177,
2.2371364653243813,
6.487695749440718,
7.046979865771815,
7.046979865771815,
1.230425055928408,
3.243847874720355,
0.7829977628635317,
1.9015659955257262,
7.270693512304249,
1.342281879194629,
3.243847874720355,
4.026845637583895,
10.626398210290827,
9.61968680089485,
1.5659955257270708,
1.5659955257270708,
7.494407158836691,
6.263982102908276,
6.263982102908276,
2.9082774049216997,
0,
0.22371364653244186,
1.230425055928408,
1.1185682326621946,
0,
1.5659955257270708,
1.9015659955257262,
4.921700223713646,
4.921700223713646,
0,
1.4541387024608499,
2.460850111856823,
2.1252796420581683,
2.1252796420581683,
0.5592841163310973,
0,
2.572706935123044,
5.8165548098433995,
0.6711409395973182,
0.6711409395973182,
3.8031319910514525,
4.921700223713646,
4.921700223713646,
4.5861297539149914,
0,
0,
1.0067114093959737,
1.1185682326621946,
1.1185682326621946,
3.6912751677852316,
3.6912751677852316,
6.263982102908276,
1.1185682326621946,
1.1185682326621946,
0,
4.138702460850116,
1.9015659955257262,
7.046979865771815,
0.5592841163310973,
2.2371364653243813,
2.2371364653243813,
19.686800894854585,
4.921700223713646,
7.38255033557047,
7.38255033557047,
1.230425055928408,
0.5592841163310973,
4.026845637583895,
4.026845637583895,
0.7829977628635317,
11.521252796420578,
9.060402684563762,
9.284116331096197,
2.348993288590602,
2.9082774049216997,
2.9082774049216997,
5.592841163310966,
5.592841163310966,
0,
3.467561521252797,
5.592841163310966,
5.592841163310966,
16.778523489932887,
7.606263982102905,
7.606263982102905,
9.284116331096197,
9.284116331096197,
9.284116331096197,
2.684563758389265,
4.697986577181204,
3.9149888143176734,
1.342281879194629,
1.1185682326621946,
1.230425055928408,
8.165548098434002,
6.823266219239373,
6.487695749440718,
6.487695749440718,
6.487695749440718,
2.684563758389265,
2.684563758389265,
1.9015659955257262,
6.599552572706939,
6.599552572706939,
3.8031319910514525,
2.2371364653243813,
0,
0,
0,
0,
0,
4.921700223713646,
3.0201342281879207,
4.4742729306487705,
4.4742729306487705,
2.572706935123044,
2.796420581655479,
3.1319910514541416,
0,
0,
0,
0,
0,
0.22371364653244186,
0.22371364653244186,
0.6711409395973182,
0.33557046979865535,
0.11185682326622093,
0,
0,
0,
7.829977628635347,
4.138702460850116
]
}
],
"layout": {
"height": 350,
"legend": {
"title": {
"text": "Evaluations"
}
},
"showlegend": true,
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "New scores as functions of old evaluations"
},
"width": 600,
"xaxis": {
"range": [
0,
100
],
"title": {
"text": "v1 scores"
},
"type": "linear"
},
"yaxis": {
"range": [
-5,
80
],
"title": {
"text": "v2 scores"
},
"type": "linear"
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Mapping of comparisons to plot\n",
"mapping = [(\"MMLU\", \"MMLU-PRO\", colors[1]), (\"MMLU\", \"GPQA\", colors[0])]\n",
"\n",
"# Create a new figure object\n",
"fig = go.Figure()\n",
"\n",
"# Add scatter plots for each mapping\n",
"for old, new, color in mapping:\n",
" fig.add_trace(go.Scatter(\n",
" x=merged_data[old],\n",
" y=merged_data[new],\n",
" mode='markers',\n",
" marker_color=color,\n",
" name=f'{new}=f({old})'\n",
" ))\n",
"\n",
"# Update layout\n",
"fig.update_layout(\n",
" title='New scores as functions of old evaluations',\n",
" xaxis=dict(title='v1 scores', range=[0, 100]),\n",
" yaxis=dict(title='v2 scores', range=[-5, 80]),\n",
" legend_title=\"Evaluations\",\n",
" showlegend=True,\n",
" width=600,\n",
" height=350,\n",
")\n",
"\n",
"with open(\"./plots/new_scores_vs_old.html\", \"w\") as f:\n",
" f.write(fig.to_html(full_html=False))\n",
"\n",
"# Display the plot\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": 40,
"id": "2adade8b-f25f-4b98-a06a-89f36c74bfdd",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"hovertemplate": "color=default
GSM8K (v1)=%{x}
MATH (v2)=%{y}",
"legendgroup": "default",
"marker": {
"color": "#FF9D00",
"symbol": "circle"
},
"mode": "markers",
"name": "default",
"orientation": "v",
"showlegend": true,
"type": "scatter",
"x": [
73.23730098559514,
71.64518574677786,
50.34116755117514,
49.81046247156937,
67.09628506444277,
62.69901440485216,
71.87263078089462,
59.66641394996209,
50.644427596664144,
34.874905231235786,
19.787717968157693,
31.91811978771797,
12.661106899166036,
12.130401819560273,
31.387414708112203,
30.32600454890068,
48.9764973464746,
47.61182714177407,
58.605003790750565,
43.51781652767248,
70.73540561031083,
47.30856709628507,
47.384382107657316,
46.019711902956786,
2.9567854435178167,
0.45489006823351025,
1.288855193328279,
5.458680818802123,
1.7437452615617892,
0.22744503411675512,
0.6823351023502654,
14.025777103866567,
11.372251705837757,
29.037149355572407,
27.065959059893856,
44.04852160727824,
45.564821834723276,
0,
1.2130401819560273,
18.11978771796816,
19.56027293404094,
19.408642911296436,
20.0909780136467,
55.117513267627,
61.10689916603488,
20.92494313874147,
67.93025018953753,
60.424564063684606,
61.25852918877938,
70.73540561031083,
70.43214556482184,
60.424564063684606,
60.50037907505686,
70.65959059893859,
71.64518574677786,
69.59818043972706,
69.44655041698256,
5.761940864291129,
32.60045489006823,
33.2827899924185,
19.9393479909022,
0.6065200909780136,
2.047005307050796,
16.300227445034114,
7.657316148597422,
33.586050037907505,
19.02956785443518,
81.04624715693708,
30.09855951478393,
67.62699014404852,
30.856709628506447,
30.62926459438969,
52.23654283548142,
2.4260803639120545,
53.525398028809704,
13.570887035633056,
13.191811978771797,
16.982562547384383,
28.203184230477635,
35.70887035633055,
55.799848369977255,
72.93404094010614,
17.81652767247915,
44.200151630022745,
34.19257012888552,
78.69598180439728,
1.8953752843062925,
1.8953752843062925,
61.10689916603488,
62.3199393479909,
62.39575435936315,
52.23654283548142,
19.63608794541319,
25.094768764215313,
0.22744503411675512,
0.8339651250947688,
1.5163002274450341,
0.3032600454890068,
1.3646702047005308,
64.6702047005307,
64.44275966641395,
1.2130401819560273,
1.061410159211524,
1.1372251705837757,
46.550416982562545,
45.94389689158454,
17.28582259287339,
0.1516300227445034,
2.1986353297952994,
42.98711144806672,
17.36163760424564,
16.90674753601213,
5.458680818802123,
50.87187263078089,
52.76724791508719,
29.18877937831691,
16.148597422289612,
15.693707354056102,
10.083396512509477,
0.6823351023502654,
0.37907505686125853,
44.351781652767244,
55.420773313116,
41.09173616376042,
8.188021228203183,
8.112206216830932,
15.238817285822591,
22.820318423047762,
26.686884003032603,
54.05610310841546,
7.354056103108415,
14.480667172100075,
5.382865807429871,
76.87642153146324,
85.44351781652767,
45.185746777862015,
45.33737680060652,
68.68840030326004,
0,
17.968157695223656,
37.831690674753595,
14.70811220621683,
79.37831690674754,
69.52236542835482,
74.52615617892343,
12.43366186504928,
54.814253222137985,
54.965883244882484,
34.950720242608035,
34.72327520849128,
14.25322213798332,
40.0303260045489,
37.831690674753595,
34.495830174374525,
73.69219105382867,
60.72782410917361,
61.10689916603488,
57.4677786201668,
46.85367702805156,
57.619408642911296,
66.71721000758151,
70.58377558756634,
45.94389689158454,
4.01819560272934,
0.6823351023502654,
0.8339651250947688,
68.15769522365429,
65.95905989385898,
26.611068991660346,
26.838514025777105,
25.777103866565582,
13.419257012888552,
13.646702047005308,
13.495072024260804,
61.48597422289613,
61.63760424564063,
56.02729340409402,
57.84685367702805,
17.437452615617893,
38.81728582259287,
35.329795299469296,
3.3358605003790753,
42.15314632297195,
35.860500379075056,
11.599696739954512,
25.473843821076574,
26.080363912054587,
5.003790750568612,
53.82865807429871,
21.455648218347235,
4.624715693707354,
4.624715693707354,
4.700530705079606,
6.899166034874906,
4.321455648218348,
5.686125852918878,
4.700530705079606,
3.0326004548900682,
0.45489006823351025,
1.592115238817286,
1.288855193328279,
0.530705079605762,
1.3646702047005308,
55.49658832448825
],
"xaxis": "x",
"y": [
14.04833836858006,
23.338368580060422,
5.664652567975831,
5.664652567975831,
12.537764350453173,
10.196374622356496,
11.63141993957704,
12.613293051359516,
4.45619335347432,
4.45619335347432,
4.305135951661631,
4.305135951661631,
1.5105740181268883,
1.5105740181268883,
1.2084592145015105,
1.2084592145015105,
4.380664652567976,
5.81570996978852,
3.3232628398791544,
1.4350453172205437,
7.552870090634441,
7.552870090634441,
2.416918429003021,
2.794561933534743,
1.2084592145015105,
0.6797583081570997,
0.5287009063444109,
0.6042296072507553,
0.906344410876133,
0.22658610271903326,
0.3021148036253776,
1.5105740181268883,
2.416918429003021,
2.416918429003021,
2.416918429003021,
6.646525679758309,
6.646525679758309,
0,
2.190332326283988,
3.1722054380664653,
3.1722054380664653,
3.1722054380664653,
3.1722054380664653,
4.531722054380665,
0.6797583081570997,
1.812688821752266,
5.740181268882175,
4.833836858006042,
4.833836858006042,
8.685800604229607,
8.685800604229607,
4.229607250755287,
4.229607250755287,
10.27190332326284,
10.27190332326284,
1.7371601208459215,
5.211480362537765,
0.6797583081570997,
2.492447129909366,
3.0211480362537766,
2.9456193353474323,
1.4350453172205437,
0.6042296072507553,
0.4531722054380665,
0,
2.2658610271903323,
0.4531722054380665,
23.036253776435046,
0,
16.46525679758308,
0,
0,
2.416918429003021,
0.9818731117824773,
4.45619335347432,
0,
0,
0.1510574018126888,
0,
2.56797583081571,
6.268882175226587,
18.806646525679756,
1.6616314199395772,
1.6616314199395772,
5.664652567975831,
17.749244712990937,
0.7552870090634441,
0.7552870090634441,
8.685800604229607,
7.175226586102719,
7.175226586102719,
4.833836858006042,
1.4350453172205437,
2.56797583081571,
0.1510574018126888,
0.0755287009063444,
0.0755287009063444,
0.0755287009063444,
0,
5.0604229607250755,
5.0604229607250755,
1.4350453172205437,
0.6042296072507553,
0.9818731117824773,
1.7371601208459215,
1.7371601208459215,
1.812688821752266,
0.7552870090634441,
0.6042296072507553,
3.1722054380664653,
2.719033232628399,
2.719033232628399,
0.4531722054380665,
6.419939577039275,
6.419939577039275,
1.5861027190332326,
1.6616314199395772,
1.6616314199395772,
1.7371601208459215,
0.22658610271903326,
0.22658610271903326,
4.380664652567976,
4.380664652567976,
2.3413897280966767,
0.7552870090634441,
0.7552870090634441,
0.6042296072507553,
1.0574018126888218,
0.906344410876133,
2.492447129909366,
0.6797583081570997,
1.2084592145015105,
1.2084592145015105,
16.540785498489427,
23.338368580060422,
3.2477341389728096,
3.2477341389728096,
8.685800604229607,
0,
0.9818731117824773,
0.9818731117824773,
0.8308157099697886,
16.993957703927492,
8.91238670694864,
11.63141993957704,
1.1329305135951662,
2.416918429003021,
2.416918429003021,
2.643504531722054,
2.643504531722054,
1.5105740181268883,
2.643504531722054,
2.492447129909366,
2.643504531722054,
16.842900302114806,
9.06344410876133,
9.06344410876133,
8.836858006042297,
8.836858006042297,
8.836858006042297,
3.8519637462235647,
4.758308157099698,
3.8519637462235647,
1.283987915407855,
0.22658610271903326,
0.6797583081570997,
6.495468277945619,
6.873111782477341,
6.570996978851963,
6.570996978851963,
6.570996978851963,
1.1329305135951662,
1.1329305135951662,
1.5861027190332326,
3.927492447129909,
3.927492447129909,
3.474320241691843,
2.0392749244712993,
0.1510574018126888,
1.0574018126888218,
2.1148036253776437,
0.6797583081570997,
4.078549848942599,
2.9456193353474323,
0.8308157099697886,
4.45619335347432,
4.45619335347432,
1.0574018126888218,
2.3413897280966767,
1.3595166163141994,
0.5287009063444109,
0.6042296072507553,
0.6042296072507553,
1.1329305135951662,
0.6797583081570997,
1.0574018126888218,
1.0574018126888218,
0.9818731117824773,
0.1510574018126888,
1.3595166163141994,
0.906344410876133,
0.3021148036253776,
0.6042296072507553,
2.1148036253776437
],
"yaxis": "y"
},
{
"hovertemplate": "color=green
GSM8K (v1)=%{x}
MATH (v2)=%{y}",
"legendgroup": "green",
"marker": {
"color": "#297373",
"symbol": "circle"
},
"mode": "markers",
"name": "green",
"orientation": "v",
"showlegend": true,
"type": "scatter",
"x": [
0,
0
],
"xaxis": "x",
"y": [
13.444108761329304,
9.592145015105741
],
"yaxis": "y"
},
{
"hovertemplate": "color=red
GSM8K (v1)=%{x}
MATH (v2)=%{y}",
"legendgroup": "red",
"marker": {
"color": "#FF323D",
"symbol": "circle"
},
"mode": "markers",
"name": "red",
"orientation": "v",
"showlegend": true,
"type": "scatter",
"x": [
56.633813495072026,
72.17589082638362,
64.74601971190296
],
"xaxis": "x",
"y": [
0,
0,
0
],
"yaxis": "y"
}
],
"layout": {
"height": 350,
"legend": {
"title": {
"text": "color"
},
"tracegroupgap": 0
},
"showlegend": false,
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "MATH (v2) as Function of GSM8K (v1)"
},
"width": 600,
"xaxis": {
"anchor": "y",
"autorange": true,
"domain": [
0,
1
],
"range": [
-5.663244745883108,
91.10676256241078
],
"title": {
"text": "GSM8K (v1)"
},
"type": "linear"
},
"yaxis": {
"anchor": "x",
"autorange": true,
"domain": [
0,
1
],
"range": [
-1.9649141931665992,
25.30328277322702
],
"title": {
"text": "MATH (v2)"
},
"type": "linear"
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Custom color mapping\n",
"color_map = {'green': GREEN, 'red': RED, 'default': ORANGE}\n",
"\n",
"# Add a new column for color based on more detailed conditions\n",
"conditions = [\n",
" (merged_data['MATH Lvl 5'] > 0) & (merged_data['GSM8K'] > 0),\n",
" (merged_data['MATH Lvl 5'] == 0) & (merged_data['GSM8K'] > 40),\n",
" (merged_data['MATH Lvl 5'] > 0) & (merged_data['GSM8K'] == 0)\n",
"]\n",
"\n",
"# Labels for each condition\n",
"labels = ['default', 'red', 'green']\n",
"\n",
"# Use np.select to apply multiple conditions\n",
"merged_data['color'] = np.select(conditions, labels, default='default')\n",
"\n",
"# Create the scatter plot using the new 'color' column for color-coding\n",
"fig = px.scatter(\n",
" merged_data, \n",
" x='GSM8K', \n",
" y='MATH Lvl 5',\n",
" labels={\n",
" 'GSM8K': 'GSM8K (v1)', \n",
" 'MATH Lvl 5': 'MATH (v2)'\n",
" },\n",
" title='MATH (v2) as Function of GSM8K (v1)',\n",
" color='color',\n",
" color_discrete_map=color_map\n",
")\n",
"\n",
"# Update axes and layout\n",
"fig.update_xaxes(title_text='GSM8K (v1)')\n",
"fig.update_yaxes(title_text='MATH (v2)')\n",
"fig.update_layout(\n",
" showlegend=False,\n",
" width=600,\n",
" height=350\n",
")\n",
"\n",
"with open(\"math_vs_gsm8k.html\", \"w\") as f:\n",
" f.write(fig.to_html(full_html=False))\n",
"\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "f22371aa-3f2c-4bf8-84e0-d887eefbb4ac",
"metadata": {},
"source": [
"- MMLU and MMLU-Pro are well correlated: overall, a model with a high MMLU score also has a high MMLU-Pro score\n",
"- For MATH vs GSM8K, we identify 3 groups:\n",
"    - \"High\" MATH score, very low GSM8K (v1) score (2 outliers): possible overfitting on MATH or, more likely, models that had issues with EOS tokens on GSM8K\n",
"    - Correlated v2 and v1 scores (most models)\n",
"    - Low MATH score, high GSM8K score: likely overfitting on GSM8K"
]
},
{
"cell_type": "markdown",
"id": "cbac9655-0697-4b5c-88f3-2286038d4c67",
"metadata": {},
"source": [
"# Ranking Analysis between V1 and V2"
]
},
{
"cell_type": "code",
"execution_count": 41,
"id": "dd7a03bc-183e-4e57-9791-025ff634c76f",
"metadata": {},
"outputs": [],
"source": [
"# Keep only the model name and the leaderboard average for each version.\n",
"# TODO: ideally models would also be matched on precision; for now only the\n",
"# columns available in both frames are kept.\n",
"rank_columns = [\"fullname\", \"Average ⬆️\"]\n",
"v2_rank_data = data_v2[rank_columns]\n",
"v1_rank_data = data_v1[rank_columns]"
]
},
{
"cell_type": "code",
"execution_count": 42,
"id": "597ed839-45af-49cf-a160-f9b47eccfc63",
"metadata": {},
"outputs": [],
"source": [
"# Give the shared average column a version-specific name in each frame\n",
"v2_rank_data = v2_rank_data.rename({\"Average ⬆️\": \"v2_score\"}, axis=\"columns\")\n",
"v1_rank_data = v1_rank_data.rename({\"Average ⬆️\": \"v1_score\"}, axis=\"columns\")"
]
},
{
"cell_type": "code",
"execution_count": 43,
"id": "865a4e21-0a4b-4a7e-b3e6-ebd23b4d9ec9",
"metadata": {},
"outputs": [],
"source": [
"# Inner join on 'fullname': keeps only models present in both v1 and v2\n",
"merged_rank_data = v1_rank_data.merge(v2_rank_data, how=\"inner\", on=\"fullname\")"
]
},
{
"cell_type": "code",
"execution_count": 44,
"id": "edbe899e-4603-4e52-a64d-39a5db8097b0",
"metadata": {},
"outputs": [],
"source": [
"# Keep the first row per model, then discard rows with any missing value\n",
"deduplicated_ranks = merged_rank_data.drop_duplicates(subset=\"fullname\")\n",
"merged_rank_data = deduplicated_ranks.dropna()"
]
},
{
"cell_type": "code",
"execution_count": 45,
"id": "63cda705-29b5-41ff-b535-a4a8d1ee4e32",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fullname | \n",
" v1_score | \n",
" v2_score | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 01-ai/Yi-1.5-34B | \n",
" 73.504618 | \n",
" 25.812197 | \n",
"
\n",
" \n",
" 1 | \n",
" 01-ai/Yi-1.5-34B-32K | \n",
" 60.700977 | \n",
" 26.787600 | \n",
"
\n",
" \n",
" 2 | \n",
" 01-ai/Yi-1.5-34B-Chat | \n",
" 74.823763 | \n",
" 33.076818 | \n",
"
\n",
" \n",
" 3 | \n",
" 01-ai/Yi-1.5-6B | \n",
" 61.566520 | \n",
" 16.778059 | \n",
"
\n",
" \n",
" 5 | \n",
" 01-ai/Yi-1.5-6B-Chat | \n",
" 66.167303 | \n",
" 22.405532 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fullname v1_score v2_score\n",
"0 01-ai/Yi-1.5-34B 73.504618 25.812197\n",
"1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600\n",
"2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818\n",
"3 01-ai/Yi-1.5-6B 61.566520 16.778059\n",
"5 01-ai/Yi-1.5-6B-Chat 66.167303 22.405532"
]
},
"execution_count": 45,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Preview the first five merged rows\n",
"merged_rank_data.head(n=5)"
]
},
{
"cell_type": "code",
"execution_count": 46,
"id": "9e98fda2-590e-4333-9d32-44bcfc961464",
"metadata": {},
"outputs": [],
"source": [
"# Rank models within each leaderboard version (rank 1 = highest average score).\n",
"# method=\"min\" gives tied models the same whole-number rank (competition ranking)\n",
"# instead of the default fractional average (e.g. 165.5), which the label-building\n",
"# cell would otherwise truncate with int() and so disagree with the plotted change.\n",
"merged_rank_data[\"v1_rank\"] = merged_rank_data[\"v1_score\"].rank(ascending=False, method=\"min\")\n",
"merged_rank_data[\"v2_rank\"] = merged_rank_data[\"v2_score\"].rank(ascending=False, method=\"min\")"
]
},
{
"cell_type": "code",
"execution_count": 47,
"id": "3d9a8416-8c50-4290-83a3-34651e0af8a7",
"metadata": {},
"outputs": [],
"source": [
"# rank_change = v2 rank - v1 rank: positive means the model sits lower on v2,\n",
"# negative means it moved up\n",
"rank_delta = merged_rank_data[\"v2_rank\"] - merged_rank_data[\"v1_rank\"]\n",
"merged_rank_data[\"rank_change\"] = rank_delta\n",
"\n",
"# Order from most negative to most positive change and discard incomplete rows\n",
"merged_rank_data = merged_rank_data.sort_values(\"rank_change\").dropna()"
]
},
{
"cell_type": "code",
"execution_count": 48,
"id": "f738b928-000e-41d5-aadc-da47748cff35",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fullname | \n",
" v1_score | \n",
" v2_score | \n",
" v1_rank | \n",
" v2_rank | \n",
" rank_change | \n",
"
\n",
" \n",
" \n",
" \n",
" 89 | \n",
" abacusai/Smaug-72B-v0.1 | \n",
" 80.481415 | \n",
" 29.976022 | \n",
" 1.0 | \n",
" 5.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 138 | \n",
" meta-llama/Meta-Llama-3-70B-Instruct | \n",
" 77.882051 | \n",
" 36.670225 | \n",
" 2.0 | \n",
" 1.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 88 | \n",
" abacusai/Smaug-34B-v0.1 | \n",
" 77.285644 | \n",
" 24.127214 | \n",
" 3.0 | \n",
" 22.0 | \n",
" 19.0 | \n",
"
\n",
" \n",
" 164 | \n",
" mlabonne/AlphaMonarch-7B | \n",
" 75.988300 | \n",
" 17.913614 | \n",
" 4.0 | \n",
" 59.0 | \n",
" 55.0 | \n",
"
\n",
" \n",
" 165 | \n",
" mlabonne/Beyonder-4x7B-v3 | \n",
" 75.654718 | \n",
" 19.642237 | \n",
" 5.0 | \n",
" 47.0 | \n",
" 42.0 | \n",
"
\n",
" \n",
" 70 | \n",
" Qwen/Qwen1.5-110B | \n",
" 75.415188 | \n",
" 29.975375 | \n",
" 6.0 | \n",
" 6.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 01-ai/Yi-1.5-34B-Chat | \n",
" 74.823763 | \n",
" 33.076818 | \n",
" 7.0 | \n",
" 3.0 | \n",
" -4.0 | \n",
"
\n",
" \n",
" 22 | \n",
" CohereForAI/c4ai-command-r-plus | \n",
" 74.618473 | \n",
" 31.295336 | \n",
" 8.0 | \n",
" 4.0 | \n",
" -4.0 | \n",
"
\n",
" \n",
" 158 | \n",
" mistralai/Mixtral-8x22B-v0.1 | \n",
" 74.471418 | \n",
" 25.871531 | \n",
" 9.0 | \n",
" 14.0 | \n",
" 5.0 | \n",
"
\n",
" \n",
" 207 | \n",
" upstage/SOLAR-10.7B-Instruct-v1.0 | \n",
" 74.200698 | \n",
" 19.961989 | \n",
" 10.0 | \n",
" 45.0 | \n",
" 35.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fullname v1_score v2_score v1_rank \\\n",
"89 abacusai/Smaug-72B-v0.1 80.481415 29.976022 1.0 \n",
"138 meta-llama/Meta-Llama-3-70B-Instruct 77.882051 36.670225 2.0 \n",
"88 abacusai/Smaug-34B-v0.1 77.285644 24.127214 3.0 \n",
"164 mlabonne/AlphaMonarch-7B 75.988300 17.913614 4.0 \n",
"165 mlabonne/Beyonder-4x7B-v3 75.654718 19.642237 5.0 \n",
"70 Qwen/Qwen1.5-110B 75.415188 29.975375 6.0 \n",
"2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818 7.0 \n",
"22 CohereForAI/c4ai-command-r-plus 74.618473 31.295336 8.0 \n",
"158 mistralai/Mixtral-8x22B-v0.1 74.471418 25.871531 9.0 \n",
"207 upstage/SOLAR-10.7B-Instruct-v1.0 74.200698 19.961989 10.0 \n",
"\n",
" v2_rank rank_change \n",
"89 5.0 4.0 \n",
"138 1.0 -1.0 \n",
"88 22.0 19.0 \n",
"164 59.0 55.0 \n",
"165 47.0 42.0 \n",
"70 6.0 0.0 \n",
"2 3.0 -4.0 \n",
"22 4.0 -4.0 \n",
"158 14.0 5.0 \n",
"207 45.0 35.0 "
]
},
"execution_count": 48,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten best-ranked models on v1\n",
"merged_rank_data.sort_values(\"v1_rank\").head(10)"
]
},
{
"cell_type": "code",
"execution_count": 49,
"id": "fe727afb-0207-4c6c-9a77-a0a1e026e793",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fullname | \n",
" v1_score | \n",
" v2_score | \n",
" v1_rank | \n",
" v2_rank | \n",
" rank_change | \n",
"
\n",
" \n",
" \n",
" \n",
" 138 | \n",
" meta-llama/Meta-Llama-3-70B-Instruct | \n",
" 77.882051 | \n",
" 36.670225 | \n",
" 2.0 | \n",
" 1.0 | \n",
" -1.0 | \n",
"
\n",
" \n",
" 146 | \n",
" microsoft/Phi-3-medium-4k-instruct | \n",
" 73.448553 | \n",
" 33.116864 | \n",
" 13.0 | \n",
" 2.0 | \n",
" -11.0 | \n",
"
\n",
" \n",
" 2 | \n",
" 01-ai/Yi-1.5-34B-Chat | \n",
" 74.823763 | \n",
" 33.076818 | \n",
" 7.0 | \n",
" 3.0 | \n",
" -4.0 | \n",
"
\n",
" \n",
" 22 | \n",
" CohereForAI/c4ai-command-r-plus | \n",
" 74.618473 | \n",
" 31.295336 | \n",
" 8.0 | \n",
" 4.0 | \n",
" -4.0 | \n",
"
\n",
" \n",
" 89 | \n",
" abacusai/Smaug-72B-v0.1 | \n",
" 80.481415 | \n",
" 29.976022 | \n",
" 1.0 | \n",
" 5.0 | \n",
" 4.0 | \n",
"
\n",
" \n",
" 70 | \n",
" Qwen/Qwen1.5-110B | \n",
" 75.415188 | \n",
" 29.975375 | \n",
" 6.0 | \n",
" 6.0 | \n",
" 0.0 | \n",
"
\n",
" \n",
" 71 | \n",
" Qwen/Qwen1.5-110B-Chat | \n",
" 68.006415 | \n",
" 29.639560 | \n",
" 34.0 | \n",
" 7.0 | \n",
" -27.0 | \n",
"
\n",
" \n",
" 8 | \n",
" 01-ai/Yi-1.5-9B-Chat | \n",
" 69.555695 | \n",
" 28.111418 | \n",
" 21.0 | \n",
" 8.0 | \n",
" -13.0 | \n",
"
\n",
" \n",
" 56 | \n",
" NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
" 73.122196 | \n",
" 27.343441 | \n",
" 14.0 | \n",
" 9.0 | \n",
" -5.0 | \n",
"
\n",
" \n",
" 1 | \n",
" 01-ai/Yi-1.5-34B-32K | \n",
" 60.700977 | \n",
" 26.787600 | \n",
" 71.0 | \n",
" 10.0 | \n",
" -61.0 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fullname v1_score v2_score \\\n",
"138 meta-llama/Meta-Llama-3-70B-Instruct 77.882051 36.670225 \n",
"146 microsoft/Phi-3-medium-4k-instruct 73.448553 33.116864 \n",
"2 01-ai/Yi-1.5-34B-Chat 74.823763 33.076818 \n",
"22 CohereForAI/c4ai-command-r-plus 74.618473 31.295336 \n",
"89 abacusai/Smaug-72B-v0.1 80.481415 29.976022 \n",
"70 Qwen/Qwen1.5-110B 75.415188 29.975375 \n",
"71 Qwen/Qwen1.5-110B-Chat 68.006415 29.639560 \n",
"8 01-ai/Yi-1.5-9B-Chat 69.555695 28.111418 \n",
"56 NousResearch/Nous-Hermes-2-Mixtral-8x7B-DPO 73.122196 27.343441 \n",
"1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600 \n",
"\n",
" v1_rank v2_rank rank_change \n",
"138 2.0 1.0 -1.0 \n",
"146 13.0 2.0 -11.0 \n",
"2 7.0 3.0 -4.0 \n",
"22 8.0 4.0 -4.0 \n",
"89 1.0 5.0 4.0 \n",
"70 6.0 6.0 0.0 \n",
"71 34.0 7.0 -27.0 \n",
"8 21.0 8.0 -13.0 \n",
"56 14.0 9.0 -5.0 \n",
"1 71.0 10.0 -61.0 "
]
},
"execution_count": 49,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Ten best-ranked models on v2\n",
"merged_rank_data.sort_values(\"v2_rank\").head(10)"
]
},
{
"cell_type": "code",
"execution_count": 50,
"id": "1c4101f6-01aa-4bdb-b1d4-216d773c9dd4",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" fullname | \n",
" v1_score | \n",
" v2_score | \n",
" v1_rank | \n",
" v2_rank | \n",
" rank_change | \n",
" color | \n",
" rank_change_info | \n",
"
\n",
" \n",
" \n",
" \n",
" 1 | \n",
" 01-ai/Yi-1.5-34B-32K | \n",
" 60.700977 | \n",
" 26.787600 | \n",
" 71.0 | \n",
" 10.0 | \n",
" -61.0 | \n",
" #32343D | \n",
" 71 → 10 | \n",
"
\n",
" \n",
" 7 | \n",
" 01-ai/Yi-1.5-9B-32K | \n",
" 55.217184 | \n",
" 19.937832 | \n",
" 96.0 | \n",
" 46.0 | \n",
" -50.0 | \n",
" #32343D | \n",
" 96 → 46 | \n",
"
\n",
" \n",
" 42 | \n",
" Intel/neural-chat-7b-v3-1 | \n",
" 59.900319 | \n",
" 21.348573 | \n",
" 76.0 | \n",
" 40.0 | \n",
" -36.0 | \n",
" #32343D | \n",
" 76 → 40 | \n",
"
\n",
" \n",
" 123 | \n",
" gpt2 | \n",
" 28.530425 | \n",
" 6.754202 | \n",
" 165.5 | \n",
" 131.0 | \n",
" -34.5 | \n",
" #32343D | \n",
" 165 → 131 | \n",
"
\n",
" \n",
" 12 | \n",
" 01-ai/Yi-34B-Chat | \n",
" 63.173386 | \n",
" 24.268419 | \n",
" 55.0 | \n",
" 21.0 | \n",
" -34.0 | \n",
" #32343D | \n",
" 55 → 21 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" fullname v1_score v2_score v1_rank v2_rank \\\n",
"1 01-ai/Yi-1.5-34B-32K 60.700977 26.787600 71.0 10.0 \n",
"7 01-ai/Yi-1.5-9B-32K 55.217184 19.937832 96.0 46.0 \n",
"42 Intel/neural-chat-7b-v3-1 59.900319 21.348573 76.0 40.0 \n",
"123 gpt2 28.530425 6.754202 165.5 131.0 \n",
"12 01-ai/Yi-34B-Chat 63.173386 24.268419 55.0 21.0 \n",
"\n",
" rank_change color rank_change_info \n",
"1 -61.0 #32343D 71 → 10 \n",
"7 -50.0 #32343D 96 → 46 \n",
"42 -36.0 #32343D 76 → 40 \n",
"123 -34.5 #32343D 165 → 131 \n",
"12 -34.0 #32343D 55 → 21 "
]
},
"execution_count": 50,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Order models from most negative to most positive rank change\n",
"merged_rank_data = merged_rank_data.sort_values('rank_change')\n",
"\n",
"# Direction flag using the notebook palette constants rather than repeated hex\n",
"# codes: ORANGE when rank_change > 0, BLACK otherwise\n",
"merged_rank_data['color'] = np.where(merged_rank_data['rank_change'] > 0, ORANGE, BLACK)\n",
"\n",
"# Ten most positive and ten most negative rank changes\n",
"top_pos = merged_rank_data.nlargest(10, 'rank_change')\n",
"top_neg = merged_rank_data.nsmallest(10, 'rank_change')\n",
"\n",
"# Combine both extremes into one frame, sorted by rank change\n",
"top_changes = pd.concat([top_pos, top_neg]).sort_values('rank_change')\n",
"\n",
"# Human-readable \"v1 rank → v2 rank\" label (ranks are truncated to integers)\n",
"top_changes[\"rank_change_info\"] = [\n",
"    f\"{int(v1_rank)} → {int(v2_rank)}\"\n",
"    for v1_rank, v2_rank in zip(top_changes[\"v1_rank\"], top_changes[\"v2_rank\"])\n",
"]\n",
"\n",
"top_changes.head()"
]
},
{
"cell_type": "code",
"execution_count": 59,
"id": "6cf0127f-979e-4e20-8a5c-9cd1d1766810",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"alignmentgroup": "True",
"hovertemplate": "rank_change=%{marker.color}
fullname=%{y}
rank_change_info=%{text}",
"legendgroup": "",
"marker": {
"color": [
55,
42,
41,
38,
36,
35,
33,
32,
30,
29,
-30,
-30,
-32,
-33,
-33.5,
-34,
-34.5,
-36,
-50,
-61
],
"coloraxis": "coloraxis",
"pattern": {
"shape": ""
}
},
"name": "",
"offsetgroup": "",
"orientation": "h",
"showlegend": false,
"text": [
"4 → 59",
"5 → 47",
"46 → 87",
"30 → 68",
"59 → 95",
"10 → 45",
"58 → 91",
"53 → 85",
"125 → 155",
"128 → 157",
"79 → 49",
"83 → 53",
"97 → 65",
"68 → 35",
"165 → 132",
"55 → 21",
"165 → 131",
"76 → 40",
"96 → 46",
"71 → 10"
],
"textposition": "outside",
"type": "bar",
"x": [
55,
42,
41,
38,
36,
35,
33,
32,
30,
29,
-30,
-30,
-32,
-33,
-33.5,
-34,
-34.5,
-36,
-50,
-61
],
"xaxis": "x",
"y": [
"mlabonne/AlphaMonarch-7B",
"mlabonne/Beyonder-4x7B-v3",
"tiiuae/falcon-11B",
"stabilityai/stablelm-2-12b-chat",
"meta-llama/Llama-2-70b-chat-hf",
"upstage/SOLAR-10.7B-Instruct-v1.0",
"meta-llama/Meta-Llama-3-8B",
"stabilityai/stablelm-2-12b",
"tiiuae/falcon-7b",
"togethercomputer/GPT-NeoXT-Chat-Base-20B",
"HuggingFaceH4/zephyr-7b-alpha",
"microsoft/Orca-2-13b",
"Qwen/Qwen1.5-7B-Chat",
"openchat/openchat_3.5",
"openai-community/gpt2",
"01-ai/Yi-34B-Chat",
"gpt2",
"Intel/neural-chat-7b-v3-1",
"01-ai/Yi-1.5-9B-32K",
"01-ai/Yi-1.5-34B-32K"
],
"yaxis": "y"
}
],
"layout": {
"bargap": 0.1,
"barmode": "relative",
"coloraxis": {
"colorbar": {
"title": {
"text": "rank_change"
}
},
"colorscale": [
[
0,
"#32343D"
],
[
1,
"#FF9D00"
]
]
},
"height": 400,
"legend": {
"tracegroupgap": 0
},
"paper_bgcolor": "rgba(0, 0, 0, 0)",
"plot_bgcolor": "rgba(0, 0, 0, 0)",
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Top and Bottom Changes in Rankings from v1 to v2"
},
"width": 650,
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"range": [
-85,
85
],
"tickfont": {
"size": 14
},
"title": {
"text": "rank_change"
},
"type": "linear"
},
"yaxis": {
"anchor": "x",
"autorange": true,
"domain": [
0,
1
],
"range": [
19.5,
-0.5
],
"title": {
"text": "Model"
},
"type": "category"
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Sort so the most positive rank changes come first\n",
"top_changes_sorted = top_changes.sort_values(\"rank_change\", ascending=False)\n",
"\n",
"# Horizontal bar chart, coloured on a BLACK-to-ORANGE scale by rank_change.\n",
"# The figure size is deliberately not passed here: it is set once in\n",
"# update_layout below, which would override any size given to px.bar.\n",
"fig = px.bar(\n",
"    top_changes_sorted,\n",
"    y=\"fullname\",\n",
"    x=\"rank_change\",\n",
"    text=\"rank_change_info\",\n",
"    orientation='h',\n",
"    title=\"Top and Bottom Changes in Rankings from v1 to v2\",\n",
"    template='plotly',\n",
"    color='rank_change',\n",
"    color_continuous_scale=[BLACK, ORANGE]\n",
")\n",
"\n",
"# Layout: final size, transparent background and a fixed symmetric x range\n",
"fig.update_layout(\n",
"    xaxis_tickfont_size=14,\n",
"    yaxis=dict(title='Model'),\n",
"    bargap=0.1,  # gap between bars of adjacent location coordinates\n",
"    xaxis_range=[-85, 85],\n",
"    paper_bgcolor=\"rgba(0,0,0,0)\",\n",
"    plot_bgcolor=\"rgba(0,0,0,0)\",\n",
"    width=650,\n",
"    height=400\n",
")\n",
"\n",
"fig.update_yaxes(autorange=\"reversed\")  # first row of the frame at the top\n",
"fig.update_traces(textposition='outside')  # text labels outside the bars\n",
"\n",
"# Save the figure as an embeddable HTML fragment\n",
"fig.write_html(\"./plots/rankings_change.html\", full_html=False)\n",
"\n",
"# Display the figure\n",
"fig.show()"
]
},
{
"cell_type": "markdown",
"id": "a1adaffc-c98d-42c2-966e-69c0a228a6b5",
"metadata": {},
"source": [
"# Params and Performance"
]
},
{
"cell_type": "code",
"execution_count": 52,
"id": "d78410a9-a826-4b07-b4d6-896b4bc5c39d",
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Thresholds in correct order: True\n"
]
}
],
"source": [
"# Date cut-offs (UTC); presumably used by later cells to bucket models by date\n",
"thresholds = [\n",
"    datetime(2023, 10, 1, tzinfo=timezone.utc),\n",
"    datetime(2023, 12, 1, tzinfo=timezone.utc),\n",
"    datetime(2024, 2, 1, tzinfo=timezone.utc),\n",
"    datetime(2024, 5, 1, tzinfo=timezone.utc),\n",
"]\n",
"threshold_1, threshold_2, threshold_3, threshold_4 = thresholds\n",
"\n",
"# Sanity check: each cut-off must be strictly earlier than the next one\n",
"is_increasing = all(\n",
"    earlier < later for earlier, later in zip(thresholds, thresholds[1:])\n",
")\n",
"print(\"Thresholds in correct order:\", is_increasing)"
]
},
{
"cell_type": "code",
"execution_count": 53,
"id": "60b4c31f-deba-4648-983b-74da5615755a",
"metadata": {},
"outputs": [],
"source": [
"# Parse the 'date' column of both leaderboards as timezone-aware (UTC) datetimes\n",
"for leaderboard in (data_v1, data_v2):\n",
"    leaderboard['date'] = pd.to_datetime(leaderboard['date'], utc=True)"
]
},
{
"cell_type": "code",
"execution_count": 54,
"id": "5ee768cd-f0ee-4272-83a4-667b9147efdc",
"metadata": {},
"outputs": [],
"source": [
"# Per-model mean over the task columns of each leaderboard version\n",
"data_v1['mean_score_v1'] = data_v1[tasks_v1].mean(axis=\"columns\")\n",
"data_v2['mean_score_v2'] = data_v2[tasks_v2].mean(axis=\"columns\")\n",
"\n",
"# full=True also carries the v1 date and uses it as an extra merge key;\n",
"# full=False takes the date from v2 only.\n",
"full = False\n",
"\n",
"shared_keys = ['fullname', '#Params (B)']\n",
"v1_columns = shared_keys + ['mean_score_v1']\n",
"merge_keys = shared_keys\n",
"if full:\n",
"    v1_columns = v1_columns + ['date']\n",
"    merge_keys = ['fullname', 'date', \"#Params (B)\"]\n",
"\n",
"v1_data = data_v1[v1_columns]\n",
"v2_data = data_v2[shared_keys + ['mean_score_v2', 'date']]\n",
"\n",
"# Outer join keeps models that appear in only one of the two versions\n",
"merged_data = pd.merge(v1_data, v2_data, on=merge_keys, how='outer')"
]
},
{
"cell_type": "code",
"execution_count": 55,
"id": "03e4d74c-96c2-493d-85e0-b9bacca5475b",
"metadata": {},
"outputs": [],
"source": [
"# Keep only rows that have a date and make sure the column holds datetimes.\n",
"# The frame is rebound rather than mutated with inplace=True, and subset is a\n",
"# list because older pandas releases do not accept a bare string there.\n",
"merged_data = (\n",
"    merged_data\n",
"    .dropna(subset=[\"date\"])\n",
"    .assign(date=lambda frame: pd.to_datetime(frame[\"date\"]))\n",
")\n",
"\n",
"# Sanity check: report any value that is not a pandas Timestamp. pd.Timestamp is\n",
"# the public name of the class and subclasses datetime, so one check is enough.\n",
"for value in merged_data[\"date\"]:\n",
"    if not isinstance(value, pd.Timestamp):\n",
"        print(value, type(value))"
]
},
{
"cell_type": "code",
"execution_count": 58,
"id": "c68c9dc6-e9a1-4290-a5a0-9c2bf50d8054",
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"marker": {
"color": "#CD4631"
},
"mode": "markers",
"name": "V1 - before Oct 2023",
"type": "scatter",
"x": [
6,
1,
2,
20,
12,
0,
2,
0,
6,
13,
6,
7,
7,
7,
12,
6,
13,
70,
1,
1,
3,
0,
7,
6,
12,
3,
7,
0,
1,
30,
0,
0,
7,
7,
13,
13,
68,
68,
6,
6,
0,
1,
1,
7,
7,
7,
0,
0,
0,
1,
13,
13,
2,
13,
7,
40,
40,
7,
7,
7,
20,
7,
7,
7,
7,
7,
7,
3,
3,
3
],
"y": [
40.10074896682374,
33.58450647496717,
36.19824768495068,
41.69377240765692,
38.822040906736184,
29.017386733865294,
null,
31.55133377215884,
null,
29.824105477736065,
51.86594648645109,
null,
null,
60.17299741784677,
40.76571366769813,
38.46964052270226,
null,
null,
32.474427835547964,
33.98180945576432,
36.07003025101667,
30.132439377906426,
39.17746104462925,
null,
39.45681851901109,
27.39878579922899,
39.240691196349985,
null,
34.60432345850859,
41.99813222281348,
28.530424870857342,
28.276436827607895,
52.05662078097521,
51.9937163576299,
54.913330129300256,
55.68584772658749,
62.395586113277396,
67.86780441186171,
50.73977386071763,
50.96641959745035,
28.86118440538931,
null,
47.686833800767765,
54.964081774475865,
60.96941505813863,
44.27659791450193,
28.530424870857342,
32.06876883377193,
null,
null,
null,
null,
46.579707042613045,
55.244824022683325,
51.26466794865067,
58.07206671286895,
null,
44.17474040638268,
43.16491363154527,
43.263888814685636,
43.01545408366647,
47.07421603909338,
50.02368236244343,
49.65440227183998,
41.49146241534145,
39.367740655697816,
42.375750704310896,
38.53785244348318,
39.5271935863245,
39.05504906819049
]
},
{
"marker": {
"color": "#FF9D00"
},
"mode": "markers",
"name": "V1 - Oct 2023 to Dec 2023",
"type": "scatter",
"x": [
34,
34,
34,
34,
6,
6,
6,
6,
6,
14,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
7,
3,
7,
7,
7,
13,
13,
7,
7,
2,
7,
7,
7,
7
],
"y": [
69.42302680552211,
63.95868835473126,
63.173385805389785,
65.31580196597741,
54.02206986885943,
54.08409847286497,
56.76083349753073,
56.692333561254124,
null,
63.80841685680014,
59.495598231916354,
59.08319544895303,
61.947425364774716,
61.58637451862594,
58.4569342704247,
59.90031882855004,
61.59483128758199,
61.544437738990545,
61.59009701817113,
68.28528460648407,
59.4171929373125,
59.6305845818351,
67.04979846164085,
67.12789429905199,
39.24771029913045,
null,
59.380403851463505,
59.272077233588966,
58.644617981010164,
61.98135744692983,
54.54868763409007,
null,
53.425863722230076,
62.91531136327541,
null,
61.45490056133619,
61.520319007341975
]
},
{
"marker": {
"color": "#FFD21E"
},
"mode": "markers",
"name": "V1 - Dec 2023 and after",
"type": "scatter",
"x": [
34,
34,
34,
6,
6,
6,
8,
8,
8,
8,
8,
8,
34,
8,
103,
34,
7,
7,
8,
8,
7,
65,
13,
8,
7,
7,
8,
8,
7,
7,
46,
46,
46,
10,
10,
10,
0,
0,
1,
1,
111,
111,
14,
14,
14,
3,
3,
7,
7,
7,
14,
14,
0,
0,
1,
1,
72,
7,
7,
8,
8,
8,
8,
1,
34,
72,
1,
1,
7,
46,
15,
7,
8,
8,
16,
16,
2,
8,
2,
2,
2,
8,
8,
8,
2,
2,
2,
8,
8,
6,
7,
8,
1,
70,
70,
8,
8,
8,
13,
3,
3,
7,
2,
2,
7,
7,
140,
7,
7,
7,
140,
46,
46,
46,
46,
7,
24,
8,
8,
4,
7,
7,
8,
8,
12,
12,
1,
1,
1,
11,
10,
10
],
"y": [
73.50461754455968,
60.70097659346695,
74.82376289192528,
61.56651964419019,
61.600186822769984,
66.16730292388148,
66.73139669787327,
55.21718433605241,
69.55569489674636,
66.978059635592,
63.16505555597532,
61.943561698500645,
null,
null,
74.61847256780356,
68.53563242970485,
61.55422124172559,
63.18561329425248,
61.47974712141572,
62.40634929999012,
69.83091004812034,
null,
48.208404569684866,
68.73192270625397,
67.35497164911531,
67.43127118086376,
69.2083869245962,
69.1681152594907,
68.0972249353676,
68.10382079751825,
73.12219560989375,
73.35333284905784,
72.07085598541813,
70.99611494328558,
null,
null,
38.61887256539875,
35.61159106465575,
46.55469286468076,
43.98954132886814,
null,
null,
66.69637898264064,
62.36628426209001,
62.2708188237114,
57.049366448825,
46.78568768990244,
61.756593209412564,
55.1547091552192,
55.1313193568431,
56.02722477794456,
57.22045479057058,
42.84986444317679,
null,
55.80148346984753,
null,
null,
68.4038966179358,
null,
51.67122869413651,
58.06414000695881,
null,
61.06304789634692,
null,
77.28564380843183,
80.48141483852343,
36.78193724145438,
36.72778183442903,
null,
72.96938623240759,
52.79219769707651,
42.95318061588656,
65.92164049813488,
65.83198774238207,
51.067658055604085,
null,
null,
60.09140737121073,
46.508533827664245,
46.36834706441763,
42.750104341098215,
63.753519741904825,
64.28649887507693,
53.56017516680193,
40.44283900884822,
45.46087707986595,
40.86040726469349,
59.83882000643251,
62.62392839776545,
null,
64.00461028850997,
null,
null,
73.95719822524191,
77.88205055819269,
62.623813022146,
62.35440585976382,
66.86966239077128,
73.44855327322567,
68.07092217507007,
69.90517093265896,
null,
61.32508370227487,
61.08764545284802,
60.41385031955034,
60.37214776353206,
null,
65.71304519417167,
null,
60.28212129096002,
74.4714175673166,
72.61571999808636,
72.70471388912233,
68.41996524230247,
68.47189963839766,
75.98830023246877,
75.65471846500263,
null,
63.99067496819134,
null,
69.30492308781548,
68.88728572928933,
63.62325427218718,
63.631348521524046,
63.48339124883886,
68.37627183501046,
45.254773923509426,
50.71050223243154,
49.99090871621005,
64.28123118509966,
74.20069762794799,
66.03783597882489
]
},
{
"marker": {
"color": "#32343D"
},
"mode": "markers",
"name": "V2",
"type": "scatter",
"x": [
34,
34,
34,
6,
6,
6,
8,
8,
8,
8,
34,
34,
34,
34,
6,
6,
6,
6,
6,
8,
8,
14,
34,
8,
103,
34,
7,
7,
6,
1,
2,
20,
12,
0,
2,
0,
6,
7,
7,
7,
7,
8,
8,
13,
7,
7,
7,
7,
7,
7,
7,
65,
13,
8,
7,
7,
8,
8,
7,
7,
46,
46,
46,
10,
6,
7,
7,
7,
7,
10,
10,
7,
12,
6,
0,
0,
1,
1,
111,
111,
14,
14,
14,
3,
3,
7,
7,
7,
14,
14,
0,
0,
1,
1,
72,
7,
7,
8,
8,
8,
8,
1,
13,
70,
34,
72,
1,
1,
7,
46,
7,
7,
15,
3,
7,
1,
1,
3,
0,
7,
8,
8,
6,
12,
3,
7,
7,
7,
7,
16,
16,
0,
1,
30,
2,
8,
2,
2,
2,
8,
8,
8,
2,
2,
2,
0,
0,
8,
8,
6,
7,
8,
1,
7,
7,
13,
13,
68,
68,
6,
6,
70,
70,
8,
8,
8,
0,
13,
13,
7,
13,
3,
3,
7,
1,
1,
2,
2,
7,
7,
140,
7,
7,
7,
7,
7,
140,
46,
46,
46,
46,
7,
24,
8,
8,
4,
7,
0,
0,
0,
1,
7,
7,
7,
13,
13,
8,
8,
12,
12,
1,
1,
1,
2,
2,
7,
13,
7,
7,
7,
7,
11,
40,
40,
7,
7,
7,
20,
7,
7,
7,
7,
7,
7,
3,
3,
3,
10,
10
],
"y": [
25.432496208391797,
26.40062187124731,
32.627882895328185,
16.473483734974884,
16.473483734974884,
22.048528999300473,
21.952491645101215,
19.608376416791778,
27.70559528169099,
22.896812289511804,
22.259833967577137,
19.79947735007302,
23.89937161554255,
23.89937161554255,
13.599029368558334,
13.599029368558334,
11.895393364291047,
11.895393364291047,
14.004356953877243,
17.61045749866248,
17.59108250111942,
16.530645612803767,
24.616939161944696,
15.973218797715552,
30.86054191171216,
25.34997846133653,
14.947949239530876,
17.43232787053986,
6.545235535293089,
5.328150264736912,
6.342930983263378,
5.990640984297092,
5.9339603247654615,
5.6171015655565055,
5.441653230243495,
5.113779260124896,
5.853253723382477,
18.52326653659808,
17.71670852646412,
17.71670852646412,
17.71670852646412,
15.827429986338219,
15.827429986338219,
3.9064248386004103,
17.943646116016044,
21.004986176864318,
21.004986176864318,
21.004986176864318,
21.004986176864318,
21.43364681812902,
19.99112025734428,
22.92692360397488,
8.299243034538407,
21.629391632905307,
21.639166938124102,
21.639166938124102,
24.62473094217658,
24.62473094217658,
21.01247015664927,
21.01247015664927,
26.945095621201215,
26.945095621201215,
21.77807030737311,
23.324426341826598,
9.278951588465933,
6.688919512253983,
7.122178211045956,
13.155462341901265,
13.427164944166408,
15.62055648816441,
15.061345512111492,
17.620946178257416,
3.6692423765809075,
5.392359658909203,
5.137017087672389,
5.564869039793773,
9.118435120286238,
9.006021162921042,
29.556738934879004,
29.224836684325613,
20.224674221574386,
21.02330687787111,
21.02330687787111,
11.289834319444326,
12.325165307166374,
15.219034679073014,
16.57617293158245,
16.57617293158245,
12.42275797734545,
14.823498043433531,
7.062282757592702,
6.385370764204122,
10.319571767384213,
13.91535071246266,
42.48630818371823,
23.66081168731019,
24.764482344118388,
8.778934275693588,
15.144990895303266,
12.013002201474537,
14.195345928021323,
4.698672676403544,
15.152356507248276,
22.321913398423053,
23.757346989413076,
29.555658111270684,
6.470278440392426,
6.470278440392426,
10.729523947697116,
24.133285606044453,
20.63795117818425,
20.63795117818425,
12.21390414514177,
6.536559509561811,
8.205321558755733,
3.962215291979836,
3.9712257798358466,
4.262012960471914,
3.4568911318914637,
3.7073934241241133,
18.302168187437868,
18.302168187437868,
6.893114892840058,
6.383023820314098,
5.448600841258123,
5.571831773906653,
8.101217266693423,
14.772804383415462,
14.772804383415462,
7.365628857118445,
10.139557822520734,
3.9015688926785734,
5.251513100569197,
6.201345407060512,
7.776435284352048,
17.40405754216496,
7.271607343217184,
7.271607343217184,
7.221453677142921,
15.279173051641893,
15.279173051641893,
12.840501007747184,
6.936700148776539,
6.936700148776539,
8.06417710652117,
6.5362026677856875,
6.5362026677856875,
18.11968845841446,
18.11968845841446,
7.745056411205006,
16.738445769074524,
8.58448433464234,
10.503331167244141,
10.784447380313544,
10.784447380313544,
11.00375415839273,
10.989657280367652,
12.733816621748955,
18.246717437525763,
9.39662427983811,
8.718240778815948,
26.36547101753479,
36.18340237700426,
13.412859085784765,
13.412859085784765,
23.908735693936837,
5.251433606790305,
18.136815704093205,
18.136815704093205,
14.152729107559473,
32.6696634675738,
25.4878179882604,
25.967732638041607,
28.749320369267338,
5.523965728106273,
7.057673794439714,
15.446174740490832,
15.446174740490832,
14.152421858603484,
14.152421858603484,
25.550232388991247,
13.57156168964474,
18.444951008649024,
19.11180572554635,
14.499830223176792,
14.166820006751633,
25.489173938868174,
24.351944449795166,
24.351944449795166,
19.233464672927635,
19.233464672927635,
17.592856841556475,
19.306153763784838,
22.043842751078802,
14.86751004330882,
15.389468073057905,
5.981676871872839,
6.510807087761722,
5.479590375205572,
5.8142234694303765,
4.980187627399172,
22.59157975164075,
22.564204090028692,
21.52253406020508,
13.807969316891429,
12.835458185676481,
22.728276100836258,
22.728276100836258,
13.860193776162257,
16.224300881101808,
5.216126538850885,
8.628186472564332,
9.256757722537667,
7.2632507075969786,
12.331442611850518,
14.23122081884668,
12.119323740918015,
21.327183071003265,
21.21648409288127,
21.21648409288127,
9.481131901453509,
13.776373885273868,
11.325775761393743,
10.408978081192403,
5.097916019413136,
5.015868974143408,
5.015868974143408,
4.938885587628826,
6.711834699417412,
8.170425368064842,
8.170425368064842,
5.461109359493478,
3.962783773521173,
6.330844324541082,
5.432973566930115,
4.748118992215374,
5.66393850876747,
19.628255331894646,
16.76685804216143
]
}
],
"layout": {
"height": 350,
"legend": {
"title": {
"text": "Version and Period"
}
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "#E5ECF6",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "white",
"linecolor": "white",
"minorgridcolor": "white",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "#E5ECF6",
"showlakes": true,
"showland": true,
"subunitcolor": "white"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "#E5ECF6",
"polar": {
"angularaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"radialaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"yaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
},
"zaxis": {
"backgroundcolor": "#E5ECF6",
"gridcolor": "white",
"gridwidth": 2,
"linecolor": "white",
"showbackground": true,
"ticks": "",
"zerolinecolor": "white"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"baxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
},
"bgcolor": "#E5ECF6",
"caxis": {
"gridcolor": "white",
"linecolor": "white",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "white",
"linecolor": "white",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "white",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Model Complexity vs Performance"
},
"width": 600,
"xaxis": {
"range": [
-10,
150
],
"title": {
"text": "Model Complexity (#Params in B)"
},
"type": "linear"
},
"yaxis": {
"range": [
-10,
100
],
"title": {
"text": "Mean Performance Score"
},
"type": "linear"
}
}
},
"text/html": [
""
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"# Scatter plot of mean score vs. model size: V1 split into three `date` ranges, V2 as one series\n",
"fig = go.Figure()\n",
"\n",
"# Build each date mask once so a trace's x and y are always selected with the same rows.\n",
"# NOTE(review): the period labels assume threshold_1 / threshold_2 fall in Oct 2023 / Dec 2023\n",
"# (both are defined in an earlier cell) - confirm if the thresholds change.\n",
"model_dates = merged_data['date']\n",
"v1_period_traces = [\n",
"    ('V1 - before Oct 2023', model_dates < threshold_1, DARK_ORANGE),\n",
"    ('V1 - Oct 2023 to Dec 2023', (threshold_1 <= model_dates) & (model_dates < threshold_2), ORANGE),\n",
"    ('V1 - Dec 2023 and after', model_dates >= threshold_2, YELLOW),\n",
"]\n",
"\n",
"# Version 1: one trace per date range, added in chronological order\n",
"for period_name, period_mask, period_color in v1_period_traces:\n",
"    fig.add_trace(go.Scatter(\n",
"        x=merged_data.loc[period_mask, '#Params (B)'],\n",
"        y=merged_data.loc[period_mask, 'mean_score_v1'],\n",
"        mode='markers',\n",
"        name=period_name,\n",
"        marker=dict(color=period_color)\n",
"    ))\n",
"\n",
"# Version 2: all rows, not split by date\n",
"fig.add_trace(go.Scatter(\n",
"    x=merged_data['#Params (B)'],\n",
"    y=merged_data['mean_score_v2'],\n",
"    mode='markers',\n",
"    name='V2',\n",
"    marker=dict(color=BLACK)\n",
"))\n",
"\n",
"# Titles, fixed axis ranges and figure size.\n",
"# Points outside the x/y ranges below are outside the initial view.\n",
"fig.update_layout(\n",
"    title=\"Model Complexity vs Performance\",\n",
"    xaxis_title=\"Model Complexity (#Params in B)\",\n",
"    yaxis_title=\"Mean Performance Score\",\n",
"    legend_title=\"Version and Period\",\n",
"    yaxis=dict(range=[-10, 100]),\n",
"    xaxis=dict(range=[-10, 150]),\n",
"    width=600,\n",
"    height=350\n",
")\n",
"\n",
"# Export the figure as an HTML fragment (full_html=False omits the <html> wrapper).\n",
"# The encoding is set explicitly so the write does not depend on the platform's default\n",
"# locale encoding, which may be unable to encode characters in the generated HTML.\n",
"with open(\"./plots/model_size_vs_perf.html\", \"w\", encoding=\"utf-8\") as f:\n",
"    f.write(fig.to_html(full_html=False))\n",
"\n",
"# Show the figure\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "62d02821-a296-4fba-996c-0934d414519b",
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"id": "12e276ba-5333-44a6-92b8-9c3f01e61eff",
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Leaderboard EDA",
"language": "python",
"name": "leaderboard_eda"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.12.1"
}
},
"nbformat": 4,
"nbformat_minor": 5
}