{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/Users/andrewreed/Documents/success_projects/closed-vs-open-arena-elo/.venv/lib/python3.10/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import os\n",
"import pickle\n",
"\n",
"import pandas as pd\n",
"from huggingface_hub import HfFileSystem, hf_hub_download"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Prepare data"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"from typing import Literal\n",
"\n",
"\n",
"def download_latest_data_from_space(\n",
"    repo_id: str, file_type: Literal[\"pkl\", \"csv\"]\n",
") -> str:\n",
"    \"\"\"\n",
"    Download the most recent data file of the given type from a Hugging Face Space.\n",
"\n",
"    Candidate files are the top-level ``*.{file_type}`` entries of the Space.\n",
"    They are ranked by the last ``_``-separated token of their base name,\n",
"    compared as strings (presumably a sortable date stamp - TODO confirm).\n",
"\n",
"    Args:\n",
"        repo_id (str): The ID of the repository space.\n",
"        file_type (Literal[\"pkl\", \"csv\"]): Extension of the data file to download.\n",
"\n",
"    Returns:\n",
"        str: The local file path of the downloaded data file.\n",
"\n",
"    Raises:\n",
"        FileNotFoundError: If the Space contains no matching file.\n",
"    \"\"\"\n",
"\n",
"    def extract_date(filename):\n",
"        # Last \"_\"-separated token of the base name, before the first \".\".\n",
"        return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
"\n",
"    fs = HfFileSystem()\n",
"    data_file_path = f\"spaces/{repo_id}/*.{file_type}\"\n",
"    files = fs.glob(data_file_path)\n",
"    if not files:\n",
"        # Fail with a clear message instead of an IndexError on an empty listing.\n",
"        raise FileNotFoundError(\n",
"            f\"No '*.{file_type}' files found in space '{repo_id}'\"\n",
"        )\n",
"    # max() selects the same file as sorting in descending order and taking [0].\n",
"    latest_file = max(files, key=extract_date)\n",
"\n",
"    latest_filepath_local = hf_hub_download(\n",
"        repo_id=repo_id,\n",
"        filename=latest_file.split(\"/\")[-1],\n",
"        repo_type=\"space\",\n",
"    )\n",
"    return latest_filepath_local"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"# Both files come from the same Space: the newest CSV (loaded later as the\n",
"# leaderboard table) and the newest pickle (loaded later as the ELO results).\n",
"ARENA_SPACE_REPO_ID = \"lmsys/chatbot-arena-leaderboard\"\n",
"\n",
"latest_leaderboard_file_local, latest_elo_file_local = (\n",
"    download_latest_data_from_space(repo_id=ARENA_SPACE_REPO_ID, file_type=extension)\n",
"    for extension in (\"csv\", \"pkl\")\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"# Load the pickled ELO results and index each category's leaderboard table\n",
"# by its display name.\n",
"key_to_category_name = {\n",
"    \"full\": \"Overall\",\n",
"    \"coding\": \"Coding\",\n",
"    \"long_user\": \"Longer Query\",\n",
"    \"english\": \"English\",\n",
"    \"chinese\": \"Chinese\",\n",
"    \"french\": \"French\",\n",
"    \"no_tie\": \"Exclude Ties\",\n",
"    \"no_short\": \"Exclude Short Query (< 5 tokens)\",\n",
"    \"no_refusal\": \"Exclude Refusal\",\n",
"}\n",
"# Longer, human-readable description of each category display name.\n",
"cat_name_to_explanation = {\n",
"    \"Overall\": \"Overall Questions\",\n",
"    \"Coding\": \"Coding: whether conversation contains code snippets\",\n",
"    \"Longer Query\": \"Longer Query (>= 500 tokens)\",\n",
"    \"English\": \"English Prompts\",\n",
"    \"Chinese\": \"Chinese Prompts\",\n",
"    \"French\": \"French Prompts\",\n",
"    \"Exclude Ties\": \"Exclude Ties and Bothbad\",\n",
"    \"Exclude Short Query (< 5 tokens)\": \"Exclude Short User Query (< 5 tokens)\",\n",
"    \"Exclude Refusal\": 'Exclude model responses with refusal (e.g., \"I cannot answer\")',\n",
"}\n",
"\n",
"# NOTE(review): pickle.load can execute arbitrary code while deserialising;\n",
"# only run this on files from a trusted source.\n",
"with open(latest_elo_file_local, \"rb\") as fin:\n",
"    elo_results = pickle.load(fin)\n",
"\n",
"# Categories that are absent from the results file are skipped.\n",
"arena_dfs = {\n",
"    category_name: elo_results[result_key][\"leaderboard_table_df\"]\n",
"    for result_key, category_name in key_to_category_name.items()\n",
"    if result_key in elo_results\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
]
},
"execution_count": 5,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arena_dfs.keys()"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"
\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rating | \n",
" variance | \n",
" rating_q975 | \n",
" rating_q025 | \n",
" num_battles | \n",
" final_ranking | \n",
"
\n",
" \n",
" \n",
" \n",
" RWKV-4-Raven-14B | \n",
" 928.451251 | \n",
" 26.146415 | \n",
" 937.017097 | \n",
" 919.444359 | \n",
" 5129 | \n",
" 82 | \n",
"
\n",
" \n",
" alpaca-13b | \n",
" 908.084359 | \n",
" 18.598539 | \n",
" 915.348707 | \n",
" 900.602847 | \n",
" 6111 | \n",
" 86 | \n",
"
\n",
" \n",
" bard-jan-24-gemini-pro | \n",
" 1208.712877 | \n",
" 7.975296 | \n",
" 1213.331583 | \n",
" 1203.004139 | \n",
" 12387 | \n",
" 6 | \n",
"
\n",
" \n",
" chatglm-6b | \n",
" 886.873429 | \n",
" 19.813751 | \n",
" 894.785321 | \n",
" 878.677878 | \n",
" 5195 | \n",
" 87 | \n",
"
\n",
" \n",
" chatglm2-6b | \n",
" 933.337288 | \n",
" 33.939472 | \n",
" 944.493496 | \n",
" 921.470740 | \n",
" 2880 | \n",
" 82 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" wizardlm-70b | \n",
" 1108.552744 | \n",
" 8.988005 | \n",
" 1114.390689 | \n",
" 1102.745236 | \n",
" 8867 | \n",
" 29 | \n",
"
\n",
" \n",
" yi-34b-chat | \n",
" 1111.132640 | \n",
" 7.801741 | \n",
" 1115.356993 | \n",
" 1105.658254 | \n",
" 13177 | \n",
" 29 | \n",
"
\n",
" \n",
" zephyr-7b-alpha | \n",
" 1043.084267 | \n",
" 45.472021 | \n",
" 1054.269954 | \n",
" 1027.602171 | \n",
" 1901 | \n",
" 57 | \n",
"
\n",
" \n",
" zephyr-7b-beta | \n",
" 1054.416300 | \n",
" 11.094606 | \n",
" 1060.265072 | \n",
" 1047.790509 | \n",
" 11924 | \n",
" 55 | \n",
"
\n",
" \n",
" zephyr-orpo-141b-A35b-v0.1 | \n",
" 1128.816337 | \n",
" 16.964385 | \n",
" 1134.862680 | \n",
" 1119.183571 | \n",
" 5207 | \n",
" 22 | \n",
"
\n",
" \n",
"
\n",
"
92 rows × 6 columns
\n",
"
"
],
"text/plain": [
" rating variance rating_q975 rating_q025 \\\n",
"RWKV-4-Raven-14B 928.451251 26.146415 937.017097 919.444359 \n",
"alpaca-13b 908.084359 18.598539 915.348707 900.602847 \n",
"bard-jan-24-gemini-pro 1208.712877 7.975296 1213.331583 1203.004139 \n",
"chatglm-6b 886.873429 19.813751 894.785321 878.677878 \n",
"chatglm2-6b 933.337288 33.939472 944.493496 921.470740 \n",
"... ... ... ... ... \n",
"wizardlm-70b 1108.552744 8.988005 1114.390689 1102.745236 \n",
"yi-34b-chat 1111.132640 7.801741 1115.356993 1105.658254 \n",
"zephyr-7b-alpha 1043.084267 45.472021 1054.269954 1027.602171 \n",
"zephyr-7b-beta 1054.416300 11.094606 1060.265072 1047.790509 \n",
"zephyr-orpo-141b-A35b-v0.1 1128.816337 16.964385 1134.862680 1119.183571 \n",
"\n",
" num_battles final_ranking \n",
"RWKV-4-Raven-14B 5129 82 \n",
"alpaca-13b 6111 86 \n",
"bard-jan-24-gemini-pro 12387 6 \n",
"chatglm-6b 5195 87 \n",
"chatglm2-6b 2880 82 \n",
"... ... ... \n",
"wizardlm-70b 8867 29 \n",
"yi-34b-chat 13177 29 \n",
"zephyr-7b-alpha 1901 57 \n",
"zephyr-7b-beta 11924 55 \n",
"zephyr-orpo-141b-A35b-v0.1 5207 22 \n",
"\n",
"[92 rows x 6 columns]"
]
},
"execution_count": 6,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arena_dfs[\"Overall\"]"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [],
"source": [
"# load and prepare Leaderboard data\n",
"leaderboard_df = pd.read_csv(latest_leaderboard_file_local)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" Model | \n",
" MT-bench (score) | \n",
" MMLU | \n",
" Knowledge cutoff date | \n",
" License | \n",
" Organization | \n",
" Link | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" wizardlm-30b | \n",
" WizardLM-30B | \n",
" 7.01 | \n",
" 0.587 | \n",
" 2023/6 | \n",
" Non-commercial | \n",
" Microsoft | \n",
" https://huggingface.co/WizardLM/WizardLM-30B-V1.0 | \n",
"
\n",
" \n",
" 1 | \n",
" vicuna-13b-16k | \n",
" Vicuna-13B-16k | \n",
" 6.92 | \n",
" 0.545 | \n",
" 2023/7 | \n",
" Llama 2 Community | \n",
" LMSYS | \n",
" https://huggingface.co/lmsys/vicuna-13b-v1.5-16k | \n",
"
\n",
" \n",
" 2 | \n",
" wizardlm-13b-v1.1 | \n",
" WizardLM-13B-v1.1 | \n",
" 6.76 | \n",
" 0.500 | \n",
" 2023/7 | \n",
" Non-commercial | \n",
" Microsoft | \n",
" https://huggingface.co/WizardLM/WizardLM-13B-V1.1 | \n",
"
\n",
" \n",
" 3 | \n",
" tulu-30b | \n",
" Tulu-30B | \n",
" 6.43 | \n",
" 0.581 | \n",
" 2023/6 | \n",
" Non-commercial | \n",
" AllenAI/UW | \n",
" https://huggingface.co/allenai/tulu-30b | \n",
"
\n",
" \n",
" 4 | \n",
" guanaco-65b | \n",
" Guanaco-65B | \n",
" 6.41 | \n",
" 0.621 | \n",
" 2023/5 | \n",
" Non-commercial | \n",
" UW | \n",
" https://huggingface.co/timdettmers/guanaco-65b... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 101 | \n",
" llama-3-70b-instruct | \n",
" Llama-3-70b-Instruct | \n",
" - | \n",
" 0.820 | \n",
" 2023/12 | \n",
" Llama 3 Community | \n",
" Meta | \n",
" https://llama.meta.com/llama3/ | \n",
"
\n",
" \n",
" 102 | \n",
" llama-3-8b-instruct | \n",
" Llama-3-8b-Instruct | \n",
" - | \n",
" 0.684 | \n",
" 2023/3 | \n",
" Llama 3 Community | \n",
" Meta | \n",
" https://llama.meta.com/llama3/ | \n",
"
\n",
" \n",
" 103 | \n",
" gemini-1.5-pro-api-0409-preview | \n",
" Gemini 1.5 Pro API-0409-Preview | \n",
" - | \n",
" 0.819 | \n",
" 2023/11 | \n",
" Proprietary | \n",
" Google | \n",
" https://blog.google/technology/ai/google-gemin... | \n",
"
\n",
" \n",
" 104 | \n",
" phi-3-mini-128k-instruct | \n",
" Phi-3-Mini-128k-Instruct | \n",
" - | \n",
" 0.681 | \n",
" 2023/10 | \n",
" MIT | \n",
" Microsoft | \n",
" https://azure.microsoft.com/en-us/blog/introdu... | \n",
"
\n",
" \n",
" 105 | \n",
" snowflake-arctic-instruct | \n",
" Snowflake Arctic Instruct | \n",
" - | \n",
" 0.673 | \n",
" 2024/4 | \n",
" Apache 2.0 | \n",
" Snowflake | \n",
" https://www.snowflake.com/blog/arctic-open-eff... | \n",
"
\n",
" \n",
"
\n",
"
106 rows × 8 columns
\n",
"
"
],
"text/plain": [
" key Model \\\n",
"0 wizardlm-30b WizardLM-30B \n",
"1 vicuna-13b-16k Vicuna-13B-16k \n",
"2 wizardlm-13b-v1.1 WizardLM-13B-v1.1 \n",
"3 tulu-30b Tulu-30B \n",
"4 guanaco-65b Guanaco-65B \n",
".. ... ... \n",
"101 llama-3-70b-instruct Llama-3-70b-Instruct \n",
"102 llama-3-8b-instruct Llama-3-8b-Instruct \n",
"103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
"104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n",
"105 snowflake-arctic-instruct Snowflake Arctic Instruct \n",
"\n",
" MT-bench (score) MMLU Knowledge cutoff date License \\\n",
"0 7.01 0.587 2023/6 Non-commercial \n",
"1 6.92 0.545 2023/7 Llama 2 Community \n",
"2 6.76 0.500 2023/7 Non-commercial \n",
"3 6.43 0.581 2023/6 Non-commercial \n",
"4 6.41 0.621 2023/5 Non-commercial \n",
".. ... ... ... ... \n",
"101 - 0.820 2023/12 Llama 3 Community \n",
"102 - 0.684 2023/3 Llama 3 Community \n",
"103 - 0.819 2023/11 Proprietary \n",
"104 - 0.681 2023/10 MIT \n",
"105 - 0.673 2024/4 Apache 2.0 \n",
"\n",
" Organization Link \n",
"0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n",
"1 LMSYS https://huggingface.co/lmsys/vicuna-13b-v1.5-16k \n",
"2 Microsoft https://huggingface.co/WizardLM/WizardLM-13B-V1.1 \n",
"3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n",
"4 UW https://huggingface.co/timdettmers/guanaco-65b... \n",
".. ... ... \n",
"101 Meta https://llama.meta.com/llama3/ \n",
"102 Meta https://llama.meta.com/llama3/ \n",
"103 Google https://blog.google/technology/ai/google-gemin... \n",
"104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... \n",
"105 Snowflake https://www.snowflake.com/blog/arctic-open-eff... \n",
"\n",
"[106 rows x 8 columns]"
]
},
"execution_count": 8,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"leaderboard_df"
]
},
{
"cell_type": "code",
"execution_count": 9,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
]
},
"execution_count": 9,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"arena_dfs.keys()"
]
},
{
"cell_type": "code",
"execution_count": 10,
"metadata": {},
"outputs": [],
"source": [
"# Merge ELO and Leaderboard data.\n",
"# Each category's rating table (matched on its index) is inner-joined with the\n",
"# leaderboard rows (matched on \"key\"), then ordered from highest to lowest rating.\n",
"merged_dfs = {\n",
"    category: (\n",
"        arena_df.merge(leaderboard_df, left_index=True, right_on=\"key\")\n",
"        .sort_values(\"rating\", ascending=False)\n",
"        .reset_index(drop=True)\n",
"    )\n",
"    for category, arena_df in arena_dfs.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 11,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rating | \n",
" variance | \n",
" rating_q975 | \n",
" rating_q025 | \n",
" num_battles | \n",
" final_ranking | \n",
" key | \n",
" Model | \n",
" MT-bench (score) | \n",
" MMLU | \n",
" Knowledge cutoff date | \n",
" License | \n",
" Organization | \n",
" Link | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1258.815279 | \n",
" 3.258132 | \n",
" 1262.796713 | \n",
" 1256.000508 | \n",
" 35931 | \n",
" 1 | \n",
" gpt-4-turbo-2024-04-09 | \n",
" GPT-4-Turbo-2024-04-09 | \n",
" - | \n",
" - | \n",
" 2023/12 | \n",
" Proprietary | \n",
" OpenAI | \n",
" https://platform.openai.com/docs/models/gpt-4-... | \n",
"
\n",
" \n",
" 1 | \n",
" 1252.684886 | \n",
" 1.799233 | \n",
" 1254.748391 | \n",
" 1249.873417 | \n",
" 73547 | \n",
" 2 | \n",
" gpt-4-1106-preview | \n",
" GPT-4-1106-preview | \n",
" 9.32 | \n",
" - | \n",
" 2023/4 | \n",
" Proprietary | \n",
" OpenAI | \n",
" https://openai.com/blog/new-models-and-develop... | \n",
"
\n",
" \n",
" 2 | \n",
" 1250.926206 | \n",
" 2.018201 | \n",
" 1253.851885 | \n",
" 1248.166034 | \n",
" 80997 | \n",
" 2 | \n",
" claude-3-opus-20240229 | \n",
" Claude 3 Opus | \n",
" - | \n",
" 0.868 | \n",
" 2023/8 | \n",
" Proprietary | \n",
" Anthropic | \n",
" https://www.anthropic.com/news/claude-3-family | \n",
"
\n",
" \n",
" 3 | \n",
" 1249.618395 | \n",
" 3.233129 | \n",
" 1252.956497 | \n",
" 1246.247080 | \n",
" 39482 | \n",
" 2 | \n",
" gemini-1.5-pro-api-0409-preview | \n",
" Gemini 1.5 Pro API-0409-Preview | \n",
" - | \n",
" 0.819 | \n",
" 2023/11 | \n",
" Proprietary | \n",
" Google | \n",
" https://blog.google/technology/ai/google-gemin... | \n",
"
\n",
" \n",
" 4 | \n",
" 1246.777591 | \n",
" 1.942477 | \n",
" 1249.979712 | \n",
" 1244.305362 | \n",
" 67354 | \n",
" 2 | \n",
" gpt-4-0125-preview | \n",
" GPT-4-0125-preview | \n",
" - | \n",
" - | \n",
" 2023/12 | \n",
" Proprietary | \n",
" OpenAI | \n",
" https://openai.com/blog/new-models-and-develop... | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 87 | \n",
" 886.873429 | \n",
" 19.813751 | \n",
" 894.785321 | \n",
" 878.677878 | \n",
" 5195 | \n",
" 87 | \n",
" chatglm-6b | \n",
" ChatGLM-6B | \n",
" 4.50 | \n",
" 0.361 | \n",
" 2023/3 | \n",
" Non-commercial | \n",
" Tsinghua | \n",
" https://huggingface.co/THUDM/chatglm-6b | \n",
"
\n",
" \n",
" 88 | \n",
" 876.929108 | \n",
" 27.115855 | \n",
" 887.355529 | \n",
" 866.860534 | \n",
" 4521 | \n",
" 88 | \n",
" fastchat-t5-3b | \n",
" FastChat-T5-3B | \n",
" 3.04 | \n",
" 0.477 | \n",
" 2023/4 | \n",
" Apache 2.0 | \n",
" LMSYS | \n",
" https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 | \n",
"
\n",
" \n",
" 89 | \n",
" 848.932568 | \n",
" 36.961459 | \n",
" 859.103936 | \n",
" 837.364341 | \n",
" 3461 | \n",
" 90 | \n",
" stablelm-tuned-alpha-7b | \n",
" StableLM-Tuned-Alpha-7B | \n",
" 2.75 | \n",
" 0.244 | \n",
" 2023/4 | \n",
" CC-BY-NC-SA-4.0 | \n",
" Stability AI | \n",
" https://huggingface.co/stabilityai/stablelm-tu... | \n",
"
\n",
" \n",
" 90 | \n",
" 826.647332 | \n",
" 30.156414 | \n",
" 837.335988 | \n",
" 816.370788 | \n",
" 3666 | \n",
" 91 | \n",
" dolly-v2-12b | \n",
" Dolly-V2-12B | \n",
" 3.28 | \n",
" 0.257 | \n",
" 2023/4 | \n",
" MIT | \n",
" Databricks | \n",
" https://huggingface.co/databricks/dolly-v2-12b | \n",
"
\n",
" \n",
" 91 | \n",
" 804.356329 | \n",
" 44.756983 | \n",
" 815.161492 | \n",
" 790.879536 | \n",
" 2538 | \n",
" 92 | \n",
" llama-13b | \n",
" LLaMA-13B | \n",
" 2.61 | \n",
" 0.470 | \n",
" 2023/2 | \n",
" Non-commercial | \n",
" Meta | \n",
" https://arxiv.org/abs/2302.13971 | \n",
"
\n",
" \n",
"
\n",
"
92 rows × 14 columns
\n",
"
"
],
"text/plain": [
" rating variance rating_q975 rating_q025 num_battles \\\n",
"0 1258.815279 3.258132 1262.796713 1256.000508 35931 \n",
"1 1252.684886 1.799233 1254.748391 1249.873417 73547 \n",
"2 1250.926206 2.018201 1253.851885 1248.166034 80997 \n",
"3 1249.618395 3.233129 1252.956497 1246.247080 39482 \n",
"4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
".. ... ... ... ... ... \n",
"87 886.873429 19.813751 894.785321 878.677878 5195 \n",
"88 876.929108 27.115855 887.355529 866.860534 4521 \n",
"89 848.932568 36.961459 859.103936 837.364341 3461 \n",
"90 826.647332 30.156414 837.335988 816.370788 3666 \n",
"91 804.356329 44.756983 815.161492 790.879536 2538 \n",
"\n",
" final_ranking key \\\n",
"0 1 gpt-4-turbo-2024-04-09 \n",
"1 2 gpt-4-1106-preview \n",
"2 2 claude-3-opus-20240229 \n",
"3 2 gemini-1.5-pro-api-0409-preview \n",
"4 2 gpt-4-0125-preview \n",
".. ... ... \n",
"87 87 chatglm-6b \n",
"88 88 fastchat-t5-3b \n",
"89 90 stablelm-tuned-alpha-7b \n",
"90 91 dolly-v2-12b \n",
"91 92 llama-13b \n",
"\n",
" Model MT-bench (score) MMLU \\\n",
"0 GPT-4-Turbo-2024-04-09 - - \n",
"1 GPT-4-1106-preview 9.32 - \n",
"2 Claude 3 Opus - 0.868 \n",
"3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n",
"4 GPT-4-0125-preview - - \n",
".. ... ... ... \n",
"87 ChatGLM-6B 4.50 0.361 \n",
"88 FastChat-T5-3B 3.04 0.477 \n",
"89 StableLM-Tuned-Alpha-7B 2.75 0.244 \n",
"90 Dolly-V2-12B 3.28 0.257 \n",
"91 LLaMA-13B 2.61 0.470 \n",
"\n",
" Knowledge cutoff date License Organization \\\n",
"0 2023/12 Proprietary OpenAI \n",
"1 2023/4 Proprietary OpenAI \n",
"2 2023/8 Proprietary Anthropic \n",
"3 2023/11 Proprietary Google \n",
"4 2023/12 Proprietary OpenAI \n",
".. ... ... ... \n",
"87 2023/3 Non-commercial Tsinghua \n",
"88 2023/4 Apache 2.0 LMSYS \n",
"89 2023/4 CC-BY-NC-SA-4.0 Stability AI \n",
"90 2023/4 MIT Databricks \n",
"91 2023/2 Non-commercial Meta \n",
"\n",
" Link \n",
"0 https://platform.openai.com/docs/models/gpt-4-... \n",
"1 https://openai.com/blog/new-models-and-develop... \n",
"2 https://www.anthropic.com/news/claude-3-family \n",
"3 https://blog.google/technology/ai/google-gemin... \n",
"4 https://openai.com/blog/new-models-and-develop... \n",
".. ... \n",
"87 https://huggingface.co/THUDM/chatglm-6b \n",
"88 https://huggingface.co/lmsys/fastchat-t5-3b-v1.0 \n",
"89 https://huggingface.co/stabilityai/stablelm-tu... \n",
"90 https://huggingface.co/databricks/dolly-v2-12b \n",
"91 https://arxiv.org/abs/2302.13971 \n",
"\n",
"[92 rows x 14 columns]"
]
},
"execution_count": 11,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_dfs[\"Overall\"]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Manually map release dates (not ideal: the mapping is maintained by hand in `release_date_mapping.json`)"
]
},
{
"cell_type": "code",
"execution_count": 12,
"metadata": {},
"outputs": [],
"source": [
"# Template with one row per model (key, display name) and an empty\n",
"# \"Release Date\" column - presumably the starting point for the manual mapping.\n",
"t = merged_dfs[\"Overall\"][[\"key\", \"Model\"]].assign(**{\"Release Date\": \"\"})"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"release_date_mapping = pd.read_json(\"release_date_mapping.json\", orient=\"records\")"
]
},
{
"cell_type": "code",
"execution_count": 14,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" Model | \n",
" Release Date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" gpt-4-turbo-2024-04-09 | \n",
" GPT-4-Turbo-2024-04-09 | \n",
" 2024-04-09 | \n",
"
\n",
" \n",
" 1 | \n",
" gpt-4-1106-preview | \n",
" GPT-4-1106-preview | \n",
" 2023-11-06 | \n",
"
\n",
" \n",
" 2 | \n",
" claude-3-opus-20240229 | \n",
" Claude 3 Opus | \n",
" 2024-02-29 | \n",
"
\n",
" \n",
" 3 | \n",
" gemini-1.5-pro-api-0409-preview | \n",
" Gemini 1.5 Pro API-0409-Preview | \n",
" 2024-04-09 | \n",
"
\n",
" \n",
" 4 | \n",
" gpt-4-0125-preview | \n",
" GPT-4-0125-preview | \n",
" 2024-01-25 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 86 | \n",
" chatglm-6b | \n",
" ChatGLM-6B | \n",
" 2023-03-13 | \n",
"
\n",
" \n",
" 87 | \n",
" fastchat-t5-3b | \n",
" FastChat-T5-3B | \n",
" 2023-04-27 | \n",
"
\n",
" \n",
" 88 | \n",
" stablelm-tuned-alpha-7b | \n",
" StableLM-Tuned-Alpha-7B | \n",
" 2023-04-19 | \n",
"
\n",
" \n",
" 89 | \n",
" dolly-v2-12b | \n",
" Dolly-V2-12B | \n",
" 2023-04-12 | \n",
"
\n",
" \n",
" 90 | \n",
" llama-13b | \n",
" LLaMA-13B | \n",
" 2023-02-27 | \n",
"
\n",
" \n",
"
\n",
"
91 rows × 3 columns
\n",
"
"
],
"text/plain": [
" key Model \\\n",
"0 gpt-4-turbo-2024-04-09 GPT-4-Turbo-2024-04-09 \n",
"1 gpt-4-1106-preview GPT-4-1106-preview \n",
"2 claude-3-opus-20240229 Claude 3 Opus \n",
"3 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n",
"4 gpt-4-0125-preview GPT-4-0125-preview \n",
".. ... ... \n",
"86 chatglm-6b ChatGLM-6B \n",
"87 fastchat-t5-3b FastChat-T5-3B \n",
"88 stablelm-tuned-alpha-7b StableLM-Tuned-Alpha-7B \n",
"89 dolly-v2-12b Dolly-V2-12B \n",
"90 llama-13b LLaMA-13B \n",
"\n",
" Release Date \n",
"0 2024-04-09 \n",
"1 2023-11-06 \n",
"2 2024-02-29 \n",
"3 2024-04-09 \n",
"4 2024-01-25 \n",
".. ... \n",
"86 2023-03-13 \n",
"87 2023-04-27 \n",
"88 2023-04-19 \n",
"89 2023-04-12 \n",
"90 2023-02-27 \n",
"\n",
"[91 rows x 3 columns]"
]
},
"execution_count": 14,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"release_date_mapping"
]
},
{
"cell_type": "code",
"execution_count": 15,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" key | \n",
" Release Date | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" gpt-4-turbo-2024-04-09 | \n",
" 2024-04-09 | \n",
"
\n",
" \n",
" 1 | \n",
" gpt-4-1106-preview | \n",
" 2023-11-06 | \n",
"
\n",
" \n",
" 2 | \n",
" claude-3-opus-20240229 | \n",
" 2024-02-29 | \n",
"
\n",
" \n",
" 3 | \n",
" gemini-1.5-pro-api-0409-preview | \n",
" 2024-04-09 | \n",
"
\n",
" \n",
" 4 | \n",
" gpt-4-0125-preview | \n",
" 2024-01-25 | \n",
"
\n",
" \n",
" ... | \n",
" ... | \n",
" ... | \n",
"
\n",
" \n",
" 86 | \n",
" chatglm-6b | \n",
" 2023-03-13 | \n",
"
\n",
" \n",
" 87 | \n",
" fastchat-t5-3b | \n",
" 2023-04-27 | \n",
"
\n",
" \n",
" 88 | \n",
" stablelm-tuned-alpha-7b | \n",
" 2023-04-19 | \n",
"
\n",
" \n",
" 89 | \n",
" dolly-v2-12b | \n",
" 2023-04-12 | \n",
"
\n",
" \n",
" 90 | \n",
" llama-13b | \n",
" 2023-02-27 | \n",
"
\n",
" \n",
"
\n",
"
91 rows × 2 columns
\n",
"
"
],
"text/plain": [
" key Release Date\n",
"0 gpt-4-turbo-2024-04-09 2024-04-09\n",
"1 gpt-4-1106-preview 2023-11-06\n",
"2 claude-3-opus-20240229 2024-02-29\n",
"3 gemini-1.5-pro-api-0409-preview 2024-04-09\n",
"4 gpt-4-0125-preview 2024-01-25\n",
".. ... ...\n",
"86 chatglm-6b 2023-03-13\n",
"87 fastchat-t5-3b 2023-04-27\n",
"88 stablelm-tuned-alpha-7b 2023-04-19\n",
"89 dolly-v2-12b 2023-04-12\n",
"90 llama-13b 2023-02-27\n",
"\n",
"[91 rows x 2 columns]"
]
},
"execution_count": 15,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"release_date_mapping[[\"key\", \"Release Date\"]]"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [],
"source": [
"# Add release dates into the merged data.\n",
"# Any existing \"Release Date\" column is dropped first, so re-running this cell\n",
"# does not create suffixed duplicates (\"Release Date_x\" / \"Release Date_y\").\n",
"# The merge is an inner join: models whose key is missing from\n",
"# release_date_mapping are dropped from the result.\n",
"release_dates = release_date_mapping[[\"key\", \"Release Date\"]]\n",
"for category, merged_df in merged_dfs.items():\n",
"    merged_dfs[category] = pd.merge(\n",
"        merged_df.drop(columns=\"Release Date\", errors=\"ignore\"),\n",
"        release_dates,\n",
"        on=\"key\",\n",
"    )"
]
},
{
"cell_type": "code",
"execution_count": 17,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
" 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
" 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
" 'Release Date'],\n",
" dtype='object')"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"merged_dfs[\"Overall\"].columns"
]
},
{
"cell_type": "code",
"execution_count": 77,
"metadata": {},
"outputs": [],
"source": [
"# Licenses counted as proprietary when splitting models into two groups.\n",
"# Defined in this cell so it also runs on a fresh kernel (Restart & Run All).\n",
"PROPRIETARY_LICENSES = [\n",
"    \"Proprietary\",\n",
"    \"Non-commercial\",\n",
"]\n",
"\n",
"\n",
"def format_data(df):\n",
"    \"\"\"\n",
"    Return a formatted copy of a merged leaderboard frame.\n",
"\n",
"    The input frame is left untouched. In the returned copy:\n",
"      - \"License\" is collapsed to \"Proprietary LLM\" / \"Open LLM\"; values that\n",
"        already have one of these labels are kept, so applying the function\n",
"        twice gives the same result as applying it once,\n",
"      - \"Release Date\" is parsed with pd.to_datetime,\n",
"      - \"Month-Year\" holds the monthly period of \"Release Date\",\n",
"      - \"rating\" is rounded with Series.round().\n",
"\n",
"    Args:\n",
"        df (pd.DataFrame): Frame with \"License\", \"Release Date\" and \"rating\" columns.\n",
"\n",
"    Returns:\n",
"        pd.DataFrame: The formatted copy, with a fresh 0..n-1 index.\n",
"    \"\"\"\n",
"    license_groups = (\"Proprietary LLM\", \"Open LLM\")\n",
"\n",
"    def to_license_group(license_name):\n",
"        # Already-binarised labels pass through unchanged.\n",
"        if license_name in license_groups:\n",
"            return license_name\n",
"        return \"Proprietary LLM\" if license_name in PROPRIETARY_LICENSES else \"Open LLM\"\n",
"\n",
"    # Work on a copy so the caller's frame is not modified.\n",
"    formatted = df.copy()\n",
"    formatted[\"License\"] = formatted[\"License\"].apply(to_license_group)\n",
"    formatted[\"Release Date\"] = pd.to_datetime(formatted[\"Release Date\"])\n",
"    formatted[\"Month-Year\"] = formatted[\"Release Date\"].dt.to_period(\"M\")\n",
"    formatted[\"rating\"] = formatted[\"rating\"].round()\n",
"    return formatted.reset_index(drop=True)\n",
"\n",
"\n",
"# One formatted frame per category, keyed like merged_dfs.\n",
"merged_dfs2 = {\n",
"    category: format_data(frame) for category, frame in merged_dfs.items()\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 81,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"5\n",
"5\n",
"5\n",
"5\n",
"5\n",
"5\n",
"5\n",
"5\n",
"5\n"
]
}
],
"source": [
"# For each category, print the largest number of models that share the same\n",
"# (\"Release Date\", \"License\") pair, followed by a blank line.\n",
"# NOTE(review): this groups by the exact \"Release Date\"; if a per-month count is\n",
"# intended (cf. \"upper_models_per_month\"), group by \"Month-Year\" instead - confirm.\n",
"for k, df in merged_dfs2.items():\n",
"    print(int(df.groupby([\"Release Date\", \"License\"]).size().max()))\n",
"    print()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Build plot"
]
},
{
"cell_type": "code",
"execution_count": 76,
"metadata": {},
"outputs": [],
"source": [
"# Hard-coded per-category settings: lowest / highest ELO score and the upper\n",
"# bound on models per month (5 for every category).\n",
"t = {\n",
"    category: {\n",
"        \"min_elo_score\": lowest,\n",
"        \"max_elo_score\": highest,\n",
"        \"upper_models_per_month\": 5,\n",
"    }\n",
"    for category, lowest, highest in [\n",
"        (\"Overall\", 804.0, 1259.0),\n",
"        (\"Coding\", 672.0, 1270.0),\n",
"        (\"Longer Query\", 796.0, 1273.0),\n",
"        (\"English\", 783.0, 1246.0),\n",
"        (\"Chinese\", 753.0, 1325.0),\n",
"        (\"French\", 694.0, 1268.0),\n",
"        (\"Exclude Ties\", 654.0, 1334.0),\n",
"        (\"Exclude Short Query (< 5 tokens)\", 796.0, 1264.0),\n",
"        (\"Exclude Refusal\", 795.0, 1264.0),\n",
"    ]\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Template for a single category's settings.\n",
"# TODO: fill in real values; None placeholders keep the cell valid Python so\n",
"# the notebook can be run top to bottom.\n",
"o = {\n",
"    \"min_elo_score\": None,\n",
"    \"max_elo_score\": None,\n",
"    \"upper_models_per_month\": None,\n",
"}"
]
},
{
"cell_type": "code",
"execution_count": 49,
"metadata": {},
"outputs": [],
"source": [
"# Licenses counted as proprietary when splitting models into two groups.\n",
"PROPRIETARY_LICENSES = [\n",
"    \"Proprietary\",\n",
"    \"Non-commercial\",\n",
"]\n",
"\n",
"\n",
"def _license_group(license_name):\n",
"    \"\"\"Map a license name to \"Proprietary LLM\" or \"Open LLM\".\n",
"\n",
"    Labels that are already one of the two group names are returned unchanged,\n",
"    so re-running this cell does not turn every model into \"Open LLM\".\n",
"    \"\"\"\n",
"    if license_name in (\"Proprietary LLM\", \"Open LLM\"):\n",
"        return license_name\n",
"    return \"Proprietary LLM\" if license_name in PROPRIETARY_LICENSES else \"Open LLM\"\n",
"\n",
"\n",
"# NOTE: df is the same object as merged_dfs[\"Overall\"], so the assignments\n",
"# below also modify that frame.\n",
"df = merged_dfs[\"Overall\"]\n",
"df[\"License\"] = df[\"License\"].apply(_license_group)\n",
"df[\"Release Date\"] = pd.to_datetime(df[\"Release Date\"])"
]
},
{
"cell_type": "code",
"execution_count": 57,
"metadata": {},
"outputs": [],
"source": [
"df[\"Month-Year\"] = df[\"Release Date\"].dt.to_period(\"M\")"
]
},
{
"cell_type": "code",
"execution_count": 66,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"8"
]
},
"execution_count": 66,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.groupby([\"Month-Year\", \"License\"])[\"rating\"].apply(lambda x: x.count()).max()"
]
},
{
"cell_type": "code",
"execution_count": 69,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rating | \n",
" variance | \n",
" rating_q975 | \n",
" rating_q025 | \n",
" num_battles | \n",
" final_ranking | \n",
" key | \n",
" Model | \n",
" MT-bench (score) | \n",
" MMLU | \n",
" Knowledge cutoff date | \n",
" License | \n",
" Organization | \n",
" Link | \n",
" Release Date | \n",
" license_binary | \n",
" Month-Year | \n",
"
\n",
" \n",
" \n",
" \n",
" 4 | \n",
" 1246.777591 | \n",
" 1.942477 | \n",
" 1249.979712 | \n",
" 1244.305362 | \n",
" 67354 | \n",
" 2 | \n",
" gpt-4-0125-preview | \n",
" GPT-4-0125-preview | \n",
" - | \n",
" - | \n",
" 2023/12 | \n",
" Proprietary LLM | \n",
" OpenAI | \n",
" https://openai.com/blog/new-models-and-develop... | \n",
" 2024-01-25 | \n",
" Proprietary LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 32 | \n",
" 1111.132640 | \n",
" 7.801741 | \n",
" 1115.356993 | \n",
" 1105.658254 | \n",
" 13177 | \n",
" 29 | \n",
" yi-34b-chat | \n",
" Yi-34B-Chat | \n",
" - | \n",
" 0.735 | \n",
" 2023/6 | \n",
" Open LLM | \n",
" 01 AI | \n",
" https://huggingface.co/01-ai/Yi-34B-Chat | \n",
" 2024-01-23 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 36 | \n",
" 1107.129810 | \n",
" 2.419182 | \n",
" 1110.056188 | \n",
" 1104.002581 | \n",
" 47220 | \n",
" 32 | \n",
" gpt-3.5-turbo-0125 | \n",
" GPT-3.5-Turbo-0125 | \n",
" - | \n",
" - | \n",
" 2021/9 | \n",
" Proprietary LLM | \n",
" OpenAI | \n",
" https://platform.openai.com/docs/models/gpt-3-... | \n",
" 2024-01-25 | \n",
" Proprietary LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 39 | \n",
" 1098.527455 | \n",
" 6.400166 | \n",
" 1103.343592 | \n",
" 1093.903695 | \n",
" 14159 | \n",
" 36 | \n",
" openchat-3.5-0106 | \n",
" OpenChat-3.5-0106 | \n",
" 7.8 | \n",
" 0.658 | \n",
" 2024/1 | \n",
" Open LLM | \n",
" OpenChat | \n",
" https://huggingface.co/openchat/openchat-3.5-0106 | \n",
" 2024-01-06 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 43 | \n",
" 1087.307758 | \n",
" 18.314258 | \n",
" 1094.532598 | \n",
" 1078.413814 | \n",
" 3980 | \n",
" 40 | \n",
" nous-hermes-2-mixtral-8x7b-dpo | \n",
" Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
" - | \n",
" - | \n",
" 2024/1 | \n",
" Open LLM | \n",
" NousResearch | \n",
" https://huggingface.co/NousResearch/Nous-Herme... | \n",
" 2024-01-13 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 60 | \n",
" 1047.927688 | \n",
" 60.707225 | \n",
" 1061.952116 | \n",
" 1034.283514 | \n",
" 1321 | \n",
" 55 | \n",
" codellama-70b-instruct | \n",
" CodeLlama-70B-instruct | \n",
" - | \n",
" - | \n",
" 2024/1 | \n",
" Open LLM | \n",
" Meta | \n",
" https://huggingface.co/codellama/CodeLlama-70b-hf | \n",
" 2024-01-29 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rating variance rating_q975 rating_q025 num_battles \\\n",
"4 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
"32 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
"36 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
"39 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
"43 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
"60 1047.927688 60.707225 1061.952116 1034.283514 1321 \n",
"\n",
" final_ranking key \\\n",
"4 2 gpt-4-0125-preview \n",
"32 29 yi-34b-chat \n",
"36 32 gpt-3.5-turbo-0125 \n",
"39 36 openchat-3.5-0106 \n",
"43 40 nous-hermes-2-mixtral-8x7b-dpo \n",
"60 55 codellama-70b-instruct \n",
"\n",
" Model MT-bench (score) MMLU \\\n",
"4 GPT-4-0125-preview - - \n",
"32 Yi-34B-Chat - 0.735 \n",
"36 GPT-3.5-Turbo-0125 - - \n",
"39 OpenChat-3.5-0106 7.8 0.658 \n",
"43 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
"60 CodeLlama-70B-instruct - - \n",
"\n",
" Knowledge cutoff date License Organization \\\n",
"4 2023/12 Proprietary LLM OpenAI \n",
"32 2023/6 Open LLM 01 AI \n",
"36 2021/9 Proprietary LLM OpenAI \n",
"39 2024/1 Open LLM OpenChat \n",
"43 2024/1 Open LLM NousResearch \n",
"60 2024/1 Open LLM Meta \n",
"\n",
" Link Release Date \\\n",
"4 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
"32 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
"36 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
"39 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
"43 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
"60 https://huggingface.co/codellama/CodeLlama-70b-hf 2024-01-29 \n",
"\n",
" license_binary Month-Year \n",
"4 Proprietary LLM 2024-01 \n",
"32 Open LLM 2024-01 \n",
"36 Proprietary LLM 2024-01 \n",
"39 Open LLM 2024-01 \n",
"43 Open LLM 2024-01 \n",
"60 Open LLM 2024-01 "
]
},
"execution_count": 69,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Rows of df whose \"Month-Year\" value is 2024-01.\n",
"df.loc[df[\"Month-Year\"].eq(\"2024-01\")]"
]
},
{
"cell_type": "code",
"execution_count": 70,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/var/folders/w0/6t9rxkj97rv47l9sc0q22yth0000gn/T/ipykernel_7726/1725500526.py:1: DeprecationWarning:\n",
"\n",
"DataFrameGroupBy.apply operated on the grouping columns. This behavior is deprecated, and in a future version of pandas the grouping columns will be excluded from the operation. Either pass `include_groups=False` to exclude the groupings or explicitly select the grouping columns after groupby to silence this warning.\n",
"\n"
]
},
{
"data": {
"text/html": [
"\n",
"\n",
"
\n",
" \n",
" \n",
" | \n",
" rating | \n",
" variance | \n",
" rating_q975 | \n",
" rating_q025 | \n",
" num_battles | \n",
" final_ranking | \n",
" key | \n",
" Model | \n",
" MT-bench (score) | \n",
" MMLU | \n",
" Knowledge cutoff date | \n",
" License | \n",
" Organization | \n",
" Link | \n",
" Release Date | \n",
" license_binary | \n",
" Month-Year | \n",
"
\n",
" \n",
" \n",
" \n",
" 0 | \n",
" 1111.132640 | \n",
" 7.801741 | \n",
" 1115.356993 | \n",
" 1105.658254 | \n",
" 13177 | \n",
" 29 | \n",
" yi-34b-chat | \n",
" Yi-34B-Chat | \n",
" - | \n",
" 0.735 | \n",
" 2023/6 | \n",
" Open LLM | \n",
" 01 AI | \n",
" https://huggingface.co/01-ai/Yi-34B-Chat | \n",
" 2024-01-23 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 1 | \n",
" 1098.527455 | \n",
" 6.400166 | \n",
" 1103.343592 | \n",
" 1093.903695 | \n",
" 14159 | \n",
" 36 | \n",
" openchat-3.5-0106 | \n",
" OpenChat-3.5-0106 | \n",
" 7.8 | \n",
" 0.658 | \n",
" 2024/1 | \n",
" Open LLM | \n",
" OpenChat | \n",
" https://huggingface.co/openchat/openchat-3.5-0106 | \n",
" 2024-01-06 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 2 | \n",
" 1087.307758 | \n",
" 18.314258 | \n",
" 1094.532598 | \n",
" 1078.413814 | \n",
" 3980 | \n",
" 40 | \n",
" nous-hermes-2-mixtral-8x7b-dpo | \n",
" Nous-Hermes-2-Mixtral-8x7B-DPO | \n",
" - | \n",
" - | \n",
" 2024/1 | \n",
" Open LLM | \n",
" NousResearch | \n",
" https://huggingface.co/NousResearch/Nous-Herme... | \n",
" 2024-01-13 | \n",
" Open LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 3 | \n",
" 1246.777591 | \n",
" 1.942477 | \n",
" 1249.979712 | \n",
" 1244.305362 | \n",
" 67354 | \n",
" 2 | \n",
" gpt-4-0125-preview | \n",
" GPT-4-0125-preview | \n",
" - | \n",
" - | \n",
" 2023/12 | \n",
" Proprietary LLM | \n",
" OpenAI | \n",
" https://openai.com/blog/new-models-and-develop... | \n",
" 2024-01-25 | \n",
" Proprietary LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
" 4 | \n",
" 1107.129810 | \n",
" 2.419182 | \n",
" 1110.056188 | \n",
" 1104.002581 | \n",
" 47220 | \n",
" 32 | \n",
" gpt-3.5-turbo-0125 | \n",
" GPT-3.5-Turbo-0125 | \n",
" - | \n",
" - | \n",
" 2021/9 | \n",
" Proprietary LLM | \n",
" OpenAI | \n",
" https://platform.openai.com/docs/models/gpt-3-... | \n",
" 2024-01-25 | \n",
" Proprietary LLM | \n",
" 2024-01 | \n",
"
\n",
" \n",
"
\n",
"
"
],
"text/plain": [
" rating variance rating_q975 rating_q025 num_battles \\\n",
"0 1111.132640 7.801741 1115.356993 1105.658254 13177 \n",
"1 1098.527455 6.400166 1103.343592 1093.903695 14159 \n",
"2 1087.307758 18.314258 1094.532598 1078.413814 3980 \n",
"3 1246.777591 1.942477 1249.979712 1244.305362 67354 \n",
"4 1107.129810 2.419182 1110.056188 1104.002581 47220 \n",
"\n",
" final_ranking key \\\n",
"0 29 yi-34b-chat \n",
"1 36 openchat-3.5-0106 \n",
"2 40 nous-hermes-2-mixtral-8x7b-dpo \n",
"3 2 gpt-4-0125-preview \n",
"4 32 gpt-3.5-turbo-0125 \n",
"\n",
" Model MT-bench (score) MMLU \\\n",
"0 Yi-34B-Chat - 0.735 \n",
"1 OpenChat-3.5-0106 7.8 0.658 \n",
"2 Nous-Hermes-2-Mixtral-8x7B-DPO - - \n",
"3 GPT-4-0125-preview - - \n",
"4 GPT-3.5-Turbo-0125 - - \n",
"\n",
" Knowledge cutoff date License Organization \\\n",
"0 2023/6 Open LLM 01 AI \n",
"1 2024/1 Open LLM OpenChat \n",
"2 2024/1 Open LLM NousResearch \n",
"3 2023/12 Proprietary LLM OpenAI \n",
"4 2021/9 Proprietary LLM OpenAI \n",
"\n",
" Link Release Date \\\n",
"0 https://huggingface.co/01-ai/Yi-34B-Chat 2024-01-23 \n",
"1 https://huggingface.co/openchat/openchat-3.5-0106 2024-01-06 \n",
"2 https://huggingface.co/NousResearch/Nous-Herme... 2024-01-13 \n",
"3 https://openai.com/blog/new-models-and-develop... 2024-01-25 \n",
"4 https://platform.openai.com/docs/models/gpt-3-... 2024-01-25 \n",
"\n",
" license_binary Month-Year \n",
"0 Open LLM 2024-01 \n",
"1 Open LLM 2024-01 \n",
"2 Open LLM 2024-01 \n",
"3 Proprietary LLM 2024-01 \n",
"4 Proprietary LLM 2024-01 "
]
},
"execution_count": 70,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Top three models by rating within each (Month-Year, License) group for 2024-01.\n",
"# Sorting by the group keys and then by descending rating, followed by\n",
"# groupby(...).head(3), yields the same rows in the same order as\n",
"# groupby(...).apply(nlargest) but never runs apply on the grouping columns,\n",
"# so pandas no longer emits the DataFrameGroupBy.apply DeprecationWarning.\n",
"(\n",
"    df[df[\"Month-Year\"] == \"2024-01\"]\n",
"    .sort_values(\n",
"        [\"Month-Year\", \"License\", \"rating\"], ascending=[True, True, False]\n",
"    )\n",
"    .groupby([\"Month-Year\", \"License\"])\n",
"    .head(3)\n",
"    .reset_index(drop=True)\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 51,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"Index(['rating', 'variance', 'rating_q975', 'rating_q025', 'num_battles',\n",
" 'final_ranking', 'key', 'Model', 'MT-bench (score)', 'MMLU',\n",
" 'Knowledge cutoff date', 'License', 'Organization', 'Link',\n",
" 'Release Date', 'license_binary'],\n",
" dtype='object')"
]
},
"execution_count": 51,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Column labels currently present on df.\n",
"df.columns"
]
},
{
"cell_type": "code",
"execution_count": 56,
"metadata": {},
"outputs": [
{
"data": {
"application/vnd.plotly.v1+json": {
"config": {
"plotlyServerURL": "https://plot.ly"
},
"data": [
{
"customdata": [
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Google",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Google",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Mistral",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Reka AI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Mistral",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Reka AI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Google",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Mistral",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Google",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Anthropic",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"LMSYS",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Perplexity AI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"OpenAI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Perplexity AI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"UW",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Google",
"Proprietary LLM",
"Proprietary LLM"
],
[
"UC Berkeley",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Nomic AI",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Stanford",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Tsinghua",
"Proprietary LLM",
"Proprietary LLM"
],
[
"Meta",
"Proprietary LLM",
"Proprietary LLM"
]
],
"hovertemplate": "%{hovertext}
license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}",
"hovertext": [
"GPT-4-Turbo-2024-04-09",
"GPT-4-1106-preview",
"Claude 3 Opus",
"Gemini 1.5 Pro API-0409-Preview",
"GPT-4-0125-preview",
"Bard (Gemini Pro)",
"Claude 3 Sonnet",
"GPT-4-0314",
"Claude 3 Haiku",
"GPT-4-0613",
"Mistral-Large-2402",
"Reka-Flash-21B-online",
"Claude-1",
"Mistral Medium",
"Reka-Flash-21B",
"Gemini Pro (Dev API)",
"Claude-2.0",
"Mistral-Next",
"GPT-3.5-Turbo-0613",
"Claude-2.1",
"Gemini Pro",
"Claude-Instant-1",
"GPT-3.5-Turbo-0314",
"GPT-3.5-Turbo-0125",
"Vicuna-33B",
"pplx-70b-online",
"GPT-3.5-Turbo-1106",
"pplx-7b-online",
"Guanaco-33B",
"PaLM-Chat-Bison-001",
"Koala-13B",
"GPT4All-13B-Snoozy",
"Alpaca-13B",
"ChatGLM-6B",
"LLaMA-13B"
],
"legendgroup": "Proprietary LLM",
"marker": {
"color": "#636efa",
"size": 8,
"symbol": "circle"
},
"mode": "markers",
"name": "Proprietary LLM",
"orientation": "v",
"showlegend": true,
"type": "scatter",
"x": [
"2024-04-09T00:00:00",
"2023-11-06T00:00:00",
"2024-02-29T00:00:00",
"2024-04-09T00:00:00",
"2024-01-25T00:00:00",
"2024-02-01T00:00:00",
"2024-02-29T00:00:00",
"2024-03-14T00:00:00",
"2024-03-07T00:00:00",
"2023-06-13T00:00:00",
"2024-02-24T00:00:00",
"2024-02-26T00:00:00",
"2023-03-14T00:00:00",
"2023-12-11T00:00:00",
"2024-02-26T00:00:00",
"2023-12-13T00:00:00",
"2023-07-11T00:00:00",
"2024-02-17T00:00:00",
"2023-06-13T00:00:00",
"2023-11-21T00:00:00",
"2023-12-13T00:00:00",
"2023-03-14T00:00:00",
"2024-03-14T00:00:00",
"2024-01-25T00:00:00",
"2023-06-21T00:00:00",
"2023-11-29T00:00:00",
"2023-11-06T00:00:00",
"2023-11-29T00:00:00",
"2023-05-22T00:00:00",
"2023-07-10T00:00:00",
"2023-04-03T00:00:00",
"2023-04-24T00:00:00",
"2023-03-13T00:00:00",
"2023-03-13T00:00:00",
"2023-02-27T00:00:00"
],
"xaxis": "x",
"y": [
1258.8152791324715,
1252.6848856241577,
1250.9262064295565,
1249.6183945401244,
1246.7775913509702,
1208.7128773784577,
1201.2654981955752,
1189.557977031121,
1180.8870022256567,
1165.279013874706,
1157.2129636222178,
1153.368015144387,
1150.6246111849628,
1148.003325470259,
1147.136619289767,
1135.7254379948201,
1132.3083987521873,
1126.6887059695398,
1119.8996424050451,
1119.0708879096221,
1115.3213731540973,
1110.3806845414053,
1108.9125926100855,
1107.1298100300314,
1093.8870113925889,
1075.4285458870645,
1072.711340370162,
1043.3909111518306,
1034.3952377983876,
1009.7116452193085,
969.48148016344,
938.8924300511185,
908.0843590844727,
886.8734292498528,
804.3563285706291
],
"yaxis": "y"
},
{
"customdata": [
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"Cohere",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"Cohere",
"Open LLM",
"Open LLM"
],
[
"Mistral",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"HuggingFace",
"Open LLM",
"Open LLM"
],
[
"Nexusflow",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"Mistral",
"Open LLM",
"Open LLM"
],
[
"01 AI",
"Open LLM",
"Open LLM"
],
[
"Microsoft",
"Open LLM",
"Open LLM"
],
[
"Databricks",
"Open LLM",
"Open LLM"
],
[
"AllenAI/UW",
"Open LLM",
"Open LLM"
],
[
"OpenChat",
"Open LLM",
"Open LLM"
],
[
"UC Berkeley",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"NousResearch",
"Open LLM",
"Open LLM"
],
[
"Google",
"Open LLM",
"Open LLM"
],
[
"Nvidia",
"Open LLM",
"Open LLM"
],
[
"DeepSeek AI",
"Open LLM",
"Open LLM"
],
[
"OpenChat",
"Open LLM",
"Open LLM"
],
[
"NousResearch",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"Mistral",
"Open LLM",
"Open LLM"
],
[
"Cognitive Computations",
"Open LLM",
"Open LLM"
],
[
"Upstage AI",
"Open LLM",
"Open LLM"
],
[
"Microsoft",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"HuggingFace",
"Open LLM",
"Open LLM"
],
[
"Microsoft",
"Open LLM",
"Open LLM"
],
[
"LMSYS",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"MosaicML",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"Google",
"Open LLM",
"Open LLM"
],
[
"HuggingFace",
"Open LLM",
"Open LLM"
],
[
"Meta",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"TII",
"Open LLM",
"Open LLM"
],
[
"Together AI",
"Open LLM",
"Open LLM"
],
[
"Allen AI",
"Open LLM",
"Open LLM"
],
[
"Google",
"Open LLM",
"Open LLM"
],
[
"Mistral",
"Open LLM",
"Open LLM"
],
[
"LMSYS",
"Open LLM",
"Open LLM"
],
[
"Alibaba",
"Open LLM",
"Open LLM"
],
[
"Google",
"Open LLM",
"Open LLM"
],
[
"Tsinghua",
"Open LLM",
"Open LLM"
],
[
"MosaicML",
"Open LLM",
"Open LLM"
],
[
"Tsinghua",
"Open LLM",
"Open LLM"
],
[
"RWKV",
"Open LLM",
"Open LLM"
],
[
"OpenAssistant",
"Open LLM",
"Open LLM"
],
[
"LMSYS",
"Open LLM",
"Open LLM"
],
[
"Stability AI",
"Open LLM",
"Open LLM"
],
[
"Databricks",
"Open LLM",
"Open LLM"
]
],
"hovertemplate": "%{hovertext}
license_binary=%{customdata[2]}
Release Date=%{x}
Arena ELO=%{y}
Organization=%{customdata[0]}
License=%{customdata[1]}",
"hovertext": [
"Llama-3-70b-Instruct",
"Command R+",
"Llama-3-8b-Instruct",
"Qwen1.5-72B-Chat",
"Command R",
"Mixtral-8x22b-Instruct-v0.1",
"Qwen1.5-32B-Chat",
"Zephyr-ORPO-141b-A35b-v0.1",
"Starling-LM-7B-beta",
"Qwen1.5-14B-Chat",
"Mixtral-8x7b-Instruct-v0.1",
"Yi-34B-Chat",
"WizardLM-70B-v1.0",
"DBRX-Instruct-Preview",
"Tulu-2-DPO-70B",
"OpenChat-3.5-0106",
"Starling-LM-7B-alpha",
"Llama-2-70b-chat",
"Nous-Hermes-2-Mixtral-8x7B-DPO",
"Gemma-1.1-7B-it",
"NV-Llama2-70B-SteerLM-Chat",
"DeepSeek-LLM-67B-Chat",
"OpenChat-3.5",
"OpenHermes-2.5-Mistral-7b",
"Qwen1.5-7B-Chat",
"Mistral-7B-Instruct-v0.2",
"Dolphin-2.2.1-Mistral-7B",
"SOLAR-10.7B-Instruct-v1.0",
"WizardLM-13b-v1.2",
"Llama-2-13b-chat",
"Zephyr-7b-beta",
"Phi-3-Mini-128k-Instruct",
"Vicuna-13B",
"CodeLlama-70B-instruct",
"MPT-30B-chat",
"CodeLlama-34B-instruct",
"Gemma-7B-it",
"Zephyr-7b-alpha",
"Llama-2-7b-chat",
"Qwen-14B-Chat",
"falcon-180b-chat",
"StripedHyena-Nous-7B",
"OLMo-7B-instruct",
"Gemma-1.1-2B-it",
"Mistral-7B-Instruct-v0.1",
"Vicuna-7B",
"Qwen1.5-4B-Chat",
"Gemma-2B-it",
"ChatGLM3-6B",
"MPT-7B-Chat",
"ChatGLM2-6B",
"RWKV-4-Raven-14B",
"OpenAssistant-Pythia-12B",
"FastChat-T5-3B",
"StableLM-Tuned-Alpha-7B",
"Dolly-V2-12B"
],
"legendgroup": "Open LLM",
"marker": {
"color": "#EF553B",
"size": 8,
"symbol": "circle"
},
"mode": "markers",
"name": "Open LLM",
"orientation": "v",
"showlegend": true,
"type": "scatter",
"x": [
"2024-04-18T00:00:00",
"2024-04-04T00:00:00",
"2024-04-18T00:00:00",
"2024-02-04T00:00:00",
"2024-03-11T00:00:00",
"2024-04-17T00:00:00",
"2024-02-04T00:00:00",
"2024-04-12T00:00:00",
"2024-03-20T00:00:00",
"2024-02-04T00:00:00",
"2023-12-11T00:00:00",
"2024-01-23T00:00:00",
"2023-08-09T00:00:00",
"2024-03-27T00:00:00",
"2023-11-12T00:00:00",
"2024-01-06T00:00:00",
"2023-11-25T00:00:00",
"2023-07-18T00:00:00",
"2024-01-13T00:00:00",
"2024-04-09T00:00:00",
"2023-11-24T00:00:00",
"2023-11-29T00:00:00",
"2023-11-16T00:00:00",
"2023-10-29T00:00:00",
"2024-02-04T00:00:00",
"2023-12-11T00:00:00",
"2023-10-30T00:00:00",
"2023-12-13T00:00:00",
"2023-07-25T00:00:00",
"2023-07-18T00:00:00",
"2023-10-26T00:00:00",
"2024-04-23T00:00:00",
"2023-07-23T00:00:00",
"2024-01-29T00:00:00",
"2023-06-09T00:00:00",
"2023-08-24T00:00:00",
"2024-02-21T00:00:00",
"2023-10-09T00:00:00",
"2023-07-18T00:00:00",
"2023-09-24T00:00:00",
"2023-09-05T00:00:00",
"2023-12-07T00:00:00",
"2024-02-23T00:00:00",
"2024-04-09T00:00:00",
"2023-09-27T00:00:00",
"2023-07-29T00:00:00",
"2024-02-04T00:00:00",
"2024-02-21T00:00:00",
"2023-10-25T00:00:00",
"2023-05-04T00:00:00",
"2023-06-25T00:00:00",
"2023-05-22T00:00:00",
"2023-04-03T00:00:00",
"2023-04-27T00:00:00",
"2023-04-19T00:00:00",
"2023-04-12T00:00:00"
],
"xaxis": "x",
"y": [
1209.6462958943152,
1190.5291640364956,
1152.500938092916,
1152.485612667822,
1147.8966494489798,
1145.8123271934626,
1133.8011394014864,
1128.8163366984966,
1118.5178781177128,
1118.475700517794,
1114,
1111.1326399460543,
1108.552744333791,
1103.2167069462541,
1102.79428840509,
1098.527455141752,
1091.5210240331344,
1088.7078065720734,
1087.307757938674,
1082.9619916739105,
1082.4713591517852,
1079.7362777221456,
1078.6663284631356,
1078.6429577216027,
1076.5321247427814,
1074.0655548845186,
1065.574858796917,
1065.0611191304033,
1061.9003873957429,
1056.9265912995625,
1054.4162995844372,
1050.1481252382014,
1047.9555279582555,
1047.927687897156,
1047.823066613369,
1047.396876459045,
1043.5443043467913,
1043.0842673002462,
1040.7537596503887,
1038.586932982431,
1037.076380506833,
1023.112092466059,
1020.7569311460566,
1014.832737666584,
1012.1048679697501,
1009.3834445358582,
1002.744713564041,
999.6431193544297,
960.7895509564338,
933.340871331175,
933.3372880828122,
928.4512512366093,
900.2948677134343,
876.9291083582452,
848.9325675003323,
826.6473317994165
],
"yaxis": "y"
}
],
"layout": {
"legend": {
"title": {
"text": "license_binary"
},
"tracegroupgap": 0
},
"template": {
"data": {
"bar": [
{
"error_x": {
"color": "#2a3f5f"
},
"error_y": {
"color": "#2a3f5f"
},
"marker": {
"line": {
"color": "white",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "bar"
}
],
"barpolar": [
{
"marker": {
"line": {
"color": "white",
"width": 0.5
},
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "barpolar"
}
],
"carpet": [
{
"aaxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "#C8D4E3",
"linecolor": "#C8D4E3",
"minorgridcolor": "#C8D4E3",
"startlinecolor": "#2a3f5f"
},
"baxis": {
"endlinecolor": "#2a3f5f",
"gridcolor": "#C8D4E3",
"linecolor": "#C8D4E3",
"minorgridcolor": "#C8D4E3",
"startlinecolor": "#2a3f5f"
},
"type": "carpet"
}
],
"choropleth": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "choropleth"
}
],
"contour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "contour"
}
],
"contourcarpet": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "contourcarpet"
}
],
"heatmap": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmap"
}
],
"heatmapgl": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "heatmapgl"
}
],
"histogram": [
{
"marker": {
"pattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
}
},
"type": "histogram"
}
],
"histogram2d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2d"
}
],
"histogram2dcontour": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "histogram2dcontour"
}
],
"mesh3d": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"type": "mesh3d"
}
],
"parcoords": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "parcoords"
}
],
"pie": [
{
"automargin": true,
"type": "pie"
}
],
"scatter": [
{
"fillpattern": {
"fillmode": "overlay",
"size": 10,
"solidity": 0.2
},
"type": "scatter"
}
],
"scatter3d": [
{
"line": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatter3d"
}
],
"scattercarpet": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattercarpet"
}
],
"scattergeo": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergeo"
}
],
"scattergl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattergl"
}
],
"scattermapbox": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scattermapbox"
}
],
"scatterpolar": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolar"
}
],
"scatterpolargl": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterpolargl"
}
],
"scatterternary": [
{
"marker": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"type": "scatterternary"
}
],
"surface": [
{
"colorbar": {
"outlinewidth": 0,
"ticks": ""
},
"colorscale": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"type": "surface"
}
],
"table": [
{
"cells": {
"fill": {
"color": "#EBF0F8"
},
"line": {
"color": "white"
}
},
"header": {
"fill": {
"color": "#C8D4E3"
},
"line": {
"color": "white"
}
},
"type": "table"
}
]
},
"layout": {
"annotationdefaults": {
"arrowcolor": "#2a3f5f",
"arrowhead": 0,
"arrowwidth": 1
},
"autotypenumbers": "strict",
"coloraxis": {
"colorbar": {
"outlinewidth": 0,
"ticks": ""
}
},
"colorscale": {
"diverging": [
[
0,
"#8e0152"
],
[
0.1,
"#c51b7d"
],
[
0.2,
"#de77ae"
],
[
0.3,
"#f1b6da"
],
[
0.4,
"#fde0ef"
],
[
0.5,
"#f7f7f7"
],
[
0.6,
"#e6f5d0"
],
[
0.7,
"#b8e186"
],
[
0.8,
"#7fbc41"
],
[
0.9,
"#4d9221"
],
[
1,
"#276419"
]
],
"sequential": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
],
"sequentialminus": [
[
0,
"#0d0887"
],
[
0.1111111111111111,
"#46039f"
],
[
0.2222222222222222,
"#7201a8"
],
[
0.3333333333333333,
"#9c179e"
],
[
0.4444444444444444,
"#bd3786"
],
[
0.5555555555555556,
"#d8576b"
],
[
0.6666666666666666,
"#ed7953"
],
[
0.7777777777777778,
"#fb9f3a"
],
[
0.8888888888888888,
"#fdca26"
],
[
1,
"#f0f921"
]
]
},
"colorway": [
"#636efa",
"#EF553B",
"#00cc96",
"#ab63fa",
"#FFA15A",
"#19d3f3",
"#FF6692",
"#B6E880",
"#FF97FF",
"#FECB52"
],
"font": {
"color": "#2a3f5f"
},
"geo": {
"bgcolor": "white",
"lakecolor": "white",
"landcolor": "white",
"showlakes": true,
"showland": true,
"subunitcolor": "#C8D4E3"
},
"hoverlabel": {
"align": "left"
},
"hovermode": "closest",
"mapbox": {
"style": "light"
},
"paper_bgcolor": "white",
"plot_bgcolor": "white",
"polar": {
"angularaxis": {
"gridcolor": "#EBF0F8",
"linecolor": "#EBF0F8",
"ticks": ""
},
"bgcolor": "white",
"radialaxis": {
"gridcolor": "#EBF0F8",
"linecolor": "#EBF0F8",
"ticks": ""
}
},
"scene": {
"xaxis": {
"backgroundcolor": "white",
"gridcolor": "#DFE8F3",
"gridwidth": 2,
"linecolor": "#EBF0F8",
"showbackground": true,
"ticks": "",
"zerolinecolor": "#EBF0F8"
},
"yaxis": {
"backgroundcolor": "white",
"gridcolor": "#DFE8F3",
"gridwidth": 2,
"linecolor": "#EBF0F8",
"showbackground": true,
"ticks": "",
"zerolinecolor": "#EBF0F8"
},
"zaxis": {
"backgroundcolor": "white",
"gridcolor": "#DFE8F3",
"gridwidth": 2,
"linecolor": "#EBF0F8",
"showbackground": true,
"ticks": "",
"zerolinecolor": "#EBF0F8"
}
},
"shapedefaults": {
"line": {
"color": "#2a3f5f"
}
},
"ternary": {
"aaxis": {
"gridcolor": "#DFE8F3",
"linecolor": "#A2B1C6",
"ticks": ""
},
"baxis": {
"gridcolor": "#DFE8F3",
"linecolor": "#A2B1C6",
"ticks": ""
},
"bgcolor": "white",
"caxis": {
"gridcolor": "#DFE8F3",
"linecolor": "#A2B1C6",
"ticks": ""
}
},
"title": {
"x": 0.05
},
"xaxis": {
"automargin": true,
"gridcolor": "#EBF0F8",
"linecolor": "#EBF0F8",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "#EBF0F8",
"zerolinewidth": 2
},
"yaxis": {
"automargin": true,
"gridcolor": "#EBF0F8",
"linecolor": "#EBF0F8",
"ticks": "",
"title": {
"standoff": 15
},
"zerolinecolor": "#EBF0F8",
"zerolinewidth": 2
}
}
},
"title": {
"text": "Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)"
},
"xaxis": {
"anchor": "y",
"domain": [
0,
1
],
"title": {
"text": "Release Date"
}
},
"yaxis": {
"anchor": "x",
"domain": [
0,
1
],
"title": {
"text": "Arena ELO"
}
}
}
}
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import plotly.express as px\n",
"import plotly.graph_objects as go\n",
"\n",
"# Scatter of rating (labelled \"Arena ELO\") against release date, coloured by\n",
"# license_binary. Hovering a marker shows the model name plus the columns\n",
"# listed in hover_data; every marker is drawn at size 8.\n",
"fig = px.scatter(\n",
"    data_frame=df,\n",
"    x=\"Release Date\",\n",
"    y=\"rating\",\n",
"    color=\"license_binary\",\n",
"    hover_name=\"Model\",\n",
"    hover_data=[\"Release Date\", \"Organization\", \"License\", \"license_binary\"],\n",
"    labels={\"rating\": \"Arena ELO\", \"Release Date\": \"Release Date\"},\n",
"    title=\"Closed-source vs. Open-weight models (Arena ELO, 19 Apr 24)\",\n",
"    template=\"plotly_white\",\n",
").update_traces(marker={\"size\": 8})\n",
"\n",
"# Render the interactive figure inline.\n",
"fig.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 37,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"plotly.graph_objs._figure.Figure"
]
},
"execution_count": 37,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Show the concrete class of the figure object built above.\n",
"fig.__class__"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": ".venv",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.10.11"
}
},
"nbformat": 4,
"nbformat_minor": 2
}