{ "cells": [ { "cell_type": "code", "execution_count": 47, "metadata": {}, "outputs": [], "source": [
 "import os\n",
 "import pickle\n",
 "\n",
 "import pandas as pd\n",
 "from huggingface_hub import HfFileSystem, hf_hub_download"
 ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Prepare data" ] }, { "cell_type": "code", "execution_count": 72, "metadata": {}, "outputs": [], "source": [
 "fs = HfFileSystem()\n",
 "\n",
 "\n",
 "def extract_date(filename):\n",
 "    \"\"\"Return the last `_`-separated token of the file's base name.\n",
 "\n",
 "    E.g. 'a/b/elo_results_20240426.pkl' -> '20240426'.\n",
 "    \"\"\"\n",
 "    return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
 "\n",
 "\n",
 "def pick_latest(files, pattern):\n",
 "    \"\"\"Return the entry of `files` whose date token is greatest.\n",
 "\n",
 "    `pattern` is only used for the error message. An empty listing raises\n",
 "    FileNotFoundError instead of the bare IndexError of `sorted(...)[0]`.\n",
 "    \"\"\"\n",
 "    if not files:\n",
 "        raise FileNotFoundError(f\"no files matched {pattern!r}\")\n",
 "    # Tokens are compared as strings; assumes every file uses the same\n",
 "    # fixed-width date format (e.g. YYYYMMDD) -- TODO confirm.\n",
 "    return max(files, key=extract_date)\n",
 "\n",
 "\n",
 "ELO_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.pkl\"\n",
 "elo_files = fs.glob(ELO_DATA_FILES)\n",
 "latest_elo_file = pick_latest(elo_files, ELO_DATA_FILES)\n",
 "\n",
 "LEADERBOARD_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.csv\"\n",
 "leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)\n",
 "latest_leaderboard_file = pick_latest(leaderboard_files, LEADERBOARD_DATA_FILES)"
 ] }, { "cell_type": "code", "execution_count": 73, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "('leaderboard_table_20240426.csv', 'elo_results_20240426.pkl')" ] }, "execution_count": 73, "metadata": {}, "output_type": "execute_result" } ], "source": [ "latest_leaderboard_file.split(\"/\")[-1], latest_elo_file.split(\"/\")[-1]" ] }, { "cell_type": "code", "execution_count": 74, "metadata": {}, "outputs": [], "source": [
 "SPACE_REPO_ID = \"lmsys/chatbot-arena-leaderboard\"\n",
 "\n",
 "\n",
 "def download_from_space(remote_path):\n",
 "    \"\"\"Fetch one file of the leaderboard Space and return hf_hub_download's result.\n",
 "\n",
 "    Only the base name of `remote_path` is passed as `filename`; the glob\n",
 "    patterns used in this notebook match files directly under the Space root.\n",
 "    \"\"\"\n",
 "    return hf_hub_download(\n",
 "        repo_id=SPACE_REPO_ID,\n",
 "        filename=remote_path.split(\"/\")[-1],\n",
 "        repo_type=\"space\",\n",
 "    )\n",
 "\n",
 "\n",
 "latest_elo_file_local = download_from_space(latest_elo_file)\n",
 "latest_leaderboard_file_local = download_from_space(latest_leaderboard_file)"
 ] }, { "cell_type": "code", "execution_count": 76, "metadata": {}, "outputs": [], "source": [
 "# load and prepare ELO data\n",
 "# Maps the keys of the pickled results to the display name of each category.\n",
 "key_to_category_name = {\n",
 "    \"full\": \"Overall\",\n",
 "    \"coding\": \"Coding\",\n",
 "    \"long_user\": \"Longer Query\",\n",
 "    \"english\": \"English\",\n",
 "    \"chinese\": \"Chinese\",\n",
 "    \"french\": \"French\",\n",
 "    \"no_tie\": \"Exclude Ties\",\n",
 "    \"no_short\": \"Exclude Short Query (< 5 tokens)\",\n",
 "    \"no_refusal\": \"Exclude Refusal\",\n",
 "}\n",
 "# Longer description for each display name.\n",
 "cat_name_to_explanation = {\n",
 "    \"Overall\": \"Overall Questions\",\n",
 "    \"Coding\": \"Coding: whether conversation contains code snippets\",\n",
 "    \"Longer Query\": \"Longer Query (>= 500 tokens)\",\n",
 "    \"English\": \"English Prompts\",\n",
 "    \"Chinese\": \"Chinese Prompts\",\n",
 "    \"French\": \"French Prompts\",\n",
 "    \"Exclude Ties\": \"Exclude Ties and Bothbad\",\n",
 "    \"Exclude Short Query (< 5 tokens)\": \"Exclude Short User Query (< 5 tokens)\",\n",
 "    \"Exclude Refusal\": 'Exclude model responses with refusal (e.g., \"I cannot answer\")',\n",
 "}\n",
 "\n",
 "# NOTE(review): pickle.load can run arbitrary code while unpickling; this is\n",
 "# only acceptable as long as the downloaded Space file is trusted.\n",
 "with open(latest_elo_file_local, \"rb\") as fin:\n",
 "    elo_results = pickle.load(fin)\n",
 "\n",
 "# Keep the categories this results file actually contains, keyed by display name.\n",
 "arena_dfs = {\n",
 "    category: elo_results[key][\"leaderboard_table_df\"]\n",
 "    for key, category in key_to_category_name.items()\n",
 "    if key in elo_results\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": 77, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])" ] }, "execution_count": 77, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arena_dfs.keys()" ] }, { "cell_type": "code", "execution_count": 78, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ratingvariancerating_q975rating_q025num_battlesfinal_ranking
RWKV-4-Raven-14B927.71029427.143015935.717850916.546369512981
alpaca-13b907.32448220.736682915.536856899.330070611185
bard-jan-24-gemini-pro1208.5054086.6790871213.2913581203.926901123886
chatglm-6b886.10755317.110417894.034333878.094776519586
chatglm2-6b932.67846033.530570943.455598921.346322288081
.....................
wizardlm-70b1107.9925529.3858871114.2182231102.655575886829
yi-34b-chat1109.7224478.5969081115.1825791103.9910951225229
zephyr-7b-alpha1042.10871043.9007141052.9917681027.160917190158
zephyr-7b-beta1053.65568010.2976071059.9232541047.6016291192454
zephyr-orpo-141b-A35b-v0.11124.67751522.2885151132.7288871113.848432427622
\n", "

91 rows × 6 columns

\n", "
" ], "text/plain": [ " rating variance rating_q975 rating_q025 \\\n", "RWKV-4-Raven-14B 927.710294 27.143015 935.717850 916.546369 \n", "alpaca-13b 907.324482 20.736682 915.536856 899.330070 \n", "bard-jan-24-gemini-pro 1208.505408 6.679087 1213.291358 1203.926901 \n", "chatglm-6b 886.107553 17.110417 894.034333 878.094776 \n", "chatglm2-6b 932.678460 33.530570 943.455598 921.346322 \n", "... ... ... ... ... \n", "wizardlm-70b 1107.992552 9.385887 1114.218223 1102.655575 \n", "yi-34b-chat 1109.722447 8.596908 1115.182579 1103.991095 \n", "zephyr-7b-alpha 1042.108710 43.900714 1052.991768 1027.160917 \n", "zephyr-7b-beta 1053.655680 10.297607 1059.923254 1047.601629 \n", "zephyr-orpo-141b-A35b-v0.1 1124.677515 22.288515 1132.728887 1113.848432 \n", "\n", " num_battles final_ranking \n", "RWKV-4-Raven-14B 5129 81 \n", "alpaca-13b 6111 85 \n", "bard-jan-24-gemini-pro 12388 6 \n", "chatglm-6b 5195 86 \n", "chatglm2-6b 2880 81 \n", "... ... ... \n", "wizardlm-70b 8868 29 \n", "yi-34b-chat 12252 29 \n", "zephyr-7b-alpha 1901 58 \n", "zephyr-7b-beta 11924 54 \n", "zephyr-orpo-141b-A35b-v0.1 4276 22 \n", "\n", "[91 rows x 6 columns]" ] }, "execution_count": 78, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arena_dfs[\"Overall\"]" ] }, { "cell_type": "code", "execution_count": 79, "metadata": {}, "outputs": [], "source": [ "# load and prepare Leaderboard data\n", "leaderboard_df = pd.read_csv(latest_leaderboard_file_local)" ] }, { "cell_type": "code", "execution_count": 80, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
keyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLink
0wizardlm-30bWizardLM-30B7.010.5872023/6Non-commercialMicrosofthttps://huggingface.co/WizardLM/WizardLM-30B-V1.0
1vicuna-13b-16kVicuna-13B-16k6.920.5452023/7Llama 2 CommunityLMSYShttps://huggingface.co/lmsys/vicuna-13b-v1.5-16k
2wizardlm-13b-v1.1WizardLM-13B-v1.16.760.5002023/7Non-commercialMicrosofthttps://huggingface.co/WizardLM/WizardLM-13B-V1.1
3tulu-30bTulu-30B6.430.5812023/6Non-commercialAllenAI/UWhttps://huggingface.co/allenai/tulu-30b
4guanaco-65bGuanaco-65B6.410.6212023/5Non-commercialUWhttps://huggingface.co/timdettmers/guanaco-65b...
...........................
100mixtral-8x22b-instruct-v0.1Mixtral-8x22b-Instruct-v0.1-0.7782024/4Apache 2.0Mistralhttps://mistral.ai/news/mixtral-8x22b/
101llama-3-70b-instructLlama-3-70b-Instruct-0.8202023/12Llama 3 CommunityMetahttps://llama.meta.com/llama3/
102llama-3-8b-instructLlama-3-8b-Instruct-0.6842023/3Llama 3 CommunityMetahttps://llama.meta.com/llama3/
103gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview-0.8192023/11ProprietaryGooglehttps://blog.google/technology/ai/google-gemin...
104phi-3-mini-128k-instructPhi-3-Mini-128k-Instruct-0.6812023/10MITMicrosofthttps://azure.microsoft.com/en-us/blog/introdu...
\n", "

105 rows × 8 columns

\n", "
" ], "text/plain": [ " key Model \\\n", "0 wizardlm-30b WizardLM-30B \n", "1 vicuna-13b-16k Vicuna-13B-16k \n", "2 wizardlm-13b-v1.1 WizardLM-13B-v1.1 \n", "3 tulu-30b Tulu-30B \n", "4 guanaco-65b Guanaco-65B \n", ".. ... ... \n", "100 mixtral-8x22b-instruct-v0.1 Mixtral-8x22b-Instruct-v0.1 \n", "101 llama-3-70b-instruct Llama-3-70b-Instruct \n", "102 llama-3-8b-instruct Llama-3-8b-Instruct \n", "103 gemini-1.5-pro-api-0409-preview Gemini 1.5 Pro API-0409-Preview \n", "104 phi-3-mini-128k-instruct Phi-3-Mini-128k-Instruct \n", "\n", " MT-bench (score) MMLU Knowledge cutoff date License \\\n", "0 7.01 0.587 2023/6 Non-commercial \n", "1 6.92 0.545 2023/7 Llama 2 Community \n", "2 6.76 0.500 2023/7 Non-commercial \n", "3 6.43 0.581 2023/6 Non-commercial \n", "4 6.41 0.621 2023/5 Non-commercial \n", ".. ... ... ... ... \n", "100 - 0.778 2024/4 Apache 2.0 \n", "101 - 0.820 2023/12 Llama 3 Community \n", "102 - 0.684 2023/3 Llama 3 Community \n", "103 - 0.819 2023/11 Proprietary \n", "104 - 0.681 2023/10 MIT \n", "\n", " Organization Link \n", "0 Microsoft https://huggingface.co/WizardLM/WizardLM-30B-V1.0 \n", "1 LMSYS https://huggingface.co/lmsys/vicuna-13b-v1.5-16k \n", "2 Microsoft https://huggingface.co/WizardLM/WizardLM-13B-V1.1 \n", "3 AllenAI/UW https://huggingface.co/allenai/tulu-30b \n", "4 UW https://huggingface.co/timdettmers/guanaco-65b... \n", ".. ... ... \n", "100 Mistral https://mistral.ai/news/mixtral-8x22b/ \n", "101 Meta https://llama.meta.com/llama3/ \n", "102 Meta https://llama.meta.com/llama3/ \n", "103 Google https://blog.google/technology/ai/google-gemin... \n", "104 Microsoft https://azure.microsoft.com/en-us/blog/introdu... 
\n", "\n", "[105 rows x 8 columns]" ] }, "execution_count": 80, "metadata": {}, "output_type": "execute_result" } ], "source": [ "leaderboard_df" ] }, { "cell_type": "code", "execution_count": 82, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])" ] }, "execution_count": 82, "metadata": {}, "output_type": "execute_result" } ], "source": [ "arena_dfs.keys()" ] }, { "cell_type": "code", "execution_count": 86, "metadata": {}, "outputs": [], "source": [
 "# Join each per-category rating table (matched on its index) with the\n",
 "# leaderboard rows (matched on their \"key\" column), then order by rating,\n",
 "# highest first. merge() defaults to an inner join, so a model that has no\n",
 "# matching leaderboard row is left out of the result.\n",
 "merged_dfs = {\n",
 "    category: (\n",
 "        ratings.merge(leaderboard_df, left_index=True, right_on=\"key\")\n",
 "        .sort_values(\"rating\", ascending=False)\n",
 "        .reset_index(drop=True)\n",
 "    )\n",
 "    for category, ratings in arena_dfs.items()\n",
 "}"
 ] }, { "cell_type": "code", "execution_count": 101, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " 
\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
ratingvariancerating_q975rating_q025num_battlesfinal_rankingkeyModelMT-bench (score)MMLUKnowledge cutoff dateLicenseOrganizationLink
01257.3994074.2833161261.6762241254.003626305621gpt-4-turbo-2024-04-09GPT-4-Turbo-2024-04-09--2023/12ProprietaryOpenAIhttps://platform.openai.com/docs/models/gpt-4-...
11253.0250952.0695341256.1113921250.435207698711gpt-4-1106-previewGPT-4-1106-preview9.32-2023/4ProprietaryOpenAIhttps://openai.com/blog/new-models-and-develop...
21251.1142201.8628421253.6290931248.362042756842claude-3-opus-20240229Claude 3 Opus-0.8682023/8ProprietaryAnthropichttps://www.anthropic.com/news/claude-3-family
31247.6625083.2637471251.5826451244.380454337232gemini-1.5-pro-api-0409-previewGemini 1.5 Pro API-0409-Preview-0.8192023/11ProprietaryGooglehttps://blog.google/technology/ai/google-gemin...
41247.2770521.9230141249.4894111244.340257619243gpt-4-0125-previewGPT-4-0125-preview--2023/12ProprietaryOpenAIhttps://openai.com/blog/new-models-and-develop...
51208.5054086.6790871213.2913581203.926901123886bard-jan-24-gemini-proBard (Gemini Pro)--OnlineProprietaryGooglehttps://bard.google.com/
61207.4975414.1094661211.7207341203.322762272986llama-3-70b-instructLlama-3-70b-Instruct-0.8202023/12Llama 3 CommunityMetahttps://llama.meta.com/llama3/
71201.6712542.5255631204.8625121198.658822754186claude-3-sonnet-20240229Claude 3 Sonnet-0.7902023/8ProprietaryAnthropichttps://www.anthropic.com/news/claude-3-family
81191.6845423.4597171195.0802561188.222382412629command-r-plusCommand R+--2024/3CC-BY-NC-4.0Coherehttps://txt.cohere.com/command-r-plus-microsof...
91188.9873893.1247921193.3355351185.935928483909gpt-4-0314GPT-4-03148.960.8642021/9ProprietaryOpenAIhttps://openai.com/research/gpt-4
101180.6068703.0975421183.8254031177.2552036606511claude-3-haiku-20240307Claude 3 Haiku-0.7522023/8ProprietaryAnthropichttps://www.anthropic.com/news/claude-3-family
111164.8965612.5855771167.5956961161.7274546703812gpt-4-0613GPT-4-06139.18-2021/9ProprietaryOpenAIhttps://platform.openai.com/docs/models/gpt-4-...
121157.6389922.5413201160.4961161154.9277484412013mistral-large-2402Mistral-Large-2402-0.812-ProprietaryMistralhttps://mistral.ai/news/mistral-large/
131153.4642803.6315121157.0688501150.1789033299913qwen1.5-72b-chatQwen1.5-72B-Chat8.610.7752024/2Qianwen LICENSEAlibabahttps://qwenlm.github.io/blog/qwen1.5/
141150.9184739.0622171155.9697211145.229885862213reka-flash-21b-20240226-onlineReka-Flash-21B-online--OnlineProprietaryReka AIhttps://docs.reka.ai/http-api.html#generation
151150.2443135.5513731154.7452141145.4964662176814claude-1Claude-17.900.770-ProprietaryAnthropichttps://www.anthropic.com/index/introducing-cl...
161149.26757811.4522721154.2901551141.931621905914reka-flash-21b-20240226Reka-Flash-21B-0.7352023/11ProprietaryReka AIhttps://www.reka.ai/news/reka-flash-efficient-...
171148.0721553.0712221151.9808651144.9920443741314command-rCommand R--2024/3CC-BY-NC-4.0Coherehttps://txt.cohere.com/command-r
181147.6683253.5422291150.7264891143.8683853273814mistral-mediumMistral Medium8.610.753-ProprietaryMistralhttps://mistral.ai/news/la-plateforme/
191147.4739895.7897101151.9893521143.3229181721414mixtral-8x22b-instruct-v0.1Mixtral-8x22b-Instruct-v0.1-0.7782024/4Apache 2.0Mistralhttps://mistral.ai/news/mixtral-8x22b/
\n", "
" ], "text/plain": [ " rating variance rating_q975 rating_q025 num_battles \\\n", "0 1257.399407 4.283316 1261.676224 1254.003626 30562 \n", "1 1253.025095 2.069534 1256.111392 1250.435207 69871 \n", "2 1251.114220 1.862842 1253.629093 1248.362042 75684 \n", "3 1247.662508 3.263747 1251.582645 1244.380454 33723 \n", "4 1247.277052 1.923014 1249.489411 1244.340257 61924 \n", "5 1208.505408 6.679087 1213.291358 1203.926901 12388 \n", "6 1207.497541 4.109466 1211.720734 1203.322762 27298 \n", "7 1201.671254 2.525563 1204.862512 1198.658822 75418 \n", "8 1191.684542 3.459717 1195.080256 1188.222382 41262 \n", "9 1188.987389 3.124792 1193.335535 1185.935928 48390 \n", "10 1180.606870 3.097542 1183.825403 1177.255203 66065 \n", "11 1164.896561 2.585577 1167.595696 1161.727454 67038 \n", "12 1157.638992 2.541320 1160.496116 1154.927748 44120 \n", "13 1153.464280 3.631512 1157.068850 1150.178903 32999 \n", "14 1150.918473 9.062217 1155.969721 1145.229885 8622 \n", "15 1150.244313 5.551373 1154.745214 1145.496466 21768 \n", "16 1149.267578 11.452272 1154.290155 1141.931621 9059 \n", "17 1148.072155 3.071222 1151.980865 1144.992044 37413 \n", "18 1147.668325 3.542229 1150.726489 1143.868385 32738 \n", "19 1147.473989 5.789710 1151.989352 1143.322918 17214 \n", "\n", " final_ranking key \\\n", "0 1 gpt-4-turbo-2024-04-09 \n", "1 1 gpt-4-1106-preview \n", "2 2 claude-3-opus-20240229 \n", "3 2 gemini-1.5-pro-api-0409-preview \n", "4 3 gpt-4-0125-preview \n", "5 6 bard-jan-24-gemini-pro \n", "6 6 llama-3-70b-instruct \n", "7 6 claude-3-sonnet-20240229 \n", "8 9 command-r-plus \n", "9 9 gpt-4-0314 \n", "10 11 claude-3-haiku-20240307 \n", "11 12 gpt-4-0613 \n", "12 13 mistral-large-2402 \n", "13 13 qwen1.5-72b-chat \n", "14 13 reka-flash-21b-20240226-online \n", "15 14 claude-1 \n", "16 14 reka-flash-21b-20240226 \n", "17 14 command-r \n", "18 14 mistral-medium \n", "19 14 mixtral-8x22b-instruct-v0.1 \n", "\n", " Model MT-bench (score) MMLU \\\n", "0 GPT-4-Turbo-2024-04-09 - - 
\n", "1 GPT-4-1106-preview 9.32 - \n", "2 Claude 3 Opus - 0.868 \n", "3 Gemini 1.5 Pro API-0409-Preview - 0.819 \n", "4 GPT-4-0125-preview - - \n", "5 Bard (Gemini Pro) - - \n", "6 Llama-3-70b-Instruct - 0.820 \n", "7 Claude 3 Sonnet - 0.790 \n", "8 Command R+ - - \n", "9 GPT-4-0314 8.96 0.864 \n", "10 Claude 3 Haiku - 0.752 \n", "11 GPT-4-0613 9.18 - \n", "12 Mistral-Large-2402 - 0.812 \n", "13 Qwen1.5-72B-Chat 8.61 0.775 \n", "14 Reka-Flash-21B-online - - \n", "15 Claude-1 7.90 0.770 \n", "16 Reka-Flash-21B - 0.735 \n", "17 Command R - - \n", "18 Mistral Medium 8.61 0.753 \n", "19 Mixtral-8x22b-Instruct-v0.1 - 0.778 \n", "\n", " Knowledge cutoff date License Organization \\\n", "0 2023/12 Proprietary OpenAI \n", "1 2023/4 Proprietary OpenAI \n", "2 2023/8 Proprietary Anthropic \n", "3 2023/11 Proprietary Google \n", "4 2023/12 Proprietary OpenAI \n", "5 Online Proprietary Google \n", "6 2023/12 Llama 3 Community Meta \n", "7 2023/8 Proprietary Anthropic \n", "8 2024/3 CC-BY-NC-4.0 Cohere \n", "9 2021/9 Proprietary OpenAI \n", "10 2023/8 Proprietary Anthropic \n", "11 2021/9 Proprietary OpenAI \n", "12 - Proprietary Mistral \n", "13 2024/2 Qianwen LICENSE Alibaba \n", "14 Online Proprietary Reka AI \n", "15 - Proprietary Anthropic \n", "16 2023/11 Proprietary Reka AI \n", "17 2024/3 CC-BY-NC-4.0 Cohere \n", "18 - Proprietary Mistral \n", "19 2024/4 Apache 2.0 Mistral \n", "\n", " Link \n", "0 https://platform.openai.com/docs/models/gpt-4-... \n", "1 https://openai.com/blog/new-models-and-develop... \n", "2 https://www.anthropic.com/news/claude-3-family \n", "3 https://blog.google/technology/ai/google-gemin... \n", "4 https://openai.com/blog/new-models-and-develop... \n", "5 https://bard.google.com/ \n", "6 https://llama.meta.com/llama3/ \n", "7 https://www.anthropic.com/news/claude-3-family \n", "8 https://txt.cohere.com/command-r-plus-microsof... 
\n", "9 https://openai.com/research/gpt-4 \n", "10 https://www.anthropic.com/news/claude-3-family \n", "11 https://platform.openai.com/docs/models/gpt-4-... \n", "12 https://mistral.ai/news/mistral-large/ \n", "13 https://qwenlm.github.io/blog/qwen1.5/ \n", "14 https://docs.reka.ai/http-api.html#generation \n", "15 https://www.anthropic.com/index/introducing-cl... \n", "16 https://www.reka.ai/news/reka-flash-efficient-... \n", "17 https://txt.cohere.com/command-r \n", "18 https://mistral.ai/news/la-plateforme/ \n", "19 https://mistral.ai/news/mixtral-8x22b/ " ] }, "execution_count": 101, "metadata": {}, "output_type": "execute_result" } ], "source": [ "merged_dfs[\"Overall\"][:20]" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "### Manually map release dates - MEH." ] }, { "cell_type": "code", "execution_count": 113, "metadata": {}, "outputs": [], "source": [
 "# Blank template for the release-date mapping: the \"key\" and \"Model\" columns\n",
 "# of the Overall table plus an empty \"Release Date\" column.\n",
 "t = merged_dfs[\"Overall\"].loc[:, [\"key\", \"Model\"]]\n",
 "t[\"Release Date\"] = \"\""
 ] }, { "cell_type": "code", "execution_count": 120, "metadata": {}, "outputs": [], "source": [
 "RELEASE_DATE_MAPPING_FILE = \"release_date_mapping.json\"\n",
 "\n",
 "# Create the template only when the file does not exist yet. An unconditional\n",
 "# write would replace the file on every re-run and discard any release dates\n",
 "# already entered into it (presumably by hand -- confirm the workflow).\n",
 "# Delete the file to regenerate the template, e.g. after the model list changes.\n",
 "# Note: lines=True writes one JSON record per line, not a single JSON array.\n",
 "if not os.path.exists(RELEASE_DATE_MAPPING_FILE):\n",
 "    t.to_json(RELEASE_DATE_MAPPING_FILE, orient=\"records\", lines=True)"
 ] }, { "cell_type": "code", "execution_count": 119, "metadata": {}, "outputs": [ { "data": { "text/plain": [ "[{'key': 'gpt-4-turbo-2024-04-09',\n", " 'Model': 'GPT-4-Turbo-2024-04-09',\n", " 'Release Date': ''},\n", " {'key': 'gpt-4-1106-preview',\n", " 'Model': 'GPT-4-1106-preview',\n", " 'Release Date': ''},\n", " {'key': 'claude-3-opus-20240229',\n", " 'Model': 'Claude 3 Opus',\n", " 'Release Date': ''},\n", " {'key': 'gemini-1.5-pro-api-0409-preview',\n", " 'Model': 'Gemini 1.5 Pro API-0409-Preview',\n", " 'Release Date': ''},\n", " {'key': 'gpt-4-0125-preview',\n", " 'Model': 'GPT-4-0125-preview',\n", " 'Release Date': ''},\n", " {'key': 'bard-jan-24-gemini-pro',\n", " 'Model': 'Bard (Gemini Pro)',\n", " 'Release Date': ''},\n", " {'key': 'llama-3-70b-instruct',\n", " 'Model': 'Llama-3-70b-Instruct',\n", " 'Release Date': ''},\n", " 
{'key': 'claude-3-sonnet-20240229',\n", " 'Model': 'Claude 3 Sonnet',\n", " 'Release Date': ''},\n", " {'key': 'command-r-plus', 'Model': 'Command R+', 'Release Date': ''},\n", " {'key': 'gpt-4-0314', 'Model': 'GPT-4-0314', 'Release Date': ''},\n", " {'key': 'claude-3-haiku-20240307',\n", " 'Model': 'Claude 3 Haiku',\n", " 'Release Date': ''},\n", " {'key': 'gpt-4-0613', 'Model': 'GPT-4-0613', 'Release Date': ''},\n", " {'key': 'mistral-large-2402',\n", " 'Model': 'Mistral-Large-2402',\n", " 'Release Date': ''},\n", " {'key': 'qwen1.5-72b-chat', 'Model': 'Qwen1.5-72B-Chat', 'Release Date': ''},\n", " {'key': 'reka-flash-21b-20240226-online',\n", " 'Model': 'Reka-Flash-21B-online',\n", " 'Release Date': ''},\n", " {'key': 'claude-1', 'Model': 'Claude-1', 'Release Date': ''},\n", " {'key': 'reka-flash-21b-20240226',\n", " 'Model': 'Reka-Flash-21B',\n", " 'Release Date': ''},\n", " {'key': 'command-r', 'Model': 'Command R', 'Release Date': ''},\n", " {'key': 'mistral-medium', 'Model': 'Mistral Medium', 'Release Date': ''},\n", " {'key': 'mixtral-8x22b-instruct-v0.1',\n", " 'Model': 'Mixtral-8x22b-Instruct-v0.1',\n", " 'Release Date': ''},\n", " {'key': 'llama-3-8b-instruct',\n", " 'Model': 'Llama-3-8b-Instruct',\n", " 'Release Date': ''},\n", " {'key': 'gemini-pro-dev-api',\n", " 'Model': 'Gemini Pro (Dev API)',\n", " 'Release Date': ''},\n", " {'key': 'qwen1.5-32b-chat', 'Model': 'Qwen1.5-32B-Chat', 'Release Date': ''},\n", " {'key': 'claude-2.0', 'Model': 'Claude-2.0', 'Release Date': ''},\n", " {'key': 'mistral-next', 'Model': 'Mistral-Next', 'Release Date': ''},\n", " {'key': 'zephyr-orpo-141b-A35b-v0.1',\n", " 'Model': 'Zephyr-ORPO-141b-A35b-v0.1',\n", " 'Release Date': ''},\n", " {'key': 'gpt-3.5-turbo-0613',\n", " 'Model': 'GPT-3.5-Turbo-0613',\n", " 'Release Date': ''},\n", " {'key': 'claude-2.1', 'Model': 'Claude-2.1', 'Release Date': ''},\n", " {'key': 'qwen1.5-14b-chat', 'Model': 'Qwen1.5-14B-Chat', 'Release Date': ''},\n", " {'key': 
'starling-lm-7b-beta',\n", " 'Model': 'Starling-LM-7B-beta',\n", " 'Release Date': ''},\n", " {'key': 'gemini-pro', 'Model': 'Gemini Pro', 'Release Date': ''},\n", " {'key': 'mixtral-8x7b-instruct-v0.1',\n", " 'Model': 'Mixtral-8x7b-Instruct-v0.1',\n", " 'Release Date': ''},\n", " {'key': 'claude-instant-1', 'Model': 'Claude-Instant-1', 'Release Date': ''},\n", " {'key': 'yi-34b-chat', 'Model': 'Yi-34B-Chat', 'Release Date': ''},\n", " {'key': 'gpt-3.5-turbo-0314',\n", " 'Model': 'GPT-3.5-Turbo-0314',\n", " 'Release Date': ''},\n", " {'key': 'wizardlm-70b', 'Model': 'WizardLM-70B-v1.0', 'Release Date': ''},\n", " {'key': 'gpt-3.5-turbo-0125',\n", " 'Model': 'GPT-3.5-Turbo-0125',\n", " 'Release Date': ''},\n", " {'key': 'tulu-2-dpo-70b', 'Model': 'Tulu-2-DPO-70B', 'Release Date': ''},\n", " {'key': 'dbrx-instruct-preview',\n", " 'Model': 'DBRX-Instruct-Preview',\n", " 'Release Date': ''},\n", " {'key': 'openchat-3.5-0106',\n", " 'Model': 'OpenChat-3.5-0106',\n", " 'Release Date': ''},\n", " {'key': 'vicuna-33b', 'Model': 'Vicuna-33B', 'Release Date': ''},\n", " {'key': 'starling-lm-7b-alpha',\n", " 'Model': 'Starling-LM-7B-alpha',\n", " 'Release Date': ''},\n", " {'key': 'llama-2-70b-chat', 'Model': 'Llama-2-70b-chat', 'Release Date': ''},\n", " {'key': 'nous-hermes-2-mixtral-8x7b-dpo',\n", " 'Model': 'Nous-Hermes-2-Mixtral-8x7B-DPO',\n", " 'Release Date': ''},\n", " {'key': 'gemma-1.1-7b-it', 'Model': 'Gemma-1.1-7B-it', 'Release Date': ''},\n", " {'key': 'llama2-70b-steerlm-chat',\n", " 'Model': 'NV-Llama2-70B-SteerLM-Chat',\n", " 'Release Date': ''},\n", " {'key': 'deepseek-llm-67b-chat',\n", " 'Model': 'DeepSeek-LLM-67B-Chat',\n", " 'Release Date': ''},\n", " {'key': 'openhermes-2.5-mistral-7b',\n", " 'Model': 'OpenHermes-2.5-Mistral-7b',\n", " 'Release Date': ''},\n", " {'key': 'openchat-3.5', 'Model': 'OpenChat-3.5', 'Release Date': ''},\n", " {'key': 'pplx-70b-online', 'Model': 'pplx-70b-online', 'Release Date': ''},\n", " {'key': 
'mistral-7b-instruct-v0.2',\n", " 'Model': 'Mistral-7B-Instruct-v0.2',\n", " 'Release Date': ''},\n", " {'key': 'qwen1.5-7b-chat', 'Model': 'Qwen1.5-7B-Chat', 'Release Date': ''},\n", " {'key': 'gpt-3.5-turbo-1106',\n", " 'Model': 'GPT-3.5-Turbo-1106',\n", " 'Release Date': ''},\n", " {'key': 'dolphin-2.2.1-mistral-7b',\n", " 'Model': 'Dolphin-2.2.1-Mistral-7B',\n", " 'Release Date': ''},\n", " {'key': 'solar-10.7b-instruct-v1.0',\n", " 'Model': 'SOLAR-10.7B-Instruct-v1.0',\n", " 'Release Date': ''},\n", " {'key': 'phi-3-mini-128k-instruct',\n", " 'Model': 'Phi-3-Mini-128k-Instruct',\n", " 'Release Date': ''},\n", " {'key': 'wizardlm-13b', 'Model': 'WizardLM-13b-v1.2', 'Release Date': ''},\n", " {'key': 'llama-2-13b-chat', 'Model': 'Llama-2-13b-chat', 'Release Date': ''},\n", " {'key': 'zephyr-7b-beta', 'Model': 'Zephyr-7b-beta', 'Release Date': ''},\n", " {'key': 'codellama-70b-instruct',\n", " 'Model': 'CodeLlama-70B-instruct',\n", " 'Release Date': ''},\n", " {'key': 'mpt-30b-chat', 'Model': 'MPT-30B-chat', 'Release Date': ''},\n", " {'key': 'vicuna-13b', 'Model': 'Vicuna-13B', 'Release Date': ''},\n", " {'key': 'codellama-34b-instruct',\n", " 'Model': 'CodeLlama-34B-instruct',\n", " 'Release Date': ''},\n", " {'key': 'gemma-7b-it', 'Model': 'Gemma-7B-it', 'Release Date': ''},\n", " {'key': 'pplx-7b-online', 'Model': 'pplx-7b-online', 'Release Date': ''},\n", " {'key': 'zephyr-7b-alpha', 'Model': 'Zephyr-7b-alpha', 'Release Date': ''},\n", " {'key': 'llama-2-7b-chat', 'Model': 'Llama-2-7b-chat', 'Release Date': ''},\n", " {'key': 'qwen-14b-chat', 'Model': 'Qwen-14B-Chat', 'Release Date': ''},\n", " {'key': 'falcon-180b-chat', 'Model': 'falcon-180b-chat', 'Release Date': ''},\n", " {'key': 'guanaco-33b', 'Model': 'Guanaco-33B', 'Release Date': ''},\n", " {'key': 'stripedhyena-nous-7b',\n", " 'Model': 'StripedHyena-Nous-7B',\n", " 'Release Date': ''},\n", " {'key': 'olmo-7b-instruct', 'Model': 'OLMo-7B-instruct', 'Release Date': ''},\n", " {'key': 
'gemma-1.1-2b-it', 'Model': 'Gemma-1.1-2B-it', 'Release Date': ''},\n", " {'key': 'mistral-7b-instruct',\n", " 'Model': 'Mistral-7B-Instruct-v0.1',\n", " 'Release Date': ''},\n", " {'key': 'palm-2', 'Model': 'PaLM-Chat-Bison-001', 'Release Date': ''},\n", " {'key': 'vicuna-7b', 'Model': 'Vicuna-7B', 'Release Date': ''},\n", " {'key': 'qwen1.5-4b-chat', 'Model': 'Qwen1.5-4B-Chat', 'Release Date': ''},\n", " {'key': 'gemma-2b-it', 'Model': 'Gemma-2B-it', 'Release Date': ''},\n", " {'key': 'koala-13b', 'Model': 'Koala-13B', 'Release Date': ''},\n", " {'key': 'chatglm3-6b', 'Model': 'ChatGLM3-6B', 'Release Date': ''},\n", " {'key': 'gpt4all-13b-snoozy',\n", " 'Model': 'GPT4All-13B-Snoozy',\n", " 'Release Date': ''},\n", " {'key': 'chatglm2-6b', 'Model': 'ChatGLM2-6B', 'Release Date': ''},\n", " {'key': 'mpt-7b-chat', 'Model': 'MPT-7B-Chat', 'Release Date': ''},\n", " {'key': 'RWKV-4-Raven-14B', 'Model': 'RWKV-4-Raven-14B', 'Release Date': ''},\n", " {'key': 'alpaca-13b', 'Model': 'Alpaca-13B', 'Release Date': ''},\n", " {'key': 'oasst-pythia-12b',\n", " 'Model': 'OpenAssistant-Pythia-12B',\n", " 'Release Date': ''},\n", " {'key': 'chatglm-6b', 'Model': 'ChatGLM-6B', 'Release Date': ''},\n", " {'key': 'fastchat-t5-3b', 'Model': 'FastChat-T5-3B', 'Release Date': ''},\n", " {'key': 'stablelm-tuned-alpha-7b',\n", " 'Model': 'StableLM-Tuned-Alpha-7B',\n", " 'Release Date': ''},\n", " {'key': 'dolly-v2-12b', 'Model': 'Dolly-V2-12B', 'Release Date': ''},\n", " {'key': 'llama-13b', 'Model': 'LLaMA-13B', 'Release Date': ''}]" ] }, "execution_count": 119, "metadata": {}, "output_type": "execute_result" } ], "source": [ "t.to_dict(orient=\"records\")" ] }, { "cell_type": "markdown", "metadata": {}, "source": [ "## Build plot" ] }, { "cell_type": "markdown", "metadata": {}, "source": [] } ], "metadata": { "kernelspec": { "display_name": ".venv", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": 
".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.11" } }, "nbformat": 4, "nbformat_minor": 2 }