{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 47,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "import pickle\n",
    "\n",
    "import pandas as pd\n",
    "from huggingface_hub import HfFileSystem, hf_hub_download"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Prepare data"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 72,
   "metadata": {},
   "outputs": [],
   "source": [
    "fs = HfFileSystem()\n",
    "\n",
    "\n",
    "def extract_date(filename):\n",
    "    \"\"\"Return the trailing stamp of a file name.\n",
    "\n",
    "    Takes the last path component, drops everything from its first dot\n",
    "    onwards and returns the text after the last underscore, e.g.\n",
    "    spaces/x/elo_results_20240426.pkl -> 20240426.\n",
    "    \"\"\"\n",
    "    return filename.split(\"/\")[-1].split(\".\")[0].split(\"_\")[-1]\n",
    "\n",
    "\n",
    "def pick_latest(paths, pattern):\n",
    "    \"\"\"Return the entry of `paths` whose stamp (see `extract_date`) is greatest.\n",
    "\n",
    "    `pattern` is only used for the error message.\n",
    "\n",
    "    Raises:\n",
    "        FileNotFoundError: if `paths` is empty, so an unmatched glob fails\n",
    "            with a clear message instead of an IndexError.\n",
    "    \"\"\"\n",
    "    if not paths:\n",
    "        raise FileNotFoundError(f\"no files match {pattern!r}\")\n",
    "    # Stamps are compared as strings; that orders them chronologically as\n",
    "    # long as they stay fixed-width like 20240426.\n",
    "    return max(paths, key=extract_date)\n",
    "\n",
    "\n",
    "ELO_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.pkl\"\n",
    "elo_files = fs.glob(ELO_DATA_FILES)\n",
    "latest_elo_file = pick_latest(elo_files, ELO_DATA_FILES)\n",
    "\n",
    "LEADERBOARD_DATA_FILES = \"spaces/lmsys/chatbot-arena-leaderboard/*.csv\"\n",
    "leaderboard_files = fs.glob(LEADERBOARD_DATA_FILES)\n",
    "latest_leaderboard_file = pick_latest(leaderboard_files, LEADERBOARD_DATA_FILES)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 73,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('leaderboard_table_20240426.csv', 'elo_results_20240426.pkl')"
      ]
     },
     "execution_count": 73,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "latest_leaderboard_file.split(\"/\")[-1], latest_elo_file.split(\"/\")[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 74,
   "metadata": {},
   "outputs": [],
   "source": [
    "LEADERBOARD_SPACE_ID = \"lmsys/chatbot-arena-leaderboard\"\n",
    "\n",
    "\n",
    "def download_space_file(remote_path):\n",
    "    \"\"\"Download one file from the leaderboard Space via `hf_hub_download`.\n",
    "\n",
    "    Only the last component of `remote_path` is passed as the file name.\n",
    "    Returns whatever `hf_hub_download` returns (the local file path).\n",
    "    \"\"\"\n",
    "    return hf_hub_download(\n",
    "        repo_id=LEADERBOARD_SPACE_ID,\n",
    "        filename=remote_path.split(\"/\")[-1],\n",
    "        repo_type=\"space\",\n",
    "    )\n",
    "\n",
    "\n",
    "latest_elo_file_local = download_space_file(latest_elo_file)\n",
    "latest_leaderboard_file_local = download_space_file(latest_leaderboard_file)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 76,
   "metadata": {},
   "outputs": [],
   "source": [
    "# load and prepare ELO data\n",
    "# Maps keys of the pickled results to the display name of each category.\n",
    "key_to_category_name = {\n",
    "    \"full\": \"Overall\",\n",
    "    \"coding\": \"Coding\",\n",
    "    \"long_user\": \"Longer Query\",\n",
    "    \"english\": \"English\",\n",
    "    \"chinese\": \"Chinese\",\n",
    "    \"french\": \"French\",\n",
    "    \"no_tie\": \"Exclude Ties\",\n",
    "    \"no_short\": \"Exclude Short Query (< 5 tokens)\",\n",
    "    \"no_refusal\": \"Exclude Refusal\",\n",
    "}\n",
    "# Maps each display name to a longer human-readable explanation.\n",
    "cat_name_to_explanation = {\n",
    "    \"Overall\": \"Overall Questions\",\n",
    "    \"Coding\": \"Coding: whether conversation contains code snippets\",\n",
    "    \"Longer Query\": \"Longer Query (>= 500 tokens)\",\n",
    "    \"English\": \"English Prompts\",\n",
    "    \"Chinese\": \"Chinese Prompts\",\n",
    "    \"French\": \"French Prompts\",\n",
    "    \"Exclude Ties\": \"Exclude Ties and Bothbad\",\n",
    "    \"Exclude Short Query (< 5 tokens)\": \"Exclude Short User Query (< 5 tokens)\",\n",
    "    \"Exclude Refusal\": 'Exclude model responses with refusal (e.g., \"I cannot answer\")',\n",
    "}\n",
    "\n",
    "# SECURITY: pickle.load can execute arbitrary code embedded in the file.\n",
    "# The file is downloaded from a remote Space; only run this if you trust it.\n",
    "with open(latest_elo_file_local, \"rb\") as fin:\n",
    "    elo_results = pickle.load(fin)\n",
    "\n",
    "# One leaderboard table per category; categories missing from the pickle\n",
    "# are skipped. Insertion order follows key_to_category_name.\n",
    "arena_dfs = {\n",
    "    category_name: elo_results[key][\"leaderboard_table_df\"]\n",
    "    for key, category_name in key_to_category_name.items()\n",
    "    if key in elo_results\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 77,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "dict_keys(['Overall', 'Coding', 'Longer Query', 'English', 'Chinese', 'French', 'Exclude Ties', 'Exclude Short Query (< 5 tokens)', 'Exclude Refusal'])"
      ]
     },
     "execution_count": 77,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "arena_dfs.keys()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 78,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "
\n", " | rating | \n", "variance | \n", "rating_q975 | \n", "rating_q025 | \n", "num_battles | \n", "final_ranking | \n", "
---|---|---|---|---|---|---|
RWKV-4-Raven-14B | \n", "927.710294 | \n", "27.143015 | \n", "935.717850 | \n", "916.546369 | \n", "5129 | \n", "81 | \n", "
alpaca-13b | \n", "907.324482 | \n", "20.736682 | \n", "915.536856 | \n", "899.330070 | \n", "6111 | \n", "85 | \n", "
bard-jan-24-gemini-pro | \n", "1208.505408 | \n", "6.679087 | \n", "1213.291358 | \n", "1203.926901 | \n", "12388 | \n", "6 | \n", "
chatglm-6b | \n", "886.107553 | \n", "17.110417 | \n", "894.034333 | \n", "878.094776 | \n", "5195 | \n", "86 | \n", "
chatglm2-6b | \n", "932.678460 | \n", "33.530570 | \n", "943.455598 | \n", "921.346322 | \n", "2880 | \n", "81 | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
wizardlm-70b | \n", "1107.992552 | \n", "9.385887 | \n", "1114.218223 | \n", "1102.655575 | \n", "8868 | \n", "29 | \n", "
yi-34b-chat | \n", "1109.722447 | \n", "8.596908 | \n", "1115.182579 | \n", "1103.991095 | \n", "12252 | \n", "29 | \n", "
zephyr-7b-alpha | \n", "1042.108710 | \n", "43.900714 | \n", "1052.991768 | \n", "1027.160917 | \n", "1901 | \n", "58 | \n", "
zephyr-7b-beta | \n", "1053.655680 | \n", "10.297607 | \n", "1059.923254 | \n", "1047.601629 | \n", "11924 | \n", "54 | \n", "
zephyr-orpo-141b-A35b-v0.1 | \n", "1124.677515 | \n", "22.288515 | \n", "1132.728887 | \n", "1113.848432 | \n", "4276 | \n", "22 | \n", "
91 rows × 6 columns
\n", "\n", " | key | \n", "Model | \n", "MT-bench (score) | \n", "MMLU | \n", "Knowledge cutoff date | \n", "License | \n", "Organization | \n", "Link | \n", "
---|---|---|---|---|---|---|---|---|
0 | \n", "wizardlm-30b | \n", "WizardLM-30B | \n", "7.01 | \n", "0.587 | \n", "2023/6 | \n", "Non-commercial | \n", "Microsoft | \n", "https://huggingface.co/WizardLM/WizardLM-30B-V1.0 | \n", "
1 | \n", "vicuna-13b-16k | \n", "Vicuna-13B-16k | \n", "6.92 | \n", "0.545 | \n", "2023/7 | \n", "Llama 2 Community | \n", "LMSYS | \n", "https://huggingface.co/lmsys/vicuna-13b-v1.5-16k | \n", "
2 | \n", "wizardlm-13b-v1.1 | \n", "WizardLM-13B-v1.1 | \n", "6.76 | \n", "0.500 | \n", "2023/7 | \n", "Non-commercial | \n", "Microsoft | \n", "https://huggingface.co/WizardLM/WizardLM-13B-V1.1 | \n", "
3 | \n", "tulu-30b | \n", "Tulu-30B | \n", "6.43 | \n", "0.581 | \n", "2023/6 | \n", "Non-commercial | \n", "AllenAI/UW | \n", "https://huggingface.co/allenai/tulu-30b | \n", "
4 | \n", "guanaco-65b | \n", "Guanaco-65B | \n", "6.41 | \n", "0.621 | \n", "2023/5 | \n", "Non-commercial | \n", "UW | \n", "https://huggingface.co/timdettmers/guanaco-65b... | \n", "
... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "... | \n", "
100 | \n", "mixtral-8x22b-instruct-v0.1 | \n", "Mixtral-8x22b-Instruct-v0.1 | \n", "- | \n", "0.778 | \n", "2024/4 | \n", "Apache 2.0 | \n", "Mistral | \n", "https://mistral.ai/news/mixtral-8x22b/ | \n", "
101 | \n", "llama-3-70b-instruct | \n", "Llama-3-70b-Instruct | \n", "- | \n", "0.820 | \n", "2023/12 | \n", "Llama 3 Community | \n", "Meta | \n", "https://llama.meta.com/llama3/ | \n", "
102 | \n", "llama-3-8b-instruct | \n", "Llama-3-8b-Instruct | \n", "- | \n", "0.684 | \n", "2023/3 | \n", "Llama 3 Community | \n", "Meta | \n", "https://llama.meta.com/llama3/ | \n", "
103 | \n", "gemini-1.5-pro-api-0409-preview | \n", "Gemini 1.5 Pro API-0409-Preview | \n", "- | \n", "0.819 | \n", "2023/11 | \n", "Proprietary | \n", "https://blog.google/technology/ai/google-gemin... | \n", "|
104 | \n", "phi-3-mini-128k-instruct | \n", "Phi-3-Mini-128k-Instruct | \n", "- | \n", "0.681 | \n", "2023/10 | \n", "MIT | \n", "Microsoft | \n", "https://azure.microsoft.com/en-us/blog/introdu... | \n", "
105 rows × 8 columns
\n", "\n", " | rating | \n", "variance | \n", "rating_q975 | \n", "rating_q025 | \n", "num_battles | \n", "final_ranking | \n", "key | \n", "Model | \n", "MT-bench (score) | \n", "MMLU | \n", "Knowledge cutoff date | \n", "License | \n", "Organization | \n", "Link | \n", "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", "1257.399407 | \n", "4.283316 | \n", "1261.676224 | \n", "1254.003626 | \n", "30562 | \n", "1 | \n", "gpt-4-turbo-2024-04-09 | \n", "GPT-4-Turbo-2024-04-09 | \n", "- | \n", "- | \n", "2023/12 | \n", "Proprietary | \n", "OpenAI | \n", "https://platform.openai.com/docs/models/gpt-4-... | \n", "
1 | \n", "1253.025095 | \n", "2.069534 | \n", "1256.111392 | \n", "1250.435207 | \n", "69871 | \n", "1 | \n", "gpt-4-1106-preview | \n", "GPT-4-1106-preview | \n", "9.32 | \n", "- | \n", "2023/4 | \n", "Proprietary | \n", "OpenAI | \n", "https://openai.com/blog/new-models-and-develop... | \n", "
2 | \n", "1251.114220 | \n", "1.862842 | \n", "1253.629093 | \n", "1248.362042 | \n", "75684 | \n", "2 | \n", "claude-3-opus-20240229 | \n", "Claude 3 Opus | \n", "- | \n", "0.868 | \n", "2023/8 | \n", "Proprietary | \n", "Anthropic | \n", "https://www.anthropic.com/news/claude-3-family | \n", "
3 | \n", "1247.662508 | \n", "3.263747 | \n", "1251.582645 | \n", "1244.380454 | \n", "33723 | \n", "2 | \n", "gemini-1.5-pro-api-0409-preview | \n", "Gemini 1.5 Pro API-0409-Preview | \n", "- | \n", "0.819 | \n", "2023/11 | \n", "Proprietary | \n", "https://blog.google/technology/ai/google-gemin... | \n", "|
4 | \n", "1247.277052 | \n", "1.923014 | \n", "1249.489411 | \n", "1244.340257 | \n", "61924 | \n", "3 | \n", "gpt-4-0125-preview | \n", "GPT-4-0125-preview | \n", "- | \n", "- | \n", "2023/12 | \n", "Proprietary | \n", "OpenAI | \n", "https://openai.com/blog/new-models-and-develop... | \n", "
5 | \n", "1208.505408 | \n", "6.679087 | \n", "1213.291358 | \n", "1203.926901 | \n", "12388 | \n", "6 | \n", "bard-jan-24-gemini-pro | \n", "Bard (Gemini Pro) | \n", "- | \n", "- | \n", "Online | \n", "Proprietary | \n", "https://bard.google.com/ | \n", "|
6 | \n", "1207.497541 | \n", "4.109466 | \n", "1211.720734 | \n", "1203.322762 | \n", "27298 | \n", "6 | \n", "llama-3-70b-instruct | \n", "Llama-3-70b-Instruct | \n", "- | \n", "0.820 | \n", "2023/12 | \n", "Llama 3 Community | \n", "Meta | \n", "https://llama.meta.com/llama3/ | \n", "
7 | \n", "1201.671254 | \n", "2.525563 | \n", "1204.862512 | \n", "1198.658822 | \n", "75418 | \n", "6 | \n", "claude-3-sonnet-20240229 | \n", "Claude 3 Sonnet | \n", "- | \n", "0.790 | \n", "2023/8 | \n", "Proprietary | \n", "Anthropic | \n", "https://www.anthropic.com/news/claude-3-family | \n", "
8 | \n", "1191.684542 | \n", "3.459717 | \n", "1195.080256 | \n", "1188.222382 | \n", "41262 | \n", "9 | \n", "command-r-plus | \n", "Command R+ | \n", "- | \n", "- | \n", "2024/3 | \n", "CC-BY-NC-4.0 | \n", "Cohere | \n", "https://txt.cohere.com/command-r-plus-microsof... | \n", "
9 | \n", "1188.987389 | \n", "3.124792 | \n", "1193.335535 | \n", "1185.935928 | \n", "48390 | \n", "9 | \n", "gpt-4-0314 | \n", "GPT-4-0314 | \n", "8.96 | \n", "0.864 | \n", "2021/9 | \n", "Proprietary | \n", "OpenAI | \n", "https://openai.com/research/gpt-4 | \n", "
10 | \n", "1180.606870 | \n", "3.097542 | \n", "1183.825403 | \n", "1177.255203 | \n", "66065 | \n", "11 | \n", "claude-3-haiku-20240307 | \n", "Claude 3 Haiku | \n", "- | \n", "0.752 | \n", "2023/8 | \n", "Proprietary | \n", "Anthropic | \n", "https://www.anthropic.com/news/claude-3-family | \n", "
11 | \n", "1164.896561 | \n", "2.585577 | \n", "1167.595696 | \n", "1161.727454 | \n", "67038 | \n", "12 | \n", "gpt-4-0613 | \n", "GPT-4-0613 | \n", "9.18 | \n", "- | \n", "2021/9 | \n", "Proprietary | \n", "OpenAI | \n", "https://platform.openai.com/docs/models/gpt-4-... | \n", "
12 | \n", "1157.638992 | \n", "2.541320 | \n", "1160.496116 | \n", "1154.927748 | \n", "44120 | \n", "13 | \n", "mistral-large-2402 | \n", "Mistral-Large-2402 | \n", "- | \n", "0.812 | \n", "- | \n", "Proprietary | \n", "Mistral | \n", "https://mistral.ai/news/mistral-large/ | \n", "
13 | \n", "1153.464280 | \n", "3.631512 | \n", "1157.068850 | \n", "1150.178903 | \n", "32999 | \n", "13 | \n", "qwen1.5-72b-chat | \n", "Qwen1.5-72B-Chat | \n", "8.61 | \n", "0.775 | \n", "2024/2 | \n", "Qianwen LICENSE | \n", "Alibaba | \n", "https://qwenlm.github.io/blog/qwen1.5/ | \n", "
14 | \n", "1150.918473 | \n", "9.062217 | \n", "1155.969721 | \n", "1145.229885 | \n", "8622 | \n", "13 | \n", "reka-flash-21b-20240226-online | \n", "Reka-Flash-21B-online | \n", "- | \n", "- | \n", "Online | \n", "Proprietary | \n", "Reka AI | \n", "https://docs.reka.ai/http-api.html#generation | \n", "
15 | \n", "1150.244313 | \n", "5.551373 | \n", "1154.745214 | \n", "1145.496466 | \n", "21768 | \n", "14 | \n", "claude-1 | \n", "Claude-1 | \n", "7.90 | \n", "0.770 | \n", "- | \n", "Proprietary | \n", "Anthropic | \n", "https://www.anthropic.com/index/introducing-cl... | \n", "
16 | \n", "1149.267578 | \n", "11.452272 | \n", "1154.290155 | \n", "1141.931621 | \n", "9059 | \n", "14 | \n", "reka-flash-21b-20240226 | \n", "Reka-Flash-21B | \n", "- | \n", "0.735 | \n", "2023/11 | \n", "Proprietary | \n", "Reka AI | \n", "https://www.reka.ai/news/reka-flash-efficient-... | \n", "
17 | \n", "1148.072155 | \n", "3.071222 | \n", "1151.980865 | \n", "1144.992044 | \n", "37413 | \n", "14 | \n", "command-r | \n", "Command R | \n", "- | \n", "- | \n", "2024/3 | \n", "CC-BY-NC-4.0 | \n", "Cohere | \n", "https://txt.cohere.com/command-r | \n", "
18 | \n", "1147.668325 | \n", "3.542229 | \n", "1150.726489 | \n", "1143.868385 | \n", "32738 | \n", "14 | \n", "mistral-medium | \n", "Mistral Medium | \n", "8.61 | \n", "0.753 | \n", "- | \n", "Proprietary | \n", "Mistral | \n", "https://mistral.ai/news/la-plateforme/ | \n", "
19 | \n", "1147.473989 | \n", "5.789710 | \n", "1151.989352 | \n", "1143.322918 | \n", "17214 | \n", "14 | \n", "mixtral-8x22b-instruct-v0.1 | \n", "Mixtral-8x22b-Instruct-v0.1 | \n", "- | \n", "0.778 | \n", "2024/4 | \n", "Apache 2.0 | \n", "Mistral | \n", "https://mistral.ai/news/mixtral-8x22b/ | \n", "