{ "cells": [ { "cell_type": "code", "execution_count": 1, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "100\n", "200\n", "300\n", "400\n", "500\n", "600\n", "700\n", "800\n", "900\n", "1000\n", "1100\n", "1200\n", "1300\n", "1400\n", "1500\n", "1600\n", "1700\n", "1800\n", "full_model_names\n", "1889\n", "organization_names\n", "12\n", "['Parameters', 'drop|3', 'gsm8k', 'MMLU_average', 'winogrande', 'all', 'arc:challenge|25', 'hellaswag|10', 'MMLU_abstract_algebra', 'MMLU_anatomy', 'MMLU_astronomy', 'MMLU_business_ethics', 'MMLU_clinical_knowledge', 'MMLU_college_biology', 'MMLU_college_chemistry', 'MMLU_college_computer_science', 'MMLU_college_mathematics', 'MMLU_college_medicine', 'MMLU_college_physics', 'MMLU_computer_security', 'MMLU_conceptual_physics', 'MMLU_econometrics', 'MMLU_electrical_engineering', 'MMLU_elementary_mathematics', 'MMLU_formal_logic', 'MMLU_global_facts', 'MMLU_high_school_biology', 'MMLU_high_school_chemistry', 'MMLU_high_school_computer_science', 'MMLU_high_school_european_history', 'MMLU_high_school_geography', 'MMLU_high_school_government_and_politics', 'MMLU_high_school_macroeconomics', 'MMLU_high_school_mathematics', 'MMLU_high_school_microeconomics', 'MMLU_high_school_physics', 'MMLU_high_school_psychology', 'MMLU_high_school_statistics', 'MMLU_high_school_us_history', 'MMLU_high_school_world_history', 'MMLU_human_aging', 'MMLU_human_sexuality', 'MMLU_international_law', 'MMLU_jurisprudence', 'MMLU_logical_fallacies', 'MMLU_machine_learning', 'MMLU_management', 'MMLU_marketing', 'MMLU_medical_genetics', 'MMLU_miscellaneous', 'MMLU_moral_disputes', 'MMLU_moral_scenarios', 'MMLU_nutrition', 'MMLU_philosophy', 'MMLU_prehistory', 'MMLU_professional_accounting', 'MMLU_professional_law', 'MMLU_professional_medicine', 'MMLU_professional_psychology', 'MMLU_public_relations', 'MMLU_security_studies', 'MMLU_sociology', 'MMLU_us_foreign_policy', 'MMLU_virology', 'MMLU_world_religions', 'truthfulqa:mc|0', 'full_model_name']\n" ] } ], "source": [ "from result_data_processor import ResultDataProcessor\n", "result = ResultDataProcessor()" ] }, { "cell_type": "code", "execution_count": 2, "metadata": {}, "outputs": [ { "data": { "text/html": [ "
\n", "\n", "\n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", " \n", "
URLfull_model_nameParametersMMLU_averagearc:challenge|25hellaswag|10MMLU_abstract_algebraMMLU_anatomyMMLU_astronomyMMLU_business_ethics...MMLU_professional_accountingMMLU_professional_lawMMLU_professional_medicineMMLU_professional_psychologyMMLU_public_relationsMMLU_security_studiesMMLU_sociologyMMLU_us_foreign_policyMMLU_virologyMMLU_world_religions
SparseOPT-1.3Bhttps://huggingface.co/shaohang/SparseOPT-1.3Bshaohang/SparseOPT-1.3B1.30.2559630.2406140.3836890.220.2148150.1578950.20...0.2624110.2385920.4485290.2549020.2363640.1714290.2288560.270.2831330.216374
Athena-v1https://huggingface.co/IkariDev/Athena-v1IkariDev/Athena-v1NaN0.5560520.5605800.6315480.310.4962960.5263160.58...0.4042550.3924380.5257350.5408500.6454550.6408160.7512440.830.4939760.725146
Athena-tmphttps://huggingface.co/IkariDev/Athena-tmpIkariDev/Athena-tmpNaN0.5886850.5674060.6218880.290.5185190.6381580.62...0.4503550.4628420.5698530.5882350.6454550.6530610.7213930.810.4638550.801170
13B-Legerdemain-L2https://huggingface.co/CalderaAI/13B-Legerdema...CalderaAI/13B-Legerdemain-L213.00.5600300.5733790.6354310.360.5259260.5723680.53...0.4290780.4243810.5220590.5326800.6090910.6367350.7661690.870.4277110.777778
13B-Ouroboroshttps://huggingface.co/CalderaAI/13B-OuroborosCalderaAI/13B-Ouroboros13.00.5143110.5605800.6243780.310.4666670.5065790.52...0.3652480.4054760.4816180.5245100.6090910.5387760.6915420.830.4578310.760234
..................................................................
Robin-v2https://huggingface.co/HanningZhang/Robin-v2HanningZhang/Robin-v2NaN0.3926800.4351540.5453100.320.4370370.3355260.46...0.2907800.3024770.3823530.3741830.4454550.3265310.4577110.590.3795180.590643
CodeUp-Llama-2-13b-chat-hfhttps://huggingface.co/deepse/CodeUp-Llama-2-1...deepse/CodeUp-Llama-2-13b-chat-hf13.00.5462620.5580200.6292570.310.4740740.5460530.53...0.3900710.3917860.5000000.5441180.6636360.6367350.7512440.810.4819280.730994
Hermes-Platypus2-mini-7Bhttps://huggingface.co/edor/Hermes-Platypus2-m...edor/Hermes-Platypus2-mini-7B7.00.4708280.5230380.6015730.330.4888890.4210530.48...0.3900710.3539770.4705880.4460780.5181820.5632650.6218910.680.4216870.637427
Stable-Platypus2-mini-7Bhttps://huggingface.co/edor/Stable-Platypus2-m...edor/Stable-Platypus2-mini-7B7.00.5178000.5238910.5965940.370.4888890.4078950.50...0.3900710.3917860.5183820.5098040.6181820.6571430.6318410.730.4277110.695906
llava-v1.5-13b-hfhttps://huggingface.co/Community-LM/llava-v1.5...Community-LM/llava-v1.5-13b-hf13.00.5688680.5324230.6011750.300.4962960.5855260.67...0.4078010.4159060.5477940.5784310.6000000.6530610.7611940.810.5060240.795322
\n", "

1121 rows × 63 columns

\n", "
" ], "text/plain": [ " URL \\\n", "SparseOPT-1.3B https://huggingface.co/shaohang/SparseOPT-1.3B \n", "Athena-v1 https://huggingface.co/IkariDev/Athena-v1 \n", "Athena-tmp https://huggingface.co/IkariDev/Athena-tmp \n", "13B-Legerdemain-L2 https://huggingface.co/CalderaAI/13B-Legerdema... \n", "13B-Ouroboros https://huggingface.co/CalderaAI/13B-Ouroboros \n", "... ... \n", "Robin-v2 https://huggingface.co/HanningZhang/Robin-v2 \n", "CodeUp-Llama-2-13b-chat-hf https://huggingface.co/deepse/CodeUp-Llama-2-1... \n", "Hermes-Platypus2-mini-7B https://huggingface.co/edor/Hermes-Platypus2-m... \n", "Stable-Platypus2-mini-7B https://huggingface.co/edor/Stable-Platypus2-m... \n", "llava-v1.5-13b-hf https://huggingface.co/Community-LM/llava-v1.5... \n", "\n", " full_model_name Parameters \\\n", "SparseOPT-1.3B shaohang/SparseOPT-1.3B 1.3 \n", "Athena-v1 IkariDev/Athena-v1 NaN \n", "Athena-tmp IkariDev/Athena-tmp NaN \n", "13B-Legerdemain-L2 CalderaAI/13B-Legerdemain-L2 13.0 \n", "13B-Ouroboros CalderaAI/13B-Ouroboros 13.0 \n", "... ... ... \n", "Robin-v2 HanningZhang/Robin-v2 NaN \n", "CodeUp-Llama-2-13b-chat-hf deepse/CodeUp-Llama-2-13b-chat-hf 13.0 \n", "Hermes-Platypus2-mini-7B edor/Hermes-Platypus2-mini-7B 7.0 \n", "Stable-Platypus2-mini-7B edor/Stable-Platypus2-mini-7B 7.0 \n", "llava-v1.5-13b-hf Community-LM/llava-v1.5-13b-hf 13.0 \n", "\n", " MMLU_average arc:challenge|25 hellaswag|10 \\\n", "SparseOPT-1.3B 0.255963 0.240614 0.383689 \n", "Athena-v1 0.556052 0.560580 0.631548 \n", "Athena-tmp 0.588685 0.567406 0.621888 \n", "13B-Legerdemain-L2 0.560030 0.573379 0.635431 \n", "13B-Ouroboros 0.514311 0.560580 0.624378 \n", "... ... ... ... \n", "Robin-v2 0.392680 0.435154 0.545310 \n", "CodeUp-Llama-2-13b-chat-hf 0.546262 0.558020 0.629257 \n", "Hermes-Platypus2-mini-7B 0.470828 0.523038 0.601573 \n", "Stable-Platypus2-mini-7B 0.517800 0.523891 0.596594 \n", "llava-v1.5-13b-hf 0.568868 0.532423 0.601175 \n", "\n", " MMLU_abstract_algebra MMLU_anatomy \\\n", "SparseOPT-1.3B 0.22 0.214815 \n", "Athena-v1 0.31 0.496296 \n", "Athena-tmp 0.29 0.518519 \n", "13B-Legerdemain-L2 0.36 0.525926 \n", "13B-Ouroboros 0.31 0.466667 \n", "... ... ... \n", "Robin-v2 0.32 0.437037 \n", "CodeUp-Llama-2-13b-chat-hf 0.31 0.474074 \n", "Hermes-Platypus2-mini-7B 0.33 0.488889 \n", "Stable-Platypus2-mini-7B 0.37 0.488889 \n", "llava-v1.5-13b-hf 0.30 0.496296 \n", "\n", " MMLU_astronomy MMLU_business_ethics ... \\\n", "SparseOPT-1.3B 0.157895 0.20 ... \n", "Athena-v1 0.526316 0.58 ... \n", "Athena-tmp 0.638158 0.62 ... \n", "13B-Legerdemain-L2 0.572368 0.53 ... \n", "13B-Ouroboros 0.506579 0.52 ... \n", "... ... ... ... \n", "Robin-v2 0.335526 0.46 ... \n", "CodeUp-Llama-2-13b-chat-hf 0.546053 0.53 ... \n", "Hermes-Platypus2-mini-7B 0.421053 0.48 ... \n", "Stable-Platypus2-mini-7B 0.407895 0.50 ... \n", "llava-v1.5-13b-hf 0.585526 0.67 ... \n", "\n", " MMLU_professional_accounting \\\n", "SparseOPT-1.3B 0.262411 \n", "Athena-v1 0.404255 \n", "Athena-tmp 0.450355 \n", "13B-Legerdemain-L2 0.429078 \n", "13B-Ouroboros 0.365248 \n", "... ... \n", "Robin-v2 0.290780 \n", "CodeUp-Llama-2-13b-chat-hf 0.390071 \n", "Hermes-Platypus2-mini-7B 0.390071 \n", "Stable-Platypus2-mini-7B 0.390071 \n", "llava-v1.5-13b-hf 0.407801 \n", "\n", " MMLU_professional_law MMLU_professional_medicine \\\n", "SparseOPT-1.3B 0.238592 0.448529 \n", "Athena-v1 0.392438 0.525735 \n", "Athena-tmp 0.462842 0.569853 \n", "13B-Legerdemain-L2 0.424381 0.522059 \n", "13B-Ouroboros 0.405476 0.481618 \n", "... ... ... 
\n", "Robin-v2 0.302477 0.382353 \n", "CodeUp-Llama-2-13b-chat-hf 0.391786 0.500000 \n", "Hermes-Platypus2-mini-7B 0.353977 0.470588 \n", "Stable-Platypus2-mini-7B 0.391786 0.518382 \n", "llava-v1.5-13b-hf 0.415906 0.547794 \n", "\n", " MMLU_professional_psychology \\\n", "SparseOPT-1.3B 0.254902 \n", "Athena-v1 0.540850 \n", "Athena-tmp 0.588235 \n", "13B-Legerdemain-L2 0.532680 \n", "13B-Ouroboros 0.524510 \n", "... ... \n", "Robin-v2 0.374183 \n", "CodeUp-Llama-2-13b-chat-hf 0.544118 \n", "Hermes-Platypus2-mini-7B 0.446078 \n", "Stable-Platypus2-mini-7B 0.509804 \n", "llava-v1.5-13b-hf 0.578431 \n", "\n", " MMLU_public_relations MMLU_security_studies \\\n", "SparseOPT-1.3B 0.236364 0.171429 \n", "Athena-v1 0.645455 0.640816 \n", "Athena-tmp 0.645455 0.653061 \n", "13B-Legerdemain-L2 0.609091 0.636735 \n", "13B-Ouroboros 0.609091 0.538776 \n", "... ... ... \n", "Robin-v2 0.445455 0.326531 \n", "CodeUp-Llama-2-13b-chat-hf 0.663636 0.636735 \n", "Hermes-Platypus2-mini-7B 0.518182 0.563265 \n", "Stable-Platypus2-mini-7B 0.618182 0.657143 \n", "llava-v1.5-13b-hf 0.600000 0.653061 \n", "\n", " MMLU_sociology MMLU_us_foreign_policy \\\n", "SparseOPT-1.3B 0.228856 0.27 \n", "Athena-v1 0.751244 0.83 \n", "Athena-tmp 0.721393 0.81 \n", "13B-Legerdemain-L2 0.766169 0.87 \n", "13B-Ouroboros 0.691542 0.83 \n", "... ... ... \n", "Robin-v2 0.457711 0.59 \n", "CodeUp-Llama-2-13b-chat-hf 0.751244 0.81 \n", "Hermes-Platypus2-mini-7B 0.621891 0.68 \n", "Stable-Platypus2-mini-7B 0.631841 0.73 \n", "llava-v1.5-13b-hf 0.761194 0.81 \n", "\n", " MMLU_virology MMLU_world_religions \n", "SparseOPT-1.3B 0.283133 0.216374 \n", "Athena-v1 0.493976 0.725146 \n", "Athena-tmp 0.463855 0.801170 \n", "13B-Legerdemain-L2 0.427711 0.777778 \n", "13B-Ouroboros 0.457831 0.760234 \n", "... ... ... \n", "Robin-v2 0.379518 0.590643 \n", "CodeUp-Llama-2-13b-chat-hf 0.481928 0.730994 \n", "Hermes-Platypus2-mini-7B 0.421687 0.637427 \n", "Stable-Platypus2-mini-7B 0.427711 0.695906 \n", "llava-v1.5-13b-hf 0.506024 0.795322 \n", "\n", "[1121 rows x 63 columns]" ] }, "execution_count": 2, "metadata": {}, "output_type": "execute_result" } ], "source": [ "df = result.data\n", "df" ] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] } ], "metadata": { "kernelspec": { "display_name": "mmlu", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.10.12" } }, "nbformat": 4, "nbformat_minor": 2 }