diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|arc:challenge|25_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|arc:challenge|25_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e430821484b9bdfbb499c4ebd25b15f40e91c8e2 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|arc:challenge|25_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43067efd73a2e441ee5f5574d0291b3ac22ce492f57038791cc134e55ca46e6d +size 5638936 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|gsm8k|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|gsm8k|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..b494cd6ea81ca6e74324f949add613032f1fee39 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|gsm8k|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5aed86fefb6bf2511af4826a12b97d2b0176474fb896bd2489e515b3028c78f6 +size 5018157 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|hellaswag|10_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|hellaswag|10_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..800bfc74bf2f2f74c9f2cd8e677be3772e2ced0f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|hellaswag|10_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2025e8bd756f130758acf09d7dca3d66858c6aa4d9ec836c2b176eaafe9712b8 +size 57616414 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..88db8281f7d55e1bf8b4b3ca7f8202f421585c08 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:abstract_algebra|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1ea3ca236ec7ddb977849da10813ff8ada8394c2173baf829fc2f72a2998464 +size 72484 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..783a104364e5f346492448f5ebed4be5d1113f19 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:anatomy|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b62ed0ed5348d17a67788ca7b9931387c64fcf7f60936d068bdd2c7daa0f46dd +size 115298 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..38a834f6bdeabcd5ee4477ca0ec9d21559756127 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:astronomy|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23b627420487c3aebd90bd6602c64f65fb14774f022663501f5cb2bb43a0b1d3 +size 180557 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..cdfa3bd49f9ff48dbcb9a2cb78b56da3d209962d --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:business_ethics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:430e858bf908cc4a6eec9e918e787b633baa0a55325a0c41136771971136c702 +size 132584 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..bf43e4440978d202af4cf2bd86e066152e8c8b0c --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:clinical_knowledge|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4946477265a71d045fdc30010d2fcae808a84a157c0bfac6313ebb53effaeaa5 +size 221395 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d72a00ace9fb4b261fa68a58a090f8af7bdb88d7 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_biology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:297169790fe1ff4a20dfaa651013157b09b75d1d2379c829af6a7e8066cb8bc2 +size 167403 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..71b7b1cf6837ce69c1aa2ed72069fd377fe18be3 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_chemistry|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1cbdc287b32ff62da5877da8a8396636719687bb527d67d6561d997ebac7d2b1 +size 112835 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7d8518e7e79b42b6fd85b25a6a151e54683c21be --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_computer_science|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ddb03075b92ec14d26a6fcaef4d83463f60e8c9fef099fa3753bd195c794a9d9 +size 173750 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..3f7b4c2f2de5f1b78d27b3d0722e16f78b0a4265 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_mathematics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a841aa7721b153137f64d36b2f94015b2335f4c31009b3d0e71068f9a274b64 +size 108453 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4b2833c7618d8d260cacf235ac4dfd6f5ec77fa6 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_medicine|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d17a348a36e1dcc29412102586f44780119d75b18d086dafd4581a77290eeb01 +size 235015 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..db2b21fd28aa2dcbdd7c694dea4ebee80a0caa37 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:college_physics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:190e43f712b89b86648a5e19e1e472f2126066102a11f5979cce070677c85963 +size 111043 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c9d906ccac24bf281f16584e76a727eb61cd7299 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:computer_security|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5451e43fa504948425a543accead5ed374fd103576c0dc977d3024f2716f82c8 +size 107050 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ee72e8dc65d93e58351ce5a717dc80f13be3904e --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:conceptual_physics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39483319c39a0c91d0ccfe5a1e62a7d3b2fc116a11a0ec3e7e1f6fd8db30f0fa +size 150464 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..1de89e2cc1cecc4ce023f4ccb77c4ba4d4575763 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:econometrics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcedd649fc64eb6e0f59007d54d566cf24646d71d33652b972c3fb705653bdfa +size 151731 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..81ea3357f96cf0c10ef317de9c3aff33bade8591 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:electrical_engineering|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6fae533a0d2d9a2bc48fa7aee84877ad69bb12e93df8796bac2c81dfa01c4f7 +size 112865 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..df56942ab64c44206c5ceca700c5690527c7764c --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:elementary_mathematics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3d50f146baa9b12c712de52fa7e106b9f905c9e7e4c94eddafd3e4c558bd54c +size 293188 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ddd97cc79da9f0f41c9d1fdfcc8a2e97363bff6b --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:formal_logic|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b72391090fddff98bd34e97c4894cd446bab61e83af229f3bc42af3a3314db0 +size 145123 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..44547d5ab16aec05efbf4a8f9b477e005fd84489 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:global_facts|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5036b473019f271930c4c497324db592b555421af2c9fd3341e73b30e73b309e +size 85080 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d4a28ed60a380d735f8bd8603d14ff1bf8005430 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_biology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b694dab73bec8bddc15490a5d180585e29dc6a3b51580b77e51bc7e659696dc +size 331903 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4a763e701f759a3ee99b70708c0ebfd8094c52bc --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_chemistry|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb49628a895543d1cff7b71dd719b73adbb284816f26b5b52eccc07f430f908d +size 190140 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..ee43ffc92582d5bd3585e0b3cde3373cee33a037 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_computer_science|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a90cb819b879b98b8e70b95c7e1ff4b7ebfd290d35309dc7f6a497947181bbeb +size 168894 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..16f024e6b1605403b92f9bf1867cb76523bf74df --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_european_history|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29fbffd35879812799456b92a4336ef5a8a10340141a5069600ba93e0a9b0ddb +size 1336126 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0e797e1c797265f537bf5e824b4cbdf92ea0e067 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_geography|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6903218671a3bd1fa06ca59aeaba1888d745cfc44fc8e58abcce0193e71a8609 +size 164759 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a466ffdf2f4058e0972d734e2ebebd68ffa41d4f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_government_and_politics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8033e3740c06d5d98df9a36ba856e3fb8fcc56d1165f5d00cada5416303a6389 +size 215003 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0af59cf83bafd410b27a690009a0249616fc9b0f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_macroeconomics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e9de4a912633a1ede4d863afa91bb068aca6c02440b13353649c9598bc36c97 +size 308271 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..87a93288626203c44305b72ec5a2575edcbbddb7 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_mathematics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e9257e4d87d8537d570f6d3d35fa68c5c202f6b73d06dfa9d1f93d005a8ac12 +size 229311 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8e85b458e42c8b1454919432dbd0f91af26a6201 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_microeconomics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8e7e27e399849c477b6ff65ee26695eb0f33d9cb569ed1ffcf92fca86083c96 +size 222421 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4c0eb5f0878633258e98589ff8d4446765a9360c --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_physics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4222e18cb3f7d3426efdd5d42d93fd2f5b9d663dea3eb514bf0245a0aaf5767 +size 183087 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0cc2ddd753ba663d016b0caa3f1cc36239986a67 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_psychology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2474758537f5465f9ed3b91e8c1345081998e99ac26369bfb099317ec745282 +size 538851 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..567f799e82a16bed5d307f20feb3357da2bea579 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_statistics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b06cec1cb83691f1090de3e7dd83444e2e7d53d9ed2a36181caebc74c77dadd +size 329487 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..4c41727030e42f1ebf3219f7a56fde10bd417f89 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_us_history|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3ace1c14d84cfe600e335a895501c6f07f71f092fc08442e66b585b3878d766 +size 1212408 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d6822f30ef00d7147c664512c07e103e31f74cef --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:high_school_world_history|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:408437c3cab737f7147a9d971085009550dbf4e1f7b0d6ba4f9ce02f0d4556a4 +size 961990 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fb00ae7a4e7b24bc489116de42d5853e910de7ca --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_aging|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed6f3bd608eaf5e7b228be2650732b90b3a1fd0a3a1ce709277c62d374838cff +size 168183 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d74e17f503dd66fa2b60386604bfeb21aca5cce6 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:human_sexuality|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d47e7605acbd96ca67cbd8814dd1031cf6dcfd220a1ec0560a962f138a6523de +size 119472 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:international_law|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:international_law|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a9fed7e5d196ebc96ea137c35aa9fc185f92b514 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:international_law|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8a77b5540597cbcd65589b171c6c2a1d634b1a588789b4434249af8d8c3b377 +size 173839 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c00e2cf794cf863325183d5b3ebc54c1c32acad3 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:jurisprudence|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c46ea19e1227ac1ad1a81a8fe3c78de03131f0f1d0e3f92026de8adb82b306a2 +size 128241 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7eed31897b5e5b03679c4ec5e9189f99465e913f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:logical_fallacies|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26610e185fb2a42f13c72dd5d2145ebe8a619327dbdab8f2a62ad45e50391120 +size 156175 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e6e78d29aa34f788fb666c82320581f5dcaa7762 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:machine_learning|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d363e6d187b89c85f38b4f4db6a6fbc1cdf3cf92b8717ad0997a5ea3bd0c239b +size 156383 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:management|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:management|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..264358f8d4bc4e35be2962809784303a1c71c108 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:management|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d200b0b87a958714adafbd0d084cb6759e35068b9882d404b41195a80af0e1 +size 82611 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:marketing|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:marketing|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5659ae4974c39669904fa96ada44db630ae01083 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:marketing|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ff1258e19a5e862c63dccbc09988731891cffe009d6f538afd7761ddf0b6a0 +size 213506 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..47a8f1ff4ed00f8ca9b60676f041a7516d700832 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:medical_genetics|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2021fb85ca996825937e7e7af5b0ba91ef559c3c712bcfcde3d88bcb33e747f +size 93062 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..06af47bd1dd220cbee8b87ede1ededa9157e3d20 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:miscellaneous|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a6aca3c4c5878ec80ad30ce67cc4ef1bd5feda53d50c84fa47ab752734a2482 +size 520724 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..0e570d38dd4596bc24e1ab935d14eb7e08986e5f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_disputes|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e17144e9ae754997fa1f478737e39da343235c19920a9bd78eb856d2313e58e2 +size 339782 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..e6ba7bf375e4e74bcc4ff2e34efbb50a93250c9e --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:moral_scenarios|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bbf3ceee2f5716c96ed425ff98c8f30172a6940e9bd5f2624d952cb50ae72a4 +size 699043 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..d3712a48f38c869fecef3435983448387e86f199 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:nutrition|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ece8852d29465ec5a22fb054f88caf1e2ed6da2c3f6b4c51949a38f47bfd4618 +size 328994 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..a97f93e44814368d7c8ec7c399e891302163be0b --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:philosophy|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9300ceb60f9f4eebd66734f946284593fa251c9999f4e1ed04eb5b25728f9e3 +size 244016 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..976144af03c3a519847bd757f3578c1938d19b94 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:prehistory|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a37f5cc056de2f8c006f7c32e2bfaab2453f820c1bc62a20b3368b8d6313df1 +size 329169 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f125bc37549c66c2f6771b53cf0cb5821d6f04ae --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_accounting|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b82569c965fdbad0083ad597c7a91c1dc3f92560b45bda7db42a1c9e68ab0a45 +size 386575 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5d8eccc9d52e5d9d0d77e92f474b8e17e7d23094 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_law|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f189c8ccb315176154a591d643c938def4b87e8de10f650e06930e4125b5af +size 6079454 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c5d833de149111dd53255e74c070c0875d5dd833 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_medicine|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ab8b706e6a9107cbc31da2c92007e73da4f9d3d23c26097ba86e6cf85e81d29 +size 664089 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..8b16a644f119ca0e6e8d37816ee2255607115a2b --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:professional_psychology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bb776534a6f8bbb49a80a0532ab712c2776a9d3f5fc6261901829fd39f2862a +size 726218 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..5b44a8d72b8779414381d0da5c608d90110f5505 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:public_relations|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:122a925ad76ae8f77b9cb4ad6c6949c9e619cef5e7d33b7c5e34f45f94e07852 +size 127268 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..fe5a9f11ec943b66a837bc47aab51c52ce4d7f94 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:security_studies|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e757b3ad327166a8c684061d1eaac6a73e4f72954b0d9b713689fa6b9bb9e5ba +size 638435 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:sociology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:sociology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..364ad78a53be45751950bd25b5d9b2d3eb813d06 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:sociology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dbac444a36cf1715b679f896258a708823e0519793b2c89f688b0028436b591f +size 224753 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..f7f1162c49b7d7feffc97745ae44d6da06d5d1c2 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:us_foreign_policy|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7cdf6bf649bbb4d5897228bb21bef2af2ad1d4d60f6f4e9e8617a38b6640fde4 +size 119779 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:virology|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:virology|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..c61d8b7bb82d0bd9ed04480cb7e19e012d0c98d1 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:virology|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c84d10f4765c2aec879f085443f94bf56049b7cecac4a10f9e9c058a753b03d4 +size 145444 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..04161ddc39bf4bb260adb81ddd5a96f43c6d6806 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|mmlu:world_religions|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b478c1d1d530822e9904a4a94da6a5ddc6788b8187ebbb51bdcd7aad96043fa8 +size 109017 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..7148f5db10fa792f75909982f78ae093a8b83ac7 --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|truthfulqa:mc|0_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:212e362c9f119c69f471ba1c545a5fbc0f8170c52ee4ba684174b81174d38b7b +size 883887 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|winogrande|5_2024-05-24T18-15-11.422263.parquet b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|winogrande|5_2024-05-24T18-15-11.422263.parquet new file mode 100644 index 0000000000000000000000000000000000000000..08a4e28c966391ba37a7403726e3a1622c8330ca --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/2024-05-24T18-15-11.422263/details_leaderboard|winogrande|5_2024-05-24T18-15-11.422263.parquet @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5fea6739aa6dea487b0375519b5e950d68e93cedd4a56f9b0104c0d410b1dca +size 1150015 diff --git a/details/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json b/details/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json new file mode 100644 index 0000000000000000000000000000000000000000..b5064155cce808a2d0b24a5d218891c0ff53543f --- /dev/null +++ b/details/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json @@ -0,0 +1,3461 @@ +{ + "config_general": { + "lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c", + "num_fewshot_seeds": 1, + "override_batch_size": -1, + "max_samples": null, + "job_id": "", + "start_time": 2236465.645254106, + "end_time": 2260133.586933212, + "total_evaluation_time_secondes": "23667.941679106094", + "model_name": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_sha": "c4a54320a52ed5f88b7a2f84496903ea4ff07b45", + "model_dtype": "torch.bfloat16", + "model_size": "14.96 GB", + "config": null + }, + "results": { + "leaderboard|arc:challenge|25": { + "acc": 0.5742320819112628, + "acc_stderr": 0.01444946427886881, + "acc_norm": 0.5827645051194539, + "acc_norm_stderr": 0.014409825518403082 + }, + "leaderboard|hellaswag|10": { + "acc": 0.5707030472017527, + "acc_stderr": 0.004939642460172585, + "acc_norm": 0.7310296753634734, + "acc_norm_stderr": 0.004425182676353211 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045 + }, + "leaderboard|mmlu:anatomy|5": { + "acc": 0.6814814814814815, + "acc_stderr": 0.040247784019771096 + }, + "leaderboard|mmlu:astronomy|5": { + "acc": 0.75, + "acc_stderr": 0.03523807393012047 + }, + "leaderboard|mmlu:business_ethics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "acc": 0.7471698113207547, + "acc_stderr": 0.026749899771241214 + }, + "leaderboard|mmlu:college_biology|5": { + "acc": 0.7916666666666666, + "acc_stderr": 0.033961162058453336 + }, + "leaderboard|mmlu:college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05 + }, + "leaderboard|mmlu:college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795 + }, + "leaderboard|mmlu:college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975 + }, + "leaderboard|mmlu:college_medicine|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.036146654241808254 + }, + "leaderboard|mmlu:college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287533 + }, + "leaderboard|mmlu:computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "acc": 0.5872340425531914, + "acc_stderr": 0.03218471141400351 + }, + "leaderboard|mmlu:econometrics|5": { + "acc": 0.543859649122807, + "acc_stderr": 0.046854730419077895 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "acc": 0.6137931034482759, + "acc_stderr": 0.04057324734419035 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.0256993528321318 + }, + "leaderboard|mmlu:formal_logic|5": { + "acc": 0.5317460317460317, + "acc_stderr": 0.04463112720677172 + }, + "leaderboard|mmlu:global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332 + }, + "leaderboard|mmlu:high_school_biology|5": { + "acc": 0.8064516129032258, + "acc_stderr": 0.022475258525536057 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "acc": 0.541871921182266, + "acc_stderr": 0.03505630140785741 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236 + }, + "leaderboard|mmlu:high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218957 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "acc": 0.676923076923077, + "acc_stderr": 0.023710888501970555 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473072 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380548 + }, + "leaderboard|mmlu:high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849927 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "acc": 0.8477064220183487, + "acc_stderr": 0.015405084393157074 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.03406315360711507 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "acc": 0.8396624472573839, + "acc_stderr": 0.023884380925965665 + }, + "leaderboard|mmlu:human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.029918586707798827 + }, + "leaderboard|mmlu:human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159462 + }, + "leaderboard|mmlu:international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088 + }, + "leaderboard|mmlu:jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.033519538795212696 + }, + "leaderboard|mmlu:machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546 + }, + "leaderboard|mmlu:management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276 + }, + "leaderboard|mmlu:marketing|5": { + "acc": 0.905982905982906, + "acc_stderr": 0.019119892798924974 + }, + "leaderboard|mmlu:medical_genetics|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "leaderboard|mmlu:miscellaneous|5": { + "acc": 0.8237547892720306, + "acc_stderr": 0.013625556907993455 + }, + "leaderboard|mmlu:moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069356 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "acc": 0.43575418994413406, + "acc_stderr": 0.016583881958602387 + }, + "leaderboard|mmlu:nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824785 + }, + "leaderboard|mmlu:philosophy|5": { + "acc": 0.7331189710610932, + "acc_stderr": 0.025122637608816657 + }, + "leaderboard|mmlu:prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713002 + }, + "leaderboard|mmlu:professional_accounting|5": { + "acc": 0.5177304964539007, + "acc_stderr": 0.02980873964223777 + }, + "leaderboard|mmlu:professional_law|5": { + "acc": 0.46479791395045633, + "acc_stderr": 0.012738547371303956 + }, + "leaderboard|mmlu:professional_medicine|5": { + "acc": 0.7279411764705882, + "acc_stderr": 0.027033041151681456 + }, + "leaderboard|mmlu:professional_psychology|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.018663359671463677 + }, + "leaderboard|mmlu:public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505 + }, + "leaderboard|mmlu:security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294 + }, + "leaderboard|mmlu:sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.02484575321230604 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769 + }, + "leaderboard|mmlu:virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594 + }, + "leaderboard|mmlu:world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533193 + }, + "leaderboard|truthfulqa:mc|0": { + "truthfulqa_mc1": 0.37454100367197063, + "truthfulqa_mc1_stderr": 0.016943535128405338, + "truthfulqa_mc2": 0.5337684444397199, + "truthfulqa_mc2_stderr": 0.015971485281891525 + }, + "leaderboard|winogrande|5": { + "acc": 0.6929755327545383, + "acc_stderr": 0.012963688616969483 + }, + "leaderboard|gsm8k|5": { + "qem": 0.6808188021228203, + "qem_stderr": 0.012840345676251653 + }, + "leaderboard|mmlu:_average|5": { + "acc": 0.6661794809691, + "acc_stderr": 0.033327669029227354 + }, + "all": { + "acc": 0.6635023512851042, + "acc_stderr": 0.032200498833699506, + "acc_norm": 0.6568970902414637, + "acc_norm_stderr": 0.009417504097378147, + "truthfulqa_mc1": 0.37454100367197063, + "truthfulqa_mc1_stderr": 0.016943535128405338, + "truthfulqa_mc2": 0.5337684444397199, + "truthfulqa_mc2_stderr": 0.015971485281891525, + "qem": 0.6808188021228203, + "qem_stderr": 0.012840345676251653 + } + }, + "versions": { + "leaderboard|arc:challenge|25": 0, + "leaderboard|gsm8k|5": 0, + "leaderboard|hellaswag|10": 0, + "leaderboard|mmlu:abstract_algebra|5": 0, + "leaderboard|mmlu:anatomy|5": 0, + "leaderboard|mmlu:astronomy|5": 0, + "leaderboard|mmlu:business_ethics|5": 0, + "leaderboard|mmlu:clinical_knowledge|5": 0, + "leaderboard|mmlu:college_biology|5": 0, + "leaderboard|mmlu:college_chemistry|5": 0, + "leaderboard|mmlu:college_computer_science|5": 0, + "leaderboard|mmlu:college_mathematics|5": 0, + "leaderboard|mmlu:college_medicine|5": 0, + "leaderboard|mmlu:college_physics|5": 0, + "leaderboard|mmlu:computer_security|5": 0, + "leaderboard|mmlu:conceptual_physics|5": 0, + "leaderboard|mmlu:econometrics|5": 0, + "leaderboard|mmlu:electrical_engineering|5": 0, + "leaderboard|mmlu:elementary_mathematics|5": 0, + "leaderboard|mmlu:formal_logic|5": 0, + "leaderboard|mmlu:global_facts|5": 0, + "leaderboard|mmlu:high_school_biology|5": 0, + "leaderboard|mmlu:high_school_chemistry|5": 0, + "leaderboard|mmlu:high_school_computer_science|5": 0, + "leaderboard|mmlu:high_school_european_history|5": 0, + "leaderboard|mmlu:high_school_geography|5": 0, + "leaderboard|mmlu:high_school_government_and_politics|5": 0, + "leaderboard|mmlu:high_school_macroeconomics|5": 0, + "leaderboard|mmlu:high_school_mathematics|5": 0, + "leaderboard|mmlu:high_school_microeconomics|5": 0, + "leaderboard|mmlu:high_school_physics|5": 0, + "leaderboard|mmlu:high_school_psychology|5": 0, + "leaderboard|mmlu:high_school_statistics|5": 0, + "leaderboard|mmlu:high_school_us_history|5": 0, + "leaderboard|mmlu:high_school_world_history|5": 0, + "leaderboard|mmlu:human_aging|5": 0, + "leaderboard|mmlu:human_sexuality|5": 0, + "leaderboard|mmlu:international_law|5": 0, + "leaderboard|mmlu:jurisprudence|5": 0, + "leaderboard|mmlu:logical_fallacies|5": 0, + "leaderboard|mmlu:machine_learning|5": 0, + "leaderboard|mmlu:management|5": 0, + "leaderboard|mmlu:marketing|5": 0, + "leaderboard|mmlu:medical_genetics|5": 0, + "leaderboard|mmlu:miscellaneous|5": 0, + "leaderboard|mmlu:moral_disputes|5": 0, + "leaderboard|mmlu:moral_scenarios|5": 0, + "leaderboard|mmlu:nutrition|5": 0, + "leaderboard|mmlu:philosophy|5": 0, + "leaderboard|mmlu:prehistory|5": 0, + "leaderboard|mmlu:professional_accounting|5": 0, + "leaderboard|mmlu:professional_law|5": 0, + "leaderboard|mmlu:professional_medicine|5": 0, + "leaderboard|mmlu:professional_psychology|5": 0, + "leaderboard|mmlu:public_relations|5": 0, + "leaderboard|mmlu:security_studies|5": 0, + "leaderboard|mmlu:sociology|5": 0, + "leaderboard|mmlu:us_foreign_policy|5": 0, + "leaderboard|mmlu:virology|5": 0, + "leaderboard|mmlu:world_religions|5": 0, + "leaderboard|truthfulqa:mc|0": 0, + "leaderboard|winogrande|5": 0 + }, + "config_tasks": { + "leaderboard|arc:challenge": { + "name": "arc:challenge", + "prompt_function": "arc", + "hf_repo": "ai2_arc", + "hf_subset": "ARC-Challenge", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm_nospace" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "arc" + ], + "original_num_docs": 1172, + "effective_num_docs": 1172, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + "quasi_exact_match_gsm8k" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "stop_sequence": [ + "Question:", + "Question", + ":" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1319, + "effective_num_docs": 1319, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|hellaswag": { + "name": "hellaswag", + "prompt_function": "hellaswag_harness", + "hf_repo": "hellaswag", + "hf_subset": "default", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 10042, + "effective_num_docs": 10042, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:abstract_algebra": { + "name": "mmlu:abstract_algebra", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "abstract_algebra", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:anatomy": { + "name": "mmlu:anatomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 135, + "effective_num_docs": 135, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:astronomy": { + "name": "mmlu:astronomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "astronomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 152, + "effective_num_docs": 152, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:business_ethics": { + "name": "mmlu:business_ethics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "business_ethics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:clinical_knowledge": { + "name": "mmlu:clinical_knowledge", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "clinical_knowledge", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 265, + "effective_num_docs": 265, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_biology": { + "name": "mmlu:college_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 144, + "effective_num_docs": 144, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_chemistry": { + "name": "mmlu:college_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_computer_science": { + "name": "mmlu:college_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_mathematics": { + "name": "mmlu:college_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_medicine": { + "name": "mmlu:college_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 173, + "effective_num_docs": 173, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_physics": { + "name": "mmlu:college_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 102, + "effective_num_docs": 102, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:computer_security": { + "name": "mmlu:computer_security", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "computer_security", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:conceptual_physics": { + "name": "mmlu:conceptual_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "conceptual_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 235, + "effective_num_docs": 235, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:econometrics": { + "name": "mmlu:econometrics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "econometrics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 114, + "effective_num_docs": 114, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:electrical_engineering": { + "name": "mmlu:electrical_engineering", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "electrical_engineering", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 145, + "effective_num_docs": 145, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:elementary_mathematics": { + "name": "mmlu:elementary_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "elementary_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 378, + "effective_num_docs": 378, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:formal_logic": { + "name": "mmlu:formal_logic", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "formal_logic", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 126, + "effective_num_docs": 126, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:global_facts": { + "name": "mmlu:global_facts", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "global_facts", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_biology": { + "name": "mmlu:high_school_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 310, + "effective_num_docs": 310, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_chemistry": { + "name": "mmlu:high_school_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 203, + "effective_num_docs": 203, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_computer_science": { + "name": "mmlu:high_school_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_european_history": { + "name": "mmlu:high_school_european_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_european_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 165, + "effective_num_docs": 165, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_geography": { + "name": "mmlu:high_school_geography", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_geography", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 198, + "effective_num_docs": 198, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics": { + "name": "mmlu:high_school_government_and_politics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_government_and_politics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 193, + "effective_num_docs": 193, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics": { + "name": "mmlu:high_school_macroeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_macroeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 390, + "effective_num_docs": 390, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_mathematics": { + "name": "mmlu:high_school_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 270, + "effective_num_docs": 270, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_microeconomics": { + "name": "mmlu:high_school_microeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_microeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 238, + "effective_num_docs": 238, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_physics": { + "name": "mmlu:high_school_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 151, + "effective_num_docs": 151, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_psychology": { + "name": "mmlu:high_school_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 545, + "effective_num_docs": 545, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_statistics": { + "name": "mmlu:high_school_statistics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_statistics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 216, + "effective_num_docs": 216, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_us_history": { + "name": "mmlu:high_school_us_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_us_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 204, + "effective_num_docs": 204, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_world_history": { + "name": "mmlu:high_school_world_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_world_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 237, + "effective_num_docs": 237, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_aging": { + "name": "mmlu:human_aging", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_aging", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 223, + "effective_num_docs": 223, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_sexuality": { + "name": "mmlu:human_sexuality", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_sexuality", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 131, + "effective_num_docs": 131, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:international_law": { + "name": "mmlu:international_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "international_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 121, + "effective_num_docs": 121, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:jurisprudence": { + "name": "mmlu:jurisprudence", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "jurisprudence", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 108, + "effective_num_docs": 108, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:logical_fallacies": { + "name": "mmlu:logical_fallacies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "logical_fallacies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 163, + "effective_num_docs": 163, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:machine_learning": { + "name": "mmlu:machine_learning", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "machine_learning", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 112, + "effective_num_docs": 112, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:management": { + "name": "mmlu:management", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "management", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 103, + "effective_num_docs": 103, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:marketing": { + "name": "mmlu:marketing", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "marketing", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 234, + "effective_num_docs": 234, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:medical_genetics": { + "name": "mmlu:medical_genetics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "medical_genetics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:miscellaneous": { + "name": "mmlu:miscellaneous", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "miscellaneous", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 783, + "effective_num_docs": 783, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_disputes": { + "name": "mmlu:moral_disputes", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_disputes", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 346, + "effective_num_docs": 346, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_scenarios": { + "name": "mmlu:moral_scenarios", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_scenarios", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 895, + "effective_num_docs": 895, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:nutrition": { + "name": "mmlu:nutrition", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "nutrition", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 306, + "effective_num_docs": 306, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:philosophy": { + "name": "mmlu:philosophy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "philosophy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 311, + "effective_num_docs": 311, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:prehistory": { + "name": "mmlu:prehistory", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "prehistory", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 324, + "effective_num_docs": 324, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_accounting": { + "name": "mmlu:professional_accounting", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_accounting", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 282, + "effective_num_docs": 282, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_law": { + "name": "mmlu:professional_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 1534, + "effective_num_docs": 1534, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_medicine": { + "name": "mmlu:professional_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 272, + "effective_num_docs": 272, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_psychology": { + "name": "mmlu:professional_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 612, + "effective_num_docs": 612, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:public_relations": { + "name": "mmlu:public_relations", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "public_relations", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 110, + "effective_num_docs": 110, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:security_studies": { + "name": "mmlu:security_studies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "security_studies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 245, + "effective_num_docs": 245, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:sociology": { + "name": "mmlu:sociology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "sociology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 201, + "effective_num_docs": 201, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:us_foreign_policy": { + "name": "mmlu:us_foreign_policy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "us_foreign_policy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:virology": { + "name": "mmlu:virology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "virology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 166, + "effective_num_docs": 166, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:world_religions": { + "name": "mmlu:world_religions", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "world_religions", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 171, + "effective_num_docs": 171, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|truthfulqa:mc": { + "name": "truthfulqa:mc", + "prompt_function": "truthful_qa_multiple_choice", + "hf_repo": "truthful_qa", + "hf_subset": "multiple_choice", + "metric": [ + "truthfulqa_mc_metrics" + ], + "hf_avail_splits": [ + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 817, + "effective_num_docs": 817, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|winogrande": { + "name": "winogrande", + "prompt_function": "winogrande", + "hf_repo": "winogrande", + "hf_subset": "winogrande_xl", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1267, + "effective_num_docs": 1267, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + }, + "summary_tasks": { + "leaderboard|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "4aeb23a740784b86", + "hash_input_tokens": "2e9e18067d1f8ad8", + "hash_cont_tokens": "19baa8a044eaaac8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|hellaswag|10": { + "hashes": { + "hash_examples": "31985c805c3a737e", + "hash_full_prompts": "3c2d3440e190b07b", + "hash_input_tokens": "412fc1d29623282b", + "hash_cont_tokens": "823c88a16c837063" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40105, + "non_padded": 63, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "hashes": { + "hash_examples": "4c76229e00c9c0e9", + "hash_full_prompts": "faefa0cccb952fe0", + "hash_input_tokens": "e7380c35f0e2c4b3", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:anatomy|5": { + "hashes": { + "hash_examples": "6a1f8104dccbd33b", + "hash_full_prompts": "eacd03e46972fa59", + "hash_input_tokens": "2ee8bc2ef4561b6b", + "hash_cont_tokens": "9be31d13c42ead00" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:astronomy|5": { + "hashes": { + "hash_examples": "1302effa3a76ce4c", + "hash_full_prompts": "826cacbdf1f6bfd0", + "hash_input_tokens": "6ab8d24255ff03b3", + "hash_cont_tokens": "30cc2b2fc1294aac" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:business_ethics|5": { + "hashes": { + "hash_examples": "03cb8bce5336419a", + "hash_full_prompts": "518511169382ac39", + "hash_input_tokens": "8be4f0cc9ce448e1", + "hash_cont_tokens": "4e9d83c717b7deb8" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "hashes": { + "hash_examples": "ffbb9c7b2be257f9", + "hash_full_prompts": "0b07b0bc774fdfd9", + "hash_input_tokens": "413166c01db52a72", + "hash_cont_tokens": "40dd7263ce5af5de" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_biology|5": { + "hashes": { + "hash_examples": "3ee77f176f38eb8e", + "hash_full_prompts": "22cbe0e8dabf98b1", + "hash_input_tokens": "0dcd583202383d43", + "hash_cont_tokens": "1892d80e82b394c0" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_chemistry|5": { + "hashes": { + "hash_examples": "ce61a69c46d47aeb", + "hash_full_prompts": "9c1288940a4afb59", + "hash_input_tokens": "59a4f0d36881d644", + "hash_cont_tokens": "b6bb78fb2d7e4e6f" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_computer_science|5": { + "hashes": { + "hash_examples": "32805b52d7d5daab", + "hash_full_prompts": "9522781d0cdf1a43", + "hash_input_tokens": "302a2f1d05b53513", + "hash_cont_tokens": "6a5da979260e607c" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_mathematics|5": { + "hashes": { + "hash_examples": "55da1a0a0bd33722", + "hash_full_prompts": "72fe6f46a57e6ca4", + "hash_input_tokens": "042f1988f13b8f9a", + "hash_cont_tokens": "62df3b0447bd3b12" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_medicine|5": { + "hashes": { + "hash_examples": "c33e143163049176", + "hash_full_prompts": "dee0989b2c8993f4", + "hash_input_tokens": "6dd81075c8e816e9", + "hash_cont_tokens": "933c01711a0757a0" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_physics|5": { + "hashes": { + "hash_examples": "ebdab1cdb7e555df", + "hash_full_prompts": "a1be6b64ea1948c3", + "hash_input_tokens": "37818fa59254732b", + "hash_cont_tokens": "d36569ab90faad7c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:computer_security|5": { + "hashes": { + "hash_examples": "a24fd7d08a560921", + "hash_full_prompts": "01bc3fdfdefe67a4", + "hash_input_tokens": "d4957d5a9d5e83ec", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "hashes": { + "hash_examples": "8300977a79386993", + "hash_full_prompts": "b39315a8ada3ca79", + "hash_input_tokens": "c146a84803f78c9e", + "hash_cont_tokens": "6408f70f3d9ada31" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:econometrics|5": { + "hashes": { + "hash_examples": "ddde36788a04a46f", + "hash_full_prompts": "70bab37ca5fcc48f", + "hash_input_tokens": "086bc025be133096", + "hash_cont_tokens": "3befa885ca6e4b97" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "hashes": { + "hash_examples": "acbc5def98c19b3f", + "hash_full_prompts": "86a4747481c11c61", + "hash_input_tokens": "b83507ac94ded59b", + "hash_cont_tokens": "e75df8f470aa4973" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "hashes": { + "hash_examples": "146e61d07497a9bd", + "hash_full_prompts": "1fe56333735325fa", + "hash_input_tokens": "8c3c868b34bad37b", + "hash_cont_tokens": "f09c97e7f7f9af71" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:formal_logic|5": { + "hashes": { + "hash_examples": "8635216e1909a03f", + "hash_full_prompts": "cc83c1ede45f974c", + "hash_input_tokens": "bb0616a24585501c", + "hash_cont_tokens": "df96e75b4eb1d7b0" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:global_facts|5": { + "hashes": { + "hash_examples": "30b315aa6353ee47", + "hash_full_prompts": "3a2ec1e2785c69a5", + "hash_input_tokens": "5e840dc7f1c55a67", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_biology|5": { + "hashes": { + "hash_examples": "c9136373af2180de", + "hash_full_prompts": "27646a569cf2a6f8", + "hash_input_tokens": "1dce672a00c5cbe1", + "hash_cont_tokens": "c6d11e73dc85157f" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "hashes": { + "hash_examples": "b0661bfa1add6404", + "hash_full_prompts": "6905c6ca76f7b2b7", + "hash_input_tokens": "7fb2dd590b34e445", + "hash_cont_tokens": "208aff39cfca671a" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "hashes": { + "hash_examples": "80fc1d623a3d665f", + "hash_full_prompts": "b80092241e8b6c06", + "hash_input_tokens": "b2a9091fd8d00b66", + "hash_cont_tokens": "150a6d581009fbe0" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "hashes": { + "hash_examples": "854da6e5af0fe1a1", + "hash_full_prompts": "a3bc32a5dc022ce7", + "hash_input_tokens": "393e215e8667fde4", + "hash_cont_tokens": "7b6f4c22b304c3cc" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_geography|5": { + "hashes": { + "hash_examples": "7dc963c7acd19ad8", + "hash_full_prompts": "53f91beae305905d", + "hash_input_tokens": "439ac435fc478534", + "hash_cont_tokens": "1a85c9e696d91a66" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "1f675dcdebc9758f", + "hash_full_prompts": "623fd7e3495f243f", + "hash_input_tokens": "2c5757b8545f7cf8", + "hash_cont_tokens": "a47a4530b8790081" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "2fb32cf2d80f0b35", + "hash_full_prompts": "378ac13c8abb6c5f", + "hash_input_tokens": "afea2ca30b1622ff", + "hash_cont_tokens": "e71e7c6acf44c3e5" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "hashes": { + "hash_examples": "fd6646fdb5d58a1f", + "hash_full_prompts": "14d34e0b34750627", + "hash_input_tokens": "34e63b0902b32a2c", + "hash_cont_tokens": "e36b5624bdbe96b0" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "hashes": { + "hash_examples": "2118f21f71d87d84", + "hash_full_prompts": "9ac09e5d4da991c9", + "hash_input_tokens": "93d1c1ba5fe0bcbd", + "hash_cont_tokens": "a5f61d5beba13cc2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_physics|5": { + "hashes": { + "hash_examples": "dc3ce06378548565", + "hash_full_prompts": "b4832a554d47d224", + "hash_input_tokens": "f5bf59bc9f6839fe", + "hash_cont_tokens": "df1d218ccbc258e8" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "hashes": { + "hash_examples": "c8d1d98a40e11f2f", + "hash_full_prompts": "1e8cd27064546274", + "hash_input_tokens": "329851f26db67226", + "hash_cont_tokens": "6fb549a4eb8e6c47" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "hashes": { + "hash_examples": "666c8759b98ee4ff", + "hash_full_prompts": "e05ab41077ec0afa", + "hash_input_tokens": "7abad93393993e44", + "hash_cont_tokens": "d9528c65af653d67" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "hashes": { + "hash_examples": "95fef1c4b7d3f81e", + "hash_full_prompts": "a4b275996a416b4a", + "hash_input_tokens": "e5def820604ad889", + "hash_cont_tokens": "8b827fc7dfd3c1c5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "hashes": { + "hash_examples": "7e5085b6184b0322", + "hash_full_prompts": "8adf16361f0f320a", + "hash_input_tokens": "aa85ae4eba20e53f", + "hash_cont_tokens": "82f19c159c69a66d" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_aging|5": { + "hashes": { + "hash_examples": "c17333e7c7c10797", + "hash_full_prompts": "918d91a3141aac4d", + "hash_input_tokens": "297fceccf01a2c64", + "hash_cont_tokens": "ca87074f1dc39668" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_sexuality|5": { + "hashes": { + "hash_examples": "4edd1e9045df5e3d", + "hash_full_prompts": "bcee39ecea32fcc8", + "hash_input_tokens": "7c66a375881d6788", + "hash_cont_tokens": "491a0ab53f54aeb9" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:international_law|5": { + "hashes": { + "hash_examples": "db2fa00d771a062a", + "hash_full_prompts": "ffe12a3b5bf350c2", + "hash_input_tokens": "dc0250213736abca", + "hash_cont_tokens": "e3d257d7ea257fc8" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:jurisprudence|5": { + "hashes": { + "hash_examples": "e956f86b124076fe", + "hash_full_prompts": "b4293c3c08bebaf7", + "hash_input_tokens": "c9ed773ed04cff64", + "hash_cont_tokens": "4c69d7671fa1ab1c" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "hashes": { + "hash_examples": "956e0e6365ab79f1", + "hash_full_prompts": "8c1b7733e98cbe81", + "hash_input_tokens": "a4f6df541a56c41a", + "hash_cont_tokens": "57e78d3d09b7db81" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:machine_learning|5": { + "hashes": { + "hash_examples": "397997cc6f4d581e", + "hash_full_prompts": "24a206a1c639ab8d", + "hash_input_tokens": "f0dfd08579d1f727", + "hash_cont_tokens": "94d2ec6c52bb7b53" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:management|5": { + "hashes": { + "hash_examples": "2bcbe6f6ca63d740", + "hash_full_prompts": "77e1c79d988beecc", + "hash_input_tokens": "15925fd62ddd3ca4", + "hash_cont_tokens": "79499fecb18f1cb1" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:marketing|5": { + "hashes": { + "hash_examples": "8ddb20d964a1b065", + "hash_full_prompts": "83cec2fa6b681d9d", + "hash_input_tokens": "6eb177c438da2061", + "hash_cont_tokens": "c5e9cd86b1a58fac" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:medical_genetics|5": { + "hashes": { + "hash_examples": "182a71f4763d2cea", + "hash_full_prompts": "195eb7ff99749730", + "hash_input_tokens": "5adeca0d34767f29", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:miscellaneous|5": { + "hashes": { + "hash_examples": "4c404fdbb4ca57fc", + "hash_full_prompts": "33539955c9a96851", + "hash_input_tokens": "52aee92a69c2b698", + "hash_cont_tokens": "8578b82c42cc7026" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_disputes|5": { + "hashes": { + "hash_examples": "60cbd2baa3fea5c9", + "hash_full_prompts": "009b7d0e7f819eff", + "hash_input_tokens": "f24c046b105c5e03", + "hash_cont_tokens": "26b0f808ec46464d" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "hashes": { + "hash_examples": "fd8b0431fbdd75ef", + "hash_full_prompts": "f6e63c9fb9d3bff0", + "hash_input_tokens": "08eee0e3d8e89710", + "hash_cont_tokens": "52fe77d28aefc1b3" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:nutrition|5": { + "hashes": { + "hash_examples": "71e55e2b829b6528", + "hash_full_prompts": "8294d5e3ad435377", + "hash_input_tokens": "5b2c6686c8fc5e83", + "hash_cont_tokens": "25850a01b4a11b53" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:philosophy|5": { + "hashes": { + "hash_examples": "a6d489a8d208fa4b", + "hash_full_prompts": "db68c0f4503e4793", + "hash_input_tokens": "7108ad04b556854f", + "hash_cont_tokens": "8c34ab2fa65c3b6e" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:prehistory|5": { + "hashes": { + "hash_examples": "6cc50f032a19acaa", + "hash_full_prompts": "3972bcfa8c80e964", + "hash_input_tokens": "65cb6b1efc71921b", + "hash_cont_tokens": "89f21e5f9c7d81f2" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_accounting|5": { + "hashes": { + "hash_examples": "50f57ab32f5f6cea", + "hash_full_prompts": "25f0becc2483bd32", + "hash_input_tokens": "c1b1c1e1f1ca4a85", + "hash_cont_tokens": "c7c4930a659ca843" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1120, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_law|5": { + "hashes": { + "hash_examples": "a8fdc85c64f4b215", + "hash_full_prompts": "7a6f6c5706f00c7d", + "hash_input_tokens": "e7517115da0204cd", + "hash_cont_tokens": "6f36bd560ae36f02" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_medicine|5": { + "hashes": { + "hash_examples": "c373a28a3050a73a", + "hash_full_prompts": "a74b6ac7c5c545d2", + "hash_input_tokens": "da6af6d03e682017", + "hash_cont_tokens": "ca4398b4ad3db5f1" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_psychology|5": { + "hashes": { + "hash_examples": "bf5254fe818356af", + "hash_full_prompts": "c53fa139ec25f502", + "hash_input_tokens": "c6dbaf3c7103ebe9", + "hash_cont_tokens": "ce4bb75e80359fe4" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:public_relations|5": { + "hashes": { + "hash_examples": "b66d52e28e7d14e0", + "hash_full_prompts": "55b5eff05aa6bf13", + "hash_input_tokens": "deea75b6eec5b782", + "hash_cont_tokens": "680235f5ede0b353" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:security_studies|5": { + "hashes": { + "hash_examples": "514c14feaf000ad9", + "hash_full_prompts": "6690ecdc054f7b0c", + "hash_input_tokens": "deef3d39896aca43", + "hash_cont_tokens": "189956efcec12818" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:sociology|5": { + "hashes": { + "hash_examples": "f6c9bc9d18c80870", + "hash_full_prompts": "945fbdd091c72d64", + "hash_input_tokens": "330fffbccabf89e4", + "hash_cont_tokens": "2178ff937c0c1a29" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "hashes": { + "hash_examples": "ed7b78629db6678f", + "hash_full_prompts": "ebba6ea6eca4ae53", + "hash_input_tokens": "0ec87fa768a47632", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 392, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:virology|5": { + "hashes": { + "hash_examples": "bc52ffdc3f9b994a", + "hash_full_prompts": "a2ee4984d6877fe3", + "hash_input_tokens": "cc264818195d14da", + "hash_cont_tokens": "ec5c187546c7c842" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 660, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:world_religions|5": { + "hashes": { + "hash_examples": "ecdb4a4f94f62930", + "hash_full_prompts": "a89c8dddd1d8ced0", + "hash_input_tokens": "e7e781ba363743eb", + "hash_cont_tokens": "e52b573046cdfc5c" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "36a6d90e75d92d4a", + "hash_full_prompts": "8d9ca0a8bd458a1c", + "hash_input_tokens": "4aad1a3bfe70acfc", + "hash_cont_tokens": "b0f64f6659d8c230" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|winogrande|5": { + "hashes": { + "hash_examples": "087d5d1a1afd4c7b", + "hash_full_prompts": "35da55e47222e0e1", + "hash_input_tokens": "881c630a9e0034f7", + "hash_cont_tokens": "c466f4c92e3879cb" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|gsm8k|5": { + "hashes": { + "hash_examples": "0ed016e24e7512fd", + "hash_full_prompts": "f7ab209f6467841e", + "hash_input_tokens": "deccfe61ad5cb3d5", + "hash_cont_tokens": "95cc4cc1148eb790" + }, + "truncated": 1319, + "non_truncated": 0, + "padded": 1074, + "non_padded": 245, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "670666fa3a90ce5d", + "hash_full_prompts": "56c005e427046302", + "hash_input_tokens": "2a51da62c271a1a0", + "hash_cont_tokens": "a74619de92c05f2e" + }, + "truncated": 1319, + "non_truncated": 27340, + "padded": 114540, + "non_padded": 332, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file diff --git a/results/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json b/results/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json new file mode 100644 index 0000000000000000000000000000000000000000..b5064155cce808a2d0b24a5d218891c0ff53543f --- /dev/null +++ b/results/meta-llama/Meta-Llama-3-8B-Instruct/results_2024-05-24T18-15-11.422263.json @@ -0,0 +1,3461 @@ +{ + "config_general": { + "lighteval_sha": "a98210fd3a2d1e8bface1c32b72ebd5017173a4c", + "num_fewshot_seeds": 1, + "override_batch_size": -1, + "max_samples": null, + "job_id": "", + "start_time": 2236465.645254106, + "end_time": 2260133.586933212, + "total_evaluation_time_secondes": "23667.941679106094", + "model_name": "meta-llama/Meta-Llama-3-8B-Instruct", + "model_sha": "c4a54320a52ed5f88b7a2f84496903ea4ff07b45", + "model_dtype": "torch.bfloat16", + "model_size": "14.96 GB", + "config": null + }, + "results": { + "leaderboard|arc:challenge|25": { + "acc": 0.5742320819112628, + "acc_stderr": 0.01444946427886881, + "acc_norm": 0.5827645051194539, + "acc_norm_stderr": 0.014409825518403082 + }, + "leaderboard|hellaswag|10": { + "acc": 0.5707030472017527, + "acc_stderr": 0.004939642460172585, + "acc_norm": 0.7310296753634734, + "acc_norm_stderr": 0.004425182676353211 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "acc": 0.33, + "acc_stderr": 0.047258156262526045 + }, + "leaderboard|mmlu:anatomy|5": { + "acc": 0.6814814814814815, + "acc_stderr": 0.040247784019771096 + }, + "leaderboard|mmlu:astronomy|5": { + "acc": 0.75, + "acc_stderr": 0.03523807393012047 + }, + "leaderboard|mmlu:business_ethics|5": { + "acc": 0.7, + "acc_stderr": 0.046056618647183814 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "acc": 0.7471698113207547, + "acc_stderr": 0.026749899771241214 + }, + "leaderboard|mmlu:college_biology|5": { + "acc": 0.7916666666666666, + "acc_stderr": 0.033961162058453336 + }, + "leaderboard|mmlu:college_chemistry|5": { + "acc": 0.45, + "acc_stderr": 0.05 + }, + "leaderboard|mmlu:college_computer_science|5": { + "acc": 0.52, + "acc_stderr": 0.050211673156867795 + }, + "leaderboard|mmlu:college_mathematics|5": { + "acc": 0.39, + "acc_stderr": 0.04902071300001975 + }, + "leaderboard|mmlu:college_medicine|5": { + "acc": 0.6589595375722543, + "acc_stderr": 0.036146654241808254 + }, + "leaderboard|mmlu:college_physics|5": { + "acc": 0.43137254901960786, + "acc_stderr": 0.04928099597287533 + }, + "leaderboard|mmlu:computer_security|5": { + "acc": 0.79, + "acc_stderr": 0.04093601807403326 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "acc": 0.5872340425531914, + "acc_stderr": 0.03218471141400351 + }, + "leaderboard|mmlu:econometrics|5": { + "acc": 0.543859649122807, + "acc_stderr": 0.046854730419077895 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "acc": 0.6137931034482759, + "acc_stderr": 0.04057324734419035 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "acc": 0.46825396825396826, + "acc_stderr": 0.0256993528321318 + }, + "leaderboard|mmlu:formal_logic|5": { + "acc": 0.5317460317460317, + "acc_stderr": 0.04463112720677172 + }, + "leaderboard|mmlu:global_facts|5": { + "acc": 0.46, + "acc_stderr": 0.05009082659620332 + }, + "leaderboard|mmlu:high_school_biology|5": { + "acc": 0.8064516129032258, + "acc_stderr": 0.022475258525536057 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "acc": 0.541871921182266, + "acc_stderr": 0.03505630140785741 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "acc": 0.68, + "acc_stderr": 0.04688261722621505 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "acc": 0.7393939393939394, + "acc_stderr": 0.034277431758165236 + }, + "leaderboard|mmlu:high_school_geography|5": { + "acc": 0.8131313131313131, + "acc_stderr": 0.027772533334218957 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "acc": 0.8963730569948186, + "acc_stderr": 0.02199531196364424 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "acc": 0.676923076923077, + "acc_stderr": 0.023710888501970555 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "acc": 0.32592592592592595, + "acc_stderr": 0.028578348365473072 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "acc": 0.7563025210084033, + "acc_stderr": 0.027886828078380548 + }, + "leaderboard|mmlu:high_school_physics|5": { + "acc": 0.4105960264900662, + "acc_stderr": 0.04016689594849927 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "acc": 0.8477064220183487, + "acc_stderr": 0.015405084393157074 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "acc": 0.47685185185185186, + "acc_stderr": 0.03406315360711507 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "acc": 0.7892156862745098, + "acc_stderr": 0.028626547912437406 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "acc": 0.8396624472573839, + "acc_stderr": 0.023884380925965665 + }, + "leaderboard|mmlu:human_aging|5": { + "acc": 0.726457399103139, + "acc_stderr": 0.029918586707798827 + }, + "leaderboard|mmlu:human_sexuality|5": { + "acc": 0.7938931297709924, + "acc_stderr": 0.03547771004159462 + }, + "leaderboard|mmlu:international_law|5": { + "acc": 0.768595041322314, + "acc_stderr": 0.03849856098794088 + }, + "leaderboard|mmlu:jurisprudence|5": { + "acc": 0.7592592592592593, + "acc_stderr": 0.04133119440243839 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "acc": 0.7607361963190185, + "acc_stderr": 0.033519538795212696 + }, + "leaderboard|mmlu:machine_learning|5": { + "acc": 0.5267857142857143, + "acc_stderr": 0.047389751192741546 + }, + "leaderboard|mmlu:management|5": { + "acc": 0.8155339805825242, + "acc_stderr": 0.03840423627288276 + }, + "leaderboard|mmlu:marketing|5": { + "acc": 0.905982905982906, + "acc_stderr": 0.019119892798924974 + }, + "leaderboard|mmlu:medical_genetics|5": { + "acc": 0.79, + "acc_stderr": 0.040936018074033256 + }, + "leaderboard|mmlu:miscellaneous|5": { + "acc": 0.8237547892720306, + "acc_stderr": 0.013625556907993455 + }, + "leaderboard|mmlu:moral_disputes|5": { + "acc": 0.7398843930635838, + "acc_stderr": 0.023618678310069356 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "acc": 0.43575418994413406, + "acc_stderr": 0.016583881958602387 + }, + "leaderboard|mmlu:nutrition|5": { + "acc": 0.7549019607843137, + "acc_stderr": 0.024630048979824785 + }, + "leaderboard|mmlu:philosophy|5": { + "acc": 0.7331189710610932, + "acc_stderr": 0.025122637608816657 + }, + "leaderboard|mmlu:prehistory|5": { + "acc": 0.7469135802469136, + "acc_stderr": 0.024191808600713002 + }, + "leaderboard|mmlu:professional_accounting|5": { + "acc": 0.5177304964539007, + "acc_stderr": 0.02980873964223777 + }, + "leaderboard|mmlu:professional_law|5": { + "acc": 0.46479791395045633, + "acc_stderr": 0.012738547371303956 + }, + "leaderboard|mmlu:professional_medicine|5": { + "acc": 0.7279411764705882, + "acc_stderr": 0.027033041151681456 + }, + "leaderboard|mmlu:professional_psychology|5": { + "acc": 0.6928104575163399, + "acc_stderr": 0.018663359671463677 + }, + "leaderboard|mmlu:public_relations|5": { + "acc": 0.6636363636363637, + "acc_stderr": 0.04525393596302505 + }, + "leaderboard|mmlu:security_studies|5": { + "acc": 0.7306122448979592, + "acc_stderr": 0.02840125202902294 + }, + "leaderboard|mmlu:sociology|5": { + "acc": 0.8557213930348259, + "acc_stderr": 0.02484575321230604 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "acc": 0.86, + "acc_stderr": 0.03487350880197769 + }, + "leaderboard|mmlu:virology|5": { + "acc": 0.536144578313253, + "acc_stderr": 0.03882310850890594 + }, + "leaderboard|mmlu:world_religions|5": { + "acc": 0.7953216374269005, + "acc_stderr": 0.030944459778533193 + }, + "leaderboard|truthfulqa:mc|0": { + "truthfulqa_mc1": 0.37454100367197063, + "truthfulqa_mc1_stderr": 0.016943535128405338, + "truthfulqa_mc2": 0.5337684444397199, + "truthfulqa_mc2_stderr": 0.015971485281891525 + }, + "leaderboard|winogrande|5": { + "acc": 0.6929755327545383, + "acc_stderr": 0.012963688616969483 + }, + "leaderboard|gsm8k|5": { + "qem": 0.6808188021228203, + "qem_stderr": 0.012840345676251653 + }, + "leaderboard|mmlu:_average|5": { + "acc": 0.6661794809691, + "acc_stderr": 0.033327669029227354 + }, + "all": { + "acc": 0.6635023512851042, + "acc_stderr": 0.032200498833699506, + "acc_norm": 0.6568970902414637, + "acc_norm_stderr": 0.009417504097378147, + "truthfulqa_mc1": 0.37454100367197063, + "truthfulqa_mc1_stderr": 0.016943535128405338, + "truthfulqa_mc2": 0.5337684444397199, + "truthfulqa_mc2_stderr": 0.015971485281891525, + "qem": 0.6808188021228203, + "qem_stderr": 0.012840345676251653 + } + }, + "versions": { + "leaderboard|arc:challenge|25": 0, + "leaderboard|gsm8k|5": 0, + "leaderboard|hellaswag|10": 0, + "leaderboard|mmlu:abstract_algebra|5": 0, + "leaderboard|mmlu:anatomy|5": 0, + "leaderboard|mmlu:astronomy|5": 0, + "leaderboard|mmlu:business_ethics|5": 0, + "leaderboard|mmlu:clinical_knowledge|5": 0, + "leaderboard|mmlu:college_biology|5": 0, + "leaderboard|mmlu:college_chemistry|5": 0, + "leaderboard|mmlu:college_computer_science|5": 0, + "leaderboard|mmlu:college_mathematics|5": 0, + "leaderboard|mmlu:college_medicine|5": 0, + "leaderboard|mmlu:college_physics|5": 0, + "leaderboard|mmlu:computer_security|5": 0, + "leaderboard|mmlu:conceptual_physics|5": 0, + "leaderboard|mmlu:econometrics|5": 0, + "leaderboard|mmlu:electrical_engineering|5": 0, + "leaderboard|mmlu:elementary_mathematics|5": 0, + "leaderboard|mmlu:formal_logic|5": 0, + "leaderboard|mmlu:global_facts|5": 0, + "leaderboard|mmlu:high_school_biology|5": 0, + "leaderboard|mmlu:high_school_chemistry|5": 0, + "leaderboard|mmlu:high_school_computer_science|5": 0, + "leaderboard|mmlu:high_school_european_history|5": 0, + "leaderboard|mmlu:high_school_geography|5": 0, + "leaderboard|mmlu:high_school_government_and_politics|5": 0, + "leaderboard|mmlu:high_school_macroeconomics|5": 0, + "leaderboard|mmlu:high_school_mathematics|5": 0, + "leaderboard|mmlu:high_school_microeconomics|5": 0, + "leaderboard|mmlu:high_school_physics|5": 0, + "leaderboard|mmlu:high_school_psychology|5": 0, + "leaderboard|mmlu:high_school_statistics|5": 0, + "leaderboard|mmlu:high_school_us_history|5": 0, + "leaderboard|mmlu:high_school_world_history|5": 0, + "leaderboard|mmlu:human_aging|5": 0, + "leaderboard|mmlu:human_sexuality|5": 0, + "leaderboard|mmlu:international_law|5": 0, + "leaderboard|mmlu:jurisprudence|5": 0, + "leaderboard|mmlu:logical_fallacies|5": 0, + "leaderboard|mmlu:machine_learning|5": 0, + "leaderboard|mmlu:management|5": 0, + "leaderboard|mmlu:marketing|5": 0, + "leaderboard|mmlu:medical_genetics|5": 0, + "leaderboard|mmlu:miscellaneous|5": 0, + "leaderboard|mmlu:moral_disputes|5": 0, + "leaderboard|mmlu:moral_scenarios|5": 0, + "leaderboard|mmlu:nutrition|5": 0, + "leaderboard|mmlu:philosophy|5": 0, + "leaderboard|mmlu:prehistory|5": 0, + "leaderboard|mmlu:professional_accounting|5": 0, + "leaderboard|mmlu:professional_law|5": 0, + "leaderboard|mmlu:professional_medicine|5": 0, + "leaderboard|mmlu:professional_psychology|5": 0, + "leaderboard|mmlu:public_relations|5": 0, + "leaderboard|mmlu:security_studies|5": 0, + "leaderboard|mmlu:sociology|5": 0, + "leaderboard|mmlu:us_foreign_policy|5": 0, + "leaderboard|mmlu:virology|5": 0, + "leaderboard|mmlu:world_religions|5": 0, + "leaderboard|truthfulqa:mc|0": 0, + "leaderboard|winogrande|5": 0 + }, + "config_tasks": { + "leaderboard|arc:challenge": { + "name": "arc:challenge", + "prompt_function": "arc", + "hf_repo": "ai2_arc", + "hf_subset": "ARC-Challenge", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm_nospace" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "arc" + ], + "original_num_docs": 1172, + "effective_num_docs": 1172, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|gsm8k": { + "name": "gsm8k", + "prompt_function": "gsm8k", + "hf_repo": "gsm8k", + "hf_subset": "main", + "metric": [ + "quasi_exact_match_gsm8k" + ], + "hf_avail_splits": [ + "train", + "test" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": 256, + "stop_sequence": [ + "Question:", + "Question", + ":" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1319, + "effective_num_docs": 1319, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|hellaswag": { + "name": "hellaswag", + "prompt_function": "hellaswag_harness", + "hf_repo": "hellaswag", + "hf_subset": "default", + "metric": [ + "loglikelihood_acc", + "loglikelihood_acc_norm" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling_from_train", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 10042, + "effective_num_docs": 10042, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:abstract_algebra": { + "name": "mmlu:abstract_algebra", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "abstract_algebra", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:anatomy": { + "name": "mmlu:anatomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "anatomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 135, + "effective_num_docs": 135, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:astronomy": { + "name": "mmlu:astronomy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "astronomy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 152, + "effective_num_docs": 152, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:business_ethics": { + "name": "mmlu:business_ethics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "business_ethics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:clinical_knowledge": { + "name": "mmlu:clinical_knowledge", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "clinical_knowledge", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 265, + "effective_num_docs": 265, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_biology": { + "name": "mmlu:college_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 144, + "effective_num_docs": 144, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_chemistry": { + "name": "mmlu:college_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_computer_science": { + "name": "mmlu:college_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_mathematics": { + "name": "mmlu:college_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_medicine": { + "name": "mmlu:college_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 173, + "effective_num_docs": 173, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:college_physics": { + "name": "mmlu:college_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "college_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 102, + "effective_num_docs": 102, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:computer_security": { + "name": "mmlu:computer_security", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "computer_security", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:conceptual_physics": { + "name": "mmlu:conceptual_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "conceptual_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 235, + "effective_num_docs": 235, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:econometrics": { + "name": "mmlu:econometrics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "econometrics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 114, + "effective_num_docs": 114, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:electrical_engineering": { + "name": "mmlu:electrical_engineering", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "electrical_engineering", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 145, + "effective_num_docs": 145, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:elementary_mathematics": { + "name": "mmlu:elementary_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "elementary_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 378, + "effective_num_docs": 378, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:formal_logic": { + "name": "mmlu:formal_logic", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "formal_logic", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 126, + "effective_num_docs": 126, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:global_facts": { + "name": "mmlu:global_facts", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "global_facts", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_biology": { + "name": "mmlu:high_school_biology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_biology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 310, + "effective_num_docs": 310, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_chemistry": { + "name": "mmlu:high_school_chemistry", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_chemistry", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 203, + "effective_num_docs": 203, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_computer_science": { + "name": "mmlu:high_school_computer_science", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_computer_science", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_european_history": { + "name": "mmlu:high_school_european_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_european_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 165, + "effective_num_docs": 165, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_geography": { + "name": "mmlu:high_school_geography", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_geography", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 198, + "effective_num_docs": 198, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics": { + "name": "mmlu:high_school_government_and_politics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_government_and_politics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 193, + "effective_num_docs": 193, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics": { + "name": "mmlu:high_school_macroeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_macroeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 390, + "effective_num_docs": 390, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_mathematics": { + "name": "mmlu:high_school_mathematics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_mathematics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 270, + "effective_num_docs": 270, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_microeconomics": { + "name": "mmlu:high_school_microeconomics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_microeconomics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 238, + "effective_num_docs": 238, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_physics": { + "name": "mmlu:high_school_physics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_physics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 151, + "effective_num_docs": 151, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_psychology": { + "name": "mmlu:high_school_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 545, + "effective_num_docs": 545, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_statistics": { + "name": "mmlu:high_school_statistics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_statistics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 216, + "effective_num_docs": 216, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_us_history": { + "name": "mmlu:high_school_us_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_us_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 204, + "effective_num_docs": 204, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:high_school_world_history": { + "name": "mmlu:high_school_world_history", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "high_school_world_history", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 237, + "effective_num_docs": 237, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_aging": { + "name": "mmlu:human_aging", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_aging", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 223, + "effective_num_docs": 223, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:human_sexuality": { + "name": "mmlu:human_sexuality", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "human_sexuality", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 131, + "effective_num_docs": 131, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:international_law": { + "name": "mmlu:international_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "international_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 121, + "effective_num_docs": 121, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:jurisprudence": { + "name": "mmlu:jurisprudence", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "jurisprudence", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 108, + "effective_num_docs": 108, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:logical_fallacies": { + "name": "mmlu:logical_fallacies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "logical_fallacies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 163, + "effective_num_docs": 163, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:machine_learning": { + "name": "mmlu:machine_learning", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "machine_learning", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 112, + "effective_num_docs": 112, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:management": { + "name": "mmlu:management", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "management", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 103, + "effective_num_docs": 103, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:marketing": { + "name": "mmlu:marketing", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "marketing", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 234, + "effective_num_docs": 234, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:medical_genetics": { + "name": "mmlu:medical_genetics", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "medical_genetics", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:miscellaneous": { + "name": "mmlu:miscellaneous", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "miscellaneous", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 783, + "effective_num_docs": 783, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_disputes": { + "name": "mmlu:moral_disputes", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_disputes", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 346, + "effective_num_docs": 346, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:moral_scenarios": { + "name": "mmlu:moral_scenarios", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "moral_scenarios", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 895, + "effective_num_docs": 895, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:nutrition": { + "name": "mmlu:nutrition", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "nutrition", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 306, + "effective_num_docs": 306, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:philosophy": { + "name": "mmlu:philosophy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "philosophy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 311, + "effective_num_docs": 311, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:prehistory": { + "name": "mmlu:prehistory", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "prehistory", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 324, + "effective_num_docs": 324, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_accounting": { + "name": "mmlu:professional_accounting", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_accounting", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 282, + "effective_num_docs": 282, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_law": { + "name": "mmlu:professional_law", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_law", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 1534, + "effective_num_docs": 1534, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_medicine": { + "name": "mmlu:professional_medicine", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_medicine", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 272, + "effective_num_docs": 272, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:professional_psychology": { + "name": "mmlu:professional_psychology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "professional_psychology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 612, + "effective_num_docs": 612, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:public_relations": { + "name": "mmlu:public_relations", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "public_relations", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 110, + "effective_num_docs": 110, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:security_studies": { + "name": "mmlu:security_studies", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "security_studies", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 245, + "effective_num_docs": 245, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:sociology": { + "name": "mmlu:sociology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "sociology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 201, + "effective_num_docs": 201, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:us_foreign_policy": { + "name": "mmlu:us_foreign_policy", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "us_foreign_policy", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 100, + "effective_num_docs": 100, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:virology": { + "name": "mmlu:virology", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "virology", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 166, + "effective_num_docs": 166, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|mmlu:world_religions": { + "name": "mmlu:world_religions", + "prompt_function": "mmlu_harness", + "hf_repo": "lighteval/mmlu", + "hf_subset": "world_religions", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "auxiliary_train", + "test", + "validation", + "dev" + ], + "evaluation_splits": [ + "test" + ], + "few_shots_split": "dev", + "few_shots_select": "sequential", + "generation_size": 1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard", + "mmlu" + ], + "original_num_docs": 171, + "effective_num_docs": 171, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|truthfulqa:mc": { + "name": "truthfulqa:mc", + "prompt_function": "truthful_qa_multiple_choice", + "hf_repo": "truthful_qa", + "hf_subset": "multiple_choice", + "metric": [ + "truthfulqa_mc_metrics" + ], + "hf_avail_splits": [ + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": null, + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 817, + "effective_num_docs": 817, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + }, + "leaderboard|winogrande": { + "name": "winogrande", + "prompt_function": "winogrande", + "hf_repo": "winogrande", + "hf_subset": "winogrande_xl", + "metric": [ + "loglikelihood_acc" + ], + "hf_avail_splits": [ + "train", + "test", + "validation" + ], + "evaluation_splits": [ + "validation" + ], + "few_shots_split": null, + "few_shots_select": "random_sampling", + "generation_size": -1, + "stop_sequence": [ + "\n" + ], + "output_regex": null, + "num_samples": null, + "frozen": false, + "suite": [ + "leaderboard" + ], + "original_num_docs": 1267, + "effective_num_docs": 1267, + "trust_dataset": true, + "must_remove_duplicate_docs": null, + "version": 0 + } + }, + "summary_tasks": { + "leaderboard|arc:challenge|25": { + "hashes": { + "hash_examples": "17b0cae357c0259e", + "hash_full_prompts": "4aeb23a740784b86", + "hash_input_tokens": "2e9e18067d1f8ad8", + "hash_cont_tokens": "19baa8a044eaaac8" + }, + "truncated": 0, + "non_truncated": 1172, + "padded": 4687, + "non_padded": 0, + "effective_few_shots": 25.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|hellaswag|10": { + "hashes": { + "hash_examples": "31985c805c3a737e", + "hash_full_prompts": "3c2d3440e190b07b", + "hash_input_tokens": "412fc1d29623282b", + "hash_cont_tokens": "823c88a16c837063" + }, + "truncated": 0, + "non_truncated": 10042, + "padded": 40105, + "non_padded": 63, + "effective_few_shots": 10.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:abstract_algebra|5": { + "hashes": { + "hash_examples": "4c76229e00c9c0e9", + "hash_full_prompts": "faefa0cccb952fe0", + "hash_input_tokens": "e7380c35f0e2c4b3", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:anatomy|5": { + "hashes": { + "hash_examples": "6a1f8104dccbd33b", + "hash_full_prompts": "eacd03e46972fa59", + "hash_input_tokens": "2ee8bc2ef4561b6b", + "hash_cont_tokens": "9be31d13c42ead00" + }, + "truncated": 0, + "non_truncated": 135, + "padded": 540, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:astronomy|5": { + "hashes": { + "hash_examples": "1302effa3a76ce4c", + "hash_full_prompts": "826cacbdf1f6bfd0", + "hash_input_tokens": "6ab8d24255ff03b3", + "hash_cont_tokens": "30cc2b2fc1294aac" + }, + "truncated": 0, + "non_truncated": 152, + "padded": 608, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:business_ethics|5": { + "hashes": { + "hash_examples": "03cb8bce5336419a", + "hash_full_prompts": "518511169382ac39", + "hash_input_tokens": "8be4f0cc9ce448e1", + "hash_cont_tokens": "4e9d83c717b7deb8" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:clinical_knowledge|5": { + "hashes": { + "hash_examples": "ffbb9c7b2be257f9", + "hash_full_prompts": "0b07b0bc774fdfd9", + "hash_input_tokens": "413166c01db52a72", + "hash_cont_tokens": "40dd7263ce5af5de" + }, + "truncated": 0, + "non_truncated": 265, + "padded": 1060, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_biology|5": { + "hashes": { + "hash_examples": "3ee77f176f38eb8e", + "hash_full_prompts": "22cbe0e8dabf98b1", + "hash_input_tokens": "0dcd583202383d43", + "hash_cont_tokens": "1892d80e82b394c0" + }, + "truncated": 0, + "non_truncated": 144, + "padded": 576, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_chemistry|5": { + "hashes": { + "hash_examples": "ce61a69c46d47aeb", + "hash_full_prompts": "9c1288940a4afb59", + "hash_input_tokens": "59a4f0d36881d644", + "hash_cont_tokens": "b6bb78fb2d7e4e6f" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_computer_science|5": { + "hashes": { + "hash_examples": "32805b52d7d5daab", + "hash_full_prompts": "9522781d0cdf1a43", + "hash_input_tokens": "302a2f1d05b53513", + "hash_cont_tokens": "6a5da979260e607c" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_mathematics|5": { + "hashes": { + "hash_examples": "55da1a0a0bd33722", + "hash_full_prompts": "72fe6f46a57e6ca4", + "hash_input_tokens": "042f1988f13b8f9a", + "hash_cont_tokens": "62df3b0447bd3b12" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_medicine|5": { + "hashes": { + "hash_examples": "c33e143163049176", + "hash_full_prompts": "dee0989b2c8993f4", + "hash_input_tokens": "6dd81075c8e816e9", + "hash_cont_tokens": "933c01711a0757a0" + }, + "truncated": 0, + "non_truncated": 173, + "padded": 692, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:college_physics|5": { + "hashes": { + "hash_examples": "ebdab1cdb7e555df", + "hash_full_prompts": "a1be6b64ea1948c3", + "hash_input_tokens": "37818fa59254732b", + "hash_cont_tokens": "d36569ab90faad7c" + }, + "truncated": 0, + "non_truncated": 102, + "padded": 408, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:computer_security|5": { + "hashes": { + "hash_examples": "a24fd7d08a560921", + "hash_full_prompts": "01bc3fdfdefe67a4", + "hash_input_tokens": "d4957d5a9d5e83ec", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:conceptual_physics|5": { + "hashes": { + "hash_examples": "8300977a79386993", + "hash_full_prompts": "b39315a8ada3ca79", + "hash_input_tokens": "c146a84803f78c9e", + "hash_cont_tokens": "6408f70f3d9ada31" + }, + "truncated": 0, + "non_truncated": 235, + "padded": 940, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:econometrics|5": { + "hashes": { + "hash_examples": "ddde36788a04a46f", + "hash_full_prompts": "70bab37ca5fcc48f", + "hash_input_tokens": "086bc025be133096", + "hash_cont_tokens": "3befa885ca6e4b97" + }, + "truncated": 0, + "non_truncated": 114, + "padded": 456, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:electrical_engineering|5": { + "hashes": { + "hash_examples": "acbc5def98c19b3f", + "hash_full_prompts": "86a4747481c11c61", + "hash_input_tokens": "b83507ac94ded59b", + "hash_cont_tokens": "e75df8f470aa4973" + }, + "truncated": 0, + "non_truncated": 145, + "padded": 580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:elementary_mathematics|5": { + "hashes": { + "hash_examples": "146e61d07497a9bd", + "hash_full_prompts": "1fe56333735325fa", + "hash_input_tokens": "8c3c868b34bad37b", + "hash_cont_tokens": "f09c97e7f7f9af71" + }, + "truncated": 0, + "non_truncated": 378, + "padded": 1512, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:formal_logic|5": { + "hashes": { + "hash_examples": "8635216e1909a03f", + "hash_full_prompts": "cc83c1ede45f974c", + "hash_input_tokens": "bb0616a24585501c", + "hash_cont_tokens": "df96e75b4eb1d7b0" + }, + "truncated": 0, + "non_truncated": 126, + "padded": 504, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:global_facts|5": { + "hashes": { + "hash_examples": "30b315aa6353ee47", + "hash_full_prompts": "3a2ec1e2785c69a5", + "hash_input_tokens": "5e840dc7f1c55a67", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_biology|5": { + "hashes": { + "hash_examples": "c9136373af2180de", + "hash_full_prompts": "27646a569cf2a6f8", + "hash_input_tokens": "1dce672a00c5cbe1", + "hash_cont_tokens": "c6d11e73dc85157f" + }, + "truncated": 0, + "non_truncated": 310, + "padded": 1240, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_chemistry|5": { + "hashes": { + "hash_examples": "b0661bfa1add6404", + "hash_full_prompts": "6905c6ca76f7b2b7", + "hash_input_tokens": "7fb2dd590b34e445", + "hash_cont_tokens": "208aff39cfca671a" + }, + "truncated": 0, + "non_truncated": 203, + "padded": 812, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_computer_science|5": { + "hashes": { + "hash_examples": "80fc1d623a3d665f", + "hash_full_prompts": "b80092241e8b6c06", + "hash_input_tokens": "b2a9091fd8d00b66", + "hash_cont_tokens": "150a6d581009fbe0" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_european_history|5": { + "hashes": { + "hash_examples": "854da6e5af0fe1a1", + "hash_full_prompts": "a3bc32a5dc022ce7", + "hash_input_tokens": "393e215e8667fde4", + "hash_cont_tokens": "7b6f4c22b304c3cc" + }, + "truncated": 0, + "non_truncated": 165, + "padded": 656, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_geography|5": { + "hashes": { + "hash_examples": "7dc963c7acd19ad8", + "hash_full_prompts": "53f91beae305905d", + "hash_input_tokens": "439ac435fc478534", + "hash_cont_tokens": "1a85c9e696d91a66" + }, + "truncated": 0, + "non_truncated": 198, + "padded": 792, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_government_and_politics|5": { + "hashes": { + "hash_examples": "1f675dcdebc9758f", + "hash_full_prompts": "623fd7e3495f243f", + "hash_input_tokens": "2c5757b8545f7cf8", + "hash_cont_tokens": "a47a4530b8790081" + }, + "truncated": 0, + "non_truncated": 193, + "padded": 772, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_macroeconomics|5": { + "hashes": { + "hash_examples": "2fb32cf2d80f0b35", + "hash_full_prompts": "378ac13c8abb6c5f", + "hash_input_tokens": "afea2ca30b1622ff", + "hash_cont_tokens": "e71e7c6acf44c3e5" + }, + "truncated": 0, + "non_truncated": 390, + "padded": 1560, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_mathematics|5": { + "hashes": { + "hash_examples": "fd6646fdb5d58a1f", + "hash_full_prompts": "14d34e0b34750627", + "hash_input_tokens": "34e63b0902b32a2c", + "hash_cont_tokens": "e36b5624bdbe96b0" + }, + "truncated": 0, + "non_truncated": 270, + "padded": 1080, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_microeconomics|5": { + "hashes": { + "hash_examples": "2118f21f71d87d84", + "hash_full_prompts": "9ac09e5d4da991c9", + "hash_input_tokens": "93d1c1ba5fe0bcbd", + "hash_cont_tokens": "a5f61d5beba13cc2" + }, + "truncated": 0, + "non_truncated": 238, + "padded": 952, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_physics|5": { + "hashes": { + "hash_examples": "dc3ce06378548565", + "hash_full_prompts": "b4832a554d47d224", + "hash_input_tokens": "f5bf59bc9f6839fe", + "hash_cont_tokens": "df1d218ccbc258e8" + }, + "truncated": 0, + "non_truncated": 151, + "padded": 604, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_psychology|5": { + "hashes": { + "hash_examples": "c8d1d98a40e11f2f", + "hash_full_prompts": "1e8cd27064546274", + "hash_input_tokens": "329851f26db67226", + "hash_cont_tokens": "6fb549a4eb8e6c47" + }, + "truncated": 0, + "non_truncated": 545, + "padded": 2180, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_statistics|5": { + "hashes": { + "hash_examples": "666c8759b98ee4ff", + "hash_full_prompts": "e05ab41077ec0afa", + "hash_input_tokens": "7abad93393993e44", + "hash_cont_tokens": "d9528c65af653d67" + }, + "truncated": 0, + "non_truncated": 216, + "padded": 864, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_us_history|5": { + "hashes": { + "hash_examples": "95fef1c4b7d3f81e", + "hash_full_prompts": "a4b275996a416b4a", + "hash_input_tokens": "e5def820604ad889", + "hash_cont_tokens": "8b827fc7dfd3c1c5" + }, + "truncated": 0, + "non_truncated": 204, + "padded": 816, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:high_school_world_history|5": { + "hashes": { + "hash_examples": "7e5085b6184b0322", + "hash_full_prompts": "8adf16361f0f320a", + "hash_input_tokens": "aa85ae4eba20e53f", + "hash_cont_tokens": "82f19c159c69a66d" + }, + "truncated": 0, + "non_truncated": 237, + "padded": 948, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_aging|5": { + "hashes": { + "hash_examples": "c17333e7c7c10797", + "hash_full_prompts": "918d91a3141aac4d", + "hash_input_tokens": "297fceccf01a2c64", + "hash_cont_tokens": "ca87074f1dc39668" + }, + "truncated": 0, + "non_truncated": 223, + "padded": 892, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:human_sexuality|5": { + "hashes": { + "hash_examples": "4edd1e9045df5e3d", + "hash_full_prompts": "bcee39ecea32fcc8", + "hash_input_tokens": "7c66a375881d6788", + "hash_cont_tokens": "491a0ab53f54aeb9" + }, + "truncated": 0, + "non_truncated": 131, + "padded": 524, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:international_law|5": { + "hashes": { + "hash_examples": "db2fa00d771a062a", + "hash_full_prompts": "ffe12a3b5bf350c2", + "hash_input_tokens": "dc0250213736abca", + "hash_cont_tokens": "e3d257d7ea257fc8" + }, + "truncated": 0, + "non_truncated": 121, + "padded": 484, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:jurisprudence|5": { + "hashes": { + "hash_examples": "e956f86b124076fe", + "hash_full_prompts": "b4293c3c08bebaf7", + "hash_input_tokens": "c9ed773ed04cff64", + "hash_cont_tokens": "4c69d7671fa1ab1c" + }, + "truncated": 0, + "non_truncated": 108, + "padded": 432, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:logical_fallacies|5": { + "hashes": { + "hash_examples": "956e0e6365ab79f1", + "hash_full_prompts": "8c1b7733e98cbe81", + "hash_input_tokens": "a4f6df541a56c41a", + "hash_cont_tokens": "57e78d3d09b7db81" + }, + "truncated": 0, + "non_truncated": 163, + "padded": 652, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:machine_learning|5": { + "hashes": { + "hash_examples": "397997cc6f4d581e", + "hash_full_prompts": "24a206a1c639ab8d", + "hash_input_tokens": "f0dfd08579d1f727", + "hash_cont_tokens": "94d2ec6c52bb7b53" + }, + "truncated": 0, + "non_truncated": 112, + "padded": 448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:management|5": { + "hashes": { + "hash_examples": "2bcbe6f6ca63d740", + "hash_full_prompts": "77e1c79d988beecc", + "hash_input_tokens": "15925fd62ddd3ca4", + "hash_cont_tokens": "79499fecb18f1cb1" + }, + "truncated": 0, + "non_truncated": 103, + "padded": 412, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:marketing|5": { + "hashes": { + "hash_examples": "8ddb20d964a1b065", + "hash_full_prompts": "83cec2fa6b681d9d", + "hash_input_tokens": "6eb177c438da2061", + "hash_cont_tokens": "c5e9cd86b1a58fac" + }, + "truncated": 0, + "non_truncated": 234, + "padded": 936, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:medical_genetics|5": { + "hashes": { + "hash_examples": "182a71f4763d2cea", + "hash_full_prompts": "195eb7ff99749730", + "hash_input_tokens": "5adeca0d34767f29", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 400, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:miscellaneous|5": { + "hashes": { + "hash_examples": "4c404fdbb4ca57fc", + "hash_full_prompts": "33539955c9a96851", + "hash_input_tokens": "52aee92a69c2b698", + "hash_cont_tokens": "8578b82c42cc7026" + }, + "truncated": 0, + "non_truncated": 783, + "padded": 3132, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_disputes|5": { + "hashes": { + "hash_examples": "60cbd2baa3fea5c9", + "hash_full_prompts": "009b7d0e7f819eff", + "hash_input_tokens": "f24c046b105c5e03", + "hash_cont_tokens": "26b0f808ec46464d" + }, + "truncated": 0, + "non_truncated": 346, + "padded": 1384, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:moral_scenarios|5": { + "hashes": { + "hash_examples": "fd8b0431fbdd75ef", + "hash_full_prompts": "f6e63c9fb9d3bff0", + "hash_input_tokens": "08eee0e3d8e89710", + "hash_cont_tokens": "52fe77d28aefc1b3" + }, + "truncated": 0, + "non_truncated": 895, + "padded": 3580, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:nutrition|5": { + "hashes": { + "hash_examples": "71e55e2b829b6528", + "hash_full_prompts": "8294d5e3ad435377", + "hash_input_tokens": "5b2c6686c8fc5e83", + "hash_cont_tokens": "25850a01b4a11b53" + }, + "truncated": 0, + "non_truncated": 306, + "padded": 1224, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:philosophy|5": { + "hashes": { + "hash_examples": "a6d489a8d208fa4b", + "hash_full_prompts": "db68c0f4503e4793", + "hash_input_tokens": "7108ad04b556854f", + "hash_cont_tokens": "8c34ab2fa65c3b6e" + }, + "truncated": 0, + "non_truncated": 311, + "padded": 1244, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:prehistory|5": { + "hashes": { + "hash_examples": "6cc50f032a19acaa", + "hash_full_prompts": "3972bcfa8c80e964", + "hash_input_tokens": "65cb6b1efc71921b", + "hash_cont_tokens": "89f21e5f9c7d81f2" + }, + "truncated": 0, + "non_truncated": 324, + "padded": 1296, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_accounting|5": { + "hashes": { + "hash_examples": "50f57ab32f5f6cea", + "hash_full_prompts": "25f0becc2483bd32", + "hash_input_tokens": "c1b1c1e1f1ca4a85", + "hash_cont_tokens": "c7c4930a659ca843" + }, + "truncated": 0, + "non_truncated": 282, + "padded": 1120, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_law|5": { + "hashes": { + "hash_examples": "a8fdc85c64f4b215", + "hash_full_prompts": "7a6f6c5706f00c7d", + "hash_input_tokens": "e7517115da0204cd", + "hash_cont_tokens": "6f36bd560ae36f02" + }, + "truncated": 0, + "non_truncated": 1534, + "padded": 6136, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_medicine|5": { + "hashes": { + "hash_examples": "c373a28a3050a73a", + "hash_full_prompts": "a74b6ac7c5c545d2", + "hash_input_tokens": "da6af6d03e682017", + "hash_cont_tokens": "ca4398b4ad3db5f1" + }, + "truncated": 0, + "non_truncated": 272, + "padded": 1088, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:professional_psychology|5": { + "hashes": { + "hash_examples": "bf5254fe818356af", + "hash_full_prompts": "c53fa139ec25f502", + "hash_input_tokens": "c6dbaf3c7103ebe9", + "hash_cont_tokens": "ce4bb75e80359fe4" + }, + "truncated": 0, + "non_truncated": 612, + "padded": 2448, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:public_relations|5": { + "hashes": { + "hash_examples": "b66d52e28e7d14e0", + "hash_full_prompts": "55b5eff05aa6bf13", + "hash_input_tokens": "deea75b6eec5b782", + "hash_cont_tokens": "680235f5ede0b353" + }, + "truncated": 0, + "non_truncated": 110, + "padded": 440, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:security_studies|5": { + "hashes": { + "hash_examples": "514c14feaf000ad9", + "hash_full_prompts": "6690ecdc054f7b0c", + "hash_input_tokens": "deef3d39896aca43", + "hash_cont_tokens": "189956efcec12818" + }, + "truncated": 0, + "non_truncated": 245, + "padded": 980, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:sociology|5": { + "hashes": { + "hash_examples": "f6c9bc9d18c80870", + "hash_full_prompts": "945fbdd091c72d64", + "hash_input_tokens": "330fffbccabf89e4", + "hash_cont_tokens": "2178ff937c0c1a29" + }, + "truncated": 0, + "non_truncated": 201, + "padded": 804, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:us_foreign_policy|5": { + "hashes": { + "hash_examples": "ed7b78629db6678f", + "hash_full_prompts": "ebba6ea6eca4ae53", + "hash_input_tokens": "0ec87fa768a47632", + "hash_cont_tokens": "a886b3552371a98b" + }, + "truncated": 0, + "non_truncated": 100, + "padded": 392, + "non_padded": 8, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:virology|5": { + "hashes": { + "hash_examples": "bc52ffdc3f9b994a", + "hash_full_prompts": "a2ee4984d6877fe3", + "hash_input_tokens": "cc264818195d14da", + "hash_cont_tokens": "ec5c187546c7c842" + }, + "truncated": 0, + "non_truncated": 166, + "padded": 660, + "non_padded": 4, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|mmlu:world_religions|5": { + "hashes": { + "hash_examples": "ecdb4a4f94f62930", + "hash_full_prompts": "a89c8dddd1d8ced0", + "hash_input_tokens": "e7e781ba363743eb", + "hash_cont_tokens": "e52b573046cdfc5c" + }, + "truncated": 0, + "non_truncated": 171, + "padded": 684, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|truthfulqa:mc|0": { + "hashes": { + "hash_examples": "36a6d90e75d92d4a", + "hash_full_prompts": "8d9ca0a8bd458a1c", + "hash_input_tokens": "4aad1a3bfe70acfc", + "hash_cont_tokens": "b0f64f6659d8c230" + }, + "truncated": 0, + "non_truncated": 817, + "padded": 9996, + "non_padded": 0, + "effective_few_shots": 0.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|winogrande|5": { + "hashes": { + "hash_examples": "087d5d1a1afd4c7b", + "hash_full_prompts": "35da55e47222e0e1", + "hash_input_tokens": "881c630a9e0034f7", + "hash_cont_tokens": "c466f4c92e3879cb" + }, + "truncated": 0, + "non_truncated": 1267, + "padded": 2534, + "non_padded": 0, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + }, + "leaderboard|gsm8k|5": { + "hashes": { + "hash_examples": "0ed016e24e7512fd", + "hash_full_prompts": "f7ab209f6467841e", + "hash_input_tokens": "deccfe61ad5cb3d5", + "hash_cont_tokens": "95cc4cc1148eb790" + }, + "truncated": 1319, + "non_truncated": 0, + "padded": 1074, + "non_padded": 245, + "effective_few_shots": 5.0, + "num_truncated_few_shots": 0 + } + }, + "summary_general": { + "hashes": { + "hash_examples": "670666fa3a90ce5d", + "hash_full_prompts": "56c005e427046302", + "hash_input_tokens": "2a51da62c271a1a0", + "hash_cont_tokens": "a74619de92c05f2e" + }, + "truncated": 1319, + "non_truncated": 27340, + "padded": 114540, + "non_padded": 332, + "num_truncated_few_shots": 0 + } +} \ No newline at end of file