diff --git a/.gitattributes b/.gitattributes index b356c0fc2c6d74b967a3cab126ffdbeebb4ddd77..365534affd5cfa6b801af22c87dcba8a538e5230 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1,2 +1,35 @@ +*.7z filter=lfs diff=lfs merge=lfs -text +*.arrow filter=lfs diff=lfs merge=lfs -text +*.bin filter=lfs diff=lfs merge=lfs -text +*.bz2 filter=lfs diff=lfs merge=lfs -text +*.ckpt filter=lfs diff=lfs merge=lfs -text +*.ftz filter=lfs diff=lfs merge=lfs -text +*.gz filter=lfs diff=lfs merge=lfs -text +*.h5 filter=lfs diff=lfs merge=lfs -text +*.joblib filter=lfs diff=lfs merge=lfs -text +*.lfs.* filter=lfs diff=lfs merge=lfs -text +*.mlmodel filter=lfs diff=lfs merge=lfs -text +*.model filter=lfs diff=lfs merge=lfs -text +*.msgpack filter=lfs diff=lfs merge=lfs -text +*.npy filter=lfs diff=lfs merge=lfs -text +*.npz filter=lfs diff=lfs merge=lfs -text +*.onnx filter=lfs diff=lfs merge=lfs -text +*.ot filter=lfs diff=lfs merge=lfs -text +*.parquet filter=lfs diff=lfs merge=lfs -text +*.pb filter=lfs diff=lfs merge=lfs -text +*.pickle filter=lfs diff=lfs merge=lfs -text +*.pkl filter=lfs diff=lfs merge=lfs -text +*.pt filter=lfs diff=lfs merge=lfs -text +*.pth filter=lfs diff=lfs merge=lfs -text +*.rar filter=lfs diff=lfs merge=lfs -text +*.safetensors filter=lfs diff=lfs merge=lfs -text +saved_model/**/* filter=lfs diff=lfs merge=lfs -text +*.tar.* filter=lfs diff=lfs merge=lfs -text +*.tflite filter=lfs diff=lfs merge=lfs -text +*.tgz filter=lfs diff=lfs merge=lfs -text +*.wasm filter=lfs diff=lfs merge=lfs -text +*.xz filter=lfs diff=lfs merge=lfs -text +*.zip filter=lfs diff=lfs merge=lfs -text +*.zst filter=lfs diff=lfs merge=lfs -text +*tfevents* filter=lfs diff=lfs merge=lfs -text *.json filter=lfs diff=lfs merge=lfs -text -*.py filter=lfs diff=lfs merge=lfs -text diff --git a/README.md b/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4592f7d4c5f656f7dd4357ebb9cf7a7f075669a6 --- /dev/null +++ b/README.md @@ -0,0 +1,35 @@ +--- 
+title: Bittensor Lmeh Evaluations +emoji: 🏆 +colorFrom: green +colorTo: indigo +sdk: gradio +sdk_version: 3.33.1 +app_file: app.py +pinned: false +--- + +Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference + +# HF LLM Leaderboard Evaluation Scripts for Bittensor Validators + +### Install LM-Eval + +``` +git clone https://github.com/EleutherAI/lm-evaluation-harness +cd lm-evaluation-harness +pip install -e . +``` + +### Running Evaluations on Bittensor's Validator + +``` +python3 eval_bittensor.py --validator opentensor_foundation +``` + +### Running Evaluations on Bittensor's Validator + +``` +export BITAPAI_KEY=XXXX +python3 eval_bittensor.py --validator tasostats +``` \ No newline at end of file diff --git a/_results/few-shot/opentensor_foundation/arc_challenge_results.json b/_results/few-shot/opentensor_foundation/arc_challenge_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5857e4b0a81c4d90480bb8b77d9603afbf67ffc9 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/arc_challenge_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dad9333cc46a9c4a078adcd7ce3773e520023fe529349a88464cbf8bd54b0d5 +size 11320522 diff --git a/_results/few-shot/opentensor_foundation/hellaswag_results.json b/_results/few-shot/opentensor_foundation/hellaswag_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4d83f81848aacf67660bf8305c89e3e6713529e3 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hellaswag_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfcf84170317fb2f8c5126a252b840adbb264d14dc0ad4b832351a82195d73f6 +size 62308880 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-abstract_algebra_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-abstract_algebra_results.json new file mode 100644 index 
0000000000000000000000000000000000000000..72383905c57cf6c62fe52704fbd0b52778f38ce3 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-abstract_algebra_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e67a4214716005ae044d2df22f300d0a5279744e6766679d55f03975b54c5b14 +size 182462 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-anatomy_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-anatomy_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6dd54f8ada3ba1e720638ee896bb8a31887f3668 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-anatomy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6231fa4b0c7bd7f0bc8154335a494c2be1bdfdbe47c0da0ea75a643743303039 +size 318818 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-astronomy_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-astronomy_results.json new file mode 100644 index 0000000000000000000000000000000000000000..fbb93401b28c910c4bea995fa79523fc5cb07ba9 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-astronomy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:15aa008f4ccc1dd5c2fe8e245c61be73f09a0a06781464079f6f8c9076c5df26 +size 448776 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-business_ethics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-business_ethics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..83ff64baf0ffd6f49503c84f5fbe100584341c55 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-business_ethics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acea8d03e21feeb11bebd5bbb058c8acf46d1b92fa3b0c07120ae2beea47c2c7 +size 331001 diff --git 
a/_results/few-shot/opentensor_foundation/hendrycksTest-clinical_knowledge_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-clinical_knowledge_results.json new file mode 100644 index 0000000000000000000000000000000000000000..43de48bc959cc82b42be55fa10f6d645fb531c82 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-clinical_knowledge_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd90396124fb471b5c5202b85b83f45f1d6d72863b67dd2d55051a9f03f228fb +size 551475 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_biology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_biology_results.json new file mode 100644 index 0000000000000000000000000000000000000000..289d0ec487ed0b217f8a2be82a56d9c1f897339f --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-college_biology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e69161c78f17292121d0e4c0b5b99beff06501a442e05ef21eaee43cbe14e74 +size 426143 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_chemistry_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_chemistry_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cac846e9db100ac3183fa760bc5fc1cbee399987 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-college_chemistry_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b5a861c7e7392635b22269971cfa9b0b96e6d563089b0ea65a4db7c01ff002 +size 241452 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_computer_science_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_computer_science_results.json new file mode 100644 index 0000000000000000000000000000000000000000..88c96352189b575d3c62ca04d9361d8918fb341d --- /dev/null +++ 
b/_results/few-shot/opentensor_foundation/hendrycksTest-college_computer_science_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3135cd9fbcc948546858042ca203204e49d05f0cadb79b589322584a66a525b +size 374464 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_mathematics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_mathematics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f258185b9c038efbdf57ad6817c65162b51a4fa0 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-college_mathematics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3b43b1b9b528cf5603fd695e8d383e244615340b3d41d2a9899f294a8ff80f8 +size 223855 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_medicine_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_medicine_results.json new file mode 100644 index 0000000000000000000000000000000000000000..192441521fe94c239056806202cb5e1fc4cec606 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-college_medicine_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb2efe8fdfbbdcf551717a8b5c755e18fa77361fc2fcc8be11037b5ef1f7facc +size 599803 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-college_physics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-college_physics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0f380e91c3535dce48b739d30c149d262f4104dd --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-college_physics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb08b8ec4ffd7b73c0be2d394af1859643fda2f6ee30c0c2c662c187575d6320 +size 310562 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-computer_security_results.json 
b/_results/few-shot/opentensor_foundation/hendrycksTest-computer_security_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ffcae013ce89d73444819fd723e0b76afebb1ea8 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-computer_security_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ef3186f38c6bfc305cb838fa5a3b0e23968179d49a347ffce07ac561fca36e0 +size 337892 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-conceptual_physics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-conceptual_physics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b32ea645e3817346e093ed90566787892ca4dc7b --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-conceptual_physics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dd6102bfb2de78d47b978f561e9462c09a68970ad24af9cd38a8f254f92d7ee +size 477879 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-econometrics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-econometrics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7035317c4ad8e8b154bf20a529c18dfe9df3164c --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-econometrics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:067a4b1e10071949395e2757051395da1adaae33adad513cfaa296043d68039a +size 458660 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-electrical_engineering_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-electrical_engineering_results.json new file mode 100644 index 0000000000000000000000000000000000000000..2b1adf3fe11a1c4e1755539522608c3fc25b3bea --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-electrical_engineering_results.json @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:3aab1d15e4b9e5083f4532bb8a0180d71a2b0bde25beccfe96edfb369802a0fc +size 322549 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-elementary_mathematics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-elementary_mathematics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5ef88f4b418eb631d00a1032b7642f30d0dc9e6b --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-elementary_mathematics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:119a4c6719d6acc2f1a1a29e5bc3c6b379535191312905a7b72902b810453503 +size 712303 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-formal_logic_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-formal_logic_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cc3bce2ebc8f5995441fb0f3da3a934670631029 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-formal_logic_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a74eb48d3966c14313891e80798e1fcd8fb8ed804737d1533a87a9bbd83ae118 +size 565974 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-global_facts_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-global_facts_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8a4fb246ab0424e026170623b76326023319f2ac --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-global_facts_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7161ebba211acbff8fecfff0ae2c422afd8002dbc401c511a353b1f1b11969af +size 186486 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_biology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_biology_results.json new file mode 100644 index 
0000000000000000000000000000000000000000..804cc5faddb9d336a4d10dbc0f53e8a710ce0f14 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_biology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7afd2a958ddb9f3054e9cd01accf30bc8e3b11459627462b3830fc3bbb34897 +size 921326 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_chemistry_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_chemistry_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7b945cc9f5d65c42b9f8a918614d9d4ce6e65830 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_chemistry_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c040cd896cafd5cfc62382897a2b248a4a29164e0939472caa4d076d4f2e53 +size 520898 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_computer_science_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_computer_science_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8173d1f710d77fb2e26c356fd50ea4d5926e302d --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_computer_science_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c2dad4a8f6b8e362361b8453a4987de57075c61769458204d615aec1264e04d +size 422941 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_european_history_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_european_history_results.json new file mode 100644 index 0000000000000000000000000000000000000000..63a5fdf04026e03b9198c7d8d633249f74367dc9 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_european_history_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:73a7f95682110f6a84eb8487f70bea094167baf363b6f4456de0e4d602b1933f +size 1835394 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_geography_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_geography_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b9686f74c3910f4b14771fe4b6a1fc99c37b8805 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_geography_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c41effe75c3904d83ed4188626df35a147326de8fe9f5c38c4e38d7b492bdb57 +size 471737 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_government_and_politics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_government_and_politics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cf707c9077014af51cdd5ffbe97e153a3ad5c090 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_government_and_politics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ea7fb0bfd50e143cb70e88caa63e1632bb0cbce0eca38a680a575fa7ffc5b76 +size 684698 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_macroeconomics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_macroeconomics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1a81a564c39443893c7c5c1228d4b629290e50bc --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_macroeconomics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:af934233771969d35ca9884a3860e6c7134710a72abcbbc35be52cf4a8a97f55 +size 1206173 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_mathematics_results.json 
b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_mathematics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8a4f194f08e7a9e1f0c6424469220fe4e8e29b86 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_mathematics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f9257cb365d2d6e6a7d6239882072f1af48f6160870da1b1245b495f29ff5bf +size 693387 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_microeconomics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_microeconomics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4fbe67b06a2f466417559b7648a52e67fecf988d --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_microeconomics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10d2327c3332cf1e383a8b0067335d54268518ad32474215ddda0003fc7d8c9 +size 783462 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_physics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_physics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8d5ea2ec09eac93cf7a78254e3304451dafbaf34 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_physics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1915537a7bc12ccbd5f415ceae13cd9eec02e92cded5bf7158c310bb4cc6d488 +size 603384 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_psychology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_psychology_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6b1a493a7c4ccb02fd004f23d625a6d6168f26d3 --- /dev/null +++ 
b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_psychology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18ef75b5364a57e0e0c29181448d9aaa2d5d75d03f5261638383a2c9abb463d3 +size 1506092 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_statistics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_statistics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4f4984000ed5b5ca281b275299af566abb5961a1 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_statistics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:846d519d4d7c0f0989ba72314db3f1410343f637ad7f4aceba2e9486b3a650e5 +size 861687 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_us_history_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_us_history_results.json new file mode 100644 index 0000000000000000000000000000000000000000..5ab146c472f5e1bf3a9c746fb6063587fc2a78e2 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_us_history_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82ef4f0e0f1f774c52b27da5a4e3cdf99f7250fed623aa790cff7de83438d1f4 +size 2113578 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_world_history_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_world_history_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7b1ba749941f0e6f9ac4974bd1ce6da4f9c9d1da --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-high_school_world_history_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fa170bbf95675bcfcb0aba604b6b5f00179fccf5b642ffbad7a00126f4e69dd +size 2515273 diff --git 
a/_results/few-shot/opentensor_foundation/hendrycksTest-human_aging_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-human_aging_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7a60b759e89892d044a2cc0adaef77327025aec8 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-human_aging_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cfd154edd7d708659c47f9a6f4dd147217c4971c7640d36d1eb2798c1b8f029b +size 534049 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-human_sexuality_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-human_sexuality_results.json new file mode 100644 index 0000000000000000000000000000000000000000..45ee4a864599821cba3f6a90f219b16244712569 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-human_sexuality_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48b70bc5b952db2acef50f4f153cfc0593e419696a99690b562e073ca47595ff +size 350006 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-international_law_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-international_law_results.json new file mode 100644 index 0000000000000000000000000000000000000000..8af86218d05dd5f4646370b98e0be20883c35fdc --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-international_law_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc4f719780006bef2965878aed62e6bdfbdb99a5b465227f41f15c97f8fc6b99 +size 487066 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-jurisprudence_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-jurisprudence_results.json new file mode 100644 index 0000000000000000000000000000000000000000..4cb938ab17a4d5802a07a1335cfbf954375b3b38 --- /dev/null +++ 
b/_results/few-shot/opentensor_foundation/hendrycksTest-jurisprudence_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:943e681c1e8a2d448546b471322e5d7cc00f30f850139751ea9a2cab8d65f69d +size 343181 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-logical_fallacies_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-logical_fallacies_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6469e32894766ea3b034dc8ceebe92463201fc5d --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-logical_fallacies_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e954de079b36dad0c7329f198f573e61f5aa6a65655aed72d9e7510b09af059d +size 569334 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-machine_learning_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-machine_learning_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e2dcc835684ab182386a2c0ba2f1f1ea7bec1009 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-machine_learning_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e69615011562cb8f000da90331dfd1e6ced79649947d53df94d3f47138357466 +size 304342 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-management_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-management_results.json new file mode 100644 index 0000000000000000000000000000000000000000..7cca638ef0f1d168d29ad25071a7daeec95e4644 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-management_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6edebf332009fe75ddd773beffa2e0da2defa2172cf4a5cba0d0cac8a29427ad +size 222575 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-marketing_results.json 
b/_results/few-shot/opentensor_foundation/hendrycksTest-marketing_results.json new file mode 100644 index 0000000000000000000000000000000000000000..274c694f8c68a8b4066445ed13325593fe414eaf --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-marketing_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca5f82575448405ab7673ae5254b0649b0a1a2fc79c25cd751f851690d22791f +size 655021 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-medical_genetics_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-medical_genetics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b0a4d81bd5971964d72f15bc3b63749687e6c945 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-medical_genetics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5dde966f930e507a00f7cb5164859107a999f5e91a79a7b385fee583a8e42f2 +size 200691 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-miscellaneous_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-miscellaneous_results.json new file mode 100644 index 0000000000000000000000000000000000000000..3f3004d0aaf7e390455d9372406ff85cca8aa4ac --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-miscellaneous_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:243eb723b2a1c74dd1abb5be4427d791ab2a5c81a04e71d0bc5042087fd12d12 +size 1707392 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-moral_disputes_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-moral_disputes_results.json new file mode 100644 index 0000000000000000000000000000000000000000..31057e34e0a31b9637838dd7e58210b87ff4e1bb --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-moral_disputes_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:cb6cd16340412e7867202abe3451ff2d43643766af7e4b64b778245dca6126c3 +size 958319 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-moral_scenarios_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-moral_scenarios_results.json new file mode 100644 index 0000000000000000000000000000000000000000..510b9d7c36b0ca495e56f07a455c72c1af9c18f2 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-moral_scenarios_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0cbc4996951d6fb309f022d61730fb48da273d6041016001d8764ecb7e38491 +size 3255269 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-nutrition_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-nutrition_results.json new file mode 100644 index 0000000000000000000000000000000000000000..b68f644916650d0f892039fc99f7aa735bddcf96 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-nutrition_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0bf3b34795388702cc529803d54aad9678e6661c3f6896aadf1207e3bedb603 +size 825966 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-philosophy_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-philosophy_results.json new file mode 100644 index 0000000000000000000000000000000000000000..45a865f734d61593e7a4619b3416b3ed77f92f45 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-philosophy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28f77e13824aa667fcae36011b71dd7b92b9b0833bab4c7537f39ccf87710fa1 +size 736418 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-prehistory_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-prehistory_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d92e2320859737e796cbf0998bc755b833e1b1a9 --- /dev/null +++ 
b/_results/few-shot/opentensor_foundation/hendrycksTest-prehistory_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149800102fae35ef4ef964b226de5a17fada69d3cd4e400693ca9a35b7414087 +size 838492 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-professional_accounting_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_accounting_results.json new file mode 100644 index 0000000000000000000000000000000000000000..08eebe22da98f9fad54f5ae1524a7bd84745440a --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_accounting_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5253b66e628674f87832cd7f3ab0dc6b8ada1a69fc429799ffaeb08a56bf229 +size 1100058 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-professional_law_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_law_results.json new file mode 100644 index 0000000000000000000000000000000000000000..cb85ca1be42a25f20d6d4bc903085ab9b402aec1 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_law_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa075ef82af0999bdba782a9bbb3fdfdf6035767a4430a508ebc6c3a1cfabf86 +size 14464696 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-professional_medicine_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_medicine_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0de1466684efaa4a66971377982c689f2dae1d2d --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_medicine_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed0394f10954f1ccd055f4b9678bc88717d3b88a6c4fd4e07b8375325c05ceb3 +size 1954283 diff --git 
a/_results/few-shot/opentensor_foundation/hendrycksTest-professional_psychology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_psychology_results.json new file mode 100644 index 0000000000000000000000000000000000000000..82e38086d2692d1f156333f700b0df99f844a35a --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-professional_psychology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b615c1adc40e0caf45c29eb7c1cd282db98f3355986b0561078424154159860 +size 1886538 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-public_relations_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-public_relations_results.json new file mode 100644 index 0000000000000000000000000000000000000000..d5974c1c6c20470ae8b2cb4dd4cdba4b5e633184 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-public_relations_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08f0e45439be2a43657c3f959fa198948f46447e896165b40144d464fc865ac5 +size 332118 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-security_studies_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-security_studies_results.json new file mode 100644 index 0000000000000000000000000000000000000000..dc678bc554384985dca8e148e72ae85c6b0666b1 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-security_studies_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8cc94c3cdd667f485dfd2fd27fa90fbf2cac9f78c65fb84ff4a296aa17f1409 +size 1834736 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-sociology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-sociology_results.json new file mode 100644 index 0000000000000000000000000000000000000000..ddc4c7c790ee7e8a92bea621e8c43bc99dc4ed13 --- /dev/null +++ 
b/_results/few-shot/opentensor_foundation/hendrycksTest-sociology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70f27c7b53c151549f781006ac7164784a3db4720048262fe25adf930300539d +size 615563 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-us_foreign_policy_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-us_foreign_policy_results.json new file mode 100644 index 0000000000000000000000000000000000000000..173fefc164284d644a44112e2f1dc347914e8dd2 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-us_foreign_policy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87cc80108ab60ce1f0a48b161b84e81bdde3b45029ddf113dbcce82c1f97b7c8 +size 258501 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-virology_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-virology_results.json new file mode 100644 index 0000000000000000000000000000000000000000..57cf5125e180d3ea54f0239154df8ef88dd56ed6 --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-virology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2c4d6b3da2d4165dae2ec7694e4a2bc944b800ee5f0e43b4d453e09e1d0b12a +size 361626 diff --git a/_results/few-shot/opentensor_foundation/hendrycksTest-world_religions_results.json b/_results/few-shot/opentensor_foundation/hendrycksTest-world_religions_results.json new file mode 100644 index 0000000000000000000000000000000000000000..1e24114f4c2d1170b6e7a9ee3d87dfe02b6f971e --- /dev/null +++ b/_results/few-shot/opentensor_foundation/hendrycksTest-world_religions_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3806abdfa7c7cc865a055030037febb37cc28ba67025701cf19379ad265e797b +size 275252 diff --git a/_results/few-shot/opentensor_foundation/truthfulqa_mc_results.json b/_results/few-shot/opentensor_foundation/truthfulqa_mc_results.json new 
file mode 100644 index 0000000000000000000000000000000000000000..14969a06bbcc2009372fa1cd962e29462f2459bf --- /dev/null +++ b/_results/few-shot/opentensor_foundation/truthfulqa_mc_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b032c0601c768943c321e93114947f6f8469c56caae6177a82ee1951b7ebc972 +size 1545343 diff --git a/_results/few-shot/taostats/arc_challenge_results.json b/_results/few-shot/taostats/arc_challenge_results.json new file mode 100644 index 0000000000000000000000000000000000000000..6d671ba90dd2bc1b834770c27863850f99b3469e --- /dev/null +++ b/_results/few-shot/taostats/arc_challenge_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fc84b4a8ca916e6132f141d9497f229ee9efa5769cb4a0fe91e66f27c8f9045 +size 416635 diff --git a/_results/few-shot/taostats/hellaswag_results.json b/_results/few-shot/taostats/hellaswag_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f3059065182bec4ce38f05d3993f1731555ffd9c --- /dev/null +++ b/_results/few-shot/taostats/hellaswag_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:935476b0a960300a04ea899a31aeba84e3b2c918a724200a5f77e1b4d6a93d04 +size 8277145 diff --git a/_results/few-shot/taostats/hendrycksTest-abstract_algebra_results.json b/_results/few-shot/taostats/hendrycksTest-abstract_algebra_results.json new file mode 100644 index 0000000000000000000000000000000000000000..f2e99cac0ef6cdb83a154e5673687e99d16150e4 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-abstract_algebra_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8fed4c8fc5eccd61065c6d5862db38eda4814754f21410e2391091f919fe622 +size 31996 diff --git a/_results/few-shot/taostats/hendrycksTest-anatomy_results.json b/_results/few-shot/taostats/hendrycksTest-anatomy_results.json new file mode 100644 index 
0000000000000000000000000000000000000000..9df4f2476097d75c40a8a3a08247e505a7c428a7 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-anatomy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679d88a09177acd9a7dc78c1cac405a508e6199c361991ef08b9fb7cb0f80c4d +size 62593 diff --git a/_results/few-shot/taostats/hendrycksTest-astronomy_results.json b/_results/few-shot/taostats/hendrycksTest-astronomy_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c903c58e0fb33b7eac3edc733620d8a1ae5d12e9 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-astronomy_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e63d22bce5602c94d943e0efcc22b54ef33963350b244a77af8dea15ff6e7fcd +size 89446 diff --git a/_results/few-shot/taostats/hendrycksTest-business_ethics_results.json b/_results/few-shot/taostats/hendrycksTest-business_ethics_results.json new file mode 100644 index 0000000000000000000000000000000000000000..0f166ba16d39fef066237718e8e7ce0a881948b3 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-business_ethics_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc1ec0e34b8eed82ac3481ae5e909b562e3b98666fd388adda157185555237d +size 57322 diff --git a/_results/few-shot/taostats/hendrycksTest-clinical_knowledge_results.json b/_results/few-shot/taostats/hendrycksTest-clinical_knowledge_results.json new file mode 100644 index 0000000000000000000000000000000000000000..73a535b0e9ce353ee2d450c0b1f6e7049d18cf8b --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-clinical_knowledge_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:859cab93cf2ae13f92797a53da0c4682d76935667243155348d851af11e90b47 +size 120147 diff --git a/_results/few-shot/taostats/hendrycksTest-college_biology_results.json b/_results/few-shot/taostats/hendrycksTest-college_biology_results.json new file mode 100644 index 
0000000000000000000000000000000000000000..0fea42738a91b8e35188aefc6564cc72e7680b4b --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-college_biology_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e916d9a348375aa960956da2fd78fb2ad67a4b549d73fefeef598a0abed44db8 +size 85838 diff --git a/_results/few-shot/taostats/hendrycksTest-college_chemistry_results.json b/_results/few-shot/taostats/hendrycksTest-college_chemistry_results.json new file mode 100644 index 0000000000000000000000000000000000000000..82851f4f6741c519b8b0120b607615d52f394c57 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-college_chemistry_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db68a76b5d32e9e8ae21392ec87b3919b10519bdb99f9b8a0b1f3562ee805aa4 +size 42875 diff --git a/_results/few-shot/taostats/hendrycksTest-college_computer_science_results.json b/_results/few-shot/taostats/hendrycksTest-college_computer_science_results.json new file mode 100644 index 0000000000000000000000000000000000000000..e3800297a52913d9af9ad62988d2c30c8e387654 --- /dev/null +++ b/_results/few-shot/taostats/hendrycksTest-college_computer_science_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3797ec1396e116443bca39140321bd852eaf5e1dd2f88172d1d8d2a4bcae119e +size 63714 diff --git a/_results/few-shot/taostats/truthfulqa_mc_results.json b/_results/few-shot/taostats/truthfulqa_mc_results.json new file mode 100644 index 0000000000000000000000000000000000000000..c5f314a2dfff02fdf27dd29b9e15131b5f8199f3 --- /dev/null +++ b/_results/few-shot/taostats/truthfulqa_mc_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a617c8ba67ff4d264da82455057c37c957b6f3863c02a76026b48936cf8a76ae +size 666969 diff --git a/_results/hellaswag.json b/_results/hellaswag.json deleted file mode 100644 index c38977081019241ed403fb209c5beff81e8c424a..0000000000000000000000000000000000000000 
--- a/_results/hellaswag.json +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb5bfbbbd793bb3700fdd232df51df2bed54e600cff359d66e1ad715c4059dba -size 8273334 diff --git a/_results/zero-shot/arc_challenge_results.json b/_results/zero-shot/arc_challenge_results.json new file mode 100644 index 0000000000000000000000000000000000000000..9992e3f96f6405306b4b2aa8ed25653b1640ae88 --- /dev/null +++ b/_results/zero-shot/arc_challenge_results.json @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fb2cc9401c386f1b0da0b2725cb823b53c619eadb52aa1f00e0d71220b0819e +size 929191 diff --git a/app.py b/app.py new file mode 100644 index 0000000000000000000000000000000000000000..6e18108d910d090607ea693f4cad411038e1e753 --- /dev/null +++ b/app.py @@ -0,0 +1,117 @@ +import pandas as pd +import json +import os +import numpy as np +import re +import gradio as gr + +tasks = ["hellaswag", "arc_challenge", "hendrycks", "truthfulqa_mc"] +validators = [] + +def clean_result(result, task): + if ("hendrycks" in task): + if ((len(result["result"]) <= 2) and (result["result"] != "") and (result["result"][0].isupper())) or ((result["result"] != "") and (re.match('[A-Z]\.', result["result"][:2]))): + if result["result"][0] == "A": + result["cleaned_result"] = "1" + elif result["result"][0] == "B": + result["cleaned_result"] = "2" + elif result["result"][0] == "C": + result["cleaned_result"] = "3" + elif result["result"][0] == "D": + result["cleaned_result"] = "4" + else: + result["cleaned_result"] = "N/A" + else: + result["cleaned_result"] = "N/A" + + elif (task == "truthfulqa_mc"): + cleaned_result = [] + for r in result['result']: + if 'False' in r: + cleaned_result.append(0) + elif 'True' in r: + cleaned_result.append(1) + else: + cleaned_result.append("N/A") + result["cleaned_result"] = cleaned_result + else: + if (result["result"] != "") and (result["result"][0].isnumeric()): + result["cleaned_result"] = result["result"][0] + else: + 
result["cleaned_result"] = "N/A" + return result + +def mc2(doc): + # Split on the first `0` as everything before it is true (`1`). + split_idx = list(doc["mc2_targets"]["labels"]).index(0) + lls = doc["cleaned_result"] + # Compute the normalized probability mass for the correct answer. + ll_true, ll_false = lls[:split_idx], lls[split_idx:] + p_true, p_false = np.exp(np.array(ll_true)), np.exp(np.array(ll_false)) + p_true = p_true / (sum(p_true) + sum(p_false)) + return sum(p_true) + +final_total_results = [] +for validator in validators: + results_dir_file_list = os.listdir("_results/few-shot/{validator}") + final_split_results = [] + number_of_nas, number_of_results, inference_total = 0,0,0 + for task in tasks: + task_results_files = [result_file for result_file in results_dir_file_list if task in result_file] + + results = [] + for task_results_file in task_results_files: + results_file_dir = f"""_results/few-shot/{task_results_file}""" + f = open(results_file_dir) + results += json.load(f) + + results = [clean_result(result, task) if "result" in result else result for result in results] + + # Total results + number_of_nas += len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])]) + inference_total += np.array([result['inference_time'] for result in results if 'inference_time' in result]).sum() + number_of_results += len([1 for result in results if ('cleaned_result' in result)]) + + # Indiviudal results + result_coverage = round((sum(['result' in result for result in results])/len(results))*100,2) + na_coverage = round((len([1 for result in results if ('cleaned_result' in result) and ('N/A' in result['cleaned_result'])])/len(['result' in result for result in results]))*100,2) + inference_avg = round(np.array([result['inference_time'] for result in results if 'inference_time' in result]).mean(), 2) + + if task == "truthfulqa_mc": + metric = round(np.array([mc2(result) for result in results if ("cleaned_result" in 
result) and ("N/A" not in result["cleaned_result"])]).mean()*100,2) + else: + metric = round((len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A") and (int(result["cleaned_result"]) == (int(result["gold"])+1))])/len([result for result in results if ("cleaned_result" in result) and (result["cleaned_result"] != "N/A") ]))*100,2) + + final_split_results.append({ + "task" : task, + "coverage_%" :result_coverage, + "na_%" : na_coverage, + "inference_avg" : inference_avg, + "metric" : metric + }) + + print(final_split_results) + + final_total_results.append({ + "Validator": validator.replace("_", "").capitalize(), + "N/A %" : round((number_of_nas/number_of_results)*100,2), + "Avg Inference (s)" : round((inference_total/number_of_results),2), + "Average ⬆️": 0, + "ARC (25-shot) ⬆️": final_split_results[tasks.index("arc_challenge")]["metric"], + "HellaSwag (10-shot) ⬆️": final_split_results[tasks.index("hellaswag")]["metric"], + "MMLU (5-shot) ⬆️": final_split_results[tasks.index("hendrycks")]["metric"], + "TruthfulQA (0-shot) ⬆️": final_split_results[tasks.index("truthfulqa_mc")]["metric"] + }) + final_total_results[-1]["Average ⬆️"] = np.array([final_total_results[0]["ARC (25-shot) ⬆️"], final_total_results[0]["HellaSwag (10-shot) ⬆️"],final_total_results[0]["TruthfulQA (0-shot) ⬆️"], final_total_results[0]["MMLU (5-shot) ⬆️"]]).mean() + +demo = gr.Blocks() +with demo: + with gr.Row(): + title = gr.Markdown(value=f"""#
Bittensor LMEH Leaderboard
""") + with gr.Row(): + table_1 = gr.Dataframe(pd.DataFrame(final_total_results)) + # with gr.Row(visible = False): + # table_2 = gr.Dataframe(pd.DataFrame(final_split_results)) + +demo.queue(concurrency_count = 5) +demo.launch(enable_queue=True, debug=True, server_name="0.0.0.0", server_port=7860) \ No newline at end of file diff --git a/eval_bittensor.py b/eval_bittensor.py index 2196837eaf28f76b1e51520bd34c5a58b81c1441..1b3b1312edc4b9cc2ebec6c252e85136ee9f3340 100644 --- a/eval_bittensor.py +++ b/eval_bittensor.py @@ -1,3 +1,151 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:30a487da717e6eea99f1e7013716c92101129807afbe7d35fd8a0c089303c46b -size 7183 +import collections +import lm_eval.tasks +import random +import time +from datetime import datetime as dt +import bittensor as bt +from tqdm import tqdm +import json + +import http.client +import os +from argparse import ArgumentParser + +parser = ArgumentParser() + +parser.add_argument("--validator", required=True, type=str, help="validator name", choices=["opentensor_foundation", "taostats"], default="float16") +args = parser.parse_args() + +default_prompt = ''' +You are Chattensor. +Chattensor is a research project by Opentensor Cortex. +Chattensor is designed to be able to assist with a wide range of tasks, from answering simple questions to providing in-depth explanations and discussions on a wide range of topics. As a language model, Chattensor is able to generate human-like text based on the input it receives, allowing it to engage in natural-sounding conversations and provide responses that are coherent and relevant to the topic at hand. +''' + +if args.validator == "taostats": + print("TAOOOOSATATS") + try: + bitapai_key = os.environ["BITAPAI_KEY"] + conn = http.client.HTTPSConnection("dashboard.bitapai.io") + headers = { + 'Content-Type': 'application/json', + 'X-API-KEY': bitapai_key + } + except KeyError: + raise RuntimeError(f"BITAPAI_KEY does not exist and chosen validator is taostats. 
Please set your bitapai key using export BITAPAI_KEY=x.") + + +def get_response(prompt): + if args.validator == "taostats": + payload = json.dumps({ + "system": default_prompt, + "user": prompt + }) + conn.request("POST", "/api/v1/prompt", payload, headers) + res = conn.getresponse() + data = res.read() + # print('test') + print(data) + time.sleep(1) + return data.decode("utf-8") + else: + return bt.prompt(prompt) + +# Load all the LMEH tasks +tasks = ["hellaswag", "arc_challenge", "truthfulqa_mc", "hendrycksTest-abstract_algebra", "hendrycksTest-anatomy", "hendrycksTest-astronomy", "hendrycksTest-business_ethics", "hendrycksTest-clinical_knowledge", "hendrycksTest-college_biology", "hendrycksTest-college_chemistry", "hendrycksTest-college_computer_science", "hendrycksTest-college_mathematics", "hendrycksTest-college_medicine", "hendrycksTest-college_physics", "hendrycksTest-computer_security", "hendrycksTest-conceptual_physics", "hendrycksTest-econometrics", "hendrycksTest-electrical_engineering", "hendrycksTest-elementary_mathematics", "hendrycksTest-formal_logic", "hendrycksTest-global_facts", "hendrycksTest-high_school_biology", "hendrycksTest-high_school_chemistry", "hendrycksTest-high_school_computer_science", "hendrycksTest-high_school_european_history", "hendrycksTest-high_school_geography", "hendrycksTest-high_school_government_and_politics", "hendrycksTest-high_school_macroeconomics", "hendrycksTest-high_school_mathematics", "hendrycksTest-high_school_microeconomics", "hendrycksTest-high_school_physics", "hendrycksTest-high_school_psychology", "hendrycksTest-high_school_statistics", "hendrycksTest-high_school_us_history", "hendrycksTest-high_school_world_history", "hendrycksTest-human_aging", "hendrycksTest-human_sexuality", "hendrycksTest-international_law", "hendrycksTest-jurisprudence", "hendrycksTest-logical_fallacies", "hendrycksTest-machine_learning", "hendrycksTest-management", "hendrycksTest-marketing", "hendrycksTest-medical_genetics", 
"hendrycksTest-miscellaneous", "hendrycksTest-moral_disputes", "hendrycksTest-moral_scenarios", "hendrycksTest-nutrition", "hendrycksTest-philosophy", "hendrycksTest-prehistory", "hendrycksTest-professional_accounting", "hendrycksTest-professional_law", "hendrycksTest-professional_medicine", "hendrycksTest-professional_psychology", "hendrycksTest-public_relations", "hendrycksTest-security_studies", "hendrycksTest-sociology", "hendrycksTest-us_foreign_policy", "hendrycksTest-virology", "hendrycksTest-world_religions"] +task_dict = lm_eval.tasks.get_task_dict(tasks) +task_dict_items = [ + (name, task) + for name, task in task_dict.items() + if (task.has_validation_docs() or task.has_test_docs()) +] +versions = collections.defaultdict(dict) + +# get lists of each type of request +for task_name, task in task_dict_items: + versions[task_name] = task.VERSION + # default to test doc, fall back to val doc if validation unavailable + # TODO: the test-fallback-to-val system isn't final, we should revisit it at some point + if task.has_test_docs(): + task_doc_func = task.test_docs + task_set = "test" # Required for caching in the decontamination + elif task.has_validation_docs(): + task_set = "val" # Required for caching in the decontamination + task_doc_func = task.validation_docs + else: + raise RuntimeError("Task has neither test_docs nor validation_docs") + # deterministically shuffle docs and chop off the first `limit` because sometimes docs are in some kind of order + task_docs = list(task_doc_func()) + rnd = random.Random() + rnd.seed(42) + rnd.shuffle(task_docs) + + i=0 + for task_doc in tqdm(task_docs): + print(task_name) + print(task_doc) + if ("result" in task_doc) and ("inference_time" in task_doc) and ("prompt" in task_doc) and ("result" in task_doc) and (task_doc['result'] != ""): + continue + + query = task_doc["query"] if "query" in task_doc else "" + choices_list = "\n".join([str(number+1) + ". 
" + choice for number, choice in enumerate(task_doc["choices"])]) if "choices" in task_doc else "" + number_list = ",".join([str(number) for number in range(1,len(task_doc["choices"])+1)]) if "choices" in task_doc else "" + + if (task_name == "hellaswag") : + prompt = "" + prompt_list = list(task.training_docs())[:10] + for prompt_item in prompt_list: + prompt_item_query = prompt_item["query"] + prompt_item_choices_list = "\n".join([str(number+1) + ". " + choice for number, choice in enumerate(prompt_item["choices"])]) + prompt_item_number_list = ",".join([str(number) for number in range(1,len(prompt_item["choices"])+1)]) + prompt_item_gold = prompt_item["gold"]+1 + + prompt += f"""{prompt_item_query}...\n{prompt_item_choices_list}\nRespond with just one number only: {prompt_item_number_list}.\n{prompt_item_gold}\n\n""" + + prompt += f"""{query}...\n{choices_list}\nRespond with just one number only: {number_list}. """ + + elif (task_name == "arc_challenge"): + prompt = "" + prompt_list = list(task.training_docs())[:25] + for prompt_item in prompt_list: + prompt_item_query = prompt_item["query"] + prompt_item_choices_list = "\n".join([str(number+1) + ". " + choice for number, choice in enumerate(prompt_item["choices"])]) + prompt_item_number_list = ",".join([str(number) for number in range(1,len(prompt_item["choices"])+1)]) + prompt_item_gold = prompt_item["gold"]+1 + + prompt += f"""{prompt_item_query}...\n{prompt_item_choices_list}\nRespond with just one number only: {prompt_item_number_list}.\n{prompt_item_gold}\n\n""" + + prompt += f"""{query}...\n{choices_list}\nRespond with just one number only: {number_list}. 
""" + + + elif (task_name == "truthfulqa_mc"): + continue + prompt = "" + + elif ("hendrycksTest" in task_name): + prompt = "" + prompt_list = list(task.test_docs())[:5] + for prompt_item in prompt_list: + prompt_item_query = prompt_item["query"] + + prompt += f"""{prompt_item_query.replace("Answer:", "Respond with just one letter only: A, B, C, D:")}\n{["A", "B", "C", "D"][prompt_item["gold"]]}\n\n""" + + prompt += query.replace("Answer:", "Respond with just one letter only: A, B, C, D:") + + # print(prompt) + + start = time.time() + task_doc["result"] = get_response(prompt) + end = time.time() + task_doc["inference_time"] = end - start + task_doc["prompt"] = prompt + task_doc["datetime"] = dt.now().strftime(format = "%Y-%m-%d %H:%M:%S") + print(task_doc["result"]) + + i = i + 1 + if ((i % 100) / 1000 == 0): + with open(f"""_results/few-shot/{args.validator}/{task_name}_results.json""", "w") as final: + json.dump(task_docs, final) + + with open(f"""_results/few-shot/{args.validator}/{task_name}_results.json""", "w") as final: + json.dump(task_docs, final) diff --git a/visualise_results.py b/visualise_results.py deleted file mode 100644 index d14b66f8c7b0e321e700fea83e87f0d21c84cc12..0000000000000000000000000000000000000000 --- a/visualise_results.py +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dbf1418aae366fddd251e93c0ef80f35e2f1c8e0a4e7d07b8aacd4aada8107bd -size 2102