---
license: cc-by-nc-4.0
---
- **What is this?** An experiment: benchmark evaluation results (ARC, GSM8K, HellaSwag, MMLU, TruthfulQA, Winogrande) for this model, with a baseline table below for comparison.
- **License:** CC-BY-NC
```
| Task |Version| Metric |Value | |Stderr|
|------------------------------------------------------|------:|--------------|-----:|---|-----:|
|all | |acc |0.6606|± |0.0323|
| | |acc_norm |0.6669|± |0.0093|
| | |truthfulqa_mc1|0.3403|± |0.0166|
| | |truthfulqa_mc2|0.5085|± |0.0153|
| | |qem |0.7020|± |0.0126|
|leaderboard:arc:challenge:25 | 0|acc |0.5683|± |0.0145|
| | |acc_norm |0.5469|± |0.0145|
|leaderboard:gsm8k:5 | 0|qem |0.7020|± |0.0126|
|leaderboard:hellaswag:10 | 0|acc |0.5990|± |0.0049|
| | |acc_norm |0.7868|± |0.0041|
|leaderboard:mmlu:_average:5 | |acc |0.6621|± |0.0334|
|leaderboard:mmlu:abstract_algebra:5 | 0|acc |0.3100|± |0.0465|
|leaderboard:mmlu:anatomy:5 | 0|acc |0.6667|± |0.0407|
|leaderboard:mmlu:astronomy:5 | 0|acc |0.7368|± |0.0358|
|leaderboard:mmlu:business_ethics:5 | 0|acc |0.7000|± |0.0461|
|leaderboard:mmlu:clinical_knowledge:5 | 0|acc |0.7434|± |0.0269|
|leaderboard:mmlu:college_biology:5 | 0|acc |0.7847|± |0.0344|
|leaderboard:mmlu:college_chemistry:5 | 0|acc |0.4500|± |0.0500|
|leaderboard:mmlu:college_computer_science:5 | 0|acc |0.4900|± |0.0502|
|leaderboard:mmlu:college_mathematics:5 | 0|acc |0.4000|± |0.0492|
|leaderboard:mmlu:college_medicine:5 | 0|acc |0.6301|± |0.0368|
|leaderboard:mmlu:college_physics:5 | 0|acc |0.4412|± |0.0494|
|leaderboard:mmlu:computer_security:5 | 0|acc |0.7900|± |0.0409|
|leaderboard:mmlu:conceptual_physics:5 | 0|acc |0.6043|± |0.0320|
|leaderboard:mmlu:econometrics:5 | 0|acc |0.5175|± |0.0470|
|leaderboard:mmlu:electrical_engineering:5 | 0|acc |0.6207|± |0.0404|
|leaderboard:mmlu:elementary_mathematics:5 | 0|acc |0.4021|± |0.0253|
|leaderboard:mmlu:formal_logic:5 | 0|acc |0.5238|± |0.0447|
|leaderboard:mmlu:global_facts:5 | 0|acc |0.4400|± |0.0499|
|leaderboard:mmlu:high_school_biology:5 | 0|acc |0.8000|± |0.0228|
|leaderboard:mmlu:high_school_chemistry:5 | 0|acc |0.5419|± |0.0351|
|leaderboard:mmlu:high_school_computer_science:5 | 0|acc |0.7200|± |0.0451|
|leaderboard:mmlu:high_school_european_history:5 | 0|acc |0.7636|± |0.0332|
|leaderboard:mmlu:high_school_geography:5 | 0|acc |0.7828|± |0.0294|
|leaderboard:mmlu:high_school_government_and_politics:5| 0|acc |0.8756|± |0.0238|
|leaderboard:mmlu:high_school_macroeconomics:5 | 0|acc |0.6744|± |0.0238|
|leaderboard:mmlu:high_school_mathematics:5 | 0|acc |0.3630|± |0.0293|
|leaderboard:mmlu:high_school_microeconomics:5 | 0|acc |0.7563|± |0.0279|
|leaderboard:mmlu:high_school_physics:5 | 0|acc |0.3775|± |0.0396|
|leaderboard:mmlu:high_school_psychology:5 | 0|acc |0.8569|± |0.0150|
|leaderboard:mmlu:high_school_statistics:5 | 0|acc |0.4769|± |0.0341|
|leaderboard:mmlu:high_school_us_history:5 | 0|acc |0.8039|± |0.0279|
|leaderboard:mmlu:high_school_world_history:5 | 0|acc |0.8439|± |0.0236|
|leaderboard:mmlu:human_aging:5 | 0|acc |0.7399|± |0.0294|
|leaderboard:mmlu:human_sexuality:5 | 0|acc |0.7710|± |0.0369|
|leaderboard:mmlu:international_law:5 | 0|acc |0.7521|± |0.0394|
|leaderboard:mmlu:jurisprudence:5 | 0|acc |0.7593|± |0.0413|
|leaderboard:mmlu:logical_fallacies:5 | 0|acc |0.7423|± |0.0344|
|leaderboard:mmlu:machine_learning:5 | 0|acc |0.5357|± |0.0473|
|leaderboard:mmlu:management:5 | 0|acc |0.8447|± |0.0359|
|leaderboard:mmlu:marketing:5 | 0|acc |0.8974|± |0.0199|
|leaderboard:mmlu:medical_genetics:5 | 0|acc |0.7400|± |0.0441|
|leaderboard:mmlu:miscellaneous:5 | 0|acc |0.8174|± |0.0138|
|leaderboard:mmlu:moral_disputes:5 | 0|acc |0.7197|± |0.0242|
|leaderboard:mmlu:moral_scenarios:5 | 0|acc |0.4168|± |0.0165|
|leaderboard:mmlu:nutrition:5 | 0|acc |0.7516|± |0.0247|
|leaderboard:mmlu:philosophy:5 | 0|acc |0.7170|± |0.0256|
|leaderboard:mmlu:prehistory:5 | 0|acc |0.7222|± |0.0249|
|leaderboard:mmlu:professional_accounting:5 | 0|acc |0.5213|± |0.0298|
|leaderboard:mmlu:professional_law:5 | 0|acc |0.4759|± |0.0128|
|leaderboard:mmlu:professional_medicine:5 | 0|acc |0.7537|± |0.0262|
|leaderboard:mmlu:professional_psychology:5 | 0|acc |0.6993|± |0.0186|
|leaderboard:mmlu:public_relations:5 | 0|acc |0.7182|± |0.0431|
|leaderboard:mmlu:security_studies:5 | 0|acc |0.7673|± |0.0270|
|leaderboard:mmlu:sociology:5 | 0|acc |0.8259|± |0.0268|
|leaderboard:mmlu:us_foreign_policy:5 | 0|acc |0.8300|± |0.0378|
|leaderboard:mmlu:virology:5 | 0|acc |0.5181|± |0.0389|
|leaderboard:mmlu:world_religions:5 | 0|acc |0.8129|± |0.0299|
|leaderboard:truthfulqa:mc:0 | 0|truthfulqa_mc1|0.3403|± |0.0166|
| | |truthfulqa_mc2|0.5085|± |0.0153|
|leaderboard:winogrande:5 | 0|acc |0.7309|± |0.0125|
```
Baseline results (same benchmarks, for comparison):
```
| Task |Version| Metric |Value | |Stderr|
|------------------------------------------------------|------:|--------------|-----:|---|-----:|
|all | |acc |0.6635|± |0.0322|
| | |acc_norm |0.6569|± |0.0094|
| | |truthfulqa_mc1|0.3745|± |0.0169|
| | |truthfulqa_mc2|0.5338|± |0.0160|
| | |qem |0.6808|± |0.0128|
|leaderboard:arc:challenge:25 | 0|acc |0.5742|± |0.0144|
| | |acc_norm |0.5828|± |0.0144|
|leaderboard:gsm8k:5 | 0|qem |0.6808|± |0.0128|
|leaderboard:hellaswag:10 | 0|acc |0.5707|± |0.0049|
| | |acc_norm |0.7310|± |0.0044|
|leaderboard:mmlu:_average:5 | |acc |0.6662|± |0.0333|
|leaderboard:mmlu:abstract_algebra:5 | 0|acc |0.3300|± |0.0473|
|leaderboard:mmlu:anatomy:5 | 0|acc |0.6815|± |0.0402|
|leaderboard:mmlu:astronomy:5 | 0|acc |0.7500|± |0.0352|
|leaderboard:mmlu:business_ethics:5 | 0|acc |0.7000|± |0.0461|
|leaderboard:mmlu:clinical_knowledge:5 | 0|acc |0.7472|± |0.0267|
|leaderboard:mmlu:college_biology:5 | 0|acc |0.7917|± |0.0340|
|leaderboard:mmlu:college_chemistry:5 | 0|acc |0.4500|± |0.0500|
|leaderboard:mmlu:college_computer_science:5 | 0|acc |0.5200|± |0.0502|
|leaderboard:mmlu:college_mathematics:5 | 0|acc |0.3900|± |0.0490|
|leaderboard:mmlu:college_medicine:5 | 0|acc |0.6590|± |0.0361|
|leaderboard:mmlu:college_physics:5 | 0|acc |0.4314|± |0.0493|
|leaderboard:mmlu:computer_security:5 | 0|acc |0.7900|± |0.0409|
|leaderboard:mmlu:conceptual_physics:5 | 0|acc |0.5872|± |0.0322|
|leaderboard:mmlu:econometrics:5 | 0|acc |0.5439|± |0.0469|
|leaderboard:mmlu:electrical_engineering:5 | 0|acc |0.6138|± |0.0406|
|leaderboard:mmlu:elementary_mathematics:5 | 0|acc |0.4683|± |0.0257|
|leaderboard:mmlu:formal_logic:5 | 0|acc |0.5317|± |0.0446|
|leaderboard:mmlu:global_facts:5 | 0|acc |0.4600|± |0.0501|
|leaderboard:mmlu:high_school_biology:5 | 0|acc |0.8065|± |0.0225|
|leaderboard:mmlu:high_school_chemistry:5 | 0|acc |0.5419|± |0.0351|
|leaderboard:mmlu:high_school_computer_science:5 | 0|acc |0.6800|± |0.0469|
|leaderboard:mmlu:high_school_european_history:5 | 0|acc |0.7394|± |0.0343|
|leaderboard:mmlu:high_school_geography:5 | 0|acc |0.8131|± |0.0278|
|leaderboard:mmlu:high_school_government_and_politics:5| 0|acc |0.8964|± |0.0220|
|leaderboard:mmlu:high_school_macroeconomics:5 | 0|acc |0.6769|± |0.0237|
|leaderboard:mmlu:high_school_mathematics:5 | 0|acc |0.3259|± |0.0286|
|leaderboard:mmlu:high_school_microeconomics:5 | 0|acc |0.7563|± |0.0279|
|leaderboard:mmlu:high_school_physics:5 | 0|acc |0.4106|± |0.0402|
|leaderboard:mmlu:high_school_psychology:5 | 0|acc |0.8477|± |0.0154|
|leaderboard:mmlu:high_school_statistics:5 | 0|acc |0.4769|± |0.0341|
|leaderboard:mmlu:high_school_us_history:5 | 0|acc |0.7892|± |0.0286|
|leaderboard:mmlu:high_school_world_history:5 | 0|acc |0.8397|± |0.0239|
|leaderboard:mmlu:human_aging:5 | 0|acc |0.7265|± |0.0299|
|leaderboard:mmlu:human_sexuality:5 | 0|acc |0.7939|± |0.0355|
|leaderboard:mmlu:international_law:5 | 0|acc |0.7686|± |0.0385|
|leaderboard:mmlu:jurisprudence:5 | 0|acc |0.7593|± |0.0413|
|leaderboard:mmlu:logical_fallacies:5 | 0|acc |0.7607|± |0.0335|
|leaderboard:mmlu:machine_learning:5 | 0|acc |0.5268|± |0.0474|
|leaderboard:mmlu:management:5 | 0|acc |0.8155|± |0.0384|
|leaderboard:mmlu:marketing:5 | 0|acc |0.9060|± |0.0191|
|leaderboard:mmlu:medical_genetics:5 | 0|acc |0.7900|± |0.0409|
|leaderboard:mmlu:miscellaneous:5 | 0|acc |0.8238|± |0.0136|
|leaderboard:mmlu:moral_disputes:5 | 0|acc |0.7399|± |0.0236|
|leaderboard:mmlu:moral_scenarios:5 | 0|acc |0.4358|± |0.0166|
|leaderboard:mmlu:nutrition:5 | 0|acc |0.7549|± |0.0246|
|leaderboard:mmlu:philosophy:5 | 0|acc |0.7331|± |0.0251|
|leaderboard:mmlu:prehistory:5 | 0|acc |0.7469|± |0.0242|
|leaderboard:mmlu:professional_accounting:5 | 0|acc |0.5177|± |0.0298|
|leaderboard:mmlu:professional_law:5 | 0|acc |0.4648|± |0.0127|
|leaderboard:mmlu:professional_medicine:5 | 0|acc |0.7279|± |0.0270|
|leaderboard:mmlu:professional_psychology:5 | 0|acc |0.6928|± |0.0187|
|leaderboard:mmlu:public_relations:5 | 0|acc |0.6636|± |0.0453|
|leaderboard:mmlu:security_studies:5 | 0|acc |0.7306|± |0.0284|
|leaderboard:mmlu:sociology:5 | 0|acc |0.8557|± |0.0248|
|leaderboard:mmlu:us_foreign_policy:5 | 0|acc |0.8600|± |0.0349|
|leaderboard:mmlu:virology:5 | 0|acc |0.5361|± |0.0388|
|leaderboard:mmlu:world_religions:5 | 0|acc |0.7953|± |0.0309|
|leaderboard:truthfulqa:mc:0 | 0|truthfulqa_mc1|0.3745|± |0.0169|
| | |truthfulqa_mc2|0.5338|± |0.0160|
|leaderboard:winogrande:5 | 0|acc |0.6930|± |0.0130|
```