https://wandb.ai/alexwortega/tiny_llama/runs/n0je6urv?workspace=user-alexwortega hf-causal (pretrained=../../tiny_3ep_full,dtype=float16), limit: None, provide_description: False, num_fewshot: 0, batch_size: 16 | Task |Version| Metric |Value | |Stderr| |---------------------------------------------------|------:|--------|-----:|---|-----:| |danetqa | 1|acc |0.5018|± |0.0175| |hendrycksTest-abstract_algebra | 1|acc |0.2600|± |0.0441| | | |acc_norm|0.2600|± |0.0441| |hendrycksTest-anatomy | 1|acc |0.2741|± |0.0385| | | |acc_norm|0.2741|± |0.0385| |hendrycksTest-astronomy | 1|acc |0.1776|± |0.0311| | | |acc_norm|0.1776|± |0.0311| |hendrycksTest-business_ethics | 1|acc |0.2500|± |0.0435| | | |acc_norm|0.2500|± |0.0435| |hendrycksTest-clinical_knowledge | 1|acc |0.2604|± |0.0270| | | |acc_norm|0.2604|± |0.0270| |hendrycksTest-college_biology | 1|acc |0.2153|± |0.0344| | | |acc_norm|0.2153|± |0.0344| |hendrycksTest-college_chemistry | 1|acc |0.1700|± |0.0378| | | |acc_norm|0.1700|± |0.0378| |hendrycksTest-college_computer_science | 1|acc |0.2800|± |0.0451| | | |acc_norm|0.2800|± |0.0451| |hendrycksTest-college_mathematics | 1|acc |0.2700|± |0.0446| | | |acc_norm|0.2700|± |0.0446| |hendrycksTest-college_medicine | 1|acc |0.2543|± |0.0332| | | |acc_norm|0.2543|± |0.0332| |hendrycksTest-college_physics | 1|acc |0.1961|± |0.0395| | | |acc_norm|0.1961|± |0.0395| |hendrycksTest-computer_security | 1|acc |0.3000|± |0.0461| | | |acc_norm|0.3000|± |0.0461| |hendrycksTest-conceptual_physics | 1|acc |0.2766|± |0.0292| | | |acc_norm|0.2766|± |0.0292| |hendrycksTest-econometrics | 1|acc |0.2807|± |0.0423| | | |acc_norm|0.2807|± |0.0423| |hendrycksTest-electrical_engineering | 1|acc |0.2690|± |0.0370| | | |acc_norm|0.2690|± |0.0370| |hendrycksTest-elementary_mathematics | 1|acc |0.2434|± |0.0221| | | |acc_norm|0.2434|± |0.0221| |hendrycksTest-formal_logic | 1|acc |0.2698|± |0.0397| | | |acc_norm|0.2698|± |0.0397| |hendrycksTest-global_facts | 1|acc |0.2700|± |0.0446| | | |acc_norm|0.2700|± |0.0446| |hendrycksTest-high_school_biology | 1|acc |0.2161|± |0.0234| | | |acc_norm|0.2161|± |0.0234| |hendrycksTest-high_school_chemistry | 1|acc |0.1970|± |0.0280| | | |acc_norm|0.1970|± |0.0280| |hendrycksTest-high_school_computer_science | 1|acc |0.3600|± |0.0482| | | |acc_norm|0.3600|± |0.0482| |hendrycksTest-high_school_european_history | 1|acc |0.2182|± |0.0323| | | |acc_norm|0.2182|± |0.0323| |hendrycksTest-high_school_geography | 1|acc |0.2222|± |0.0296| | | |acc_norm|0.2222|± |0.0296| |hendrycksTest-high_school_government_and_politics | 1|acc |0.1969|± |0.0287| | | |acc_norm|0.1969|± |0.0287| |hendrycksTest-high_school_macroeconomics | 1|acc |0.2282|± |0.0213| | | |acc_norm|0.2282|± |0.0213| |hendrycksTest-high_school_mathematics | 1|acc |0.2556|± |0.0266| | | |acc_norm|0.2556|± |0.0266| |hendrycksTest-high_school_microeconomics | 1|acc |0.2227|± |0.0270| | | |acc_norm|0.2227|± |0.0270| |hendrycksTest-high_school_physics | 1|acc |0.2914|± |0.0371| | | |acc_norm|0.2914|± |0.0371| |hendrycksTest-high_school_psychology | 1|acc |0.2275|± |0.0180| | | |acc_norm|0.2275|± |0.0180| |hendrycksTest-high_school_statistics | 1|acc |0.1759|± |0.0260| | | |acc_norm|0.1759|± |0.0260| |hendrycksTest-high_school_us_history | 1|acc |0.2598|± |0.0308| | | |acc_norm|0.2598|± |0.0308| |hendrycksTest-high_school_world_history | 1|acc |0.2827|± |0.0293| | | |acc_norm|0.2827|± |0.0293| |hendrycksTest-human_aging | 1|acc |0.3049|± |0.0309| | | |acc_norm|0.3049|± |0.0309| |hendrycksTest-human_sexuality | 1|acc |0.2824|± |0.0395| | | |acc_norm|0.2824|± |0.0395| |hendrycksTest-international_law | 1|acc |0.2562|± |0.0398| | | |acc_norm|0.2562|± |0.0398| |hendrycksTest-jurisprudence | 1|acc |0.3611|± |0.0464| | | |acc_norm|0.3611|± |0.0464| |hendrycksTest-logical_fallacies | 1|acc |0.2515|± |0.0341| | | |acc_norm|0.2515|± |0.0341| |hendrycksTest-machine_learning | 1|acc |0.1964|± |0.0377| | | |acc_norm|0.1964|± |0.0377| |hendrycksTest-management | 1|acc |0.1553|± |0.0359| | | |acc_norm|0.1553|± |0.0359| |hendrycksTest-marketing | 1|acc |0.3248|± |0.0307| | | |acc_norm|0.3248|± |0.0307| |hendrycksTest-medical_genetics | 1|acc |0.3400|± |0.0476| | | |acc_norm|0.3400|± |0.0476| |hendrycksTest-miscellaneous | 1|acc |0.2669|± |0.0158| | | |acc_norm|0.2669|± |0.0158| |hendrycksTest-moral_disputes | 1|acc |0.2919|± |0.0245| | | |acc_norm|0.2919|± |0.0245| |hendrycksTest-moral_scenarios | 1|acc |0.2447|± |0.0144| | | |acc_norm|0.2447|± |0.0144| |hendrycksTest-nutrition | 1|acc |0.2549|± |0.0250| | | |acc_norm|0.2549|± |0.0250| |hendrycksTest-philosophy | 1|acc |0.2122|± |0.0232| | | |acc_norm|0.2122|± |0.0232| |hendrycksTest-prehistory | 1|acc |0.2685|± |0.0247| | | |acc_norm|0.2685|± |0.0247| |hendrycksTest-professional_accounting | 1|acc |0.2021|± |0.0240| | | |acc_norm|0.2021|± |0.0240| |hendrycksTest-professional_law | 1|acc |0.2432|± |0.0110| | | |acc_norm|0.2432|± |0.0110| |hendrycksTest-professional_medicine | 1|acc |0.1654|± |0.0226| | | |acc_norm|0.1654|± |0.0226| |hendrycksTest-professional_psychology | 1|acc |0.2582|± |0.0177| | | |acc_norm|0.2582|± |0.0177| |hendrycksTest-public_relations | 1|acc |0.2909|± |0.0435| | | |acc_norm|0.2909|± |0.0435| |hendrycksTest-security_studies | 1|acc |0.2041|± |0.0258| | | |acc_norm|0.2041|± |0.0258| |hendrycksTest-sociology | 1|acc |0.2637|± |0.0312| | | |acc_norm|0.2637|± |0.0312| |hendrycksTest-us_foreign_policy | 1|acc |0.2900|± |0.0456| | | |acc_norm|0.2900|± |0.0456| |hendrycksTest-virology | 1|acc |0.2651|± |0.0344| | | |acc_norm|0.2651|± |0.0344| |hendrycksTest-world_religions | 1|acc |0.3450|± |0.0365| | | |acc_norm|0.3450|± |0.0365| |hendrycksTestRu-abstract_algebra | 1|acc |0.2300|± |0.0423| | | |acc_norm|0.2300|± |0.0423| |hendrycksTestRu-anatomy | 1|acc |0.1778|± |0.0330| | | |acc_norm|0.1778|± |0.0330| |hendrycksTestRu-astronomy | 1|acc |0.1776|± |0.0311| | | |acc_norm|0.1776|± |0.0311| |hendrycksTestRu-business_ethics | 1|acc |0.2200|± |0.0416| | | |acc_norm|0.2200|± |0.0416| |hendrycksTestRu-clinical_knowledge | 1|acc |0.2151|± |0.0253| | | |acc_norm|0.2151|± |0.0253| |hendrycksTestRu-college_biology | 1|acc |0.2569|± |0.0365| | | |acc_norm|0.2569|± |0.0365| |hendrycksTestRu-college_chemistry | 1|acc |0.1800|± |0.0386| | | |acc_norm|0.1800|± |0.0386| |hendrycksTestRu-college_computer_science | 1|acc |0.2700|± |0.0446| | | |acc_norm|0.2700|± |0.0446| |hendrycksTestRu-college_mathematics | 1|acc |0.2200|± |0.0416| | | |acc_norm|0.2200|± |0.0416| |hendrycksTestRu-college_medicine | 1|acc |0.1908|± |0.0300| | | |acc_norm|0.1908|± |0.0300| |hendrycksTestRu-college_physics | 1|acc |0.2059|± |0.0402| | | |acc_norm|0.2059|± |0.0402| |hendrycksTestRu-computer_security | 1|acc |0.3000|± |0.0461| | | |acc_norm|0.3000|± |0.0461| |hendrycksTestRu-conceptual_physics | 1|acc |0.2681|± |0.0290| | | |acc_norm|0.2681|± |0.0290| |hendrycksTestRu-econometrics | 1|acc |0.2368|± |0.0400| | | |acc_norm|0.2368|± |0.0400| |hendrycksTestRu-electrical_engineering | 1|acc |0.2483|± |0.0360| | | |acc_norm|0.2483|± |0.0360| |hendrycksTestRu-elementary_mathematics | 1|acc |0.2063|± |0.0208| | | |acc_norm|0.2063|± |0.0208| |hendrycksTestRu-formal_logic | 1|acc |0.2937|± |0.0407| | | |acc_norm|0.2937|± |0.0407| |hendrycksTestRu-global_facts | 1|acc |0.2000|± |0.0402| | | |acc_norm|0.2000|± |0.0402| |hendrycksTestRu-high_school_biology | 1|acc |0.1871|± |0.0222| | | |acc_norm|0.1871|± |0.0222| |hendrycksTestRu-high_school_chemistry | 1|acc |0.1724|± |0.0266| | | |acc_norm|0.1724|± |0.0266| |hendrycksTestRu-high_school_computer_science | 1|acc |0.2900|± |0.0456| | | |acc_norm|0.2900|± |0.0456| |hendrycksTestRu-high_school_european_history | 1|acc |0.2242|± |0.0326| | | |acc_norm|0.2242|± |0.0326| |hendrycksTestRu-high_school_geography | 1|acc |0.1869|± |0.0278| | | |acc_norm|0.1869|± |0.0278| |hendrycksTestRu-high_school_government_and_politics| 1|acc |0.2124|± |0.0295| | | |acc_norm|0.2124|± |0.0295| |hendrycksTestRu-high_school_macroeconomics | 1|acc |0.2128|± |0.0208| | | |acc_norm|0.2128|± |0.0208| |hendrycksTestRu-high_school_mathematics | 1|acc |0.2074|± |0.0247| | | |acc_norm|0.2074|± |0.0247| |hendrycksTestRu-high_school_microeconomics | 1|acc |0.2227|± |0.0270| | | |acc_norm|0.2227|± |0.0270| |hendrycksTestRu-high_school_physics | 1|acc |0.1987|± |0.0326| | | |acc_norm|0.1987|± |0.0326| |hendrycksTestRu-high_school_psychology | 1|acc |0.2000|± |0.0171| | | |acc_norm|0.2000|± |0.0171| |hendrycksTestRu-high_school_statistics | 1|acc |0.1713|± |0.0257| | | |acc_norm|0.1713|± |0.0257| |hendrycksTestRu-high_school_us_history | 1|acc |0.2647|± |0.0310| | | |acc_norm|0.2647|± |0.0310| |hendrycksTestRu-high_school_world_history | 1|acc |0.2658|± |0.0288| | | |acc_norm|0.2658|± |0.0288| |hendrycksTestRu-human_aging | 1|acc |0.2780|± |0.0301| | | |acc_norm|0.2780|± |0.0301| |hendrycksTestRu-human_sexuality | 1|acc |0.2443|± |0.0377| | | |acc_norm|0.2443|± |0.0377| |hendrycksTestRu-international_law | 1|acc |0.2314|± |0.0385| | | |acc_norm|0.2314|± |0.0385| |hendrycksTestRu-jurisprudence | 1|acc |0.2593|± |0.0424| | | |acc_norm|0.2593|± |0.0424| |hendrycksTestRu-logical_fallacies | 1|acc |0.2270|± |0.0329| | | |acc_norm|0.2270|± |0.0329| |hendrycksTestRu-machine_learning | 1|acc |0.2857|± |0.0429| | | |acc_norm|0.2857|± |0.0429| |hendrycksTestRu-management | 1|acc |0.2136|± |0.0406| | | |acc_norm|0.2136|± |0.0406| |hendrycksTestRu-marketing | 1|acc |0.2991|± |0.0300| | | |acc_norm|0.2991|± |0.0300| |hendrycksTestRu-medical_genetics | 1|acc |0.2700|± |0.0446| | | |acc_norm|0.2700|± |0.0446| |hendrycksTestRu-miscellaneous | 1|acc |0.2490|± |0.0155| | | |acc_norm|0.2490|± |0.0155| |hendrycksTestRu-moral_disputes | 1|acc |0.2601|± |0.0236| | | |acc_norm|0.2601|± |0.0236| |hendrycksTestRu-moral_scenarios | 1|acc |0.2369|± |0.0142| | | |acc_norm|0.2369|± |0.0142| |hendrycksTestRu-nutrition | 1|acc |0.2190|± |0.0237| | | |acc_norm|0.2190|± |0.0237| |hendrycksTestRu-philosophy | 1|acc |0.1897|± |0.0223| | | |acc_norm|0.1897|± |0.0223| |hendrycksTestRu-prehistory | 1|acc |0.2191|± |0.0230| | | |acc_norm|0.2191|± |0.0230| |hendrycksTestRu-professional_accounting | 1|acc |0.2092|± |0.0243| | | |acc_norm|0.2092|± |0.0243| |hendrycksTestRu-professional_law | 1|acc |0.2627|± |0.0112| | | |acc_norm|0.2627|± |0.0112| |hendrycksTestRu-professional_medicine | 1|acc |0.1801|± |0.0233| | | |acc_norm|0.1801|± |0.0233| |hendrycksTestRu-professional_psychology | 1|acc |0.2533|± |0.0176| | | |acc_norm|0.2533|± |0.0176| |hendrycksTestRu-public_relations | 1|acc |0.2273|± |0.0401| | | |acc_norm|0.2273|± |0.0401| |hendrycksTestRu-security_studies | 1|acc |0.1959|± |0.0254| | | |acc_norm|0.1959|± |0.0254| |hendrycksTestRu-sociology | 1|acc |0.2239|± |0.0295| | | |acc_norm|0.2239|± |0.0295| |hendrycksTestRu-us_foreign_policy | 1|acc |0.2300|± |0.0423| | | |acc_norm|0.2300|± |0.0423| |hendrycksTestRu-virology | 1|acc |0.2831|± |0.0351| | | |acc_norm|0.2831|± |0.0351| |hendrycksTestRu-world_religions | 1|acc |0.3158|± |0.0357| | | |acc_norm|0.3158|± |0.0357| |muserc | 1|acc |0.0000|± |0.0000| |parus | 0|acc |0.6500|± |0.0479| |rcb | 1|acc |0.5273|± |0.0337| | | |f1 |0.2302| | | |rucos | 0|f1 |0.5248|± |0.0057| | | |em |0.5108|± |0.0057| |russe | 0|acc |0.3691|± |0.0052| |ruterra | 1|acc |0.4984|± |0.0286| | | |f1 |0.2217| | | |rwsd | 0|acc |0.5539|± |0.0349| |xwinograd_ru | 0|acc |0.5587|± |0.0280| |xnli_ru | 0|acc |0.3940|± |0.0069|