KennethTM committed on
Commit 4af0d75
1 parent: fa1a3d4

Upload 2 files

scandeval_benchmark_results.jsonl ADDED
@@ -0,0 +1,16 @@
+
+ {"dataset": "swerec", "task": "sentiment-classification", "dataset_languages": ["sv"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.5070382356643677, "test_mcc": 0.6566762400071989, "test_macro_f1": 0.65975176084704, "test_runtime": 32.9962, "test_samples_per_second": 62.068, "test_steps_per_second": 31.034}, {"test_loss": 0.6261532306671143, "test_mcc": 0.5622172097035746, "test_macro_f1": 0.5306929561521423, "test_runtime": 30.7125, "test_samples_per_second": 66.683, "test_steps_per_second": 33.341}, {"test_loss": 0.6132768392562866, "test_mcc": 0.6350766615777962, "test_macro_f1": 0.64948651944921, "test_runtime": 30.4359, "test_samples_per_second": 67.289, "test_steps_per_second": 33.644}, {"test_loss": 0.5417741537094116, "test_mcc": 0.6311614240148968, "test_macro_f1": 0.5941756384975722, "test_runtime": 31.5249, "test_samples_per_second": 64.965, "test_steps_per_second": 32.482}, {"test_loss": 0.5024886727333069, "test_mcc": 0.6657951154766436, "test_macro_f1": 0.6460563500027335, "test_runtime": 31.4481, "test_samples_per_second": 65.123, "test_steps_per_second": 32.562}, {"test_loss": 0.5979309678077698, "test_mcc": 0.6145191680918937, "test_macro_f1": 0.6781767863632068, "test_runtime": 32.2756, "test_samples_per_second": 63.454, "test_steps_per_second": 31.727}, {"test_loss": 0.5207780003547668, "test_mcc": 0.6761284789673636, "test_macro_f1": 0.6999312157255329, "test_runtime": 34.3683, "test_samples_per_second": 59.59, "test_steps_per_second": 29.795}, {"test_loss": 0.5997392535209656, "test_mcc": 0.5871920267372929, "test_macro_f1": 0.5393290391696876, "test_runtime": 30.7184, "test_samples_per_second": 66.67, "test_steps_per_second": 33.335}, {"test_loss": 0.5331461429595947, "test_mcc": 0.6397080223510401, "test_macro_f1": 0.6448588173031131, "test_runtime": 32.2135, "test_samples_per_second": 63.576, "test_steps_per_second": 31.788}, {"test_loss": 0.5571358799934387, "test_mcc": 0.6335205477809104, "test_macro_f1": 0.5773811308899028, "test_runtime": 31.5904, "test_samples_per_second": 64.83, "test_steps_per_second": 32.415}]}, "total": {"test_mcc": 63.0199489470861, "test_mcc_se": 2.1590464820300364, "test_macro_f1": 62.19840214400142, "test_macro_f1_se": 3.6050471245851026}}, "num_model_parameters": 109484547, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "angry-tweets", "task": "sentiment-classification", "dataset_languages": ["da"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.8160163164138794, "test_mcc": 0.46308526405425277, "test_macro_f1": 0.6288918181243571, "test_runtime": 5.6161, "test_samples_per_second": 364.668, "test_steps_per_second": 11.396}, {"test_loss": 0.7786201238632202, "test_mcc": 0.4752727083224254, "test_macro_f1": 0.6484401502995693, "test_runtime": 5.6663, "test_samples_per_second": 361.436, "test_steps_per_second": 11.295}, {"test_loss": 0.868148922920227, "test_mcc": 0.46206979449677804, "test_macro_f1": 0.6424741434489251, "test_runtime": 5.5887, "test_samples_per_second": 366.454, "test_steps_per_second": 11.452}, {"test_loss": 0.8279974460601807, "test_mcc": 0.4819826780789838, "test_macro_f1": 0.6359974449106481, "test_runtime": 5.5515, "test_samples_per_second": 368.909, "test_steps_per_second": 11.528}, {"test_loss": 0.826221764087677, "test_mcc": 0.4856999307722316, "test_macro_f1": 0.6525064848246035, "test_runtime": 5.4778, "test_samples_per_second": 373.871, "test_steps_per_second": 11.683}, {"test_loss": 0.8456926345825195, "test_mcc": 0.46914695674355983, "test_macro_f1": 0.6372566852121962, "test_runtime": 5.5696, "test_samples_per_second": 367.71, "test_steps_per_second": 11.491}, {"test_loss": 0.8089869618415833, "test_mcc": 0.4657365491384325, "test_macro_f1": 0.6395164813743985, "test_runtime": 5.5778, "test_samples_per_second": 367.172, "test_steps_per_second": 11.474}, {"test_loss": 0.8067009449005127, "test_mcc": 0.4749246530542649, "test_macro_f1": 0.6477625756550763, "test_runtime": 5.5037, "test_samples_per_second": 372.116, "test_steps_per_second": 11.629}, {"test_loss": 0.8102366924285889, "test_mcc": 0.4794644608441153, "test_macro_f1": 0.6545958618916877, "test_runtime": 5.6083, "test_samples_per_second": 365.171, "test_steps_per_second": 11.412}, {"test_loss": 0.8823351860046387, "test_mcc": 0.4639566580037213, "test_macro_f1": 0.6335653419021458, "test_runtime": 5.7843, "test_samples_per_second": 354.061, "test_steps_per_second": 11.064}]}, "total": {"test_mcc": 47.21339653508765, "test_mcc_se": 0.5276621943206763, "test_macro_f1": 64.21006987643607, "test_macro_f1_se": 0.5273471133921165}}, "num_model_parameters": 109484547, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "norec", "task": "sentiment-classification", "dataset_languages": ["nb", "nn"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.8333566784858704, "test_mcc": 0.4610769694838059, "test_macro_f1": 0.6151072290075795, "test_runtime": 5.213, "test_samples_per_second": 392.862, "test_steps_per_second": 12.277}, {"test_loss": 0.8147885799407959, "test_mcc": 0.44297247667557305, "test_macro_f1": 0.5940358457021003, "test_runtime": 4.8897, "test_samples_per_second": 418.838, "test_steps_per_second": 13.089}, {"test_loss": 0.7714332938194275, "test_mcc": 0.47459415985638187, "test_macro_f1": 0.5944756874721099, "test_runtime": 4.8917, "test_samples_per_second": 418.669, "test_steps_per_second": 13.083}, {"test_loss": 1.0140479803085327, "test_mcc": 0.029864422431983458, "test_macro_f1": 0.22842708035912562, "test_runtime": 4.973, "test_samples_per_second": 411.826, "test_steps_per_second": 12.87}, {"test_loss": 0.8161723613739014, "test_mcc": 0.43452343461812154, "test_macro_f1": 0.59065011214576, "test_runtime": 5.0517, "test_samples_per_second": 405.41, "test_steps_per_second": 12.669}, {"test_loss": 0.775856614112854, "test_mcc": 0.48240805473277154, "test_macro_f1": 0.6419612862196395, "test_runtime": 5.0391, "test_samples_per_second": 406.424, "test_steps_per_second": 12.701}, {"test_loss": 0.7514811158180237, "test_mcc": 0.47226445290104274, "test_macro_f1": 0.5795126462019108, "test_runtime": 4.8874, "test_samples_per_second": 419.039, "test_steps_per_second": 13.095}, {"test_loss": 0.8222088813781738, "test_mcc": 0.45294003251532255, "test_macro_f1": 0.6014846253788146, "test_runtime": 5.0116, "test_samples_per_second": 408.651, "test_steps_per_second": 12.77}, {"test_loss": 0.7827929258346558, "test_mcc": 0.4477780999187308, "test_macro_f1": 0.6103158622537125, "test_runtime": 5.1235, "test_samples_per_second": 399.729, "test_steps_per_second": 12.492}, {"test_loss": 0.742304265499115, "test_mcc": 0.5249925475452578, "test_macro_f1": 0.6682849918831422, "test_runtime": 5.1631, "test_samples_per_second": 396.66, "test_steps_per_second": 12.396}]}, "total": {"test_mcc": 42.23414650678991, "test_mcc_se": 8.693603286249262, "test_macro_f1": 57.24255366623895, "test_macro_f1_se": 7.669270837908364}}, "num_model_parameters": 109484547, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "suc3", "task": "named-entity-recognition", "dataset_languages": ["sv"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.12691396474838257, "test_micro_f1": 0.4960707269155206, "test_micro_f1_no_misc": 0.5293489861259338, "test_runtime": 10.5046, "test_samples_per_second": 194.961, "test_steps_per_second": 6.093}, {"test_loss": 0.1145431399345398, "test_micro_f1": 0.4927703875072297, "test_micro_f1_no_misc": 0.5298507462686568, "test_runtime": 10.0067, "test_samples_per_second": 204.664, "test_steps_per_second": 6.396}, {"test_loss": 0.12497851252555847, "test_micro_f1": 0.5279642058165549, "test_micro_f1_no_misc": 0.5557543231961837, "test_runtime": 9.9907, "test_samples_per_second": 204.991, "test_steps_per_second": 6.406}, {"test_loss": 0.11535276472568512, "test_micro_f1": 0.5078369905956112, "test_micro_f1_no_misc": 0.5538641686182669, "test_runtime": 10.4201, "test_samples_per_second": 196.543, "test_steps_per_second": 6.142}, {"test_loss": 0.12199730426073074, "test_micro_f1": 0.5399201596806388, "test_micro_f1_no_misc": 0.5831942125765165, "test_runtime": 10.4751, "test_samples_per_second": 195.512, "test_steps_per_second": 6.11}, {"test_loss": 0.12470493465662003, "test_micro_f1": 0.5377733598409542, "test_micro_f1_no_misc": 0.5667789001122335, "test_runtime": 8.3988, "test_samples_per_second": 243.844, "test_steps_per_second": 7.62}, {"test_loss": 0.12823310494422913, "test_micro_f1": 0.47968673519334315, "test_micro_f1_no_misc": 0.5200642054574639, "test_runtime": 9.3915, "test_samples_per_second": 218.07, "test_steps_per_second": 6.815}, {"test_loss": 0.1146499514579773, "test_micro_f1": 0.32555555555555554, "test_micro_f1_no_misc": 0.3445032333921223, "test_runtime": 10.5152, "test_samples_per_second": 194.765, "test_steps_per_second": 6.086}, {"test_loss": 0.11723003536462784, "test_micro_f1": 0.5242165242165242, "test_micro_f1_no_misc": 0.5465903175429464, "test_runtime": 9.7736, "test_samples_per_second": 209.544, "test_steps_per_second": 6.548}, {"test_loss": 0.11622432619333267, "test_micro_f1": 0.5715896279594137, "test_micro_f1_no_misc": 0.6248431618569635, "test_runtime": 10.4482, "test_samples_per_second": 196.014, "test_steps_per_second": 6.125}]}, "total": {"test_micro_f1": 50.03384273281346, "test_micro_f1_se": 4.157039919926133, "test_micro_f1_no_misc": 53.54792255147288, "test_micro_f1_no_misc_se": 4.568363285649421}}, "num_model_parameters": 108898569, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "dane", "task": "named-entity-recognition", "dataset_languages": ["da"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.08365888893604279, "test_micro_f1": 0.7462610899873258, "test_micro_f1_no_misc": 0.7878151260504203, "test_runtime": 9.5931, "test_samples_per_second": 213.486, "test_steps_per_second": 6.671}, {"test_loss": 0.07704630494117737, "test_micro_f1": 0.7708045341443185, "test_micro_f1_no_misc": 0.8145454545454546, "test_runtime": 7.4856, "test_samples_per_second": 273.592, "test_steps_per_second": 8.55}, {"test_loss": 0.0851258933544159, "test_micro_f1": 0.7341381967634215, "test_micro_f1_no_misc": 0.7884820476359757, "test_runtime": 9.3129, "test_samples_per_second": 219.909, "test_steps_per_second": 6.872}, {"test_loss": 0.07999996840953827, "test_micro_f1": 0.7847425980261403, "test_micro_f1_no_misc": 0.8161816891412349, "test_runtime": 9.2066, "test_samples_per_second": 222.45, "test_steps_per_second": 6.952}, {"test_loss": 0.08172982931137085, "test_micro_f1": 0.7360090446579988, "test_micro_f1_no_misc": 0.7796610169491526, "test_runtime": 9.3041, "test_samples_per_second": 220.119, "test_steps_per_second": 6.879}, {"test_loss": 0.07893450558185577, "test_micro_f1": 0.8002229033156869, "test_micro_f1_no_misc": 0.8420647149460708, "test_runtime": 9.1247, "test_samples_per_second": 224.446, "test_steps_per_second": 7.014}, {"test_loss": 0.06919334828853607, "test_micro_f1": 0.7774261603375527, "test_micro_f1_no_misc": 0.8087392550143266, "test_runtime": 9.5864, "test_samples_per_second": 213.635, "test_steps_per_second": 6.676}, {"test_loss": 0.07813015580177307, "test_micro_f1": 0.750067549310997, "test_micro_f1_no_misc": 0.8092918696140877, "test_runtime": 9.4979, "test_samples_per_second": 215.627, "test_steps_per_second": 6.738}, {"test_loss": 0.0795971155166626, "test_micro_f1": 0.7665860864893902, "test_micro_f1_no_misc": 0.8021818181818181, "test_runtime": 9.1972, "test_samples_per_second": 222.676, "test_steps_per_second": 6.959}, {"test_loss": 0.08826860785484314, "test_micro_f1": 0.778046811945117, "test_micro_f1_no_misc": 0.8116476225580539, "test_runtime": 8.0852, "test_samples_per_second": 253.301, "test_steps_per_second": 7.916}]}, "total": {"test_micro_f1": 76.44304974977949, "test_micro_f1_se": 1.3634282512881593, "test_micro_f1_no_misc": 80.60610614636596, "test_micro_f1_no_misc_se": 1.1074804335060013}}, "num_model_parameters": 108898569, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "norne-nb", "task": "named-entity-recognition", "dataset_languages": ["nb"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.11419615894556046, "test_micro_f1": 0.6545592130155126, "test_micro_f1_no_misc": 0.7067796610169492, "test_runtime": 7.9186, "test_samples_per_second": 258.632, "test_steps_per_second": 8.082}, {"test_loss": 0.09524375200271606, "test_micro_f1": 0.6987155963302752, "test_micro_f1_no_misc": 0.7432542927228128, "test_runtime": 7.8839, "test_samples_per_second": 259.77, "test_steps_per_second": 8.118}, {"test_loss": 0.10940806567668915, "test_micro_f1": 0.719163763066202, "test_micro_f1_no_misc": 0.7657370517928288, "test_runtime": 7.6121, "test_samples_per_second": 269.044, "test_steps_per_second": 8.408}, {"test_loss": 0.10846176743507385, "test_micro_f1": 0.6608756080611535, "test_micro_f1_no_misc": 0.7144535840188014, "test_runtime": 7.6234, "test_samples_per_second": 268.646, "test_steps_per_second": 8.395}, {"test_loss": 0.11372241377830505, "test_micro_f1": 0.6611405835543765, "test_micro_f1_no_misc": 0.7005899705014749, "test_runtime": 7.9199, "test_samples_per_second": 258.588, "test_steps_per_second": 8.081}, {"test_loss": 0.10193439573049545, "test_micro_f1": 0.7178437393381099, "test_micro_f1_no_misc": 0.7657142857142857, "test_runtime": 7.9553, "test_samples_per_second": 257.44, "test_steps_per_second": 8.045}, {"test_loss": 0.10555048286914825, "test_micro_f1": 0.7060006704659739, "test_micro_f1_no_misc": 0.7518053971873814, "test_runtime": 7.3494, "test_samples_per_second": 278.661, "test_steps_per_second": 8.708}, {"test_loss": 0.11414188891649246, "test_micro_f1": 0.69901547116737, "test_micro_f1_no_misc": 0.7385339082712662, "test_runtime": 7.5931, "test_samples_per_second": 269.717, "test_steps_per_second": 8.429}, {"test_loss": 0.11313172429800034, "test_micro_f1": 0.6429056347589953, "test_micro_f1_no_misc": 0.6912267375617166, "test_runtime": 7.5967, "test_samples_per_second": 269.589, "test_steps_per_second": 8.425}, {"test_loss": 0.12088565528392792, "test_micro_f1": 0.6776406035665296, "test_micro_f1_no_misc": 0.7296367112810708, "test_runtime": 7.6252, "test_samples_per_second": 268.582, "test_steps_per_second": 8.393}]}, "total": {"test_micro_f1": 68.378608833245, "test_micro_f1_se": 1.7237000116134544, "test_micro_f1_no_misc": 73.07731600068588, "test_micro_f1_no_misc_se": 1.655393950005333}}, "num_model_parameters": 108898569, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "norne-nn", "task": "named-entity-recognition", "dataset_languages": ["nn"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.15642738342285156, "test_micro_f1": 0.6061906221268771, "test_micro_f1_no_misc": 0.6496129249410972, "test_runtime": 8.1467, "test_samples_per_second": 251.391, "test_steps_per_second": 7.856}, {"test_loss": 0.14738962054252625, "test_micro_f1": 0.5618176453231042, "test_micro_f1_no_misc": 0.6029079159935379, "test_runtime": 8.5916, "test_samples_per_second": 238.374, "test_steps_per_second": 7.449}, {"test_loss": 0.13437902927398682, "test_micro_f1": 0.6274159976211716, "test_micro_f1_no_misc": 0.6662290777814244, "test_runtime": 7.9404, "test_samples_per_second": 257.923, "test_steps_per_second": 8.06}, {"test_loss": 0.15800991654396057, "test_micro_f1": 0.5703682557807593, "test_micro_f1_no_misc": 0.6040061633281972, "test_runtime": 8.199, "test_samples_per_second": 249.788, "test_steps_per_second": 7.806}, {"test_loss": 0.15130937099456787, "test_micro_f1": 0.6326781326781327, "test_micro_f1_no_misc": 0.6780704755388299, "test_runtime": 7.7064, "test_samples_per_second": 265.752, "test_steps_per_second": 8.305}, {"test_loss": 0.1508064568042755, "test_micro_f1": 0.6364168618266979, "test_micro_f1_no_misc": 0.6710013003901171, "test_runtime": 7.7064, "test_samples_per_second": 265.753, "test_steps_per_second": 8.305}, {"test_loss": 0.13392111659049988, "test_micro_f1": 0.6345226615236258, "test_micro_f1_no_misc": 0.6762237762237763, "test_runtime": 8.6307, "test_samples_per_second": 237.293, "test_steps_per_second": 7.415}, {"test_loss": 0.1358247995376587, "test_micro_f1": 0.5974921630094044, "test_micro_f1_no_misc": 0.6417657045840407, "test_runtime": 8.6315, "test_samples_per_second": 237.27, "test_steps_per_second": 7.415}, {"test_loss": 0.15388840436935425, "test_micro_f1": 0.5941986522121301, "test_micro_f1_no_misc": 0.631578947368421, "test_runtime": 7.4495, "test_samples_per_second": 274.918, "test_steps_per_second": 8.591}, {"test_loss": 0.15011076629161835, "test_micro_f1": 0.5834066256229845, "test_micro_f1_no_misc": 0.6178403755868545, "test_runtime": 7.9339, "test_samples_per_second": 258.133, "test_steps_per_second": 8.067}]}, "total": {"test_micro_f1": 60.44507617724888, "test_micro_f1_se": 1.7073073491271196, "test_micro_f1_no_misc": 64.39236661736297, "test_micro_f1_no_misc_se": 1.7993949050483813}}, "num_model_parameters": 108898569, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scala-sv", "task": "linguistic-acceptability", "dataset_languages": ["sv"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.693950891494751, "test_mcc": 0.022562012893927324, "test_macro_f1": 0.4795578974065565, "test_runtime": 5.2811, "test_samples_per_second": 387.801, "test_steps_per_second": 12.119}, {"test_loss": 0.6924837827682495, "test_mcc": 0.07955958556045853, "test_macro_f1": 0.5394707379741629, "test_runtime": 5.63, "test_samples_per_second": 363.763, "test_steps_per_second": 11.368}, {"test_loss": 0.6428786516189575, "test_mcc": 0.2871246597449399, "test_macro_f1": 0.6316365088132616, "test_runtime": 5.5994, "test_samples_per_second": 365.751, "test_steps_per_second": 11.43}, {"test_loss": 0.6928595900535583, "test_mcc": 0.014569774591791429, "test_macro_f1": 0.47323352973923616, "test_runtime": 5.5296, "test_samples_per_second": 370.368, "test_steps_per_second": 11.574}, {"test_loss": 0.6955035924911499, "test_mcc": 0.0018113176749452431, "test_macro_f1": 0.46486150692744965, "test_runtime": 5.5667, "test_samples_per_second": 367.904, "test_steps_per_second": 11.497}, {"test_loss": 0.6936429142951965, "test_mcc": 0.0032869470781325236, "test_macro_f1": 0.4112392387507344, "test_runtime": 5.5035, "test_samples_per_second": 372.129, "test_steps_per_second": 11.629}, {"test_loss": 0.6931567192077637, "test_mcc": 0.015637404345166236, "test_macro_f1": 0.4997499214869784, "test_runtime": 5.5617, "test_samples_per_second": 368.234, "test_steps_per_second": 11.507}, {"test_loss": 0.6905992031097412, "test_mcc": 0.060625276346397736, "test_macro_f1": 0.4483306963887009, "test_runtime": 5.4871, "test_samples_per_second": 373.236, "test_steps_per_second": 11.664}, {"test_loss": 0.6934638619422913, "test_mcc": 0.002603508858714619, "test_macro_f1": 0.5007146442314587, "test_runtime": 5.5079, "test_samples_per_second": 371.829, "test_steps_per_second": 11.62}, {"test_loss": 0.6945185661315918, "test_mcc": 0.013702018251628197, "test_macro_f1": 0.4971662404092072, "test_runtime": 5.5493, "test_samples_per_second": 369.055, "test_steps_per_second": 11.533}]}, "total": {"test_mcc": 5.014825053461017, "test_mcc_se": 5.406178833817859, "test_macro_f1": 49.45960922127746, "test_macro_f1_se": 3.667983338119709}}, "num_model_parameters": 109483778, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scala-da", "task": "linguistic-acceptability", "dataset_languages": ["da"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.5853726267814636, "test_mcc": 0.461505038835874, "test_macro_f1": 0.709432473004324, "test_runtime": 4.7424, "test_samples_per_second": 431.849, "test_steps_per_second": 13.495}, {"test_loss": 0.37581437826156616, "test_mcc": 0.6820364355429108, "test_macro_f1": 0.8408128736789802, "test_runtime": 4.8772, "test_samples_per_second": 419.914, "test_steps_per_second": 13.122}, {"test_loss": 0.6934129595756531, "test_mcc": 0.02425696337400832, "test_macro_f1": 0.5112042477025349, "test_runtime": 4.72, "test_samples_per_second": 433.897, "test_steps_per_second": 13.559}, {"test_loss": 0.3992091417312622, "test_mcc": 0.6652401490131599, "test_macro_f1": 0.8286334228025399, "test_runtime": 4.8763, "test_samples_per_second": 419.994, "test_steps_per_second": 13.125}, {"test_loss": 0.453867644071579, "test_mcc": 0.6769960688089038, "test_macro_f1": 0.8282725379282561, "test_runtime": 4.6977, "test_samples_per_second": 435.957, "test_steps_per_second": 13.624}, {"test_loss": 0.5610013604164124, "test_mcc": 0.5096918290799991, "test_macro_f1": 0.7322169797731095, "test_runtime": 4.6939, "test_samples_per_second": 436.309, "test_steps_per_second": 13.635}, {"test_loss": 0.4776204824447632, "test_mcc": 0.6065849131178933, "test_macro_f1": 0.7886738832831699, "test_runtime": 4.566, "test_samples_per_second": 448.531, "test_steps_per_second": 14.017}, {"test_loss": 0.5097700357437134, "test_mcc": 0.5874746153510436, "test_macro_f1": 0.7674007397889309, "test_runtime": 4.6556, "test_samples_per_second": 439.901, "test_steps_per_second": 13.747}, {"test_loss": 0.4131699800491333, "test_mcc": 0.6275949857782819, "test_macro_f1": 0.8029520261219631, "test_runtime": 4.7287, "test_samples_per_second": 433.1, "test_steps_per_second": 13.534}, {"test_loss": 0.4298553168773651, "test_mcc": 0.6323445871025164, "test_macro_f1": 0.8152711334420184, "test_runtime": 4.8442, "test_samples_per_second": 422.77, "test_steps_per_second": 13.212}]}, "total": {"test_mcc": 54.73725586004592, "test_mcc_se": 12.223490211174761, "test_macro_f1": 76.24870317525827, "test_macro_f1_se": 6.088477379019307}}, "num_model_parameters": 109483778, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scala-nb", "task": "linguistic-acceptability", "dataset_languages": ["nb"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.6906934380531311, "test_mcc": 0.03951946071296729, "test_macro_f1": 0.4624907744623098, "test_runtime": 4.8299, "test_samples_per_second": 424.026, "test_steps_per_second": 13.251}, {"test_loss": 0.6075853109359741, "test_mcc": 0.4602330279815447, "test_macro_f1": 0.7137752086424918, "test_runtime": 4.8762, "test_samples_per_second": 420.001, "test_steps_per_second": 13.125}, {"test_loss": 0.6930941939353943, "test_mcc": 0.03115616875320411, "test_macro_f1": 0.490009664884909, "test_runtime": 4.8736, "test_samples_per_second": 420.226, "test_steps_per_second": 13.132}, {"test_loss": 0.5830258131027222, "test_mcc": 0.5072359377011482, "test_macro_f1": 0.7451218966998419, "test_runtime": 4.9497, "test_samples_per_second": 413.759, "test_steps_per_second": 12.93}, {"test_loss": 0.5927488207817078, "test_mcc": 0.3937619921850386, "test_macro_f1": 0.669587616474722, "test_runtime": 4.9378, "test_samples_per_second": 414.758, "test_steps_per_second": 12.961}, {"test_loss": 0.6959724426269531, "test_mcc": -0.03778899364496136, "test_macro_f1": 0.41604741986858035, "test_runtime": 4.7229, "test_samples_per_second": 433.63, "test_steps_per_second": 13.551}, {"test_loss": 0.6920957565307617, "test_mcc": 0.0044151137341399685, "test_macro_f1": 0.36142333353369, "test_runtime": 4.8125, "test_samples_per_second": 425.555, "test_steps_per_second": 13.299}, {"test_loss": 0.5984359979629517, "test_mcc": 0.44290684663113483, "test_macro_f1": 0.7160419651522856, "test_runtime": 4.8062, "test_samples_per_second": 426.113, "test_steps_per_second": 13.316}, {"test_loss": 0.6896888017654419, "test_mcc": 0.0495727794277912, "test_macro_f1": 0.5059829089554957, "test_runtime": 4.7612, "test_samples_per_second": 430.143, "test_steps_per_second": 13.442}, {"test_loss": 0.6938291788101196, "test_mcc": 0.02711482808875936, "test_macro_f1": 0.44986123815135987, "test_runtime": 4.9208, "test_samples_per_second": 416.195, "test_steps_per_second": 13.006}]}, "total": {"test_mcc": 19.18127161570767, "test_mcc_se": 14.00587601572503, "test_macro_f1": 55.30342026825685, "test_macro_f1_se": 8.847792459452858}}, "num_model_parameters": 109483778, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scala-nn", "task": "linguistic-acceptability", "dataset_languages": ["nn"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_loss": 0.6968382000923157, "test_mcc": 0.029384504030851993, "test_macro_f1": 0.3976249811058078, "test_runtime": 5.3679, "test_samples_per_second": 381.529, "test_steps_per_second": 11.923}, {"test_loss": 0.6930913329124451, "test_mcc": 0.02750758706961531, "test_macro_f1": 0.48388562342050717, "test_runtime": 5.5224, "test_samples_per_second": 370.851, "test_steps_per_second": 11.589}, {"test_loss": 0.6932373642921448, "test_mcc": 0.015911029235847793, "test_macro_f1": 0.43327220330102173, "test_runtime": 5.4777, "test_samples_per_second": 373.878, "test_steps_per_second": 11.684}, {"test_loss": 0.6443712711334229, "test_mcc": 0.25519147606730447, "test_macro_f1": 0.6049789379531588, "test_runtime": 5.2768, "test_samples_per_second": 388.114, "test_steps_per_second": 12.129}, {"test_loss": 0.6933479905128479, "test_mcc": -0.005437345750394764, "test_macro_f1": 0.49579321933945397, "test_runtime": 5.3636, "test_samples_per_second": 381.831, "test_steps_per_second": 11.932}, {"test_loss": 0.664460301399231, "test_mcc": 0.20333082314492576, "test_macro_f1": 0.5761107289928926, "test_runtime": 5.5397, "test_samples_per_second": 369.698, "test_steps_per_second": 11.553}, {"test_loss": 0.6940561532974243, "test_mcc": -0.004621213199603128, "test_macro_f1": 0.48069746520222056, "test_runtime": 5.4123, "test_samples_per_second": 378.395, "test_steps_per_second": 11.825}, {"test_loss": 0.6938896179199219, "test_mcc": 0.015607142866591683, "test_macro_f1": 0.5077958068153585, "test_runtime": 5.393, "test_samples_per_second": 379.752, "test_steps_per_second": 11.867}, {"test_loss": 0.6946393847465515, "test_mcc": -0.033722639264853395, "test_macro_f1": 0.4810762762936849, "test_runtime": 5.3823, "test_samples_per_second": 380.508, "test_steps_per_second": 11.891}, {"test_loss": 0.6897432208061218, "test_mcc": 0.0686741007045816, "test_macro_f1": 0.49476819959039287, "test_runtime": 5.4853, "test_samples_per_second": 373.362, "test_steps_per_second": 11.668}]}, "total": {"test_mcc": 5.7182546490486725, "test_mcc_se": 5.90856136592262, "test_macro_f1": 49.56003442014499, "test_macro_f1_se": 3.733118639777134}}, "num_model_parameters": 109483778, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scandiqa-da", "task": "question-answering", "dataset_languages": ["da"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_em": 24.3996901626646, "test_f1": 30.051813258683648}, {"test_em": 25.968992248062015, "test_f1": 31.981034665209194}, {"test_em": 26.120556414219475, "test_f1": 32.03328504752309}, {"test_em": 26.246105919003114, "test_f1": 32.26429495137597}, {"test_em": 26.023166023166024, "test_f1": 31.895715036878062}, {"test_em": 27.910562837316885, "test_f1": 33.73927635039718}, {"test_em": 29.992406985573272, "test_f1": 36.094429332143484}, {"test_em": 27.928626842513577, "test_f1": 33.99816819570535}, {"test_em": 23.294117647058822, "test_f1": 30.522900416526376}, {"test_em": 25.698757763975156, "test_f1": 31.532530778904757}]}, "total": {"test_em": 26.358298284355293, "test_em_se": 1.1700965926969447, "test_f1": 32.411344803334714, "test_f1_se": 1.100397241176863}}, "num_model_parameters": 108893186, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scandiqa-no", "task": "question-answering", "dataset_languages": ["nb", "nn"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_em": 27.8853601859024, "test_f1": 33.439323334893245}, {"test_em": 27.364341085271317, "test_f1": 33.70810348735328}, {"test_em": 30.680061823802163, "test_f1": 36.215788632025046}, {"test_em": 26.16822429906542, "test_f1": 32.70953784921842}, {"test_em": 21.08108108108108, "test_f1": 26.883543600040746}, {"test_em": 25.05782575173477, "test_f1": 30.592549942496603}, {"test_em": 24.60136674259681, "test_f1": 30.40828580125956}, {"test_em": 24.359968968192398, "test_f1": 30.243898770152114}, {"test_em": 27.215686274509803, "test_f1": 32.74085107780247}, {"test_em": 27.018633540372672, "test_f1": 33.28147182832848}]}, "total": {"test_em": 26.14325497525288, "test_em_se": 1.5921360371903535, "test_f1": 32.022335432356996, "test_f1_se": 1.5900060457049463}}, "num_model_parameters": 108893186, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "scandiqa-sv", "task": "question-answering", "dataset_languages": ["sv"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_em": 27.18822618125484, "test_f1": 32.13967567587413}, {"test_em": 23.488372093023255, "test_f1": 29.38472905137173}, {"test_em": 25.579598145285935, "test_f1": 32.07902442547096}, {"test_em": 30.218068535825545, "test_f1": 35.785511322775}, {"test_em": 26.872586872586872, "test_f1": 33.01154201596486}, {"test_em": 24.903623747108714, "test_f1": 30.473013031607397}, {"test_em": 25.89217919514047, "test_f1": 32.04337839305436}, {"test_em": 27.385570209464703, "test_f1": 32.97724941048299}, {"test_em": 25.80392156862745, "test_f1": 32.28831048673704}, {"test_em": 26.475155279503106, "test_f1": 33.11229131454047}]}, "total": {"test_em": 26.380730182782088, "test_em_se": 1.1009885934882198, "test_f1": 32.329472512787895, "test_f1_se": 1.0482303303151836}}, "num_model_parameters": 108893186, "max_sequence_length": 511, "vocabulary_size": 30522}
+ {"dataset": "speed", "task": "speed", "dataset_languages": ["ab", "aa", "af", "sq", "am", "ar", "an", "hy", "as", "av", "ae", "ay", "az", "bm", "ba", "eu", "be", "bn", "bi", "bs", "br", "bg", "my", "ca", "ch", "ce", "ny", "zh", "cu", "cv", "kw", "co", "cr", "hr", "cs", "da", "dv", "nl", "dz", "en", "eo", "et", "ee", "fo", "fj", "fi", "fr", "fy", "ff", "gd", "gl", "lg", "ka", "de", "el", "kl", "gn", "gu", "ht", "ha", "he", "hz", "hi", "ho", "hu", "is", "io", "ig", "id", "ia", "ie", "iu", "ik", "ga", "it", "ja", "kn", "kr", "ks", "kk", "km", "ki", "rw", "ky", "kv", "kg", "ko", "kj", "ku", "lo", "la", "lv", "li", "ln", "lt", "lu", "lb", "mk", "mg", "ms", "ml", "mt", "gv", "mi", "mr", "mh", "mn", "na", "nv", "nd", "nr", "ng", "ne", "no", "nb", "nn", "ii", "oc", "oj", "or", "om", "os", "pi", "ps", "fa", "pl", "pt", "pa", "qu", "ro", "rm", "rn", "ru", "se", "sm", "sg", "sa", "sc", "sr", "sn", "sd", "si", "sk", "sl", "so", "st", "es", "su", "sw", "ss", "sv", "tl", "ty", "tg", "ta", "tt", "te", "th", "bo", "ti", "to", "ts", "tn", "tr", "tk", "tw", "ug", "uk", "ur", "uz", "ve", "vi", "vo", "wa", "cy", "wo", "xh", "yi", "yo", "za", "zu"], "model": "KennethTM/bert-base-uncased-danish", "results": {"raw": {"test": [{"test_speed": 4.57}, {"test_speed": 4.55}, {"test_speed": 4.55}, {"test_speed": 4.55}, {"test_speed": 4.55}, {"test_speed": 4.56}, {"test_speed": 4.55}, {"test_speed": 4.56}, {"test_speed": 4.56}, {"test_speed": 4.55}]}, "total": {"test_speed": 4.555, "test_speed_se": 0.004382693235899624}}, "num_model_parameters": 109482240, "max_sequence_length": 511, "vocabulary_size": 30522}
scandeval_summary.py ADDED
@@ -0,0 +1,17 @@
+ import pandas as pd
+
+ df = pd.read_json("scandeval_benchmark_results.jsonl", lines=True)
+
+ df["total"] = [i["total"] for i in df["results"]]
+
+ summary = []
+ for i in df["total"]:
+
+     summary_string = [f'{k.replace("test_", "")} = {round(v, 2)}' for k, v in i.items()]
+
+     summary.append(", ".join(summary_string))
+
+ df["summary"] = summary
+
+ md_table = df[["task", "dataset", "summary"]].to_markdown(index=False)
+ print(md_table)
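
Run from the directory containing the results file, the script prints a three-column markdown table (task, dataset, summary); the swerec row, for example, would read roughly mcc = 63.02, mcc_se = 2.16, macro_f1 = 62.2, macro_f1_se = 3.61. Note that DataFrame.to_markdown relies on the optional tabulate package. A possible extension, not part of this commit, that also surfaces the parameter count stored in each record:

import pandas as pd

# Extension sketch: the same per-dataset summary, plus the reported model size.
df = pd.read_json("scandeval_benchmark_results.jsonl", lines=True)
df["summary"] = [
    ", ".join(f'{k.replace("test_", "")} = {round(v, 2)}' for k, v in r["total"].items())
    for r in df["results"]
]
# to_markdown() needs the optional 'tabulate' package installed.
print(df[["task", "dataset", "num_model_parameters", "summary"]].to_markdown(index=False))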